@dragon708/docmind-browser 1.2.0 → 1.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.d.ts +65 -27
- package/dist/index.js +385 -164
- package/package.json +3 -3
package/dist/index.d.ts
CHANGED
|
@@ -1,17 +1,30 @@
|
|
|
1
1
|
import { DocMindAnalyzeOptions, AnalysisResult, FileKind, RuntimeDescriptor, DocMindPublicIntent, AnalysisAnalyzer, ProcessingPlanDescriptor, ExplainAnalysisPlanOptions, GetCapabilitiesOptions } from '@dragon708/docmind-shared';
|
|
2
2
|
export { AnalysisAnalyzer, AnalysisResult, CapabilityDescriptor, DetectFileKindInput, DocMindPublicIntent, DocxAnalysisCoreResult, ExplainAnalysisPlanOptions, ExplainAnalysisPlanResult, FileKind, FileKindMetadata, GenericAnalysisResult, GetCapabilitiesOptions, GetCapabilitiesResult, ImageAnalysisCoreResult, PdfAnalysisCoreResult, TextAnalysisResult, detectFileKind } from '@dragon708/docmind-shared';
|
|
3
|
-
import { OcrOptions } from '@dragon708/docmind-ocr';
|
|
3
|
+
import { OcrOptions, OcrTiffOptions, PreprocessImageOptions } from '@dragon708/docmind-ocr';
|
|
4
|
+
import { AnalyzeDocxIncludeFlags, DocxToHtmlOptions } from '@dragon708/docmind-docx';
|
|
5
|
+
export { AnalyzeDocxIncludeFlags } from '@dragon708/docmind-docx';
|
|
4
6
|
|
|
7
|
+
/**
|
|
8
|
+
* Opciones DOCX para el facade browser (Mammoth + inclusiones v2 de `@dragon708/docmind-docx`; sin APIs Node-only).
|
|
9
|
+
*/
|
|
10
|
+
interface BrowserAnalyzeDocxOptionsSlice {
|
|
11
|
+
readonly include?: AnalyzeDocxIncludeFlags;
|
|
12
|
+
readonly html?: DocxToHtmlOptions;
|
|
13
|
+
}
|
|
5
14
|
/**
|
|
6
15
|
* OCR behavior for browser intents that touch raster images.
|
|
7
16
|
* - `off`: do not invoke Tesseract; text stays empty with an explanatory warning.
|
|
8
17
|
* - `auto` (default): run OCR when the input is classified as an image.
|
|
9
18
|
* - `force`: same as `auto` in the browser runtime (no PDF-style text layer to compare); reserved for parity with Node.
|
|
19
|
+
*
|
|
20
|
+
* Multipage TIFF (when sniffed): `maxPages` and `pageSeparator` match `OcrTiffOptions` in `@dragon708/docmind-ocr` (best-effort UTIF in-browser).
|
|
21
|
+
* Optional {@link PreprocessImageOptions} runs in-browser (canvas) on the normalized raster before Tesseract when using `ocrImageDetailed`.
|
|
10
22
|
*/
|
|
11
23
|
type BrowserOcrMode = "off" | "auto" | "force";
|
|
12
|
-
/** Browser OCR options: Tesseract knobs from `@dragon708/docmind-ocr` plus optional {@link BrowserOcrMode}. */
|
|
13
|
-
interface BrowserOcrOptions extends OcrOptions {
|
|
24
|
+
/** Browser OCR options: Tesseract knobs from `@dragon708/docmind-ocr` plus optional {@link BrowserOcrMode}, TIFF caps, and canvas preprocess. */
|
|
25
|
+
interface BrowserOcrOptions extends OcrOptions, Pick<OcrTiffOptions, "maxPages" | "pageSeparator"> {
|
|
14
26
|
readonly mode?: BrowserOcrMode;
|
|
27
|
+
readonly preprocess?: PreprocessImageOptions;
|
|
15
28
|
}
|
|
16
29
|
/**
|
|
17
30
|
* Options for public browser methods (`analyzeFile`, intent APIs).
|
|
@@ -20,6 +33,8 @@ interface BrowserOcrOptions extends OcrOptions {
|
|
|
20
33
|
interface BrowserAnalyzeOptions extends DocMindAnalyzeOptions {
|
|
21
34
|
/** Image OCR only; no PDF in this runtime. See {@link BrowserOcrOptions.mode}. */
|
|
22
35
|
readonly ocr?: BrowserOcrOptions;
|
|
36
|
+
/** Solo DOCX: ver {@link BrowserAnalyzeDocxOptionsSlice}. */
|
|
37
|
+
readonly docx?: BrowserAnalyzeDocxOptionsSlice;
|
|
23
38
|
}
|
|
24
39
|
|
|
25
40
|
/**
|
|
@@ -56,10 +71,49 @@ declare function extractMetadata(input: BrowserAnalyzeInput, options?: BrowserAn
|
|
|
56
71
|
declare function convertToHtml(input: BrowserAnalyzeInput, options?: BrowserAnalyzeOptions): Promise<AnalysisResult>;
|
|
57
72
|
/**
|
|
58
73
|
* OCR-focused intent. Honors {@link BrowserAnalyzeOptions.ocr} **mode** (`off` | `auto` | `force`) for images.
|
|
74
|
+
* Raster path uses `normalizeImageForOcr` via `ocrImageDetailed` (or `ocrTiff` for TIFF); no Node-only libraries.
|
|
75
|
+
* HEIC/HEIF and hard failures yield empty text + warnings instead of throwing (abort still propagates).
|
|
59
76
|
* DOCX returns structured extract with a notice (no OCR). Text decodes as UTF-8 (no OCR).
|
|
60
77
|
*/
|
|
61
78
|
declare function runOcr(input: BrowserAnalyzeInput, options?: BrowserAnalyzeOptions): Promise<AnalysisResult>;
|
|
62
79
|
|
|
80
|
+
/** High-level features the user can ask DocMind for (per input kind and runtime). */
|
|
81
|
+
type PublicCapabilityId = "text" | "metadata" | "html" | "ocr" | "pages" | "image-normalization" | "gif-first-frame" | "bmp" | "heic" | "tiff";
|
|
82
|
+
declare function docxIncludeRequested(flags?: AnalyzeDocxIncludeFlags): boolean;
|
|
83
|
+
/** DOCX `word/media` en runtime browser (JSZip; sin pipeline Node). */
|
|
84
|
+
interface DocxEmbeddedImageCapabilities {
|
|
85
|
+
readonly canExtractEmbeddedImages: true;
|
|
86
|
+
readonly documentsMayIncludeImagesRequiringWebConversion: true;
|
|
87
|
+
/** En browser no hay conversión EMF/WMF a PNG empaquetada; `convertDocxEmbeddedImageToWeb` devuelve bytes originales + avisos. */
|
|
88
|
+
readonly webFriendlyRasterConversionInBrowser: false;
|
|
89
|
+
readonly notes: readonly string[];
|
|
90
|
+
}
|
|
91
|
+
declare const DOCX_EMBEDDED_IMAGE_CAPABILITIES_BROWSER: DocxEmbeddedImageCapabilities;
|
|
92
|
+
interface DocxStructuralCapabilities {
|
|
93
|
+
readonly ooxmlExtractorsAvailable: true;
|
|
94
|
+
readonly activatedViaDocxInclude: true;
|
|
95
|
+
readonly features: readonly string[];
|
|
96
|
+
readonly notes: readonly string[];
|
|
97
|
+
}
|
|
98
|
+
declare const DOCX_STRUCTURE_CAPABILITIES_BROWSER: DocxStructuralCapabilities;
|
|
99
|
+
/** Whether a {@link PublicCapabilityId} applies to the detected file in this runtime. */
|
|
100
|
+
interface PublicCapabilitySupport {
|
|
101
|
+
readonly id: PublicCapabilityId;
|
|
102
|
+
readonly supported: boolean;
|
|
103
|
+
readonly warnings?: readonly string[];
|
|
104
|
+
}
|
|
105
|
+
/**
|
|
106
|
+
* Result of {@link getCapabilities}: detected kind, runtime id, per-feature support for this input, and optional global warnings.
|
|
107
|
+
*/
|
|
108
|
+
interface GetCapabilitiesReport {
|
|
109
|
+
readonly kind: FileKind;
|
|
110
|
+
readonly runtime: RuntimeDescriptor;
|
|
111
|
+
readonly capabilities: readonly PublicCapabilitySupport[];
|
|
112
|
+
readonly docxEmbeddedImages?: DocxEmbeddedImageCapabilities;
|
|
113
|
+
readonly docxStructure?: DocxStructuralCapabilities;
|
|
114
|
+
readonly warnings?: readonly string[];
|
|
115
|
+
}
|
|
116
|
+
|
|
63
117
|
/**
|
|
64
118
|
* Whether DocMind will try a non-OCR text/HTML path (e.g. Mammoth, UTF-8, PDF text layer — when available).
|
|
65
119
|
*/
|
|
@@ -91,40 +145,24 @@ interface ExplainAnalysisPlanReport {
|
|
|
91
145
|
readonly limitations: readonly string[];
|
|
92
146
|
/** Ordered pipeline steps (planned/skipped/done metadata only). */
|
|
93
147
|
readonly plan: ProcessingPlanDescriptor;
|
|
148
|
+
readonly docxEmbeddedImages?: DocxEmbeddedImageCapabilities;
|
|
149
|
+
readonly docxStructure?: DocxStructuralCapabilities;
|
|
94
150
|
readonly warnings?: readonly string[];
|
|
95
151
|
}
|
|
96
152
|
|
|
97
|
-
/**
|
|
98
|
-
type
|
|
99
|
-
/** Whether a {@link PublicCapabilityId} applies to the detected file in this runtime. */
|
|
100
|
-
interface PublicCapabilitySupport {
|
|
101
|
-
readonly id: PublicCapabilityId;
|
|
102
|
-
readonly supported: boolean;
|
|
103
|
-
readonly warnings?: readonly string[];
|
|
104
|
-
}
|
|
105
|
-
/**
|
|
106
|
-
* Result of {@link getCapabilities}: detected kind, runtime id, per-feature support for this input, and optional global warnings.
|
|
107
|
-
*/
|
|
108
|
-
interface GetCapabilitiesReport {
|
|
109
|
-
readonly kind: FileKind;
|
|
110
|
-
readonly runtime: RuntimeDescriptor;
|
|
111
|
-
readonly capabilities: readonly PublicCapabilitySupport[];
|
|
112
|
-
readonly warnings?: readonly string[];
|
|
113
|
-
}
|
|
114
|
-
|
|
115
|
-
/** Options for {@link explainAnalysisPlan}: shared fields plus optional `ocr` for accurate OCR-step preview. */
|
|
116
|
-
type BrowserExplainAnalysisPlanOptions = ExplainAnalysisPlanOptions & Pick<BrowserAnalyzeOptions, "ocr">;
|
|
153
|
+
/** Options for {@link explainAnalysisPlan}: shared fields plus optional `ocr` / `docx` for accurate step preview. */
|
|
154
|
+
type BrowserExplainAnalysisPlanOptions = ExplainAnalysisPlanOptions & Pick<BrowserAnalyzeOptions, "ocr" | "docx">;
|
|
117
155
|
|
|
118
156
|
/**
|
|
119
157
|
* Epic 1 — **Capabilities:** detects kind from the same hints as `analyzeFile`, then lists which of
|
|
120
|
-
* `text` | `metadata` | `html` | `ocr` | `pages` apply in the browser (PDF always unsupported).
|
|
121
|
-
* No Mammoth/Tesseract/PDF parsing.
|
|
158
|
+
* `text` | `metadata` | `html` | `ocr` | `pages` and image-specific ids (`image-normalization`, `bmp`, `gif-first-frame`, `heic`, `tiff`) apply in the browser (PDF always unsupported).
|
|
159
|
+
* No Mammoth/Tesseract/PDF parsing. For DOCX, {@link GetCapabilitiesReport.docxStructure} / `docxEmbeddedImages` describe v2 opt-in features.
|
|
122
160
|
*/
|
|
123
161
|
declare function getCapabilities(input: BrowserAnalyzeInput, options?: GetCapabilitiesOptions): Promise<GetCapabilitiesReport>;
|
|
124
162
|
/**
|
|
125
163
|
* Epic 1 — **Plan preview:** structured explanation (analyzer, native extraction vs OCR, `limitations`, `plan.steps`)
|
|
126
|
-
* for a {@link DocMindPublicIntent}. Optional `ocr`
|
|
164
|
+
* for a {@link DocMindPublicIntent}. Optional `ocr` refines image steps; optional `docx.include` adds planned OOXML parallel steps for DOCX. No heavy I/O.
|
|
127
165
|
*/
|
|
128
166
|
declare function explainAnalysisPlan(input: BrowserAnalyzeInput, options?: BrowserExplainAnalysisPlanOptions): Promise<ExplainAnalysisPlanReport>;
|
|
129
167
|
|
|
130
|
-
export { BROWSER_PDF_UNSUPPORTED_WARNING, type BrowserAnalyzeInput, type BrowserAnalyzeOptions, type BrowserExplainAnalysisPlanOptions, type BrowserOcrMode, type BrowserOcrOptions, type ExplainAnalysisPlanReport, type GetCapabilitiesReport, type NativeExtractionPlan, type OcrPlan, type PublicCapabilityId, type PublicCapabilitySupport, analyzeFile, convertToHtml, explainAnalysisPlan, extractMetadata, extractText, getCapabilities, runOcr };
|
|
168
|
+
export { BROWSER_PDF_UNSUPPORTED_WARNING, type BrowserAnalyzeDocxOptionsSlice, type BrowserAnalyzeInput, type BrowserAnalyzeOptions, type BrowserExplainAnalysisPlanOptions, type BrowserOcrMode, type BrowserOcrOptions, DOCX_EMBEDDED_IMAGE_CAPABILITIES_BROWSER, DOCX_STRUCTURE_CAPABILITIES_BROWSER, type DocxEmbeddedImageCapabilities, type DocxStructuralCapabilities, type ExplainAnalysisPlanReport, type GetCapabilitiesReport, type NativeExtractionPlan, type OcrPlan, type PublicCapabilityId, type PublicCapabilitySupport, analyzeFile, convertToHtml, docxIncludeRequested, explainAnalysisPlan, extractMetadata, extractText, getCapabilities, runOcr };
|
package/dist/index.js
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
|
-
import { assertValidAnalyzeFileInput, detectFileKind, notImplementedResult, UNKNOWN_FORMAT_WARNING, analyzeText, toUint8Array, InvalidInputError } from '@dragon708/docmind-shared';
|
|
1
|
+
import { assertValidAnalyzeFileInput, detectFileKind, notImplementedResult, UNKNOWN_FORMAT_WARNING, analyzeText, toUint8Array, InvalidInputError, getMimeType } from '@dragon708/docmind-shared';
|
|
2
2
|
export { detectFileKind } from '@dragon708/docmind-shared';
|
|
3
3
|
import { analyzeDocx } from '@dragon708/docmind-docx';
|
|
4
|
-
import {
|
|
4
|
+
import { preprocessHasEffect, resolveImageFormat, normalizeImageForOcr, ocrTiff, ocrImageDetailed } from '@dragon708/docmind-ocr';
|
|
5
5
|
|
|
6
6
|
// src/analyzeFile.ts
|
|
7
7
|
function assertBrowserInput(input) {
|
|
@@ -22,7 +22,41 @@ function prepareBrowserAnalyzeInput(input) {
|
|
|
22
22
|
assertValidAnalyzeFileInput(input);
|
|
23
23
|
return input;
|
|
24
24
|
}
|
|
25
|
-
|
|
25
|
+
|
|
26
|
+
// src/docxBrowserMapper.ts
|
|
27
|
+
function analyzeDocxOptionsFromBrowser(options) {
|
|
28
|
+
const sig = options?.signal;
|
|
29
|
+
const dx = options?.docx;
|
|
30
|
+
if (!dx?.include && !dx?.html && !sig) return void 0;
|
|
31
|
+
const out = { ...dx?.html ?? {} };
|
|
32
|
+
if (dx?.include) out.include = dx.include;
|
|
33
|
+
if (sig) out.signal = sig;
|
|
34
|
+
return out;
|
|
35
|
+
}
|
|
36
|
+
function docxPackageResultToAnalysisResult(r) {
|
|
37
|
+
const base = {
|
|
38
|
+
fileKind: "docx",
|
|
39
|
+
analyzer: "docx",
|
|
40
|
+
status: "ok",
|
|
41
|
+
kind: "docx",
|
|
42
|
+
text: r.text,
|
|
43
|
+
html: r.html,
|
|
44
|
+
warnings: [...r.warnings]
|
|
45
|
+
};
|
|
46
|
+
const v2 = {
|
|
47
|
+
...r.structure !== void 0 ? { structure: r.structure } : {},
|
|
48
|
+
...r.headings !== void 0 ? { headings: r.headings } : {},
|
|
49
|
+
...r.tables !== void 0 ? { tables: r.tables } : {},
|
|
50
|
+
...r.blocks !== void 0 ? { blocks: r.blocks } : {},
|
|
51
|
+
...r.pagesApprox !== void 0 ? { pagesApprox: r.pagesApprox } : {},
|
|
52
|
+
...r.embeddedImages !== void 0 ? { embeddedImages: r.embeddedImages } : {}
|
|
53
|
+
};
|
|
54
|
+
return { ...base, ...v2 };
|
|
55
|
+
}
|
|
56
|
+
|
|
57
|
+
// src/analyzers/docx.ts
|
|
58
|
+
async function analyzeDocxForBrowser(input, options) {
|
|
59
|
+
const signal = options?.signal;
|
|
26
60
|
if (signal?.aborted) {
|
|
27
61
|
const err = new Error("The operation was aborted");
|
|
28
62
|
err.name = "AbortError";
|
|
@@ -40,17 +74,97 @@ async function analyzeDocxForBrowser(input, signal) {
|
|
|
40
74
|
warnings: ["No document bytes were provided for analysis."]
|
|
41
75
|
};
|
|
42
76
|
}
|
|
43
|
-
const
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
77
|
+
const docxOpts = analyzeDocxOptionsFromBrowser(options);
|
|
78
|
+
const r = docxOpts !== void 0 ? await analyzeDocx(data, docxOpts) : await analyzeDocx(data);
|
|
79
|
+
return docxPackageResultToAnalysisResult(r);
|
|
80
|
+
}
|
|
81
|
+
var BROWSER_TIFF_BEST_EFFORT_WARNING = "Browser TIFF support is best-effort: decoding uses UTIF in JavaScript/WebAssembly\u2014some compressions, color modes, very large or multipage files may fail, hang, or exhaust memory. For heavy TIFF workloads use @dragon708/docmind-node (optional sharp).";
|
|
82
|
+
function meanPageConfidence(pages) {
|
|
83
|
+
if (pages.length === 0) return 0;
|
|
84
|
+
return pages.reduce((s, p) => s + p.confidence, 0) / pages.length;
|
|
85
|
+
}
|
|
86
|
+
function rethrowIfAbort(e) {
|
|
87
|
+
if (e instanceof Error && e.name === "AbortError") throw e;
|
|
88
|
+
}
|
|
89
|
+
function ocrFailureWarnings(prefix, e) {
|
|
90
|
+
const msg = e instanceof Error ? e.message : String(e);
|
|
91
|
+
return [`${prefix} ${msg}`];
|
|
92
|
+
}
|
|
93
|
+
async function runRasterOcrForBrowser(data, input, options) {
|
|
94
|
+
const signal = options?.ocr?.signal ?? options?.signal;
|
|
95
|
+
const langs = options?.ocr?.langs;
|
|
96
|
+
const mimeHint = getMimeType(input);
|
|
97
|
+
const format = resolveImageFormat(data, mimeHint);
|
|
98
|
+
if (format === "heic" || format === "heif") {
|
|
99
|
+
const norm = await normalizeImageForOcr(data, { signal, mimeHint });
|
|
100
|
+
return {
|
|
101
|
+
text: "",
|
|
102
|
+
confidence: 0,
|
|
103
|
+
ocrUsed: true,
|
|
104
|
+
warnings: [
|
|
105
|
+
"HEIC/HEIF cannot be OCR'd in the browser; convert to PNG or JPEG server-side (e.g. @dragon708/docmind-node with sharp), then retry.",
|
|
106
|
+
...norm.warnings
|
|
107
|
+
],
|
|
108
|
+
inputFormat: norm.format,
|
|
109
|
+
normalizedFormat: norm.normalizedFormat
|
|
110
|
+
};
|
|
111
|
+
}
|
|
112
|
+
if (format === "tiff") {
|
|
113
|
+
try {
|
|
114
|
+
const tiff = await ocrTiff(data, {
|
|
115
|
+
langs,
|
|
116
|
+
signal,
|
|
117
|
+
maxPages: options?.ocr?.maxPages,
|
|
118
|
+
pageSeparator: options?.ocr?.pageSeparator
|
|
119
|
+
});
|
|
120
|
+
return {
|
|
121
|
+
text: tiff.text.trim(),
|
|
122
|
+
confidence: meanPageConfidence(tiff.textByPage),
|
|
123
|
+
ocrUsed: true,
|
|
124
|
+
warnings: [BROWSER_TIFF_BEST_EFFORT_WARNING, ...tiff.warnings],
|
|
125
|
+
pages: tiff.pagesProcessed,
|
|
126
|
+
textByPage: tiff.textByPage
|
|
127
|
+
};
|
|
128
|
+
} catch (e) {
|
|
129
|
+
rethrowIfAbort(e);
|
|
130
|
+
return {
|
|
131
|
+
text: "",
|
|
132
|
+
confidence: 0,
|
|
133
|
+
ocrUsed: true,
|
|
134
|
+
warnings: [
|
|
135
|
+
BROWSER_TIFF_BEST_EFFORT_WARNING,
|
|
136
|
+
...ocrFailureWarnings("TIFF OCR failed in the browser:", e)
|
|
137
|
+
]
|
|
138
|
+
};
|
|
139
|
+
}
|
|
140
|
+
}
|
|
141
|
+
try {
|
|
142
|
+
const detailed = await ocrImageDetailed(data, {
|
|
143
|
+
langs,
|
|
144
|
+
signal,
|
|
145
|
+
preprocess: options?.ocr?.preprocess
|
|
146
|
+
});
|
|
147
|
+
return {
|
|
148
|
+
text: detailed.text.trim(),
|
|
149
|
+
confidence: detailed.confidence,
|
|
150
|
+
ocrUsed: true,
|
|
151
|
+
warnings: [...detailed.warnings],
|
|
152
|
+
pages: detailed.pages,
|
|
153
|
+
inputFormat: detailed.inputFormat,
|
|
154
|
+
normalizedFormat: detailed.normalizedFormat
|
|
155
|
+
};
|
|
156
|
+
} catch (e) {
|
|
157
|
+
rethrowIfAbort(e);
|
|
158
|
+
return {
|
|
159
|
+
text: "",
|
|
160
|
+
confidence: 0,
|
|
161
|
+
ocrUsed: true,
|
|
162
|
+
warnings: ocrFailureWarnings("OCR could not complete in the browser:", e)
|
|
163
|
+
};
|
|
164
|
+
}
|
|
53
165
|
}
|
|
166
|
+
|
|
167
|
+
// src/analyzers/image.ts
|
|
54
168
|
var OCR_OFF_WARNING = 'OCR mode is "off"; no recognition was run. Use mode "auto" or "force" to extract text from images.';
|
|
55
169
|
function resolveOcrMode(options) {
|
|
56
170
|
return options?.ocr?.mode ?? "auto";
|
|
@@ -87,21 +201,26 @@ async function analyzeImageForBrowser(input, options) {
|
|
|
87
201
|
warnings: [OCR_OFF_WARNING]
|
|
88
202
|
};
|
|
89
203
|
}
|
|
90
|
-
const
|
|
91
|
-
|
|
92
|
-
signal: options?.ocr?.signal ?? options?.signal
|
|
93
|
-
};
|
|
94
|
-
const r = await ocr(data, ocrOpts);
|
|
95
|
-
return {
|
|
204
|
+
const ocrPart = await runRasterOcrForBrowser(data, input, options);
|
|
205
|
+
const base = {
|
|
96
206
|
fileKind: "image",
|
|
97
207
|
analyzer: "image",
|
|
98
208
|
status: "ok",
|
|
99
209
|
kind: "image",
|
|
100
|
-
text:
|
|
101
|
-
confidence:
|
|
102
|
-
ocrUsed:
|
|
103
|
-
warnings:
|
|
210
|
+
text: ocrPart.text,
|
|
211
|
+
confidence: ocrPart.confidence,
|
|
212
|
+
ocrUsed: true,
|
|
213
|
+
warnings: ocrPart.warnings
|
|
104
214
|
};
|
|
215
|
+
const extra = {};
|
|
216
|
+
if (ocrPart.pages !== void 0) extra.pages = ocrPart.pages;
|
|
217
|
+
if (ocrPart.textByPage !== void 0) extra.textByPage = ocrPart.textByPage;
|
|
218
|
+
if (ocrPart.inputFormat !== void 0) extra.inputFormat = ocrPart.inputFormat;
|
|
219
|
+
if (ocrPart.normalizedFormat !== void 0) extra.normalizedFormat = ocrPart.normalizedFormat;
|
|
220
|
+
if (Object.keys(extra).length > 0) {
|
|
221
|
+
return { ...base, ...extra };
|
|
222
|
+
}
|
|
223
|
+
return base;
|
|
105
224
|
}
|
|
106
225
|
|
|
107
226
|
// src/analyzeFile.ts
|
|
@@ -120,7 +239,7 @@ async function analyzeFile(input, options) {
|
|
|
120
239
|
case "pdf":
|
|
121
240
|
return notImplementedResult("pdf", "pdf", [BROWSER_PDF_UNSUPPORTED_WARNING]);
|
|
122
241
|
case "docx":
|
|
123
|
-
return analyzeDocxForBrowser(bytesInput, options
|
|
242
|
+
return analyzeDocxForBrowser(bytesInput, options);
|
|
124
243
|
case "image":
|
|
125
244
|
return analyzeImageForBrowser(bytesInput, options);
|
|
126
245
|
case "text":
|
|
@@ -129,7 +248,7 @@ async function analyzeFile(input, options) {
|
|
|
129
248
|
return notImplementedResult(fileKind, "none", [UNKNOWN_FORMAT_WARNING]);
|
|
130
249
|
}
|
|
131
250
|
}
|
|
132
|
-
var DOCX_METADATA_STUB = "Structured document metadata for DOCX is not
|
|
251
|
+
var DOCX_METADATA_STUB = "Structured document metadata for DOCX is not returned by extractMetadata; use analyzeFile, extractText, or convertToHtml with options.docx.include for OOXML structure, headings, tables, blocks, approximate pages, and embedded images.";
|
|
133
252
|
var IMAGE_METADATA_NOTE = "Raster images have no document metadata bundle in this API.";
|
|
134
253
|
function escapeHtmlMinimal(s) {
|
|
135
254
|
return s.replace(/&/g, "&amp;").replace(/</g, "&lt;").replace(/>/g, "&gt;").replace(/"/g, "&quot;");
|
|
@@ -243,19 +362,16 @@ async function runOcr(input, options) {
|
|
|
243
362
|
warnings: ["No document bytes were provided for analysis."]
|
|
244
363
|
};
|
|
245
364
|
}
|
|
246
|
-
const
|
|
247
|
-
|
|
248
|
-
|
|
249
|
-
|
|
250
|
-
status: "ok",
|
|
251
|
-
kind: "docx",
|
|
252
|
-
text: r.text,
|
|
253
|
-
html: r.html,
|
|
365
|
+
const opt = analyzeDocxOptionsFromBrowser(options);
|
|
366
|
+
const raw = opt !== void 0 ? await analyzeDocx(data, opt) : await analyzeDocx(data);
|
|
367
|
+
const withNote = {
|
|
368
|
+
...raw,
|
|
254
369
|
warnings: [
|
|
255
|
-
...
|
|
370
|
+
...raw.warnings,
|
|
256
371
|
"OCR does not apply to DOCX; returned structured text/HTML extract."
|
|
257
372
|
]
|
|
258
373
|
};
|
|
374
|
+
return docxPackageResultToAnalysisResult(withNote);
|
|
259
375
|
}
|
|
260
376
|
case "text":
|
|
261
377
|
return analyzeText(bytesInput, { signal });
|
|
@@ -264,16 +380,164 @@ async function runOcr(input, options) {
|
|
|
264
380
|
}
|
|
265
381
|
}
|
|
266
382
|
|
|
383
|
+
// src/capabilityReport.ts
|
|
384
|
+
function docxIncludeRequested(flags) {
|
|
385
|
+
if (!flags) return false;
|
|
386
|
+
return !!(flags.structure || flags.headings || flags.tables || flags.blocks || flags.pagesApprox || flags.embeddedImages);
|
|
387
|
+
}
|
|
388
|
+
var DOCX_EMBEDDED_IMAGE_CAPABILITIES_BROWSER = {
|
|
389
|
+
canExtractEmbeddedImages: true,
|
|
390
|
+
documentsMayIncludeImagesRequiringWebConversion: true,
|
|
391
|
+
webFriendlyRasterConversionInBrowser: false,
|
|
392
|
+
notes: [
|
|
393
|
+
"`extractImagesFromDocx` from `@dragon708/docmind-docx` runs in-browser on the same ZIP bytes as Mammoth.",
|
|
394
|
+
"PNG, JPEG, GIF, WebP, SVG, TIFF, BMP, ICO are browser-embeddable; EMF/WMF need an external converter or server-side Node tooling."
|
|
395
|
+
]
|
|
396
|
+
};
|
|
397
|
+
var DOCX_STRUCTURE_CAPABILITIES_BROWSER = {
|
|
398
|
+
ooxmlExtractorsAvailable: true,
|
|
399
|
+
activatedViaDocxInclude: true,
|
|
400
|
+
features: [
|
|
401
|
+
"OOXML structure (body blocks)",
|
|
402
|
+
"headings",
|
|
403
|
+
"tables",
|
|
404
|
+
"semantic blocks",
|
|
405
|
+
"approximate pages (OOXML page-break hints)",
|
|
406
|
+
"embedded images (word/media; mode web/both still browser-safe but EMF/WMF stay non-raster without a converter)"
|
|
407
|
+
],
|
|
408
|
+
notes: [
|
|
409
|
+
"Use options.docx.include on analyzeFile, extractText, convertToHtml, or runOcr to merge Mammoth with selected `@dragon708/docmind-docx` extractors.",
|
|
410
|
+
"extractMetadata for DOCX stays a stub in the browser facade."
|
|
411
|
+
]
|
|
412
|
+
};
|
|
413
|
+
var DOCX_META = "Structured document metadata is not exposed separately in the browser runtime; extractMetadata returns a stub for DOCX.";
|
|
414
|
+
var IMAGE_META = "Raster images have no document metadata bundle; extractMetadata returns a stub.";
|
|
415
|
+
var IMAGE_HTML = "No layout HTML for raster images; use extractText or runOcr for text.";
|
|
416
|
+
var TEXT_META_NOTE = "Plain text has no structured document metadata; extractMetadata still returns decoded content.";
|
|
417
|
+
var OCR_OFF_NOTE = 'Image OCR may be skipped when `ocr.mode` is "off" in analyze options.';
|
|
418
|
+
var UNKNOWN_KIND = "Could not determine file kind from name, MIME, or bytes; all features are reported as unsupported until the kind is known.";
|
|
419
|
+
function slot(id, supported, warnings) {
|
|
420
|
+
return warnings?.length ? { id, supported, warnings } : { id, supported };
|
|
421
|
+
}
|
|
422
|
+
function buildBrowserCapabilityReport(kind) {
|
|
423
|
+
const runtime = { id: "browser" };
|
|
424
|
+
const pdf = BROWSER_PDF_UNSUPPORTED_WARNING;
|
|
425
|
+
let capabilities;
|
|
426
|
+
const topWarnings = [];
|
|
427
|
+
switch (kind) {
|
|
428
|
+
case "pdf":
|
|
429
|
+
capabilities = [
|
|
430
|
+
slot("text", false, [pdf]),
|
|
431
|
+
slot("metadata", false, [pdf]),
|
|
432
|
+
slot("html", false, [pdf]),
|
|
433
|
+
slot("ocr", false, [pdf]),
|
|
434
|
+
slot("pages", false, [pdf])
|
|
435
|
+
];
|
|
436
|
+
break;
|
|
437
|
+
case "docx":
|
|
438
|
+
capabilities = [
|
|
439
|
+
slot("text", true, [
|
|
440
|
+
"Mammoth plain text in analyzeFile; extractText clears html. Optional OOXML fields merge when options.docx.include is set."
|
|
441
|
+
]),
|
|
442
|
+
slot("metadata", false, [
|
|
443
|
+
`${DOCX_META} Use analyzeFile-style routes with options.docx.include for OOXML structure, headings, tables, blocks, approximate pages, and embedded images.`
|
|
444
|
+
]),
|
|
445
|
+
slot("html", true, [
|
|
446
|
+
"Mammoth HTML in-browser; docxImagesAsDataUri for web-safe images; EMF/WMF placeholders in HTML unless you handle media separately."
|
|
447
|
+
]),
|
|
448
|
+
slot("ocr", false, ["OCR does not apply to DOCX in DocMind."]),
|
|
449
|
+
slot("pages", false, [
|
|
450
|
+
"No PDF page count; approximate DOCX pages via options.docx.include.pagesApprox (heuristic, not print layout)."
|
|
451
|
+
])
|
|
452
|
+
];
|
|
453
|
+
break;
|
|
454
|
+
case "image":
|
|
455
|
+
capabilities = [
|
|
456
|
+
slot("text", true, [
|
|
457
|
+
"Text via `@dragon708/docmind-ocr` when `ocr.mode` is not off: PNG, JPEG, WebP, BMP, GIF (first frame), TIFF (partial), after sniff/MIME."
|
|
458
|
+
]),
|
|
459
|
+
slot("metadata", false, [IMAGE_META]),
|
|
460
|
+
slot("html", false, [IMAGE_HTML]),
|
|
461
|
+
slot("ocr", true, [
|
|
462
|
+
OCR_OFF_NOTE,
|
|
463
|
+
"Uses `ocrImageDetailed` (single-frame path) or multipage `ocrTiff` for TIFF; WASM Tesseract in-browser."
|
|
464
|
+
]),
|
|
465
|
+
slot("image-normalization", true, [
|
|
466
|
+
"`normalizeImageForOcr` runs inside the OCR package (canvas/`createImageBitmap` in-browser for BMP, GIF, etc.; not HEIC)."
|
|
467
|
+
]),
|
|
468
|
+
slot("bmp", true, [
|
|
469
|
+
"BMP is decoded via browser canvas/`createImageBitmap` into a PNG-oriented buffer before Tesseract."
|
|
470
|
+
]),
|
|
471
|
+
slot("gif-first-frame", true, [
|
|
472
|
+
"Animated GIF: only the first decoded frame is normalized and OCR'd; see result warnings when multi-frame is detected."
|
|
473
|
+
]),
|
|
474
|
+
slot("heic", false, [
|
|
475
|
+
"HEIC/HEIF is not decoded in the browser. `runOcr` / `analyzeFile` return empty text with explicit warnings; convert server-side (e.g. @dragon708/docmind-node)."
|
|
476
|
+
]),
|
|
477
|
+
slot("tiff", true, [
|
|
478
|
+
"Partial / best-effort: multipage `ocrTiff` with UTIF in JS/WASM\u2014not all compressions or huge files; prefer Node for production TIFF."
|
|
479
|
+
]),
|
|
480
|
+
slot("pages", true, [
|
|
481
|
+
"Multipage TIFF may populate `pages` and `textByPage` when OCR succeeds; other formats may expose `pages` when normalization reports it."
|
|
482
|
+
])
|
|
483
|
+
];
|
|
484
|
+
break;
|
|
485
|
+
case "text":
|
|
486
|
+
capabilities = [
|
|
487
|
+
slot("text", true),
|
|
488
|
+
slot("metadata", true, [TEXT_META_NOTE]),
|
|
489
|
+
slot("html", true),
|
|
490
|
+
slot("ocr", false, ["OCR does not apply to plain text files."]),
|
|
491
|
+
slot("pages", false)
|
|
492
|
+
];
|
|
493
|
+
break;
|
|
494
|
+
default:
|
|
495
|
+
topWarnings.push(UNKNOWN_KIND);
|
|
496
|
+
capabilities = [
|
|
497
|
+
slot("text", false),
|
|
498
|
+
slot("metadata", false),
|
|
499
|
+
slot("html", false),
|
|
500
|
+
slot("ocr", false),
|
|
501
|
+
slot("pages", false)
|
|
502
|
+
];
|
|
503
|
+
}
|
|
504
|
+
return {
|
|
505
|
+
kind,
|
|
506
|
+
runtime,
|
|
507
|
+
capabilities,
|
|
508
|
+
...kind === "docx" ? {
|
|
509
|
+
docxEmbeddedImages: DOCX_EMBEDDED_IMAGE_CAPABILITIES_BROWSER,
|
|
510
|
+
docxStructure: DOCX_STRUCTURE_CAPABILITIES_BROWSER
|
|
511
|
+
} : {},
|
|
512
|
+
warnings: topWarnings.length > 0 ? topWarnings : void 0
|
|
513
|
+
};
|
|
514
|
+
}
|
|
515
|
+
|
|
267
516
|
// src/analysisPlanReport.ts
|
|
268
517
|
function lim(...items) {
|
|
269
518
|
return items.filter(Boolean);
|
|
270
519
|
}
|
|
271
|
-
|
|
520
|
+
var DOCX_MAMMOTH_PLUS_OPTIONAL_BROWSER = "Mammoth (`analyzeDocx`) extracts text and HTML from OOXML in-browser; optional parallel OOXML/ZIP extractors run when options.docx.include is set.";
|
|
521
|
+
var DOCX_ZIP_NOTE_BROWSER = "Embedded files under word/media are available via @dragon708/docmind-docx when options.docx.include requests embeddedImages (or call extractImagesFromDocx on the same bytes).";
|
|
522
|
+
var BROWSER_TIFF_RASTER_NOTE = "TIFF (if detected): multipage OCR uses `ocrTiff` with UTIF in-browser\u2014best-effort only; failures return empty text + warnings (no throw). Prefer `@dragon708/docmind-node` for production TIFF.";
|
|
523
|
+
var BROWSER_IMAGE_PIPELINE = "Browser raster OCR: sniff format \u2192 `normalizeImageForOcr` (canvas/`createImageBitmap` for BMP/GIF; not HEIC) \u2192 optional `preprocessImageForOcr` when `options.ocr.preprocess` applies \u2192 Tesseract via `ocrImageDetailed`, or `ocrTiff` for TIFF. HEIC/HEIF: no decode\u2014expect empty text and explicit warnings. GIF: first frame only.";
|
|
524
|
+
var BROWSER_HEIC_NOTE = "HEIC/HEIF is never decoded in-browser; there is no `sharp` dependency. Convert server-side, then OCR PNG/JPEG bytes.";
|
|
525
|
+
function finalizeBrowserDocxExplainReport(report) {
|
|
526
|
+
if (report.kind !== "docx") return report;
|
|
527
|
+
const limitations = report.limitations.includes(DOCX_ZIP_NOTE_BROWSER) ? report.limitations : [...report.limitations, DOCX_ZIP_NOTE_BROWSER];
|
|
528
|
+
return {
|
|
529
|
+
...report,
|
|
530
|
+
docxEmbeddedImages: DOCX_EMBEDDED_IMAGE_CAPABILITIES_BROWSER,
|
|
531
|
+
docxStructure: DOCX_STRUCTURE_CAPABILITIES_BROWSER,
|
|
532
|
+
limitations
|
|
533
|
+
};
|
|
534
|
+
}
|
|
535
|
+
function buildBrowserExplainReport(kind, intent, ocrMode, plan, docxInclude, ocrSlice) {
|
|
272
536
|
const runtime = { id: "browser" };
|
|
273
537
|
const imageOcrActive = ocrMode !== "off";
|
|
274
538
|
let primaryAnalyzer = kind === "pdf" ? "pdf" : kind === "docx" ? "docx" : kind === "image" ? "image" : kind === "text" ? "text" : "none";
|
|
275
539
|
let nativeExtraction;
|
|
276
|
-
let
|
|
540
|
+
let ocr;
|
|
277
541
|
let limitations = [];
|
|
278
542
|
const ocrOffNote = ocrMode === "off" ? 'Image OCR is skipped when ocr.mode is "off".' : "";
|
|
279
543
|
if (kind === "pdf") {
|
|
@@ -282,40 +546,40 @@ function buildBrowserExplainReport(kind, intent, ocrMode, plan) {
|
|
|
282
546
|
willAttempt: false,
|
|
283
547
|
description: "PDF is not processed in the browser runtime; use @dragon708/docmind-node."
|
|
284
548
|
};
|
|
285
|
-
|
|
549
|
+
ocr = {
|
|
286
550
|
mayUse: false,
|
|
287
551
|
description: "PDF OCR is not available in the browser."
|
|
288
552
|
};
|
|
289
|
-
return {
|
|
553
|
+
return finalizeBrowserDocxExplainReport({
|
|
290
554
|
kind,
|
|
291
555
|
detectedKind: kind,
|
|
292
556
|
runtime,
|
|
293
557
|
intent,
|
|
294
558
|
primaryAnalyzer: "pdf",
|
|
295
559
|
nativeExtraction,
|
|
296
|
-
ocr
|
|
560
|
+
ocr,
|
|
297
561
|
limitations,
|
|
298
562
|
plan,
|
|
299
563
|
warnings: [BROWSER_PDF_UNSUPPORTED_WARNING]
|
|
300
|
-
};
|
|
564
|
+
});
|
|
301
565
|
}
|
|
302
566
|
if (kind === "unknown") {
|
|
303
567
|
limitations = lim(
|
|
304
568
|
"Could not classify the file from name, MIME, or bytes; analysis will return not_implemented until hints improve."
|
|
305
569
|
);
|
|
306
570
|
nativeExtraction = { willAttempt: false, description: "No analyzer selected without a known file kind." };
|
|
307
|
-
|
|
308
|
-
return {
|
|
571
|
+
ocr = { mayUse: false, description: "OCR is not used for unknown kinds." };
|
|
572
|
+
return finalizeBrowserDocxExplainReport({
|
|
309
573
|
kind,
|
|
310
574
|
detectedKind: kind,
|
|
311
575
|
runtime,
|
|
312
576
|
intent,
|
|
313
577
|
primaryAnalyzer: "none",
|
|
314
578
|
nativeExtraction,
|
|
315
|
-
ocr
|
|
579
|
+
ocr,
|
|
316
580
|
limitations,
|
|
317
581
|
plan
|
|
318
|
-
};
|
|
582
|
+
});
|
|
319
583
|
}
|
|
320
584
|
switch (intent) {
|
|
321
585
|
case "analyzeFile":
|
|
@@ -323,25 +587,25 @@ function buildBrowserExplainReport(kind, intent, ocrMode, plan) {
|
|
|
323
587
|
if (kind === "docx") {
|
|
324
588
|
nativeExtraction = {
|
|
325
589
|
willAttempt: true,
|
|
326
|
-
description: "Mammoth
|
|
590
|
+
description: docxIncludeRequested(docxInclude) ? "Mammoth plus parallel OOXML extractors (per options.docx.include)." + (intent === "extractText" ? " HTML cleared in extractText." : "") : DOCX_MAMMOTH_PLUS_OPTIONAL_BROWSER + (intent === "extractText" ? " HTML omitted in extractText." : "")
|
|
327
591
|
};
|
|
328
|
-
|
|
592
|
+
ocr = { mayUse: false, description: "DOCX does not use OCR in DocMind." };
|
|
329
593
|
} else if (kind === "image") {
|
|
330
594
|
nativeExtraction = {
|
|
331
595
|
willAttempt: false,
|
|
332
|
-
description:
|
|
596
|
+
description: BROWSER_IMAGE_PIPELINE
|
|
333
597
|
};
|
|
334
|
-
|
|
598
|
+
ocr = {
|
|
335
599
|
mayUse: imageOcrActive,
|
|
336
|
-
description: imageOcrActive ? "
|
|
600
|
+
description: imageOcrActive ? "`ocrImageDetailed` (normalize + optional preprocess) for single-frame paths; TIFF \u2192 `ocrTiff` (UTIF, partial). HEIC unsupported in-browser." : "OCR skipped while ocr.mode is off."
|
|
337
601
|
};
|
|
338
|
-
limitations = lim(ocrOffNote);
|
|
602
|
+
limitations = lim(ocrOffNote, BROWSER_TIFF_RASTER_NOTE, BROWSER_HEIC_NOTE);
|
|
339
603
|
} else {
|
|
340
604
|
nativeExtraction = {
|
|
341
605
|
willAttempt: true,
|
|
342
606
|
description: "Plain text is decoded as UTF-8 (BOM stripped, replacement on invalid bytes)."
|
|
343
607
|
};
|
|
344
|
-
|
|
608
|
+
ocr = { mayUse: false, description: "OCR does not apply to text files." };
|
|
345
609
|
}
|
|
346
610
|
break;
|
|
347
611
|
case "extractMetadata":
|
|
@@ -350,16 +614,16 @@ function buildBrowserExplainReport(kind, intent, ocrMode, plan) {
|
|
|
350
614
|
willAttempt: false,
|
|
351
615
|
description: "No heavy extractor; extractMetadata returns a stub with guidance."
|
|
352
616
|
};
|
|
353
|
-
|
|
617
|
+
ocr = { mayUse: false, description: "OCR is not invoked for this metadata path." };
|
|
354
618
|
limitations = lim(
|
|
355
|
-
kind === "docx" ? "Structured DOCX metadata is not exposed separately in the browser." : "Raster images have no document metadata bundle
|
|
619
|
+
kind === "docx" ? "Structured DOCX metadata is not exposed separately in the browser; use analyzeFile / extractText / convertToHtml with options.docx.include for OOXML fields." : "Raster images have no document metadata bundle. TIFF/HEIC caveats: see getCapabilities (`tiff` partial, `heic` unsupported) and runOcr warnings."
|
|
356
620
|
);
|
|
357
621
|
} else {
|
|
358
622
|
nativeExtraction = {
|
|
359
623
|
willAttempt: true,
|
|
360
624
|
description: "Plain text is decoded; metadata is limited to decoded content."
|
|
361
625
|
};
|
|
362
|
-
|
|
626
|
+
ocr = { mayUse: false, description: "OCR does not apply." };
|
|
363
627
|
limitations = lim("Plain text has no structured document metadata.");
|
|
364
628
|
}
|
|
365
629
|
break;
|
|
@@ -367,23 +631,23 @@ function buildBrowserExplainReport(kind, intent, ocrMode, plan) {
|
|
|
367
631
|
if (kind === "docx") {
|
|
368
632
|
nativeExtraction = {
|
|
369
633
|
willAttempt: true,
|
|
370
|
-
description: "Mammoth
|
|
634
|
+
description: docxIncludeRequested(docxInclude) ? "Mammoth HTML via analyzeFile plus optional OOXML extractors." : "Mammoth HTML via analyzeFile; optional OOXML v2 when options.docx.include is set."
|
|
371
635
|
};
|
|
372
|
-
|
|
636
|
+
ocr = { mayUse: false, description: "DOCX path does not use OCR." };
|
|
373
637
|
} else if (kind === "text") {
|
|
374
638
|
nativeExtraction = {
|
|
375
639
|
willAttempt: true,
|
|
376
640
|
description: "UTF-8 decode then wrap in a <pre> element."
|
|
377
641
|
};
|
|
378
|
-
|
|
642
|
+
ocr = { mayUse: false, description: "OCR does not apply." };
|
|
379
643
|
} else {
|
|
380
644
|
nativeExtraction = {
|
|
381
645
|
willAttempt: false,
|
|
382
646
|
description: "No rich HTML path for this kind in the browser."
|
|
383
647
|
};
|
|
384
|
-
|
|
648
|
+
ocr = { mayUse: false, description: "OCR does not produce layout HTML here." };
|
|
385
649
|
limitations = lim(
|
|
386
|
-
kind === "image" ? "Raster images have no HTML representation; use extractText or runOcr." : ""
|
|
650
|
+
kind === "image" ? "Raster images have no HTML representation; use extractText or runOcr. Expect HEIC to yield warnings only; TIFF is best-effort." : ""
|
|
387
651
|
);
|
|
388
652
|
}
|
|
389
653
|
break;
|
|
@@ -391,120 +655,77 @@ function buildBrowserExplainReport(kind, intent, ocrMode, plan) {
|
|
|
391
655
|
if (kind === "image") {
|
|
392
656
|
nativeExtraction = {
|
|
393
657
|
willAttempt: false,
|
|
394
|
-
description:
|
|
658
|
+
description: BROWSER_IMAGE_PIPELINE
|
|
395
659
|
};
|
|
396
|
-
|
|
660
|
+
ocr = {
|
|
397
661
|
mayUse: imageOcrActive,
|
|
398
|
-
description: imageOcrActive ? "
|
|
662
|
+
description: imageOcrActive ? "Same as analyzeFile: `normalizeImageForOcr` inside `ocrImageDetailed`, optional canvas preprocess, or `ocrTiff` for TIFF. Errors surface as warnings, not uncaught exceptions (except abort)." : "OCR skipped while ocr.mode is off."
|
|
399
663
|
};
|
|
400
|
-
limitations = lim(ocrOffNote);
|
|
664
|
+
limitations = lim(ocrOffNote, BROWSER_TIFF_RASTER_NOTE, BROWSER_HEIC_NOTE);
|
|
401
665
|
} else if (kind === "docx") {
|
|
402
666
|
nativeExtraction = {
|
|
403
667
|
willAttempt: true,
|
|
404
|
-
description: "Mammoth
|
|
668
|
+
description: docxIncludeRequested(docxInclude) ? "Mammoth text/HTML plus optional OOXML extractors; not OCR." : "Mammoth text/HTML; optional OOXML v2 via options.docx.include; not OCR."
|
|
405
669
|
};
|
|
406
|
-
|
|
670
|
+
ocr = { mayUse: false, description: "DOCX is not OCR'd." };
|
|
407
671
|
limitations = lim("Returned content is structured extract, not OCR output.");
|
|
408
672
|
} else {
|
|
409
673
|
nativeExtraction = {
|
|
410
674
|
willAttempt: true,
|
|
411
675
|
description: "Plain text is UTF-8 decoded only."
|
|
412
676
|
};
|
|
413
|
-
|
|
677
|
+
ocr = { mayUse: false, description: "OCR does not apply to text files." };
|
|
414
678
|
}
|
|
415
679
|
break;
|
|
416
680
|
default:
|
|
417
681
|
nativeExtraction = { willAttempt: false, description: "Intent not specialized in this runtime." };
|
|
418
|
-
|
|
682
|
+
ocr = { mayUse: false, description: "See plan steps." };
|
|
419
683
|
}
|
|
420
|
-
|
|
684
|
+
if (kind === "image" && preprocessHasEffect(ocrSlice?.preprocess)) {
|
|
685
|
+
limitations = [
|
|
686
|
+
...limitations,
|
|
687
|
+
"options.ocr.preprocess applies to the `ocrImageDetailed` path only; multipage TIFF (`ocrTiff`) does not run preprocess per frame."
|
|
688
|
+
];
|
|
689
|
+
}
|
|
690
|
+
return finalizeBrowserDocxExplainReport({
|
|
421
691
|
kind,
|
|
422
692
|
detectedKind: kind,
|
|
423
693
|
runtime,
|
|
424
694
|
intent,
|
|
425
695
|
primaryAnalyzer,
|
|
426
696
|
nativeExtraction,
|
|
427
|
-
ocr
|
|
697
|
+
ocr,
|
|
428
698
|
limitations,
|
|
429
699
|
plan
|
|
430
|
-
};
|
|
700
|
+
});
|
|
431
701
|
}
|
|
432
702
|
|
|
433
|
-
// src/
|
|
434
|
-
|
|
435
|
-
|
|
436
|
-
var IMAGE_HTML = "No layout HTML for raster images; use extractText or runOcr for text.";
|
|
437
|
-
var TEXT_META_NOTE = "Plain text has no structured document metadata; extractMetadata still returns decoded content.";
|
|
438
|
-
var OCR_OFF_NOTE = 'Image OCR may be skipped when `ocr.mode` is "off" in analyze options.';
|
|
439
|
-
var UNKNOWN_KIND = "Could not determine file kind from name, MIME, or bytes; all features are reported as unsupported until the kind is known.";
|
|
440
|
-
function slot(id, supported, warnings) {
|
|
441
|
-
return warnings?.length ? { id, supported, warnings } : { id, supported };
|
|
703
|
+
// src/introspection.ts
|
|
704
|
+
function resolveOcrMode2(ocr) {
|
|
705
|
+
return ocr?.mode ?? "auto";
|
|
442
706
|
}
|
|
443
|
-
function
|
|
444
|
-
|
|
445
|
-
|
|
446
|
-
|
|
447
|
-
|
|
448
|
-
|
|
449
|
-
|
|
450
|
-
|
|
451
|
-
|
|
452
|
-
slot("metadata", false, [pdf]),
|
|
453
|
-
slot("html", false, [pdf]),
|
|
454
|
-
slot("ocr", false, [pdf]),
|
|
455
|
-
slot("pages", false, [pdf])
|
|
456
|
-
];
|
|
457
|
-
break;
|
|
458
|
-
case "docx":
|
|
459
|
-
capabilities = [
|
|
460
|
-
slot("text", true),
|
|
461
|
-
slot("metadata", false, [DOCX_META]),
|
|
462
|
-
slot("html", true),
|
|
463
|
-
slot("ocr", false, ["OCR does not apply to DOCX in DocMind."]),
|
|
464
|
-
slot("pages", false)
|
|
465
|
-
];
|
|
466
|
-
break;
|
|
467
|
-
case "image":
|
|
468
|
-
capabilities = [
|
|
469
|
-
slot("text", true, ["Text is obtained via OCR when enabled."]),
|
|
470
|
-
slot("metadata", false, [IMAGE_META]),
|
|
471
|
-
slot("html", false, [IMAGE_HTML]),
|
|
472
|
-
slot("ocr", true, [OCR_OFF_NOTE]),
|
|
473
|
-
slot("pages", false)
|
|
474
|
-
];
|
|
475
|
-
break;
|
|
476
|
-
case "text":
|
|
477
|
-
capabilities = [
|
|
478
|
-
slot("text", true),
|
|
479
|
-
slot("metadata", true, [TEXT_META_NOTE]),
|
|
480
|
-
slot("html", true),
|
|
481
|
-
slot("ocr", false, ["OCR does not apply to plain text files."]),
|
|
482
|
-
slot("pages", false)
|
|
483
|
-
];
|
|
484
|
-
break;
|
|
485
|
-
default:
|
|
486
|
-
topWarnings.push(UNKNOWN_KIND);
|
|
487
|
-
capabilities = [
|
|
488
|
-
slot("text", false),
|
|
489
|
-
slot("metadata", false),
|
|
490
|
-
slot("html", false),
|
|
491
|
-
slot("ocr", false),
|
|
492
|
-
slot("pages", false)
|
|
493
|
-
];
|
|
707
|
+
function imageBrowserPlanSteps(ocrMode, ocr) {
|
|
708
|
+
if (ocrMode === "off") {
|
|
709
|
+
return [
|
|
710
|
+
{ id: "detect_kind", status: "done" },
|
|
711
|
+
{ id: "image_format_detect", status: "skipped" },
|
|
712
|
+
{ id: "normalize_image_for_ocr", status: "skipped" },
|
|
713
|
+
{ id: "preprocess_image_for_ocr", status: "skipped" },
|
|
714
|
+
{ id: "tesseract_ocr", status: "skipped" }
|
|
715
|
+
];
|
|
494
716
|
}
|
|
495
|
-
return
|
|
496
|
-
|
|
497
|
-
|
|
498
|
-
|
|
499
|
-
|
|
500
|
-
|
|
501
|
-
|
|
502
|
-
|
|
503
|
-
|
|
504
|
-
|
|
505
|
-
return ocr2?.mode ?? "auto";
|
|
717
|
+
return [
|
|
718
|
+
{ id: "detect_kind", status: "done" },
|
|
719
|
+
{ id: "image_format_detect", status: "planned" },
|
|
720
|
+
{ id: "normalize_image_for_ocr", status: "planned" },
|
|
721
|
+
{
|
|
722
|
+
id: "preprocess_image_for_ocr",
|
|
723
|
+
status: preprocessHasEffect(ocr?.preprocess) ? "planned" : "skipped"
|
|
724
|
+
},
|
|
725
|
+
{ id: "tesseract_ocr", status: "planned" }
|
|
726
|
+
];
|
|
506
727
|
}
|
|
507
|
-
function planForAnalyzeFile(kind, ocrMode) {
|
|
728
|
+
function planForAnalyzeFile(kind, ocrMode, docxInclude, ocr) {
|
|
508
729
|
switch (kind) {
|
|
509
730
|
case "pdf":
|
|
510
731
|
return {
|
|
@@ -514,24 +735,21 @@ function planForAnalyzeFile(kind, ocrMode) {
|
|
|
514
735
|
{ id: "pdf_pipeline", status: "skipped" }
|
|
515
736
|
]
|
|
516
737
|
};
|
|
517
|
-
case "docx":
|
|
738
|
+
case "docx": {
|
|
739
|
+
const parallel = docxIncludeRequested(docxInclude);
|
|
518
740
|
return {
|
|
519
741
|
intent: "analyzeFile",
|
|
520
742
|
steps: [
|
|
521
743
|
{ id: "detect_kind", status: "done" },
|
|
522
|
-
{ id: "docx_mammoth", status: "planned" }
|
|
744
|
+
{ id: "docx_mammoth", status: "planned" },
|
|
745
|
+
...parallel ? [{ id: "docx_ooxml_parallel", status: "planned" }] : []
|
|
523
746
|
]
|
|
524
747
|
};
|
|
748
|
+
}
|
|
525
749
|
case "image":
|
|
526
750
|
return {
|
|
527
751
|
intent: "analyzeFile",
|
|
528
|
-
steps:
|
|
529
|
-
{ id: "detect_kind", status: "done" },
|
|
530
|
-
{
|
|
531
|
-
id: "image_ocr",
|
|
532
|
-
status: ocrMode === "off" ? "skipped" : "planned"
|
|
533
|
-
}
|
|
534
|
-
]
|
|
752
|
+
steps: imageBrowserPlanSteps(ocrMode, ocr)
|
|
535
753
|
};
|
|
536
754
|
case "text":
|
|
537
755
|
return {
|
|
@@ -548,11 +766,11 @@ function planForAnalyzeFile(kind, ocrMode) {
|
|
|
548
766
|
};
|
|
549
767
|
}
|
|
550
768
|
}
|
|
551
|
-
function planForIntent(intentOpt, kind, ocrMode) {
|
|
769
|
+
function planForIntent(intentOpt, kind, ocrMode, docxInclude, ocr) {
|
|
552
770
|
const intent = intentOpt ?? "analyzeFile";
|
|
553
|
-
if (intent === "analyzeFile") return planForAnalyzeFile(kind, ocrMode);
|
|
771
|
+
if (intent === "analyzeFile") return planForAnalyzeFile(kind, ocrMode, docxInclude, ocr);
|
|
554
772
|
if (intent === "extractText") {
|
|
555
|
-
const base = planForAnalyzeFile(kind, ocrMode);
|
|
773
|
+
const base = planForAnalyzeFile(kind, ocrMode, docxInclude, ocr);
|
|
556
774
|
return { ...base, intent: "extractText" };
|
|
557
775
|
}
|
|
558
776
|
if (intent === "extractMetadata") {
|
|
@@ -575,11 +793,13 @@ function planForIntent(intentOpt, kind, ocrMode) {
|
|
|
575
793
|
}
|
|
576
794
|
if (intent === "convertToHtml") {
|
|
577
795
|
if (kind === "docx") {
|
|
796
|
+
const parallel = docxIncludeRequested(docxInclude);
|
|
578
797
|
return {
|
|
579
798
|
intent: "convertToHtml",
|
|
580
799
|
steps: [
|
|
581
800
|
{ id: "detect_kind", status: "done" },
|
|
582
|
-
{ id: "docx_mammoth_html", status: "planned" }
|
|
801
|
+
{ id: "docx_mammoth_html", status: "planned" },
|
|
802
|
+
...parallel ? [{ id: "docx_ooxml_parallel", status: "planned" }] : []
|
|
583
803
|
]
|
|
584
804
|
};
|
|
585
805
|
}
|
|
@@ -605,18 +825,17 @@ function planForIntent(intentOpt, kind, ocrMode) {
|
|
|
605
825
|
if (kind === "image") {
|
|
606
826
|
return {
|
|
607
827
|
intent: "runOcr",
|
|
608
|
-
steps:
|
|
609
|
-
{ id: "detect_kind", status: "done" },
|
|
610
|
-
{ id: "tesseract_ocr", status: ocrMode === "off" ? "skipped" : "planned" }
|
|
611
|
-
]
|
|
828
|
+
steps: imageBrowserPlanSteps(ocrMode, ocr)
|
|
612
829
|
};
|
|
613
830
|
}
|
|
614
831
|
if (kind === "docx") {
|
|
832
|
+
const parallel = docxIncludeRequested(docxInclude);
|
|
615
833
|
return {
|
|
616
834
|
intent: "runOcr",
|
|
617
835
|
steps: [
|
|
618
836
|
{ id: "detect_kind", status: "done" },
|
|
619
|
-
{ id: "
|
|
837
|
+
{ id: "docx_mammoth", status: "planned" },
|
|
838
|
+
...parallel ? [{ id: "docx_ooxml_parallel", status: "planned" }] : []
|
|
620
839
|
]
|
|
621
840
|
};
|
|
622
841
|
}
|
|
@@ -628,7 +847,7 @@ function planForIntent(intentOpt, kind, ocrMode) {
|
|
|
628
847
|
]
|
|
629
848
|
};
|
|
630
849
|
}
|
|
631
|
-
return planForAnalyzeFile(kind, ocrMode);
|
|
850
|
+
return planForAnalyzeFile(kind, ocrMode, docxInclude, ocr);
|
|
632
851
|
}
|
|
633
852
|
async function getCapabilities(input, options) {
|
|
634
853
|
throwIfAborted(options?.signal);
|
|
@@ -642,10 +861,12 @@ async function explainAnalysisPlan(input, options) {
|
|
|
642
861
|
const kind = detectFileKind(input);
|
|
643
862
|
const intent = options?.intent ?? "analyzeFile";
|
|
644
863
|
const ocrMode = resolveOcrMode2(options?.ocr);
|
|
645
|
-
const
|
|
646
|
-
|
|
864
|
+
const docxInc = options?.docx?.include;
|
|
865
|
+
const ocrSlice = options?.ocr;
|
|
866
|
+
const plan = planForIntent(intent, kind, ocrMode, docxInc, ocrSlice);
|
|
867
|
+
return buildBrowserExplainReport(kind, intent, ocrMode, plan, docxInc, ocrSlice);
|
|
647
868
|
}
|
|
648
869
|
|
|
649
|
-
export { BROWSER_PDF_UNSUPPORTED_WARNING, analyzeFile, convertToHtml, explainAnalysisPlan, extractMetadata, extractText, getCapabilities, runOcr };
|
|
870
|
+
export { BROWSER_PDF_UNSUPPORTED_WARNING, DOCX_EMBEDDED_IMAGE_CAPABILITIES_BROWSER, DOCX_STRUCTURE_CAPABILITIES_BROWSER, analyzeFile, convertToHtml, docxIncludeRequested, explainAnalysisPlan, extractMetadata, extractText, getCapabilities, runOcr };
|
|
650
871
|
//# sourceMappingURL=index.js.map
|
|
651
872
|
//# sourceMappingURL=index.js.map
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@dragon708/docmind-browser",
|
|
3
|
-
"version": "1.
|
|
3
|
+
"version": "1.5.0",
|
|
4
4
|
"description": "Official DocMind browser facade: analyzeFile and intent APIs (DOCX, image OCR, text). PDF and fs paths use @dragon708/docmind-node.",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"sideEffects": false,
|
|
@@ -33,8 +33,8 @@
|
|
|
33
33
|
],
|
|
34
34
|
"license": "MIT",
|
|
35
35
|
"dependencies": {
|
|
36
|
-
"@dragon708/docmind-docx": "^1.
|
|
37
|
-
"@dragon708/docmind-ocr": "^1.
|
|
36
|
+
"@dragon708/docmind-docx": "^1.7.0",
|
|
37
|
+
"@dragon708/docmind-ocr": "^1.1.0",
|
|
38
38
|
"@dragon708/docmind-shared": "^1.1.0"
|
|
39
39
|
},
|
|
40
40
|
"devDependencies": {
|