@dragon708/docmind-browser 1.4.0 → 1.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.d.ts +11 -5
- package/dist/index.js +198 -62
- package/package.json +2 -2
package/dist/index.d.ts
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
import { DocMindAnalyzeOptions, AnalysisResult, FileKind, RuntimeDescriptor, DocMindPublicIntent, AnalysisAnalyzer, ProcessingPlanDescriptor, ExplainAnalysisPlanOptions, GetCapabilitiesOptions } from '@dragon708/docmind-shared';
|
|
2
2
|
export { AnalysisAnalyzer, AnalysisResult, CapabilityDescriptor, DetectFileKindInput, DocMindPublicIntent, DocxAnalysisCoreResult, ExplainAnalysisPlanOptions, ExplainAnalysisPlanResult, FileKind, FileKindMetadata, GenericAnalysisResult, GetCapabilitiesOptions, GetCapabilitiesResult, ImageAnalysisCoreResult, PdfAnalysisCoreResult, TextAnalysisResult, detectFileKind } from '@dragon708/docmind-shared';
|
|
3
|
-
import { OcrOptions } from '@dragon708/docmind-ocr';
|
|
3
|
+
import { OcrOptions, OcrTiffOptions, PreprocessImageOptions } from '@dragon708/docmind-ocr';
|
|
4
4
|
import { AnalyzeDocxIncludeFlags, DocxToHtmlOptions } from '@dragon708/docmind-docx';
|
|
5
5
|
export { AnalyzeDocxIncludeFlags } from '@dragon708/docmind-docx';
|
|
6
6
|
|
|
@@ -16,11 +16,15 @@ interface BrowserAnalyzeDocxOptionsSlice {
|
|
|
16
16
|
* - `off`: do not invoke Tesseract; text stays empty with an explanatory warning.
|
|
17
17
|
* - `auto` (default): run OCR when the input is classified as an image.
|
|
18
18
|
* - `force`: same as `auto` in the browser runtime (no PDF-style text layer to compare); reserved for parity with Node.
|
|
19
|
+
*
|
|
20
|
+
* Multipage TIFF (when sniffed): `maxPages` and `pageSeparator` match `OcrTiffOptions` in `@dragon708/docmind-ocr` (best-effort UTIF in-browser).
|
|
21
|
+
* Optional {@link PreprocessImageOptions} runs in-browser (canvas) on the normalized raster before Tesseract when using `ocrImageDetailed`.
|
|
19
22
|
*/
|
|
20
23
|
type BrowserOcrMode = "off" | "auto" | "force";
|
|
21
|
-
/** Browser OCR options: Tesseract knobs from `@dragon708/docmind-ocr` plus optional {@link BrowserOcrMode}. */
|
|
22
|
-
interface BrowserOcrOptions extends OcrOptions {
|
|
24
|
+
/** Browser OCR options: Tesseract knobs from `@dragon708/docmind-ocr` plus optional {@link BrowserOcrMode}, TIFF caps, and canvas preprocess. */
|
|
25
|
+
interface BrowserOcrOptions extends OcrOptions, Pick<OcrTiffOptions, "maxPages" | "pageSeparator"> {
|
|
23
26
|
readonly mode?: BrowserOcrMode;
|
|
27
|
+
readonly preprocess?: PreprocessImageOptions;
|
|
24
28
|
}
|
|
25
29
|
/**
|
|
26
30
|
* Options for public browser methods (`analyzeFile`, intent APIs).
|
|
@@ -67,12 +71,14 @@ declare function extractMetadata(input: BrowserAnalyzeInput, options?: BrowserAn
|
|
|
67
71
|
declare function convertToHtml(input: BrowserAnalyzeInput, options?: BrowserAnalyzeOptions): Promise<AnalysisResult>;
|
|
68
72
|
/**
|
|
69
73
|
* OCR-focused intent. Honors {@link BrowserAnalyzeOptions.ocr} **mode** (`off` | `auto` | `force`) for images.
|
|
74
|
+
* Raster path uses `normalizeImageForOcr` via `ocrImageDetailed` (or `ocrTiff` for TIFF); no Node-only libraries.
|
|
75
|
+
* HEIC/HEIF and hard failures yield empty text + warnings instead of throwing (abort still propagates).
|
|
70
76
|
* DOCX returns structured extract with a notice (no OCR). Text decodes as UTF-8 (no OCR).
|
|
71
77
|
*/
|
|
72
78
|
declare function runOcr(input: BrowserAnalyzeInput, options?: BrowserAnalyzeOptions): Promise<AnalysisResult>;
|
|
73
79
|
|
|
74
80
|
/** High-level features the user can ask DocMind for (per input kind and runtime). */
|
|
75
|
-
type PublicCapabilityId = "text" | "metadata" | "html" | "ocr" | "pages";
|
|
81
|
+
type PublicCapabilityId = "text" | "metadata" | "html" | "ocr" | "pages" | "image-normalization" | "gif-first-frame" | "bmp" | "heic" | "tiff";
|
|
76
82
|
declare function docxIncludeRequested(flags?: AnalyzeDocxIncludeFlags): boolean;
|
|
77
83
|
/** DOCX `word/media` in the browser runtime (JSZip; no Node pipeline). */
|
|
78
84
|
interface DocxEmbeddedImageCapabilities {
|
|
@@ -149,7 +155,7 @@ type BrowserExplainAnalysisPlanOptions = ExplainAnalysisPlanOptions & Pick<Brows
|
|
|
149
155
|
|
|
150
156
|
/**
|
|
151
157
|
* Epic 1 — **Capabilities:** detects kind from the same hints as `analyzeFile`, then lists which of
|
|
152
|
-
* `text` | `metadata` | `html` | `ocr` | `pages` apply in the browser (PDF always unsupported).
|
|
158
|
+
* `text` | `metadata` | `html` | `ocr` | `pages` and image-specific ids (`image-normalization`, `bmp`, `gif-first-frame`, `heic`, `tiff`) apply in the browser (PDF always unsupported).
|
|
153
159
|
* No Mammoth/Tesseract/PDF parsing. For DOCX, {@link GetCapabilitiesReport.docxStructure} / `docxEmbeddedImages` describe v2 opt-in features.
|
|
154
160
|
*/
|
|
155
161
|
declare function getCapabilities(input: BrowserAnalyzeInput, options?: GetCapabilitiesOptions): Promise<GetCapabilitiesReport>;
|
package/dist/index.js
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
|
-
import { assertValidAnalyzeFileInput, detectFileKind, notImplementedResult, UNKNOWN_FORMAT_WARNING, analyzeText, toUint8Array, InvalidInputError } from '@dragon708/docmind-shared';
|
|
1
|
+
import { assertValidAnalyzeFileInput, detectFileKind, notImplementedResult, UNKNOWN_FORMAT_WARNING, analyzeText, toUint8Array, InvalidInputError, getMimeType } from '@dragon708/docmind-shared';
|
|
2
2
|
export { detectFileKind } from '@dragon708/docmind-shared';
|
|
3
3
|
import { analyzeDocx } from '@dragon708/docmind-docx';
|
|
4
|
-
import {
|
|
4
|
+
import { preprocessHasEffect, resolveImageFormat, normalizeImageForOcr, ocrTiff, ocrImageDetailed } from '@dragon708/docmind-ocr';
|
|
5
5
|
|
|
6
6
|
// src/analyzeFile.ts
|
|
7
7
|
function assertBrowserInput(input) {
|
|
@@ -78,6 +78,93 @@ async function analyzeDocxForBrowser(input, options) {
|
|
|
78
78
|
const r = docxOpts !== void 0 ? await analyzeDocx(data, docxOpts) : await analyzeDocx(data);
|
|
79
79
|
return docxPackageResultToAnalysisResult(r);
|
|
80
80
|
}
|
|
81
|
+
var BROWSER_TIFF_BEST_EFFORT_WARNING = "Browser TIFF support is best-effort: decoding uses UTIF in JavaScript/WebAssembly\u2014some compressions, color modes, very large or multipage files may fail, hang, or exhaust memory. For heavy TIFF workloads use @dragon708/docmind-node (optional sharp).";
|
|
82
|
+
/**
 * Average the `confidence` field across per-page OCR entries.
 *
 * @param pages Array of per-page entries; only `confidence` is read from each.
 * @returns The arithmetic mean of the confidences, or 0 when `pages` is empty.
 */
function meanPageConfidence(pages) {
  const pageCount = pages.length;
  if (pageCount === 0) {
    // Guard against 0 / 0 (NaN) when no page was processed.
    return 0;
  }
  let total = 0;
  pages.forEach((page) => {
    total += page.confidence;
  });
  return total / pageCount;
}
|
|
86
|
+
/**
 * Re-throw `e` when it represents a cancellation, so aborts are never
 * downgraded to OCR warnings by the surrounding `catch` blocks.
 *
 * The check is duck-typed on `name === "AbortError"` instead of requiring
 * `e instanceof Error`: abort errors created in another realm (iframe, worker)
 * or by a DOMException shim are not instances of this realm's `Error`, and
 * would otherwise be swallowed.
 *
 * @param e The caught value (may be anything).
 * @throws The same `e` when it is an object whose `name` is "AbortError".
 */
function rethrowIfAbort(e) {
  if (typeof e === "object" && e !== null && e.name === "AbortError") {
    throw e;
  }
}
|
|
89
|
+
/**
 * Format a caught OCR failure as a single-entry warnings array.
 *
 * Detail resolution order:
 * 1. a non-empty string `message` on any error-like object (covers `Error`
 *    instances and error-like values that are not `Error` instances);
 * 2. the error's `name` when an `Error` carries an empty message, so the
 *    warning never ends in a dangling prefix;
 * 3. `String(e)` for everything else (strings, numbers, ...).
 *
 * @param prefix Human-readable lead-in, e.g. "TIFF OCR failed in the browser:".
 * @param e The caught value.
 * @returns A one-element array: `"<prefix> <detail>"`.
 */
function ocrFailureWarnings(prefix, e) {
  let detail;
  if (typeof e === "object" && e !== null && typeof e.message === "string" && e.message !== "") {
    detail = e.message;
  } else if (e instanceof Error) {
    // Empty message: the error name is the only useful information left.
    detail = e.name;
  } else {
    detail = String(e);
  }
  return [`${prefix} ${detail}`];
}
|
|
93
|
+
async function runRasterOcrForBrowser(data, input, options) {
|
|
94
|
+
const signal = options?.ocr?.signal ?? options?.signal;
|
|
95
|
+
const langs = options?.ocr?.langs;
|
|
96
|
+
const mimeHint = getMimeType(input);
|
|
97
|
+
const format = resolveImageFormat(data, mimeHint);
|
|
98
|
+
if (format === "heic" || format === "heif") {
|
|
99
|
+
const norm = await normalizeImageForOcr(data, { signal, mimeHint });
|
|
100
|
+
return {
|
|
101
|
+
text: "",
|
|
102
|
+
confidence: 0,
|
|
103
|
+
ocrUsed: true,
|
|
104
|
+
warnings: [
|
|
105
|
+
"HEIC/HEIF cannot be OCR'd in the browser; convert to PNG or JPEG server-side (e.g. @dragon708/docmind-node with sharp), then retry.",
|
|
106
|
+
...norm.warnings
|
|
107
|
+
],
|
|
108
|
+
inputFormat: norm.format,
|
|
109
|
+
normalizedFormat: norm.normalizedFormat
|
|
110
|
+
};
|
|
111
|
+
}
|
|
112
|
+
if (format === "tiff") {
|
|
113
|
+
try {
|
|
114
|
+
const tiff = await ocrTiff(data, {
|
|
115
|
+
langs,
|
|
116
|
+
signal,
|
|
117
|
+
maxPages: options?.ocr?.maxPages,
|
|
118
|
+
pageSeparator: options?.ocr?.pageSeparator
|
|
119
|
+
});
|
|
120
|
+
return {
|
|
121
|
+
text: tiff.text.trim(),
|
|
122
|
+
confidence: meanPageConfidence(tiff.textByPage),
|
|
123
|
+
ocrUsed: true,
|
|
124
|
+
warnings: [BROWSER_TIFF_BEST_EFFORT_WARNING, ...tiff.warnings],
|
|
125
|
+
pages: tiff.pagesProcessed,
|
|
126
|
+
textByPage: tiff.textByPage
|
|
127
|
+
};
|
|
128
|
+
} catch (e) {
|
|
129
|
+
rethrowIfAbort(e);
|
|
130
|
+
return {
|
|
131
|
+
text: "",
|
|
132
|
+
confidence: 0,
|
|
133
|
+
ocrUsed: true,
|
|
134
|
+
warnings: [
|
|
135
|
+
BROWSER_TIFF_BEST_EFFORT_WARNING,
|
|
136
|
+
...ocrFailureWarnings("TIFF OCR failed in the browser:", e)
|
|
137
|
+
]
|
|
138
|
+
};
|
|
139
|
+
}
|
|
140
|
+
}
|
|
141
|
+
try {
|
|
142
|
+
const detailed = await ocrImageDetailed(data, {
|
|
143
|
+
langs,
|
|
144
|
+
signal,
|
|
145
|
+
preprocess: options?.ocr?.preprocess
|
|
146
|
+
});
|
|
147
|
+
return {
|
|
148
|
+
text: detailed.text.trim(),
|
|
149
|
+
confidence: detailed.confidence,
|
|
150
|
+
ocrUsed: true,
|
|
151
|
+
warnings: [...detailed.warnings],
|
|
152
|
+
pages: detailed.pages,
|
|
153
|
+
inputFormat: detailed.inputFormat,
|
|
154
|
+
normalizedFormat: detailed.normalizedFormat
|
|
155
|
+
};
|
|
156
|
+
} catch (e) {
|
|
157
|
+
rethrowIfAbort(e);
|
|
158
|
+
return {
|
|
159
|
+
text: "",
|
|
160
|
+
confidence: 0,
|
|
161
|
+
ocrUsed: true,
|
|
162
|
+
warnings: ocrFailureWarnings("OCR could not complete in the browser:", e)
|
|
163
|
+
};
|
|
164
|
+
}
|
|
165
|
+
}
|
|
166
|
+
|
|
167
|
+
// src/analyzers/image.ts
|
|
81
168
|
var OCR_OFF_WARNING = 'OCR mode is "off"; no recognition was run. Use mode "auto" or "force" to extract text from images.';
|
|
82
169
|
function resolveOcrMode(options) {
|
|
83
170
|
return options?.ocr?.mode ?? "auto";
|
|
@@ -114,21 +201,26 @@ async function analyzeImageForBrowser(input, options) {
|
|
|
114
201
|
warnings: [OCR_OFF_WARNING]
|
|
115
202
|
};
|
|
116
203
|
}
|
|
117
|
-
const
|
|
118
|
-
|
|
119
|
-
signal: options?.ocr?.signal ?? options?.signal
|
|
120
|
-
};
|
|
121
|
-
const r = await ocr(data, ocrOpts);
|
|
122
|
-
return {
|
|
204
|
+
const ocrPart = await runRasterOcrForBrowser(data, input, options);
|
|
205
|
+
const base = {
|
|
123
206
|
fileKind: "image",
|
|
124
207
|
analyzer: "image",
|
|
125
208
|
status: "ok",
|
|
126
209
|
kind: "image",
|
|
127
|
-
text:
|
|
128
|
-
confidence:
|
|
129
|
-
ocrUsed:
|
|
130
|
-
warnings:
|
|
210
|
+
text: ocrPart.text,
|
|
211
|
+
confidence: ocrPart.confidence,
|
|
212
|
+
ocrUsed: true,
|
|
213
|
+
warnings: ocrPart.warnings
|
|
131
214
|
};
|
|
215
|
+
const extra = {};
|
|
216
|
+
if (ocrPart.pages !== void 0) extra.pages = ocrPart.pages;
|
|
217
|
+
if (ocrPart.textByPage !== void 0) extra.textByPage = ocrPart.textByPage;
|
|
218
|
+
if (ocrPart.inputFormat !== void 0) extra.inputFormat = ocrPart.inputFormat;
|
|
219
|
+
if (ocrPart.normalizedFormat !== void 0) extra.normalizedFormat = ocrPart.normalizedFormat;
|
|
220
|
+
if (Object.keys(extra).length > 0) {
|
|
221
|
+
return { ...base, ...extra };
|
|
222
|
+
}
|
|
223
|
+
return base;
|
|
132
224
|
}
|
|
133
225
|
|
|
134
226
|
// src/analyzeFile.ts
|
|
@@ -361,11 +453,33 @@ function buildBrowserCapabilityReport(kind) {
|
|
|
361
453
|
break;
|
|
362
454
|
case "image":
|
|
363
455
|
capabilities = [
|
|
364
|
-
slot("text", true, [
|
|
456
|
+
slot("text", true, [
|
|
457
|
+
"Text via `@dragon708/docmind-ocr` when `ocr.mode` is not off: PNG, JPEG, WebP, BMP, GIF (first frame), TIFF (partial), after sniff/MIME."
|
|
458
|
+
]),
|
|
365
459
|
slot("metadata", false, [IMAGE_META]),
|
|
366
460
|
slot("html", false, [IMAGE_HTML]),
|
|
367
|
-
slot("ocr", true, [
|
|
368
|
-
|
|
461
|
+
slot("ocr", true, [
|
|
462
|
+
OCR_OFF_NOTE,
|
|
463
|
+
"Uses `ocrImageDetailed` (single-frame path) or multipage `ocrTiff` for TIFF; WASM Tesseract in-browser."
|
|
464
|
+
]),
|
|
465
|
+
slot("image-normalization", true, [
|
|
466
|
+
"`normalizeImageForOcr` runs inside the OCR package (canvas/`createImageBitmap` in-browser for BMP, GIF, etc.; not HEIC)."
|
|
467
|
+
]),
|
|
468
|
+
slot("bmp", true, [
|
|
469
|
+
"BMP is decoded via browser canvas/`createImageBitmap` into a PNG-oriented buffer before Tesseract."
|
|
470
|
+
]),
|
|
471
|
+
slot("gif-first-frame", true, [
|
|
472
|
+
"Animated GIF: only the first decoded frame is normalized and OCR'd; see result warnings when multi-frame is detected."
|
|
473
|
+
]),
|
|
474
|
+
slot("heic", false, [
|
|
475
|
+
"HEIC/HEIF is not decoded in the browser. `runOcr` / `analyzeFile` return empty text with explicit warnings; convert server-side (e.g. @dragon708/docmind-node)."
|
|
476
|
+
]),
|
|
477
|
+
slot("tiff", true, [
|
|
478
|
+
"Partial / best-effort: multipage `ocrTiff` with UTIF in JS/WASM\u2014not all compressions or huge files; prefer Node for production TIFF."
|
|
479
|
+
]),
|
|
480
|
+
slot("pages", true, [
|
|
481
|
+
"Multipage TIFF may populate `pages` and `textByPage` when OCR succeeds; other formats may expose `pages` when normalization reports it."
|
|
482
|
+
])
|
|
369
483
|
];
|
|
370
484
|
break;
|
|
371
485
|
case "text":
|
|
@@ -405,6 +519,9 @@ function lim(...items) {
|
|
|
405
519
|
}
|
|
406
520
|
var DOCX_MAMMOTH_PLUS_OPTIONAL_BROWSER = "Mammoth (`analyzeDocx`) extracts text and HTML from OOXML in-browser; optional parallel OOXML/ZIP extractors run when options.docx.include is set.";
|
|
407
521
|
var DOCX_ZIP_NOTE_BROWSER = "Embedded files under word/media are available via @dragon708/docmind-docx when options.docx.include requests embeddedImages (or call extractImagesFromDocx on the same bytes).";
|
|
522
|
+
var BROWSER_TIFF_RASTER_NOTE = "TIFF (if detected): multipage OCR uses `ocrTiff` with UTIF in-browser\u2014best-effort only; failures return empty text + warnings (no throw). Prefer `@dragon708/docmind-node` for production TIFF.";
|
|
523
|
+
var BROWSER_IMAGE_PIPELINE = "Browser raster OCR: sniff format \u2192 `normalizeImageForOcr` (canvas/`createImageBitmap` for BMP/GIF; not HEIC) \u2192 optional `preprocessImageForOcr` when `options.ocr.preprocess` applies \u2192 Tesseract via `ocrImageDetailed`, or `ocrTiff` for TIFF. HEIC/HEIF: no decode\u2014expect empty text and explicit warnings. GIF: first frame only.";
|
|
524
|
+
var BROWSER_HEIC_NOTE = "HEIC/HEIF is never decoded in-browser; there is no `sharp` dependency. Convert server-side, then OCR PNG/JPEG bytes.";
|
|
408
525
|
function finalizeBrowserDocxExplainReport(report) {
|
|
409
526
|
if (report.kind !== "docx") return report;
|
|
410
527
|
const limitations = report.limitations.includes(DOCX_ZIP_NOTE_BROWSER) ? report.limitations : [...report.limitations, DOCX_ZIP_NOTE_BROWSER];
|
|
@@ -415,12 +532,12 @@ function finalizeBrowserDocxExplainReport(report) {
|
|
|
415
532
|
limitations
|
|
416
533
|
};
|
|
417
534
|
}
|
|
418
|
-
function buildBrowserExplainReport(kind, intent, ocrMode, plan, docxInclude) {
|
|
535
|
+
function buildBrowserExplainReport(kind, intent, ocrMode, plan, docxInclude, ocrSlice) {
|
|
419
536
|
const runtime = { id: "browser" };
|
|
420
537
|
const imageOcrActive = ocrMode !== "off";
|
|
421
538
|
let primaryAnalyzer = kind === "pdf" ? "pdf" : kind === "docx" ? "docx" : kind === "image" ? "image" : kind === "text" ? "text" : "none";
|
|
422
539
|
let nativeExtraction;
|
|
423
|
-
let
|
|
540
|
+
let ocr;
|
|
424
541
|
let limitations = [];
|
|
425
542
|
const ocrOffNote = ocrMode === "off" ? 'Image OCR is skipped when ocr.mode is "off".' : "";
|
|
426
543
|
if (kind === "pdf") {
|
|
@@ -429,7 +546,7 @@ function buildBrowserExplainReport(kind, intent, ocrMode, plan, docxInclude) {
|
|
|
429
546
|
willAttempt: false,
|
|
430
547
|
description: "PDF is not processed in the browser runtime; use @dragon708/docmind-node."
|
|
431
548
|
};
|
|
432
|
-
|
|
549
|
+
ocr = {
|
|
433
550
|
mayUse: false,
|
|
434
551
|
description: "PDF OCR is not available in the browser."
|
|
435
552
|
};
|
|
@@ -440,7 +557,7 @@ function buildBrowserExplainReport(kind, intent, ocrMode, plan, docxInclude) {
|
|
|
440
557
|
intent,
|
|
441
558
|
primaryAnalyzer: "pdf",
|
|
442
559
|
nativeExtraction,
|
|
443
|
-
ocr
|
|
560
|
+
ocr,
|
|
444
561
|
limitations,
|
|
445
562
|
plan,
|
|
446
563
|
warnings: [BROWSER_PDF_UNSUPPORTED_WARNING]
|
|
@@ -451,7 +568,7 @@ function buildBrowserExplainReport(kind, intent, ocrMode, plan, docxInclude) {
|
|
|
451
568
|
"Could not classify the file from name, MIME, or bytes; analysis will return not_implemented until hints improve."
|
|
452
569
|
);
|
|
453
570
|
nativeExtraction = { willAttempt: false, description: "No analyzer selected without a known file kind." };
|
|
454
|
-
|
|
571
|
+
ocr = { mayUse: false, description: "OCR is not used for unknown kinds." };
|
|
455
572
|
return finalizeBrowserDocxExplainReport({
|
|
456
573
|
kind,
|
|
457
574
|
detectedKind: kind,
|
|
@@ -459,7 +576,7 @@ function buildBrowserExplainReport(kind, intent, ocrMode, plan, docxInclude) {
|
|
|
459
576
|
intent,
|
|
460
577
|
primaryAnalyzer: "none",
|
|
461
578
|
nativeExtraction,
|
|
462
|
-
ocr
|
|
579
|
+
ocr,
|
|
463
580
|
limitations,
|
|
464
581
|
plan
|
|
465
582
|
});
|
|
@@ -472,23 +589,23 @@ function buildBrowserExplainReport(kind, intent, ocrMode, plan, docxInclude) {
|
|
|
472
589
|
willAttempt: true,
|
|
473
590
|
description: docxIncludeRequested(docxInclude) ? "Mammoth plus parallel OOXML extractors (per options.docx.include)." + (intent === "extractText" ? " HTML cleared in extractText." : "") : DOCX_MAMMOTH_PLUS_OPTIONAL_BROWSER + (intent === "extractText" ? " HTML omitted in extractText." : "")
|
|
474
591
|
};
|
|
475
|
-
|
|
592
|
+
ocr = { mayUse: false, description: "DOCX does not use OCR in DocMind." };
|
|
476
593
|
} else if (kind === "image") {
|
|
477
594
|
nativeExtraction = {
|
|
478
595
|
willAttempt: false,
|
|
479
|
-
description:
|
|
596
|
+
description: BROWSER_IMAGE_PIPELINE
|
|
480
597
|
};
|
|
481
|
-
|
|
598
|
+
ocr = {
|
|
482
599
|
mayUse: imageOcrActive,
|
|
483
|
-
description: imageOcrActive ? "
|
|
600
|
+
description: imageOcrActive ? "`ocrImageDetailed` (normalize + optional preprocess) for single-frame paths; TIFF \u2192 `ocrTiff` (UTIF, partial). HEIC unsupported in-browser." : "OCR skipped while ocr.mode is off."
|
|
484
601
|
};
|
|
485
|
-
limitations = lim(ocrOffNote);
|
|
602
|
+
limitations = lim(ocrOffNote, BROWSER_TIFF_RASTER_NOTE, BROWSER_HEIC_NOTE);
|
|
486
603
|
} else {
|
|
487
604
|
nativeExtraction = {
|
|
488
605
|
willAttempt: true,
|
|
489
606
|
description: "Plain text is decoded as UTF-8 (BOM stripped, replacement on invalid bytes)."
|
|
490
607
|
};
|
|
491
|
-
|
|
608
|
+
ocr = { mayUse: false, description: "OCR does not apply to text files." };
|
|
492
609
|
}
|
|
493
610
|
break;
|
|
494
611
|
case "extractMetadata":
|
|
@@ -497,16 +614,16 @@ function buildBrowserExplainReport(kind, intent, ocrMode, plan, docxInclude) {
|
|
|
497
614
|
willAttempt: false,
|
|
498
615
|
description: "No heavy extractor; extractMetadata returns a stub with guidance."
|
|
499
616
|
};
|
|
500
|
-
|
|
617
|
+
ocr = { mayUse: false, description: "OCR is not invoked for this metadata path." };
|
|
501
618
|
limitations = lim(
|
|
502
|
-
kind === "docx" ? "Structured DOCX metadata is not exposed separately in the browser; use analyzeFile / extractText / convertToHtml with options.docx.include for OOXML fields." : "Raster images have no document metadata bundle
|
|
619
|
+
kind === "docx" ? "Structured DOCX metadata is not exposed separately in the browser; use analyzeFile / extractText / convertToHtml with options.docx.include for OOXML fields." : "Raster images have no document metadata bundle. TIFF/HEIC caveats: see getCapabilities (`tiff` partial, `heic` unsupported) and runOcr warnings."
|
|
503
620
|
);
|
|
504
621
|
} else {
|
|
505
622
|
nativeExtraction = {
|
|
506
623
|
willAttempt: true,
|
|
507
624
|
description: "Plain text is decoded; metadata is limited to decoded content."
|
|
508
625
|
};
|
|
509
|
-
|
|
626
|
+
ocr = { mayUse: false, description: "OCR does not apply." };
|
|
510
627
|
limitations = lim("Plain text has no structured document metadata.");
|
|
511
628
|
}
|
|
512
629
|
break;
|
|
@@ -516,21 +633,21 @@ function buildBrowserExplainReport(kind, intent, ocrMode, plan, docxInclude) {
|
|
|
516
633
|
willAttempt: true,
|
|
517
634
|
description: docxIncludeRequested(docxInclude) ? "Mammoth HTML via analyzeFile plus optional OOXML extractors." : "Mammoth HTML via analyzeFile; optional OOXML v2 when options.docx.include is set."
|
|
518
635
|
};
|
|
519
|
-
|
|
636
|
+
ocr = { mayUse: false, description: "DOCX path does not use OCR." };
|
|
520
637
|
} else if (kind === "text") {
|
|
521
638
|
nativeExtraction = {
|
|
522
639
|
willAttempt: true,
|
|
523
640
|
description: "UTF-8 decode then wrap in a <pre> element."
|
|
524
641
|
};
|
|
525
|
-
|
|
642
|
+
ocr = { mayUse: false, description: "OCR does not apply." };
|
|
526
643
|
} else {
|
|
527
644
|
nativeExtraction = {
|
|
528
645
|
willAttempt: false,
|
|
529
646
|
description: "No rich HTML path for this kind in the browser."
|
|
530
647
|
};
|
|
531
|
-
|
|
648
|
+
ocr = { mayUse: false, description: "OCR does not produce layout HTML here." };
|
|
532
649
|
limitations = lim(
|
|
533
|
-
kind === "image" ? "Raster images have no HTML representation; use extractText or runOcr." : ""
|
|
650
|
+
kind === "image" ? "Raster images have no HTML representation; use extractText or runOcr. Expect HEIC to yield warnings only; TIFF is best-effort." : ""
|
|
534
651
|
);
|
|
535
652
|
}
|
|
536
653
|
break;
|
|
@@ -538,31 +655,37 @@ function buildBrowserExplainReport(kind, intent, ocrMode, plan, docxInclude) {
|
|
|
538
655
|
if (kind === "image") {
|
|
539
656
|
nativeExtraction = {
|
|
540
657
|
willAttempt: false,
|
|
541
|
-
description:
|
|
658
|
+
description: BROWSER_IMAGE_PIPELINE
|
|
542
659
|
};
|
|
543
|
-
|
|
660
|
+
ocr = {
|
|
544
661
|
mayUse: imageOcrActive,
|
|
545
|
-
description: imageOcrActive ? "
|
|
662
|
+
description: imageOcrActive ? "Same as analyzeFile: `normalizeImageForOcr` inside `ocrImageDetailed`, optional canvas preprocess, or `ocrTiff` for TIFF. Errors surface as warnings, not uncaught exceptions (except abort)." : "OCR skipped while ocr.mode is off."
|
|
546
663
|
};
|
|
547
|
-
limitations = lim(ocrOffNote);
|
|
664
|
+
limitations = lim(ocrOffNote, BROWSER_TIFF_RASTER_NOTE, BROWSER_HEIC_NOTE);
|
|
548
665
|
} else if (kind === "docx") {
|
|
549
666
|
nativeExtraction = {
|
|
550
667
|
willAttempt: true,
|
|
551
668
|
description: docxIncludeRequested(docxInclude) ? "Mammoth text/HTML plus optional OOXML extractors; not OCR." : "Mammoth text/HTML; optional OOXML v2 via options.docx.include; not OCR."
|
|
552
669
|
};
|
|
553
|
-
|
|
670
|
+
ocr = { mayUse: false, description: "DOCX is not OCR'd." };
|
|
554
671
|
limitations = lim("Returned content is structured extract, not OCR output.");
|
|
555
672
|
} else {
|
|
556
673
|
nativeExtraction = {
|
|
557
674
|
willAttempt: true,
|
|
558
675
|
description: "Plain text is UTF-8 decoded only."
|
|
559
676
|
};
|
|
560
|
-
|
|
677
|
+
ocr = { mayUse: false, description: "OCR does not apply to text files." };
|
|
561
678
|
}
|
|
562
679
|
break;
|
|
563
680
|
default:
|
|
564
681
|
nativeExtraction = { willAttempt: false, description: "Intent not specialized in this runtime." };
|
|
565
|
-
|
|
682
|
+
ocr = { mayUse: false, description: "See plan steps." };
|
|
683
|
+
}
|
|
684
|
+
if (kind === "image" && preprocessHasEffect(ocrSlice?.preprocess)) {
|
|
685
|
+
limitations = [
|
|
686
|
+
...limitations,
|
|
687
|
+
"options.ocr.preprocess applies to the `ocrImageDetailed` path only; multipage TIFF (`ocrTiff`) does not run preprocess per frame."
|
|
688
|
+
];
|
|
566
689
|
}
|
|
567
690
|
return finalizeBrowserDocxExplainReport({
|
|
568
691
|
kind,
|
|
@@ -571,17 +694,38 @@ function buildBrowserExplainReport(kind, intent, ocrMode, plan, docxInclude) {
|
|
|
571
694
|
intent,
|
|
572
695
|
primaryAnalyzer,
|
|
573
696
|
nativeExtraction,
|
|
574
|
-
ocr
|
|
697
|
+
ocr,
|
|
575
698
|
limitations,
|
|
576
699
|
plan
|
|
577
700
|
});
|
|
578
701
|
}
|
|
579
702
|
|
|
580
703
|
// src/introspection.ts
|
|
581
|
-
function resolveOcrMode2(
|
|
582
|
-
return
|
|
704
|
+
/**
 * Resolve the effective OCR mode from an optional OCR options slice.
 *
 * @param ocr Optional OCR options; only `mode` is read.
 * @returns `ocr.mode` when it is neither null nor undefined, otherwise "auto".
 */
function resolveOcrMode2(ocr) {
  const requestedMode = ocr == null ? void 0 : ocr.mode;
  if (requestedMode == null) {
    return "auto";
  }
  return requestedMode;
}
|
|
707
|
+
function imageBrowserPlanSteps(ocrMode, ocr) {
|
|
708
|
+
if (ocrMode === "off") {
|
|
709
|
+
return [
|
|
710
|
+
{ id: "detect_kind", status: "done" },
|
|
711
|
+
{ id: "image_format_detect", status: "skipped" },
|
|
712
|
+
{ id: "normalize_image_for_ocr", status: "skipped" },
|
|
713
|
+
{ id: "preprocess_image_for_ocr", status: "skipped" },
|
|
714
|
+
{ id: "tesseract_ocr", status: "skipped" }
|
|
715
|
+
];
|
|
716
|
+
}
|
|
717
|
+
return [
|
|
718
|
+
{ id: "detect_kind", status: "done" },
|
|
719
|
+
{ id: "image_format_detect", status: "planned" },
|
|
720
|
+
{ id: "normalize_image_for_ocr", status: "planned" },
|
|
721
|
+
{
|
|
722
|
+
id: "preprocess_image_for_ocr",
|
|
723
|
+
status: preprocessHasEffect(ocr?.preprocess) ? "planned" : "skipped"
|
|
724
|
+
},
|
|
725
|
+
{ id: "tesseract_ocr", status: "planned" }
|
|
726
|
+
];
|
|
583
727
|
}
|
|
584
|
-
function planForAnalyzeFile(kind, ocrMode, docxInclude) {
|
|
728
|
+
function planForAnalyzeFile(kind, ocrMode, docxInclude, ocr) {
|
|
585
729
|
switch (kind) {
|
|
586
730
|
case "pdf":
|
|
587
731
|
return {
|
|
@@ -605,13 +749,7 @@ function planForAnalyzeFile(kind, ocrMode, docxInclude) {
|
|
|
605
749
|
case "image":
|
|
606
750
|
return {
|
|
607
751
|
intent: "analyzeFile",
|
|
608
|
-
steps:
|
|
609
|
-
{ id: "detect_kind", status: "done" },
|
|
610
|
-
{
|
|
611
|
-
id: "image_ocr",
|
|
612
|
-
status: ocrMode === "off" ? "skipped" : "planned"
|
|
613
|
-
}
|
|
614
|
-
]
|
|
752
|
+
steps: imageBrowserPlanSteps(ocrMode, ocr)
|
|
615
753
|
};
|
|
616
754
|
case "text":
|
|
617
755
|
return {
|
|
@@ -628,11 +766,11 @@ function planForAnalyzeFile(kind, ocrMode, docxInclude) {
|
|
|
628
766
|
};
|
|
629
767
|
}
|
|
630
768
|
}
|
|
631
|
-
function planForIntent(intentOpt, kind, ocrMode, docxInclude) {
|
|
769
|
+
function planForIntent(intentOpt, kind, ocrMode, docxInclude, ocr) {
|
|
632
770
|
const intent = intentOpt ?? "analyzeFile";
|
|
633
|
-
if (intent === "analyzeFile") return planForAnalyzeFile(kind, ocrMode, docxInclude);
|
|
771
|
+
if (intent === "analyzeFile") return planForAnalyzeFile(kind, ocrMode, docxInclude, ocr);
|
|
634
772
|
if (intent === "extractText") {
|
|
635
|
-
const base = planForAnalyzeFile(kind, ocrMode, docxInclude);
|
|
773
|
+
const base = planForAnalyzeFile(kind, ocrMode, docxInclude, ocr);
|
|
636
774
|
return { ...base, intent: "extractText" };
|
|
637
775
|
}
|
|
638
776
|
if (intent === "extractMetadata") {
|
|
@@ -687,10 +825,7 @@ function planForIntent(intentOpt, kind, ocrMode, docxInclude) {
|
|
|
687
825
|
if (kind === "image") {
|
|
688
826
|
return {
|
|
689
827
|
intent: "runOcr",
|
|
690
|
-
steps:
|
|
691
|
-
{ id: "detect_kind", status: "done" },
|
|
692
|
-
{ id: "tesseract_ocr", status: ocrMode === "off" ? "skipped" : "planned" }
|
|
693
|
-
]
|
|
828
|
+
steps: imageBrowserPlanSteps(ocrMode, ocr)
|
|
694
829
|
};
|
|
695
830
|
}
|
|
696
831
|
if (kind === "docx") {
|
|
@@ -712,7 +847,7 @@ function planForIntent(intentOpt, kind, ocrMode, docxInclude) {
|
|
|
712
847
|
]
|
|
713
848
|
};
|
|
714
849
|
}
|
|
715
|
-
return planForAnalyzeFile(kind, ocrMode, docxInclude);
|
|
850
|
+
return planForAnalyzeFile(kind, ocrMode, docxInclude, ocr);
|
|
716
851
|
}
|
|
717
852
|
async function getCapabilities(input, options) {
|
|
718
853
|
throwIfAborted(options?.signal);
|
|
@@ -727,8 +862,9 @@ async function explainAnalysisPlan(input, options) {
|
|
|
727
862
|
const intent = options?.intent ?? "analyzeFile";
|
|
728
863
|
const ocrMode = resolveOcrMode2(options?.ocr);
|
|
729
864
|
const docxInc = options?.docx?.include;
|
|
730
|
-
const
|
|
731
|
-
|
|
865
|
+
const ocrSlice = options?.ocr;
|
|
866
|
+
const plan = planForIntent(intent, kind, ocrMode, docxInc, ocrSlice);
|
|
867
|
+
return buildBrowserExplainReport(kind, intent, ocrMode, plan, docxInc, ocrSlice);
|
|
732
868
|
}
|
|
733
869
|
|
|
734
870
|
export { BROWSER_PDF_UNSUPPORTED_WARNING, DOCX_EMBEDDED_IMAGE_CAPABILITIES_BROWSER, DOCX_STRUCTURE_CAPABILITIES_BROWSER, analyzeFile, convertToHtml, docxIncludeRequested, explainAnalysisPlan, extractMetadata, extractText, getCapabilities, runOcr };
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@dragon708/docmind-browser",
|
|
3
|
-
"version": "1.
|
|
3
|
+
"version": "1.5.0",
|
|
4
4
|
"description": "Official DocMind browser facade: analyzeFile and intent APIs (DOCX, image OCR, text). PDF and fs paths use @dragon708/docmind-node.",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"sideEffects": false,
|
|
@@ -34,7 +34,7 @@
|
|
|
34
34
|
"license": "MIT",
|
|
35
35
|
"dependencies": {
|
|
36
36
|
"@dragon708/docmind-docx": "^1.7.0",
|
|
37
|
-
"@dragon708/docmind-ocr": "^1.
|
|
37
|
+
"@dragon708/docmind-ocr": "^1.1.0",
|
|
38
38
|
"@dragon708/docmind-shared": "^1.1.0"
|
|
39
39
|
},
|
|
40
40
|
"devDependencies": {
|