@dragon708/docmind-browser 1.5.1 → 1.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.d.ts +37 -11
- package/dist/index.js +275 -21
- package/package.json +4 -4
package/dist/index.d.ts
CHANGED
|
@@ -1,8 +1,9 @@
|
|
|
1
|
-
import { DocMindAnalyzeOptions, AnalysisResult, FileKind, RuntimeDescriptor, DocMindPublicIntent, AnalysisAnalyzer, ProcessingPlanDescriptor, ExplainAnalysisPlanOptions, GetCapabilitiesOptions } from '@dragon708/docmind-shared';
|
|
2
|
-
export { AnalysisAnalyzer, AnalysisResult, CapabilityDescriptor, DetectFileKindInput, DocMindPublicIntent, DocxAnalysisCoreResult, ExplainAnalysisPlanOptions, ExplainAnalysisPlanResult, FileKind, FileKindMetadata, GenericAnalysisResult, GetCapabilitiesOptions, GetCapabilitiesResult, ImageAnalysisCoreResult, PdfAnalysisCoreResult, TextAnalysisResult, detectFileKind } from '@dragon708/docmind-shared';
|
|
1
|
+
import { DocMindAnalyzeOptions, AnalyzeFileOutputOptions, AnalysisResult, NormalizeStructuredOptions, StructuredDocumentResult, FileKind, RuntimeDescriptor, DocMindPublicIntent, AnalysisAnalyzer, ProcessingPlanDescriptor, ExplainAnalysisPlanOptions, GetCapabilitiesOptions } from '@dragon708/docmind-shared';
|
|
2
|
+
export { AnalysisAnalyzer, AnalysisResult, CapabilityDescriptor, DetectFileKindInput, DocMindPublicIntent, DocumentBlock, DocumentImageRef, DocumentPage, DocumentTable, DocxAnalysisCoreResult, ExplainAnalysisPlanOptions, ExplainAnalysisPlanResult, FileKind, FileKindMetadata, GenericAnalysisResult, GetCapabilitiesOptions, GetCapabilitiesResult, ImageAnalysisCoreResult, PdfAnalysisCoreResult, StructuredDocumentResult, TextAnalysisResult, analyzeFileRequestsStructured, detectFileKind, isStructuredDocumentResult } from '@dragon708/docmind-shared';
|
|
3
3
|
import { OcrOptions, OcrTiffOptions, PreprocessImageOptions } from '@dragon708/docmind-ocr';
|
|
4
|
+
export { ExtractStructuredDataFromImageOptions, extractStructuredDataFromImage } from '@dragon708/docmind-ocr';
|
|
4
5
|
import { AnalyzeDocxIncludeFlags, DocxToHtmlOptions } from '@dragon708/docmind-docx';
|
|
5
|
-
export { AnalyzeDocxIncludeFlags } from '@dragon708/docmind-docx';
|
|
6
|
+
export { AnalyzeDocxIncludeFlags, ExtractStructuredDataFromDocxOptions, extractStructuredDataFromDocx } from '@dragon708/docmind-docx';
|
|
6
7
|
|
|
7
8
|
/**
|
|
8
9
|
* Opciones DOCX para el facade browser (Mammoth + inclusiones v2 de `@dragon708/docmind-docx`; sin APIs Node-only).
|
|
@@ -30,7 +31,7 @@ interface BrowserOcrOptions extends OcrOptions, Pick<OcrTiffOptions, "maxPages"
|
|
|
30
31
|
* Options for public browser methods (`analyzeFile`, intent APIs).
|
|
31
32
|
* There is no PDF pipeline in the browser; {@link BrowserOcrOptions.mode} applies to images only.
|
|
32
33
|
*/
|
|
33
|
-
interface BrowserAnalyzeOptions extends DocMindAnalyzeOptions {
|
|
34
|
+
interface BrowserAnalyzeOptions extends DocMindAnalyzeOptions, AnalyzeFileOutputOptions {
|
|
34
35
|
/** Image OCR only; no PDF in this runtime. See {@link BrowserOcrOptions.mode}. */
|
|
35
36
|
readonly ocr?: BrowserOcrOptions;
|
|
36
37
|
/** Solo DOCX: ver {@link BrowserAnalyzeDocxOptionsSlice}. */
|
|
@@ -42,15 +43,21 @@ interface BrowserAnalyzeOptions extends DocMindAnalyzeOptions {
|
|
|
42
43
|
*/
|
|
43
44
|
type BrowserAnalyzeInput = File | Blob | ArrayBuffer;
|
|
44
45
|
|
|
46
|
+
/** PDF is not processed in the browser; use `@dragon708/docmind-node` on the server. */
|
|
47
|
+
declare const BROWSER_PDF_UNSUPPORTED_WARNING = "PDF text extraction is not available in the browser runtime; use @dragon708/docmind-node on the server.";
|
|
48
|
+
/** PDF structured extraction is not available in the browser; use the Node PDF helpers. */
|
|
49
|
+
declare const BROWSER_PDF_STRUCTURED_UNSUPPORTED_WARNING = "PDF text extraction is not available in the browser runtime; use @dragon708/docmind-node on the server. Structured document extraction for PDF is not available in the browser; use @dragon708/docmind-node (e.g. extractStructuredDataFromPdf).";
|
|
50
|
+
|
|
45
51
|
/**
|
|
46
52
|
* Browser `analyzeFile` router. Package-level scope and limitations are documented on the package entry (`index.ts`).
|
|
47
53
|
*/
|
|
48
54
|
|
|
49
|
-
/** PDF is not processed in the browser; use `@dragon708/docmind-node` on the server. */
|
|
50
|
-
declare const BROWSER_PDF_UNSUPPORTED_WARNING = "PDF text extraction is not available in the browser runtime; use @dragon708/docmind-node on the server.";
|
|
51
55
|
/**
|
|
52
56
|
* Full-document router: DOCX (text + HTML), images (OCR subject to {@link BrowserOcrOptions.mode}),
|
|
53
57
|
* plain text (UTF-8 decode). PDF yields `not_implemented` with {@link BROWSER_PDF_UNSUPPORTED_WARNING}.
|
|
58
|
+
*
|
|
59
|
+
* When `structuredOutput` is true or `output` includes `"structured"`, successful results may include
|
|
60
|
+
* optional `structured` (see `@dragon708/docmind-shared` DocMindV2Extensions).
|
|
54
61
|
*/
|
|
55
62
|
declare function analyzeFile(input: BrowserAnalyzeInput, options?: BrowserAnalyzeOptions): Promise<AnalysisResult>;
|
|
56
63
|
|
|
@@ -77,8 +84,27 @@ declare function convertToHtml(input: BrowserAnalyzeInput, options?: BrowserAnal
|
|
|
77
84
|
*/
|
|
78
85
|
declare function runOcr(input: BrowserAnalyzeInput, options?: BrowserAnalyzeOptions): Promise<AnalysisResult>;
|
|
79
86
|
|
|
87
|
+
/**
|
|
88
|
+
* Browser-safe structured extraction: DOCX and images delegate to `@dragon708/docmind-docx` /
|
|
89
|
+
* `@dragon708/docmind-ocr`; plain text is normalized via `@dragon708/docmind-shared`.
|
|
90
|
+
* PDF is not supported in this runtime (clear warnings, no PDF package import).
|
|
91
|
+
*/
|
|
92
|
+
|
|
93
|
+
/** Options for {@link extractStructuredData}: same as {@link BrowserAnalyzeOptions} plus shared normalize knobs. */
|
|
94
|
+
type BrowserExtractStructuredDataOptions = BrowserAnalyzeOptions & {
|
|
95
|
+
readonly normalize?: NormalizeStructuredOptions;
|
|
96
|
+
};
|
|
97
|
+
/**
|
|
98
|
+
* Returns a {@link StructuredDocumentResult} for inputs the browser runtime actually supports:
|
|
99
|
+
* **DOCX** (`extractStructuredDataFromDocx`), **images** (`extractStructuredDataFromImage` when OCR is not off),
|
|
100
|
+
* **plain text** (UTF-8 decode + `normalizeToStructuredResult`).
|
|
101
|
+
*
|
|
102
|
+
* **PDF** yields an empty structured envelope plus `BROWSER_PDF_STRUCTURED_UNSUPPORTED_WARNING` from the facade (no PDF parser in this runtime).
|
|
103
|
+
*/
|
|
104
|
+
declare function extractStructuredData(input: BrowserAnalyzeInput, options?: BrowserExtractStructuredDataOptions): Promise<StructuredDocumentResult>;
|
|
105
|
+
|
|
80
106
|
/** High-level features the user can ask DocMind for (per input kind and runtime). */
|
|
81
|
-
type PublicCapabilityId = "text" | "metadata" | "html" | "ocr" | "pages" | "image-normalization" | "gif-first-frame" | "bmp" | "heic" | "tiff";
|
|
107
|
+
type PublicCapabilityId = "text" | "metadata" | "html" | "ocr" | "pages" | "structured-output" | "image-normalization" | "gif-first-frame" | "bmp" | "heic" | "tiff";
|
|
82
108
|
declare function docxIncludeRequested(flags?: AnalyzeDocxIncludeFlags): boolean;
|
|
83
109
|
/** DOCX `word/media` en runtime browser (JSZip; sin pipeline Node). */
|
|
84
110
|
interface DocxEmbeddedImageCapabilities {
|
|
@@ -151,18 +177,18 @@ interface ExplainAnalysisPlanReport {
|
|
|
151
177
|
}
|
|
152
178
|
|
|
153
179
|
/** Options for {@link explainAnalysisPlan}: shared fields plus optional `ocr` / `docx` for accurate step preview. */
|
|
154
|
-
type BrowserExplainAnalysisPlanOptions = ExplainAnalysisPlanOptions & Pick<BrowserAnalyzeOptions, "ocr" | "docx">;
|
|
180
|
+
type BrowserExplainAnalysisPlanOptions = ExplainAnalysisPlanOptions & Pick<BrowserAnalyzeOptions, "ocr" | "docx" | "structuredOutput" | "output">;
|
|
155
181
|
|
|
156
182
|
/**
|
|
157
183
|
* Epic 1 — **Capabilities:** detects kind from the same hints as `analyzeFile`, then lists which of
|
|
158
|
-
* `text` | `metadata` | `html` | `ocr` | `pages` and image-specific ids (`image-normalization`, `bmp`, `gif-first-frame`, `heic`, `tiff`) apply in the browser (PDF always unsupported).
|
|
184
|
+
* `text` | `metadata` | `html` | `ocr` | `pages` | `structured-output` and image-specific ids (`image-normalization`, `bmp`, `gif-first-frame`, `heic`, `tiff`) apply in the browser (PDF always unsupported).
|
|
159
185
|
* No Mammoth/Tesseract/PDF parsing. For DOCX, {@link GetCapabilitiesReport.docxStructure} / `docxEmbeddedImages` describe v2 opt-in features.
|
|
160
186
|
*/
|
|
161
187
|
declare function getCapabilities(input: BrowserAnalyzeInput, options?: GetCapabilitiesOptions): Promise<GetCapabilitiesReport>;
|
|
162
188
|
/**
|
|
163
189
|
* Epic 1 — **Plan preview:** structured explanation (analyzer, native extraction vs OCR, `limitations`, `plan.steps`)
|
|
164
|
-
* for a {@link DocMindPublicIntent}. Optional `ocr` refines image steps; optional `docx.include` adds planned OOXML parallel steps for DOCX. No heavy I/O.
|
|
190
|
+
* for a {@link DocMindPublicIntent} (including `extractStructuredData`). Optional `ocr` refines image steps; optional `docx.include` adds planned OOXML parallel steps for DOCX. No heavy I/O.
|
|
165
191
|
*/
|
|
166
192
|
declare function explainAnalysisPlan(input: BrowserAnalyzeInput, options?: BrowserExplainAnalysisPlanOptions): Promise<ExplainAnalysisPlanReport>;
|
|
167
193
|
|
|
168
|
-
export { BROWSER_PDF_UNSUPPORTED_WARNING, type BrowserAnalyzeDocxOptionsSlice, type BrowserAnalyzeInput, type BrowserAnalyzeOptions, type BrowserExplainAnalysisPlanOptions, type BrowserOcrMode, type BrowserOcrOptions, DOCX_EMBEDDED_IMAGE_CAPABILITIES_BROWSER, DOCX_STRUCTURE_CAPABILITIES_BROWSER, type DocxEmbeddedImageCapabilities, type DocxStructuralCapabilities, type ExplainAnalysisPlanReport, type GetCapabilitiesReport, type NativeExtractionPlan, type OcrPlan, type PublicCapabilityId, type PublicCapabilitySupport, analyzeFile, convertToHtml, docxIncludeRequested, explainAnalysisPlan, extractMetadata, extractText, getCapabilities, runOcr };
|
|
194
|
+
export { BROWSER_PDF_STRUCTURED_UNSUPPORTED_WARNING, BROWSER_PDF_UNSUPPORTED_WARNING, type BrowserAnalyzeDocxOptionsSlice, type BrowserAnalyzeInput, type BrowserAnalyzeOptions, type BrowserExplainAnalysisPlanOptions, type BrowserExtractStructuredDataOptions, type BrowserOcrMode, type BrowserOcrOptions, DOCX_EMBEDDED_IMAGE_CAPABILITIES_BROWSER, DOCX_STRUCTURE_CAPABILITIES_BROWSER, type DocxEmbeddedImageCapabilities, type DocxStructuralCapabilities, type ExplainAnalysisPlanReport, type GetCapabilitiesReport, type NativeExtractionPlan, type OcrPlan, type PublicCapabilityId, type PublicCapabilitySupport, analyzeFile, convertToHtml, docxIncludeRequested, explainAnalysisPlan, extractMetadata, extractStructuredData, extractText, getCapabilities, runOcr };
|
package/dist/index.js
CHANGED
|
@@ -1,7 +1,9 @@
|
|
|
1
|
-
import {
|
|
2
|
-
export { detectFileKind } from '@dragon708/docmind-shared';
|
|
3
|
-
import { analyzeDocx } from '@dragon708/docmind-docx';
|
|
4
|
-
|
|
1
|
+
import { detectFileKind, normalizeToStructuredResult, UNKNOWN_FORMAT_WARNING, analyzeText, toUint8Array, assertValidAnalyzeFileInput, notImplementedResult, analyzeFileRequestsStructured, InvalidInputError, getMimeType } from '@dragon708/docmind-shared';
|
|
2
|
+
export { analyzeFileRequestsStructured, detectFileKind, isStructuredDocumentResult } from '@dragon708/docmind-shared';
|
|
3
|
+
import { extractStructuredDataFromDocx, analyzeDocx } from '@dragon708/docmind-docx';
|
|
4
|
+
export { extractStructuredDataFromDocx } from '@dragon708/docmind-docx';
|
|
5
|
+
import { extractStructuredDataFromImage, preprocessHasEffect, resolveImageFormat, normalizeImageForOcr, ocrTiff, ocrImageDetailed } from '@dragon708/docmind-ocr';
|
|
6
|
+
export { extractStructuredDataFromImage } from '@dragon708/docmind-ocr';
|
|
5
7
|
|
|
6
8
|
// src/analyzeFile.ts
|
|
7
9
|
function assertBrowserInput(input) {
|
|
@@ -223,8 +225,125 @@ async function analyzeImageForBrowser(input, options) {
|
|
|
223
225
|
return base;
|
|
224
226
|
}
|
|
225
227
|
|
|
226
|
-
// src/
|
|
228
|
+
// src/browserPdfWarnings.ts
|
|
227
229
|
var BROWSER_PDF_UNSUPPORTED_WARNING = "PDF text extraction is not available in the browser runtime; use @dragon708/docmind-node on the server.";
|
|
230
|
+
var BROWSER_PDF_STRUCTURED_UNSUPPORTED_WARNING = `${BROWSER_PDF_UNSUPPORTED_WARNING} Structured document extraction for PDF is not available in the browser; use @dragon708/docmind-node (e.g. extractStructuredDataFromPdf).`;
|
|
231
|
+
var OCR_OFF_STRUCTURED_WARNING = 'Structured image extraction uses OCR; ocr.mode is "off". Use "auto" or "force" to run Tesseract.';
|
|
232
|
+
function resolveOcrMode2(ocr) {
|
|
233
|
+
return ocr?.mode ?? "auto";
|
|
234
|
+
}
|
|
235
|
+
async function extractStructuredData(input, options) {
|
|
236
|
+
throwIfAborted(options?.signal);
|
|
237
|
+
prepareBrowserAnalyzeInput(input);
|
|
238
|
+
const kind = detectFileKind(input);
|
|
239
|
+
const bytesInput = input;
|
|
240
|
+
const norm = options?.normalize;
|
|
241
|
+
switch (kind) {
|
|
242
|
+
case "pdf":
|
|
243
|
+
return normalizeToStructuredResult(
|
|
244
|
+
{
|
|
245
|
+
kind: "pdf",
|
|
246
|
+
text: "",
|
|
247
|
+
blocks: [],
|
|
248
|
+
warnings: [BROWSER_PDF_STRUCTURED_UNSUPPORTED_WARNING],
|
|
249
|
+
metadata: {
|
|
250
|
+
extra: { browserStructuredPdfUnsupported: true }
|
|
251
|
+
}
|
|
252
|
+
},
|
|
253
|
+
norm
|
|
254
|
+
);
|
|
255
|
+
case "docx": {
|
|
256
|
+
const data = await toUint8Array(bytesInput);
|
|
257
|
+
if (data.byteLength === 0) {
|
|
258
|
+
return normalizeToStructuredResult(
|
|
259
|
+
{
|
|
260
|
+
kind: "docx",
|
|
261
|
+
text: "",
|
|
262
|
+
blocks: [],
|
|
263
|
+
warnings: ["No document bytes were provided for analysis."]
|
|
264
|
+
},
|
|
265
|
+
norm
|
|
266
|
+
);
|
|
267
|
+
}
|
|
268
|
+
const docxOpts = analyzeDocxOptionsFromBrowser(options);
|
|
269
|
+
return extractStructuredDataFromDocx(data, {
|
|
270
|
+
...docxOpts ?? {},
|
|
271
|
+
normalize: norm
|
|
272
|
+
});
|
|
273
|
+
}
|
|
274
|
+
case "image": {
|
|
275
|
+
const data = await toUint8Array(bytesInput);
|
|
276
|
+
if (data.byteLength === 0) {
|
|
277
|
+
return normalizeToStructuredResult(
|
|
278
|
+
{
|
|
279
|
+
kind: "image",
|
|
280
|
+
text: "",
|
|
281
|
+
blocks: [],
|
|
282
|
+
tables: [],
|
|
283
|
+
pages: [],
|
|
284
|
+
images: [],
|
|
285
|
+
warnings: ["No image bytes were provided for analysis."]
|
|
286
|
+
},
|
|
287
|
+
norm
|
|
288
|
+
);
|
|
289
|
+
}
|
|
290
|
+
if (resolveOcrMode2(options?.ocr) === "off") {
|
|
291
|
+
return normalizeToStructuredResult(
|
|
292
|
+
{
|
|
293
|
+
kind: "image",
|
|
294
|
+
text: "",
|
|
295
|
+
blocks: [],
|
|
296
|
+
tables: [],
|
|
297
|
+
pages: [],
|
|
298
|
+
images: [],
|
|
299
|
+
warnings: [OCR_OFF_STRUCTURED_WARNING],
|
|
300
|
+
metadata: {
|
|
301
|
+
extra: { ocrSkipped: true }
|
|
302
|
+
}
|
|
303
|
+
},
|
|
304
|
+
norm
|
|
305
|
+
);
|
|
306
|
+
}
|
|
307
|
+
const ocr = options?.ocr;
|
|
308
|
+
return extractStructuredDataFromImage(input, {
|
|
309
|
+
signal: options?.signal,
|
|
310
|
+
normalize: norm,
|
|
311
|
+
langs: ocr?.langs,
|
|
312
|
+
preprocess: ocr?.preprocess,
|
|
313
|
+
maxPages: ocr?.maxPages,
|
|
314
|
+
pageSeparator: ocr?.pageSeparator
|
|
315
|
+
});
|
|
316
|
+
}
|
|
317
|
+
case "text": {
|
|
318
|
+
const textResult = await analyzeText(bytesInput, { signal: options?.signal });
|
|
319
|
+
const blocks = textResult.text.length > 0 ? [{ type: "paragraph", id: "p-0", text: textResult.text }] : [];
|
|
320
|
+
return normalizeToStructuredResult(
|
|
321
|
+
{
|
|
322
|
+
kind: "text",
|
|
323
|
+
text: textResult.text,
|
|
324
|
+
blocks,
|
|
325
|
+
warnings: [...textResult.warnings],
|
|
326
|
+
metadata: {
|
|
327
|
+
extra: { plainTextUtf8: true }
|
|
328
|
+
}
|
|
329
|
+
},
|
|
330
|
+
norm
|
|
331
|
+
);
|
|
332
|
+
}
|
|
333
|
+
default:
|
|
334
|
+
return normalizeToStructuredResult(
|
|
335
|
+
{
|
|
336
|
+
kind: "unknown",
|
|
337
|
+
text: "",
|
|
338
|
+
blocks: [],
|
|
339
|
+
warnings: [UNKNOWN_FORMAT_WARNING]
|
|
340
|
+
},
|
|
341
|
+
norm
|
|
342
|
+
);
|
|
343
|
+
}
|
|
344
|
+
}
|
|
345
|
+
|
|
346
|
+
// src/analyzeFile.ts
|
|
228
347
|
async function analyzeFile(input, options) {
|
|
229
348
|
if (options?.signal?.aborted) {
|
|
230
349
|
const err = new Error("The operation was aborted");
|
|
@@ -235,17 +354,38 @@ async function analyzeFile(input, options) {
|
|
|
235
354
|
assertValidAnalyzeFileInput(input);
|
|
236
355
|
const fileKind = detectFileKind(input);
|
|
237
356
|
const bytesInput = input;
|
|
357
|
+
let result;
|
|
238
358
|
switch (fileKind) {
|
|
239
359
|
case "pdf":
|
|
240
|
-
|
|
360
|
+
result = notImplementedResult("pdf", "pdf", [BROWSER_PDF_UNSUPPORTED_WARNING]);
|
|
361
|
+
break;
|
|
241
362
|
case "docx":
|
|
242
|
-
|
|
363
|
+
result = await analyzeDocxForBrowser(bytesInput, options);
|
|
364
|
+
break;
|
|
243
365
|
case "image":
|
|
244
|
-
|
|
366
|
+
result = await analyzeImageForBrowser(bytesInput, options);
|
|
367
|
+
break;
|
|
245
368
|
case "text":
|
|
246
|
-
|
|
369
|
+
result = await analyzeText(bytesInput, { signal: options?.signal });
|
|
370
|
+
break;
|
|
247
371
|
default:
|
|
248
|
-
|
|
372
|
+
result = notImplementedResult(fileKind, "none", [UNKNOWN_FORMAT_WARNING]);
|
|
373
|
+
}
|
|
374
|
+
if (!analyzeFileRequestsStructured(options) || result.status !== "ok") {
|
|
375
|
+
return result;
|
|
376
|
+
}
|
|
377
|
+
try {
|
|
378
|
+
const structured = await extractStructuredData(input, {
|
|
379
|
+
signal: options?.signal,
|
|
380
|
+
ocr: options?.ocr,
|
|
381
|
+
docx: options?.docx
|
|
382
|
+
});
|
|
383
|
+
return { ...result, structured };
|
|
384
|
+
} catch (e) {
|
|
385
|
+
if (e instanceof Error && e.name === "AbortError") throw e;
|
|
386
|
+
const msg = e instanceof Error ? e.message : String(e);
|
|
387
|
+
const prev = "warnings" in result && Array.isArray(result.warnings) ? [...result.warnings] : [];
|
|
388
|
+
return { ...result, warnings: [...prev, `warning: analyzeFile structured merge failed: ${msg}`] };
|
|
249
389
|
}
|
|
250
390
|
}
|
|
251
391
|
var DOCX_METADATA_STUB = "Structured document metadata for DOCX is not returned by extractMetadata; use analyzeFile, extractText, or convertToHtml with options.docx.include for OOXML structure, headings, tables, blocks, approximate pages, and embedded images.";
|
|
@@ -415,6 +555,7 @@ var IMAGE_META = "Raster images have no document metadata bundle; extractMetadat
|
|
|
415
555
|
var IMAGE_HTML = "No layout HTML for raster images; use extractText or runOcr for text.";
|
|
416
556
|
var TEXT_META_NOTE = "Plain text has no structured document metadata; extractMetadata still returns decoded content.";
|
|
417
557
|
var OCR_OFF_NOTE = 'Image OCR may be skipped when `ocr.mode` is "off" in analyze options.';
|
|
558
|
+
var STRUCTURED_OCR_OFF = 'Structured image output uses OCR; when `ocr.mode` is "off", `extractStructuredData` returns an empty envelope with a warning.';
|
|
418
559
|
var UNKNOWN_KIND = "Could not determine file kind from name, MIME, or bytes; all features are reported as unsupported until the kind is known.";
|
|
419
560
|
function slot(id, supported, warnings) {
|
|
420
561
|
return warnings?.length ? { id, supported, warnings } : { id, supported };
|
|
@@ -431,7 +572,8 @@ function buildBrowserCapabilityReport(kind) {
|
|
|
431
572
|
slot("metadata", false, [pdf]),
|
|
432
573
|
slot("html", false, [pdf]),
|
|
433
574
|
slot("ocr", false, [pdf]),
|
|
434
|
-
slot("pages", false, [pdf])
|
|
575
|
+
slot("pages", false, [pdf]),
|
|
576
|
+
slot("structured-output", false, [BROWSER_PDF_STRUCTURED_UNSUPPORTED_WARNING])
|
|
435
577
|
];
|
|
436
578
|
break;
|
|
437
579
|
case "docx":
|
|
@@ -448,6 +590,9 @@ function buildBrowserCapabilityReport(kind) {
|
|
|
448
590
|
slot("ocr", false, ["OCR does not apply to DOCX in DocMind."]),
|
|
449
591
|
slot("pages", false, [
|
|
450
592
|
"No PDF page count; approximate DOCX pages via options.docx.include.pagesApprox (heuristic, not print layout)."
|
|
593
|
+
]),
|
|
594
|
+
slot("structured-output", true, [
|
|
595
|
+
"`extractStructuredData` uses `@dragon708/docmind-docx` (Mammoth + OOXML) and returns `StructuredDocumentResult`; optional `options.docx` slices are forwarded."
|
|
451
596
|
])
|
|
452
597
|
];
|
|
453
598
|
break;
|
|
@@ -479,6 +624,11 @@ function buildBrowserCapabilityReport(kind) {
|
|
|
479
624
|
]),
|
|
480
625
|
slot("pages", true, [
|
|
481
626
|
"Multipage TIFF may populate `pages` and `textByPage` when OCR succeeds; other formats may expose `pages` when normalization reports it."
|
|
627
|
+
]),
|
|
628
|
+
slot("structured-output", true, [
|
|
629
|
+
"`extractStructuredData` uses `extractStructuredDataFromImage` (same OCR path as analyzeFile when `ocr.mode` is not off).",
|
|
630
|
+
STRUCTURED_OCR_OFF,
|
|
631
|
+
"HEIC/HEIF and TIFF limitations match `getCapabilities` (`heic`, `tiff`) and OCR warnings."
|
|
482
632
|
])
|
|
483
633
|
];
|
|
484
634
|
break;
|
|
@@ -488,7 +638,10 @@ function buildBrowserCapabilityReport(kind) {
|
|
|
488
638
|
slot("metadata", true, [TEXT_META_NOTE]),
|
|
489
639
|
slot("html", true),
|
|
490
640
|
slot("ocr", false, ["OCR does not apply to plain text files."]),
|
|
491
|
-
slot("pages", false)
|
|
641
|
+
slot("pages", false),
|
|
642
|
+
slot("structured-output", true, [
|
|
643
|
+
"`extractStructuredData` decodes UTF-8 (via `analyzeText`) and normalizes to `StructuredDocumentResult` (paragraph block rollup)."
|
|
644
|
+
])
|
|
492
645
|
];
|
|
493
646
|
break;
|
|
494
647
|
default:
|
|
@@ -498,7 +651,8 @@ function buildBrowserCapabilityReport(kind) {
|
|
|
498
651
|
slot("metadata", false),
|
|
499
652
|
slot("html", false),
|
|
500
653
|
slot("ocr", false),
|
|
501
|
-
slot("pages", false)
|
|
654
|
+
slot("pages", false),
|
|
655
|
+
slot("structured-output", false, [UNKNOWN_KIND])
|
|
502
656
|
];
|
|
503
657
|
}
|
|
504
658
|
return {
|
|
@@ -541,7 +695,10 @@ function buildBrowserExplainReport(kind, intent, ocrMode, plan, docxInclude, ocr
|
|
|
541
695
|
let limitations = [];
|
|
542
696
|
const ocrOffNote = ocrMode === "off" ? 'Image OCR is skipped when ocr.mode is "off".' : "";
|
|
543
697
|
if (kind === "pdf") {
|
|
544
|
-
limitations = lim(
|
|
698
|
+
limitations = lim(
|
|
699
|
+
BROWSER_PDF_UNSUPPORTED_WARNING,
|
|
700
|
+
intent === "extractStructuredData" ? "`extractStructuredData` only returns an empty `StructuredDocumentResult` with warnings for PDF in-browser; use @dragon708/docmind-node for real PDF structured extraction." : ""
|
|
701
|
+
);
|
|
545
702
|
nativeExtraction = {
|
|
546
703
|
willAttempt: false,
|
|
547
704
|
description: "PDF is not processed in the browser runtime; use @dragon708/docmind-node."
|
|
@@ -565,7 +722,8 @@ function buildBrowserExplainReport(kind, intent, ocrMode, plan, docxInclude, ocr
|
|
|
565
722
|
}
|
|
566
723
|
if (kind === "unknown") {
|
|
567
724
|
limitations = lim(
|
|
568
|
-
"Could not classify the file from name, MIME, or bytes; analysis will return not_implemented until hints improve."
|
|
725
|
+
"Could not classify the file from name, MIME, or bytes; analysis will return not_implemented until hints improve.",
|
|
726
|
+
intent === "extractStructuredData" ? "`extractStructuredData` needs a known kind (text, DOCX, or image) to produce structured output." : ""
|
|
569
727
|
);
|
|
570
728
|
nativeExtraction = { willAttempt: false, description: "No analyzer selected without a known file kind." };
|
|
571
729
|
ocr = { mayUse: false, description: "OCR is not used for unknown kinds." };
|
|
@@ -677,6 +835,32 @@ function buildBrowserExplainReport(kind, intent, ocrMode, plan, docxInclude, ocr
|
|
|
677
835
|
ocr = { mayUse: false, description: "OCR does not apply to text files." };
|
|
678
836
|
}
|
|
679
837
|
break;
|
|
838
|
+
case "extractStructuredData":
|
|
839
|
+
if (kind === "docx") {
|
|
840
|
+
nativeExtraction = {
|
|
841
|
+
willAttempt: true,
|
|
842
|
+
description: "`extractStructuredDataFromDocx`: Mammoth plus required OOXML extractors (structure, headings, tables, blocks, pagesApprox, embeddedImages unless disabled), then `normalizeToStructuredResult`. Optional `options.docx` is forwarded."
|
|
843
|
+
};
|
|
844
|
+
ocr = { mayUse: false, description: "DOCX structured path does not use OCR." };
|
|
845
|
+
limitations = lim(DOCX_ZIP_NOTE_BROWSER);
|
|
846
|
+
} else if (kind === "image") {
|
|
847
|
+
nativeExtraction = {
|
|
848
|
+
willAttempt: false,
|
|
849
|
+
description: BROWSER_IMAGE_PIPELINE
|
|
850
|
+
};
|
|
851
|
+
ocr = {
|
|
852
|
+
mayUse: imageOcrActive,
|
|
853
|
+
description: imageOcrActive ? "`extractStructuredDataFromImage` mirrors the OCR pipeline (normalize \u2192 optional preprocess \u2192 Tesseract, or `ocrTiff` for TIFF)." : "OCR skipped while ocr.mode is off; structured output will be empty with a warning."
|
|
854
|
+
};
|
|
855
|
+
limitations = lim(ocrOffNote, BROWSER_TIFF_RASTER_NOTE, BROWSER_HEIC_NOTE);
|
|
856
|
+
} else {
|
|
857
|
+
nativeExtraction = {
|
|
858
|
+
willAttempt: true,
|
|
859
|
+
description: "UTF-8 decode via `analyzeText`, then `normalizeToStructuredResult` with a paragraph block rollup."
|
|
860
|
+
};
|
|
861
|
+
ocr = { mayUse: false, description: "OCR does not apply to text files." };
|
|
862
|
+
}
|
|
863
|
+
break;
|
|
680
864
|
default:
|
|
681
865
|
nativeExtraction = { willAttempt: false, description: "Intent not specialized in this runtime." };
|
|
682
866
|
ocr = { mayUse: false, description: "See plan steps." };
|
|
@@ -701,7 +885,7 @@ function buildBrowserExplainReport(kind, intent, ocrMode, plan, docxInclude, ocr
|
|
|
701
885
|
}
|
|
702
886
|
|
|
703
887
|
// src/introspection.ts
|
|
704
|
-
function
|
|
888
|
+
function resolveOcrMode3(ocr) {
|
|
705
889
|
return ocr?.mode ?? "auto";
|
|
706
890
|
}
|
|
707
891
|
function imageBrowserPlanSteps(ocrMode, ocr) {
|
|
@@ -766,9 +950,76 @@ function planForAnalyzeFile(kind, ocrMode, docxInclude, ocr) {
|
|
|
766
950
|
};
|
|
767
951
|
}
|
|
768
952
|
}
|
|
769
|
-
function planForIntent(intentOpt, kind, ocrMode, docxInclude, ocr) {
|
|
953
|
+
function planForIntent(intentOpt, kind, ocrMode, docxInclude, ocr, analyzeFileOutput) {
|
|
770
954
|
const intent = intentOpt ?? "analyzeFile";
|
|
771
|
-
if (intent === "
|
|
955
|
+
if (intent === "extractStructuredData") {
|
|
956
|
+
switch (kind) {
|
|
957
|
+
case "pdf":
|
|
958
|
+
return {
|
|
959
|
+
intent: "extractStructuredData",
|
|
960
|
+
steps: [
|
|
961
|
+
{ id: "detect_kind", status: "done" },
|
|
962
|
+
{ id: "pdf_pipeline", status: "skipped" },
|
|
963
|
+
{ id: "structured_output", status: "skipped" }
|
|
964
|
+
]
|
|
965
|
+
};
|
|
966
|
+
case "docx":
|
|
967
|
+
return {
|
|
968
|
+
intent: "extractStructuredData",
|
|
969
|
+
steps: [
|
|
970
|
+
{ id: "detect_kind", status: "done" },
|
|
971
|
+
{ id: "docx_mammoth", status: "planned" },
|
|
972
|
+
{ id: "docx_ooxml_parallel", status: "planned" },
|
|
973
|
+
{ id: "structured_normalize", status: "planned" }
|
|
974
|
+
]
|
|
975
|
+
};
|
|
976
|
+
case "image": {
|
|
977
|
+
if (ocrMode === "off") {
|
|
978
|
+
return {
|
|
979
|
+
intent: "extractStructuredData",
|
|
980
|
+
steps: [
|
|
981
|
+
{ id: "detect_kind", status: "done" },
|
|
982
|
+
{ id: "image_format_detect", status: "skipped" },
|
|
983
|
+
{ id: "normalize_image_for_ocr", status: "skipped" },
|
|
984
|
+
{ id: "preprocess_image_for_ocr", status: "skipped" },
|
|
985
|
+
{ id: "tesseract_ocr", status: "skipped" },
|
|
986
|
+
{ id: "structured_normalize", status: "skipped" }
|
|
987
|
+
]
|
|
988
|
+
};
|
|
989
|
+
}
|
|
990
|
+
const imgSteps = imageBrowserPlanSteps(ocrMode, ocr);
|
|
991
|
+
return {
|
|
992
|
+
intent: "extractStructuredData",
|
|
993
|
+
steps: [...imgSteps, { id: "structured_normalize", status: "planned" }]
|
|
994
|
+
};
|
|
995
|
+
}
|
|
996
|
+
case "text":
|
|
997
|
+
return {
|
|
998
|
+
intent: "extractStructuredData",
|
|
999
|
+
steps: [
|
|
1000
|
+
{ id: "detect_kind", status: "done" },
|
|
1001
|
+
{ id: "utf8_decode", status: "planned" },
|
|
1002
|
+
{ id: "structured_normalize", status: "planned" }
|
|
1003
|
+
]
|
|
1004
|
+
};
|
|
1005
|
+
default:
|
|
1006
|
+
return {
|
|
1007
|
+
intent: "extractStructuredData",
|
|
1008
|
+
steps: [
|
|
1009
|
+
{ id: "detect_kind", status: "done" },
|
|
1010
|
+
{ id: "route", status: "failed" }
|
|
1011
|
+
]
|
|
1012
|
+
};
|
|
1013
|
+
}
|
|
1014
|
+
}
|
|
1015
|
+
if (intent === "analyzeFile") {
|
|
1016
|
+
const base = planForAnalyzeFile(kind, ocrMode, docxInclude, ocr);
|
|
1017
|
+
if (!analyzeFileRequestsStructured(analyzeFileOutput)) return base;
|
|
1018
|
+
return {
|
|
1019
|
+
...base,
|
|
1020
|
+
steps: [...base.steps ?? [], { id: "structured_merge", status: "planned" }]
|
|
1021
|
+
};
|
|
1022
|
+
}
|
|
772
1023
|
if (intent === "extractText") {
|
|
773
1024
|
const base = planForAnalyzeFile(kind, ocrMode, docxInclude, ocr);
|
|
774
1025
|
return { ...base, intent: "extractText" };
|
|
@@ -860,13 +1111,16 @@ async function explainAnalysisPlan(input, options) {
|
|
|
860
1111
|
prepareBrowserAnalyzeInput(input);
|
|
861
1112
|
const kind = detectFileKind(input);
|
|
862
1113
|
const intent = options?.intent ?? "analyzeFile";
|
|
863
|
-
const ocrMode =
|
|
1114
|
+
const ocrMode = resolveOcrMode3(options?.ocr);
|
|
864
1115
|
const docxInc = options?.docx?.include;
|
|
865
1116
|
const ocrSlice = options?.ocr;
|
|
866
|
-
const plan = planForIntent(intent, kind, ocrMode, docxInc, ocrSlice
|
|
1117
|
+
const plan = planForIntent(intent, kind, ocrMode, docxInc, ocrSlice, {
|
|
1118
|
+
structuredOutput: options?.structuredOutput,
|
|
1119
|
+
output: options?.output
|
|
1120
|
+
});
|
|
867
1121
|
return buildBrowserExplainReport(kind, intent, ocrMode, plan, docxInc, ocrSlice);
|
|
868
1122
|
}
|
|
869
1123
|
|
|
870
|
-
export { BROWSER_PDF_UNSUPPORTED_WARNING, DOCX_EMBEDDED_IMAGE_CAPABILITIES_BROWSER, DOCX_STRUCTURE_CAPABILITIES_BROWSER, analyzeFile, convertToHtml, docxIncludeRequested, explainAnalysisPlan, extractMetadata, extractText, getCapabilities, runOcr };
|
|
1124
|
+
export { BROWSER_PDF_STRUCTURED_UNSUPPORTED_WARNING, BROWSER_PDF_UNSUPPORTED_WARNING, DOCX_EMBEDDED_IMAGE_CAPABILITIES_BROWSER, DOCX_STRUCTURE_CAPABILITIES_BROWSER, analyzeFile, convertToHtml, docxIncludeRequested, explainAnalysisPlan, extractMetadata, extractStructuredData, extractText, getCapabilities, runOcr };
|
|
871
1125
|
//# sourceMappingURL=index.js.map
|
|
872
1126
|
//# sourceMappingURL=index.js.map
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@dragon708/docmind-browser",
|
|
3
|
-
"version": "1.
|
|
3
|
+
"version": "1.6.0",
|
|
4
4
|
"description": "Official DocMind browser facade: analyzeFile and intent APIs (DOCX, image OCR, text). PDF and fs paths use @dragon708/docmind-node.",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"sideEffects": false,
|
|
@@ -33,9 +33,9 @@
|
|
|
33
33
|
],
|
|
34
34
|
"license": "MIT",
|
|
35
35
|
"dependencies": {
|
|
36
|
-
"@dragon708/docmind-docx": "^1.
|
|
37
|
-
"@dragon708/docmind-ocr": "^1.1.
|
|
38
|
-
"@dragon708/docmind-shared": "^1.
|
|
36
|
+
"@dragon708/docmind-docx": "^1.8.0",
|
|
37
|
+
"@dragon708/docmind-ocr": "^1.1.4",
|
|
38
|
+
"@dragon708/docmind-shared": "^1.2.0"
|
|
39
39
|
},
|
|
40
40
|
"devDependencies": {
|
|
41
41
|
"@types/node": "^20.19.37",
|