@dragon708/docmind-node 1.9.1 → 1.10.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.d.ts +32 -12
- package/dist/index.js +218 -27
- package/package.json +5 -5
package/dist/index.d.ts
CHANGED
|
@@ -1,9 +1,11 @@
|
|
|
1
|
-
import { DocMindAnalyzeOptions, DetectFileKindInput, NamedInput, AnalysisResult, FileKind, RuntimeDescriptor, DocMindPublicIntent, AnalysisAnalyzer, ProcessingPlanDescriptor, ExplainAnalysisPlanOptions, GetCapabilitiesOptions } from '@dragon708/docmind-shared';
|
|
2
|
-
export { AnalysisAnalyzer, AnalysisResult, CapabilityDescriptor, DetectFileKindInput, DocMindPublicIntent, DocxAnalysisCoreResult, ExplainAnalysisPlanOptions, ExplainAnalysisPlanResult, FileKind, FileKindMetadata, GenericAnalysisResult, GetCapabilitiesOptions, GetCapabilitiesResult, ImageAnalysisCoreResult, PdfAnalysisCoreResult, TextAnalysisResult, detectFileKind } from '@dragon708/docmind-shared';
|
|
3
|
-
import { OcrOptions, OcrTiffOptions, PreprocessImageOptions } from '@dragon708/docmind-ocr';
|
|
4
|
-
|
|
5
|
-
import { AnalyzeDocxIncludeFlags, DocxToHtmlOptions } from '@dragon708/docmind-docx';
|
|
6
|
-
export { AnalyzeDocxIncludeFlags, DocxEmbeddedImage, DocxEmbeddedImageConversionOptions, DocxEmbeddedImageConversionResult, DocxEmbeddedImageWebSlice, DocxImageExtractionMode, ExtractImagesFromDocxOptions, ExtractImagesFromDocxResult, convertDocxEmbeddedImageToWeb, convertDocxImagesForWeb, docxImageIsBrowserRenderable, docxImageRequiresConversion, docxImageSuggestedTargetFormat, docxImageToDataUri, extractImagesFromDocx, isNodeJsRuntime } from '@dragon708/docmind-docx';
|
|
1
|
+
import { DocMindAnalyzeOptions, AnalyzeFileOutputOptions, NormalizeStructuredOptions, DetectFileKindInput, NamedInput, AnalysisResult, StructuredDocumentResult, FileKind, RuntimeDescriptor, DocMindPublicIntent, AnalysisAnalyzer, ProcessingPlanDescriptor, ExplainAnalysisPlanOptions, GetCapabilitiesOptions } from '@dragon708/docmind-shared';
|
|
2
|
+
export { AnalysisAnalyzer, AnalysisResult, CapabilityDescriptor, DetectFileKindInput, DocMindPublicIntent, DocumentBlock, DocumentImageRef, DocumentPage, DocumentTable, DocxAnalysisCoreResult, ExplainAnalysisPlanOptions, ExplainAnalysisPlanResult, FileKind, FileKindMetadata, GenericAnalysisResult, GetCapabilitiesOptions, GetCapabilitiesResult, ImageAnalysisCoreResult, PdfAnalysisCoreResult, StructuredDocumentResult, TextAnalysisResult, analyzeFileRequestsStructured, detectFileKind, isStructuredDocumentResult } from '@dragon708/docmind-shared';
|
|
3
|
+
import { OcrOptions, OcrTiffOptions, PreprocessImageOptions, ExtractStructuredDataFromImageOptions } from '@dragon708/docmind-ocr';
|
|
4
|
+
export { ExtractStructuredDataFromImageOptions, extractStructuredDataFromImage } from '@dragon708/docmind-ocr';
|
|
5
|
+
import { AnalyzeDocxIncludeFlags, DocxToHtmlOptions, ExtractStructuredDataFromDocxOptions } from '@dragon708/docmind-docx';
|
|
6
|
+
export { AnalyzeDocxIncludeFlags, DocxEmbeddedImage, DocxEmbeddedImageConversionOptions, DocxEmbeddedImageConversionResult, DocxEmbeddedImageWebSlice, DocxImageExtractionMode, ExtractImagesFromDocxOptions, ExtractImagesFromDocxResult, ExtractStructuredDataFromDocxOptions, convertDocxEmbeddedImageToWeb, convertDocxImagesForWeb, docxImageIsBrowserRenderable, docxImageRequiresConversion, docxImageSuggestedTargetFormat, docxImageToDataUri, extractImagesFromDocx, extractStructuredDataFromDocx, isNodeJsRuntime } from '@dragon708/docmind-docx';
|
|
7
|
+
import { PdfAnalyzeOptions, ExtractStructuredDataFromPdfOptions } from '@dragon708/docmind-pdf';
|
|
8
|
+
export { ExtractStructuredDataFromPdfIncludeFlags, ExtractStructuredDataFromPdfOptions, extractStructuredDataFromPdf } from '@dragon708/docmind-pdf';
|
|
7
9
|
|
|
8
10
|
/**
|
|
9
11
|
* Options for Node public APIs (`analyzeFile`, intent methods).
|
|
@@ -24,7 +26,7 @@ interface NodeAnalyzeDocxOptionsSlice {
|
|
|
24
26
|
/** Opciones Mammoth para HTML (p. ej. `convertImage`). */
|
|
25
27
|
readonly html?: DocxToHtmlOptions;
|
|
26
28
|
}
|
|
27
|
-
interface NodeAnalyzeOptions extends DocMindAnalyzeOptions {
|
|
29
|
+
interface NodeAnalyzeOptions extends DocMindAnalyzeOptions, AnalyzeFileOutputOptions {
|
|
28
30
|
readonly pdf?: PdfAnalyzeOptions;
|
|
29
31
|
readonly ocr?: OcrOptions & Pick<OcrTiffOptions, "maxPages" | "pageSeparator"> & {
|
|
30
32
|
readonly preprocess?: PreprocessImageOptions;
|
|
@@ -38,6 +40,16 @@ interface NodeAnalyzeOptions extends DocMindAnalyzeOptions {
|
|
|
38
40
|
*/
|
|
39
41
|
readonly pdfNativeTextSource?: "pdf-parse" | "pdfjs-per-page";
|
|
40
42
|
}
|
|
43
|
+
/**
|
|
44
|
+
* Opciones para {@link extractStructuredData}: reenvío por rama (`pdf` / `docx` / `ocr`) más
|
|
45
|
+
* `normalize` opcional aplicado a texto plano o como respaldo cuando la rama no define `normalize`.
|
|
46
|
+
*/
|
|
47
|
+
interface NodeExtractStructuredDataOptions extends DocMindAnalyzeOptions {
|
|
48
|
+
readonly pdf?: ExtractStructuredDataFromPdfOptions;
|
|
49
|
+
readonly docx?: ExtractStructuredDataFromDocxOptions;
|
|
50
|
+
readonly ocr?: ExtractStructuredDataFromImageOptions;
|
|
51
|
+
readonly normalize?: NormalizeStructuredOptions;
|
|
52
|
+
}
|
|
41
53
|
|
|
42
54
|
/**
|
|
43
55
|
* Inputs accepted by {@link analyzeFile} in this package.
|
|
@@ -85,8 +97,15 @@ declare function convertToHtml(input: NodeAnalyzeInput, options?: NodeAnalyzeOpt
|
|
|
85
97
|
*/
|
|
86
98
|
declare function runOcr(input: NodeAnalyzeInput, options?: NodeAnalyzeOptions): Promise<AnalysisResult>;
|
|
87
99
|
|
|
100
|
+
/**
|
|
101
|
+
* Resuelve entrada Node, clasifica el archivo y devuelve un {@link StructuredDocumentResult} vía
|
|
102
|
+
* el extractor estructurado del paquete correspondiente (PDF, DOCX, imagen OCR) o
|
|
103
|
+
* {@link normalizeToStructuredResult} para texto UTF-8.
|
|
104
|
+
*/
|
|
105
|
+
declare function extractStructuredData(input: NodeAnalyzeInput, options?: NodeExtractStructuredDataOptions): Promise<StructuredDocumentResult>;
|
|
106
|
+
|
|
88
107
|
/** High-level features the user can ask DocMind for (per input kind and runtime). */
|
|
89
|
-
type PublicCapabilityId = "text" | "metadata" | "html" | "ocr" | "pages" | "ocr-multipage" | "image-normalization" | "tiff" | "heic-node-only" | "bmp" | "gif-first-frame";
|
|
108
|
+
type PublicCapabilityId = "text" | "metadata" | "html" | "ocr" | "pages" | "structured-output" | "ocr-multipage" | "image-normalization" | "tiff" | "heic-node-only" | "bmp" | "gif-first-frame";
|
|
90
109
|
/**
|
|
91
110
|
* DOCX-only: what the stack can do with OOXML embedded bitmaps/vector payloads under `word/media`.
|
|
92
111
|
* Present on {@link GetCapabilitiesReport} when `kind === "docx"`.
|
|
@@ -160,7 +179,8 @@ interface ExplainAnalysisPlanReport {
|
|
|
160
179
|
readonly kind: FileKind;
|
|
161
180
|
readonly detectedKind: FileKind;
|
|
162
181
|
readonly runtime: RuntimeDescriptor;
|
|
163
|
-
|
|
182
|
+
/** Incluye intents extendidos en Node (p. ej. `extractStructuredData`). */
|
|
183
|
+
readonly intent: DocMindPublicIntent | string;
|
|
164
184
|
readonly primaryAnalyzer: AnalysisAnalyzer;
|
|
165
185
|
readonly nativeExtraction: NativeExtractionPlan;
|
|
166
186
|
readonly ocr: OcrPlan;
|
|
@@ -174,11 +194,11 @@ interface ExplainAnalysisPlanReport {
|
|
|
174
194
|
}
|
|
175
195
|
|
|
176
196
|
/** Options for {@link explainAnalysisPlan} including PDF/OCR/DOCX hints for accurate planning. */
|
|
177
|
-
type NodeExplainAnalysisPlanOptions = ExplainAnalysisPlanOptions & Pick<NodeAnalyzeOptions, "pdf" | "ocr" | "docx">;
|
|
197
|
+
type NodeExplainAnalysisPlanOptions = ExplainAnalysisPlanOptions & Pick<NodeAnalyzeOptions, "pdf" | "ocr" | "docx" | "structuredOutput" | "output">;
|
|
178
198
|
|
|
179
199
|
/**
|
|
180
200
|
* Epic 1 — **Capabilities:** after resolving {@link NodeAnalyzeInput}, lists which of
|
|
181
|
-
* `text` | `metadata` | `html` | `ocr` | `pages` (and image-specific ids such as `ocr-multipage`, `image-normalization`, `tiff`, `heic-node-only`, `bmp`, `gif-first-frame`) apply for that kind in Node (for PDF, `text` / `metadata` /
|
|
201
|
+
* `text` | `metadata` | `html` | `ocr` | `pages` | `structured-output` (and image-specific ids such as `ocr-multipage`, `image-normalization`, `tiff`, `heic-node-only`, `bmp`, `gif-first-frame`) apply for that kind in Node (for PDF, `text` / `metadata` /
|
|
182
202
|
* `pages` / `ocr` describe the v2 pdf-parse + PDF.js + OCR stack; see {@link buildNodeCapabilityReport}).
|
|
183
203
|
* For **DOCX**, `docxEmbeddedImages` and `docxStructure` describe ZIP media and optional OOXML v2 extractors (`options.docx.include`).
|
|
184
204
|
* Does not run Mammoth/Tesseract/PDF bodies beyond path resolution.
|
|
@@ -190,4 +210,4 @@ declare function getCapabilities(input: NodeAnalyzeInput, options?: GetCapabilit
|
|
|
190
210
|
*/
|
|
191
211
|
declare function explainAnalysisPlan(input: NodeAnalyzeInput, options?: NodeExplainAnalysisPlanOptions): Promise<ExplainAnalysisPlanReport>;
|
|
192
212
|
|
|
193
|
-
export { DOCX_EMBEDDED_IMAGE_CAPABILITIES, DOCX_STRUCTURE_CAPABILITIES, type DocxEmbeddedImageCapabilities, type DocxStructuralCapabilities, type ExplainAnalysisPlanReport, type GetCapabilitiesReport, type NativeExtractionPlan, type NodeAnalyzeDocxOptionsSlice, type NodeAnalyzeInput, type NodeAnalyzeOptions, type NodeExplainAnalysisPlanOptions, type OcrPlan, type PublicCapabilityId, type PublicCapabilitySupport, analyzeFile, bufferToInput, convertToHtml, docxIncludeRequested, explainAnalysisPlan, extractMetadata, extractText, getCapabilities, readFileToInput, resolveNodeAnalyzeInput, runOcr };
|
|
213
|
+
export { DOCX_EMBEDDED_IMAGE_CAPABILITIES, DOCX_STRUCTURE_CAPABILITIES, type DocxEmbeddedImageCapabilities, type DocxStructuralCapabilities, type ExplainAnalysisPlanReport, type GetCapabilitiesReport, type NativeExtractionPlan, type NodeAnalyzeDocxOptionsSlice, type NodeAnalyzeInput, type NodeAnalyzeOptions, type NodeExplainAnalysisPlanOptions, type NodeExtractStructuredDataOptions, type OcrPlan, type PublicCapabilityId, type PublicCapabilitySupport, analyzeFile, bufferToInput, convertToHtml, docxIncludeRequested, explainAnalysisPlan, extractMetadata, extractStructuredData, extractText, getCapabilities, readFileToInput, resolveNodeAnalyzeInput, runOcr };
|
package/dist/index.js
CHANGED
|
@@ -1,9 +1,11 @@
|
|
|
1
|
-
import { assertValidAnalyzeFileInput, detectFileKind,
|
|
2
|
-
export { detectFileKind } from '@dragon708/docmind-shared';
|
|
3
|
-
import { analyzeDocx } from '@dragon708/docmind-docx';
|
|
4
|
-
export { convertDocxEmbeddedImageToWeb, convertDocxImagesForWeb, docxImageIsBrowserRenderable, docxImageRequiresConversion, docxImageSuggestedTargetFormat, docxImageToDataUri, extractImagesFromDocx, isNodeJsRuntime } from '@dragon708/docmind-docx';
|
|
5
|
-
import { preprocessHasEffect, resolveImageFormat, ocrTiff, ocrImageDetailed } from '@dragon708/docmind-ocr';
|
|
6
|
-
|
|
1
|
+
import { assertValidAnalyzeFileInput, detectFileKind, normalizeToStructuredResult, UNKNOWN_FORMAT_WARNING, analyzeText, notImplementedResult, analyzeFileRequestsStructured, toUint8Array, isNamedInput, isBinaryInput, isBlob, isFile, getMimeType } from '@dragon708/docmind-shared';
|
|
2
|
+
export { analyzeFileRequestsStructured, detectFileKind, isStructuredDocumentResult } from '@dragon708/docmind-shared';
|
|
3
|
+
import { extractStructuredDataFromDocx, analyzeDocx } from '@dragon708/docmind-docx';
|
|
4
|
+
export { convertDocxEmbeddedImageToWeb, convertDocxImagesForWeb, docxImageIsBrowserRenderable, docxImageRequiresConversion, docxImageSuggestedTargetFormat, docxImageToDataUri, extractImagesFromDocx, extractStructuredDataFromDocx, isNodeJsRuntime } from '@dragon708/docmind-docx';
|
|
5
|
+
import { extractStructuredDataFromImage, preprocessHasEffect, resolveImageFormat, ocrTiff, ocrImageDetailed } from '@dragon708/docmind-ocr';
|
|
6
|
+
export { extractStructuredDataFromImage } from '@dragon708/docmind-ocr';
|
|
7
|
+
import { extractStructuredDataFromPdf, extractPdfMetadata, extractTextFromPdf, analyzePdf, extractPdfTextByPage } from '@dragon708/docmind-pdf';
|
|
8
|
+
export { extractStructuredDataFromPdf } from '@dragon708/docmind-pdf';
|
|
7
9
|
import { readFile } from 'fs/promises';
|
|
8
10
|
import { basename } from 'path';
|
|
9
11
|
import { fileURLToPath } from 'url';
|
|
@@ -252,6 +254,72 @@ async function resolveNodeAnalyzeInput(input) {
|
|
|
252
254
|
return input;
|
|
253
255
|
}
|
|
254
256
|
|
|
257
|
+
// src/internal/abort.ts
|
|
258
|
+
function throwIfAborted(signal) {
|
|
259
|
+
if (signal?.aborted) {
|
|
260
|
+
const err = new Error("The operation was aborted");
|
|
261
|
+
err.name = "AbortError";
|
|
262
|
+
throw err;
|
|
263
|
+
}
|
|
264
|
+
}
|
|
265
|
+
|
|
266
|
+
// src/extractStructuredData.ts
|
|
267
|
+
var PLAIN_TEXT_STRUCTURED_NOTE = "Plain text: structured output wraps decoded UTF-8 in a unified envelope (no layout blocks).";
|
|
268
|
+
async function extractStructuredData(input, options) {
|
|
269
|
+
throwIfAborted(options?.signal);
|
|
270
|
+
const resolved = await resolveNodeAnalyzeInput(input);
|
|
271
|
+
assertValidAnalyzeFileInput(resolved);
|
|
272
|
+
const kind = detectFileKind(resolved);
|
|
273
|
+
const signal = options?.signal;
|
|
274
|
+
const normFallback = options?.normalize;
|
|
275
|
+
switch (kind) {
|
|
276
|
+
case "pdf": {
|
|
277
|
+
const data = await bytesFromDetectInput(resolved);
|
|
278
|
+
return extractStructuredDataFromPdf(data, {
|
|
279
|
+
...options?.pdf,
|
|
280
|
+
signal: options?.pdf?.signal ?? signal,
|
|
281
|
+
normalize: options?.pdf?.normalize ?? normFallback
|
|
282
|
+
});
|
|
283
|
+
}
|
|
284
|
+
case "docx": {
|
|
285
|
+
const data = await bytesFromDetectInput(resolved);
|
|
286
|
+
return extractStructuredDataFromDocx(data, {
|
|
287
|
+
...options?.docx,
|
|
288
|
+
signal: options?.docx?.signal ?? signal,
|
|
289
|
+
normalize: options?.docx?.normalize ?? normFallback
|
|
290
|
+
});
|
|
291
|
+
}
|
|
292
|
+
case "image": {
|
|
293
|
+
const data = await bytesFromDetectInput(resolved);
|
|
294
|
+
return extractStructuredDataFromImage(data, {
|
|
295
|
+
...options?.ocr,
|
|
296
|
+
signal: options?.ocr?.signal ?? signal,
|
|
297
|
+
normalize: options?.ocr?.normalize ?? normFallback
|
|
298
|
+
});
|
|
299
|
+
}
|
|
300
|
+
case "text": {
|
|
301
|
+
const r = await analyzeText(resolved, { signal });
|
|
302
|
+
return normalizeToStructuredResult(
|
|
303
|
+
{
|
|
304
|
+
kind: "text",
|
|
305
|
+
text: r.text,
|
|
306
|
+
warnings: [...r.warnings, PLAIN_TEXT_STRUCTURED_NOTE]
|
|
307
|
+
},
|
|
308
|
+
normFallback
|
|
309
|
+
);
|
|
310
|
+
}
|
|
311
|
+
default:
|
|
312
|
+
return normalizeToStructuredResult(
|
|
313
|
+
{
|
|
314
|
+
kind: "unknown",
|
|
315
|
+
text: "",
|
|
316
|
+
warnings: [UNKNOWN_FORMAT_WARNING]
|
|
317
|
+
},
|
|
318
|
+
normFallback
|
|
319
|
+
);
|
|
320
|
+
}
|
|
321
|
+
}
|
|
322
|
+
|
|
255
323
|
// src/analyze.ts
|
|
256
324
|
async function analyzeFile(input, options) {
|
|
257
325
|
if (options?.signal?.aborted) {
|
|
@@ -262,17 +330,39 @@ async function analyzeFile(input, options) {
|
|
|
262
330
|
const resolved = await resolveNodeAnalyzeInput(input);
|
|
263
331
|
assertValidAnalyzeFileInput(resolved);
|
|
264
332
|
const fileKind = detectFileKind(resolved);
|
|
333
|
+
let result;
|
|
265
334
|
switch (fileKind) {
|
|
266
335
|
case "pdf":
|
|
267
|
-
|
|
336
|
+
result = await analyzePdfForNode(resolved, options);
|
|
337
|
+
break;
|
|
268
338
|
case "docx":
|
|
269
|
-
|
|
339
|
+
result = await analyzeDocxForNode(resolved, options);
|
|
340
|
+
break;
|
|
270
341
|
case "image":
|
|
271
|
-
|
|
342
|
+
result = await analyzeImageForNode(resolved, options);
|
|
343
|
+
break;
|
|
272
344
|
case "text":
|
|
273
|
-
|
|
345
|
+
result = await analyzeText(resolved, { signal: options?.signal });
|
|
346
|
+
break;
|
|
274
347
|
default:
|
|
275
|
-
|
|
348
|
+
result = notImplementedResult(fileKind, "none", [UNKNOWN_FORMAT_WARNING]);
|
|
349
|
+
}
|
|
350
|
+
if (!analyzeFileRequestsStructured(options) || result.status !== "ok") {
|
|
351
|
+
return result;
|
|
352
|
+
}
|
|
353
|
+
try {
|
|
354
|
+
const structured = await extractStructuredData(resolved, {
|
|
355
|
+
signal: options?.signal,
|
|
356
|
+
pdf: options?.pdf,
|
|
357
|
+
docx: options?.docx,
|
|
358
|
+
ocr: options?.ocr
|
|
359
|
+
});
|
|
360
|
+
return { ...result, structured };
|
|
361
|
+
} catch (e) {
|
|
362
|
+
if (e instanceof Error && e.name === "AbortError") throw e;
|
|
363
|
+
const msg = e instanceof Error ? e.message : String(e);
|
|
364
|
+
const prev = "warnings" in result && Array.isArray(result.warnings) ? [...result.warnings] : [];
|
|
365
|
+
return { ...result, warnings: [...prev, `warning: analyzeFile structured merge failed: ${msg}`] };
|
|
276
366
|
}
|
|
277
367
|
}
|
|
278
368
|
|
|
@@ -287,15 +377,6 @@ function withPdfOcrDefaultOff(options) {
|
|
|
287
377
|
};
|
|
288
378
|
}
|
|
289
379
|
|
|
290
|
-
// src/internal/abort.ts
|
|
291
|
-
function throwIfAborted(signal) {
|
|
292
|
-
if (signal?.aborted) {
|
|
293
|
-
const err = new Error("The operation was aborted");
|
|
294
|
-
err.name = "AbortError";
|
|
295
|
-
throw err;
|
|
296
|
-
}
|
|
297
|
-
}
|
|
298
|
-
|
|
299
380
|
// src/publicActions.ts
|
|
300
381
|
var DOCX_METADATA_STUB = "Structured document metadata for DOCX is not returned by extractMetadata; use analyzeFile, extractText, or convertToHtml with options.docx.include for OOXML structure, headings, tables, blocks, approximate pages, and embedded images.";
|
|
301
382
|
var IMAGE_METADATA_NOTE = "Raster images have no document metadata bundle in this API.";
|
|
@@ -595,7 +676,10 @@ function buildNodeCapabilityReport(kind) {
|
|
|
595
676
|
slot("ocr", true, [
|
|
596
677
|
"Raster OCR pipeline (pdf.ocr auto with quality heuristics, force, or runOcr)."
|
|
597
678
|
]),
|
|
598
|
-
slot("html", true, ["HTML is a <pre> preview of extracted text, not visual layout."])
|
|
679
|
+
slot("html", true, ["HTML is a <pre> preview of extracted text, not visual layout."]),
|
|
680
|
+
slot("structured-output", true, [
|
|
681
|
+
"extractStructuredData: analyzePdf + PDF.js per-page text, outline, links, annotations, merged via normalizeToStructuredResult; respects pdf.ocr like analyzeFile."
|
|
682
|
+
])
|
|
599
683
|
];
|
|
600
684
|
break;
|
|
601
685
|
case "docx":
|
|
@@ -612,6 +696,9 @@ function buildNodeCapabilityReport(kind) {
|
|
|
612
696
|
slot("ocr", false, ["OCR does not apply to DOCX in DocMind."]),
|
|
613
697
|
slot("pages", false, [
|
|
614
698
|
"No PDF-style page count; approximate DOCX pages via options.docx.include.pagesApprox (OOXML hints, not print layout)."
|
|
699
|
+
]),
|
|
700
|
+
slot("structured-output", true, [
|
|
701
|
+
"extractStructuredData runs analyzeDocx with merged OOXML includes (blocks, tables, headings, embedded images, etc.) into StructuredDocumentResult."
|
|
615
702
|
])
|
|
616
703
|
];
|
|
617
704
|
break;
|
|
@@ -643,6 +730,9 @@ function buildNodeCapabilityReport(kind) {
|
|
|
643
730
|
]),
|
|
644
731
|
slot("pages", true, [
|
|
645
732
|
"TIFF: `pages` and `textByPage` mirror frames processed. Other formats may expose `pages` when the normalizer reports it (e.g. GIF)."
|
|
733
|
+
]),
|
|
734
|
+
slot("structured-output", true, [
|
|
735
|
+
"extractStructuredData: OCR + layout blocks (ocrImageDetailed / ocrTiff) normalized to StructuredDocumentResult."
|
|
646
736
|
])
|
|
647
737
|
];
|
|
648
738
|
break;
|
|
@@ -652,7 +742,10 @@ function buildNodeCapabilityReport(kind) {
|
|
|
652
742
|
slot("metadata", true, [TEXT_META_NOTE]),
|
|
653
743
|
slot("html", true),
|
|
654
744
|
slot("ocr", false, ["OCR does not apply to plain text files."]),
|
|
655
|
-
slot("pages", false)
|
|
745
|
+
slot("pages", false),
|
|
746
|
+
slot("structured-output", true, [
|
|
747
|
+
"extractStructuredData wraps UTF-8 decode in normalizeToStructuredResult (rollup text only unless you add blocks upstream)."
|
|
748
|
+
])
|
|
656
749
|
];
|
|
657
750
|
break;
|
|
658
751
|
default:
|
|
@@ -662,7 +755,8 @@ function buildNodeCapabilityReport(kind) {
|
|
|
662
755
|
slot("metadata", false),
|
|
663
756
|
slot("html", false),
|
|
664
757
|
slot("ocr", false),
|
|
665
|
-
slot("pages", false)
|
|
758
|
+
slot("pages", false),
|
|
759
|
+
slot("structured-output", false)
|
|
666
760
|
];
|
|
667
761
|
}
|
|
668
762
|
return {
|
|
@@ -872,6 +966,40 @@ function buildNodeExplainReport(kind, intent, pdfOcr, plan, docxInclude, ocrSlic
|
|
|
872
966
|
ocr = { mayUse: false, description: "OCR does not apply to text files." };
|
|
873
967
|
}
|
|
874
968
|
break;
|
|
969
|
+
case "extractStructuredData":
|
|
970
|
+
if (kind === "pdf") {
|
|
971
|
+
nativeExtraction = {
|
|
972
|
+
willAttempt: true,
|
|
973
|
+
description: "extractStructuredData: analyzePdf plus PDF.js page rows, outline, links, and annotations, normalized to StructuredDocumentResult."
|
|
974
|
+
};
|
|
975
|
+
ocr = {
|
|
976
|
+
mayUse: pdfOcr !== "off",
|
|
977
|
+
description: pdfOcr === "off" ? "Raster OCR is off (pdf.ocr: off); structured text uses native extraction only." : pdfOcr === "force" ? "Raster OCR may run on all pages (pdf.ocr: force)." : "Raster OCR may run when heuristics suggest weak native text (pdf.ocr: auto)."
|
|
978
|
+
};
|
|
979
|
+
} else if (kind === "docx") {
|
|
980
|
+
nativeExtraction = {
|
|
981
|
+
willAttempt: true,
|
|
982
|
+
description: "extractStructuredData: Mammoth plus merged OOXML extractors (structure, headings, tables, blocks, pagesApprox, embeddedImages) in one envelope."
|
|
983
|
+
};
|
|
984
|
+
ocr = { mayUse: false, description: "DOCX does not use OCR." };
|
|
985
|
+
} else if (kind === "image") {
|
|
986
|
+
nativeExtraction = {
|
|
987
|
+
willAttempt: false,
|
|
988
|
+
description: NODE_IMAGE_OCR_PIPELINE
|
|
989
|
+
};
|
|
990
|
+
ocr = {
|
|
991
|
+
mayUse: true,
|
|
992
|
+
description: "OCR + layout blocks: same pipeline as package extractStructuredDataFromImage (normalize \u2192 optional preprocess \u2192 Tesseract; TIFF multipage via ocrTiff)."
|
|
993
|
+
};
|
|
994
|
+
} else {
|
|
995
|
+
nativeExtraction = {
|
|
996
|
+
willAttempt: true,
|
|
997
|
+
description: "UTF-8 decode with BOM handling; normalizeToStructuredResult produces the structured envelope."
|
|
998
|
+
};
|
|
999
|
+
ocr = { mayUse: false, description: "OCR does not apply to text files." };
|
|
1000
|
+
limitations = lim("Plain text has no native layout blocks; `text` carries the decoded content.");
|
|
1001
|
+
}
|
|
1002
|
+
break;
|
|
875
1003
|
default:
|
|
876
1004
|
nativeExtraction = { willAttempt: false, description: "Generic intent; see plan." };
|
|
877
1005
|
ocr = { mayUse: false, description: "See plan steps." };
|
|
@@ -957,9 +1085,16 @@ function planAnalyzeFile(kind, pdfOcr, docxInclude, ocr) {
|
|
|
957
1085
|
};
|
|
958
1086
|
}
|
|
959
1087
|
}
|
|
960
|
-
function planForIntent(intentOpt, kind, pdfOcrForAnalyze, docxInclude, ocr) {
|
|
1088
|
+
function planForIntent(intentOpt, kind, pdfOcrForAnalyze, docxInclude, ocr, analyzeFileOutput) {
|
|
961
1089
|
const intent = intentOpt ?? "analyzeFile";
|
|
962
|
-
if (intent === "analyzeFile")
|
|
1090
|
+
if (intent === "analyzeFile") {
|
|
1091
|
+
const base = planAnalyzeFile(kind, pdfOcrForAnalyze, docxInclude, ocr);
|
|
1092
|
+
if (!analyzeFileRequestsStructured(analyzeFileOutput)) return base;
|
|
1093
|
+
return {
|
|
1094
|
+
...base,
|
|
1095
|
+
steps: [...base.steps ?? [], { id: "structured_merge", status: "planned" }]
|
|
1096
|
+
};
|
|
1097
|
+
}
|
|
963
1098
|
if (intent === "extractText") {
|
|
964
1099
|
if (kind === "pdf") {
|
|
965
1100
|
return {
|
|
@@ -1078,6 +1213,59 @@ function planForIntent(intentOpt, kind, pdfOcrForAnalyze, docxInclude, ocr) {
|
|
|
1078
1213
|
]
|
|
1079
1214
|
};
|
|
1080
1215
|
}
|
|
1216
|
+
if (intent === "extractStructuredData") {
|
|
1217
|
+
switch (kind) {
|
|
1218
|
+
case "pdf":
|
|
1219
|
+
return {
|
|
1220
|
+
intent: "extractStructuredData",
|
|
1221
|
+
steps: [
|
|
1222
|
+
{ id: "detect_kind", status: "done" },
|
|
1223
|
+
{ id: "pdf_analyze", status: "planned" },
|
|
1224
|
+
{ id: "pdf_structure_extract", status: "planned" },
|
|
1225
|
+
{
|
|
1226
|
+
id: "pdf_ocr",
|
|
1227
|
+
status: pdfOcrForAnalyze === "off" ? "skipped" : "planned"
|
|
1228
|
+
},
|
|
1229
|
+
{ id: "structured_normalize", status: "planned" }
|
|
1230
|
+
]
|
|
1231
|
+
};
|
|
1232
|
+
case "docx":
|
|
1233
|
+
return {
|
|
1234
|
+
intent: "extractStructuredData",
|
|
1235
|
+
steps: [
|
|
1236
|
+
{ id: "detect_kind", status: "done" },
|
|
1237
|
+
{ id: "docx_mammoth", status: "planned" },
|
|
1238
|
+
{ id: "docx_ooxml_parallel", status: "planned" },
|
|
1239
|
+
{ id: "structured_normalize", status: "planned" }
|
|
1240
|
+
]
|
|
1241
|
+
};
|
|
1242
|
+
case "image":
|
|
1243
|
+
return {
|
|
1244
|
+
intent: "extractStructuredData",
|
|
1245
|
+
steps: [
|
|
1246
|
+
...imageOcrPlanSteps(ocr),
|
|
1247
|
+
{ id: "structured_normalize", status: "planned" }
|
|
1248
|
+
]
|
|
1249
|
+
};
|
|
1250
|
+
case "text":
|
|
1251
|
+
return {
|
|
1252
|
+
intent: "extractStructuredData",
|
|
1253
|
+
steps: [
|
|
1254
|
+
{ id: "detect_kind", status: "done" },
|
|
1255
|
+
{ id: "utf8_decode", status: "planned" },
|
|
1256
|
+
{ id: "structured_normalize", status: "planned" }
|
|
1257
|
+
]
|
|
1258
|
+
};
|
|
1259
|
+
default:
|
|
1260
|
+
return {
|
|
1261
|
+
intent: "extractStructuredData",
|
|
1262
|
+
steps: [
|
|
1263
|
+
{ id: "detect_kind", status: "done" },
|
|
1264
|
+
{ id: "route", status: "failed" }
|
|
1265
|
+
]
|
|
1266
|
+
};
|
|
1267
|
+
}
|
|
1268
|
+
}
|
|
1081
1269
|
return planAnalyzeFile(kind, pdfOcrForAnalyze, docxInclude, ocr);
|
|
1082
1270
|
}
|
|
1083
1271
|
async function getCapabilities(input, options) {
|
|
@@ -1096,10 +1284,13 @@ async function explainAnalysisPlan(input, options) {
|
|
|
1096
1284
|
const pdfOcrAnalyze = resolvePdfOcrMode(options?.pdf);
|
|
1097
1285
|
const docxInc = options?.docx?.include;
|
|
1098
1286
|
const ocrSlice = options?.ocr;
|
|
1099
|
-
const plan = planForIntent(intent, kind, pdfOcrAnalyze, docxInc, ocrSlice
|
|
1287
|
+
const plan = planForIntent(intent, kind, pdfOcrAnalyze, docxInc, ocrSlice, {
|
|
1288
|
+
structuredOutput: options?.structuredOutput,
|
|
1289
|
+
output: options?.output
|
|
1290
|
+
});
|
|
1100
1291
|
return buildNodeExplainReport(kind, intent, pdfOcrAnalyze, plan, docxInc, ocrSlice);
|
|
1101
1292
|
}
|
|
1102
1293
|
|
|
1103
|
-
export { DOCX_EMBEDDED_IMAGE_CAPABILITIES, DOCX_STRUCTURE_CAPABILITIES, analyzeFile, bufferToInput, convertToHtml, docxIncludeRequested, explainAnalysisPlan, extractMetadata, extractText, getCapabilities, readFileToInput, resolveNodeAnalyzeInput, runOcr };
|
|
1294
|
+
export { DOCX_EMBEDDED_IMAGE_CAPABILITIES, DOCX_STRUCTURE_CAPABILITIES, analyzeFile, bufferToInput, convertToHtml, docxIncludeRequested, explainAnalysisPlan, extractMetadata, extractStructuredData, extractText, getCapabilities, readFileToInput, resolveNodeAnalyzeInput, runOcr };
|
|
1104
1295
|
//# sourceMappingURL=index.js.map
|
|
1105
1296
|
//# sourceMappingURL=index.js.map
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@dragon708/docmind-node",
|
|
3
|
-
"version": "1.
|
|
3
|
+
"version": "1.10.0",
|
|
4
4
|
"description": "Official DocMind Node facade: analyzeFile, intent APIs, PDF/DOCX/OCR, and fs helpers.",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"main": "./dist/index.js",
|
|
@@ -32,10 +32,10 @@
|
|
|
32
32
|
],
|
|
33
33
|
"license": "MIT",
|
|
34
34
|
"dependencies": {
|
|
35
|
-
"@dragon708/docmind-docx": "^1.
|
|
36
|
-
"@dragon708/docmind-ocr": "^1.1.
|
|
37
|
-
"@dragon708/docmind-pdf": "^2.
|
|
38
|
-
"@dragon708/docmind-shared": "^1.
|
|
35
|
+
"@dragon708/docmind-docx": "^1.8.0",
|
|
36
|
+
"@dragon708/docmind-ocr": "^1.1.4",
|
|
37
|
+
"@dragon708/docmind-pdf": "^2.2.0",
|
|
38
|
+
"@dragon708/docmind-shared": "^1.2.0"
|
|
39
39
|
},
|
|
40
40
|
"devDependencies": {
|
|
41
41
|
"@types/node": "^20.19.37",
|