@dragon708/docmind-node 1.9.1 → 1.10.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.d.ts CHANGED
@@ -1,9 +1,11 @@
1
- import { DocMindAnalyzeOptions, DetectFileKindInput, NamedInput, AnalysisResult, FileKind, RuntimeDescriptor, DocMindPublicIntent, AnalysisAnalyzer, ProcessingPlanDescriptor, ExplainAnalysisPlanOptions, GetCapabilitiesOptions } from '@dragon708/docmind-shared';
2
- export { AnalysisAnalyzer, AnalysisResult, CapabilityDescriptor, DetectFileKindInput, DocMindPublicIntent, DocxAnalysisCoreResult, ExplainAnalysisPlanOptions, ExplainAnalysisPlanResult, FileKind, FileKindMetadata, GenericAnalysisResult, GetCapabilitiesOptions, GetCapabilitiesResult, ImageAnalysisCoreResult, PdfAnalysisCoreResult, TextAnalysisResult, detectFileKind } from '@dragon708/docmind-shared';
3
- import { OcrOptions, OcrTiffOptions, PreprocessImageOptions } from '@dragon708/docmind-ocr';
4
- import { PdfAnalyzeOptions } from '@dragon708/docmind-pdf';
5
- import { AnalyzeDocxIncludeFlags, DocxToHtmlOptions } from '@dragon708/docmind-docx';
6
- export { AnalyzeDocxIncludeFlags, DocxEmbeddedImage, DocxEmbeddedImageConversionOptions, DocxEmbeddedImageConversionResult, DocxEmbeddedImageWebSlice, DocxImageExtractionMode, ExtractImagesFromDocxOptions, ExtractImagesFromDocxResult, convertDocxEmbeddedImageToWeb, convertDocxImagesForWeb, docxImageIsBrowserRenderable, docxImageRequiresConversion, docxImageSuggestedTargetFormat, docxImageToDataUri, extractImagesFromDocx, isNodeJsRuntime } from '@dragon708/docmind-docx';
1
+ import { DocMindAnalyzeOptions, AnalyzeFileOutputOptions, NormalizeStructuredOptions, DetectFileKindInput, NamedInput, AnalysisResult, StructuredDocumentResult, FileKind, RuntimeDescriptor, DocMindPublicIntent, AnalysisAnalyzer, ProcessingPlanDescriptor, ExplainAnalysisPlanOptions, GetCapabilitiesOptions } from '@dragon708/docmind-shared';
2
+ export { AnalysisAnalyzer, AnalysisResult, CapabilityDescriptor, DetectFileKindInput, DocMindPublicIntent, DocumentBlock, DocumentImageRef, DocumentPage, DocumentTable, DocxAnalysisCoreResult, ExplainAnalysisPlanOptions, ExplainAnalysisPlanResult, FileKind, FileKindMetadata, GenericAnalysisResult, GetCapabilitiesOptions, GetCapabilitiesResult, ImageAnalysisCoreResult, PdfAnalysisCoreResult, StructuredDocumentResult, TextAnalysisResult, analyzeFileRequestsStructured, detectFileKind, isStructuredDocumentResult } from '@dragon708/docmind-shared';
3
+ import { OcrOptions, OcrTiffOptions, PreprocessImageOptions, ExtractStructuredDataFromImageOptions } from '@dragon708/docmind-ocr';
4
+ export { ExtractStructuredDataFromImageOptions, extractStructuredDataFromImage } from '@dragon708/docmind-ocr';
5
+ import { AnalyzeDocxIncludeFlags, DocxToHtmlOptions, ExtractStructuredDataFromDocxOptions } from '@dragon708/docmind-docx';
6
+ export { AnalyzeDocxIncludeFlags, DocxEmbeddedImage, DocxEmbeddedImageConversionOptions, DocxEmbeddedImageConversionResult, DocxEmbeddedImageWebSlice, DocxImageExtractionMode, ExtractImagesFromDocxOptions, ExtractImagesFromDocxResult, ExtractStructuredDataFromDocxOptions, convertDocxEmbeddedImageToWeb, convertDocxImagesForWeb, docxImageIsBrowserRenderable, docxImageRequiresConversion, docxImageSuggestedTargetFormat, docxImageToDataUri, extractImagesFromDocx, extractStructuredDataFromDocx, isNodeJsRuntime } from '@dragon708/docmind-docx';
7
+ import { PdfAnalyzeOptions, ExtractStructuredDataFromPdfOptions } from '@dragon708/docmind-pdf';
8
+ export { ExtractStructuredDataFromPdfIncludeFlags, ExtractStructuredDataFromPdfOptions, extractStructuredDataFromPdf } from '@dragon708/docmind-pdf';
7
9
 
8
10
  /**
9
11
  * Options for Node public APIs (`analyzeFile`, intent methods).
@@ -24,7 +26,7 @@ interface NodeAnalyzeDocxOptionsSlice {
24
26
  /** Opciones Mammoth para HTML (p. ej. `convertImage`). */
25
27
  readonly html?: DocxToHtmlOptions;
26
28
  }
27
- interface NodeAnalyzeOptions extends DocMindAnalyzeOptions {
29
+ interface NodeAnalyzeOptions extends DocMindAnalyzeOptions, AnalyzeFileOutputOptions {
28
30
  readonly pdf?: PdfAnalyzeOptions;
29
31
  readonly ocr?: OcrOptions & Pick<OcrTiffOptions, "maxPages" | "pageSeparator"> & {
30
32
  readonly preprocess?: PreprocessImageOptions;
@@ -38,6 +40,16 @@ interface NodeAnalyzeOptions extends DocMindAnalyzeOptions {
38
40
  */
39
41
  readonly pdfNativeTextSource?: "pdf-parse" | "pdfjs-per-page";
40
42
  }
43
+ /**
44
+ * Opciones para {@link extractStructuredData}: reenvío por rama (`pdf` / `docx` / `ocr`) más
45
+ * `normalize` opcional aplicado a texto plano o como respaldo cuando la rama no define `normalize`.
46
+ */
47
+ interface NodeExtractStructuredDataOptions extends DocMindAnalyzeOptions {
48
+ readonly pdf?: ExtractStructuredDataFromPdfOptions;
49
+ readonly docx?: ExtractStructuredDataFromDocxOptions;
50
+ readonly ocr?: ExtractStructuredDataFromImageOptions;
51
+ readonly normalize?: NormalizeStructuredOptions;
52
+ }
41
53
 
42
54
  /**
43
55
  * Inputs accepted by {@link analyzeFile} in this package.
@@ -85,8 +97,15 @@ declare function convertToHtml(input: NodeAnalyzeInput, options?: NodeAnalyzeOpt
85
97
  */
86
98
  declare function runOcr(input: NodeAnalyzeInput, options?: NodeAnalyzeOptions): Promise<AnalysisResult>;
87
99
 
100
+ /**
101
+ * Resuelve entrada Node, clasifica el archivo y devuelve un {@link StructuredDocumentResult} vía
102
+ * el extractor estructurado del paquete correspondiente (PDF, DOCX, imagen OCR) o
103
+ * {@link normalizeToStructuredResult} para texto UTF-8.
104
+ */
105
+ declare function extractStructuredData(input: NodeAnalyzeInput, options?: NodeExtractStructuredDataOptions): Promise<StructuredDocumentResult>;
106
+
88
107
  /** High-level features the user can ask DocMind for (per input kind and runtime). */
89
- type PublicCapabilityId = "text" | "metadata" | "html" | "ocr" | "pages" | "ocr-multipage" | "image-normalization" | "tiff" | "heic-node-only" | "bmp" | "gif-first-frame";
108
+ type PublicCapabilityId = "text" | "metadata" | "html" | "ocr" | "pages" | "structured-output" | "ocr-multipage" | "image-normalization" | "tiff" | "heic-node-only" | "bmp" | "gif-first-frame";
90
109
  /**
91
110
  * DOCX-only: what the stack can do with OOXML embedded bitmaps/vector payloads under `word/media`.
92
111
  * Present on {@link GetCapabilitiesReport} when `kind === "docx"`.
@@ -160,7 +179,8 @@ interface ExplainAnalysisPlanReport {
160
179
  readonly kind: FileKind;
161
180
  readonly detectedKind: FileKind;
162
181
  readonly runtime: RuntimeDescriptor;
163
- readonly intent: DocMindPublicIntent | (string & {});
182
+ /** Incluye intents extendidos en Node (p. ej. `extractStructuredData`). */
183
+ readonly intent: DocMindPublicIntent | string;
164
184
  readonly primaryAnalyzer: AnalysisAnalyzer;
165
185
  readonly nativeExtraction: NativeExtractionPlan;
166
186
  readonly ocr: OcrPlan;
@@ -174,11 +194,11 @@ interface ExplainAnalysisPlanReport {
174
194
  }
175
195
 
176
196
  /** Options for {@link explainAnalysisPlan} including PDF/OCR/DOCX hints for accurate planning. */
177
- type NodeExplainAnalysisPlanOptions = ExplainAnalysisPlanOptions & Pick<NodeAnalyzeOptions, "pdf" | "ocr" | "docx">;
197
+ type NodeExplainAnalysisPlanOptions = ExplainAnalysisPlanOptions & Pick<NodeAnalyzeOptions, "pdf" | "ocr" | "docx" | "structuredOutput" | "output">;
178
198
 
179
199
  /**
180
200
  * Epic 1 — **Capabilities:** after resolving {@link NodeAnalyzeInput}, lists which of
181
- * `text` | `metadata` | `html` | `ocr` | `pages` (and image-specific ids such as `ocr-multipage`, `image-normalization`, `tiff`, `heic-node-only`, `bmp`, `gif-first-frame`) apply for that kind in Node (for PDF, `text` / `metadata` /
201
+ * `text` | `metadata` | `html` | `ocr` | `pages` | `structured-output` (and image-specific ids such as `ocr-multipage`, `image-normalization`, `tiff`, `heic-node-only`, `bmp`, `gif-first-frame`) apply for that kind in Node (for PDF, `text` / `metadata` /
182
202
  * `pages` / `ocr` describe the v2 pdf-parse + PDF.js + OCR stack; see {@link buildNodeCapabilityReport}).
183
203
  * For **DOCX**, `docxEmbeddedImages` and `docxStructure` describe ZIP media and optional OOXML v2 extractors (`options.docx.include`).
184
204
  * Does not run Mammoth/Tesseract/PDF bodies beyond path resolution.
@@ -190,4 +210,4 @@ declare function getCapabilities(input: NodeAnalyzeInput, options?: GetCapabilit
190
210
  */
191
211
  declare function explainAnalysisPlan(input: NodeAnalyzeInput, options?: NodeExplainAnalysisPlanOptions): Promise<ExplainAnalysisPlanReport>;
192
212
 
193
- export { DOCX_EMBEDDED_IMAGE_CAPABILITIES, DOCX_STRUCTURE_CAPABILITIES, type DocxEmbeddedImageCapabilities, type DocxStructuralCapabilities, type ExplainAnalysisPlanReport, type GetCapabilitiesReport, type NativeExtractionPlan, type NodeAnalyzeDocxOptionsSlice, type NodeAnalyzeInput, type NodeAnalyzeOptions, type NodeExplainAnalysisPlanOptions, type OcrPlan, type PublicCapabilityId, type PublicCapabilitySupport, analyzeFile, bufferToInput, convertToHtml, docxIncludeRequested, explainAnalysisPlan, extractMetadata, extractText, getCapabilities, readFileToInput, resolveNodeAnalyzeInput, runOcr };
213
+ export { DOCX_EMBEDDED_IMAGE_CAPABILITIES, DOCX_STRUCTURE_CAPABILITIES, type DocxEmbeddedImageCapabilities, type DocxStructuralCapabilities, type ExplainAnalysisPlanReport, type GetCapabilitiesReport, type NativeExtractionPlan, type NodeAnalyzeDocxOptionsSlice, type NodeAnalyzeInput, type NodeAnalyzeOptions, type NodeExplainAnalysisPlanOptions, type NodeExtractStructuredDataOptions, type OcrPlan, type PublicCapabilityId, type PublicCapabilitySupport, analyzeFile, bufferToInput, convertToHtml, docxIncludeRequested, explainAnalysisPlan, extractMetadata, extractStructuredData, extractText, getCapabilities, readFileToInput, resolveNodeAnalyzeInput, runOcr };
package/dist/index.js CHANGED
@@ -1,9 +1,11 @@
1
- import { assertValidAnalyzeFileInput, detectFileKind, notImplementedResult, UNKNOWN_FORMAT_WARNING, analyzeText, toUint8Array, isNamedInput, isBinaryInput, isBlob, isFile, getMimeType } from '@dragon708/docmind-shared';
2
- export { detectFileKind } from '@dragon708/docmind-shared';
3
- import { analyzeDocx } from '@dragon708/docmind-docx';
4
- export { convertDocxEmbeddedImageToWeb, convertDocxImagesForWeb, docxImageIsBrowserRenderable, docxImageRequiresConversion, docxImageSuggestedTargetFormat, docxImageToDataUri, extractImagesFromDocx, isNodeJsRuntime } from '@dragon708/docmind-docx';
5
- import { preprocessHasEffect, resolveImageFormat, ocrTiff, ocrImageDetailed } from '@dragon708/docmind-ocr';
6
- import { extractPdfMetadata, extractTextFromPdf, analyzePdf, extractPdfTextByPage } from '@dragon708/docmind-pdf';
1
+ import { assertValidAnalyzeFileInput, detectFileKind, normalizeToStructuredResult, UNKNOWN_FORMAT_WARNING, analyzeText, notImplementedResult, analyzeFileRequestsStructured, toUint8Array, isNamedInput, isBinaryInput, isBlob, isFile, getMimeType } from '@dragon708/docmind-shared';
2
+ export { analyzeFileRequestsStructured, detectFileKind, isStructuredDocumentResult } from '@dragon708/docmind-shared';
3
+ import { extractStructuredDataFromDocx, analyzeDocx } from '@dragon708/docmind-docx';
4
+ export { convertDocxEmbeddedImageToWeb, convertDocxImagesForWeb, docxImageIsBrowserRenderable, docxImageRequiresConversion, docxImageSuggestedTargetFormat, docxImageToDataUri, extractImagesFromDocx, extractStructuredDataFromDocx, isNodeJsRuntime } from '@dragon708/docmind-docx';
5
+ import { extractStructuredDataFromImage, preprocessHasEffect, resolveImageFormat, ocrTiff, ocrImageDetailed } from '@dragon708/docmind-ocr';
6
+ export { extractStructuredDataFromImage } from '@dragon708/docmind-ocr';
7
+ import { extractStructuredDataFromPdf, extractPdfMetadata, extractTextFromPdf, analyzePdf, extractPdfTextByPage } from '@dragon708/docmind-pdf';
8
+ export { extractStructuredDataFromPdf } from '@dragon708/docmind-pdf';
7
9
  import { readFile } from 'fs/promises';
8
10
  import { basename } from 'path';
9
11
  import { fileURLToPath } from 'url';
@@ -252,6 +254,72 @@ async function resolveNodeAnalyzeInput(input) {
252
254
  return input;
253
255
  }
254
256
 
257
+ // src/internal/abort.ts
258
+ function throwIfAborted(signal) {
259
+ if (signal?.aborted) {
260
+ const err = new Error("The operation was aborted");
261
+ err.name = "AbortError";
262
+ throw err;
263
+ }
264
+ }
265
+
266
+ // src/extractStructuredData.ts
267
+ var PLAIN_TEXT_STRUCTURED_NOTE = "Plain text: structured output wraps decoded UTF-8 in a unified envelope (no layout blocks).";
268
+ async function extractStructuredData(input, options) {
269
+ throwIfAborted(options?.signal);
270
+ const resolved = await resolveNodeAnalyzeInput(input);
271
+ assertValidAnalyzeFileInput(resolved);
272
+ const kind = detectFileKind(resolved);
273
+ const signal = options?.signal;
274
+ const normFallback = options?.normalize;
275
+ switch (kind) {
276
+ case "pdf": {
277
+ const data = await bytesFromDetectInput(resolved);
278
+ return extractStructuredDataFromPdf(data, {
279
+ ...options?.pdf,
280
+ signal: options?.pdf?.signal ?? signal,
281
+ normalize: options?.pdf?.normalize ?? normFallback
282
+ });
283
+ }
284
+ case "docx": {
285
+ const data = await bytesFromDetectInput(resolved);
286
+ return extractStructuredDataFromDocx(data, {
287
+ ...options?.docx,
288
+ signal: options?.docx?.signal ?? signal,
289
+ normalize: options?.docx?.normalize ?? normFallback
290
+ });
291
+ }
292
+ case "image": {
293
+ const data = await bytesFromDetectInput(resolved);
294
+ return extractStructuredDataFromImage(data, {
295
+ ...options?.ocr,
296
+ signal: options?.ocr?.signal ?? signal,
297
+ normalize: options?.ocr?.normalize ?? normFallback
298
+ });
299
+ }
300
+ case "text": {
301
+ const r = await analyzeText(resolved, { signal });
302
+ return normalizeToStructuredResult(
303
+ {
304
+ kind: "text",
305
+ text: r.text,
306
+ warnings: [...r.warnings, PLAIN_TEXT_STRUCTURED_NOTE]
307
+ },
308
+ normFallback
309
+ );
310
+ }
311
+ default:
312
+ return normalizeToStructuredResult(
313
+ {
314
+ kind: "unknown",
315
+ text: "",
316
+ warnings: [UNKNOWN_FORMAT_WARNING]
317
+ },
318
+ normFallback
319
+ );
320
+ }
321
+ }
322
+
255
323
  // src/analyze.ts
256
324
  async function analyzeFile(input, options) {
257
325
  if (options?.signal?.aborted) {
@@ -262,17 +330,39 @@ async function analyzeFile(input, options) {
262
330
  const resolved = await resolveNodeAnalyzeInput(input);
263
331
  assertValidAnalyzeFileInput(resolved);
264
332
  const fileKind = detectFileKind(resolved);
333
+ let result;
265
334
  switch (fileKind) {
266
335
  case "pdf":
267
- return analyzePdfForNode(resolved, options);
336
+ result = await analyzePdfForNode(resolved, options);
337
+ break;
268
338
  case "docx":
269
- return analyzeDocxForNode(resolved, options);
339
+ result = await analyzeDocxForNode(resolved, options);
340
+ break;
270
341
  case "image":
271
- return analyzeImageForNode(resolved, options);
342
+ result = await analyzeImageForNode(resolved, options);
343
+ break;
272
344
  case "text":
273
- return analyzeText(resolved, { signal: options?.signal });
345
+ result = await analyzeText(resolved, { signal: options?.signal });
346
+ break;
274
347
  default:
275
- return notImplementedResult(fileKind, "none", [UNKNOWN_FORMAT_WARNING]);
348
+ result = notImplementedResult(fileKind, "none", [UNKNOWN_FORMAT_WARNING]);
349
+ }
350
+ if (!analyzeFileRequestsStructured(options) || result.status !== "ok") {
351
+ return result;
352
+ }
353
+ try {
354
+ const structured = await extractStructuredData(resolved, {
355
+ signal: options?.signal,
356
+ pdf: options?.pdf,
357
+ docx: options?.docx,
358
+ ocr: options?.ocr
359
+ });
360
+ return { ...result, structured };
361
+ } catch (e) {
362
+ if (e instanceof Error && e.name === "AbortError") throw e;
363
+ const msg = e instanceof Error ? e.message : String(e);
364
+ const prev = "warnings" in result && Array.isArray(result.warnings) ? [...result.warnings] : [];
365
+ return { ...result, warnings: [...prev, `warning: analyzeFile structured merge failed: ${msg}`] };
276
366
  }
277
367
  }
278
368
 
@@ -287,15 +377,6 @@ function withPdfOcrDefaultOff(options) {
287
377
  };
288
378
  }
289
379
 
290
- // src/internal/abort.ts
291
- function throwIfAborted(signal) {
292
- if (signal?.aborted) {
293
- const err = new Error("The operation was aborted");
294
- err.name = "AbortError";
295
- throw err;
296
- }
297
- }
298
-
299
380
  // src/publicActions.ts
300
381
  var DOCX_METADATA_STUB = "Structured document metadata for DOCX is not returned by extractMetadata; use analyzeFile, extractText, or convertToHtml with options.docx.include for OOXML structure, headings, tables, blocks, approximate pages, and embedded images.";
301
382
  var IMAGE_METADATA_NOTE = "Raster images have no document metadata bundle in this API.";
@@ -595,7 +676,10 @@ function buildNodeCapabilityReport(kind) {
595
676
  slot("ocr", true, [
596
677
  "Raster OCR pipeline (pdf.ocr auto with quality heuristics, force, or runOcr)."
597
678
  ]),
598
- slot("html", true, ["HTML is a <pre> preview of extracted text, not visual layout."])
679
+ slot("html", true, ["HTML is a <pre> preview of extracted text, not visual layout."]),
680
+ slot("structured-output", true, [
681
+ "extractStructuredData: analyzePdf + PDF.js per-page text, outline, links, annotations, merged via normalizeToStructuredResult; respects pdf.ocr like analyzeFile."
682
+ ])
599
683
  ];
600
684
  break;
601
685
  case "docx":
@@ -612,6 +696,9 @@ function buildNodeCapabilityReport(kind) {
612
696
  slot("ocr", false, ["OCR does not apply to DOCX in DocMind."]),
613
697
  slot("pages", false, [
614
698
  "No PDF-style page count; approximate DOCX pages via options.docx.include.pagesApprox (OOXML hints, not print layout)."
699
+ ]),
700
+ slot("structured-output", true, [
701
+ "extractStructuredData runs analyzeDocx with merged OOXML includes (blocks, tables, headings, embedded images, etc.) into StructuredDocumentResult."
615
702
  ])
616
703
  ];
617
704
  break;
@@ -643,6 +730,9 @@ function buildNodeCapabilityReport(kind) {
643
730
  ]),
644
731
  slot("pages", true, [
645
732
  "TIFF: `pages` and `textByPage` mirror frames processed. Other formats may expose `pages` when the normalizer reports it (e.g. GIF)."
733
+ ]),
734
+ slot("structured-output", true, [
735
+ "extractStructuredData: OCR + layout blocks (ocrImageDetailed / ocrTiff) normalized to StructuredDocumentResult."
646
736
  ])
647
737
  ];
648
738
  break;
@@ -652,7 +742,10 @@ function buildNodeCapabilityReport(kind) {
652
742
  slot("metadata", true, [TEXT_META_NOTE]),
653
743
  slot("html", true),
654
744
  slot("ocr", false, ["OCR does not apply to plain text files."]),
655
- slot("pages", false)
745
+ slot("pages", false),
746
+ slot("structured-output", true, [
747
+ "extractStructuredData wraps UTF-8 decode in normalizeToStructuredResult (rollup text only unless you add blocks upstream)."
748
+ ])
656
749
  ];
657
750
  break;
658
751
  default:
@@ -662,7 +755,8 @@ function buildNodeCapabilityReport(kind) {
662
755
  slot("metadata", false),
663
756
  slot("html", false),
664
757
  slot("ocr", false),
665
- slot("pages", false)
758
+ slot("pages", false),
759
+ slot("structured-output", false)
666
760
  ];
667
761
  }
668
762
  return {
@@ -872,6 +966,40 @@ function buildNodeExplainReport(kind, intent, pdfOcr, plan, docxInclude, ocrSlic
872
966
  ocr = { mayUse: false, description: "OCR does not apply to text files." };
873
967
  }
874
968
  break;
969
+ case "extractStructuredData":
970
+ if (kind === "pdf") {
971
+ nativeExtraction = {
972
+ willAttempt: true,
973
+ description: "extractStructuredData: analyzePdf plus PDF.js page rows, outline, links, and annotations, normalized to StructuredDocumentResult."
974
+ };
975
+ ocr = {
976
+ mayUse: pdfOcr !== "off",
977
+ description: pdfOcr === "off" ? "Raster OCR is off (pdf.ocr: off); structured text uses native extraction only." : pdfOcr === "force" ? "Raster OCR may run on all pages (pdf.ocr: force)." : "Raster OCR may run when heuristics suggest weak native text (pdf.ocr: auto)."
978
+ };
979
+ } else if (kind === "docx") {
980
+ nativeExtraction = {
981
+ willAttempt: true,
982
+ description: "extractStructuredData: Mammoth plus merged OOXML extractors (structure, headings, tables, blocks, pagesApprox, embeddedImages) in one envelope."
983
+ };
984
+ ocr = { mayUse: false, description: "DOCX does not use OCR." };
985
+ } else if (kind === "image") {
986
+ nativeExtraction = {
987
+ willAttempt: false,
988
+ description: NODE_IMAGE_OCR_PIPELINE
989
+ };
990
+ ocr = {
991
+ mayUse: true,
992
+ description: "OCR + layout blocks: same pipeline as package extractStructuredDataFromImage (normalize \u2192 optional preprocess \u2192 Tesseract; TIFF multipage via ocrTiff)."
993
+ };
994
+ } else {
995
+ nativeExtraction = {
996
+ willAttempt: true,
997
+ description: "UTF-8 decode with BOM handling; normalizeToStructuredResult produces the structured envelope."
998
+ };
999
+ ocr = { mayUse: false, description: "OCR does not apply to text files." };
1000
+ limitations = lim("Plain text has no native layout blocks; `text` carries the decoded content.");
1001
+ }
1002
+ break;
875
1003
  default:
876
1004
  nativeExtraction = { willAttempt: false, description: "Generic intent; see plan." };
877
1005
  ocr = { mayUse: false, description: "See plan steps." };
@@ -957,9 +1085,16 @@ function planAnalyzeFile(kind, pdfOcr, docxInclude, ocr) {
957
1085
  };
958
1086
  }
959
1087
  }
960
- function planForIntent(intentOpt, kind, pdfOcrForAnalyze, docxInclude, ocr) {
1088
+ function planForIntent(intentOpt, kind, pdfOcrForAnalyze, docxInclude, ocr, analyzeFileOutput) {
961
1089
  const intent = intentOpt ?? "analyzeFile";
962
- if (intent === "analyzeFile") return planAnalyzeFile(kind, pdfOcrForAnalyze, docxInclude, ocr);
1090
+ if (intent === "analyzeFile") {
1091
+ const base = planAnalyzeFile(kind, pdfOcrForAnalyze, docxInclude, ocr);
1092
+ if (!analyzeFileRequestsStructured(analyzeFileOutput)) return base;
1093
+ return {
1094
+ ...base,
1095
+ steps: [...base.steps ?? [], { id: "structured_merge", status: "planned" }]
1096
+ };
1097
+ }
963
1098
  if (intent === "extractText") {
964
1099
  if (kind === "pdf") {
965
1100
  return {
@@ -1078,6 +1213,59 @@ function planForIntent(intentOpt, kind, pdfOcrForAnalyze, docxInclude, ocr) {
1078
1213
  ]
1079
1214
  };
1080
1215
  }
1216
+ if (intent === "extractStructuredData") {
1217
+ switch (kind) {
1218
+ case "pdf":
1219
+ return {
1220
+ intent: "extractStructuredData",
1221
+ steps: [
1222
+ { id: "detect_kind", status: "done" },
1223
+ { id: "pdf_analyze", status: "planned" },
1224
+ { id: "pdf_structure_extract", status: "planned" },
1225
+ {
1226
+ id: "pdf_ocr",
1227
+ status: pdfOcrForAnalyze === "off" ? "skipped" : "planned"
1228
+ },
1229
+ { id: "structured_normalize", status: "planned" }
1230
+ ]
1231
+ };
1232
+ case "docx":
1233
+ return {
1234
+ intent: "extractStructuredData",
1235
+ steps: [
1236
+ { id: "detect_kind", status: "done" },
1237
+ { id: "docx_mammoth", status: "planned" },
1238
+ { id: "docx_ooxml_parallel", status: "planned" },
1239
+ { id: "structured_normalize", status: "planned" }
1240
+ ]
1241
+ };
1242
+ case "image":
1243
+ return {
1244
+ intent: "extractStructuredData",
1245
+ steps: [
1246
+ ...imageOcrPlanSteps(ocr),
1247
+ { id: "structured_normalize", status: "planned" }
1248
+ ]
1249
+ };
1250
+ case "text":
1251
+ return {
1252
+ intent: "extractStructuredData",
1253
+ steps: [
1254
+ { id: "detect_kind", status: "done" },
1255
+ { id: "utf8_decode", status: "planned" },
1256
+ { id: "structured_normalize", status: "planned" }
1257
+ ]
1258
+ };
1259
+ default:
1260
+ return {
1261
+ intent: "extractStructuredData",
1262
+ steps: [
1263
+ { id: "detect_kind", status: "done" },
1264
+ { id: "route", status: "failed" }
1265
+ ]
1266
+ };
1267
+ }
1268
+ }
1081
1269
  return planAnalyzeFile(kind, pdfOcrForAnalyze, docxInclude, ocr);
1082
1270
  }
1083
1271
  async function getCapabilities(input, options) {
@@ -1096,10 +1284,13 @@ async function explainAnalysisPlan(input, options) {
1096
1284
  const pdfOcrAnalyze = resolvePdfOcrMode(options?.pdf);
1097
1285
  const docxInc = options?.docx?.include;
1098
1286
  const ocrSlice = options?.ocr;
1099
- const plan = planForIntent(intent, kind, pdfOcrAnalyze, docxInc, ocrSlice);
1287
+ const plan = planForIntent(intent, kind, pdfOcrAnalyze, docxInc, ocrSlice, {
1288
+ structuredOutput: options?.structuredOutput,
1289
+ output: options?.output
1290
+ });
1100
1291
  return buildNodeExplainReport(kind, intent, pdfOcrAnalyze, plan, docxInc, ocrSlice);
1101
1292
  }
1102
1293
 
1103
- export { DOCX_EMBEDDED_IMAGE_CAPABILITIES, DOCX_STRUCTURE_CAPABILITIES, analyzeFile, bufferToInput, convertToHtml, docxIncludeRequested, explainAnalysisPlan, extractMetadata, extractText, getCapabilities, readFileToInput, resolveNodeAnalyzeInput, runOcr };
1294
+ export { DOCX_EMBEDDED_IMAGE_CAPABILITIES, DOCX_STRUCTURE_CAPABILITIES, analyzeFile, bufferToInput, convertToHtml, docxIncludeRequested, explainAnalysisPlan, extractMetadata, extractStructuredData, extractText, getCapabilities, readFileToInput, resolveNodeAnalyzeInput, runOcr };
1104
1295
  //# sourceMappingURL=index.js.map
1105
1296
  //# sourceMappingURL=index.js.map
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@dragon708/docmind-node",
3
- "version": "1.9.1",
3
+ "version": "1.10.0",
4
4
  "description": "Official DocMind Node facade: analyzeFile, intent APIs, PDF/DOCX/OCR, and fs helpers.",
5
5
  "type": "module",
6
6
  "main": "./dist/index.js",
@@ -32,10 +32,10 @@
32
32
  ],
33
33
  "license": "MIT",
34
34
  "dependencies": {
35
- "@dragon708/docmind-docx": "^1.7.1",
36
- "@dragon708/docmind-ocr": "^1.1.3",
37
- "@dragon708/docmind-pdf": "^2.1.1",
38
- "@dragon708/docmind-shared": "^1.1.1"
35
+ "@dragon708/docmind-docx": "^1.8.0",
36
+ "@dragon708/docmind-ocr": "^1.1.4",
37
+ "@dragon708/docmind-pdf": "^2.2.0",
38
+ "@dragon708/docmind-shared": "^1.2.0"
39
39
  },
40
40
  "devDependencies": {
41
41
  "@types/node": "^20.19.37",