npm - @dragon708/docmind-node - Versions diffs - 1.9.1 → 1.10.0 - Mend

@dragon708/docmind-node 1.9.1 → 1.10.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (3)

package/dist/index.d.ts (changed)

@@ -1,9 +1,11 @@
-import { DocMindAnalyzeOptions, DetectFileKindInput, NamedInput, AnalysisResult, FileKind, RuntimeDescriptor, DocMindPublicIntent, AnalysisAnalyzer, ProcessingPlanDescriptor, ExplainAnalysisPlanOptions, GetCapabilitiesOptions } from '@dragon708/docmind-shared';
-export { AnalysisAnalyzer, AnalysisResult, CapabilityDescriptor, DetectFileKindInput, DocMindPublicIntent, DocxAnalysisCoreResult, ExplainAnalysisPlanOptions, ExplainAnalysisPlanResult, FileKind, FileKindMetadata, GenericAnalysisResult, GetCapabilitiesOptions, GetCapabilitiesResult, ImageAnalysisCoreResult, PdfAnalysisCoreResult, TextAnalysisResult, detectFileKind } from '@dragon708/docmind-shared';
-import { OcrOptions, OcrTiffOptions, PreprocessImageOptions } from '@dragon708/docmind-ocr';
-import { PdfAnalyzeOptions } from '@dragon708/docmind-pdf';
-import { AnalyzeDocxIncludeFlags, DocxToHtmlOptions } from '@dragon708/docmind-docx';
-export { AnalyzeDocxIncludeFlags, DocxEmbeddedImage, DocxEmbeddedImageConversionOptions, DocxEmbeddedImageConversionResult, DocxEmbeddedImageWebSlice, DocxImageExtractionMode, ExtractImagesFromDocxOptions, ExtractImagesFromDocxResult, convertDocxEmbeddedImageToWeb, convertDocxImagesForWeb, docxImageIsBrowserRenderable, docxImageRequiresConversion, docxImageSuggestedTargetFormat, docxImageToDataUri, extractImagesFromDocx, isNodeJsRuntime } from '@dragon708/docmind-docx';
+import { DocMindAnalyzeOptions, AnalyzeFileOutputOptions, NormalizeStructuredOptions, DetectFileKindInput, NamedInput, AnalysisResult, StructuredDocumentResult, FileKind, RuntimeDescriptor, DocMindPublicIntent, AnalysisAnalyzer, ProcessingPlanDescriptor, ExplainAnalysisPlanOptions, GetCapabilitiesOptions } from '@dragon708/docmind-shared';
+export { AnalysisAnalyzer, AnalysisResult, CapabilityDescriptor, DetectFileKindInput, DocMindPublicIntent, DocumentBlock, DocumentImageRef, DocumentPage, DocumentTable, DocxAnalysisCoreResult, ExplainAnalysisPlanOptions, ExplainAnalysisPlanResult, FileKind, FileKindMetadata, GenericAnalysisResult, GetCapabilitiesOptions, GetCapabilitiesResult, ImageAnalysisCoreResult, PdfAnalysisCoreResult, StructuredDocumentResult, TextAnalysisResult, analyzeFileRequestsStructured, detectFileKind, isStructuredDocumentResult } from '@dragon708/docmind-shared';
+import { OcrOptions, OcrTiffOptions, PreprocessImageOptions, ExtractStructuredDataFromImageOptions } from '@dragon708/docmind-ocr';
+export { ExtractStructuredDataFromImageOptions, extractStructuredDataFromImage } from '@dragon708/docmind-ocr';
+import { AnalyzeDocxIncludeFlags, DocxToHtmlOptions, ExtractStructuredDataFromDocxOptions } from '@dragon708/docmind-docx';
+export { AnalyzeDocxIncludeFlags, DocxEmbeddedImage, DocxEmbeddedImageConversionOptions, DocxEmbeddedImageConversionResult, DocxEmbeddedImageWebSlice, DocxImageExtractionMode, ExtractImagesFromDocxOptions, ExtractImagesFromDocxResult, ExtractStructuredDataFromDocxOptions, convertDocxEmbeddedImageToWeb, convertDocxImagesForWeb, docxImageIsBrowserRenderable, docxImageRequiresConversion, docxImageSuggestedTargetFormat, docxImageToDataUri, extractImagesFromDocx, extractStructuredDataFromDocx, isNodeJsRuntime } from '@dragon708/docmind-docx';
+import { PdfAnalyzeOptions, ExtractStructuredDataFromPdfOptions } from '@dragon708/docmind-pdf';
+export { ExtractStructuredDataFromPdfIncludeFlags, ExtractStructuredDataFromPdfOptions, extractStructuredDataFromPdf } from '@dragon708/docmind-pdf';
 /**
  * Options for Node public APIs (`analyzeFile`, intent methods).
@@ -24,7 +26,7 @@ interface NodeAnalyzeDocxOptionsSlice {
     /** Opciones Mammoth para HTML (p. ej. `convertImage`). */
     readonly html?: DocxToHtmlOptions;
 }
-interface NodeAnalyzeOptions extends DocMindAnalyzeOptions {
+interface NodeAnalyzeOptions extends DocMindAnalyzeOptions, AnalyzeFileOutputOptions {
     readonly pdf?: PdfAnalyzeOptions;
     readonly ocr?: OcrOptions & Pick<OcrTiffOptions, "maxPages" | "pageSeparator"> & {
         readonly preprocess?: PreprocessImageOptions;
@@ -38,6 +40,16 @@ interface NodeAnalyzeOptions extends DocMindAnalyzeOptions {
      */
     readonly pdfNativeTextSource?: "pdf-parse" | "pdfjs-per-page";
 }
+/**
+ * Opciones para {@link extractStructuredData}: reenvío por rama (`pdf` / `docx` / `ocr`) más
+ * `normalize` opcional aplicado a texto plano o como respaldo cuando la rama no define `normalize`.
+ */
+interface NodeExtractStructuredDataOptions extends DocMindAnalyzeOptions {
+    readonly pdf?: ExtractStructuredDataFromPdfOptions;
+    readonly docx?: ExtractStructuredDataFromDocxOptions;
+    readonly ocr?: ExtractStructuredDataFromImageOptions;
+    readonly normalize?: NormalizeStructuredOptions;
+}
 /**
  * Inputs accepted by {@link analyzeFile} in this package.
@@ -85,8 +97,15 @@ declare function convertToHtml(input: NodeAnalyzeInput, options?: NodeAnalyzeOpt
  */
 declare function runOcr(input: NodeAnalyzeInput, options?: NodeAnalyzeOptions): Promise<AnalysisResult>;
+/**
+ * Resuelve entrada Node, clasifica el archivo y devuelve un {@link StructuredDocumentResult} vía
+ * el extractor estructurado del paquete correspondiente (PDF, DOCX, imagen OCR) o
+ * {@link normalizeToStructuredResult} para texto UTF-8.
+ */
+declare function extractStructuredData(input: NodeAnalyzeInput, options?: NodeExtractStructuredDataOptions): Promise<StructuredDocumentResult>;
 /** High-level features the user can ask DocMind for (per input kind and runtime). */
-type PublicCapabilityId = "text" | "metadata" | "html" | "ocr" | "pages" | "ocr-multipage" | "image-normalization" | "tiff" | "heic-node-only" | "bmp" | "gif-first-frame";
+type PublicCapabilityId = "text" | "metadata" | "html" | "ocr" | "pages" | "structured-output" | "ocr-multipage" | "image-normalization" | "tiff" | "heic-node-only" | "bmp" | "gif-first-frame";
 /**
  * DOCX-only: what the stack can do with OOXML embedded bitmaps/vector payloads under `word/media`.
  * Present on {@link GetCapabilitiesReport} when `kind === "docx"`.
@@ -160,7 +179,8 @@ interface ExplainAnalysisPlanReport {
     readonly kind: FileKind;
     readonly detectedKind: FileKind;
     readonly runtime: RuntimeDescriptor;
-    readonly intent: DocMindPublicIntent | (string & {});
+    /** Incluye intents extendidos en Node (p. ej. `extractStructuredData`). */
+    readonly intent: DocMindPublicIntent | string;
     readonly primaryAnalyzer: AnalysisAnalyzer;
     readonly nativeExtraction: NativeExtractionPlan;
     readonly ocr: OcrPlan;
@@ -174,11 +194,11 @@ interface ExplainAnalysisPlanReport {
 }
 /** Options for {@link explainAnalysisPlan} including PDF/OCR/DOCX hints for accurate planning. */
-type NodeExplainAnalysisPlanOptions = ExplainAnalysisPlanOptions & Pick<NodeAnalyzeOptions, "pdf" | "ocr" | "docx">;
+type NodeExplainAnalysisPlanOptions = ExplainAnalysisPlanOptions & Pick<NodeAnalyzeOptions, "pdf" | "ocr" | "docx" | "structuredOutput" | "output">;
 /**
  * Epic 1 — **Capabilities:** after resolving {@link NodeAnalyzeInput}, lists which of
- * `text` | `metadata` | `html` | `ocr` | `pages` (and image-specific ids such as `ocr-multipage`, `image-normalization`, `tiff`, `heic-node-only`, `bmp`, `gif-first-frame`) apply for that kind in Node (for PDF, `text` / `metadata` /
+ * `text` | `metadata` | `html` | `ocr` | `pages` | `structured-output` (and image-specific ids such as `ocr-multipage`, `image-normalization`, `tiff`, `heic-node-only`, `bmp`, `gif-first-frame`) apply for that kind in Node (for PDF, `text` / `metadata` /
  * `pages` / `ocr` describe the v2 pdf-parse + PDF.js + OCR stack; see {@link buildNodeCapabilityReport}).
  * For **DOCX**, `docxEmbeddedImages` and `docxStructure` describe ZIP media and optional OOXML v2 extractors (`options.docx.include`).
  * Does not run Mammoth/Tesseract/PDF bodies beyond path resolution.
@@ -190,4 +210,4 @@ declare function getCapabilities(input: NodeAnalyzeInput, options?: GetCapabilit
  */
 declare function explainAnalysisPlan(input: NodeAnalyzeInput, options?: NodeExplainAnalysisPlanOptions): Promise<ExplainAnalysisPlanReport>;
-export { DOCX_EMBEDDED_IMAGE_CAPABILITIES, DOCX_STRUCTURE_CAPABILITIES, type DocxEmbeddedImageCapabilities, type DocxStructuralCapabilities, type ExplainAnalysisPlanReport, type GetCapabilitiesReport, type NativeExtractionPlan, type NodeAnalyzeDocxOptionsSlice, type NodeAnalyzeInput, type NodeAnalyzeOptions, type NodeExplainAnalysisPlanOptions, type OcrPlan, type PublicCapabilityId, type PublicCapabilitySupport, analyzeFile, bufferToInput, convertToHtml, docxIncludeRequested, explainAnalysisPlan, extractMetadata, extractText, getCapabilities, readFileToInput, resolveNodeAnalyzeInput, runOcr };
+export { DOCX_EMBEDDED_IMAGE_CAPABILITIES, DOCX_STRUCTURE_CAPABILITIES, type DocxEmbeddedImageCapabilities, type DocxStructuralCapabilities, type ExplainAnalysisPlanReport, type GetCapabilitiesReport, type NativeExtractionPlan, type NodeAnalyzeDocxOptionsSlice, type NodeAnalyzeInput, type NodeAnalyzeOptions, type NodeExplainAnalysisPlanOptions, type NodeExtractStructuredDataOptions, type OcrPlan, type PublicCapabilityId, type PublicCapabilitySupport, analyzeFile, bufferToInput, convertToHtml, docxIncludeRequested, explainAnalysisPlan, extractMetadata, extractStructuredData, extractText, getCapabilities, readFileToInput, resolveNodeAnalyzeInput, runOcr };

package/dist/index.js (changed)

@@ -1,9 +1,11 @@
-import { assertValidAnalyzeFileInput, detectFileKind, notImplementedResult, UNKNOWN_FORMAT_WARNING, analyzeText, toUint8Array, isNamedInput, isBinaryInput, isBlob, isFile, getMimeType } from '@dragon708/docmind-shared';
-export { detectFileKind } from '@dragon708/docmind-shared';
-import { analyzeDocx } from '@dragon708/docmind-docx';
-export { convertDocxEmbeddedImageToWeb, convertDocxImagesForWeb, docxImageIsBrowserRenderable, docxImageRequiresConversion, docxImageSuggestedTargetFormat, docxImageToDataUri, extractImagesFromDocx, isNodeJsRuntime } from '@dragon708/docmind-docx';
-import { preprocessHasEffect, resolveImageFormat, ocrTiff, ocrImageDetailed } from '@dragon708/docmind-ocr';
-import { extractPdfMetadata, extractTextFromPdf, analyzePdf, extractPdfTextByPage } from '@dragon708/docmind-pdf';
+import { assertValidAnalyzeFileInput, detectFileKind, normalizeToStructuredResult, UNKNOWN_FORMAT_WARNING, analyzeText, notImplementedResult, analyzeFileRequestsStructured, toUint8Array, isNamedInput, isBinaryInput, isBlob, isFile, getMimeType } from '@dragon708/docmind-shared';
+export { analyzeFileRequestsStructured, detectFileKind, isStructuredDocumentResult } from '@dragon708/docmind-shared';
+import { extractStructuredDataFromDocx, analyzeDocx } from '@dragon708/docmind-docx';
+export { convertDocxEmbeddedImageToWeb, convertDocxImagesForWeb, docxImageIsBrowserRenderable, docxImageRequiresConversion, docxImageSuggestedTargetFormat, docxImageToDataUri, extractImagesFromDocx, extractStructuredDataFromDocx, isNodeJsRuntime } from '@dragon708/docmind-docx';
+import { extractStructuredDataFromImage, preprocessHasEffect, resolveImageFormat, ocrTiff, ocrImageDetailed } from '@dragon708/docmind-ocr';
+export { extractStructuredDataFromImage } from '@dragon708/docmind-ocr';
+import { extractStructuredDataFromPdf, extractPdfMetadata, extractTextFromPdf, analyzePdf, extractPdfTextByPage } from '@dragon708/docmind-pdf';
+export { extractStructuredDataFromPdf } from '@dragon708/docmind-pdf';
 import { readFile } from 'fs/promises';
 import { basename } from 'path';
 import { fileURLToPath } from 'url';
@@ -252,6 +254,72 @@ async function resolveNodeAnalyzeInput(input) {
   return input;
 }
+// src/internal/abort.ts
+function throwIfAborted(signal) {
+  if (signal?.aborted) {
+    const err = new Error("The operation was aborted");
+    err.name = "AbortError";
+    throw err;
+  }
+}
+// src/extractStructuredData.ts
+var PLAIN_TEXT_STRUCTURED_NOTE = "Plain text: structured output wraps decoded UTF-8 in a unified envelope (no layout blocks).";
+async function extractStructuredData(input, options) {
+  throwIfAborted(options?.signal);
+  const resolved = await resolveNodeAnalyzeInput(input);
+  assertValidAnalyzeFileInput(resolved);
+  const kind = detectFileKind(resolved);
+  const signal = options?.signal;
+  const normFallback = options?.normalize;
+  switch (kind) {
+    case "pdf": {
+      const data = await bytesFromDetectInput(resolved);
+      return extractStructuredDataFromPdf(data, {
+        ...options?.pdf,
+        signal: options?.pdf?.signal ?? signal,
+        normalize: options?.pdf?.normalize ?? normFallback
+      });
+    }
+    case "docx": {
+      const data = await bytesFromDetectInput(resolved);
+      return extractStructuredDataFromDocx(data, {
+        ...options?.docx,
+        signal: options?.docx?.signal ?? signal,
+        normalize: options?.docx?.normalize ?? normFallback
+      });
+    }
+    case "image": {
+      const data = await bytesFromDetectInput(resolved);
+      return extractStructuredDataFromImage(data, {
+        ...options?.ocr,
+        signal: options?.ocr?.signal ?? signal,
+        normalize: options?.ocr?.normalize ?? normFallback
+      });
+    }
+    case "text": {
+      const r = await analyzeText(resolved, { signal });
+      return normalizeToStructuredResult(
+        {
+          kind: "text",
+          text: r.text,
+          warnings: [...r.warnings, PLAIN_TEXT_STRUCTURED_NOTE]
+        },
+        normFallback
+      );
+    }
+    default:
+      return normalizeToStructuredResult(
+        {
+          kind: "unknown",
+          text: "",
+          warnings: [UNKNOWN_FORMAT_WARNING]
+        },
+        normFallback
+      );
+  }
+}
 // src/analyze.ts
 async function analyzeFile(input, options) {
   if (options?.signal?.aborted) {
@@ -262,17 +330,39 @@ async function analyzeFile(input, options) {
   const resolved = await resolveNodeAnalyzeInput(input);
   assertValidAnalyzeFileInput(resolved);
   const fileKind = detectFileKind(resolved);
+  let result;
   switch (fileKind) {
     case "pdf":
-      return analyzePdfForNode(resolved, options);
+      result = await analyzePdfForNode(resolved, options);
+      break;
     case "docx":
-      return analyzeDocxForNode(resolved, options);
+      result = await analyzeDocxForNode(resolved, options);
+      break;
     case "image":
-      return analyzeImageForNode(resolved, options);
+      result = await analyzeImageForNode(resolved, options);
+      break;
     case "text":
-      return analyzeText(resolved, { signal: options?.signal });
+      result = await analyzeText(resolved, { signal: options?.signal });
+      break;
     default:
-      return notImplementedResult(fileKind, "none", [UNKNOWN_FORMAT_WARNING]);
+      result = notImplementedResult(fileKind, "none", [UNKNOWN_FORMAT_WARNING]);
+  }
+  if (!analyzeFileRequestsStructured(options) || result.status !== "ok") {
+    return result;
+  }
+  try {
+    const structured = await extractStructuredData(resolved, {
+      signal: options?.signal,
+      pdf: options?.pdf,
+      docx: options?.docx,
+      ocr: options?.ocr
+    });
+    return { ...result, structured };
+  } catch (e) {
+    if (e instanceof Error && e.name === "AbortError") throw e;
+    const msg = e instanceof Error ? e.message : String(e);
+    const prev = "warnings" in result && Array.isArray(result.warnings) ? [...result.warnings] : [];
+    return { ...result, warnings: [...prev, `warning: analyzeFile structured merge failed: ${msg}`] };
   }
 }
@@ -287,15 +377,6 @@ function withPdfOcrDefaultOff(options) {
   };
 }
-// src/internal/abort.ts
-function throwIfAborted(signal) {
-  if (signal?.aborted) {
-    const err = new Error("The operation was aborted");
-    err.name = "AbortError";
-    throw err;
-  }
-}
 // src/publicActions.ts
 var DOCX_METADATA_STUB = "Structured document metadata for DOCX is not returned by extractMetadata; use analyzeFile, extractText, or convertToHtml with options.docx.include for OOXML structure, headings, tables, blocks, approximate pages, and embedded images.";
 var IMAGE_METADATA_NOTE = "Raster images have no document metadata bundle in this API.";
@@ -595,7 +676,10 @@ function buildNodeCapabilityReport(kind) {
         slot("ocr", true, [
           "Raster OCR pipeline (pdf.ocr auto with quality heuristics, force, or runOcr)."
         ]),
-        slot("html", true, ["HTML is a <pre> preview of extracted text, not visual layout."])
+        slot("html", true, ["HTML is a <pre> preview of extracted text, not visual layout."]),
+        slot("structured-output", true, [
+          "extractStructuredData: analyzePdf + PDF.js per-page text, outline, links, annotations, merged via normalizeToStructuredResult; respects pdf.ocr like analyzeFile."
+        ])
       ];
       break;
     case "docx":
@@ -612,6 +696,9 @@ function buildNodeCapabilityReport(kind) {
         slot("ocr", false, ["OCR does not apply to DOCX in DocMind."]),
         slot("pages", false, [
           "No PDF-style page count; approximate DOCX pages via options.docx.include.pagesApprox (OOXML hints, not print layout)."
+        ]),
+        slot("structured-output", true, [
+          "extractStructuredData runs analyzeDocx with merged OOXML includes (blocks, tables, headings, embedded images, etc.) into StructuredDocumentResult."
         ])
       ];
       break;
@@ -643,6 +730,9 @@ function buildNodeCapabilityReport(kind) {
         ]),
         slot("pages", true, [
           "TIFF: `pages` and `textByPage` mirror frames processed. Other formats may expose `pages` when the normalizer reports it (e.g. GIF)."
+        ]),
+        slot("structured-output", true, [
+          "extractStructuredData: OCR + layout blocks (ocrImageDetailed / ocrTiff) normalized to StructuredDocumentResult."
         ])
       ];
       break;
@@ -652,7 +742,10 @@ function buildNodeCapabilityReport(kind) {
         slot("metadata", true, [TEXT_META_NOTE]),
         slot("html", true),
         slot("ocr", false, ["OCR does not apply to plain text files."]),
-        slot("pages", false)
+        slot("pages", false),
+        slot("structured-output", true, [
+          "extractStructuredData wraps UTF-8 decode in normalizeToStructuredResult (rollup text only unless you add blocks upstream)."
+        ])
       ];
       break;
     default:
@@ -662,7 +755,8 @@ function buildNodeCapabilityReport(kind) {
         slot("metadata", false),
         slot("html", false),
         slot("ocr", false),
-        slot("pages", false)
+        slot("pages", false),
+        slot("structured-output", false)
       ];
   }
   return {
@@ -872,6 +966,40 @@ function buildNodeExplainReport(kind, intent, pdfOcr, plan, docxInclude, ocrSlic
         ocr = { mayUse: false, description: "OCR does not apply to text files." };
       }
       break;
+    case "extractStructuredData":
+      if (kind === "pdf") {
+        nativeExtraction = {
+          willAttempt: true,
+          description: "extractStructuredData: analyzePdf plus PDF.js page rows, outline, links, and annotations, normalized to StructuredDocumentResult."
+        };
+        ocr = {
+          mayUse: pdfOcr !== "off",
+          description: pdfOcr === "off" ? "Raster OCR is off (pdf.ocr: off); structured text uses native extraction only." : pdfOcr === "force" ? "Raster OCR may run on all pages (pdf.ocr: force)." : "Raster OCR may run when heuristics suggest weak native text (pdf.ocr: auto)."
+        };
+      } else if (kind === "docx") {
+        nativeExtraction = {
+          willAttempt: true,
+          description: "extractStructuredData: Mammoth plus merged OOXML extractors (structure, headings, tables, blocks, pagesApprox, embeddedImages) in one envelope."
+        };
+        ocr = { mayUse: false, description: "DOCX does not use OCR." };
+      } else if (kind === "image") {
+        nativeExtraction = {
+          willAttempt: false,
+          description: NODE_IMAGE_OCR_PIPELINE
+        };
+        ocr = {
+          mayUse: true,
+          description: "OCR + layout blocks: same pipeline as package extractStructuredDataFromImage (normalize \u2192 optional preprocess \u2192 Tesseract; TIFF multipage via ocrTiff)."
+        };
+      } else {
+        nativeExtraction = {
+          willAttempt: true,
+          description: "UTF-8 decode with BOM handling; normalizeToStructuredResult produces the structured envelope."
+        };
+        ocr = { mayUse: false, description: "OCR does not apply to text files." };
+        limitations = lim("Plain text has no native layout blocks; `text` carries the decoded content.");
+      }
+      break;
     default:
       nativeExtraction = { willAttempt: false, description: "Generic intent; see plan." };
       ocr = { mayUse: false, description: "See plan steps." };
@@ -957,9 +1085,16 @@ function planAnalyzeFile(kind, pdfOcr, docxInclude, ocr) {
       };
   }
 }
-function planForIntent(intentOpt, kind, pdfOcrForAnalyze, docxInclude, ocr) {
+function planForIntent(intentOpt, kind, pdfOcrForAnalyze, docxInclude, ocr, analyzeFileOutput) {
   const intent = intentOpt ?? "analyzeFile";
-  if (intent === "analyzeFile") return planAnalyzeFile(kind, pdfOcrForAnalyze, docxInclude, ocr);
+  if (intent === "analyzeFile") {
+    const base = planAnalyzeFile(kind, pdfOcrForAnalyze, docxInclude, ocr);
+    if (!analyzeFileRequestsStructured(analyzeFileOutput)) return base;
+    return {
+      ...base,
+      steps: [...base.steps ?? [], { id: "structured_merge", status: "planned" }]
+    };
+  }
   if (intent === "extractText") {
     if (kind === "pdf") {
       return {
@@ -1078,6 +1213,59 @@ function planForIntent(intentOpt, kind, pdfOcrForAnalyze, docxInclude, ocr) {
       ]
     };
   }
+  if (intent === "extractStructuredData") {
+    switch (kind) {
+      case "pdf":
+        return {
+          intent: "extractStructuredData",
+          steps: [
+            { id: "detect_kind", status: "done" },
+            { id: "pdf_analyze", status: "planned" },
+            { id: "pdf_structure_extract", status: "planned" },
+            {
+              id: "pdf_ocr",
+              status: pdfOcrForAnalyze === "off" ? "skipped" : "planned"
+            },
+            { id: "structured_normalize", status: "planned" }
+          ]
+        };
+      case "docx":
+        return {
+          intent: "extractStructuredData",
+          steps: [
+            { id: "detect_kind", status: "done" },
+            { id: "docx_mammoth", status: "planned" },
+            { id: "docx_ooxml_parallel", status: "planned" },
+            { id: "structured_normalize", status: "planned" }
+          ]
+        };
+      case "image":
+        return {
+          intent: "extractStructuredData",
+          steps: [
+            ...imageOcrPlanSteps(ocr),
+            { id: "structured_normalize", status: "planned" }
+          ]
+        };
+      case "text":
+        return {
+          intent: "extractStructuredData",
+          steps: [
+            { id: "detect_kind", status: "done" },
+            { id: "utf8_decode", status: "planned" },
+            { id: "structured_normalize", status: "planned" }
+          ]
+        };
+      default:
+        return {
+          intent: "extractStructuredData",
+          steps: [
+            { id: "detect_kind", status: "done" },
+            { id: "route", status: "failed" }
+          ]
+        };
+    }
+  }
   return planAnalyzeFile(kind, pdfOcrForAnalyze, docxInclude, ocr);
 }
 async function getCapabilities(input, options) {
@@ -1096,10 +1284,13 @@ async function explainAnalysisPlan(input, options) {
   const pdfOcrAnalyze = resolvePdfOcrMode(options?.pdf);
   const docxInc = options?.docx?.include;
   const ocrSlice = options?.ocr;
-  const plan = planForIntent(intent, kind, pdfOcrAnalyze, docxInc, ocrSlice);
+  const plan = planForIntent(intent, kind, pdfOcrAnalyze, docxInc, ocrSlice, {
+    structuredOutput: options?.structuredOutput,
+    output: options?.output
+  });
   return buildNodeExplainReport(kind, intent, pdfOcrAnalyze, plan, docxInc, ocrSlice);
 }
-export { DOCX_EMBEDDED_IMAGE_CAPABILITIES, DOCX_STRUCTURE_CAPABILITIES, analyzeFile, bufferToInput, convertToHtml, docxIncludeRequested, explainAnalysisPlan, extractMetadata, extractText, getCapabilities, readFileToInput, resolveNodeAnalyzeInput, runOcr };
+export { DOCX_EMBEDDED_IMAGE_CAPABILITIES, DOCX_STRUCTURE_CAPABILITIES, analyzeFile, bufferToInput, convertToHtml, docxIncludeRequested, explainAnalysisPlan, extractMetadata, extractStructuredData, extractText, getCapabilities, readFileToInput, resolveNodeAnalyzeInput, runOcr };
 //# sourceMappingURL=index.js.map
 //# sourceMappingURL=index.js.map

package/package.json (changed)

@@ -1,6 +1,6 @@
 {
   "name": "@dragon708/docmind-node",
-  "version": "1.9.1",
+  "version": "1.10.0",
   "description": "Official DocMind Node facade: analyzeFile, intent APIs, PDF/DOCX/OCR, and fs helpers.",
   "type": "module",
   "main": "./dist/index.js",
@@ -32,10 +32,10 @@
   ],
   "license": "MIT",
   "dependencies": {
-    "@dragon708/docmind-docx": "^1.7.1",
-    "@dragon708/docmind-ocr": "^1.1.3",
-    "@dragon708/docmind-pdf": "^2.1.1",
-    "@dragon708/docmind-shared": "^1.1.1"
+    "@dragon708/docmind-docx": "^1.8.0",
+    "@dragon708/docmind-ocr": "^1.1.4",
+    "@dragon708/docmind-pdf": "^2.2.0",
+    "@dragon708/docmind-shared": "^1.2.0"
   },
   "devDependencies": {
     "@types/node": "^20.19.37",