npm - @dragon708/docmind-node - Versions diffs - 1.2.0 → 1.7.0 - Mend

@dragon708/docmind-node 1.2.0 → 1.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (3) hide show

package/dist/index.d.ts CHANGED Viewed

@@ -2,6 +2,8 @@ import { DocMindAnalyzeOptions, DetectFileKindInput, NamedInput, AnalysisResult,
 export { AnalysisAnalyzer, AnalysisResult, CapabilityDescriptor, DetectFileKindInput, DocMindPublicIntent, DocxAnalysisCoreResult, ExplainAnalysisPlanOptions, ExplainAnalysisPlanResult, FileKind, FileKindMetadata, GenericAnalysisResult, GetCapabilitiesOptions, GetCapabilitiesResult, ImageAnalysisCoreResult, PdfAnalysisCoreResult, TextAnalysisResult, detectFileKind } from '@dragon708/docmind-shared';
 import { OcrOptions } from '@dragon708/docmind-ocr';
 import { PdfAnalyzeOptions } from '@dragon708/docmind-pdf';
+import { AnalyzeDocxIncludeFlags, DocxToHtmlOptions } from '@dragon708/docmind-docx';
+export { AnalyzeDocxIncludeFlags, DocxEmbeddedImage, DocxEmbeddedImageConversionOptions, DocxEmbeddedImageConversionResult, DocxEmbeddedImageWebSlice, DocxImageExtractionMode, ExtractImagesFromDocxOptions, ExtractImagesFromDocxResult, convertDocxEmbeddedImageToWeb, convertDocxImagesForWeb, docxImageIsBrowserRenderable, docxImageRequiresConversion, docxImageSuggestedTargetFormat, docxImageToDataUri, extractImagesFromDocx, isNodeJsRuntime } from '@dragon708/docmind-docx';
 /**
  * Options for Node public APIs (`analyzeFile`, intent methods).
@@ -9,10 +11,30 @@ import { PdfAnalyzeOptions } from '@dragon708/docmind-pdf';
  * - **`pdf`**: forwarded to `@dragon708/docmind-pdf`. `analyzeFile` defaults `pdf.ocr` to `"auto"` when omitted.
  *   {@link extractText} / {@link convertToHtml} merge a default of `ocr: "off"` unless you set `pdf.ocr` explicitly.
  * - **`ocr`**: forwarded to `@dragon708/docmind-ocr` for raster images; language string also feeds PDF OCR when `pdf.ocrLangs` is unset.
+ * - **`pdfNativeTextSource`**: when `pdf.ocr` resolves to `"off"`, chooses how native text is obtained (see {@link extractText} default).
  */
+/**
+ * DOCX options for the Node facade (Mammoth + optional v2 inclusions from `@dragon708/docmind-docx`).
+ */
+interface NodeAnalyzeDocxOptionsSlice {
+    /**
+     * Passed to `analyzeDocx` → OOXML/ZIP extractors run in parallel with Mammoth (`structure`, `headings`, `tables`, `blocks`, `pagesApprox`, `embeddedImages`).
+     */
+    readonly include?: AnalyzeDocxIncludeFlags;
+    /** Mammoth options for HTML (e.g. `convertImage`). */
+    readonly html?: DocxToHtmlOptions;
+}
 interface NodeAnalyzeOptions extends DocMindAnalyzeOptions {
     readonly pdf?: PdfAnalyzeOptions;
     readonly ocr?: OcrOptions;
+    /** DOCX only: see {@link NodeAnalyzeDocxOptionsSlice}. */
+    readonly docx?: NodeAnalyzeDocxOptionsSlice;
+    /**
+     * Native PDF text when `pdf.ocr` is `"off"`:
+     * - **`pdfjs-per-page`** (default for {@link extractText}): PDF.js text per page, merged for `text` (aligns with OCR raster engine).
+     * - **`pdf-parse`**: single pdf-parse pass (default for {@link analyzeFile} when you set `pdf.ocr: "off"` without this flag).
+     */
+    readonly pdfNativeTextSource?: "pdf-parse" | "pdfjs-per-page";
 }
 /**
@@ -38,8 +60,9 @@ declare function resolveNodeAnalyzeInput(input: NodeAnalyzeInput): Promise<Detec
 declare function analyzeFile(input: NodeAnalyzeInput, options?: NodeAnalyzeOptions): Promise<AnalysisResult>;
 /**
- * Plain-text extraction using {@link analyzeFile} routing. PDFs default to **text layer only**
- * (`pdf.ocr: "off"`) unless you set `options.pdf.ocr` explicitly.
+ * Plain-text extraction using {@link analyzeFile} routing. PDFs default to **native text only**
+ * (`pdf.ocr: "off"`) unless you set `options.pdf.ocr` explicitly, and to **PDF.js per-page** assembly
+ * (`pdfNativeTextSource: "pdfjs-per-page"`) unless you set `options.pdfNativeTextSource` or `pdf.ocr` enables OCR.
  */
 declare function extractText(input: NodeAnalyzeInput, options?: NodeAnalyzeOptions): Promise<AnalysisResult>;
 /**
@@ -58,6 +81,62 @@ declare function convertToHtml(input: NodeAnalyzeInput, options?: NodeAnalyzeOpt
  */
 declare function runOcr(input: NodeAnalyzeInput, options?: NodeAnalyzeOptions): Promise<AnalysisResult>;
+/** High-level features the user can ask DocMind for (per input kind and runtime). */
+type PublicCapabilityId = "text" | "metadata" | "html" | "ocr" | "pages";
+/**
+ * DOCX-only: what the stack can do with OOXML embedded bitmaps/vector payloads under `word/media`.
+ * Present on {@link GetCapabilitiesReport} when `kind === "docx"`.
+ */
+interface DocxEmbeddedImageCapabilities {
+    /** Bytes under `word/media/*` can be read (see `@dragon708/docmind-docx` / facade {@link extractImagesFromDocx}). */
+    readonly canExtractEmbeddedImages: true;
+    /**
+     * DOCX files may contain EMF, WMF, HEIC, etc., which are not reliably usable in a browser `<img>` without conversion.
+     * This flag is static (kind-based); it does not inspect the open document.
+     */
+    readonly documentsMayIncludeImagesRequiringWebConversion: true;
+    /**
+     * In-browser conversion for those formats is **not** provided by DocMind; Node helpers may attempt best-effort conversion
+     * (currently stub — see package warnings).
+     */
+    readonly webFriendlyConversionNodeFirst: true;
+    readonly notes: readonly string[];
+}
+/** Shared slice for {@link GetCapabilitiesReport} and {@link ExplainAnalysisPlanReport}. */
+/** True when `options.docx.include` requests at least one OOXML v2 extractor. */
+declare function docxIncludeRequested(flags?: AnalyzeDocxIncludeFlags): boolean;
+/**
+ * DOCX v2 structural capabilities in Node (via `@dragon708/docmind-docx` + `options.docx.include`).
+ * Present on {@link GetCapabilitiesReport} when `kind === "docx"`.
+ */
+interface DocxStructuralCapabilities {
+    readonly ooxmlExtractorsAvailable: true;
+    readonly activatedViaDocxInclude: true;
+    readonly features: readonly string[];
+    readonly notes: readonly string[];
+}
+declare const DOCX_STRUCTURE_CAPABILITIES: DocxStructuralCapabilities;
+declare const DOCX_EMBEDDED_IMAGE_CAPABILITIES: DocxEmbeddedImageCapabilities;
+/** Whether a {@link PublicCapabilityId} applies to the detected file in this runtime. */
+interface PublicCapabilitySupport {
+    readonly id: PublicCapabilityId;
+    readonly supported: boolean;
+    readonly warnings?: readonly string[];
+}
+/**
+ * Result of {@link getCapabilities}: detected kind, runtime id, per-feature support for this input, and optional global warnings.
+ */
+interface GetCapabilitiesReport {
+    readonly kind: FileKind;
+    readonly runtime: RuntimeDescriptor;
+    readonly capabilities: readonly PublicCapabilitySupport[];
+    /** Only when {@link GetCapabilitiesReport.kind} is `"docx"`. */
+    readonly docxEmbeddedImages?: DocxEmbeddedImageCapabilities;
+    /** Only when `kind === "docx"`: OOXML v2 extractors available via `options.docx.include`. */
+    readonly docxStructure?: DocxStructuralCapabilities;
+    readonly warnings?: readonly string[];
+}
 /**
  * Whether DocMind will try a non-OCR text/HTML path (e.g. Mammoth, pdf-parse text layer, UTF-8).
  */
@@ -83,33 +162,21 @@ interface ExplainAnalysisPlanReport {
     readonly ocr: OcrPlan;
     readonly limitations: readonly string[];
     readonly plan: ProcessingPlanDescriptor;
+    /** Only when `kind === "docx"` (same payload as `getCapabilities` → `docxEmbeddedImages`). */
+    readonly docxEmbeddedImages?: DocxEmbeddedImageCapabilities;
+    /** Only when `kind === "docx"` (same payload as `getCapabilities` → `docxStructure`). */
+    readonly docxStructure?: DocxStructuralCapabilities;
     readonly warnings?: readonly string[];
 }
-/** High-level features the user can ask DocMind for (per input kind and runtime). */
-type PublicCapabilityId = "text" | "metadata" | "html" | "ocr" | "pages";
-/** Whether a {@link PublicCapabilityId} applies to the detected file in this runtime. */
-interface PublicCapabilitySupport {
-    readonly id: PublicCapabilityId;
-    readonly supported: boolean;
-    readonly warnings?: readonly string[];
-}
-/**
- * Result of {@link getCapabilities}: detected kind, runtime id, per-feature support for this input, and optional global warnings.
- */
-interface GetCapabilitiesReport {
-    readonly kind: FileKind;
-    readonly runtime: RuntimeDescriptor;
-    readonly capabilities: readonly PublicCapabilitySupport[];
-    readonly warnings?: readonly string[];
-}
-/** Options for {@link explainAnalysisPlan} including PDF/OCR hints for accurate planning. */
-type NodeExplainAnalysisPlanOptions = ExplainAnalysisPlanOptions & Pick<NodeAnalyzeOptions, "pdf" | "ocr">;
+/** Options for {@link explainAnalysisPlan} including PDF/OCR/DOCX hints for accurate planning. */
+type NodeExplainAnalysisPlanOptions = ExplainAnalysisPlanOptions & Pick<NodeAnalyzeOptions, "pdf" | "ocr" | "docx">;
 /**
  * Epic 1 — **Capabilities:** after resolving {@link NodeAnalyzeInput}, lists which of
- * `text` | `metadata` | `html` | `ocr` | `pages` apply for that kind in Node (PDF fully supported).
+ * `text` | `metadata` | `html` | `ocr` | `pages` apply for that kind in Node (for PDF, `text` / `metadata` /
+ * `pages` / `ocr` describe the v2 pdf-parse + PDF.js + OCR stack; see {@link buildNodeCapabilityReport}).
+ * For **DOCX**, `docxEmbeddedImages` and `docxStructure` describe ZIP media and optional OOXML v2 extractors (`options.docx.include`).
  * Does not run Mammoth/Tesseract/PDF bodies beyond path resolution.
  */
 declare function getCapabilities(input: NodeAnalyzeInput, options?: GetCapabilitiesOptions): Promise<GetCapabilitiesReport>;
@@ -119,4 +186,4 @@ declare function getCapabilities(input: NodeAnalyzeInput, options?: GetCapabilit
  */
 declare function explainAnalysisPlan(input: NodeAnalyzeInput, options?: NodeExplainAnalysisPlanOptions): Promise<ExplainAnalysisPlanReport>;
-export { type ExplainAnalysisPlanReport, type GetCapabilitiesReport, type NativeExtractionPlan, type NodeAnalyzeInput, type NodeAnalyzeOptions, type NodeExplainAnalysisPlanOptions, type OcrPlan, type PublicCapabilityId, type PublicCapabilitySupport, analyzeFile, bufferToInput, convertToHtml, explainAnalysisPlan, extractMetadata, extractText, getCapabilities, readFileToInput, resolveNodeAnalyzeInput, runOcr };
+export { DOCX_EMBEDDED_IMAGE_CAPABILITIES, DOCX_STRUCTURE_CAPABILITIES, type DocxEmbeddedImageCapabilities, type DocxStructuralCapabilities, type ExplainAnalysisPlanReport, type GetCapabilitiesReport, type NativeExtractionPlan, type NodeAnalyzeDocxOptionsSlice, type NodeAnalyzeInput, type NodeAnalyzeOptions, type NodeExplainAnalysisPlanOptions, type OcrPlan, type PublicCapabilityId, type PublicCapabilitySupport, analyzeFile, bufferToInput, convertToHtml, docxIncludeRequested, explainAnalysisPlan, extractMetadata, extractText, getCapabilities, readFileToInput, resolveNodeAnalyzeInput, runOcr };

package/dist/index.js CHANGED Viewed

@@ -1,13 +1,45 @@
 import { assertValidAnalyzeFileInput, detectFileKind, notImplementedResult, UNKNOWN_FORMAT_WARNING, analyzeText, toUint8Array, isNamedInput, isBinaryInput, isBlob, isFile } from '@dragon708/docmind-shared';
 export { detectFileKind } from '@dragon708/docmind-shared';
 import { analyzeDocx } from '@dragon708/docmind-docx';
+export { convertDocxEmbeddedImageToWeb, convertDocxImagesForWeb, docxImageIsBrowserRenderable, docxImageRequiresConversion, docxImageSuggestedTargetFormat, docxImageToDataUri, extractImagesFromDocx, isNodeJsRuntime } from '@dragon708/docmind-docx';
 import { ocr } from '@dragon708/docmind-ocr';
-import { extractPdfMetadata, extractTextFromPdf, analyzePdf } from '@dragon708/docmind-pdf';
+import { extractPdfMetadata, extractTextFromPdf, analyzePdf, extractPdfTextByPage } from '@dragon708/docmind-pdf';
 import { readFile } from 'fs/promises';
 import { basename } from 'path';
 import { fileURLToPath } from 'url';
 // src/analyze.ts
+// src/docxNodeMapper.ts
+function analyzeDocxOptionsFromNode(options) { // Builds an options object from options.docx and options.signal; returns undefined when none of them is set
+  const sig = options?.signal;
+  const dx = options?.docx;
+  if (!dx?.include && !dx?.html && !sig) return void 0; // nothing to forward
+  const out = { ...dx?.html ?? {} }; // entries of docx.html are spread at the top level of the result
+  if (dx?.include) out.include = dx.include;
+  if (sig) out.signal = sig;
+  return out;
+}
+function docxPackageResultToAnalysisResult(r) { // Wraps a DOCX package result `r` as an analysis result tagged fileKind/analyzer/kind "docx" with status "ok"
+  const base = {
+    fileKind: "docx",
+    analyzer: "docx",
+    status: "ok",
+    kind: "docx",
+    text: r.text,
+    html: r.html,
+    warnings: [...r.warnings] // shallow copy so the result does not alias r.warnings
+  };
+  const v2 = { // each optional field is copied only when defined on r, so absent ones add no key
+    ...r.structure !== void 0 ? { structure: r.structure } : {},
+    ...r.headings !== void 0 ? { headings: r.headings } : {},
+    ...r.tables !== void 0 ? { tables: r.tables } : {},
+    ...r.blocks !== void 0 ? { blocks: r.blocks } : {},
+    ...r.pagesApprox !== void 0 ? { pagesApprox: r.pagesApprox } : {},
+    ...r.embeddedImages !== void 0 ? { embeddedImages: r.embeddedImages } : {}
+  };
+  return { ...base, ...v2 };
+}
 function isByteBackedInput(input) { // True when any of the four imported input guards accepts `input`
   return isNamedInput(input) || isBinaryInput(input) || isBlob(input) || isFile(input);
 }
@@ -19,7 +51,8 @@ async function bytesFromDetectInput(input) {
 }
 // src/analyzers/docx.ts
-async function analyzeDocxForNode(input, signal) {
+async function analyzeDocxForNode(input, options) {
+  const signal = options?.signal;
   if (signal?.aborted) {
     const err = new Error("The operation was aborted");
     err.name = "AbortError";
@@ -37,16 +70,9 @@ async function analyzeDocxForNode(input, signal) {
       warnings: ["No document bytes were provided for analysis."]
     };
   }
-  const r = await analyzeDocx(data);
-  return {
-    fileKind: "docx",
-    analyzer: "docx",
-    status: "ok",
-    kind: "docx",
-    text: r.text,
-    html: r.html,
-    warnings: [...r.warnings]
-  };
+  const docxOpts = analyzeDocxOptionsFromNode(options);
+  const r = docxOpts !== void 0 ? await analyzeDocx(data, docxOpts) : await analyzeDocx(data);
+  return docxPackageResultToAnalysisResult(r);
 }
 async function analyzeImageForNode(input, options) {
   if (options?.signal?.aborted) {
@@ -112,17 +138,45 @@ async function analyzePdfForNode(input, options) {
     signal: userPdf?.signal ?? options?.signal
   };
   const r = await analyzePdf(data, pdfOpts);
+  const usePdfJsPerPage = pdfOpts.ocr === "off" && (options?.pdfNativeTextSource ?? "pdf-parse") === "pdfjs-per-page";
+  if (!usePdfJsPerPage) {
+    return {
+      fileKind: "pdf",
+      analyzer: "pdf",
+      status: "ok",
+      kind: "pdf",
+      text: r.text,
+      pages: r.pages,
+      metadata: r.metadata,
+      warnings: [...r.warnings],
+      needsOCR: r.needsOCR,
+      ocrUsed: r.ocrUsed
+    };
+  }
+  let text = r.text;
+  const extra = [];
+  try {
+    const rows = await extractPdfTextByPage(data, {
+      maxPages: pdfOpts.maxPages,
+      signal: pdfOpts.signal
+    });
+    text = rows.map((row) => row.text).join("\n\n");
+  } catch (e) {
+    const msg = e instanceof Error ? e.message : String(e);
+    extra.push(`warning: PDF.js per-page text failed; using pdf-parse text: ${msg}`);
+  }
+  const needsOCR = r.pages > 0 && text.trim().length === 0;
   return {
     fileKind: "pdf",
     analyzer: "pdf",
     status: "ok",
     kind: "pdf",
-    text: r.text,
+    text,
     pages: r.pages,
     metadata: r.metadata,
-    warnings: [...r.warnings],
-    needsOCR: r.needsOCR,
-    ocrUsed: r.ocrUsed
+    warnings: [...r.warnings, ...extra],
+    needsOCR,
+    ocrUsed: false
   };
 }
 function toPathString(pathOrUrl) {
@@ -160,7 +214,7 @@ async function analyzeFile(input, options) {
     case "pdf":
       return analyzePdfForNode(resolved, options);
     case "docx":
-      return analyzeDocxForNode(resolved, options?.signal);
+      return analyzeDocxForNode(resolved, options);
     case "image":
       return analyzeImageForNode(resolved, options);
     case "text":
@@ -191,7 +245,7 @@ function throwIfAborted(signal) {
 }
 // src/publicActions.ts
-var DOCX_METADATA_STUB = "Structured document metadata for DOCX is not exposed as a separate API; use extractText or analyzeFile.";
+var DOCX_METADATA_STUB = "Structured document metadata for DOCX is not returned by extractMetadata; use analyzeFile, extractText, or convertToHtml with options.docx.include for OOXML structure, headings, tables, blocks, approximate pages, and embedded images.";
 var IMAGE_METADATA_NOTE = "Raster images have no document metadata bundle in this API.";
 var RUN_OCR_PDF_FORCE_SEMANTICS = 'runOcr: PDF pipeline ran with `ocr: "force"` so text may include raster OCR output even when a text layer exists.';
 function escapeHtmlMinimal(s) {
@@ -211,7 +265,11 @@ function toExtractTextResult(full) {
 }
 async function extractText(input, options) {
   throwIfAborted(options?.signal);
-  const full = await analyzeFile(input, withPdfOcrDefaultOff(options));
+  const merged = {
+    ...withPdfOcrDefaultOff(options),
+    pdfNativeTextSource: options?.pdfNativeTextSource ?? "pdfjs-per-page"
+  };
+  const full = await analyzeFile(input, merged);
   return toExtractTextResult(full);
 }
 async function extractMetadata(input, options) {
@@ -433,19 +491,16 @@ async function runOcr(input, options) {
           warnings: ["No document bytes were provided for analysis."]
         };
       }
-      const r = await analyzeDocx(data);
-      return {
-        fileKind: "docx",
-        analyzer: "docx",
-        status: "ok",
-        kind: "docx",
-        text: r.text,
-        html: r.html,
+      const opt = analyzeDocxOptionsFromNode(options);
+      const raw = opt !== void 0 ? await analyzeDocx(data, opt) : await analyzeDocx(data);
+      const withNote = {
+        ...raw,
         warnings: [
-          ...r.warnings,
+          ...raw.warnings,
           "OCR does not apply to DOCX; returned structured text/HTML extract."
         ]
       };
+      return docxPackageResultToAnalysisResult(withNote);
     }
     case "text":
       return analyzeText(resolved, { signal });
@@ -454,11 +509,141 @@ async function runOcr(input, options) {
   }
 }
+// src/capabilityReport.ts
+function docxIncludeRequested(flags) { // True when at least one of the six include flags is truthy; false when `flags` is missing
+  if (!flags) return false;
+  return !!(flags.structure || flags.headings || flags.tables || flags.blocks || flags.pagesApprox || flags.embeddedImages);
+}
+var DOCX_STRUCTURE_CAPABILITIES = {
+  ooxmlExtractorsAvailable: true,
+  activatedViaDocxInclude: true,
+  features: [
+    "OOXML structure (body blocks)",
+    "headings",
+    "tables",
+    "semantic blocks (RAG-friendly)",
+    "approximate pages (OOXML page-break hints)",
+    "embedded images (word/media; optional web/both modes)"
+  ],
+  notes: [
+    "Use options.docx.include on analyzeFile, extractText, convertToHtml, or runOcr to merge Mammoth output with selected extractors.",
+    "extractMetadata for DOCX remains a lightweight stub and does not run these extractors."
+  ]
+};
+var DOCX_EMBEDDED_IMAGE_CAPABILITIES = {
+  canExtractEmbeddedImages: true,
+  documentsMayIncludeImagesRequiringWebConversion: true,
+  webFriendlyConversionNodeFirst: true,
+  notes: [
+    "Use extractImagesFromDocx (re-exported from @dragon708/docmind-node) for raw ZIP media; optional mode: web | both for web-oriented bytes.",
+    "PNG, JPEG, GIF, WebP, SVG, TIFF, BMP, ICO are treated as browser-embeddable; EMF/WMF and similar may require an external Node pipeline.",
+    "@dragon708/docmind-docx does not ship a bundled EMF/WMF converter; convertDocxEmbeddedImageToWeb surfaces clear warnings until you wire ImageMagick, Sharp, or similar."
+  ]
+};
+var DOCX_META = "Structured document metadata is not exposed separately; extractMetadata returns a stub for DOCX.";
+var IMAGE_META = "Raster images have no document metadata bundle; extractMetadata returns a stub.";
+var IMAGE_HTML = "No layout HTML for raster images; use extractText or runOcr for text.";
+var TEXT_META_NOTE = "Plain text has no structured document metadata; extractMetadata still returns decoded content.";
+var UNKNOWN_KIND = "Could not determine file kind from name, MIME, or bytes; all features are reported as unsupported until the kind is known.";
+function slot(id, supported, warnings) { // Builds one capability entry; the `warnings` key is omitted when the list is missing or empty
+  return warnings?.length ? { id, supported, warnings } : { id, supported };
+}
+function buildNodeCapabilityReport(kind) { // Builds the per-kind capability report: five capability slots plus DOCX-only payloads and optional top-level warnings
+  const runtime = { id: "node" };
+  let capabilities;
+  const topWarnings = [];
+  switch (kind) {
+    case "pdf":
+      capabilities = [
+        slot("text", true, [
+          "Native text via pdf-parse and (in Node extractText) PDF.js per-page text; set pdf.ocr for raster OCR."
+        ]),
+        slot("metadata", true, [
+          "Document info / XMP-style metadata via pdf-parse without running the OCR pipeline."
+        ]),
+        slot("pages", true, [
+          "Page count and per-page native extraction (PDF.js) where used; OCR respects pdf.maxPages."
+        ]),
+        slot("ocr", true, [
+          "Raster OCR pipeline (pdf.ocr auto with quality heuristics, force, or runOcr)."
+        ]),
+        slot("html", true, ["HTML is a <pre> preview of extracted text, not visual layout."])
+      ];
+      break;
+    case "docx":
+      capabilities = [
+        slot("text", true, [
+          "Mammoth plain text; extractText clears html. Optional OOXML extractors merge when options.docx.include is set."
+        ]),
+        slot("metadata", false, [
+          `${DOCX_META} OOXML structure, headings, tables, blocks, approximate pages, and embedded images are available via analyzeFile-style routes with options.docx.include.`
+        ]),
+        slot("html", true, [
+          "Mammoth HTML uses docxImagesAsDataUri for web-safe images; EMF/WMF and other non-web types appear as placeholders, not extracted media."
+        ]),
+        slot("ocr", false, ["OCR does not apply to DOCX in DocMind."]),
+        slot("pages", false, [
+          "No PDF-style page count; approximate DOCX pages via options.docx.include.pagesApprox (OOXML hints, not print layout)."
+        ])
+      ];
+      break;
+    case "image":
+      capabilities = [
+        slot("text", true, ["Text is obtained via OCR."]),
+        slot("metadata", false, [IMAGE_META]),
+        slot("html", false, [IMAGE_HTML]),
+        slot("ocr", true),
+        slot("pages", false)
+      ];
+      break;
+    case "text":
+      capabilities = [
+        slot("text", true),
+        slot("metadata", true, [TEXT_META_NOTE]),
+        slot("html", true),
+        slot("ocr", false, ["OCR does not apply to plain text files."]),
+        slot("pages", false)
+      ];
+      break;
+    default: // any other kind: every capability is reported unsupported, with one top-level warning
+      topWarnings.push(UNKNOWN_KIND);
+      capabilities = [
+        slot("text", false),
+        slot("metadata", false),
+        slot("html", false),
+        slot("ocr", false),
+        slot("pages", false)
+      ];
+  }
+  return {
+    kind,
+    runtime,
+    capabilities,
+    ...kind === "docx" ? { // DOCX-only payloads; other kinds add no extra keys here
+      docxEmbeddedImages: DOCX_EMBEDDED_IMAGE_CAPABILITIES,
+      docxStructure: DOCX_STRUCTURE_CAPABILITIES
+    } : {},
+    warnings: topWarnings.length > 0 ? topWarnings : void 0 // key is always present; its value is undefined when there are no warnings
+  };
+}
 // src/analysisPlanReport.ts
 function lim(...items) { // Collects the arguments into an array, dropping falsy entries
   return items.filter(Boolean);
 }
-function buildNodeExplainReport(kind, intent, pdfOcr, plan) {
+var DOCX_ZIP_MEDIA_PLAN_NOTE = "ZIP embedded images (word/media) use extractImagesFromDocx (re-exported from @dragon708/docmind-node); not merged into this intent pipeline.";
+function finalizeDocxExplainReport(report) { // Adds the DOCX capability payloads and the ZIP-media note to docx reports; other kinds are returned unchanged
+  if (report.kind !== "docx") return report;
+  const limitations = report.limitations.includes(DOCX_ZIP_MEDIA_PLAN_NOTE) ? report.limitations : [...report.limitations, DOCX_ZIP_MEDIA_PLAN_NOTE]; // append the note only if not already listed
+  return {
+    ...report,
+    docxEmbeddedImages: DOCX_EMBEDDED_IMAGE_CAPABILITIES,
+    docxStructure: DOCX_STRUCTURE_CAPABILITIES,
+    limitations
+  };
+}
+var DOCX_MAMMOTH_PLUS_OPTIONAL = "Mammoth extracts text and HTML from OOXML; optional parallel OOXML/ZIP extractors run when options.docx.include is set (structure, headings, tables, blocks, pagesApprox, embeddedImages).";
+function buildNodeExplainReport(kind, intent, pdfOcr, plan, docxInclude) {
   const runtime = { id: "node" };
   const primaryAnalyzer = kind === "pdf" ? "pdf" : kind === "docx" ? "docx" : kind === "image" ? "image" : kind === "text" ? "text" : "none";
   let nativeExtraction;
@@ -468,7 +653,7 @@ function buildNodeExplainReport(kind, intent, pdfOcr, plan) {
     limitations = lim(
       "Could not classify the file from name, MIME, or bytes; analysis will return not_implemented until hints improve."
     );
-    return {
+    return finalizeDocxExplainReport({
       kind,
       detectedKind: kind,
       runtime,
@@ -478,23 +663,23 @@ function buildNodeExplainReport(kind, intent, pdfOcr, plan) {
       ocr: { mayUse: false, description: "OCR is not used for unknown kinds." },
       limitations,
       plan
-    };
+    });
   }
   switch (intent) {
     case "analyzeFile":
       if (kind === "pdf") {
         nativeExtraction = {
           willAttempt: true,
-          description: "pdf-parse extracts embedded text and page count first."
+          description: "pdf-parse supplies embedded text, metadata, and page count; PDF.js drives raster OCR when enabled."
         };
         ocr3 = {
           mayUse: pdfOcr !== "off",
-          description: pdfOcr === "off" ? "Raster OCR pipeline is off (pdf.ocr: off)." : pdfOcr === "force" ? "Raster OCR may run on all pages when pdf.ocr is force." : "Raster OCR may run when the text layer is empty (pdf.ocr: auto)."
+          description: pdfOcr === "off" ? "Raster OCR pipeline is off (pdf.ocr: off)." : pdfOcr === "force" ? "Raster OCR may run on all pages when pdf.ocr is force." : "Raster OCR may run when native text looks insufficient (pdf.ocr: auto + heuristics)."
         };
       } else if (kind === "docx") {
         nativeExtraction = {
           willAttempt: true,
-          description: "Mammoth extracts text and HTML from OOXML."
+          description: docxIncludeRequested(docxInclude) ? "Mammoth plus parallel OOXML extractors (per options.docx.include)." : DOCX_MAMMOTH_PLUS_OPTIONAL
         };
         ocr3 = { mayUse: false, description: "DOCX does not use OCR in DocMind." };
       } else if (kind === "image") {
@@ -515,16 +700,16 @@ function buildNodeExplainReport(kind, intent, pdfOcr, plan) {
       if (kind === "pdf") {
         nativeExtraction = {
           willAttempt: true,
-          description: "Text layer via pdf-parse; defaults to pdf.ocr off unless you override."
+          description: "Node: pdf-parse for metadata/page baseline, then PDF.js per-page text merged into `text` (pdfNativeTextSource pdfjs-per-page default)."
         };
         ocr3 = {
           mayUse: false,
-          description: "extractText merges pdf.ocr default off \u2014 no raster OCR unless you set pdf.ocr explicitly."
+          description: "extractText defaults pdf.ocr off; set pdf.ocr explicitly to allow auto/force raster OCR."
         };
       } else if (kind === "docx") {
         nativeExtraction = {
           willAttempt: true,
-          description: "Mammoth plain text; HTML cleared in the extractText response."
+          description: docxIncludeRequested(docxInclude) ? "Same DOCX router as analyzeFile: Mammoth text + optional OOXML fields; HTML cleared in extractText." : "Mammoth plain text; HTML cleared. Optional OOXML v2 fields when options.docx.include is set."
         };
         ocr3 = { mayUse: false, description: "DOCX does not use OCR." };
       } else if (kind === "image") {
@@ -552,7 +737,7 @@ function buildNodeExplainReport(kind, intent, pdfOcr, plan) {
         };
         ocr3 = { mayUse: false, description: "OCR not used for this metadata path." };
         limitations = lim(
-          kind === "docx" ? "Structured DOCX metadata is not exposed separately." : "Raster images have no document metadata bundle."
+          kind === "docx" ? "Structured DOCX metadata is not exposed separately; use analyzeFile / extractText / convertToHtml with options.docx.include for OOXML structure." : "Raster images have no document metadata bundle."
         );
       } else {
         nativeExtraction = {
@@ -574,7 +759,7 @@ function buildNodeExplainReport(kind, intent, pdfOcr, plan) {
       } else if (kind === "docx") {
         nativeExtraction = {
           willAttempt: true,
-          description: "Mammoth HTML output via analyzeFile routing."
+          description: docxIncludeRequested(docxInclude) ? "Mammoth HTML plus optional OOXML extractors (same router as analyzeFile)." : "Mammoth HTML via analyzeFile routing; optional OOXML v2 when options.docx.include is set."
         };
         ocr3 = { mayUse: false, description: "DOCX path does not use OCR." };
       } else if (kind === "text") {
@@ -609,7 +794,7 @@ function buildNodeExplainReport(kind, intent, pdfOcr, plan) {
       } else if (kind === "docx") {
         nativeExtraction = {
           willAttempt: true,
-          description: "Full Mammoth extract (text + HTML); not OCR."
+          description: docxIncludeRequested(docxInclude) ? "Mammoth text/HTML plus optional OOXML extractors; still not OCR." : "Full Mammoth extract (text + HTML); optional OOXML v2 via options.docx.include; not OCR."
         };
         ocr3 = { mayUse: false, description: "DOCX is not OCR'd." };
         limitations = lim("Result is structured extract, not OCR output.");
@@ -625,7 +810,7 @@ function buildNodeExplainReport(kind, intent, pdfOcr, plan) {
       nativeExtraction = { willAttempt: false, description: "Generic intent; see plan." };
       ocr3 = { mayUse: false, description: "See plan steps." };
   }
-  return {
+  return finalizeDocxExplainReport({
     kind,
     detectedKind: kind,
     runtime,
@@ -635,82 +820,14 @@ function buildNodeExplainReport(kind, intent, pdfOcr, plan) {
     ocr: ocr3,
     limitations,
     plan
-  };
-}
-// src/capabilityReport.ts
-var DOCX_META = "Structured document metadata is not exposed separately; extractMetadata returns a stub for DOCX.";
-var IMAGE_META = "Raster images have no document metadata bundle; extractMetadata returns a stub.";
-var IMAGE_HTML = "No layout HTML for raster images; use extractText or runOcr for text.";
-var TEXT_META_NOTE = "Plain text has no structured document metadata; extractMetadata still returns decoded content.";
-var UNKNOWN_KIND = "Could not determine file kind from name, MIME, or bytes; all features are reported as unsupported until the kind is known.";
-function slot(id, supported, warnings) {
-  return warnings?.length ? { id, supported, warnings } : { id, supported };
-}
-function buildNodeCapabilityReport(kind) {
-  const runtime = { id: "node" };
-  let capabilities;
-  const topWarnings = [];
-  switch (kind) {
-    case "pdf":
-      capabilities = [
-        slot("text", true, ["Includes text layer extraction; use extractText options to avoid PDF OCR."]),
-        slot("metadata", true),
-        slot("html", true, ["HTML is a <pre> preview of extracted text, not visual layout."]),
-        slot("ocr", true, ["Raster OCR is available (e.g. analyzeFile with pdf.ocr auto/force, or runOcr)."]),
-        slot("pages", true)
-      ];
-      break;
-    case "docx":
-      capabilities = [
-        slot("text", true),
-        slot("metadata", false, [DOCX_META]),
-        slot("html", true),
-        slot("ocr", false, ["OCR does not apply to DOCX in DocMind."]),
-        slot("pages", false)
-      ];
-      break;
-    case "image":
-      capabilities = [
-        slot("text", true, ["Text is obtained via OCR."]),
-        slot("metadata", false, [IMAGE_META]),
-        slot("html", false, [IMAGE_HTML]),
-        slot("ocr", true),
-        slot("pages", false)
-      ];
-      break;
-    case "text":
-      capabilities = [
-        slot("text", true),
-        slot("metadata", true, [TEXT_META_NOTE]),
-        slot("html", true),
-        slot("ocr", false, ["OCR does not apply to plain text files."]),
-        slot("pages", false)
-      ];
-      break;
-    default:
-      topWarnings.push(UNKNOWN_KIND);
-      capabilities = [
-        slot("text", false),
-        slot("metadata", false),
-        slot("html", false),
-        slot("ocr", false),
-        slot("pages", false)
-      ];
-  }
-  return {
-    kind,
-    runtime,
-    capabilities,
-    warnings: topWarnings.length > 0 ? topWarnings : void 0
-  };
+  });
 }
 // src/introspection.ts
 function resolvePdfOcrMode(pdf) { // Returns pdf.ocr, falling back to "auto" when `pdf` or `pdf.ocr` is null/undefined
   return pdf?.ocr ?? "auto";
 }
-function planAnalyzeFile(kind, pdfOcr) {
+function planAnalyzeFile(kind, pdfOcr, docxInclude) {
   switch (kind) {
     case "pdf":
       return {
@@ -724,14 +841,17 @@ function planAnalyzeFile(kind, pdfOcr) {
           }
         ]
       };
-    case "docx":
+    case "docx": {
+      const parallel = docxIncludeRequested(docxInclude);
       return {
         intent: "analyzeFile",
         steps: [
           { id: "detect_kind", status: "done" },
-          { id: "docx_mammoth", status: "planned" }
+          { id: "docx_mammoth", status: "planned" },
+          ...parallel ? [{ id: "docx_ooxml_parallel", status: "planned" }] : []
         ]
       };
+    }
     case "image":
       return {
         intent: "analyzeFile",
@@ -758,11 +878,22 @@ function planAnalyzeFile(kind, pdfOcr) {
       };
   }
 }
-function planForIntent(intentOpt, kind, pdfOcrForAnalyze) {
+function planForIntent(intentOpt, kind, pdfOcrForAnalyze, docxInclude) {
   const intent = intentOpt ?? "analyzeFile";
-  if (intent === "analyzeFile") return planAnalyzeFile(kind, pdfOcrForAnalyze);
+  if (intent === "analyzeFile") return planAnalyzeFile(kind, pdfOcrForAnalyze, docxInclude);
   if (intent === "extractText") {
-    const p = planAnalyzeFile(kind, "off");
+    if (kind === "pdf") {
+      return {
+        intent: "extractText",
+        steps: [
+          { id: "detect_kind", status: "done" },
+          { id: "pdf_parse", status: "planned" },
+          { id: "pdfjs_per_page", status: "planned" },
+          { id: "pdf_ocr", status: "skipped" }
+        ]
+      };
+    }
+    const p = planAnalyzeFile(kind, "off", docxInclude);
     return { ...p, intent: "extractText" };
   }
   if (intent === "extractMetadata") {
@@ -794,11 +925,13 @@ function planForIntent(intentOpt, kind, pdfOcrForAnalyze) {
   }
   if (intent === "convertToHtml") {
     if (kind === "docx") {
+      const parallel = docxIncludeRequested(docxInclude);
       return {
         intent: "convertToHtml",
         steps: [
           { id: "detect_kind", status: "done" },
-          { id: "docx_mammoth_html", status: "planned" }
+          { id: "docx_mammoth_html", status: "planned" },
+          ...parallel ? [{ id: "docx_ooxml_parallel", status: "planned" }] : []
         ]
       };
     }
@@ -851,11 +984,13 @@ function planForIntent(intentOpt, kind, pdfOcrForAnalyze) {
       };
     }
     if (kind === "docx") {
+      const parallel = docxIncludeRequested(docxInclude);
       return {
         intent: "runOcr",
         steps: [
           { id: "detect_kind", status: "done" },
-          { id: "docx_structured_extract", status: "planned" }
+          { id: "docx_mammoth", status: "planned" },
+          ...parallel ? [{ id: "docx_ooxml_parallel", status: "planned" }] : []
         ]
       };
     }
@@ -867,7 +1002,7 @@ function planForIntent(intentOpt, kind, pdfOcrForAnalyze) {
       ]
     };
   }
-  return planAnalyzeFile(kind, pdfOcrForAnalyze);
+  return planAnalyzeFile(kind, pdfOcrForAnalyze, docxInclude);
 }
 async function getCapabilities(input, options) {
   throwIfAborted(options?.signal);
@@ -883,10 +1018,11 @@ async function explainAnalysisPlan(input, options) {
   const kind = detectFileKind(resolved);
   const intent = options?.intent ?? "analyzeFile";
   const pdfOcrAnalyze = resolvePdfOcrMode(options?.pdf);
-  const plan = planForIntent(intent, kind, pdfOcrAnalyze);
-  return buildNodeExplainReport(kind, intent, pdfOcrAnalyze, plan);
+  const docxInc = options?.docx?.include;
+  const plan = planForIntent(intent, kind, pdfOcrAnalyze, docxInc);
+  return buildNodeExplainReport(kind, intent, pdfOcrAnalyze, plan, docxInc);
 }
-export { analyzeFile, bufferToInput, convertToHtml, explainAnalysisPlan, extractMetadata, extractText, getCapabilities, readFileToInput, resolveNodeAnalyzeInput, runOcr };
+export { DOCX_EMBEDDED_IMAGE_CAPABILITIES, DOCX_STRUCTURE_CAPABILITIES, analyzeFile, bufferToInput, convertToHtml, docxIncludeRequested, explainAnalysisPlan, extractMetadata, extractText, getCapabilities, readFileToInput, resolveNodeAnalyzeInput, runOcr };
 //# sourceMappingURL=index.js.map
 //# sourceMappingURL=index.js.map

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@dragon708/docmind-node",
-  "version": "1.2.0",
+  "version": "1.7.0",
   "description": "Official DocMind Node facade: analyzeFile, intent APIs, PDF/DOCX/OCR, and fs helpers.",
   "type": "module",
   "main": "./dist/index.js",
@@ -32,9 +32,9 @@
   ],
   "license": "MIT",
   "dependencies": {
-    "@dragon708/docmind-docx": "^1.0.0",
+    "@dragon708/docmind-docx": "^1.7.0",
     "@dragon708/docmind-ocr": "^1.0.0",
-    "@dragon708/docmind-pdf": "^1.0.0",
+    "@dragon708/docmind-pdf": "^2.0.0",
     "@dragon708/docmind-shared": "^1.1.0"
   },
   "devDependencies": {