@dragon708/docmind-node 1.4.0 → 1.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (3)
  1. package/dist/index.d.ts +79 -21
  2. package/dist/index.js +205 -120
  3. package/package.json +2 -2
package/dist/index.d.ts CHANGED
@@ -2,6 +2,8 @@ import { DocMindAnalyzeOptions, DetectFileKindInput, NamedInput, AnalysisResult,
2
2
  export { AnalysisAnalyzer, AnalysisResult, CapabilityDescriptor, DetectFileKindInput, DocMindPublicIntent, DocxAnalysisCoreResult, ExplainAnalysisPlanOptions, ExplainAnalysisPlanResult, FileKind, FileKindMetadata, GenericAnalysisResult, GetCapabilitiesOptions, GetCapabilitiesResult, ImageAnalysisCoreResult, PdfAnalysisCoreResult, TextAnalysisResult, detectFileKind } from '@dragon708/docmind-shared';
3
3
  import { OcrOptions } from '@dragon708/docmind-ocr';
4
4
  import { PdfAnalyzeOptions } from '@dragon708/docmind-pdf';
5
+ import { AnalyzeDocxIncludeFlags, DocxToHtmlOptions } from '@dragon708/docmind-docx';
6
+ export { AnalyzeDocxIncludeFlags, DocxEmbeddedImage, DocxEmbeddedImageConversionOptions, DocxEmbeddedImageConversionResult, DocxEmbeddedImageWebSlice, DocxImageExtractionMode, ExtractImagesFromDocxOptions, ExtractImagesFromDocxResult, convertDocxEmbeddedImageToWeb, convertDocxImagesForWeb, docxImageIsBrowserRenderable, docxImageRequiresConversion, docxImageSuggestedTargetFormat, docxImageToDataUri, extractImagesFromDocx, isNodeJsRuntime } from '@dragon708/docmind-docx';
5
7
 
6
8
  /**
7
9
  * Options for Node public APIs (`analyzeFile`, intent methods).
@@ -11,9 +13,22 @@ import { PdfAnalyzeOptions } from '@dragon708/docmind-pdf';
11
13
  * - **`ocr`**: forwarded to `@dragon708/docmind-ocr` for raster images; language string also feeds PDF OCR when `pdf.ocrLangs` is unset.
12
14
  * - **`pdfNativeTextSource`**: when `pdf.ocr` resolves to `"off"`, chooses how native text is obtained (see {@link extractText} default).
13
15
  */
16
+ /**
17
+ * DOCX options for the Node facade (Mammoth + optional v2 includes from `@dragon708/docmind-docx`).
18
+ */
19
+ interface NodeAnalyzeDocxOptionsSlice {
20
+ /**
21
+ * Passed through to `analyzeDocx` → OOXML/ZIP extractors in parallel with Mammoth (`structure`, `headings`, `tables`, `blocks`, `pagesApprox`, `embeddedImages`).
22
+ */
23
+ readonly include?: AnalyzeDocxIncludeFlags;
24
+ /** Mammoth options for HTML (e.g. `convertImage`). */
25
+ readonly html?: DocxToHtmlOptions;
26
+ }
14
27
  interface NodeAnalyzeOptions extends DocMindAnalyzeOptions {
15
28
  readonly pdf?: PdfAnalyzeOptions;
16
29
  readonly ocr?: OcrOptions;
30
+ /** DOCX only: see {@link NodeAnalyzeDocxOptionsSlice}. */
31
+ readonly docx?: NodeAnalyzeDocxOptionsSlice;
17
32
  /**
18
33
  * Native PDF text when `pdf.ocr` is `"off"`:
19
34
  * - **`pdfjs-per-page`** (default for {@link extractText}): PDF.js text per page, merged for `text` (aligns with OCR raster engine).
@@ -66,6 +81,62 @@ declare function convertToHtml(input: NodeAnalyzeInput, options?: NodeAnalyzeOpt
66
81
  */
67
82
  declare function runOcr(input: NodeAnalyzeInput, options?: NodeAnalyzeOptions): Promise<AnalysisResult>;
68
83
 
84
+ /** High-level features the user can ask DocMind for (per input kind and runtime). */
85
+ type PublicCapabilityId = "text" | "metadata" | "html" | "ocr" | "pages";
86
+ /**
87
+ * DOCX-only: what the stack can do with OOXML embedded bitmaps/vector payloads under `word/media`.
88
+ * Present on {@link GetCapabilitiesReport} when `kind === "docx"`.
89
+ */
90
+ interface DocxEmbeddedImageCapabilities {
91
+ /** Bytes under `word/media/*` can be read (see `@dragon708/docmind-docx` / facade {@link extractImagesFromDocx}). */
92
+ readonly canExtractEmbeddedImages: true;
93
+ /**
94
+ * DOCX files may contain EMF, WMF, HEIC, etc., which are not reliably usable in a browser `<img>` without conversion.
95
+ * This flag is static (kind-based); it does not inspect the open document.
96
+ */
97
+ readonly documentsMayIncludeImagesRequiringWebConversion: true;
98
+ /**
99
+ * In-browser conversion for those formats is **not** provided by DocMind; Node helpers may attempt best-effort conversion
100
+ * (currently stub — see package warnings).
101
+ */
102
+ readonly webFriendlyConversionNodeFirst: true;
103
+ readonly notes: readonly string[];
104
+ }
105
+ /** Shared slice for {@link GetCapabilitiesReport} and {@link ExplainAnalysisPlanReport}. */
106
+ /** True if `options.docx.include` requests at least one OOXML v2 extractor. */
107
+ declare function docxIncludeRequested(flags?: AnalyzeDocxIncludeFlags): boolean;
108
+ /**
109
+ * DOCX v2 structural capabilities in Node (via `@dragon708/docmind-docx` + `options.docx.include`).
110
+ * Present on {@link GetCapabilitiesReport} when `kind === "docx"`.
111
+ */
112
+ interface DocxStructuralCapabilities {
113
+ readonly ooxmlExtractorsAvailable: true;
114
+ readonly activatedViaDocxInclude: true;
115
+ readonly features: readonly string[];
116
+ readonly notes: readonly string[];
117
+ }
118
+ declare const DOCX_STRUCTURE_CAPABILITIES: DocxStructuralCapabilities;
119
+ declare const DOCX_EMBEDDED_IMAGE_CAPABILITIES: DocxEmbeddedImageCapabilities;
120
+ /** Whether a {@link PublicCapabilityId} applies to the detected file in this runtime. */
121
+ interface PublicCapabilitySupport {
122
+ readonly id: PublicCapabilityId;
123
+ readonly supported: boolean;
124
+ readonly warnings?: readonly string[];
125
+ }
126
+ /**
127
+ * Result of {@link getCapabilities}: detected kind, runtime id, per-feature support for this input, and optional global warnings.
128
+ */
129
+ interface GetCapabilitiesReport {
130
+ readonly kind: FileKind;
131
+ readonly runtime: RuntimeDescriptor;
132
+ readonly capabilities: readonly PublicCapabilitySupport[];
133
+ /** Only when {@link GetCapabilitiesReport.kind} is `"docx"`. */
134
+ readonly docxEmbeddedImages?: DocxEmbeddedImageCapabilities;
135
+ /** Only when `kind === "docx"`: OOXML v2 extractors available with `options.docx.include`. */
136
+ readonly docxStructure?: DocxStructuralCapabilities;
137
+ readonly warnings?: readonly string[];
138
+ }
139
+
69
140
  /**
70
141
  * Whether DocMind will try a non-OCR text/HTML path (e.g. Mammoth, pdf-parse text layer, UTF-8).
71
142
  */
@@ -91,34 +162,21 @@ interface ExplainAnalysisPlanReport {
91
162
  readonly ocr: OcrPlan;
92
163
  readonly limitations: readonly string[];
93
164
  readonly plan: ProcessingPlanDescriptor;
165
+ /** Only when `kind === "docx"` (same payload as `getCapabilities` → `docxEmbeddedImages`). */
166
+ readonly docxEmbeddedImages?: DocxEmbeddedImageCapabilities;
167
+ /** Only when `kind === "docx"` (same payload as `getCapabilities` → `docxStructure`). */
168
+ readonly docxStructure?: DocxStructuralCapabilities;
94
169
  readonly warnings?: readonly string[];
95
170
  }
96
171
 
97
- /** High-level features the user can ask DocMind for (per input kind and runtime). */
98
- type PublicCapabilityId = "text" | "metadata" | "html" | "ocr" | "pages";
99
- /** Whether a {@link PublicCapabilityId} applies to the detected file in this runtime. */
100
- interface PublicCapabilitySupport {
101
- readonly id: PublicCapabilityId;
102
- readonly supported: boolean;
103
- readonly warnings?: readonly string[];
104
- }
105
- /**
106
- * Result of {@link getCapabilities}: detected kind, runtime id, per-feature support for this input, and optional global warnings.
107
- */
108
- interface GetCapabilitiesReport {
109
- readonly kind: FileKind;
110
- readonly runtime: RuntimeDescriptor;
111
- readonly capabilities: readonly PublicCapabilitySupport[];
112
- readonly warnings?: readonly string[];
113
- }
114
-
115
- /** Options for {@link explainAnalysisPlan} including PDF/OCR hints for accurate planning. */
116
- type NodeExplainAnalysisPlanOptions = ExplainAnalysisPlanOptions & Pick<NodeAnalyzeOptions, "pdf" | "ocr">;
172
+ /** Options for {@link explainAnalysisPlan} including PDF/OCR/DOCX hints for accurate planning. */
173
+ type NodeExplainAnalysisPlanOptions = ExplainAnalysisPlanOptions & Pick<NodeAnalyzeOptions, "pdf" | "ocr" | "docx">;
117
174
 
118
175
  /**
119
176
  * Epic 1 — **Capabilities:** after resolving {@link NodeAnalyzeInput}, lists which of
120
177
  * `text` | `metadata` | `html` | `ocr` | `pages` apply for that kind in Node (for PDF, `text` / `metadata` /
121
178
  * `pages` / `ocr` describe the v2 pdf-parse + PDF.js + OCR stack; see {@link buildNodeCapabilityReport}).
179
+ * For **DOCX**, `docxEmbeddedImages` and `docxStructure` describe ZIP media and optional OOXML v2 extractors (`options.docx.include`).
122
180
  * Does not run Mammoth/Tesseract/PDF bodies beyond path resolution.
123
181
  */
124
182
  declare function getCapabilities(input: NodeAnalyzeInput, options?: GetCapabilitiesOptions): Promise<GetCapabilitiesReport>;
@@ -128,4 +186,4 @@ declare function getCapabilities(input: NodeAnalyzeInput, options?: GetCapabilit
128
186
  */
129
187
  declare function explainAnalysisPlan(input: NodeAnalyzeInput, options?: NodeExplainAnalysisPlanOptions): Promise<ExplainAnalysisPlanReport>;
130
188
 
131
- export { type ExplainAnalysisPlanReport, type GetCapabilitiesReport, type NativeExtractionPlan, type NodeAnalyzeInput, type NodeAnalyzeOptions, type NodeExplainAnalysisPlanOptions, type OcrPlan, type PublicCapabilityId, type PublicCapabilitySupport, analyzeFile, bufferToInput, convertToHtml, explainAnalysisPlan, extractMetadata, extractText, getCapabilities, readFileToInput, resolveNodeAnalyzeInput, runOcr };
189
+ export { DOCX_EMBEDDED_IMAGE_CAPABILITIES, DOCX_STRUCTURE_CAPABILITIES, type DocxEmbeddedImageCapabilities, type DocxStructuralCapabilities, type ExplainAnalysisPlanReport, type GetCapabilitiesReport, type NativeExtractionPlan, type NodeAnalyzeDocxOptionsSlice, type NodeAnalyzeInput, type NodeAnalyzeOptions, type NodeExplainAnalysisPlanOptions, type OcrPlan, type PublicCapabilityId, type PublicCapabilitySupport, analyzeFile, bufferToInput, convertToHtml, docxIncludeRequested, explainAnalysisPlan, extractMetadata, extractText, getCapabilities, readFileToInput, resolveNodeAnalyzeInput, runOcr };
package/dist/index.js CHANGED
@@ -1,6 +1,7 @@
1
1
  import { assertValidAnalyzeFileInput, detectFileKind, notImplementedResult, UNKNOWN_FORMAT_WARNING, analyzeText, toUint8Array, isNamedInput, isBinaryInput, isBlob, isFile } from '@dragon708/docmind-shared';
2
2
  export { detectFileKind } from '@dragon708/docmind-shared';
3
3
  import { analyzeDocx } from '@dragon708/docmind-docx';
4
+ export { convertDocxEmbeddedImageToWeb, convertDocxImagesForWeb, docxImageIsBrowserRenderable, docxImageRequiresConversion, docxImageSuggestedTargetFormat, docxImageToDataUri, extractImagesFromDocx, isNodeJsRuntime } from '@dragon708/docmind-docx';
4
5
  import { ocr } from '@dragon708/docmind-ocr';
5
6
  import { extractPdfMetadata, extractTextFromPdf, analyzePdf, extractPdfTextByPage } from '@dragon708/docmind-pdf';
6
7
  import { readFile } from 'fs/promises';
@@ -8,6 +9,37 @@ import { basename } from 'path';
8
9
  import { fileURLToPath } from 'url';
9
10
 
10
11
  // src/analyze.ts
12
+
13
+ // src/docxNodeMapper.ts
14
+ function analyzeDocxOptionsFromNode(options) {
15
+ const sig = options?.signal;
16
+ const dx = options?.docx;
17
+ if (!dx?.include && !dx?.html && !sig) return void 0;
18
+ const out = { ...dx?.html ?? {} };
19
+ if (dx?.include) out.include = dx.include;
20
+ if (sig) out.signal = sig;
21
+ return out;
22
+ }
23
+ function docxPackageResultToAnalysisResult(r) {
24
+ const base = {
25
+ fileKind: "docx",
26
+ analyzer: "docx",
27
+ status: "ok",
28
+ kind: "docx",
29
+ text: r.text,
30
+ html: r.html,
31
+ warnings: [...r.warnings]
32
+ };
33
+ const v2 = {
34
+ ...r.structure !== void 0 ? { structure: r.structure } : {},
35
+ ...r.headings !== void 0 ? { headings: r.headings } : {},
36
+ ...r.tables !== void 0 ? { tables: r.tables } : {},
37
+ ...r.blocks !== void 0 ? { blocks: r.blocks } : {},
38
+ ...r.pagesApprox !== void 0 ? { pagesApprox: r.pagesApprox } : {},
39
+ ...r.embeddedImages !== void 0 ? { embeddedImages: r.embeddedImages } : {}
40
+ };
41
+ return { ...base, ...v2 };
42
+ }
11
43
  function isByteBackedInput(input) {
12
44
  return isNamedInput(input) || isBinaryInput(input) || isBlob(input) || isFile(input);
13
45
  }
@@ -19,7 +51,8 @@ async function bytesFromDetectInput(input) {
19
51
  }
20
52
 
21
53
  // src/analyzers/docx.ts
22
- async function analyzeDocxForNode(input, signal) {
54
+ async function analyzeDocxForNode(input, options) {
55
+ const signal = options?.signal;
23
56
  if (signal?.aborted) {
24
57
  const err = new Error("The operation was aborted");
25
58
  err.name = "AbortError";
@@ -37,16 +70,9 @@ async function analyzeDocxForNode(input, signal) {
37
70
  warnings: ["No document bytes were provided for analysis."]
38
71
  };
39
72
  }
40
- const r = await analyzeDocx(data);
41
- return {
42
- fileKind: "docx",
43
- analyzer: "docx",
44
- status: "ok",
45
- kind: "docx",
46
- text: r.text,
47
- html: r.html,
48
- warnings: [...r.warnings]
49
- };
73
+ const docxOpts = analyzeDocxOptionsFromNode(options);
74
+ const r = docxOpts !== void 0 ? await analyzeDocx(data, docxOpts) : await analyzeDocx(data);
75
+ return docxPackageResultToAnalysisResult(r);
50
76
  }
51
77
  async function analyzeImageForNode(input, options) {
52
78
  if (options?.signal?.aborted) {
@@ -188,7 +214,7 @@ async function analyzeFile(input, options) {
188
214
  case "pdf":
189
215
  return analyzePdfForNode(resolved, options);
190
216
  case "docx":
191
- return analyzeDocxForNode(resolved, options?.signal);
217
+ return analyzeDocxForNode(resolved, options);
192
218
  case "image":
193
219
  return analyzeImageForNode(resolved, options);
194
220
  case "text":
@@ -219,7 +245,7 @@ function throwIfAborted(signal) {
219
245
  }
220
246
 
221
247
  // src/publicActions.ts
222
- var DOCX_METADATA_STUB = "Structured document metadata for DOCX is not exposed as a separate API; use extractText or analyzeFile.";
248
+ var DOCX_METADATA_STUB = "Structured document metadata for DOCX is not returned by extractMetadata; use analyzeFile, extractText, or convertToHtml with options.docx.include for OOXML structure, headings, tables, blocks, approximate pages, and embedded images.";
223
249
  var IMAGE_METADATA_NOTE = "Raster images have no document metadata bundle in this API.";
224
250
  var RUN_OCR_PDF_FORCE_SEMANTICS = 'runOcr: PDF pipeline ran with `ocr: "force"` so text may include raster OCR output even when a text layer exists.';
225
251
  function escapeHtmlMinimal(s) {
@@ -465,19 +491,16 @@ async function runOcr(input, options) {
465
491
  warnings: ["No document bytes were provided for analysis."]
466
492
  };
467
493
  }
468
- const r = await analyzeDocx(data);
469
- return {
470
- fileKind: "docx",
471
- analyzer: "docx",
472
- status: "ok",
473
- kind: "docx",
474
- text: r.text,
475
- html: r.html,
494
+ const opt = analyzeDocxOptionsFromNode(options);
495
+ const raw = opt !== void 0 ? await analyzeDocx(data, opt) : await analyzeDocx(data);
496
+ const withNote = {
497
+ ...raw,
476
498
  warnings: [
477
- ...r.warnings,
499
+ ...raw.warnings,
478
500
  "OCR does not apply to DOCX; returned structured text/HTML extract."
479
501
  ]
480
502
  };
503
+ return docxPackageResultToAnalysisResult(withNote);
481
504
  }
482
505
  case "text":
483
506
  return analyzeText(resolved, { signal });
@@ -486,11 +509,141 @@ async function runOcr(input, options) {
486
509
  }
487
510
  }
488
511
 
512
+ // src/capabilityReport.ts
513
+ function docxIncludeRequested(flags) {
514
+ if (!flags) return false;
515
+ return !!(flags.structure || flags.headings || flags.tables || flags.blocks || flags.pagesApprox || flags.embeddedImages);
516
+ }
517
+ var DOCX_STRUCTURE_CAPABILITIES = {
518
+ ooxmlExtractorsAvailable: true,
519
+ activatedViaDocxInclude: true,
520
+ features: [
521
+ "OOXML structure (body blocks)",
522
+ "headings",
523
+ "tables",
524
+ "semantic blocks (RAG-friendly)",
525
+ "approximate pages (OOXML page-break hints)",
526
+ "embedded images (word/media; optional web/both modes)"
527
+ ],
528
+ notes: [
529
+ "Use options.docx.include on analyzeFile, extractText, convertToHtml, or runOcr to merge Mammoth output with selected extractors.",
530
+ "extractMetadata for DOCX remains a lightweight stub and does not run these extractors."
531
+ ]
532
+ };
533
+ var DOCX_EMBEDDED_IMAGE_CAPABILITIES = {
534
+ canExtractEmbeddedImages: true,
535
+ documentsMayIncludeImagesRequiringWebConversion: true,
536
+ webFriendlyConversionNodeFirst: true,
537
+ notes: [
538
+ "Use extractImagesFromDocx (re-exported from @dragon708/docmind-node) for raw ZIP media; optional mode: web | both for web-oriented bytes.",
539
+ "PNG, JPEG, GIF, WebP, SVG, TIFF, BMP, ICO are treated as browser-embeddable; EMF/WMF and similar may require an external Node pipeline.",
540
+ "@dragon708/docmind-docx does not ship a bundled EMF/WMF converter; convertDocxEmbeddedImageToWeb surfaces clear warnings until you wire ImageMagick, Sharp, or similar."
541
+ ]
542
+ };
543
+ var DOCX_META = "Structured document metadata is not exposed separately; extractMetadata returns a stub for DOCX.";
544
+ var IMAGE_META = "Raster images have no document metadata bundle; extractMetadata returns a stub.";
545
+ var IMAGE_HTML = "No layout HTML for raster images; use extractText or runOcr for text.";
546
+ var TEXT_META_NOTE = "Plain text has no structured document metadata; extractMetadata still returns decoded content.";
547
+ var UNKNOWN_KIND = "Could not determine file kind from name, MIME, or bytes; all features are reported as unsupported until the kind is known.";
548
+ function slot(id, supported, warnings) {
549
+ return warnings?.length ? { id, supported, warnings } : { id, supported };
550
+ }
551
+ function buildNodeCapabilityReport(kind) {
552
+ const runtime = { id: "node" };
553
+ let capabilities;
554
+ const topWarnings = [];
555
+ switch (kind) {
556
+ case "pdf":
557
+ capabilities = [
558
+ slot("text", true, [
559
+ "Native text via pdf-parse and (in Node extractText) PDF.js per-page text; set pdf.ocr for raster OCR."
560
+ ]),
561
+ slot("metadata", true, [
562
+ "Document info / XMP-style metadata via pdf-parse without running the OCR pipeline."
563
+ ]),
564
+ slot("pages", true, [
565
+ "Page count and per-page native extraction (PDF.js) where used; OCR respects pdf.maxPages."
566
+ ]),
567
+ slot("ocr", true, [
568
+ "Raster OCR pipeline (pdf.ocr auto with quality heuristics, force, or runOcr)."
569
+ ]),
570
+ slot("html", true, ["HTML is a <pre> preview of extracted text, not visual layout."])
571
+ ];
572
+ break;
573
+ case "docx":
574
+ capabilities = [
575
+ slot("text", true, [
576
+ "Mammoth plain text; extractText clears html. Optional OOXML extractors merge when options.docx.include is set."
577
+ ]),
578
+ slot("metadata", false, [
579
+ `${DOCX_META} OOXML structure, headings, tables, blocks, approximate pages, and embedded images are available via analyzeFile-style routes with options.docx.include.`
580
+ ]),
581
+ slot("html", true, [
582
+ "Mammoth HTML uses docxImagesAsDataUri for web-safe images; EMF/WMF and other non-web types appear as placeholders, not extracted media."
583
+ ]),
584
+ slot("ocr", false, ["OCR does not apply to DOCX in DocMind."]),
585
+ slot("pages", false, [
586
+ "No PDF-style page count; approximate DOCX pages via options.docx.include.pagesApprox (OOXML hints, not print layout)."
587
+ ])
588
+ ];
589
+ break;
590
+ case "image":
591
+ capabilities = [
592
+ slot("text", true, ["Text is obtained via OCR."]),
593
+ slot("metadata", false, [IMAGE_META]),
594
+ slot("html", false, [IMAGE_HTML]),
595
+ slot("ocr", true),
596
+ slot("pages", false)
597
+ ];
598
+ break;
599
+ case "text":
600
+ capabilities = [
601
+ slot("text", true),
602
+ slot("metadata", true, [TEXT_META_NOTE]),
603
+ slot("html", true),
604
+ slot("ocr", false, ["OCR does not apply to plain text files."]),
605
+ slot("pages", false)
606
+ ];
607
+ break;
608
+ default:
609
+ topWarnings.push(UNKNOWN_KIND);
610
+ capabilities = [
611
+ slot("text", false),
612
+ slot("metadata", false),
613
+ slot("html", false),
614
+ slot("ocr", false),
615
+ slot("pages", false)
616
+ ];
617
+ }
618
+ return {
619
+ kind,
620
+ runtime,
621
+ capabilities,
622
+ ...kind === "docx" ? {
623
+ docxEmbeddedImages: DOCX_EMBEDDED_IMAGE_CAPABILITIES,
624
+ docxStructure: DOCX_STRUCTURE_CAPABILITIES
625
+ } : {},
626
+ warnings: topWarnings.length > 0 ? topWarnings : void 0
627
+ };
628
+ }
629
+
489
630
  // src/analysisPlanReport.ts
490
631
  function lim(...items) {
491
632
  return items.filter(Boolean);
492
633
  }
493
- function buildNodeExplainReport(kind, intent, pdfOcr, plan) {
634
+ var DOCX_ZIP_MEDIA_PLAN_NOTE = "ZIP embedded images (word/media) use extractImagesFromDocx (re-exported from @dragon708/docmind-node); not merged into this intent pipeline.";
635
+ function finalizeDocxExplainReport(report) {
636
+ if (report.kind !== "docx") return report;
637
+ const limitations = report.limitations.includes(DOCX_ZIP_MEDIA_PLAN_NOTE) ? report.limitations : [...report.limitations, DOCX_ZIP_MEDIA_PLAN_NOTE];
638
+ return {
639
+ ...report,
640
+ docxEmbeddedImages: DOCX_EMBEDDED_IMAGE_CAPABILITIES,
641
+ docxStructure: DOCX_STRUCTURE_CAPABILITIES,
642
+ limitations
643
+ };
644
+ }
645
+ var DOCX_MAMMOTH_PLUS_OPTIONAL = "Mammoth extracts text and HTML from OOXML; optional parallel OOXML/ZIP extractors run when options.docx.include is set (structure, headings, tables, blocks, pagesApprox, embeddedImages).";
646
+ function buildNodeExplainReport(kind, intent, pdfOcr, plan, docxInclude) {
494
647
  const runtime = { id: "node" };
495
648
  const primaryAnalyzer = kind === "pdf" ? "pdf" : kind === "docx" ? "docx" : kind === "image" ? "image" : kind === "text" ? "text" : "none";
496
649
  let nativeExtraction;
@@ -500,7 +653,7 @@ function buildNodeExplainReport(kind, intent, pdfOcr, plan) {
500
653
  limitations = lim(
501
654
  "Could not classify the file from name, MIME, or bytes; analysis will return not_implemented until hints improve."
502
655
  );
503
- return {
656
+ return finalizeDocxExplainReport({
504
657
  kind,
505
658
  detectedKind: kind,
506
659
  runtime,
@@ -510,7 +663,7 @@ function buildNodeExplainReport(kind, intent, pdfOcr, plan) {
510
663
  ocr: { mayUse: false, description: "OCR is not used for unknown kinds." },
511
664
  limitations,
512
665
  plan
513
- };
666
+ });
514
667
  }
515
668
  switch (intent) {
516
669
  case "analyzeFile":
@@ -526,7 +679,7 @@ function buildNodeExplainReport(kind, intent, pdfOcr, plan) {
526
679
  } else if (kind === "docx") {
527
680
  nativeExtraction = {
528
681
  willAttempt: true,
529
- description: "Mammoth extracts text and HTML from OOXML."
682
+ description: docxIncludeRequested(docxInclude) ? "Mammoth plus parallel OOXML extractors (per options.docx.include)." : DOCX_MAMMOTH_PLUS_OPTIONAL
530
683
  };
531
684
  ocr3 = { mayUse: false, description: "DOCX does not use OCR in DocMind." };
532
685
  } else if (kind === "image") {
@@ -556,7 +709,7 @@ function buildNodeExplainReport(kind, intent, pdfOcr, plan) {
556
709
  } else if (kind === "docx") {
557
710
  nativeExtraction = {
558
711
  willAttempt: true,
559
- description: "Mammoth plain text; HTML cleared in the extractText response."
712
+ description: docxIncludeRequested(docxInclude) ? "Same DOCX router as analyzeFile: Mammoth text + optional OOXML fields; HTML cleared in extractText." : "Mammoth plain text; HTML cleared. Optional OOXML v2 fields when options.docx.include is set."
560
713
  };
561
714
  ocr3 = { mayUse: false, description: "DOCX does not use OCR." };
562
715
  } else if (kind === "image") {
@@ -584,7 +737,7 @@ function buildNodeExplainReport(kind, intent, pdfOcr, plan) {
584
737
  };
585
738
  ocr3 = { mayUse: false, description: "OCR not used for this metadata path." };
586
739
  limitations = lim(
587
- kind === "docx" ? "Structured DOCX metadata is not exposed separately." : "Raster images have no document metadata bundle."
740
+ kind === "docx" ? "Structured DOCX metadata is not exposed separately; use analyzeFile / extractText / convertToHtml with options.docx.include for OOXML structure." : "Raster images have no document metadata bundle."
588
741
  );
589
742
  } else {
590
743
  nativeExtraction = {
@@ -606,7 +759,7 @@ function buildNodeExplainReport(kind, intent, pdfOcr, plan) {
606
759
  } else if (kind === "docx") {
607
760
  nativeExtraction = {
608
761
  willAttempt: true,
609
- description: "Mammoth HTML output via analyzeFile routing."
762
+ description: docxIncludeRequested(docxInclude) ? "Mammoth HTML plus optional OOXML extractors (same router as analyzeFile)." : "Mammoth HTML via analyzeFile routing; optional OOXML v2 when options.docx.include is set."
610
763
  };
611
764
  ocr3 = { mayUse: false, description: "DOCX path does not use OCR." };
612
765
  } else if (kind === "text") {
@@ -641,7 +794,7 @@ function buildNodeExplainReport(kind, intent, pdfOcr, plan) {
641
794
  } else if (kind === "docx") {
642
795
  nativeExtraction = {
643
796
  willAttempt: true,
644
- description: "Full Mammoth extract (text + HTML); not OCR."
797
+ description: docxIncludeRequested(docxInclude) ? "Mammoth text/HTML plus optional OOXML extractors; still not OCR." : "Full Mammoth extract (text + HTML); optional OOXML v2 via options.docx.include; not OCR."
645
798
  };
646
799
  ocr3 = { mayUse: false, description: "DOCX is not OCR'd." };
647
800
  limitations = lim("Result is structured extract, not OCR output.");
@@ -657,7 +810,7 @@ function buildNodeExplainReport(kind, intent, pdfOcr, plan) {
657
810
  nativeExtraction = { willAttempt: false, description: "Generic intent; see plan." };
658
811
  ocr3 = { mayUse: false, description: "See plan steps." };
659
812
  }
660
- return {
813
+ return finalizeDocxExplainReport({
661
814
  kind,
662
815
  detectedKind: kind,
663
816
  runtime,
@@ -667,90 +820,14 @@ function buildNodeExplainReport(kind, intent, pdfOcr, plan) {
667
820
  ocr: ocr3,
668
821
  limitations,
669
822
  plan
670
- };
671
- }
672
-
673
- // src/capabilityReport.ts
674
- var DOCX_META = "Structured document metadata is not exposed separately; extractMetadata returns a stub for DOCX.";
675
- var IMAGE_META = "Raster images have no document metadata bundle; extractMetadata returns a stub.";
676
- var IMAGE_HTML = "No layout HTML for raster images; use extractText or runOcr for text.";
677
- var TEXT_META_NOTE = "Plain text has no structured document metadata; extractMetadata still returns decoded content.";
678
- var UNKNOWN_KIND = "Could not determine file kind from name, MIME, or bytes; all features are reported as unsupported until the kind is known.";
679
- function slot(id, supported, warnings) {
680
- return warnings?.length ? { id, supported, warnings } : { id, supported };
681
- }
682
- function buildNodeCapabilityReport(kind) {
683
- const runtime = { id: "node" };
684
- let capabilities;
685
- const topWarnings = [];
686
- switch (kind) {
687
- case "pdf":
688
- capabilities = [
689
- slot("text", true, [
690
- "Native text via pdf-parse and (in Node extractText) PDF.js per-page text; set pdf.ocr for raster OCR."
691
- ]),
692
- slot("metadata", true, [
693
- "Document info / XMP-style metadata via pdf-parse without running the OCR pipeline."
694
- ]),
695
- slot("pages", true, [
696
- "Page count and per-page native extraction (PDF.js) where used; OCR respects pdf.maxPages."
697
- ]),
698
- slot("ocr", true, [
699
- "Raster OCR pipeline (pdf.ocr auto with quality heuristics, force, or runOcr)."
700
- ]),
701
- slot("html", true, ["HTML is a <pre> preview of extracted text, not visual layout."])
702
- ];
703
- break;
704
- case "docx":
705
- capabilities = [
706
- slot("text", true),
707
- slot("metadata", false, [DOCX_META]),
708
- slot("html", true),
709
- slot("ocr", false, ["OCR does not apply to DOCX in DocMind."]),
710
- slot("pages", false)
711
- ];
712
- break;
713
- case "image":
714
- capabilities = [
715
- slot("text", true, ["Text is obtained via OCR."]),
716
- slot("metadata", false, [IMAGE_META]),
717
- slot("html", false, [IMAGE_HTML]),
718
- slot("ocr", true),
719
- slot("pages", false)
720
- ];
721
- break;
722
- case "text":
723
- capabilities = [
724
- slot("text", true),
725
- slot("metadata", true, [TEXT_META_NOTE]),
726
- slot("html", true),
727
- slot("ocr", false, ["OCR does not apply to plain text files."]),
728
- slot("pages", false)
729
- ];
730
- break;
731
- default:
732
- topWarnings.push(UNKNOWN_KIND);
733
- capabilities = [
734
- slot("text", false),
735
- slot("metadata", false),
736
- slot("html", false),
737
- slot("ocr", false),
738
- slot("pages", false)
739
- ];
740
- }
741
- return {
742
- kind,
743
- runtime,
744
- capabilities,
745
- warnings: topWarnings.length > 0 ? topWarnings : void 0
746
- };
823
+ });
747
824
  }
748
825
 
749
826
  // src/introspection.ts
750
827
  function resolvePdfOcrMode(pdf) {
751
828
  return pdf?.ocr ?? "auto";
752
829
  }
753
- function planAnalyzeFile(kind, pdfOcr) {
830
+ function planAnalyzeFile(kind, pdfOcr, docxInclude) {
754
831
  switch (kind) {
755
832
  case "pdf":
756
833
  return {
@@ -764,14 +841,17 @@ function planAnalyzeFile(kind, pdfOcr) {
764
841
  }
765
842
  ]
766
843
  };
767
- case "docx":
844
+ case "docx": {
845
+ const parallel = docxIncludeRequested(docxInclude);
768
846
  return {
769
847
  intent: "analyzeFile",
770
848
  steps: [
771
849
  { id: "detect_kind", status: "done" },
772
- { id: "docx_mammoth", status: "planned" }
850
+ { id: "docx_mammoth", status: "planned" },
851
+ ...parallel ? [{ id: "docx_ooxml_parallel", status: "planned" }] : []
773
852
  ]
774
853
  };
854
+ }
775
855
  case "image":
776
856
  return {
777
857
  intent: "analyzeFile",
@@ -798,9 +878,9 @@ function planAnalyzeFile(kind, pdfOcr) {
798
878
  };
799
879
  }
800
880
  }
801
- function planForIntent(intentOpt, kind, pdfOcrForAnalyze) {
881
+ function planForIntent(intentOpt, kind, pdfOcrForAnalyze, docxInclude) {
802
882
  const intent = intentOpt ?? "analyzeFile";
803
- if (intent === "analyzeFile") return planAnalyzeFile(kind, pdfOcrForAnalyze);
883
+ if (intent === "analyzeFile") return planAnalyzeFile(kind, pdfOcrForAnalyze, docxInclude);
804
884
  if (intent === "extractText") {
805
885
  if (kind === "pdf") {
806
886
  return {
@@ -813,7 +893,7 @@ function planForIntent(intentOpt, kind, pdfOcrForAnalyze) {
813
893
  ]
814
894
  };
815
895
  }
816
- const p = planAnalyzeFile(kind, "off");
896
+ const p = planAnalyzeFile(kind, "off", docxInclude);
817
897
  return { ...p, intent: "extractText" };
818
898
  }
819
899
  if (intent === "extractMetadata") {
@@ -845,11 +925,13 @@ function planForIntent(intentOpt, kind, pdfOcrForAnalyze) {
845
925
  }
846
926
  if (intent === "convertToHtml") {
847
927
  if (kind === "docx") {
928
+ const parallel = docxIncludeRequested(docxInclude);
848
929
  return {
849
930
  intent: "convertToHtml",
850
931
  steps: [
851
932
  { id: "detect_kind", status: "done" },
852
- { id: "docx_mammoth_html", status: "planned" }
933
+ { id: "docx_mammoth_html", status: "planned" },
934
+ ...parallel ? [{ id: "docx_ooxml_parallel", status: "planned" }] : []
853
935
  ]
854
936
  };
855
937
  }
@@ -902,11 +984,13 @@ function planForIntent(intentOpt, kind, pdfOcrForAnalyze) {
902
984
  };
903
985
  }
904
986
  if (kind === "docx") {
987
+ const parallel = docxIncludeRequested(docxInclude);
905
988
  return {
906
989
  intent: "runOcr",
907
990
  steps: [
908
991
  { id: "detect_kind", status: "done" },
909
- { id: "docx_structured_extract", status: "planned" }
992
+ { id: "docx_mammoth", status: "planned" },
993
+ ...parallel ? [{ id: "docx_ooxml_parallel", status: "planned" }] : []
910
994
  ]
911
995
  };
912
996
  }
@@ -918,7 +1002,7 @@ function planForIntent(intentOpt, kind, pdfOcrForAnalyze) {
918
1002
  ]
919
1003
  };
920
1004
  }
921
- return planAnalyzeFile(kind, pdfOcrForAnalyze);
1005
+ return planAnalyzeFile(kind, pdfOcrForAnalyze, docxInclude);
922
1006
  }
923
1007
  async function getCapabilities(input, options) {
924
1008
  throwIfAborted(options?.signal);
@@ -934,10 +1018,11 @@ async function explainAnalysisPlan(input, options) {
934
1018
  const kind = detectFileKind(resolved);
935
1019
  const intent = options?.intent ?? "analyzeFile";
936
1020
  const pdfOcrAnalyze = resolvePdfOcrMode(options?.pdf);
937
- const plan = planForIntent(intent, kind, pdfOcrAnalyze);
938
- return buildNodeExplainReport(kind, intent, pdfOcrAnalyze, plan);
1021
+ const docxInc = options?.docx?.include;
1022
+ const plan = planForIntent(intent, kind, pdfOcrAnalyze, docxInc);
1023
+ return buildNodeExplainReport(kind, intent, pdfOcrAnalyze, plan, docxInc);
939
1024
  }
940
1025
 
941
- export { analyzeFile, bufferToInput, convertToHtml, explainAnalysisPlan, extractMetadata, extractText, getCapabilities, readFileToInput, resolveNodeAnalyzeInput, runOcr };
1026
+ export { DOCX_EMBEDDED_IMAGE_CAPABILITIES, DOCX_STRUCTURE_CAPABILITIES, analyzeFile, bufferToInput, convertToHtml, docxIncludeRequested, explainAnalysisPlan, extractMetadata, extractText, getCapabilities, readFileToInput, resolveNodeAnalyzeInput, runOcr };
942
1027
  //# sourceMappingURL=index.js.map
943
1028
  //# sourceMappingURL=index.js.map
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@dragon708/docmind-node",
3
- "version": "1.4.0",
3
+ "version": "1.7.0",
4
4
  "description": "Official DocMind Node facade: analyzeFile, intent APIs, PDF/DOCX/OCR, and fs helpers.",
5
5
  "type": "module",
6
6
  "main": "./dist/index.js",
@@ -32,7 +32,7 @@
32
32
  ],
33
33
  "license": "MIT",
34
34
  "dependencies": {
35
- "@dragon708/docmind-docx": "^1.0.0",
35
+ "@dragon708/docmind-docx": "^1.7.0",
36
36
  "@dragon708/docmind-ocr": "^1.0.0",
37
37
  "@dragon708/docmind-pdf": "^2.0.0",
38
38
  "@dragon708/docmind-shared": "^1.1.0"