@dragon708/docmind-node 1.2.0 → 1.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (3)
  1. package/dist/index.d.ts +91 -24
  2. package/dist/index.js +258 -122
  3. package/package.json +3 -3
package/dist/index.d.ts CHANGED
@@ -2,6 +2,8 @@ import { DocMindAnalyzeOptions, DetectFileKindInput, NamedInput, AnalysisResult,
2
2
  export { AnalysisAnalyzer, AnalysisResult, CapabilityDescriptor, DetectFileKindInput, DocMindPublicIntent, DocxAnalysisCoreResult, ExplainAnalysisPlanOptions, ExplainAnalysisPlanResult, FileKind, FileKindMetadata, GenericAnalysisResult, GetCapabilitiesOptions, GetCapabilitiesResult, ImageAnalysisCoreResult, PdfAnalysisCoreResult, TextAnalysisResult, detectFileKind } from '@dragon708/docmind-shared';
3
3
  import { OcrOptions } from '@dragon708/docmind-ocr';
4
4
  import { PdfAnalyzeOptions } from '@dragon708/docmind-pdf';
5
+ import { AnalyzeDocxIncludeFlags, DocxToHtmlOptions } from '@dragon708/docmind-docx';
6
+ export { AnalyzeDocxIncludeFlags, DocxEmbeddedImage, DocxEmbeddedImageConversionOptions, DocxEmbeddedImageConversionResult, DocxEmbeddedImageWebSlice, DocxImageExtractionMode, ExtractImagesFromDocxOptions, ExtractImagesFromDocxResult, convertDocxEmbeddedImageToWeb, convertDocxImagesForWeb, docxImageIsBrowserRenderable, docxImageRequiresConversion, docxImageSuggestedTargetFormat, docxImageToDataUri, extractImagesFromDocx, isNodeJsRuntime } from '@dragon708/docmind-docx';
5
7
 
6
8
  /**
7
9
  * Options for Node public APIs (`analyzeFile`, intent methods).
@@ -9,10 +11,30 @@ import { PdfAnalyzeOptions } from '@dragon708/docmind-pdf';
9
11
  * - **`pdf`**: forwarded to `@dragon708/docmind-pdf`. `analyzeFile` defaults `pdf.ocr` to `"auto"` when omitted.
10
12
  * {@link extractText} / {@link convertToHtml} merge a default of `ocr: "off"` unless you set `pdf.ocr` explicitly.
11
13
  * - **`ocr`**: forwarded to `@dragon708/docmind-ocr` for raster images; language string also feeds PDF OCR when `pdf.ocrLangs` is unset.
14
+ * - **`pdfNativeTextSource`**: when `pdf.ocr` resolves to `"off"`, chooses how native text is obtained (see {@link extractText} default).
12
15
  */
16
+ /**
17
+ * DOCX options for the Node facade (Mammoth plus the optional v2 includes from `@dragon708/docmind-docx`).
18
+ */
19
+ interface NodeAnalyzeDocxOptionsSlice {
20
+ /**
21
+ * Forwarded to `analyzeDocx` → OOXML/ZIP extractors run in parallel with Mammoth (`structure`, `headings`, `tables`, `blocks`, `pagesApprox`, `embeddedImages`).
22
+ */
23
+ readonly include?: AnalyzeDocxIncludeFlags;
24
+ /** Mammoth options for HTML output (e.g. `convertImage`). */
25
+ readonly html?: DocxToHtmlOptions;
26
+ }
13
27
  interface NodeAnalyzeOptions extends DocMindAnalyzeOptions {
14
28
  readonly pdf?: PdfAnalyzeOptions;
15
29
  readonly ocr?: OcrOptions;
30
+ /** DOCX only: see {@link NodeAnalyzeDocxOptionsSlice}. */
31
+ readonly docx?: NodeAnalyzeDocxOptionsSlice;
32
+ /**
33
+ * Native PDF text when `pdf.ocr` is `"off"`:
34
+ * - **`pdfjs-per-page`** (default for {@link extractText}): PDF.js text per page, merged for `text` (aligns with OCR raster engine).
35
+ * - **`pdf-parse`**: single pdf-parse pass (default for {@link analyzeFile} when you set `pdf.ocr: "off"` without this flag).
36
+ */
37
+ readonly pdfNativeTextSource?: "pdf-parse" | "pdfjs-per-page";
16
38
  }
17
39
 
18
40
  /**
@@ -38,8 +60,9 @@ declare function resolveNodeAnalyzeInput(input: NodeAnalyzeInput): Promise<Detec
38
60
  declare function analyzeFile(input: NodeAnalyzeInput, options?: NodeAnalyzeOptions): Promise<AnalysisResult>;
39
61
 
40
62
  /**
41
- * Plain-text extraction using {@link analyzeFile} routing. PDFs default to **text layer only**
42
- * (`pdf.ocr: "off"`) unless you set `options.pdf.ocr` explicitly.
63
+ * Plain-text extraction using {@link analyzeFile} routing. PDFs default to **native text only**
64
+ * (`pdf.ocr: "off"`) unless you set `options.pdf.ocr` explicitly, and to **PDF.js per-page** assembly
65
+ * (`pdfNativeTextSource: "pdfjs-per-page"`) unless you set `options.pdfNativeTextSource` or `pdf.ocr` enables OCR.
43
66
  */
44
67
  declare function extractText(input: NodeAnalyzeInput, options?: NodeAnalyzeOptions): Promise<AnalysisResult>;
45
68
  /**
@@ -58,6 +81,62 @@ declare function convertToHtml(input: NodeAnalyzeInput, options?: NodeAnalyzeOpt
58
81
  */
59
82
  declare function runOcr(input: NodeAnalyzeInput, options?: NodeAnalyzeOptions): Promise<AnalysisResult>;
60
83
 
84
+ /** High-level features the user can ask DocMind for (per input kind and runtime). */
85
+ type PublicCapabilityId = "text" | "metadata" | "html" | "ocr" | "pages";
86
+ /**
87
+ * DOCX-only: what the stack can do with OOXML embedded bitmaps/vector payloads under `word/media`.
88
+ * Present on {@link GetCapabilitiesReport} when `kind === "docx"`.
89
+ */
90
+ interface DocxEmbeddedImageCapabilities {
91
+ /** Bytes under `word/media/*` can be read (see `@dragon708/docmind-docx` / facade {@link extractImagesFromDocx}). */
92
+ readonly canExtractEmbeddedImages: true;
93
+ /**
94
+ * DOCX files may contain EMF, WMF, HEIC, etc., which are not reliably usable in a browser `<img>` without conversion.
95
+ * This flag is static (kind-based); it does not inspect the open document.
96
+ */
97
+ readonly documentsMayIncludeImagesRequiringWebConversion: true;
98
+ /**
99
+ * In-browser conversion for those formats is **not** provided by DocMind; Node helpers may attempt best-effort conversion
100
+ * (currently stub — see package warnings).
101
+ */
102
+ readonly webFriendlyConversionNodeFirst: true;
103
+ readonly notes: readonly string[];
104
+ }
105
+ /** Shared slice for {@link GetCapabilitiesReport} and {@link ExplainAnalysisPlanReport}. */
106
+ /** True when `options.docx.include` requests at least one OOXML v2 extractor. */
107
+ declare function docxIncludeRequested(flags?: AnalyzeDocxIncludeFlags): boolean;
108
+ /**
109
+ * DOCX v2 structural capabilities in Node (via `@dragon708/docmind-docx` + `options.docx.include`).
110
+ * Present on {@link GetCapabilitiesReport} when `kind === "docx"`.
111
+ */
112
+ interface DocxStructuralCapabilities {
113
+ readonly ooxmlExtractorsAvailable: true;
114
+ readonly activatedViaDocxInclude: true;
115
+ readonly features: readonly string[];
116
+ readonly notes: readonly string[];
117
+ }
118
+ declare const DOCX_STRUCTURE_CAPABILITIES: DocxStructuralCapabilities;
119
+ declare const DOCX_EMBEDDED_IMAGE_CAPABILITIES: DocxEmbeddedImageCapabilities;
120
+ /** Whether a {@link PublicCapabilityId} applies to the detected file in this runtime. */
121
+ interface PublicCapabilitySupport {
122
+ readonly id: PublicCapabilityId;
123
+ readonly supported: boolean;
124
+ readonly warnings?: readonly string[];
125
+ }
126
+ /**
127
+ * Result of {@link getCapabilities}: detected kind, runtime id, per-feature support for this input, and optional global warnings.
128
+ */
129
+ interface GetCapabilitiesReport {
130
+ readonly kind: FileKind;
131
+ readonly runtime: RuntimeDescriptor;
132
+ readonly capabilities: readonly PublicCapabilitySupport[];
133
+ /** Only when {@link GetCapabilitiesReport.kind} is `"docx"`. */
134
+ readonly docxEmbeddedImages?: DocxEmbeddedImageCapabilities;
135
+ /** Only when `kind === "docx"`: OOXML v2 extractors available via `options.docx.include`. */
136
+ readonly docxStructure?: DocxStructuralCapabilities;
137
+ readonly warnings?: readonly string[];
138
+ }
139
+
61
140
  /**
62
141
  * Whether DocMind will try a non-OCR text/HTML path (e.g. Mammoth, pdf-parse text layer, UTF-8).
63
142
  */
@@ -83,33 +162,21 @@ interface ExplainAnalysisPlanReport {
83
162
  readonly ocr: OcrPlan;
84
163
  readonly limitations: readonly string[];
85
164
  readonly plan: ProcessingPlanDescriptor;
165
+ /** Only when `kind === "docx"` (same payload as `getCapabilities` → `docxEmbeddedImages`). */
166
+ readonly docxEmbeddedImages?: DocxEmbeddedImageCapabilities;
167
+ /** Only when `kind === "docx"` (same payload as `getCapabilities` → `docxStructure`). */
168
+ readonly docxStructure?: DocxStructuralCapabilities;
86
169
  readonly warnings?: readonly string[];
87
170
  }
88
171
 
89
- /** High-level features the user can ask DocMind for (per input kind and runtime). */
90
- type PublicCapabilityId = "text" | "metadata" | "html" | "ocr" | "pages";
91
- /** Whether a {@link PublicCapabilityId} applies to the detected file in this runtime. */
92
- interface PublicCapabilitySupport {
93
- readonly id: PublicCapabilityId;
94
- readonly supported: boolean;
95
- readonly warnings?: readonly string[];
96
- }
97
- /**
98
- * Result of {@link getCapabilities}: detected kind, runtime id, per-feature support for this input, and optional global warnings.
99
- */
100
- interface GetCapabilitiesReport {
101
- readonly kind: FileKind;
102
- readonly runtime: RuntimeDescriptor;
103
- readonly capabilities: readonly PublicCapabilitySupport[];
104
- readonly warnings?: readonly string[];
105
- }
106
-
107
- /** Options for {@link explainAnalysisPlan} including PDF/OCR hints for accurate planning. */
108
- type NodeExplainAnalysisPlanOptions = ExplainAnalysisPlanOptions & Pick<NodeAnalyzeOptions, "pdf" | "ocr">;
172
+ /** Options for {@link explainAnalysisPlan} including PDF/OCR/DOCX hints for accurate planning. */
173
+ type NodeExplainAnalysisPlanOptions = ExplainAnalysisPlanOptions & Pick<NodeAnalyzeOptions, "pdf" | "ocr" | "docx">;
109
174
 
110
175
  /**
111
176
  * Epic 1 — **Capabilities:** after resolving {@link NodeAnalyzeInput}, lists which of
112
- * `text` | `metadata` | `html` | `ocr` | `pages` apply for that kind in Node (PDF fully supported).
177
+ * `text` | `metadata` | `html` | `ocr` | `pages` apply for that kind in Node (for PDF, `text` / `metadata` /
178
+ * `pages` / `ocr` describe the v2 pdf-parse + PDF.js + OCR stack; see {@link buildNodeCapabilityReport}).
179
+ * For **DOCX**, `docxEmbeddedImages` and `docxStructure` describe ZIP media and optional OOXML v2 extractors (`options.docx.include`).
113
180
  * Does not run Mammoth/Tesseract/PDF bodies beyond path resolution.
114
181
  */
115
182
  declare function getCapabilities(input: NodeAnalyzeInput, options?: GetCapabilitiesOptions): Promise<GetCapabilitiesReport>;
@@ -119,4 +186,4 @@ declare function getCapabilities(input: NodeAnalyzeInput, options?: GetCapabilit
119
186
  */
120
187
  declare function explainAnalysisPlan(input: NodeAnalyzeInput, options?: NodeExplainAnalysisPlanOptions): Promise<ExplainAnalysisPlanReport>;
121
188
 
122
- export { type ExplainAnalysisPlanReport, type GetCapabilitiesReport, type NativeExtractionPlan, type NodeAnalyzeInput, type NodeAnalyzeOptions, type NodeExplainAnalysisPlanOptions, type OcrPlan, type PublicCapabilityId, type PublicCapabilitySupport, analyzeFile, bufferToInput, convertToHtml, explainAnalysisPlan, extractMetadata, extractText, getCapabilities, readFileToInput, resolveNodeAnalyzeInput, runOcr };
189
+ export { DOCX_EMBEDDED_IMAGE_CAPABILITIES, DOCX_STRUCTURE_CAPABILITIES, type DocxEmbeddedImageCapabilities, type DocxStructuralCapabilities, type ExplainAnalysisPlanReport, type GetCapabilitiesReport, type NativeExtractionPlan, type NodeAnalyzeDocxOptionsSlice, type NodeAnalyzeInput, type NodeAnalyzeOptions, type NodeExplainAnalysisPlanOptions, type OcrPlan, type PublicCapabilityId, type PublicCapabilitySupport, analyzeFile, bufferToInput, convertToHtml, docxIncludeRequested, explainAnalysisPlan, extractMetadata, extractText, getCapabilities, readFileToInput, resolveNodeAnalyzeInput, runOcr };
package/dist/index.js CHANGED
@@ -1,13 +1,45 @@
1
1
  import { assertValidAnalyzeFileInput, detectFileKind, notImplementedResult, UNKNOWN_FORMAT_WARNING, analyzeText, toUint8Array, isNamedInput, isBinaryInput, isBlob, isFile } from '@dragon708/docmind-shared';
2
2
  export { detectFileKind } from '@dragon708/docmind-shared';
3
3
  import { analyzeDocx } from '@dragon708/docmind-docx';
4
+ export { convertDocxEmbeddedImageToWeb, convertDocxImagesForWeb, docxImageIsBrowserRenderable, docxImageRequiresConversion, docxImageSuggestedTargetFormat, docxImageToDataUri, extractImagesFromDocx, isNodeJsRuntime } from '@dragon708/docmind-docx';
4
5
  import { ocr } from '@dragon708/docmind-ocr';
5
- import { extractPdfMetadata, extractTextFromPdf, analyzePdf } from '@dragon708/docmind-pdf';
6
+ import { extractPdfMetadata, extractTextFromPdf, analyzePdf, extractPdfTextByPage } from '@dragon708/docmind-pdf';
6
7
  import { readFile } from 'fs/promises';
7
8
  import { basename } from 'path';
8
9
  import { fileURLToPath } from 'url';
9
10
 
10
11
  // src/analyze.ts
12
+
13
+ // src/docxNodeMapper.ts
14
+ function analyzeDocxOptionsFromNode(options) {
15
+ const sig = options?.signal;
16
+ const dx = options?.docx;
17
+ if (!dx?.include && !dx?.html && !sig) return void 0;
18
+ const out = { ...dx?.html ?? {} };
19
+ if (dx?.include) out.include = dx.include;
20
+ if (sig) out.signal = sig;
21
+ return out;
22
+ }
23
+ function docxPackageResultToAnalysisResult(r) {
24
+ const base = {
25
+ fileKind: "docx",
26
+ analyzer: "docx",
27
+ status: "ok",
28
+ kind: "docx",
29
+ text: r.text,
30
+ html: r.html,
31
+ warnings: [...r.warnings]
32
+ };
33
+ const v2 = {
34
+ ...r.structure !== void 0 ? { structure: r.structure } : {},
35
+ ...r.headings !== void 0 ? { headings: r.headings } : {},
36
+ ...r.tables !== void 0 ? { tables: r.tables } : {},
37
+ ...r.blocks !== void 0 ? { blocks: r.blocks } : {},
38
+ ...r.pagesApprox !== void 0 ? { pagesApprox: r.pagesApprox } : {},
39
+ ...r.embeddedImages !== void 0 ? { embeddedImages: r.embeddedImages } : {}
40
+ };
41
+ return { ...base, ...v2 };
42
+ }
11
43
  function isByteBackedInput(input) {
12
44
  return isNamedInput(input) || isBinaryInput(input) || isBlob(input) || isFile(input);
13
45
  }
@@ -19,7 +51,8 @@ async function bytesFromDetectInput(input) {
19
51
  }
20
52
 
21
53
  // src/analyzers/docx.ts
22
- async function analyzeDocxForNode(input, signal) {
54
+ async function analyzeDocxForNode(input, options) {
55
+ const signal = options?.signal;
23
56
  if (signal?.aborted) {
24
57
  const err = new Error("The operation was aborted");
25
58
  err.name = "AbortError";
@@ -37,16 +70,9 @@ async function analyzeDocxForNode(input, signal) {
37
70
  warnings: ["No document bytes were provided for analysis."]
38
71
  };
39
72
  }
40
- const r = await analyzeDocx(data);
41
- return {
42
- fileKind: "docx",
43
- analyzer: "docx",
44
- status: "ok",
45
- kind: "docx",
46
- text: r.text,
47
- html: r.html,
48
- warnings: [...r.warnings]
49
- };
73
+ const docxOpts = analyzeDocxOptionsFromNode(options);
74
+ const r = docxOpts !== void 0 ? await analyzeDocx(data, docxOpts) : await analyzeDocx(data);
75
+ return docxPackageResultToAnalysisResult(r);
50
76
  }
51
77
  async function analyzeImageForNode(input, options) {
52
78
  if (options?.signal?.aborted) {
@@ -112,17 +138,45 @@ async function analyzePdfForNode(input, options) {
112
138
  signal: userPdf?.signal ?? options?.signal
113
139
  };
114
140
  const r = await analyzePdf(data, pdfOpts);
141
+ const usePdfJsPerPage = pdfOpts.ocr === "off" && (options?.pdfNativeTextSource ?? "pdf-parse") === "pdfjs-per-page";
142
+ if (!usePdfJsPerPage) {
143
+ return {
144
+ fileKind: "pdf",
145
+ analyzer: "pdf",
146
+ status: "ok",
147
+ kind: "pdf",
148
+ text: r.text,
149
+ pages: r.pages,
150
+ metadata: r.metadata,
151
+ warnings: [...r.warnings],
152
+ needsOCR: r.needsOCR,
153
+ ocrUsed: r.ocrUsed
154
+ };
155
+ }
156
+ let text = r.text;
157
+ const extra = [];
158
+ try {
159
+ const rows = await extractPdfTextByPage(data, {
160
+ maxPages: pdfOpts.maxPages,
161
+ signal: pdfOpts.signal
162
+ });
163
+ text = rows.map((row) => row.text).join("\n\n");
164
+ } catch (e) {
165
+ const msg = e instanceof Error ? e.message : String(e);
166
+ extra.push(`warning: PDF.js per-page text failed; using pdf-parse text: ${msg}`);
167
+ }
168
+ const needsOCR = r.pages > 0 && text.trim().length === 0;
115
169
  return {
116
170
  fileKind: "pdf",
117
171
  analyzer: "pdf",
118
172
  status: "ok",
119
173
  kind: "pdf",
120
- text: r.text,
174
+ text,
121
175
  pages: r.pages,
122
176
  metadata: r.metadata,
123
- warnings: [...r.warnings],
124
- needsOCR: r.needsOCR,
125
- ocrUsed: r.ocrUsed
177
+ warnings: [...r.warnings, ...extra],
178
+ needsOCR,
179
+ ocrUsed: false
126
180
  };
127
181
  }
128
182
  function toPathString(pathOrUrl) {
@@ -160,7 +214,7 @@ async function analyzeFile(input, options) {
160
214
  case "pdf":
161
215
  return analyzePdfForNode(resolved, options);
162
216
  case "docx":
163
- return analyzeDocxForNode(resolved, options?.signal);
217
+ return analyzeDocxForNode(resolved, options);
164
218
  case "image":
165
219
  return analyzeImageForNode(resolved, options);
166
220
  case "text":
@@ -191,7 +245,7 @@ function throwIfAborted(signal) {
191
245
  }
192
246
 
193
247
  // src/publicActions.ts
194
- var DOCX_METADATA_STUB = "Structured document metadata for DOCX is not exposed as a separate API; use extractText or analyzeFile.";
248
+ var DOCX_METADATA_STUB = "Structured document metadata for DOCX is not returned by extractMetadata; use analyzeFile, extractText, or convertToHtml with options.docx.include for OOXML structure, headings, tables, blocks, approximate pages, and embedded images.";
195
249
  var IMAGE_METADATA_NOTE = "Raster images have no document metadata bundle in this API.";
196
250
  var RUN_OCR_PDF_FORCE_SEMANTICS = 'runOcr: PDF pipeline ran with `ocr: "force"` so text may include raster OCR output even when a text layer exists.';
197
251
  function escapeHtmlMinimal(s) {
@@ -211,7 +265,11 @@ function toExtractTextResult(full) {
211
265
  }
212
266
  async function extractText(input, options) {
213
267
  throwIfAborted(options?.signal);
214
- const full = await analyzeFile(input, withPdfOcrDefaultOff(options));
268
+ const merged = {
269
+ ...withPdfOcrDefaultOff(options),
270
+ pdfNativeTextSource: options?.pdfNativeTextSource ?? "pdfjs-per-page"
271
+ };
272
+ const full = await analyzeFile(input, merged);
215
273
  return toExtractTextResult(full);
216
274
  }
217
275
  async function extractMetadata(input, options) {
@@ -433,19 +491,16 @@ async function runOcr(input, options) {
433
491
  warnings: ["No document bytes were provided for analysis."]
434
492
  };
435
493
  }
436
- const r = await analyzeDocx(data);
437
- return {
438
- fileKind: "docx",
439
- analyzer: "docx",
440
- status: "ok",
441
- kind: "docx",
442
- text: r.text,
443
- html: r.html,
494
+ const opt = analyzeDocxOptionsFromNode(options);
495
+ const raw = opt !== void 0 ? await analyzeDocx(data, opt) : await analyzeDocx(data);
496
+ const withNote = {
497
+ ...raw,
444
498
  warnings: [
445
- ...r.warnings,
499
+ ...raw.warnings,
446
500
  "OCR does not apply to DOCX; returned structured text/HTML extract."
447
501
  ]
448
502
  };
503
+ return docxPackageResultToAnalysisResult(withNote);
449
504
  }
450
505
  case "text":
451
506
  return analyzeText(resolved, { signal });
@@ -454,11 +509,141 @@ async function runOcr(input, options) {
454
509
  }
455
510
  }
456
511
 
512
+ // src/capabilityReport.ts
513
+ function docxIncludeRequested(flags) {
514
+ if (!flags) return false;
515
+ return !!(flags.structure || flags.headings || flags.tables || flags.blocks || flags.pagesApprox || flags.embeddedImages);
516
+ }
517
+ var DOCX_STRUCTURE_CAPABILITIES = {
518
+ ooxmlExtractorsAvailable: true,
519
+ activatedViaDocxInclude: true,
520
+ features: [
521
+ "OOXML structure (body blocks)",
522
+ "headings",
523
+ "tables",
524
+ "semantic blocks (RAG-friendly)",
525
+ "approximate pages (OOXML page-break hints)",
526
+ "embedded images (word/media; optional web/both modes)"
527
+ ],
528
+ notes: [
529
+ "Use options.docx.include on analyzeFile, extractText, convertToHtml, or runOcr to merge Mammoth output with selected extractors.",
530
+ "extractMetadata for DOCX remains a lightweight stub and does not run these extractors."
531
+ ]
532
+ };
533
+ var DOCX_EMBEDDED_IMAGE_CAPABILITIES = {
534
+ canExtractEmbeddedImages: true,
535
+ documentsMayIncludeImagesRequiringWebConversion: true,
536
+ webFriendlyConversionNodeFirst: true,
537
+ notes: [
538
+ "Use extractImagesFromDocx (re-exported from @dragon708/docmind-node) for raw ZIP media; optional mode: web | both for web-oriented bytes.",
539
+ "PNG, JPEG, GIF, WebP, SVG, TIFF, BMP, ICO are treated as browser-embeddable; EMF/WMF and similar may require an external Node pipeline.",
540
+ "@dragon708/docmind-docx does not ship a bundled EMF/WMF converter; convertDocxEmbeddedImageToWeb surfaces clear warnings until you wire ImageMagick, Sharp, or similar."
541
+ ]
542
+ };
543
+ var DOCX_META = "Structured document metadata is not exposed separately; extractMetadata returns a stub for DOCX.";
544
+ var IMAGE_META = "Raster images have no document metadata bundle; extractMetadata returns a stub.";
545
+ var IMAGE_HTML = "No layout HTML for raster images; use extractText or runOcr for text.";
546
+ var TEXT_META_NOTE = "Plain text has no structured document metadata; extractMetadata still returns decoded content.";
547
+ var UNKNOWN_KIND = "Could not determine file kind from name, MIME, or bytes; all features are reported as unsupported until the kind is known.";
548
+ function slot(id, supported, warnings) {
549
+ return warnings?.length ? { id, supported, warnings } : { id, supported };
550
+ }
551
+ function buildNodeCapabilityReport(kind) {
552
+ const runtime = { id: "node" };
553
+ let capabilities;
554
+ const topWarnings = [];
555
+ switch (kind) {
556
+ case "pdf":
557
+ capabilities = [
558
+ slot("text", true, [
559
+ "Native text via pdf-parse and (in Node extractText) PDF.js per-page text; set pdf.ocr for raster OCR."
560
+ ]),
561
+ slot("metadata", true, [
562
+ "Document info / XMP-style metadata via pdf-parse without running the OCR pipeline."
563
+ ]),
564
+ slot("pages", true, [
565
+ "Page count and per-page native extraction (PDF.js) where used; OCR respects pdf.maxPages."
566
+ ]),
567
+ slot("ocr", true, [
568
+ "Raster OCR pipeline (pdf.ocr auto with quality heuristics, force, or runOcr)."
569
+ ]),
570
+ slot("html", true, ["HTML is a <pre> preview of extracted text, not visual layout."])
571
+ ];
572
+ break;
573
+ case "docx":
574
+ capabilities = [
575
+ slot("text", true, [
576
+ "Mammoth plain text; extractText clears html. Optional OOXML extractors merge when options.docx.include is set."
577
+ ]),
578
+ slot("metadata", false, [
579
+ `${DOCX_META} OOXML structure, headings, tables, blocks, approximate pages, and embedded images are available via analyzeFile-style routes with options.docx.include.`
580
+ ]),
581
+ slot("html", true, [
582
+ "Mammoth HTML uses docxImagesAsDataUri for web-safe images; EMF/WMF and other non-web types appear as placeholders, not extracted media."
583
+ ]),
584
+ slot("ocr", false, ["OCR does not apply to DOCX in DocMind."]),
585
+ slot("pages", false, [
586
+ "No PDF-style page count; approximate DOCX pages via options.docx.include.pagesApprox (OOXML hints, not print layout)."
587
+ ])
588
+ ];
589
+ break;
590
+ case "image":
591
+ capabilities = [
592
+ slot("text", true, ["Text is obtained via OCR."]),
593
+ slot("metadata", false, [IMAGE_META]),
594
+ slot("html", false, [IMAGE_HTML]),
595
+ slot("ocr", true),
596
+ slot("pages", false)
597
+ ];
598
+ break;
599
+ case "text":
600
+ capabilities = [
601
+ slot("text", true),
602
+ slot("metadata", true, [TEXT_META_NOTE]),
603
+ slot("html", true),
604
+ slot("ocr", false, ["OCR does not apply to plain text files."]),
605
+ slot("pages", false)
606
+ ];
607
+ break;
608
+ default:
609
+ topWarnings.push(UNKNOWN_KIND);
610
+ capabilities = [
611
+ slot("text", false),
612
+ slot("metadata", false),
613
+ slot("html", false),
614
+ slot("ocr", false),
615
+ slot("pages", false)
616
+ ];
617
+ }
618
+ return {
619
+ kind,
620
+ runtime,
621
+ capabilities,
622
+ ...kind === "docx" ? {
623
+ docxEmbeddedImages: DOCX_EMBEDDED_IMAGE_CAPABILITIES,
624
+ docxStructure: DOCX_STRUCTURE_CAPABILITIES
625
+ } : {},
626
+ warnings: topWarnings.length > 0 ? topWarnings : void 0
627
+ };
628
+ }
629
+
457
630
  // src/analysisPlanReport.ts
458
631
  function lim(...items) {
459
632
  return items.filter(Boolean);
460
633
  }
461
- function buildNodeExplainReport(kind, intent, pdfOcr, plan) {
634
+ var DOCX_ZIP_MEDIA_PLAN_NOTE = "ZIP embedded images (word/media) use extractImagesFromDocx (re-exported from @dragon708/docmind-node); not merged into this intent pipeline.";
635
+ function finalizeDocxExplainReport(report) {
636
+ if (report.kind !== "docx") return report;
637
+ const limitations = report.limitations.includes(DOCX_ZIP_MEDIA_PLAN_NOTE) ? report.limitations : [...report.limitations, DOCX_ZIP_MEDIA_PLAN_NOTE];
638
+ return {
639
+ ...report,
640
+ docxEmbeddedImages: DOCX_EMBEDDED_IMAGE_CAPABILITIES,
641
+ docxStructure: DOCX_STRUCTURE_CAPABILITIES,
642
+ limitations
643
+ };
644
+ }
645
+ var DOCX_MAMMOTH_PLUS_OPTIONAL = "Mammoth extracts text and HTML from OOXML; optional parallel OOXML/ZIP extractors run when options.docx.include is set (structure, headings, tables, blocks, pagesApprox, embeddedImages).";
646
+ function buildNodeExplainReport(kind, intent, pdfOcr, plan, docxInclude) {
462
647
  const runtime = { id: "node" };
463
648
  const primaryAnalyzer = kind === "pdf" ? "pdf" : kind === "docx" ? "docx" : kind === "image" ? "image" : kind === "text" ? "text" : "none";
464
649
  let nativeExtraction;
@@ -468,7 +653,7 @@ function buildNodeExplainReport(kind, intent, pdfOcr, plan) {
468
653
  limitations = lim(
469
654
  "Could not classify the file from name, MIME, or bytes; analysis will return not_implemented until hints improve."
470
655
  );
471
- return {
656
+ return finalizeDocxExplainReport({
472
657
  kind,
473
658
  detectedKind: kind,
474
659
  runtime,
@@ -478,23 +663,23 @@ function buildNodeExplainReport(kind, intent, pdfOcr, plan) {
478
663
  ocr: { mayUse: false, description: "OCR is not used for unknown kinds." },
479
664
  limitations,
480
665
  plan
481
- };
666
+ });
482
667
  }
483
668
  switch (intent) {
484
669
  case "analyzeFile":
485
670
  if (kind === "pdf") {
486
671
  nativeExtraction = {
487
672
  willAttempt: true,
488
- description: "pdf-parse extracts embedded text and page count first."
673
+ description: "pdf-parse supplies embedded text, metadata, and page count; PDF.js drives raster OCR when enabled."
489
674
  };
490
675
  ocr3 = {
491
676
  mayUse: pdfOcr !== "off",
492
- description: pdfOcr === "off" ? "Raster OCR pipeline is off (pdf.ocr: off)." : pdfOcr === "force" ? "Raster OCR may run on all pages when pdf.ocr is force." : "Raster OCR may run when the text layer is empty (pdf.ocr: auto)."
677
+ description: pdfOcr === "off" ? "Raster OCR pipeline is off (pdf.ocr: off)." : pdfOcr === "force" ? "Raster OCR may run on all pages when pdf.ocr is force." : "Raster OCR may run when native text looks insufficient (pdf.ocr: auto + heuristics)."
493
678
  };
494
679
  } else if (kind === "docx") {
495
680
  nativeExtraction = {
496
681
  willAttempt: true,
497
- description: "Mammoth extracts text and HTML from OOXML."
682
+ description: docxIncludeRequested(docxInclude) ? "Mammoth plus parallel OOXML extractors (per options.docx.include)." : DOCX_MAMMOTH_PLUS_OPTIONAL
498
683
  };
499
684
  ocr3 = { mayUse: false, description: "DOCX does not use OCR in DocMind." };
500
685
  } else if (kind === "image") {
@@ -515,16 +700,16 @@ function buildNodeExplainReport(kind, intent, pdfOcr, plan) {
515
700
  if (kind === "pdf") {
516
701
  nativeExtraction = {
517
702
  willAttempt: true,
518
- description: "Text layer via pdf-parse; defaults to pdf.ocr off unless you override."
703
+ description: "Node: pdf-parse for metadata/page baseline, then PDF.js per-page text merged into `text` (pdfNativeTextSource pdfjs-per-page default)."
519
704
  };
520
705
  ocr3 = {
521
706
  mayUse: false,
522
- description: "extractText merges pdf.ocr default off \u2014 no raster OCR unless you set pdf.ocr explicitly."
707
+ description: "extractText defaults pdf.ocr off; set pdf.ocr explicitly to allow auto/force raster OCR."
523
708
  };
524
709
  } else if (kind === "docx") {
525
710
  nativeExtraction = {
526
711
  willAttempt: true,
527
- description: "Mammoth plain text; HTML cleared in the extractText response."
712
+ description: docxIncludeRequested(docxInclude) ? "Same DOCX router as analyzeFile: Mammoth text + optional OOXML fields; HTML cleared in extractText." : "Mammoth plain text; HTML cleared. Optional OOXML v2 fields when options.docx.include is set."
528
713
  };
529
714
  ocr3 = { mayUse: false, description: "DOCX does not use OCR." };
530
715
  } else if (kind === "image") {
@@ -552,7 +737,7 @@ function buildNodeExplainReport(kind, intent, pdfOcr, plan) {
552
737
  };
553
738
  ocr3 = { mayUse: false, description: "OCR not used for this metadata path." };
554
739
  limitations = lim(
555
- kind === "docx" ? "Structured DOCX metadata is not exposed separately." : "Raster images have no document metadata bundle."
740
+ kind === "docx" ? "Structured DOCX metadata is not exposed separately; use analyzeFile / extractText / convertToHtml with options.docx.include for OOXML structure." : "Raster images have no document metadata bundle."
556
741
  );
557
742
  } else {
558
743
  nativeExtraction = {
@@ -574,7 +759,7 @@ function buildNodeExplainReport(kind, intent, pdfOcr, plan) {
574
759
  } else if (kind === "docx") {
575
760
  nativeExtraction = {
576
761
  willAttempt: true,
577
- description: "Mammoth HTML output via analyzeFile routing."
762
+ description: docxIncludeRequested(docxInclude) ? "Mammoth HTML plus optional OOXML extractors (same router as analyzeFile)." : "Mammoth HTML via analyzeFile routing; optional OOXML v2 when options.docx.include is set."
578
763
  };
579
764
  ocr3 = { mayUse: false, description: "DOCX path does not use OCR." };
580
765
  } else if (kind === "text") {
@@ -609,7 +794,7 @@ function buildNodeExplainReport(kind, intent, pdfOcr, plan) {
609
794
  } else if (kind === "docx") {
610
795
  nativeExtraction = {
611
796
  willAttempt: true,
612
- description: "Full Mammoth extract (text + HTML); not OCR."
797
+ description: docxIncludeRequested(docxInclude) ? "Mammoth text/HTML plus optional OOXML extractors; still not OCR." : "Full Mammoth extract (text + HTML); optional OOXML v2 via options.docx.include; not OCR."
613
798
  };
614
799
  ocr3 = { mayUse: false, description: "DOCX is not OCR'd." };
615
800
  limitations = lim("Result is structured extract, not OCR output.");
@@ -625,7 +810,7 @@ function buildNodeExplainReport(kind, intent, pdfOcr, plan) {
625
810
  nativeExtraction = { willAttempt: false, description: "Generic intent; see plan." };
626
811
  ocr3 = { mayUse: false, description: "See plan steps." };
627
812
  }
628
- return {
813
+ return finalizeDocxExplainReport({
629
814
  kind,
630
815
  detectedKind: kind,
631
816
  runtime,
@@ -635,82 +820,14 @@ function buildNodeExplainReport(kind, intent, pdfOcr, plan) {
635
820
  ocr: ocr3,
636
821
  limitations,
637
822
  plan
638
- };
639
- }
640
-
641
- // src/capabilityReport.ts
642
- var DOCX_META = "Structured document metadata is not exposed separately; extractMetadata returns a stub for DOCX.";
643
- var IMAGE_META = "Raster images have no document metadata bundle; extractMetadata returns a stub.";
644
- var IMAGE_HTML = "No layout HTML for raster images; use extractText or runOcr for text.";
645
- var TEXT_META_NOTE = "Plain text has no structured document metadata; extractMetadata still returns decoded content.";
646
- var UNKNOWN_KIND = "Could not determine file kind from name, MIME, or bytes; all features are reported as unsupported until the kind is known.";
647
- function slot(id, supported, warnings) {
648
- return warnings?.length ? { id, supported, warnings } : { id, supported };
649
- }
650
- function buildNodeCapabilityReport(kind) {
651
- const runtime = { id: "node" };
652
- let capabilities;
653
- const topWarnings = [];
654
- switch (kind) {
655
- case "pdf":
656
- capabilities = [
657
- slot("text", true, ["Includes text layer extraction; use extractText options to avoid PDF OCR."]),
658
- slot("metadata", true),
659
- slot("html", true, ["HTML is a <pre> preview of extracted text, not visual layout."]),
660
- slot("ocr", true, ["Raster OCR is available (e.g. analyzeFile with pdf.ocr auto/force, or runOcr)."]),
661
- slot("pages", true)
662
- ];
663
- break;
664
- case "docx":
665
- capabilities = [
666
- slot("text", true),
667
- slot("metadata", false, [DOCX_META]),
668
- slot("html", true),
669
- slot("ocr", false, ["OCR does not apply to DOCX in DocMind."]),
670
- slot("pages", false)
671
- ];
672
- break;
673
- case "image":
674
- capabilities = [
675
- slot("text", true, ["Text is obtained via OCR."]),
676
- slot("metadata", false, [IMAGE_META]),
677
- slot("html", false, [IMAGE_HTML]),
678
- slot("ocr", true),
679
- slot("pages", false)
680
- ];
681
- break;
682
- case "text":
683
- capabilities = [
684
- slot("text", true),
685
- slot("metadata", true, [TEXT_META_NOTE]),
686
- slot("html", true),
687
- slot("ocr", false, ["OCR does not apply to plain text files."]),
688
- slot("pages", false)
689
- ];
690
- break;
691
- default:
692
- topWarnings.push(UNKNOWN_KIND);
693
- capabilities = [
694
- slot("text", false),
695
- slot("metadata", false),
696
- slot("html", false),
697
- slot("ocr", false),
698
- slot("pages", false)
699
- ];
700
- }
701
- return {
702
- kind,
703
- runtime,
704
- capabilities,
705
- warnings: topWarnings.length > 0 ? topWarnings : void 0
706
- };
823
+ });
707
824
  }
708
825
 
709
826
  // src/introspection.ts
710
827
  function resolvePdfOcrMode(pdf) {
711
828
  return pdf?.ocr ?? "auto";
712
829
  }
713
- function planAnalyzeFile(kind, pdfOcr) {
830
+ function planAnalyzeFile(kind, pdfOcr, docxInclude) {
714
831
  switch (kind) {
715
832
  case "pdf":
716
833
  return {
@@ -724,14 +841,17 @@ function planAnalyzeFile(kind, pdfOcr) {
724
841
  }
725
842
  ]
726
843
  };
727
- case "docx":
844
+ case "docx": {
845
+ const parallel = docxIncludeRequested(docxInclude);
728
846
  return {
729
847
  intent: "analyzeFile",
730
848
  steps: [
731
849
  { id: "detect_kind", status: "done" },
732
- { id: "docx_mammoth", status: "planned" }
850
+ { id: "docx_mammoth", status: "planned" },
851
+ ...parallel ? [{ id: "docx_ooxml_parallel", status: "planned" }] : []
733
852
  ]
734
853
  };
854
+ }
735
855
  case "image":
736
856
  return {
737
857
  intent: "analyzeFile",
@@ -758,11 +878,22 @@ function planAnalyzeFile(kind, pdfOcr) {
758
878
  };
759
879
  }
760
880
  }
761
- function planForIntent(intentOpt, kind, pdfOcrForAnalyze) {
881
+ function planForIntent(intentOpt, kind, pdfOcrForAnalyze, docxInclude) {
762
882
  const intent = intentOpt ?? "analyzeFile";
763
- if (intent === "analyzeFile") return planAnalyzeFile(kind, pdfOcrForAnalyze);
883
+ if (intent === "analyzeFile") return planAnalyzeFile(kind, pdfOcrForAnalyze, docxInclude);
764
884
  if (intent === "extractText") {
765
- const p = planAnalyzeFile(kind, "off");
885
+ if (kind === "pdf") {
886
+ return {
887
+ intent: "extractText",
888
+ steps: [
889
+ { id: "detect_kind", status: "done" },
890
+ { id: "pdf_parse", status: "planned" },
891
+ { id: "pdfjs_per_page", status: "planned" },
892
+ { id: "pdf_ocr", status: "skipped" }
893
+ ]
894
+ };
895
+ }
896
+ const p = planAnalyzeFile(kind, "off", docxInclude);
766
897
  return { ...p, intent: "extractText" };
767
898
  }
768
899
  if (intent === "extractMetadata") {
@@ -794,11 +925,13 @@ function planForIntent(intentOpt, kind, pdfOcrForAnalyze) {
794
925
  }
795
926
  if (intent === "convertToHtml") {
796
927
  if (kind === "docx") {
928
+ const parallel = docxIncludeRequested(docxInclude);
797
929
  return {
798
930
  intent: "convertToHtml",
799
931
  steps: [
800
932
  { id: "detect_kind", status: "done" },
801
- { id: "docx_mammoth_html", status: "planned" }
933
+ { id: "docx_mammoth_html", status: "planned" },
934
+ ...parallel ? [{ id: "docx_ooxml_parallel", status: "planned" }] : []
802
935
  ]
803
936
  };
804
937
  }
@@ -851,11 +984,13 @@ function planForIntent(intentOpt, kind, pdfOcrForAnalyze) {
851
984
  };
852
985
  }
853
986
  if (kind === "docx") {
987
+ const parallel = docxIncludeRequested(docxInclude);
854
988
  return {
855
989
  intent: "runOcr",
856
990
  steps: [
857
991
  { id: "detect_kind", status: "done" },
858
- { id: "docx_structured_extract", status: "planned" }
992
+ { id: "docx_mammoth", status: "planned" },
993
+ ...parallel ? [{ id: "docx_ooxml_parallel", status: "planned" }] : []
859
994
  ]
860
995
  };
861
996
  }
@@ -867,7 +1002,7 @@ function planForIntent(intentOpt, kind, pdfOcrForAnalyze) {
867
1002
  ]
868
1003
  };
869
1004
  }
870
- return planAnalyzeFile(kind, pdfOcrForAnalyze);
1005
+ return planAnalyzeFile(kind, pdfOcrForAnalyze, docxInclude);
871
1006
  }
872
1007
  async function getCapabilities(input, options) {
873
1008
  throwIfAborted(options?.signal);
@@ -883,10 +1018,11 @@ async function explainAnalysisPlan(input, options) {
883
1018
  const kind = detectFileKind(resolved);
884
1019
  const intent = options?.intent ?? "analyzeFile";
885
1020
  const pdfOcrAnalyze = resolvePdfOcrMode(options?.pdf);
886
- const plan = planForIntent(intent, kind, pdfOcrAnalyze);
887
- return buildNodeExplainReport(kind, intent, pdfOcrAnalyze, plan);
1021
+ const docxInc = options?.docx?.include;
1022
+ const plan = planForIntent(intent, kind, pdfOcrAnalyze, docxInc);
1023
+ return buildNodeExplainReport(kind, intent, pdfOcrAnalyze, plan, docxInc);
888
1024
  }
889
1025
 
890
- export { analyzeFile, bufferToInput, convertToHtml, explainAnalysisPlan, extractMetadata, extractText, getCapabilities, readFileToInput, resolveNodeAnalyzeInput, runOcr };
1026
+ export { DOCX_EMBEDDED_IMAGE_CAPABILITIES, DOCX_STRUCTURE_CAPABILITIES, analyzeFile, bufferToInput, convertToHtml, docxIncludeRequested, explainAnalysisPlan, extractMetadata, extractText, getCapabilities, readFileToInput, resolveNodeAnalyzeInput, runOcr };
891
1027
  //# sourceMappingURL=index.js.map
892
1028
  //# sourceMappingURL=index.js.map
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@dragon708/docmind-node",
3
- "version": "1.2.0",
3
+ "version": "1.7.0",
4
4
  "description": "Official DocMind Node facade: analyzeFile, intent APIs, PDF/DOCX/OCR, and fs helpers.",
5
5
  "type": "module",
6
6
  "main": "./dist/index.js",
@@ -32,9 +32,9 @@
32
32
  ],
33
33
  "license": "MIT",
34
34
  "dependencies": {
35
- "@dragon708/docmind-docx": "^1.0.0",
35
+ "@dragon708/docmind-docx": "^1.7.0",
36
36
  "@dragon708/docmind-ocr": "^1.0.0",
37
- "@dragon708/docmind-pdf": "^1.0.0",
37
+ "@dragon708/docmind-pdf": "^2.0.0",
38
38
  "@dragon708/docmind-shared": "^1.1.0"
39
39
  },
40
40
  "devDependencies": {