@dragon708/docmind-node 1.9.1 → 1.11.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.d.ts CHANGED
@@ -1,9 +1,13 @@
1
- import { DocMindAnalyzeOptions, DetectFileKindInput, NamedInput, AnalysisResult, FileKind, RuntimeDescriptor, DocMindPublicIntent, AnalysisAnalyzer, ProcessingPlanDescriptor, ExplainAnalysisPlanOptions, GetCapabilitiesOptions } from '@dragon708/docmind-shared';
2
- export { AnalysisAnalyzer, AnalysisResult, CapabilityDescriptor, DetectFileKindInput, DocMindPublicIntent, DocxAnalysisCoreResult, ExplainAnalysisPlanOptions, ExplainAnalysisPlanResult, FileKind, FileKindMetadata, GenericAnalysisResult, GetCapabilitiesOptions, GetCapabilitiesResult, ImageAnalysisCoreResult, PdfAnalysisCoreResult, TextAnalysisResult, detectFileKind } from '@dragon708/docmind-shared';
3
- import { OcrOptions, OcrTiffOptions, PreprocessImageOptions } from '@dragon708/docmind-ocr';
4
- import { PdfAnalyzeOptions } from '@dragon708/docmind-pdf';
5
- import { AnalyzeDocxIncludeFlags, DocxToHtmlOptions } from '@dragon708/docmind-docx';
6
- export { AnalyzeDocxIncludeFlags, DocxEmbeddedImage, DocxEmbeddedImageConversionOptions, DocxEmbeddedImageConversionResult, DocxEmbeddedImageWebSlice, DocxImageExtractionMode, ExtractImagesFromDocxOptions, ExtractImagesFromDocxResult, convertDocxEmbeddedImageToWeb, convertDocxImagesForWeb, docxImageIsBrowserRenderable, docxImageRequiresConversion, docxImageSuggestedTargetFormat, docxImageToDataUri, extractImagesFromDocx, isNodeJsRuntime } from '@dragon708/docmind-docx';
1
+ import { DocMindAnalyzeOptions, AnalyzeFileOutputOptions, NormalizeStructuredOptions, DetectFileKindInput, NamedInput, AnalysisResult, StructuredDocumentResult, FileKind, RuntimeDescriptor, DocMindPublicIntent, AnalysisAnalyzer, ProcessingPlanDescriptor, ExplainAnalysisPlanOptions, GetCapabilitiesOptions } from '@dragon708/docmind-shared';
2
+ export { AnalysisAnalyzer, AnalysisResult, CapabilityDescriptor, DetectFileKindInput, DocMindPublicIntent, DocumentBlock, DocumentImageRef, DocumentPage, DocumentTable, DocxAnalysisCoreResult, ExplainAnalysisPlanOptions, ExplainAnalysisPlanResult, FileKind, FileKindMetadata, GenericAnalysisResult, GetCapabilitiesOptions, GetCapabilitiesResult, ImageAnalysisCoreResult, PdfAnalysisCoreResult, StructuredDocumentResult, TextAnalysisResult, analyzeFileRequestsStructured, detectFileKind, isStructuredDocumentResult } from '@dragon708/docmind-shared';
3
+ import { OcrOptions, OcrTiffOptions, PreprocessImageOptions, ExtractStructuredDataFromImageOptions } from '@dragon708/docmind-ocr';
4
+ export { ExtractStructuredDataFromImageOptions, extractStructuredDataFromImage } from '@dragon708/docmind-ocr';
5
+ import { AnalyzeDocxIncludeFlags, DocxToHtmlOptions, ExtractStructuredDataFromDocxOptions } from '@dragon708/docmind-docx';
6
+ export { AnalyzeDocxIncludeFlags, DocxEmbeddedImage, DocxEmbeddedImageConversionOptions, DocxEmbeddedImageConversionResult, DocxEmbeddedImageWebSlice, DocxImageExtractionMode, ExtractImagesFromDocxOptions, ExtractImagesFromDocxResult, ExtractStructuredDataFromDocxOptions, convertDocxEmbeddedImageToWeb, convertDocxImagesForWeb, docxImageIsBrowserRenderable, docxImageRequiresConversion, docxImageSuggestedTargetFormat, docxImageToDataUri, extractImagesFromDocx, extractStructuredDataFromDocx, isNodeJsRuntime } from '@dragon708/docmind-docx';
7
+ import { PdfAnalyzeOptions, ExtractStructuredDataFromPdfOptions } from '@dragon708/docmind-pdf';
8
+ export { ExtractStructuredDataFromPdfIncludeFlags, ExtractStructuredDataFromPdfOptions, extractStructuredDataFromPdf } from '@dragon708/docmind-pdf';
9
+ import { RenderLlmTextOptions, RenderMarkdownOptions, RenderMarkdownSectionsOptions, MarkdownSection } from '@dragon708/docmind-markdown';
10
+ export { MarkdownSection } from '@dragon708/docmind-markdown';
7
11
 
8
12
  /**
9
13
  * Options for Node public APIs (`analyzeFile`, intent methods).
@@ -24,7 +28,7 @@ interface NodeAnalyzeDocxOptionsSlice {
24
28
  /** Opciones Mammoth para HTML (p. ej. `convertImage`). */
25
29
  readonly html?: DocxToHtmlOptions;
26
30
  }
27
- interface NodeAnalyzeOptions extends DocMindAnalyzeOptions {
31
+ interface NodeAnalyzeOptions extends DocMindAnalyzeOptions, AnalyzeFileOutputOptions {
28
32
  readonly pdf?: PdfAnalyzeOptions;
29
33
  readonly ocr?: OcrOptions & Pick<OcrTiffOptions, "maxPages" | "pageSeparator"> & {
30
34
  readonly preprocess?: PreprocessImageOptions;
@@ -38,6 +42,34 @@ interface NodeAnalyzeOptions extends DocMindAnalyzeOptions {
38
42
  */
39
43
  readonly pdfNativeTextSource?: "pdf-parse" | "pdfjs-per-page";
40
44
  }
45
+ /**
46
+ * Opciones para {@link extractStructuredData}: reenvío por rama (`pdf` / `docx` / `ocr`) más
47
+ * `normalize` opcional aplicado a texto plano o como respaldo cuando la rama no define `normalize`.
48
+ */
49
+ interface NodeExtractStructuredDataOptions extends DocMindAnalyzeOptions {
50
+ readonly pdf?: ExtractStructuredDataFromPdfOptions;
51
+ readonly docx?: ExtractStructuredDataFromDocxOptions;
52
+ readonly ocr?: ExtractStructuredDataFromImageOptions;
53
+ readonly normalize?: NormalizeStructuredOptions;
54
+ }
55
+ /**
56
+ * {@link extractMarkdown}: inherits {@link NodeExtractStructuredDataOptions}; set `markdown` for `renderMarkdown` knobs (`@dragon708/docmind-markdown`).
57
+ */
58
+ interface NodeExtractMarkdownOptions extends NodeExtractStructuredDataOptions {
59
+ readonly markdown?: RenderMarkdownOptions;
60
+ }
61
+ /**
62
+ * {@link extractLlmContent}: same structured fields; `llm` forwards to `renderLlmText`.
63
+ */
64
+ interface NodeExtractLlmContentOptions extends NodeExtractStructuredDataOptions {
65
+ readonly llm?: RenderLlmTextOptions;
66
+ }
67
+ /**
68
+ * {@link extractStructuredChunks}: `chunks` maps to split/render options (`maxChars`, `preferHeadings`, etc.).
69
+ */
70
+ interface NodeExtractStructuredChunksOptions extends NodeExtractStructuredDataOptions {
71
+ readonly chunks?: RenderMarkdownSectionsOptions;
72
+ }
41
73
 
42
74
  /**
43
75
  * Inputs accepted by {@link analyzeFile} in this package.
@@ -85,8 +117,41 @@ declare function convertToHtml(input: NodeAnalyzeInput, options?: NodeAnalyzeOpt
85
117
  */
86
118
  declare function runOcr(input: NodeAnalyzeInput, options?: NodeAnalyzeOptions): Promise<AnalysisResult>;
87
119
 
120
+ /**
121
+ * Resuelve entrada Node, clasifica el archivo y devuelve un {@link StructuredDocumentResult} vía
122
+ * el extractor estructurado del paquete correspondiente (PDF, DOCX, imagen OCR) o
123
+ * {@link normalizeToStructuredResult} para texto UTF-8.
124
+ */
125
+ declare function extractStructuredData(input: NodeAnalyzeInput, options?: NodeExtractStructuredDataOptions): Promise<StructuredDocumentResult>;
126
+
127
+ /**
128
+ * End-to-end: resolve bytes → {@link extractStructuredData} → `renderMarkdown` (`@dragon708/docmind-markdown`).
129
+ *
130
+ * @param input - Path, buffer, or {@link NodeAnalyzeInput} accepted by the Node facade.
131
+ * @param options - Structured routing (`pdf` / `docx` / `ocr` / `normalize`) plus optional `markdown` render options.
132
+ */
133
+ declare function extractMarkdown(input: NodeAnalyzeInput, options?: NodeExtractMarkdownOptions): Promise<string>;
134
+ /**
135
+ * Same routing as {@link extractMarkdown}, then `renderLlmText` for prompts / RAG.
136
+ *
137
+ * @param options - Optional `llm` slice forwarded to `@dragon708/docmind-markdown`.
138
+ */
139
+ declare function extractLlmContent(input: NodeAnalyzeInput, options?: NodeExtractLlmContentOptions): Promise<string>;
140
+ /**
141
+ * Structured extract → `renderMarkdownSections` (Markdown + optional parallel `text` per slice).
142
+ *
143
+ * @param options - Optional `chunks` (e.g. `maxChars`, `preferHeadings`) from `@dragon708/docmind-markdown`.
144
+ */
145
+ declare function extractStructuredChunks(input: NodeAnalyzeInput, options?: NodeExtractStructuredChunksOptions): Promise<readonly MarkdownSection[]>;
146
+
88
147
  /** High-level features the user can ask DocMind for (per input kind and runtime). */
89
- type PublicCapabilityId = "text" | "metadata" | "html" | "ocr" | "pages" | "ocr-multipage" | "image-normalization" | "tiff" | "heic-node-only" | "bmp" | "gif-first-frame";
148
+ type PublicCapabilityId = "text" | "metadata" | "html" | "ocr" | "pages" | "structured-output"
149
+ /** Node: {@link extractMarkdown} after {@link extractStructuredData} via `@dragon708/docmind-markdown`. */
150
+ | "markdown"
151
+ /** Node: {@link extractLlmContent} (LLM-oriented plain text). */
152
+ | "llm-text"
153
+ /** Node: {@link extractStructuredChunks} (Markdown sections / chunking). */
154
+ | "structured-chunks" | "ocr-multipage" | "image-normalization" | "tiff" | "heic-node-only" | "bmp" | "gif-first-frame";
90
155
  /**
91
156
  * DOCX-only: what the stack can do with OOXML embedded bitmaps/vector payloads under `word/media`.
92
157
  * Present on {@link GetCapabilitiesReport} when `kind === "docx"`.
@@ -160,7 +225,8 @@ interface ExplainAnalysisPlanReport {
160
225
  readonly kind: FileKind;
161
226
  readonly detectedKind: FileKind;
162
227
  readonly runtime: RuntimeDescriptor;
163
- readonly intent: DocMindPublicIntent | (string & {});
228
+ /** Incluye intents extendidos en Node (p. ej. `extractStructuredData`). */
229
+ readonly intent: DocMindPublicIntent | string;
164
230
  readonly primaryAnalyzer: AnalysisAnalyzer;
165
231
  readonly nativeExtraction: NativeExtractionPlan;
166
232
  readonly ocr: OcrPlan;
@@ -173,12 +239,17 @@ interface ExplainAnalysisPlanReport {
173
239
  readonly warnings?: readonly string[];
174
240
  }
175
241
 
242
+ /** Node-only intents layered on `@dragon708/docmind-markdown` after structured extraction. */
243
+ type NodeMarkdownFacadeIntent = "extractMarkdown" | "extractLlmContent" | "extractStructuredChunks";
176
244
  /** Options for {@link explainAnalysisPlan} including PDF/OCR/DOCX hints for accurate planning. */
177
- type NodeExplainAnalysisPlanOptions = ExplainAnalysisPlanOptions & Pick<NodeAnalyzeOptions, "pdf" | "ocr" | "docx">;
245
+ type NodeExplainAnalysisPlanOptions = Omit<ExplainAnalysisPlanOptions, "intent"> & Pick<NodeAnalyzeOptions, "pdf" | "ocr" | "docx" | "structuredOutput" | "output"> & {
246
+ readonly intent?: DocMindPublicIntent | NodeMarkdownFacadeIntent;
247
+ };
178
248
 
179
249
  /**
180
250
  * Epic 1 — **Capabilities:** after resolving {@link NodeAnalyzeInput}, lists which of
181
- * `text` | `metadata` | `html` | `ocr` | `pages` (and image-specific ids such as `ocr-multipage`, `image-normalization`, `tiff`, `heic-node-only`, `bmp`, `gif-first-frame`) apply for that kind in Node (for PDF, `text` / `metadata` /
251
+ * `text` | `metadata` | `html` | `ocr` | `pages` | `structured-output` | `markdown` | `llm-text` | `structured-chunks`
252
+ * (and image-specific ids such as `ocr-multipage`, `image-normalization`, `tiff`, `heic-node-only`, `bmp`, `gif-first-frame`) apply for that kind in Node (for PDF, `text` / `metadata` /
182
253
  * `pages` / `ocr` describe the v2 pdf-parse + PDF.js + OCR stack; see {@link buildNodeCapabilityReport}).
183
254
  * For **DOCX**, `docxEmbeddedImages` and `docxStructure` describe ZIP media and optional OOXML v2 extractors (`options.docx.include`).
184
255
  * Does not run Mammoth/Tesseract/PDF bodies beyond path resolution.
@@ -190,4 +261,4 @@ declare function getCapabilities(input: NodeAnalyzeInput, options?: GetCapabilit
190
261
  */
191
262
  declare function explainAnalysisPlan(input: NodeAnalyzeInput, options?: NodeExplainAnalysisPlanOptions): Promise<ExplainAnalysisPlanReport>;
192
263
 
193
- export { DOCX_EMBEDDED_IMAGE_CAPABILITIES, DOCX_STRUCTURE_CAPABILITIES, type DocxEmbeddedImageCapabilities, type DocxStructuralCapabilities, type ExplainAnalysisPlanReport, type GetCapabilitiesReport, type NativeExtractionPlan, type NodeAnalyzeDocxOptionsSlice, type NodeAnalyzeInput, type NodeAnalyzeOptions, type NodeExplainAnalysisPlanOptions, type OcrPlan, type PublicCapabilityId, type PublicCapabilitySupport, analyzeFile, bufferToInput, convertToHtml, docxIncludeRequested, explainAnalysisPlan, extractMetadata, extractText, getCapabilities, readFileToInput, resolveNodeAnalyzeInput, runOcr };
264
+ export { DOCX_EMBEDDED_IMAGE_CAPABILITIES, DOCX_STRUCTURE_CAPABILITIES, type DocxEmbeddedImageCapabilities, type DocxStructuralCapabilities, type ExplainAnalysisPlanReport, type GetCapabilitiesReport, type NativeExtractionPlan, type NodeAnalyzeDocxOptionsSlice, type NodeAnalyzeInput, type NodeAnalyzeOptions, type NodeExplainAnalysisPlanOptions, type NodeExtractLlmContentOptions, type NodeExtractMarkdownOptions, type NodeExtractStructuredChunksOptions, type NodeExtractStructuredDataOptions, type NodeMarkdownFacadeIntent, type OcrPlan, type PublicCapabilityId, type PublicCapabilitySupport, analyzeFile, bufferToInput, convertToHtml, docxIncludeRequested, explainAnalysisPlan, extractLlmContent, extractMarkdown, extractMetadata, extractStructuredChunks, extractStructuredData, extractText, getCapabilities, readFileToInput, resolveNodeAnalyzeInput, runOcr };
package/dist/index.js CHANGED
@@ -1,12 +1,15 @@
1
- import { assertValidAnalyzeFileInput, detectFileKind, notImplementedResult, UNKNOWN_FORMAT_WARNING, analyzeText, toUint8Array, isNamedInput, isBinaryInput, isBlob, isFile, getMimeType } from '@dragon708/docmind-shared';
2
- export { detectFileKind } from '@dragon708/docmind-shared';
3
- import { analyzeDocx } from '@dragon708/docmind-docx';
4
- export { convertDocxEmbeddedImageToWeb, convertDocxImagesForWeb, docxImageIsBrowserRenderable, docxImageRequiresConversion, docxImageSuggestedTargetFormat, docxImageToDataUri, extractImagesFromDocx, isNodeJsRuntime } from '@dragon708/docmind-docx';
5
- import { preprocessHasEffect, resolveImageFormat, ocrTiff, ocrImageDetailed } from '@dragon708/docmind-ocr';
6
- import { extractPdfMetadata, extractTextFromPdf, analyzePdf, extractPdfTextByPage } from '@dragon708/docmind-pdf';
1
+ import { assertValidAnalyzeFileInput, detectFileKind, normalizeToStructuredResult, UNKNOWN_FORMAT_WARNING, analyzeText, notImplementedResult, analyzeFileRequestsStructured, toUint8Array, isNamedInput, isBinaryInput, isBlob, isFile, getMimeType } from '@dragon708/docmind-shared';
2
+ export { analyzeFileRequestsStructured, detectFileKind, isStructuredDocumentResult } from '@dragon708/docmind-shared';
3
+ import { extractStructuredDataFromDocx, analyzeDocx } from '@dragon708/docmind-docx';
4
+ export { convertDocxEmbeddedImageToWeb, convertDocxImagesForWeb, docxImageIsBrowserRenderable, docxImageRequiresConversion, docxImageSuggestedTargetFormat, docxImageToDataUri, extractImagesFromDocx, extractStructuredDataFromDocx, isNodeJsRuntime } from '@dragon708/docmind-docx';
5
+ import { extractStructuredDataFromImage, preprocessHasEffect, resolveImageFormat, ocrTiff, ocrImageDetailed } from '@dragon708/docmind-ocr';
6
+ export { extractStructuredDataFromImage } from '@dragon708/docmind-ocr';
7
+ import { extractStructuredDataFromPdf, extractPdfMetadata, extractTextFromPdf, analyzePdf, extractPdfTextByPage } from '@dragon708/docmind-pdf';
8
+ export { extractStructuredDataFromPdf } from '@dragon708/docmind-pdf';
7
9
  import { readFile } from 'fs/promises';
8
10
  import { basename } from 'path';
9
11
  import { fileURLToPath } from 'url';
12
+ import { renderMarkdown, renderLlmText, renderMarkdownSections } from '@dragon708/docmind-markdown';
10
13
 
11
14
  // src/analyze.ts
12
15
 
@@ -252,6 +255,72 @@ async function resolveNodeAnalyzeInput(input) {
252
255
  return input;
253
256
  }
254
257
 
258
+ // src/internal/abort.ts
259
+ function throwIfAborted(signal) {
260
+ if (signal?.aborted) {
261
+ const err = new Error("The operation was aborted");
262
+ err.name = "AbortError";
263
+ throw err;
264
+ }
265
+ }
266
+
267
+ // src/extractStructuredData.ts
268
+ var PLAIN_TEXT_STRUCTURED_NOTE = "Plain text: structured output wraps decoded UTF-8 in a unified envelope (no layout blocks).";
269
+ async function extractStructuredData(input, options) {
270
+ throwIfAborted(options?.signal);
271
+ const resolved = await resolveNodeAnalyzeInput(input);
272
+ assertValidAnalyzeFileInput(resolved);
273
+ const kind = detectFileKind(resolved);
274
+ const signal = options?.signal;
275
+ const normFallback = options?.normalize;
276
+ switch (kind) {
277
+ case "pdf": {
278
+ const data = await bytesFromDetectInput(resolved);
279
+ return extractStructuredDataFromPdf(data, {
280
+ ...options?.pdf,
281
+ signal: options?.pdf?.signal ?? signal,
282
+ normalize: options?.pdf?.normalize ?? normFallback
283
+ });
284
+ }
285
+ case "docx": {
286
+ const data = await bytesFromDetectInput(resolved);
287
+ return extractStructuredDataFromDocx(data, {
288
+ ...options?.docx,
289
+ signal: options?.docx?.signal ?? signal,
290
+ normalize: options?.docx?.normalize ?? normFallback
291
+ });
292
+ }
293
+ case "image": {
294
+ const data = await bytesFromDetectInput(resolved);
295
+ return extractStructuredDataFromImage(data, {
296
+ ...options?.ocr,
297
+ signal: options?.ocr?.signal ?? signal,
298
+ normalize: options?.ocr?.normalize ?? normFallback
299
+ });
300
+ }
301
+ case "text": {
302
+ const r = await analyzeText(resolved, { signal });
303
+ return normalizeToStructuredResult(
304
+ {
305
+ kind: "text",
306
+ text: r.text,
307
+ warnings: [...r.warnings, PLAIN_TEXT_STRUCTURED_NOTE]
308
+ },
309
+ normFallback
310
+ );
311
+ }
312
+ default:
313
+ return normalizeToStructuredResult(
314
+ {
315
+ kind: "unknown",
316
+ text: "",
317
+ warnings: [UNKNOWN_FORMAT_WARNING]
318
+ },
319
+ normFallback
320
+ );
321
+ }
322
+ }
323
+
255
324
  // src/analyze.ts
256
325
  async function analyzeFile(input, options) {
257
326
  if (options?.signal?.aborted) {
@@ -262,17 +331,39 @@ async function analyzeFile(input, options) {
262
331
  const resolved = await resolveNodeAnalyzeInput(input);
263
332
  assertValidAnalyzeFileInput(resolved);
264
333
  const fileKind = detectFileKind(resolved);
334
+ let result;
265
335
  switch (fileKind) {
266
336
  case "pdf":
267
- return analyzePdfForNode(resolved, options);
337
+ result = await analyzePdfForNode(resolved, options);
338
+ break;
268
339
  case "docx":
269
- return analyzeDocxForNode(resolved, options);
340
+ result = await analyzeDocxForNode(resolved, options);
341
+ break;
270
342
  case "image":
271
- return analyzeImageForNode(resolved, options);
343
+ result = await analyzeImageForNode(resolved, options);
344
+ break;
272
345
  case "text":
273
- return analyzeText(resolved, { signal: options?.signal });
346
+ result = await analyzeText(resolved, { signal: options?.signal });
347
+ break;
274
348
  default:
275
- return notImplementedResult(fileKind, "none", [UNKNOWN_FORMAT_WARNING]);
349
+ result = notImplementedResult(fileKind, "none", [UNKNOWN_FORMAT_WARNING]);
350
+ }
351
+ if (!analyzeFileRequestsStructured(options) || result.status !== "ok") {
352
+ return result;
353
+ }
354
+ try {
355
+ const structured = await extractStructuredData(resolved, {
356
+ signal: options?.signal,
357
+ pdf: options?.pdf,
358
+ docx: options?.docx,
359
+ ocr: options?.ocr
360
+ });
361
+ return { ...result, structured };
362
+ } catch (e) {
363
+ if (e instanceof Error && e.name === "AbortError") throw e;
364
+ const msg = e instanceof Error ? e.message : String(e);
365
+ const prev = "warnings" in result && Array.isArray(result.warnings) ? [...result.warnings] : [];
366
+ return { ...result, warnings: [...prev, `warning: analyzeFile structured merge failed: ${msg}`] };
276
367
  }
277
368
  }
278
369
 
@@ -287,15 +378,6 @@ function withPdfOcrDefaultOff(options) {
287
378
  };
288
379
  }
289
380
 
290
- // src/internal/abort.ts
291
- function throwIfAborted(signal) {
292
- if (signal?.aborted) {
293
- const err = new Error("The operation was aborted");
294
- err.name = "AbortError";
295
- throw err;
296
- }
297
- }
298
-
299
381
  // src/publicActions.ts
300
382
  var DOCX_METADATA_STUB = "Structured document metadata for DOCX is not returned by extractMetadata; use analyzeFile, extractText, or convertToHtml with options.docx.include for OOXML structure, headings, tables, blocks, approximate pages, and embedded images.";
301
383
  var IMAGE_METADATA_NOTE = "Raster images have no document metadata bundle in this API.";
@@ -536,6 +618,24 @@ async function runOcr(input, options) {
536
618
  return notImplementedResult(kind, "none", [UNKNOWN_FORMAT_WARNING]);
537
619
  }
538
620
  }
621
+ async function extractMarkdown(input, options) {
622
+ throwIfAborted(options?.signal);
623
+ const { markdown: markdownOpts, ...structuredOpts } = options ?? {};
624
+ const structured = await extractStructuredData(input, structuredOpts);
625
+ return renderMarkdown(structured, markdownOpts);
626
+ }
627
+ async function extractLlmContent(input, options) {
628
+ throwIfAborted(options?.signal);
629
+ const { llm: llmOpts, ...structuredOpts } = options ?? {};
630
+ const structured = await extractStructuredData(input, structuredOpts);
631
+ return renderLlmText(structured, llmOpts);
632
+ }
633
+ async function extractStructuredChunks(input, options) {
634
+ throwIfAborted(options?.signal);
635
+ const { chunks: chunkOpts, ...structuredOpts } = options ?? {};
636
+ const structured = await extractStructuredData(input, structuredOpts);
637
+ return renderMarkdownSections(structured, chunkOpts);
638
+ }
539
639
 
540
640
  // src/capabilityReport.ts
541
641
  function docxIncludeRequested(flags) {
@@ -595,7 +695,19 @@ function buildNodeCapabilityReport(kind) {
595
695
  slot("ocr", true, [
596
696
  "Raster OCR pipeline (pdf.ocr auto with quality heuristics, force, or runOcr)."
597
697
  ]),
598
- slot("html", true, ["HTML is a <pre> preview of extracted text, not visual layout."])
698
+ slot("html", true, ["HTML is a <pre> preview of extracted text, not visual layout."]),
699
+ slot("structured-output", true, [
700
+ "extractStructuredData: analyzePdf + PDF.js per-page text, outline, links, annotations, merged via normalizeToStructuredResult; respects pdf.ocr like analyzeFile."
701
+ ]),
702
+ slot("markdown", true, [
703
+ "extractMarkdown: same structured pipeline, then `@dragon708/docmind-markdown` renderMarkdown (GFM-style)."
704
+ ]),
705
+ slot("llm-text", true, [
706
+ "extractLlmContent: structured \u2192 compact plain text (renderLlmText) for prompts / embeddings."
707
+ ]),
708
+ slot("structured-chunks", true, [
709
+ "extractStructuredChunks: structured \u2192 Markdown sections via split/render (heading-aware chunking)."
710
+ ])
599
711
  ];
600
712
  break;
601
713
  case "docx":
@@ -612,6 +724,16 @@ function buildNodeCapabilityReport(kind) {
612
724
  slot("ocr", false, ["OCR does not apply to DOCX in DocMind."]),
613
725
  slot("pages", false, [
614
726
  "No PDF-style page count; approximate DOCX pages via options.docx.include.pagesApprox (OOXML hints, not print layout)."
727
+ ]),
728
+ slot("structured-output", true, [
729
+ "extractStructuredData runs analyzeDocx with merged OOXML includes (blocks, tables, headings, embedded images, etc.) into StructuredDocumentResult."
730
+ ]),
731
+ slot("markdown", true, [
732
+ "extractMarkdown: structured DOCX envelope \u2192 Markdown (`@dragon708/docmind-markdown`)."
733
+ ]),
734
+ slot("llm-text", true, ["extractLlmContent: structured \u2192 LLM-oriented plain text."]),
735
+ slot("structured-chunks", true, [
736
+ "extractStructuredChunks: structured \u2192 sectioned Markdown chunks."
615
737
  ])
616
738
  ];
617
739
  break;
@@ -643,7 +765,15 @@ function buildNodeCapabilityReport(kind) {
643
765
  ]),
644
766
  slot("pages", true, [
645
767
  "TIFF: `pages` and `textByPage` mirror frames processed. Other formats may expose `pages` when the normalizer reports it (e.g. GIF)."
646
- ])
768
+ ]),
769
+ slot("structured-output", true, [
770
+ "extractStructuredData: OCR + layout blocks (ocrImageDetailed / ocrTiff) normalized to StructuredDocumentResult."
771
+ ]),
772
+ slot("markdown", true, [
773
+ "extractMarkdown: OCR structured layout \u2192 Markdown (tables/lists as GFM where blocks exist)."
774
+ ]),
775
+ slot("llm-text", true, ["extractLlmContent: OCR structured \u2192 LLM plain text."]),
776
+ slot("structured-chunks", true, ["extractStructuredChunks: OCR structured \u2192 Markdown sections."])
647
777
  ];
648
778
  break;
649
779
  case "text":
@@ -652,7 +782,17 @@ function buildNodeCapabilityReport(kind) {
652
782
  slot("metadata", true, [TEXT_META_NOTE]),
653
783
  slot("html", true),
654
784
  slot("ocr", false, ["OCR does not apply to plain text files."]),
655
- slot("pages", false)
785
+ slot("pages", false),
786
+ slot("structured-output", true, [
787
+ "extractStructuredData wraps UTF-8 decode in normalizeToStructuredResult (rollup text only unless you add blocks upstream)."
788
+ ]),
789
+ slot("markdown", true, [
790
+ "extractMarkdown: rollup/decoded text \u2192 Markdown (mostly paragraphs; no layout without upstream blocks)."
791
+ ]),
792
+ slot("llm-text", true, ["extractLlmContent: rollup \u2192 LLM plain text via the same envelope."]),
793
+ slot("structured-chunks", true, [
794
+ "extractStructuredChunks: single-chunk Markdown is typical when only rollup text exists."
795
+ ])
656
796
  ];
657
797
  break;
658
798
  default:
@@ -662,7 +802,11 @@ function buildNodeCapabilityReport(kind) {
662
802
  slot("metadata", false),
663
803
  slot("html", false),
664
804
  slot("ocr", false),
665
- slot("pages", false)
805
+ slot("pages", false),
806
+ slot("structured-output", false),
807
+ slot("markdown", false),
808
+ slot("llm-text", false),
809
+ slot("structured-chunks", false)
666
810
  ];
667
811
  }
668
812
  return {
@@ -872,6 +1016,61 @@ function buildNodeExplainReport(kind, intent, pdfOcr, plan, docxInclude, ocrSlic
872
1016
  ocr = { mayUse: false, description: "OCR does not apply to text files." };
873
1017
  }
874
1018
  break;
1019
+ case "extractStructuredData":
1020
+ case "extractMarkdown":
1021
+ case "extractLlmContent":
1022
+ case "extractStructuredChunks":
1023
+ if (kind === "pdf") {
1024
+ nativeExtraction = {
1025
+ willAttempt: true,
1026
+ description: intent === "extractStructuredData" ? "extractStructuredData: analyzePdf plus PDF.js page rows, outline, links, and annotations, normalized to StructuredDocumentResult." : `${intent}: same structured PDF pipeline as extractStructuredData, then \`@dragon708/docmind-markdown\`.`
1027
+ };
1028
+ ocr = {
1029
+ mayUse: pdfOcr !== "off",
1030
+ description: pdfOcr === "off" ? "Raster OCR is off (pdf.ocr: off); structured text uses native extraction only." : pdfOcr === "force" ? "Raster OCR may run on all pages (pdf.ocr: force)." : "Raster OCR may run when heuristics suggest weak native text (pdf.ocr: auto)."
1031
+ };
1032
+ } else if (kind === "docx") {
1033
+ nativeExtraction = {
1034
+ willAttempt: true,
1035
+ description: intent === "extractStructuredData" ? "extractStructuredData: Mammoth plus merged OOXML extractors (structure, headings, tables, blocks, pagesApprox, embeddedImages) in one envelope." : `${intent}: same structured DOCX pipeline as extractStructuredData, then \`@dragon708/docmind-markdown\`.`
1036
+ };
1037
+ ocr = { mayUse: false, description: "DOCX does not use OCR." };
1038
+ } else if (kind === "image") {
1039
+ nativeExtraction = {
1040
+ willAttempt: false,
1041
+ description: NODE_IMAGE_OCR_PIPELINE
1042
+ };
1043
+ ocr = {
1044
+ mayUse: true,
1045
+ description: "OCR + layout blocks: same pipeline as package extractStructuredDataFromImage (normalize \u2192 optional preprocess \u2192 Tesseract; TIFF multipage via ocrTiff)."
1046
+ };
1047
+ } else {
1048
+ nativeExtraction = {
1049
+ willAttempt: true,
1050
+ description: intent === "extractStructuredData" ? "UTF-8 decode with BOM handling; normalizeToStructuredResult produces the structured envelope." : `${intent}: UTF-8 structured envelope, then \`@dragon708/docmind-markdown\` export.`
1051
+ };
1052
+ ocr = { mayUse: false, description: "OCR does not apply to text files." };
1053
+ limitations = lim("Plain text has no native layout blocks; `text` carries the decoded content.");
1054
+ }
1055
+ if (intent === "extractMarkdown") {
1056
+ limitations = [
1057
+ ...limitations,
1058
+ ...lim("Output: Markdown string via renderMarkdown (tables, headings, lists).")
1059
+ ];
1060
+ } else if (intent === "extractLlmContent") {
1061
+ limitations = [
1062
+ ...limitations,
1063
+ ...lim("Output: compact plain text via renderLlmText (prompt / embedding friendly).")
1064
+ ];
1065
+ } else if (intent === "extractStructuredChunks") {
1066
+ limitations = [
1067
+ ...limitations,
1068
+ ...lim(
1069
+ "Output: MarkdownSection[] via renderMarkdownSections (splitStructuredIntoChunks under the hood)."
1070
+ )
1071
+ ];
1072
+ }
1073
+ break;
875
1074
  default:
876
1075
  nativeExtraction = { willAttempt: false, description: "Generic intent; see plan." };
877
1076
  ocr = { mayUse: false, description: "See plan steps." };
@@ -957,9 +1156,16 @@ function planAnalyzeFile(kind, pdfOcr, docxInclude, ocr) {
957
1156
  };
958
1157
  }
959
1158
  }
960
- function planForIntent(intentOpt, kind, pdfOcrForAnalyze, docxInclude, ocr) {
1159
+ function planForIntent(intentOpt, kind, pdfOcrForAnalyze, docxInclude, ocr, analyzeFileOutput) {
961
1160
  const intent = intentOpt ?? "analyzeFile";
962
- if (intent === "analyzeFile") return planAnalyzeFile(kind, pdfOcrForAnalyze, docxInclude, ocr);
1161
+ if (intent === "analyzeFile") {
1162
+ const base = planAnalyzeFile(kind, pdfOcrForAnalyze, docxInclude, ocr);
1163
+ if (!analyzeFileRequestsStructured(analyzeFileOutput)) return base;
1164
+ return {
1165
+ ...base,
1166
+ steps: [...base.steps ?? [], { id: "structured_merge", status: "planned" }]
1167
+ };
1168
+ }
963
1169
  if (intent === "extractText") {
964
1170
  if (kind === "pdf") {
965
1171
  return {
@@ -1078,6 +1284,76 @@ function planForIntent(intentOpt, kind, pdfOcrForAnalyze, docxInclude, ocr) {
1078
1284
  ]
1079
1285
  };
1080
1286
  }
1287
+ if (intent === "extractStructuredData") {
1288
+ switch (kind) {
1289
+ case "pdf":
1290
+ return {
1291
+ intent: "extractStructuredData",
1292
+ steps: [
1293
+ { id: "detect_kind", status: "done" },
1294
+ { id: "pdf_analyze", status: "planned" },
1295
+ { id: "pdf_structure_extract", status: "planned" },
1296
+ {
1297
+ id: "pdf_ocr",
1298
+ status: pdfOcrForAnalyze === "off" ? "skipped" : "planned"
1299
+ },
1300
+ { id: "structured_normalize", status: "planned" }
1301
+ ]
1302
+ };
1303
+ case "docx":
1304
+ return {
1305
+ intent: "extractStructuredData",
1306
+ steps: [
1307
+ { id: "detect_kind", status: "done" },
1308
+ { id: "docx_mammoth", status: "planned" },
1309
+ { id: "docx_ooxml_parallel", status: "planned" },
1310
+ { id: "structured_normalize", status: "planned" }
1311
+ ]
1312
+ };
1313
+ case "image":
1314
+ return {
1315
+ intent: "extractStructuredData",
1316
+ steps: [
1317
+ ...imageOcrPlanSteps(ocr),
1318
+ { id: "structured_normalize", status: "planned" }
1319
+ ]
1320
+ };
1321
+ case "text":
1322
+ return {
1323
+ intent: "extractStructuredData",
1324
+ steps: [
1325
+ { id: "detect_kind", status: "done" },
1326
+ { id: "utf8_decode", status: "planned" },
1327
+ { id: "structured_normalize", status: "planned" }
1328
+ ]
1329
+ };
1330
+ default:
1331
+ return {
1332
+ intent: "extractStructuredData",
1333
+ steps: [
1334
+ { id: "detect_kind", status: "done" },
1335
+ { id: "route", status: "failed" }
1336
+ ]
1337
+ };
1338
+ }
1339
+ }
1340
+ if (intent === "extractMarkdown" || intent === "extractLlmContent" || intent === "extractStructuredChunks") {
1341
+ const sub = planForIntent(
1342
+ "extractStructuredData",
1343
+ kind,
1344
+ pdfOcrForAnalyze,
1345
+ docxInclude,
1346
+ ocr,
1347
+ analyzeFileOutput
1348
+ );
1349
+ return {
1350
+ intent,
1351
+ steps: [
1352
+ ...sub.steps ?? [],
1353
+ { id: "docmind_markdown_render", status: "planned" }
1354
+ ]
1355
+ };
1356
+ }
1081
1357
  return planAnalyzeFile(kind, pdfOcrForAnalyze, docxInclude, ocr);
1082
1358
  }
1083
1359
  async function getCapabilities(input, options) {
@@ -1096,10 +1372,13 @@ async function explainAnalysisPlan(input, options) {
1096
1372
  const pdfOcrAnalyze = resolvePdfOcrMode(options?.pdf);
1097
1373
  const docxInc = options?.docx?.include;
1098
1374
  const ocrSlice = options?.ocr;
1099
- const plan = planForIntent(intent, kind, pdfOcrAnalyze, docxInc, ocrSlice);
1375
+ const plan = planForIntent(intent, kind, pdfOcrAnalyze, docxInc, ocrSlice, {
1376
+ structuredOutput: options?.structuredOutput,
1377
+ output: options?.output
1378
+ });
1100
1379
  return buildNodeExplainReport(kind, intent, pdfOcrAnalyze, plan, docxInc, ocrSlice);
1101
1380
  }
1102
1381
 
1103
- export { DOCX_EMBEDDED_IMAGE_CAPABILITIES, DOCX_STRUCTURE_CAPABILITIES, analyzeFile, bufferToInput, convertToHtml, docxIncludeRequested, explainAnalysisPlan, extractMetadata, extractText, getCapabilities, readFileToInput, resolveNodeAnalyzeInput, runOcr };
1382
+ export { DOCX_EMBEDDED_IMAGE_CAPABILITIES, DOCX_STRUCTURE_CAPABILITIES, analyzeFile, bufferToInput, convertToHtml, docxIncludeRequested, explainAnalysisPlan, extractLlmContent, extractMarkdown, extractMetadata, extractStructuredChunks, extractStructuredData, extractText, getCapabilities, readFileToInput, resolveNodeAnalyzeInput, runOcr };
1104
1383
  //# sourceMappingURL=index.js.map
1105
1384
  //# sourceMappingURL=index.js.map
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@dragon708/docmind-node",
3
- "version": "1.9.1",
3
+ "version": "1.11.0",
4
4
  "description": "Official DocMind Node facade: analyzeFile, intent APIs, PDF/DOCX/OCR, and fs helpers.",
5
5
  "type": "module",
6
6
  "main": "./dist/index.js",
@@ -32,10 +32,11 @@
32
32
  ],
33
33
  "license": "MIT",
34
34
  "dependencies": {
35
- "@dragon708/docmind-docx": "^1.7.1",
36
- "@dragon708/docmind-ocr": "^1.1.3",
37
- "@dragon708/docmind-pdf": "^2.1.1",
38
- "@dragon708/docmind-shared": "^1.1.1"
35
+ "@dragon708/docmind-docx": "^1.8.0",
36
+ "@dragon708/docmind-markdown": "^1.0.0",
37
+ "@dragon708/docmind-ocr": "^1.1.4",
38
+ "@dragon708/docmind-pdf": "^2.2.0",
39
+ "@dragon708/docmind-shared": "^1.2.0"
39
40
  },
40
41
  "devDependencies": {
41
42
  "@types/node": "^20.19.37",