@dragon708/docmind-node 1.10.0 → 1.12.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.d.ts CHANGED
@@ -6,6 +6,8 @@ import { AnalyzeDocxIncludeFlags, DocxToHtmlOptions, ExtractStructuredDataFromDo
6
6
  export { AnalyzeDocxIncludeFlags, DocxEmbeddedImage, DocxEmbeddedImageConversionOptions, DocxEmbeddedImageConversionResult, DocxEmbeddedImageWebSlice, DocxImageExtractionMode, ExtractImagesFromDocxOptions, ExtractImagesFromDocxResult, ExtractStructuredDataFromDocxOptions, convertDocxEmbeddedImageToWeb, convertDocxImagesForWeb, docxImageIsBrowserRenderable, docxImageRequiresConversion, docxImageSuggestedTargetFormat, docxImageToDataUri, extractImagesFromDocx, extractStructuredDataFromDocx, isNodeJsRuntime } from '@dragon708/docmind-docx';
7
7
  import { PdfAnalyzeOptions, ExtractStructuredDataFromPdfOptions } from '@dragon708/docmind-pdf';
8
8
  export { ExtractStructuredDataFromPdfIncludeFlags, ExtractStructuredDataFromPdfOptions, extractStructuredDataFromPdf } from '@dragon708/docmind-pdf';
9
+ import { RenderLlmTextOptions, RenderMarkdownOptions, ConvertDocxToMarkdownOptions, ConvertPdfToMarkdownOptions, RenderMarkdownSectionsOptions, MarkdownSection } from '@dragon708/docmind-markdown';
10
+ export { MarkdownSection } from '@dragon708/docmind-markdown';
9
11
 
10
12
  /**
11
13
  * Options for Node public APIs (`analyzeFile`, intent methods).
@@ -50,6 +52,30 @@ interface NodeExtractStructuredDataOptions extends DocMindAnalyzeOptions {
50
52
  readonly ocr?: ExtractStructuredDataFromImageOptions;
51
53
  readonly normalize?: NormalizeStructuredOptions;
52
54
  }
55
+ /**
56
+ * {@link extractMarkdown}: inherits {@link NodeExtractStructuredDataOptions}; `markdown` maps to structured-serializer
57
+ * options passed through `extractMarkdown` in `@dragon708/docmind-markdown`. `markdownDocx` / `markdownPdf`
58
+ * configure Mammoth→Turndown and `@opendataloader/pdf` respectively — separate from `docx` / `pdf`, which only affect the structured pass run through {@link extractStructuredData}.
59
+ */
60
+ interface NodeExtractMarkdownOptions extends NodeExtractStructuredDataOptions {
61
+ readonly markdown?: RenderMarkdownOptions;
62
+ /** Forwarded to `extractMarkdown` → `convertDocxToMarkdown` when bytes are DOCX (Node). */
63
+ readonly markdownDocx?: ConvertDocxToMarkdownOptions;
64
+ /** Forwarded to `extractMarkdown` → `convertPdfToMarkdown` when bytes are PDF. */
65
+ readonly markdownPdf?: ConvertPdfToMarkdownOptions;
66
+ }
67
+ /**
68
+ * {@link extractLlmContent}: same structured fields; `llm` forwards to `renderLlmText`.
69
+ */
70
+ interface NodeExtractLlmContentOptions extends NodeExtractStructuredDataOptions {
71
+ readonly llm?: RenderLlmTextOptions;
72
+ }
73
+ /**
74
+ * {@link extractStructuredChunks}: `chunks` maps to split/render options (`maxChars`, `preferHeadings`, etc.).
75
+ */
76
+ interface NodeExtractStructuredChunksOptions extends NodeExtractStructuredDataOptions {
77
+ readonly chunks?: RenderMarkdownSectionsOptions;
78
+ }
53
79
 
54
80
  /**
55
81
  * Inputs accepted by {@link analyzeFile} in this package.
@@ -104,8 +130,40 @@ declare function runOcr(input: NodeAnalyzeInput, options?: NodeAnalyzeOptions):
104
130
  */
105
131
  declare function extractStructuredData(input: NodeAnalyzeInput, options?: NodeExtractStructuredDataOptions): Promise<StructuredDocumentResult>;
106
132
 
133
+ /**
134
+ * End-to-end: {@link extractStructuredData} (for fallback + option parity) plus
135
+ * `extractMarkdown` from `@dragon708/docmind-markdown` on `{ data, filename, mimeType }`.
136
+ * On Node, PDF/DOCX bytes use specialized routes (`@opendataloader/pdf`, Mammoth+Turndown) when detection matches;
137
+ * the structured envelope is always passed as `structuredFallback`.
138
+ *
139
+ * @param input - Path, buffer, or {@link NodeAnalyzeInput} accepted by the Node facade.
140
+ * @param options - Structured routing (`pdf` / `docx` / `ocr` / `normalize`), optional `markdown` serializer knobs,
141
+ * and optional `markdownDocx` / `markdownPdf` for the binary Markdown pipelines (distinct from structured-only `docx` / `pdf`).
142
+ */
143
+ declare function extractMarkdown(input: NodeAnalyzeInput, options?: NodeExtractMarkdownOptions): Promise<string>;
144
+ /**
145
+ * {@link extractStructuredData} then `renderLlmText` in `@dragon708/docmind-markdown` (tagged plain text for LLMs).
146
+ * That package's `extractLlmContent` is the same transform on an in-memory structured result only.
147
+ *
148
+ * @param options - Structured routing plus optional `llm` passed through to `renderLlmText`.
149
+ */
150
+ declare function extractLlmContent(input: NodeAnalyzeInput, options?: NodeExtractLlmContentOptions): Promise<string>;
151
+ /**
152
+ * Structured extract → `renderMarkdownSections` from `@dragon708/docmind-markdown` (`splitStructuredIntoChunks` with `includeMarkdown: true`;
153
+ * same layer as `extractStructuredChunks` / `splitStructuredIntoChunks` in `@dragon708/docmind-markdown`).
154
+ *
155
+ * @param options - Structured routing plus optional `chunks` (e.g. `maxChars`, `preferHeadings`) passed through to `renderMarkdownSections` in `@dragon708/docmind-markdown`.
156
+ */
157
+ declare function extractStructuredChunks(input: NodeAnalyzeInput, options?: NodeExtractStructuredChunksOptions): Promise<readonly MarkdownSection[]>;
158
+
107
159
  /** High-level features the user can ask DocMind for (per input kind and runtime). */
108
- type PublicCapabilityId = "text" | "metadata" | "html" | "ocr" | "pages" | "structured-output" | "ocr-multipage" | "image-normalization" | "tiff" | "heic-node-only" | "bmp" | "gif-first-frame";
160
+ type PublicCapabilityId = "text" | "metadata" | "html" | "ocr" | "pages" | "structured-output"
161
+ /** Node: {@link extractMarkdown} — hybrid `extractMarkdown` in `@dragon708/docmind-markdown` (binary PDF/DOCX routes + structured fallback). */
162
+ | "markdown"
163
+ /** Node: {@link extractLlmContent} (LLM-oriented plain text). */
164
+ | "llm-text"
165
+ /** Node: {@link extractStructuredChunks} (Markdown sections / chunking). */
166
+ | "structured-chunks" | "ocr-multipage" | "image-normalization" | "tiff" | "heic-node-only" | "bmp" | "gif-first-frame";
109
167
  /**
110
168
  * DOCX-only: what the stack can do with OOXML embedded bitmaps/vector payloads under `word/media`.
111
169
  * Present on {@link GetCapabilitiesReport} when `kind === "docx"`.
@@ -193,12 +251,17 @@ interface ExplainAnalysisPlanReport {
193
251
  readonly warnings?: readonly string[];
194
252
  }
195
253
 
254
+ /** Node-only intents layered on `@dragon708/docmind-markdown` after structured extraction. */
255
+ type NodeMarkdownFacadeIntent = "extractMarkdown" | "extractLlmContent" | "extractStructuredChunks";
196
256
  /** Options for {@link explainAnalysisPlan} including PDF/OCR/DOCX hints for accurate planning. */
197
- type NodeExplainAnalysisPlanOptions = ExplainAnalysisPlanOptions & Pick<NodeAnalyzeOptions, "pdf" | "ocr" | "docx" | "structuredOutput" | "output">;
257
+ type NodeExplainAnalysisPlanOptions = Omit<ExplainAnalysisPlanOptions, "intent"> & Pick<NodeAnalyzeOptions, "pdf" | "ocr" | "docx" | "structuredOutput" | "output"> & {
258
+ readonly intent?: DocMindPublicIntent | NodeMarkdownFacadeIntent;
259
+ };
198
260
 
199
261
  /**
200
262
  * Epic 1 — **Capabilities:** after resolving {@link NodeAnalyzeInput}, lists which of
201
- * `text` | `metadata` | `html` | `ocr` | `pages` | `structured-output` (and image-specific ids such as `ocr-multipage`, `image-normalization`, `tiff`, `heic-node-only`, `bmp`, `gif-first-frame`) apply for that kind in Node (for PDF, `text` / `metadata` /
263
+ * `text` | `metadata` | `html` | `ocr` | `pages` | `structured-output` | `markdown` (hybrid package extract on Node) | `llm-text` | `structured-chunks` (split + Markdown sections)
264
+ * (and image-specific ids such as `ocr-multipage`, `image-normalization`, `tiff`, `heic-node-only`, `bmp`, `gif-first-frame`) apply for that kind in Node (for PDF, `text` / `metadata` /
202
265
  * `pages` / `ocr` describe the v2 pdf-parse + PDF.js + OCR stack; see {@link buildNodeCapabilityReport}).
203
266
  * For **DOCX**, `docxEmbeddedImages` and `docxStructure` describe ZIP media and optional OOXML v2 extractors (`options.docx.include`).
204
267
  * Does not run Mammoth/Tesseract/PDF bodies beyond path resolution.
@@ -210,4 +273,4 @@ declare function getCapabilities(input: NodeAnalyzeInput, options?: GetCapabilit
210
273
  */
211
274
  declare function explainAnalysisPlan(input: NodeAnalyzeInput, options?: NodeExplainAnalysisPlanOptions): Promise<ExplainAnalysisPlanReport>;
212
275
 
213
- export { DOCX_EMBEDDED_IMAGE_CAPABILITIES, DOCX_STRUCTURE_CAPABILITIES, type DocxEmbeddedImageCapabilities, type DocxStructuralCapabilities, type ExplainAnalysisPlanReport, type GetCapabilitiesReport, type NativeExtractionPlan, type NodeAnalyzeDocxOptionsSlice, type NodeAnalyzeInput, type NodeAnalyzeOptions, type NodeExplainAnalysisPlanOptions, type NodeExtractStructuredDataOptions, type OcrPlan, type PublicCapabilityId, type PublicCapabilitySupport, analyzeFile, bufferToInput, convertToHtml, docxIncludeRequested, explainAnalysisPlan, extractMetadata, extractStructuredData, extractText, getCapabilities, readFileToInput, resolveNodeAnalyzeInput, runOcr };
276
+ export { DOCX_EMBEDDED_IMAGE_CAPABILITIES, DOCX_STRUCTURE_CAPABILITIES, type DocxEmbeddedImageCapabilities, type DocxStructuralCapabilities, type ExplainAnalysisPlanReport, type GetCapabilitiesReport, type NativeExtractionPlan, type NodeAnalyzeDocxOptionsSlice, type NodeAnalyzeInput, type NodeAnalyzeOptions, type NodeExplainAnalysisPlanOptions, type NodeExtractLlmContentOptions, type NodeExtractMarkdownOptions, type NodeExtractStructuredChunksOptions, type NodeExtractStructuredDataOptions, type NodeMarkdownFacadeIntent, type OcrPlan, type PublicCapabilityId, type PublicCapabilitySupport, analyzeFile, bufferToInput, convertToHtml, docxIncludeRequested, explainAnalysisPlan, extractLlmContent, extractMarkdown, extractMetadata, extractStructuredChunks, extractStructuredData, extractText, getCapabilities, readFileToInput, resolveNodeAnalyzeInput, runOcr };
package/dist/index.js CHANGED
@@ -1,4 +1,4 @@
1
- import { assertValidAnalyzeFileInput, detectFileKind, normalizeToStructuredResult, UNKNOWN_FORMAT_WARNING, analyzeText, notImplementedResult, analyzeFileRequestsStructured, toUint8Array, isNamedInput, isBinaryInput, isBlob, isFile, getMimeType } from '@dragon708/docmind-shared';
1
+ import { assertValidAnalyzeFileInput, detectFileKind, normalizeToStructuredResult, UNKNOWN_FORMAT_WARNING, analyzeText, notImplementedResult, analyzeFileRequestsStructured, isNamedInput, toUint8Array, isBinaryInput, isBlob, isFile, getMimeType } from '@dragon708/docmind-shared';
2
2
  export { analyzeFileRequestsStructured, detectFileKind, isStructuredDocumentResult } from '@dragon708/docmind-shared';
3
3
  import { extractStructuredDataFromDocx, analyzeDocx } from '@dragon708/docmind-docx';
4
4
  export { convertDocxEmbeddedImageToWeb, convertDocxImagesForWeb, docxImageIsBrowserRenderable, docxImageRequiresConversion, docxImageSuggestedTargetFormat, docxImageToDataUri, extractImagesFromDocx, extractStructuredDataFromDocx, isNodeJsRuntime } from '@dragon708/docmind-docx';
@@ -9,6 +9,7 @@ export { extractStructuredDataFromPdf } from '@dragon708/docmind-pdf';
9
9
  import { readFile } from 'fs/promises';
10
10
  import { basename } from 'path';
11
11
  import { fileURLToPath } from 'url';
12
+ import { extractMarkdown as extractMarkdown$1, renderLlmText, renderMarkdownSections } from '@dragon708/docmind-markdown';
12
13
 
13
14
  // src/analyze.ts
14
15
 
@@ -617,6 +618,41 @@ async function runOcr(input, options) {
617
618
  return notImplementedResult(kind, "none", [UNKNOWN_FORMAT_WARNING]);
618
619
  }
619
620
  }
621
+ async function extractMarkdown(input, options) {
622
+ throwIfAborted(options?.signal);
623
+ const { markdown: markdownOpts, markdownDocx, markdownPdf, ...structuredOpts } = options ?? {};
624
+ const resolved = await resolveNodeAnalyzeInput(input);
625
+ const structured = await extractStructuredData(resolved, structuredOpts);
626
+ const data = await bytesFromDetectInput(resolved);
627
+ let filename;
628
+ let mimeType;
629
+ if (isNamedInput(resolved)) {
630
+ filename = resolved.name;
631
+ mimeType = resolved.mimeType;
632
+ }
633
+ const r = await extractMarkdown$1(
634
+ { data, filename, mimeType },
635
+ {
636
+ ...markdownOpts ?? {},
637
+ ...markdownDocx !== void 0 ? { docx: markdownDocx } : {},
638
+ ...markdownPdf !== void 0 ? { pdf: markdownPdf } : {},
639
+ structuredFallback: structured
640
+ }
641
+ );
642
+ return r.markdown;
643
+ }
644
+ async function extractLlmContent(input, options) {
645
+ throwIfAborted(options?.signal);
646
+ const { llm: llmOpts, ...structuredOpts } = options ?? {};
647
+ const structured = await extractStructuredData(input, structuredOpts);
648
+ return renderLlmText(structured, llmOpts);
649
+ }
650
+ async function extractStructuredChunks(input, options) {
651
+ throwIfAborted(options?.signal);
652
+ const { chunks: chunkOpts, ...structuredOpts } = options ?? {};
653
+ const structured = await extractStructuredData(input, structuredOpts);
654
+ return renderMarkdownSections(structured, chunkOpts);
655
+ }
620
656
 
621
657
  // src/capabilityReport.ts
622
658
  function docxIncludeRequested(flags) {
@@ -679,6 +715,15 @@ function buildNodeCapabilityReport(kind) {
679
715
  slot("html", true, ["HTML is a <pre> preview of extracted text, not visual layout."]),
680
716
  slot("structured-output", true, [
681
717
  "extractStructuredData: analyzePdf + PDF.js per-page text, outline, links, annotations, merged via normalizeToStructuredResult; respects pdf.ocr like analyzeFile."
718
+ ]),
719
+ slot("markdown", true, [
720
+ "extractMarkdown: `@dragon708/docmind-markdown` hybrid \u2014 PDF bytes prefer `@opendataloader/pdf` \u2192 Markdown; structured PDF (`extractStructuredData`, respects pdf.ocr) is always built as fallback and for non-binary inputs."
721
+ ]),
722
+ slot("llm-text", true, [
723
+ "extractLlmContent: structured envelope \u2192 LLM-oriented plain text (`renderLlmText` in `@dragon708/docmind-markdown`)."
724
+ ]),
725
+ slot("structured-chunks", true, [
726
+ "extractStructuredChunks: structured \u2192 `renderMarkdownSections` (splitStructuredIntoChunks + Markdown per slice; heading-aware chunking)."
682
727
  ])
683
728
  ];
684
729
  break;
@@ -699,6 +744,13 @@ function buildNodeCapabilityReport(kind) {
699
744
  ]),
700
745
  slot("structured-output", true, [
701
746
  "extractStructuredData runs analyzeDocx with merged OOXML includes (blocks, tables, headings, embedded images, etc.) into StructuredDocumentResult."
747
+ ]),
748
+ slot("markdown", true, [
749
+ "extractMarkdown: hybrid \u2014 DOCX bytes use Mammoth\u2192Turndown on Node; structured DOCX (`extractStructuredData`, options.docx.include) is always built as fallback."
750
+ ]),
751
+ slot("llm-text", true, ["extractLlmContent: structured \u2192 LLM-oriented plain text."]),
752
+ slot("structured-chunks", true, [
753
+ "extractStructuredChunks: structured \u2192 sectioned Markdown chunks."
702
754
  ])
703
755
  ];
704
756
  break;
@@ -733,7 +785,12 @@ function buildNodeCapabilityReport(kind) {
733
785
  ]),
734
786
  slot("structured-output", true, [
735
787
  "extractStructuredData: OCR + layout blocks (ocrImageDetailed / ocrTiff) normalized to StructuredDocumentResult."
736
- ])
788
+ ]),
789
+ slot("markdown", true, [
790
+ "extractMarkdown: OCR structured layout \u2192 Markdown (tables/lists as GFM where blocks exist)."
791
+ ]),
792
+ slot("llm-text", true, ["extractLlmContent: OCR structured \u2192 LLM plain text."]),
793
+ slot("structured-chunks", true, ["extractStructuredChunks: OCR structured \u2192 Markdown sections."])
737
794
  ];
738
795
  break;
739
796
  case "text":
@@ -745,6 +802,13 @@ function buildNodeCapabilityReport(kind) {
745
802
  slot("pages", false),
746
803
  slot("structured-output", true, [
747
804
  "extractStructuredData wraps UTF-8 decode in normalizeToStructuredResult (rollup text only unless you add blocks upstream)."
805
+ ]),
806
+ slot("markdown", true, [
807
+ "extractMarkdown: rollup/decoded text \u2192 Markdown (mostly paragraphs; no layout without upstream blocks)."
808
+ ]),
809
+ slot("llm-text", true, ["extractLlmContent: rollup \u2192 LLM plain text via the same envelope."]),
810
+ slot("structured-chunks", true, [
811
+ "extractStructuredChunks: single-chunk Markdown is typical when only rollup text exists."
748
812
  ])
749
813
  ];
750
814
  break;
@@ -756,7 +820,10 @@ function buildNodeCapabilityReport(kind) {
756
820
  slot("html", false),
757
821
  slot("ocr", false),
758
822
  slot("pages", false),
759
- slot("structured-output", false)
823
+ slot("structured-output", false),
824
+ slot("markdown", false),
825
+ slot("llm-text", false),
826
+ slot("structured-chunks", false)
760
827
  ];
761
828
  }
762
829
  return {
@@ -967,10 +1034,13 @@ function buildNodeExplainReport(kind, intent, pdfOcr, plan, docxInclude, ocrSlic
967
1034
  }
968
1035
  break;
969
1036
  case "extractStructuredData":
1037
+ case "extractMarkdown":
1038
+ case "extractLlmContent":
1039
+ case "extractStructuredChunks":
970
1040
  if (kind === "pdf") {
971
1041
  nativeExtraction = {
972
1042
  willAttempt: true,
973
- description: "extractStructuredData: analyzePdf plus PDF.js page rows, outline, links, and annotations, normalized to StructuredDocumentResult."
1043
+ description: intent === "extractStructuredData" ? "extractStructuredData: analyzePdf plus PDF.js page rows, outline, links, and annotations, normalized to StructuredDocumentResult." : intent === "extractMarkdown" ? "extractMarkdown: structured PDF extract (same as extractStructuredData) for fallback; primary Markdown from `@dragon708/docmind-markdown` tries `@opendataloader/pdf` on PDF bytes when possible." : `${intent}: same structured PDF pipeline as extractStructuredData, then \`@dragon708/docmind-markdown\` (renderLlmText or chunk sections).`
974
1044
  };
975
1045
  ocr = {
976
1046
  mayUse: pdfOcr !== "off",
@@ -979,7 +1049,7 @@ function buildNodeExplainReport(kind, intent, pdfOcr, plan, docxInclude, ocrSlic
979
1049
  } else if (kind === "docx") {
980
1050
  nativeExtraction = {
981
1051
  willAttempt: true,
982
- description: "extractStructuredData: Mammoth plus merged OOXML extractors (structure, headings, tables, blocks, pagesApprox, embeddedImages) in one envelope."
1052
+ description: intent === "extractStructuredData" ? "extractStructuredData: Mammoth plus merged OOXML extractors (structure, headings, tables, blocks, pagesApprox, embeddedImages) in one envelope." : intent === "extractMarkdown" ? "extractMarkdown: structured DOCX envelope for fallback; primary Markdown from Mammoth\u2192Turndown on DOCX bytes when possible (`@dragon708/docmind-markdown`)." : `${intent}: same structured DOCX pipeline as extractStructuredData, then \`@dragon708/docmind-markdown\`.`
983
1053
  };
984
1054
  ocr = { mayUse: false, description: "DOCX does not use OCR." };
985
1055
  } else if (kind === "image") {
@@ -994,11 +1064,31 @@ function buildNodeExplainReport(kind, intent, pdfOcr, plan, docxInclude, ocrSlic
994
1064
  } else {
995
1065
  nativeExtraction = {
996
1066
  willAttempt: true,
997
- description: "UTF-8 decode with BOM handling; normalizeToStructuredResult produces the structured envelope."
1067
+ description: intent === "extractStructuredData" ? "UTF-8 decode with BOM handling; normalizeToStructuredResult produces the structured envelope." : `${intent}: UTF-8 structured envelope, then \`@dragon708/docmind-markdown\` export.`
998
1068
  };
999
1069
  ocr = { mayUse: false, description: "OCR does not apply to text files." };
1000
1070
  limitations = lim("Plain text has no native layout blocks; `text` carries the decoded content.");
1001
1071
  }
1072
+ if (intent === "extractMarkdown") {
1073
+ limitations = [
1074
+ ...limitations,
1075
+ ...lim(
1076
+ "Output: Markdown string from `@dragon708/docmind-markdown` extractMarkdown (PDF/DOCX binary routes on Node when applicable; structured serializer as fallback)."
1077
+ )
1078
+ ];
1079
+ } else if (intent === "extractLlmContent") {
1080
+ limitations = [
1081
+ ...limitations,
1082
+ ...lim("Output: compact plain text via renderLlmText (prompt / embedding friendly).")
1083
+ ];
1084
+ } else if (intent === "extractStructuredChunks") {
1085
+ limitations = [
1086
+ ...limitations,
1087
+ ...lim(
1088
+ "Output: MarkdownSection[] via renderMarkdownSections (splitStructuredIntoChunks / extractStructuredChunks alias in `@dragon708/docmind-markdown`)."
1089
+ )
1090
+ ];
1091
+ }
1002
1092
  break;
1003
1093
  default:
1004
1094
  nativeExtraction = { willAttempt: false, description: "Generic intent; see plan." };
@@ -1266,6 +1356,46 @@ function planForIntent(intentOpt, kind, pdfOcrForAnalyze, docxInclude, ocr, anal
1266
1356
  };
1267
1357
  }
1268
1358
  }
1359
+ if (intent === "extractMarkdown") {
1360
+ const sub = planForIntent(
1361
+ "extractStructuredData",
1362
+ kind,
1363
+ pdfOcrForAnalyze,
1364
+ docxInclude,
1365
+ ocr,
1366
+ analyzeFileOutput
1367
+ );
1368
+ return {
1369
+ intent,
1370
+ steps: [
1371
+ ...sub.steps ?? [],
1372
+ {
1373
+ id: "markdown_hybrid_package",
1374
+ status: "planned"
1375
+ }
1376
+ ]
1377
+ };
1378
+ }
1379
+ if (intent === "extractLlmContent" || intent === "extractStructuredChunks") {
1380
+ const sub = planForIntent(
1381
+ "extractStructuredData",
1382
+ kind,
1383
+ pdfOcrForAnalyze,
1384
+ docxInclude,
1385
+ ocr,
1386
+ analyzeFileOutput
1387
+ );
1388
+ return {
1389
+ intent,
1390
+ steps: [
1391
+ ...sub.steps ?? [],
1392
+ {
1393
+ id: intent === "extractLlmContent" ? "docmind_markdown_llm_text" : "docmind_markdown_chunk_sections",
1394
+ status: "planned"
1395
+ }
1396
+ ]
1397
+ };
1398
+ }
1269
1399
  return planAnalyzeFile(kind, pdfOcrForAnalyze, docxInclude, ocr);
1270
1400
  }
1271
1401
  async function getCapabilities(input, options) {
@@ -1291,6 +1421,6 @@ async function explainAnalysisPlan(input, options) {
1291
1421
  return buildNodeExplainReport(kind, intent, pdfOcrAnalyze, plan, docxInc, ocrSlice);
1292
1422
  }
1293
1423
 
1294
- export { DOCX_EMBEDDED_IMAGE_CAPABILITIES, DOCX_STRUCTURE_CAPABILITIES, analyzeFile, bufferToInput, convertToHtml, docxIncludeRequested, explainAnalysisPlan, extractMetadata, extractStructuredData, extractText, getCapabilities, readFileToInput, resolveNodeAnalyzeInput, runOcr };
1424
+ export { DOCX_EMBEDDED_IMAGE_CAPABILITIES, DOCX_STRUCTURE_CAPABILITIES, analyzeFile, bufferToInput, convertToHtml, docxIncludeRequested, explainAnalysisPlan, extractLlmContent, extractMarkdown, extractMetadata, extractStructuredChunks, extractStructuredData, extractText, getCapabilities, readFileToInput, resolveNodeAnalyzeInput, runOcr };
1295
1425
  //# sourceMappingURL=index.js.map
1296
1426
  //# sourceMappingURL=index.js.map
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@dragon708/docmind-node",
3
- "version": "1.10.0",
3
+ "version": "1.12.0",
4
4
  "description": "Official DocMind Node facade: analyzeFile, intent APIs, PDF/DOCX/OCR, and fs helpers.",
5
5
  "type": "module",
6
6
  "main": "./dist/index.js",
@@ -33,6 +33,7 @@
33
33
  "license": "MIT",
34
34
  "dependencies": {
35
35
  "@dragon708/docmind-docx": "^1.8.0",
36
+ "@dragon708/docmind-markdown": "^1.1.0",
36
37
  "@dragon708/docmind-ocr": "^1.1.4",
37
38
  "@dragon708/docmind-pdf": "^2.2.0",
38
39
  "@dragon708/docmind-shared": "^1.2.0"