@dragon708/docmind-node 1.10.0 → 1.11.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.d.ts CHANGED
@@ -6,6 +6,8 @@ import { AnalyzeDocxIncludeFlags, DocxToHtmlOptions, ExtractStructuredDataFromDo
6
6
  export { AnalyzeDocxIncludeFlags, DocxEmbeddedImage, DocxEmbeddedImageConversionOptions, DocxEmbeddedImageConversionResult, DocxEmbeddedImageWebSlice, DocxImageExtractionMode, ExtractImagesFromDocxOptions, ExtractImagesFromDocxResult, ExtractStructuredDataFromDocxOptions, convertDocxEmbeddedImageToWeb, convertDocxImagesForWeb, docxImageIsBrowserRenderable, docxImageRequiresConversion, docxImageSuggestedTargetFormat, docxImageToDataUri, extractImagesFromDocx, extractStructuredDataFromDocx, isNodeJsRuntime } from '@dragon708/docmind-docx';
7
7
  import { PdfAnalyzeOptions, ExtractStructuredDataFromPdfOptions } from '@dragon708/docmind-pdf';
8
8
  export { ExtractStructuredDataFromPdfIncludeFlags, ExtractStructuredDataFromPdfOptions, extractStructuredDataFromPdf } from '@dragon708/docmind-pdf';
9
+ import { RenderLlmTextOptions, RenderMarkdownOptions, RenderMarkdownSectionsOptions, MarkdownSection } from '@dragon708/docmind-markdown';
10
+ export { MarkdownSection } from '@dragon708/docmind-markdown';
9
11
 
10
12
  /**
11
13
  * Options for Node public APIs (`analyzeFile`, intent methods).
@@ -50,6 +52,24 @@ interface NodeExtractStructuredDataOptions extends DocMindAnalyzeOptions {
50
52
  readonly ocr?: ExtractStructuredDataFromImageOptions;
51
53
  readonly normalize?: NormalizeStructuredOptions;
52
54
  }
55
+ /**
56
+ * {@link extractMarkdown}: inherits {@link NodeExtractStructuredDataOptions}; set `markdown` for `renderMarkdown` knobs (`@dragon708/docmind-markdown`).
57
+ */
58
+ interface NodeExtractMarkdownOptions extends NodeExtractStructuredDataOptions {
59
+ readonly markdown?: RenderMarkdownOptions;
60
+ }
61
+ /**
62
+ * {@link extractLlmContent}: same structured fields; `llm` forwards to `renderLlmText`.
63
+ */
64
+ interface NodeExtractLlmContentOptions extends NodeExtractStructuredDataOptions {
65
+ readonly llm?: RenderLlmTextOptions;
66
+ }
67
+ /**
68
+ * {@link extractStructuredChunks}: `chunks` maps to split/render options (`maxChars`, `preferHeadings`, etc.).
69
+ */
70
+ interface NodeExtractStructuredChunksOptions extends NodeExtractStructuredDataOptions {
71
+ readonly chunks?: RenderMarkdownSectionsOptions;
72
+ }
53
73
 
54
74
  /**
55
75
  * Inputs accepted by {@link analyzeFile} in this package.
@@ -104,8 +124,34 @@ declare function runOcr(input: NodeAnalyzeInput, options?: NodeAnalyzeOptions):
104
124
  */
105
125
  declare function extractStructuredData(input: NodeAnalyzeInput, options?: NodeExtractStructuredDataOptions): Promise<StructuredDocumentResult>;
106
126
 
127
+ /**
128
+ * End-to-end: resolve bytes → {@link extractStructuredData} → `renderMarkdown` (`@dragon708/docmind-markdown`).
129
+ *
130
+ * @param input - Path, buffer, or {@link NodeAnalyzeInput} accepted by the Node facade.
131
+ * @param options - Structured routing (`pdf` / `docx` / `ocr` / `normalize`) plus optional `markdown` render options.
132
+ */
133
+ declare function extractMarkdown(input: NodeAnalyzeInput, options?: NodeExtractMarkdownOptions): Promise<string>;
134
+ /**
135
+ * Same routing as {@link extractMarkdown}, then `renderLlmText` for prompts / RAG.
136
+ *
137
+ * @param options - Optional `llm` slice forwarded to `@dragon708/docmind-markdown`.
138
+ */
139
+ declare function extractLlmContent(input: NodeAnalyzeInput, options?: NodeExtractLlmContentOptions): Promise<string>;
140
+ /**
141
+ * Structured extract → `renderMarkdownSections` (Markdown + optional parallel `text` per slice).
142
+ *
143
+ * @param options - Optional `chunks` (e.g. `maxChars`, `preferHeadings`) from `@dragon708/docmind-markdown`.
144
+ */
145
+ declare function extractStructuredChunks(input: NodeAnalyzeInput, options?: NodeExtractStructuredChunksOptions): Promise<readonly MarkdownSection[]>;
146
+
107
147
  /** High-level features the user can ask DocMind for (per input kind and runtime). */
108
- type PublicCapabilityId = "text" | "metadata" | "html" | "ocr" | "pages" | "structured-output" | "ocr-multipage" | "image-normalization" | "tiff" | "heic-node-only" | "bmp" | "gif-first-frame";
148
+ type PublicCapabilityId = "text" | "metadata" | "html" | "ocr" | "pages" | "structured-output"
149
+ /** Node: {@link extractMarkdown} after {@link extractStructuredData} via `@dragon708/docmind-markdown`. */
150
+ | "markdown"
151
+ /** Node: {@link extractLlmContent} (LLM-oriented plain text). */
152
+ | "llm-text"
153
+ /** Node: {@link extractStructuredChunks} (Markdown sections / chunking). */
154
+ | "structured-chunks" | "ocr-multipage" | "image-normalization" | "tiff" | "heic-node-only" | "bmp" | "gif-first-frame";
109
155
  /**
110
156
  * DOCX-only: what the stack can do with OOXML embedded bitmaps/vector payloads under `word/media`.
111
157
  * Present on {@link GetCapabilitiesReport} when `kind === "docx"`.
@@ -193,12 +239,17 @@ interface ExplainAnalysisPlanReport {
193
239
  readonly warnings?: readonly string[];
194
240
  }
195
241
 
242
+ /** Node-only intents layered on `@dragon708/docmind-markdown` after structured extraction. */
243
+ type NodeMarkdownFacadeIntent = "extractMarkdown" | "extractLlmContent" | "extractStructuredChunks";
196
244
  /** Options for {@link explainAnalysisPlan} including PDF/OCR/DOCX hints for accurate planning. */
197
- type NodeExplainAnalysisPlanOptions = ExplainAnalysisPlanOptions & Pick<NodeAnalyzeOptions, "pdf" | "ocr" | "docx" | "structuredOutput" | "output">;
245
+ type NodeExplainAnalysisPlanOptions = Omit<ExplainAnalysisPlanOptions, "intent"> & Pick<NodeAnalyzeOptions, "pdf" | "ocr" | "docx" | "structuredOutput" | "output"> & {
246
+ readonly intent?: DocMindPublicIntent | NodeMarkdownFacadeIntent;
247
+ };
198
248
 
199
249
  /**
200
250
  * Epic 1 — **Capabilities:** after resolving {@link NodeAnalyzeInput}, lists which of
201
- * `text` | `metadata` | `html` | `ocr` | `pages` | `structured-output` (and image-specific ids such as `ocr-multipage`, `image-normalization`, `tiff`, `heic-node-only`, `bmp`, `gif-first-frame`) apply for that kind in Node (for PDF, `text` / `metadata` /
251
+ * `text` | `metadata` | `html` | `ocr` | `pages` | `structured-output` | `markdown` | `llm-text` | `structured-chunks`
252
+ * (and image-specific ids such as `ocr-multipage`, `image-normalization`, `tiff`, `heic-node-only`, `bmp`, `gif-first-frame`) apply for that kind in Node (for PDF, `text` / `metadata` /
202
253
  * `pages` / `ocr` describe the v2 pdf-parse + PDF.js + OCR stack; see {@link buildNodeCapabilityReport}).
203
254
  * For **DOCX**, `docxEmbeddedImages` and `docxStructure` describe ZIP media and optional OOXML v2 extractors (`options.docx.include`).
204
255
  * Does not run Mammoth/Tesseract/PDF bodies beyond path resolution.
@@ -210,4 +261,4 @@ declare function getCapabilities(input: NodeAnalyzeInput, options?: GetCapabilit
210
261
  */
211
262
  declare function explainAnalysisPlan(input: NodeAnalyzeInput, options?: NodeExplainAnalysisPlanOptions): Promise<ExplainAnalysisPlanReport>;
212
263
 
213
- export { DOCX_EMBEDDED_IMAGE_CAPABILITIES, DOCX_STRUCTURE_CAPABILITIES, type DocxEmbeddedImageCapabilities, type DocxStructuralCapabilities, type ExplainAnalysisPlanReport, type GetCapabilitiesReport, type NativeExtractionPlan, type NodeAnalyzeDocxOptionsSlice, type NodeAnalyzeInput, type NodeAnalyzeOptions, type NodeExplainAnalysisPlanOptions, type NodeExtractStructuredDataOptions, type OcrPlan, type PublicCapabilityId, type PublicCapabilitySupport, analyzeFile, bufferToInput, convertToHtml, docxIncludeRequested, explainAnalysisPlan, extractMetadata, extractStructuredData, extractText, getCapabilities, readFileToInput, resolveNodeAnalyzeInput, runOcr };
264
+ export { DOCX_EMBEDDED_IMAGE_CAPABILITIES, DOCX_STRUCTURE_CAPABILITIES, type DocxEmbeddedImageCapabilities, type DocxStructuralCapabilities, type ExplainAnalysisPlanReport, type GetCapabilitiesReport, type NativeExtractionPlan, type NodeAnalyzeDocxOptionsSlice, type NodeAnalyzeInput, type NodeAnalyzeOptions, type NodeExplainAnalysisPlanOptions, type NodeExtractLlmContentOptions, type NodeExtractMarkdownOptions, type NodeExtractStructuredChunksOptions, type NodeExtractStructuredDataOptions, type NodeMarkdownFacadeIntent, type OcrPlan, type PublicCapabilityId, type PublicCapabilitySupport, analyzeFile, bufferToInput, convertToHtml, docxIncludeRequested, explainAnalysisPlan, extractLlmContent, extractMarkdown, extractMetadata, extractStructuredChunks, extractStructuredData, extractText, getCapabilities, readFileToInput, resolveNodeAnalyzeInput, runOcr };
package/dist/index.js CHANGED
@@ -9,6 +9,7 @@ export { extractStructuredDataFromPdf } from '@dragon708/docmind-pdf';
9
9
  import { readFile } from 'fs/promises';
10
10
  import { basename } from 'path';
11
11
  import { fileURLToPath } from 'url';
12
+ import { renderMarkdown, renderLlmText, renderMarkdownSections } from '@dragon708/docmind-markdown';
12
13
 
13
14
  // src/analyze.ts
14
15
 
@@ -617,6 +618,24 @@ async function runOcr(input, options) {
617
618
  return notImplementedResult(kind, "none", [UNKNOWN_FORMAT_WARNING]);
618
619
  }
619
620
  }
621
+ async function extractMarkdown(input, options) {
622
+ throwIfAborted(options?.signal);
623
+ const { markdown: markdownOpts, ...structuredOpts } = options ?? {};
624
+ const structured = await extractStructuredData(input, structuredOpts);
625
+ return renderMarkdown(structured, markdownOpts);
626
+ }
627
+ async function extractLlmContent(input, options) {
628
+ throwIfAborted(options?.signal);
629
+ const { llm: llmOpts, ...structuredOpts } = options ?? {};
630
+ const structured = await extractStructuredData(input, structuredOpts);
631
+ return renderLlmText(structured, llmOpts);
632
+ }
633
+ async function extractStructuredChunks(input, options) {
634
+ throwIfAborted(options?.signal);
635
+ const { chunks: chunkOpts, ...structuredOpts } = options ?? {};
636
+ const structured = await extractStructuredData(input, structuredOpts);
637
+ return renderMarkdownSections(structured, chunkOpts);
638
+ }
620
639
 
621
640
  // src/capabilityReport.ts
622
641
  function docxIncludeRequested(flags) {
@@ -679,6 +698,15 @@ function buildNodeCapabilityReport(kind) {
679
698
  slot("html", true, ["HTML is a <pre> preview of extracted text, not visual layout."]),
680
699
  slot("structured-output", true, [
681
700
  "extractStructuredData: analyzePdf + PDF.js per-page text, outline, links, annotations, merged via normalizeToStructuredResult; respects pdf.ocr like analyzeFile."
701
+ ]),
702
+ slot("markdown", true, [
703
+ "extractMarkdown: same structured pipeline, then `@dragon708/docmind-markdown` renderMarkdown (GFM-style)."
704
+ ]),
705
+ slot("llm-text", true, [
706
+ "extractLlmContent: structured \u2192 compact plain text (renderLlmText) for prompts / embeddings."
707
+ ]),
708
+ slot("structured-chunks", true, [
709
+ "extractStructuredChunks: structured \u2192 Markdown sections via split/render (heading-aware chunking)."
682
710
  ])
683
711
  ];
684
712
  break;
@@ -699,6 +727,13 @@ function buildNodeCapabilityReport(kind) {
699
727
  ]),
700
728
  slot("structured-output", true, [
701
729
  "extractStructuredData runs analyzeDocx with merged OOXML includes (blocks, tables, headings, embedded images, etc.) into StructuredDocumentResult."
730
+ ]),
731
+ slot("markdown", true, [
732
+ "extractMarkdown: structured DOCX envelope \u2192 Markdown (`@dragon708/docmind-markdown`)."
733
+ ]),
734
+ slot("llm-text", true, ["extractLlmContent: structured \u2192 LLM-oriented plain text."]),
735
+ slot("structured-chunks", true, [
736
+ "extractStructuredChunks: structured \u2192 sectioned Markdown chunks."
702
737
  ])
703
738
  ];
704
739
  break;
@@ -733,7 +768,12 @@ function buildNodeCapabilityReport(kind) {
733
768
  ]),
734
769
  slot("structured-output", true, [
735
770
  "extractStructuredData: OCR + layout blocks (ocrImageDetailed / ocrTiff) normalized to StructuredDocumentResult."
736
- ])
771
+ ]),
772
+ slot("markdown", true, [
773
+ "extractMarkdown: OCR structured layout \u2192 Markdown (tables/lists as GFM where blocks exist)."
774
+ ]),
775
+ slot("llm-text", true, ["extractLlmContent: OCR structured \u2192 LLM plain text."]),
776
+ slot("structured-chunks", true, ["extractStructuredChunks: OCR structured \u2192 Markdown sections."])
737
777
  ];
738
778
  break;
739
779
  case "text":
@@ -745,6 +785,13 @@ function buildNodeCapabilityReport(kind) {
745
785
  slot("pages", false),
746
786
  slot("structured-output", true, [
747
787
  "extractStructuredData wraps UTF-8 decode in normalizeToStructuredResult (rollup text only unless you add blocks upstream)."
788
+ ]),
789
+ slot("markdown", true, [
790
+ "extractMarkdown: rollup/decoded text \u2192 Markdown (mostly paragraphs; no layout without upstream blocks)."
791
+ ]),
792
+ slot("llm-text", true, ["extractLlmContent: rollup \u2192 LLM plain text via the same envelope."]),
793
+ slot("structured-chunks", true, [
794
+ "extractStructuredChunks: single-chunk Markdown is typical when only rollup text exists."
748
795
  ])
749
796
  ];
750
797
  break;
@@ -756,7 +803,10 @@ function buildNodeCapabilityReport(kind) {
756
803
  slot("html", false),
757
804
  slot("ocr", false),
758
805
  slot("pages", false),
759
- slot("structured-output", false)
806
+ slot("structured-output", false),
807
+ slot("markdown", false),
808
+ slot("llm-text", false),
809
+ slot("structured-chunks", false)
760
810
  ];
761
811
  }
762
812
  return {
@@ -967,10 +1017,13 @@ function buildNodeExplainReport(kind, intent, pdfOcr, plan, docxInclude, ocrSlic
967
1017
  }
968
1018
  break;
969
1019
  case "extractStructuredData":
1020
+ case "extractMarkdown":
1021
+ case "extractLlmContent":
1022
+ case "extractStructuredChunks":
970
1023
  if (kind === "pdf") {
971
1024
  nativeExtraction = {
972
1025
  willAttempt: true,
973
- description: "extractStructuredData: analyzePdf plus PDF.js page rows, outline, links, and annotations, normalized to StructuredDocumentResult."
1026
+ description: intent === "extractStructuredData" ? "extractStructuredData: analyzePdf plus PDF.js page rows, outline, links, and annotations, normalized to StructuredDocumentResult." : `${intent}: same structured PDF pipeline as extractStructuredData, then \`@dragon708/docmind-markdown\`.`
974
1027
  };
975
1028
  ocr = {
976
1029
  mayUse: pdfOcr !== "off",
@@ -979,7 +1032,7 @@ function buildNodeExplainReport(kind, intent, pdfOcr, plan, docxInclude, ocrSlic
979
1032
  } else if (kind === "docx") {
980
1033
  nativeExtraction = {
981
1034
  willAttempt: true,
982
- description: "extractStructuredData: Mammoth plus merged OOXML extractors (structure, headings, tables, blocks, pagesApprox, embeddedImages) in one envelope."
1035
+ description: intent === "extractStructuredData" ? "extractStructuredData: Mammoth plus merged OOXML extractors (structure, headings, tables, blocks, pagesApprox, embeddedImages) in one envelope." : `${intent}: same structured DOCX pipeline as extractStructuredData, then \`@dragon708/docmind-markdown\`.`
983
1036
  };
984
1037
  ocr = { mayUse: false, description: "DOCX does not use OCR." };
985
1038
  } else if (kind === "image") {
@@ -994,11 +1047,29 @@ function buildNodeExplainReport(kind, intent, pdfOcr, plan, docxInclude, ocrSlic
994
1047
  } else {
995
1048
  nativeExtraction = {
996
1049
  willAttempt: true,
997
- description: "UTF-8 decode with BOM handling; normalizeToStructuredResult produces the structured envelope."
1050
+ description: intent === "extractStructuredData" ? "UTF-8 decode with BOM handling; normalizeToStructuredResult produces the structured envelope." : `${intent}: UTF-8 structured envelope, then \`@dragon708/docmind-markdown\` export.`
998
1051
  };
999
1052
  ocr = { mayUse: false, description: "OCR does not apply to text files." };
1000
1053
  limitations = lim("Plain text has no native layout blocks; `text` carries the decoded content.");
1001
1054
  }
1055
+ if (intent === "extractMarkdown") {
1056
+ limitations = [
1057
+ ...limitations,
1058
+ ...lim("Output: Markdown string via renderMarkdown (tables, headings, lists).")
1059
+ ];
1060
+ } else if (intent === "extractLlmContent") {
1061
+ limitations = [
1062
+ ...limitations,
1063
+ ...lim("Output: compact plain text via renderLlmText (prompt / embedding friendly).")
1064
+ ];
1065
+ } else if (intent === "extractStructuredChunks") {
1066
+ limitations = [
1067
+ ...limitations,
1068
+ ...lim(
1069
+ "Output: MarkdownSection[] via renderMarkdownSections (splitStructuredIntoChunks under the hood)."
1070
+ )
1071
+ ];
1072
+ }
1002
1073
  break;
1003
1074
  default:
1004
1075
  nativeExtraction = { willAttempt: false, description: "Generic intent; see plan." };
@@ -1266,6 +1337,23 @@ function planForIntent(intentOpt, kind, pdfOcrForAnalyze, docxInclude, ocr, anal
1266
1337
  };
1267
1338
  }
1268
1339
  }
1340
+ if (intent === "extractMarkdown" || intent === "extractLlmContent" || intent === "extractStructuredChunks") {
1341
+ const sub = planForIntent(
1342
+ "extractStructuredData",
1343
+ kind,
1344
+ pdfOcrForAnalyze,
1345
+ docxInclude,
1346
+ ocr,
1347
+ analyzeFileOutput
1348
+ );
1349
+ return {
1350
+ intent,
1351
+ steps: [
1352
+ ...sub.steps ?? [],
1353
+ { id: "docmind_markdown_render", status: "planned" }
1354
+ ]
1355
+ };
1356
+ }
1269
1357
  return planAnalyzeFile(kind, pdfOcrForAnalyze, docxInclude, ocr);
1270
1358
  }
1271
1359
  async function getCapabilities(input, options) {
@@ -1291,6 +1379,6 @@ async function explainAnalysisPlan(input, options) {
1291
1379
  return buildNodeExplainReport(kind, intent, pdfOcrAnalyze, plan, docxInc, ocrSlice);
1292
1380
  }
1293
1381
 
1294
- export { DOCX_EMBEDDED_IMAGE_CAPABILITIES, DOCX_STRUCTURE_CAPABILITIES, analyzeFile, bufferToInput, convertToHtml, docxIncludeRequested, explainAnalysisPlan, extractMetadata, extractStructuredData, extractText, getCapabilities, readFileToInput, resolveNodeAnalyzeInput, runOcr };
1382
+ export { DOCX_EMBEDDED_IMAGE_CAPABILITIES, DOCX_STRUCTURE_CAPABILITIES, analyzeFile, bufferToInput, convertToHtml, docxIncludeRequested, explainAnalysisPlan, extractLlmContent, extractMarkdown, extractMetadata, extractStructuredChunks, extractStructuredData, extractText, getCapabilities, readFileToInput, resolveNodeAnalyzeInput, runOcr };
1295
1383
  //# sourceMappingURL=index.js.map
1296
1384
  //# sourceMappingURL=index.js.map
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@dragon708/docmind-node",
3
- "version": "1.10.0",
3
+ "version": "1.11.0",
4
4
  "description": "Official DocMind Node facade: analyzeFile, intent APIs, PDF/DOCX/OCR, and fs helpers.",
5
5
  "type": "module",
6
6
  "main": "./dist/index.js",
@@ -33,6 +33,7 @@
33
33
  "license": "MIT",
34
34
  "dependencies": {
35
35
  "@dragon708/docmind-docx": "^1.8.0",
36
+ "@dragon708/docmind-markdown": "^1.0.0",
36
37
  "@dragon708/docmind-ocr": "^1.1.4",
37
38
  "@dragon708/docmind-pdf": "^2.2.0",
38
39
  "@dragon708/docmind-shared": "^1.2.0"