@dragon708/docmind-node 1.11.0 → 1.12.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.d.ts CHANGED
@@ -6,7 +6,7 @@ import { AnalyzeDocxIncludeFlags, DocxToHtmlOptions, ExtractStructuredDataFromDo
6
6
  export { AnalyzeDocxIncludeFlags, DocxEmbeddedImage, DocxEmbeddedImageConversionOptions, DocxEmbeddedImageConversionResult, DocxEmbeddedImageWebSlice, DocxImageExtractionMode, ExtractImagesFromDocxOptions, ExtractImagesFromDocxResult, ExtractStructuredDataFromDocxOptions, convertDocxEmbeddedImageToWeb, convertDocxImagesForWeb, docxImageIsBrowserRenderable, docxImageRequiresConversion, docxImageSuggestedTargetFormat, docxImageToDataUri, extractImagesFromDocx, extractStructuredDataFromDocx, isNodeJsRuntime } from '@dragon708/docmind-docx';
7
7
  import { PdfAnalyzeOptions, ExtractStructuredDataFromPdfOptions } from '@dragon708/docmind-pdf';
8
8
  export { ExtractStructuredDataFromPdfIncludeFlags, ExtractStructuredDataFromPdfOptions, extractStructuredDataFromPdf } from '@dragon708/docmind-pdf';
9
- import { RenderLlmTextOptions, RenderMarkdownOptions, RenderMarkdownSectionsOptions, MarkdownSection } from '@dragon708/docmind-markdown';
9
+ import { RenderLlmTextOptions, RenderMarkdownOptions, ConvertDocxToMarkdownOptions, ConvertPdfToMarkdownOptions, RenderMarkdownSectionsOptions, MarkdownSection } from '@dragon708/docmind-markdown';
10
10
  export { MarkdownSection } from '@dragon708/docmind-markdown';
11
11
 
12
12
  /**
@@ -53,10 +53,16 @@ interface NodeExtractStructuredDataOptions extends DocMindAnalyzeOptions {
53
53
  readonly normalize?: NormalizeStructuredOptions;
54
54
  }
55
55
  /**
56
- * {@link extractMarkdown}: inherits {@link NodeExtractStructuredDataOptions}; set `markdown` for `renderMarkdown` knobs (`@dragon708/docmind-markdown`).
56
+ * {@link extractMarkdown}: inherits {@link NodeExtractStructuredDataOptions}; `markdown` maps to structured-serializer
57
+ * options passed through `extractMarkdown` in `@dragon708/docmind-markdown`. `markdownDocx` / `markdownPdf`
58
+ * configure Mammoth→Turndown and `@opendataloader/pdf` respectively — separate from `docx` / `pdf` used only by {@link extractStructuredData}.
57
59
  */
58
60
  interface NodeExtractMarkdownOptions extends NodeExtractStructuredDataOptions {
59
61
  readonly markdown?: RenderMarkdownOptions;
62
+ /** Forwarded to `extractMarkdown` → `convertDocxToMarkdown` when bytes are DOCX (Node). */
63
+ readonly markdownDocx?: ConvertDocxToMarkdownOptions;
64
+ /** Forwarded to `extractMarkdown` → `convertPdfToMarkdown` when bytes are PDF. */
65
+ readonly markdownPdf?: ConvertPdfToMarkdownOptions;
60
66
  }
61
67
  /**
62
68
  * {@link extractLlmContent}: same structured fields; `llm` forwards to `renderLlmText`.
@@ -125,20 +131,26 @@ declare function runOcr(input: NodeAnalyzeInput, options?: NodeAnalyzeOptions):
125
131
  declare function extractStructuredData(input: NodeAnalyzeInput, options?: NodeExtractStructuredDataOptions): Promise<StructuredDocumentResult>;
126
132
 
127
133
  /**
128
- * End-to-end: resolve bytes → {@link extractStructuredData} `renderMarkdown` (`@dragon708/docmind-markdown`).
134
+ * End-to-end: {@link extractStructuredData} (for fallback + option parity) plus
135
+ * `extractMarkdown` from `@dragon708/docmind-markdown` on `{ data, filename, mimeType }`.
136
+ * On Node, PDF/DOCX bytes use specialized routes (`@opendataloader/pdf`, Mammoth+Turndown) when detection matches;
137
+ * the structured envelope is always passed as `structuredFallback`.
129
138
  *
130
139
  * @param input - Path, buffer, or {@link NodeAnalyzeInput} accepted by the Node facade.
131
- * @param options - Structured routing (`pdf` / `docx` / `ocr` / `normalize`) plus optional `markdown` render options.
140
+ * @param options - Structured routing (`pdf` / `docx` / `ocr` / `normalize`), optional `markdown` serializer knobs,
141
+ * and optional `markdownDocx` / `markdownPdf` for the binary Markdown pipelines (distinct from structured-only `docx` / `pdf`).
132
142
  */
133
143
  declare function extractMarkdown(input: NodeAnalyzeInput, options?: NodeExtractMarkdownOptions): Promise<string>;
134
144
  /**
135
- * Same routing as {@link extractMarkdown}, then `renderLlmText` for prompts / RAG.
145
+ * {@link extractStructuredData} then `renderLlmText` in `@dragon708/docmind-markdown` (tagged plain text for LLMs).
146
+ * That package's `extractLlmContent` applies the same transform, but only to an in-memory structured result.
136
147
  *
137
- * @param options - Optional `llm` slice forwarded to `@dragon708/docmind-markdown`.
148
+ * @param options - Structured routing plus optional `llm` passed through to `renderLlmText`.
138
149
  */
139
150
  declare function extractLlmContent(input: NodeAnalyzeInput, options?: NodeExtractLlmContentOptions): Promise<string>;
140
151
  /**
141
- * Structured extract → `renderMarkdownSections` (Markdown + optional parallel `text` per slice).
152
+ * Structured extract → {@link renderMarkdownSections} (`splitStructuredIntoChunks` with `includeMarkdown: true`;
153
+ * same layer as `extractStructuredChunks` / `splitStructuredIntoChunks` in `@dragon708/docmind-markdown`).
142
154
  *
143
155
  * @param options - Optional `chunks` (e.g. `maxChars`, `preferHeadings`) from `@dragon708/docmind-markdown`.
144
156
  */
@@ -146,7 +158,7 @@ declare function extractStructuredChunks(input: NodeAnalyzeInput, options?: Node
146
158
 
147
159
  /** High-level features the user can ask DocMind for (per input kind and runtime). */
148
160
  type PublicCapabilityId = "text" | "metadata" | "html" | "ocr" | "pages" | "structured-output"
149
- /** Node: {@link extractMarkdown} after {@link extractStructuredData} via `@dragon708/docmind-markdown`. */
161
+ /** Node: {@link extractMarkdown}, backed by the hybrid `extractMarkdown` in `@dragon708/docmind-markdown` (binary PDF/DOCX routes + structured fallback). */
150
162
  | "markdown"
151
163
  /** Node: {@link extractLlmContent} (LLM-oriented plain text). */
152
164
  | "llm-text"
@@ -248,7 +260,7 @@ type NodeExplainAnalysisPlanOptions = Omit<ExplainAnalysisPlanOptions, "intent">
248
260
 
249
261
  /**
250
262
  * Epic 1 — **Capabilities:** after resolving {@link NodeAnalyzeInput}, lists which of
251
- * `text` | `metadata` | `html` | `ocr` | `pages` | `structured-output` | `markdown` | `llm-text` | `structured-chunks`
263
+ * `text` | `metadata` | `html` | `ocr` | `pages` | `structured-output` | `markdown` (hybrid package extract on Node) | `llm-text` | `structured-chunks` (split + Markdown sections)
252
264
  * (and image-specific ids such as `ocr-multipage`, `image-normalization`, `tiff`, `heic-node-only`, `bmp`, `gif-first-frame`) apply for that kind in Node (for PDF, `text` / `metadata` /
253
265
  * `pages` / `ocr` describe the v2 pdf-parse + PDF.js + OCR stack; see {@link buildNodeCapabilityReport}).
254
266
  * For **DOCX**, `docxEmbeddedImages` and `docxStructure` describe ZIP media and optional OOXML v2 extractors (`options.docx.include`).
package/dist/index.js CHANGED
@@ -1,4 +1,4 @@
1
- import { assertValidAnalyzeFileInput, detectFileKind, normalizeToStructuredResult, UNKNOWN_FORMAT_WARNING, analyzeText, notImplementedResult, analyzeFileRequestsStructured, toUint8Array, isNamedInput, isBinaryInput, isBlob, isFile, getMimeType } from '@dragon708/docmind-shared';
1
+ import { assertValidAnalyzeFileInput, detectFileKind, normalizeToStructuredResult, UNKNOWN_FORMAT_WARNING, analyzeText, notImplementedResult, analyzeFileRequestsStructured, isNamedInput, toUint8Array, isBinaryInput, isBlob, isFile, getMimeType } from '@dragon708/docmind-shared';
2
2
  export { analyzeFileRequestsStructured, detectFileKind, isStructuredDocumentResult } from '@dragon708/docmind-shared';
3
3
  import { extractStructuredDataFromDocx, analyzeDocx } from '@dragon708/docmind-docx';
4
4
  export { convertDocxEmbeddedImageToWeb, convertDocxImagesForWeb, docxImageIsBrowserRenderable, docxImageRequiresConversion, docxImageSuggestedTargetFormat, docxImageToDataUri, extractImagesFromDocx, extractStructuredDataFromDocx, isNodeJsRuntime } from '@dragon708/docmind-docx';
@@ -9,7 +9,7 @@ export { extractStructuredDataFromPdf } from '@dragon708/docmind-pdf';
9
9
  import { readFile } from 'fs/promises';
10
10
  import { basename } from 'path';
11
11
  import { fileURLToPath } from 'url';
12
- import { renderMarkdown, renderLlmText, renderMarkdownSections } from '@dragon708/docmind-markdown';
12
+ import { extractMarkdown as extractMarkdown$1, renderLlmText, renderMarkdownSections } from '@dragon708/docmind-markdown';
13
13
 
14
14
  // src/analyze.ts
15
15
 
@@ -620,9 +620,26 @@ async function runOcr(input, options) {
620
620
  }
621
621
  async function extractMarkdown(input, options) {
622
622
  throwIfAborted(options?.signal);
623
- const { markdown: markdownOpts, ...structuredOpts } = options ?? {};
624
- const structured = await extractStructuredData(input, structuredOpts);
625
- return renderMarkdown(structured, markdownOpts);
623
+ const { markdown: markdownOpts, markdownDocx, markdownPdf, ...structuredOpts } = options ?? {};
624
+ const resolved = await resolveNodeAnalyzeInput(input);
625
+ const structured = await extractStructuredData(resolved, structuredOpts);
626
+ const data = await bytesFromDetectInput(resolved);
627
+ let filename;
628
+ let mimeType;
629
+ if (isNamedInput(resolved)) {
630
+ filename = resolved.name;
631
+ mimeType = resolved.mimeType;
632
+ }
633
+ const r = await extractMarkdown$1(
634
+ { data, filename, mimeType },
635
+ {
636
+ ...markdownOpts ?? {},
637
+ ...markdownDocx !== void 0 ? { docx: markdownDocx } : {},
638
+ ...markdownPdf !== void 0 ? { pdf: markdownPdf } : {},
639
+ structuredFallback: structured
640
+ }
641
+ );
642
+ return r.markdown;
626
643
  }
627
644
  async function extractLlmContent(input, options) {
628
645
  throwIfAborted(options?.signal);
@@ -700,13 +717,13 @@ function buildNodeCapabilityReport(kind) {
700
717
  "extractStructuredData: analyzePdf + PDF.js per-page text, outline, links, annotations, merged via normalizeToStructuredResult; respects pdf.ocr like analyzeFile."
701
718
  ]),
702
719
  slot("markdown", true, [
703
- "extractMarkdown: same structured pipeline, then `@dragon708/docmind-markdown` renderMarkdown (GFM-style)."
720
+ "extractMarkdown: `@dragon708/docmind-markdown` hybrid \u2014 PDF bytes prefer `@opendataloader/pdf` \u2192 Markdown; structured PDF (`extractStructuredData`, respects pdf.ocr) is always built as fallback and for non-binary inputs."
704
721
  ]),
705
722
  slot("llm-text", true, [
706
- "extractLlmContent: structured \u2192 compact plain text (renderLlmText) for prompts / embeddings."
723
+ "extractLlmContent: structured envelope \u2192 LLM-oriented plain text (`renderLlmText` in `@dragon708/docmind-markdown`)."
707
724
  ]),
708
725
  slot("structured-chunks", true, [
709
- "extractStructuredChunks: structured \u2192 Markdown sections via split/render (heading-aware chunking)."
726
+ "extractStructuredChunks: structured \u2192 `renderMarkdownSections` (splitStructuredIntoChunks + Markdown per slice; heading-aware chunking)."
710
727
  ])
711
728
  ];
712
729
  break;
@@ -729,7 +746,7 @@ function buildNodeCapabilityReport(kind) {
729
746
  "extractStructuredData runs analyzeDocx with merged OOXML includes (blocks, tables, headings, embedded images, etc.) into StructuredDocumentResult."
730
747
  ]),
731
748
  slot("markdown", true, [
732
- "extractMarkdown: structured DOCX envelope \u2192 Markdown (`@dragon708/docmind-markdown`)."
749
+ "extractMarkdown: hybrid \u2014 DOCX bytes use Mammoth\u2192Turndown on Node; structured DOCX (`extractStructuredData`, options.docx.include) is always built as fallback."
733
750
  ]),
734
751
  slot("llm-text", true, ["extractLlmContent: structured \u2192 LLM-oriented plain text."]),
735
752
  slot("structured-chunks", true, [
@@ -1023,7 +1040,7 @@ function buildNodeExplainReport(kind, intent, pdfOcr, plan, docxInclude, ocrSlic
1023
1040
  if (kind === "pdf") {
1024
1041
  nativeExtraction = {
1025
1042
  willAttempt: true,
1026
- description: intent === "extractStructuredData" ? "extractStructuredData: analyzePdf plus PDF.js page rows, outline, links, and annotations, normalized to StructuredDocumentResult." : `${intent}: same structured PDF pipeline as extractStructuredData, then \`@dragon708/docmind-markdown\`.`
1043
+ description: intent === "extractStructuredData" ? "extractStructuredData: analyzePdf plus PDF.js page rows, outline, links, and annotations, normalized to StructuredDocumentResult." : intent === "extractMarkdown" ? "extractMarkdown: structured PDF extract (same as extractStructuredData) for fallback; primary Markdown from `@dragon708/docmind-markdown` tries `@opendataloader/pdf` on PDF bytes when possible." : `${intent}: same structured PDF pipeline as extractStructuredData, then \`@dragon708/docmind-markdown\` (renderLlmText or chunk sections).`
1027
1044
  };
1028
1045
  ocr = {
1029
1046
  mayUse: pdfOcr !== "off",
@@ -1032,7 +1049,7 @@ function buildNodeExplainReport(kind, intent, pdfOcr, plan, docxInclude, ocrSlic
1032
1049
  } else if (kind === "docx") {
1033
1050
  nativeExtraction = {
1034
1051
  willAttempt: true,
1035
- description: intent === "extractStructuredData" ? "extractStructuredData: Mammoth plus merged OOXML extractors (structure, headings, tables, blocks, pagesApprox, embeddedImages) in one envelope." : `${intent}: same structured DOCX pipeline as extractStructuredData, then \`@dragon708/docmind-markdown\`.`
1052
+ description: intent === "extractStructuredData" ? "extractStructuredData: Mammoth plus merged OOXML extractors (structure, headings, tables, blocks, pagesApprox, embeddedImages) in one envelope." : intent === "extractMarkdown" ? "extractMarkdown: structured DOCX envelope for fallback; primary Markdown from Mammoth\u2192Turndown on DOCX bytes when possible (`@dragon708/docmind-markdown`)." : `${intent}: same structured DOCX pipeline as extractStructuredData, then \`@dragon708/docmind-markdown\`.`
1036
1053
  };
1037
1054
  ocr = { mayUse: false, description: "DOCX does not use OCR." };
1038
1055
  } else if (kind === "image") {
@@ -1055,7 +1072,9 @@ function buildNodeExplainReport(kind, intent, pdfOcr, plan, docxInclude, ocrSlic
1055
1072
  if (intent === "extractMarkdown") {
1056
1073
  limitations = [
1057
1074
  ...limitations,
1058
- ...lim("Output: Markdown string via renderMarkdown (tables, headings, lists).")
1075
+ ...lim(
1076
+ "Output: Markdown string from `@dragon708/docmind-markdown` extractMarkdown (PDF/DOCX binary routes on Node when applicable; structured serializer as fallback)."
1077
+ )
1059
1078
  ];
1060
1079
  } else if (intent === "extractLlmContent") {
1061
1080
  limitations = [
@@ -1066,7 +1085,7 @@ function buildNodeExplainReport(kind, intent, pdfOcr, plan, docxInclude, ocrSlic
1066
1085
  limitations = [
1067
1086
  ...limitations,
1068
1087
  ...lim(
1069
- "Output: MarkdownSection[] via renderMarkdownSections (splitStructuredIntoChunks under the hood)."
1088
+ "Output: MarkdownSection[] via renderMarkdownSections (splitStructuredIntoChunks / extractStructuredChunks alias in `@dragon708/docmind-markdown`)."
1070
1089
  )
1071
1090
  ];
1072
1091
  }
@@ -1337,7 +1356,27 @@ function planForIntent(intentOpt, kind, pdfOcrForAnalyze, docxInclude, ocr, anal
1337
1356
  };
1338
1357
  }
1339
1358
  }
1340
- if (intent === "extractMarkdown" || intent === "extractLlmContent" || intent === "extractStructuredChunks") {
1359
+ if (intent === "extractMarkdown") {
1360
+ const sub = planForIntent(
1361
+ "extractStructuredData",
1362
+ kind,
1363
+ pdfOcrForAnalyze,
1364
+ docxInclude,
1365
+ ocr,
1366
+ analyzeFileOutput
1367
+ );
1368
+ return {
1369
+ intent,
1370
+ steps: [
1371
+ ...sub.steps ?? [],
1372
+ {
1373
+ id: "markdown_hybrid_package",
1374
+ status: "planned"
1375
+ }
1376
+ ]
1377
+ };
1378
+ }
1379
+ if (intent === "extractLlmContent" || intent === "extractStructuredChunks") {
1341
1380
  const sub = planForIntent(
1342
1381
  "extractStructuredData",
1343
1382
  kind,
@@ -1350,7 +1389,10 @@ function planForIntent(intentOpt, kind, pdfOcrForAnalyze, docxInclude, ocr, anal
1350
1389
  intent,
1351
1390
  steps: [
1352
1391
  ...sub.steps ?? [],
1353
- { id: "docmind_markdown_render", status: "planned" }
1392
+ {
1393
+ id: intent === "extractLlmContent" ? "docmind_markdown_llm_text" : "docmind_markdown_chunk_sections",
1394
+ status: "planned"
1395
+ }
1354
1396
  ]
1355
1397
  };
1356
1398
  }
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@dragon708/docmind-node",
3
- "version": "1.11.0",
3
+ "version": "1.12.1",
4
4
  "description": "Official DocMind Node facade: analyzeFile, intent APIs, PDF/DOCX/OCR, and fs helpers.",
5
5
  "type": "module",
6
6
  "main": "./dist/index.js",
@@ -33,7 +33,7 @@
33
33
  "license": "MIT",
34
34
  "dependencies": {
35
35
  "@dragon708/docmind-docx": "^1.8.0",
36
- "@dragon708/docmind-markdown": "^1.0.0",
36
+ "@dragon708/docmind-markdown": "^1.1.1",
37
37
  "@dragon708/docmind-ocr": "^1.1.4",
38
38
  "@dragon708/docmind-pdf": "^2.2.0",
39
39
  "@dragon708/docmind-shared": "^1.2.0"