@dragon708/docmind-node 1.13.2 → 1.13.4

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
package/dist/index.d.ts CHANGED
@@ -6,8 +6,8 @@ import { AnalyzeDocxIncludeFlags, DocxToHtmlOptions, ExtractStructuredDataFromDo
6
6
  export { AnalyzeDocxIncludeFlags, DocxEmbeddedImage, DocxEmbeddedImageConversionOptions, DocxEmbeddedImageConversionResult, DocxEmbeddedImageWebSlice, DocxImageExtractionMode, ExtractImagesFromDocxOptions, ExtractImagesFromDocxResult, ExtractStructuredDataFromDocxOptions, convertDocxEmbeddedImageToWeb, convertDocxImagesForWeb, docxImageIsBrowserRenderable, docxImageRequiresConversion, docxImageSuggestedTargetFormat, docxImageToDataUri, extractImagesFromDocx, extractStructuredDataFromDocx, isNodeJsRuntime } from '@dragon708/docmind-docx';
7
7
  import { PdfAnalyzeOptions, ExtractStructuredDataFromPdfOptions } from '@dragon708/docmind-pdf';
8
8
  export { ExtractStructuredDataFromPdfIncludeFlags, ExtractStructuredDataFromPdfOptions, extractStructuredDataFromPdf } from '@dragon708/docmind-pdf';
9
- import { RenderLlmTextOptions, RenderMarkdownOptions, ConvertDocxToMarkdownOptions, ConvertPdfToMarkdownOptions, ExtractMarkdownStrategy, RenderMarkdownSectionsOptions, MarkdownSection } from '@dragon708/docmind-markdown';
10
- export { MarkdownSection } from '@dragon708/docmind-markdown';
9
+ import { RenderLlmTextOptions, RenderMarkdownOptions, ConvertDocxToMarkdownOptions, ConvertPdfToMarkdownOptions, ConvertHtmlToMarkdownOptions, ConvertCsvToMarkdownOptions, ConvertSpreadsheetToMarkdownOptions, ExtractMarkdownStrategy, ExtractMarkdownRoutingInfo, RenderMarkdownSectionsOptions, MarkdownSection } from '@dragon708/docmind-markdown';
10
+ export { ConvertCsvToMarkdownOptions, ConvertCsvToMarkdownResult, ConvertHtmlToMarkdownOptions, ConvertHtmlToMarkdownResult, ConvertSpreadsheetToMarkdownOptions, ConvertSpreadsheetToMarkdownResult, CsvStringInputMode, DetectedBinaryFormat, ExtractMarkdownMediaHint, ExtractMarkdownRoutingInfo, ExtractMarkdownStrategy, HtmlStringInputMode, MarkdownSection, convertCsvToMarkdown, convertHtmlToMarkdown, convertSpreadsheetToMarkdown, detectBinaryFormat } from '@dragon708/docmind-markdown';
11
11
 
12
12
  /**
13
13
  * Options for Node public APIs (`analyzeFile`, intent methods).
@@ -54,8 +54,9 @@ interface NodeExtractStructuredDataOptions extends DocMindAnalyzeOptions {
54
54
  }
55
55
  /**
56
56
  * {@link extractMarkdown}: inherits {@link NodeExtractStructuredDataOptions}; `markdown` maps to structured-serializer
57
- * options passed through `extractMarkdown` in `@dragon708/docmind-markdown`. `markdownDocx` / `markdownPdf`
58
- * configure Mammoth→Turndown and `@cognipeer/to-markdown` (`convertPdfToMarkdown`) respectively — separate from `docx` / `pdf` used only by {@link extractStructuredData}.
57
+ * options passed through `extractMarkdown` in `@dragon708/docmind-markdown`. `markdownDocx` / `markdownPdf` /
58
+ * `markdownHtml` / `markdownCsv` / `markdownSpreadsheet` configure specialized binary routes — separate from `docx` / `pdf`
59
+ * used only by {@link extractStructuredData}.
59
60
  */
60
61
  interface NodeExtractMarkdownOptions extends NodeExtractStructuredDataOptions {
61
62
  readonly markdown?: RenderMarkdownOptions;
@@ -63,13 +64,20 @@ interface NodeExtractMarkdownOptions extends NodeExtractStructuredDataOptions {
63
64
  readonly markdownDocx?: ConvertDocxToMarkdownOptions;
64
65
  /** Forwarded to `extractMarkdown` → `convertPdfToMarkdown` when bytes are PDF (Node; `@cognipeer/to-markdown`). */
65
66
  readonly markdownPdf?: ConvertPdfToMarkdownOptions;
67
+ /** Forwarded to `extractMarkdown` → `convertHtmlToMarkdown` when bytes are detected as HTML (Node). */
68
+ readonly markdownHtml?: ConvertHtmlToMarkdownOptions;
69
+ /** Forwarded to `extractMarkdown` → `convertCsvToMarkdown` when bytes are detected as CSV (Node). */
70
+ readonly markdownCsv?: ConvertCsvToMarkdownOptions;
71
+ /** Forwarded to `extractMarkdown` → `convertSpreadsheetToMarkdown` for `.xlsx` / `.xls` (Node). */
72
+ readonly markdownSpreadsheet?: ConvertSpreadsheetToMarkdownOptions;
66
73
  /**
67
- * Optional: receive `strategy` and merged `warnings` from `@dragon708/docmind-markdown` `extractMarkdown`
68
- * (e.g. `pdf-cognipeer-specialized` vs `pdf-structured-fallback`, `docx-mammoth` vs `docx-structured-fallback`) without changing the `Promise<string>` return type.
74
+ * Optional: receive `strategy`, merged `warnings`, and optional {@link ExtractMarkdownRoutingInfo} from
75
+ * `@dragon708/docmind-markdown` `extractMarkdown` without changing the `Promise<string>` return type.
69
76
  */
70
77
  readonly onMarkdownExtract?: (info: {
71
78
  strategy: ExtractMarkdownStrategy;
72
79
  warnings: readonly string[];
80
+ routing?: ExtractMarkdownRoutingInfo;
73
81
  }) => void;
74
82
  }
75
83
  /**
@@ -142,23 +150,24 @@ declare function extractStructuredData(input: NodeAnalyzeInput, options?: NodeEx
142
150
  * End-to-end: {@link extractStructuredData} (for fallback + option parity) plus
143
151
  * `extractMarkdown` from `@dragon708/docmind-markdown` on `{ data, filename, mimeType }`.
144
152
  *
145
- * On Node, **DOCX** bytes use **Mammoth → Turndown**; **PDF** bytes use **`convertPdfToMarkdown`**
146
- * (`@cognipeer/to-markdown`, Node-first, no JVM). The structured envelope is always passed as `structuredFallback`
147
- * so specialized routes can fall back to structured Markdown serialization with clear warnings when needed.
153
+ * On Node, **DOCX** bytes use **Mammoth → Turndown**; **PDF**, **HTML**, **CSV**, and **Excel** bytes use Cognipeer-backed
154
+ * converters in `@dragon708/docmind-markdown` (`convertPdfToMarkdown`, `convertHtmlToMarkdown`, `convertCsvToMarkdown`,
155
+ * `convertSpreadsheetToMarkdown`). The structured envelope is always passed as `structuredFallback` so specialized routes
156
+ * can fall back to structured → Markdown serialization with clear warnings when needed.
148
157
  *
149
- * {@link NodeExtractMarkdownOptions.onMarkdownExtract} receives `strategy` and merged `warnings` from the markdown
150
- * package (for example `pdf-cognipeer-specialized`, `pdf-structured-fallback`, `docx-structured-fallback`, and
151
- * `[docmind-markdown:extractMarkdown]` trace lines).
158
+ * {@link NodeExtractMarkdownOptions.onMarkdownExtract} receives `strategy`, merged `warnings`, and optional `routing`
159
+ * (including `routingSummary`) from the markdown package.
152
160
  *
153
161
  * @param input - Path, buffer, or {@link NodeAnalyzeInput} accepted by the Node facade.
154
162
  * @param options - Structured routing (`pdf` / `docx` / `ocr` / `normalize`), optional `markdown` serializer knobs,
155
- * and optional `markdownDocx` / `markdownPdf` for the binary Markdown pipelines (distinct from structured-only `docx` / `pdf`).
163
+ * and optional `markdownDocx` / `markdownPdf` / `markdownHtml` / `markdownCsv` / `markdownSpreadsheet` for binary Markdown
164
+ * pipelines (distinct from structured-only `docx` / `pdf`).
156
165
  */
157
166
  declare function extractMarkdown(input: NodeAnalyzeInput, options?: NodeExtractMarkdownOptions): Promise<string>;
158
167
  /**
159
168
  * {@link extractStructuredData} then `renderLlmText` in `@dragon708/docmind-markdown` (tagged plain text for LLMs).
160
- * This path **does not** run the binary PDF/DOCX Markdown pipelines (no `@cognipeer/to-markdown` / Mammoth→Turndown here);
161
- * it linearizes the structured envelope only. Use {@link extractMarkdown} when you need specialized PDF or DOCX Markdown layout.
169
+ * This path **does not** run binary Cognipeer / Mammoth→Turndown Markdown pipelines (no `@cognipeer/to-markdown` here);
170
+ * it linearizes the structured envelope only. Use {@link extractMarkdown} when you need specialized PDF / DOCX / HTML / CSV / Excel Markdown layout on Node.
162
171
  *
163
172
  * @param options - Structured routing plus optional `llm` passed through to `renderLlmText`.
164
173
  */
@@ -275,7 +284,7 @@ type NodeExplainAnalysisPlanOptions = Omit<ExplainAnalysisPlanOptions, "intent">
275
284
 
276
285
  /**
277
286
  * Epic 1 — **Capabilities:** after resolving {@link NodeAnalyzeInput}, lists which of
278
- * `text` | `metadata` | `html` | `ocr` | `pages` | `structured-output` | `markdown` (hybrid package extract on Node) | `llm-text` | `structured-chunks` (split + Markdown sections)
287
+ * `text` | `metadata` | `html` | `ocr` | `pages` | `structured-output` | `markdown` (hybrid `extractMarkdown`: DOCX, PDF, HTML, CSV, Excel on Node via `@dragon708/docmind-markdown`) | `llm-text` | `structured-chunks` (split + Markdown sections)
279
288
  * (and image-specific ids such as `ocr-multipage`, `image-normalization`, `tiff`, `heic-node-only`, `bmp`, `gif-first-frame`) apply for that kind in Node (for PDF, `text` / `metadata` /
280
289
  * `pages` / `ocr` describe the v2 pdf-parse + PDF.js + OCR stack; see {@link buildNodeCapabilityReport}).
281
290
  * For **DOCX**, `docxEmbeddedImages` and `docxStructure` describe ZIP media and optional OOXML v2 extractors (`options.docx.include`).
package/dist/index.js CHANGED
@@ -10,6 +10,7 @@ import { readFile } from 'fs/promises';
10
10
  import { basename } from 'path';
11
11
  import { fileURLToPath } from 'url';
12
12
  import { extractMarkdown as extractMarkdown$1, renderLlmText, renderMarkdownSections } from '@dragon708/docmind-markdown';
13
+ export { convertCsvToMarkdown, convertHtmlToMarkdown, convertSpreadsheetToMarkdown, detectBinaryFormat } from '@dragon708/docmind-markdown';
13
14
 
14
15
  // src/analyze.ts
15
16
 
@@ -624,6 +625,9 @@ async function extractMarkdown(input, options) {
624
625
  markdown: markdownOpts,
625
626
  markdownDocx,
626
627
  markdownPdf,
628
+ markdownHtml,
629
+ markdownCsv,
630
+ markdownSpreadsheet,
627
631
  onMarkdownExtract,
628
632
  ...structuredOpts
629
633
  } = options ?? {};
@@ -642,10 +646,13 @@ async function extractMarkdown(input, options) {
642
646
  ...markdownOpts ?? {},
643
647
  ...markdownDocx !== void 0 ? { docx: markdownDocx } : {},
644
648
  ...markdownPdf !== void 0 ? { pdf: markdownPdf } : {},
649
+ ...markdownHtml !== void 0 ? { html: markdownHtml } : {},
650
+ ...markdownCsv !== void 0 ? { csv: markdownCsv } : {},
651
+ ...markdownSpreadsheet !== void 0 ? { spreadsheet: markdownSpreadsheet } : {},
645
652
  structuredFallback: structured
646
653
  }
647
654
  );
648
- onMarkdownExtract?.({ strategy: r.strategy, warnings: r.warnings });
655
+ onMarkdownExtract?.({ strategy: r.strategy, warnings: r.warnings, routing: r.routing });
649
656
  return r.markdown;
650
657
  }
651
658
  async function extractLlmContent(input, options) {
@@ -724,7 +731,7 @@ function buildNodeCapabilityReport(kind) {
724
731
  "extractStructuredData: analyzePdf + PDF.js per-page text, outline, links, annotations, merged via normalizeToStructuredResult; respects pdf.ocr like analyzeFile."
725
732
  ]),
726
733
  slot("markdown", true, [
727
- "extractMarkdown: `@dragon708/docmind-markdown` hybrid \u2014 PDF bytes use `@cognipeer/to-markdown` (Node-first, no Java); structured PDF (`extractStructuredData`, respects pdf.ocr) is always built as fallback."
734
+ "extractMarkdown: `@dragon708/docmind-markdown` hybrid \u2014 PDF/HTML/CSV/Excel bytes use `@cognipeer/to-markdown` (Node-first, no Java); DOCX uses Mammoth\u2192Turndown; structured envelope from `extractStructuredData` is always `structuredFallback` (PDF respects pdf.ocr)."
728
735
  ]),
729
736
  slot("llm-text", true, [
730
737
  "extractLlmContent: structured envelope \u2192 LLM-oriented plain text (`renderLlmText` in `@dragon708/docmind-markdown`)."
@@ -811,7 +818,7 @@ function buildNodeCapabilityReport(kind) {
811
818
  "extractStructuredData wraps UTF-8 decode in normalizeToStructuredResult (rollup text only unless you add blocks upstream)."
812
819
  ]),
813
820
  slot("markdown", true, [
814
- "extractMarkdown: rollup/decoded text \u2192 Markdown (mostly paragraphs; no layout without upstream blocks)."
821
+ "extractMarkdown: rollup/decoded text \u2192 Markdown (mostly paragraphs; no layout without upstream blocks). If filename/MIME indicate `.html`, `.csv`, `.xlsx`/`.xls`, the markdown package routes to Cognipeer specialized converters on Node."
815
822
  ]),
816
823
  slot("llm-text", true, ["extractLlmContent: rollup \u2192 LLM plain text via the same envelope."]),
817
824
  slot("structured-chunks", true, [
@@ -1080,7 +1087,7 @@ function buildNodeExplainReport(kind, intent, pdfOcr, plan, docxInclude, ocrSlic
1080
1087
  limitations = [
1081
1088
  ...limitations,
1082
1089
  ...lim(
1083
- "Output: Markdown string from `@dragon708/docmind-markdown` extractMarkdown (PDF/DOCX binary routes on Node when applicable; structured serializer as fallback)."
1090
+ "Output: Markdown string from `@dragon708/docmind-markdown` extractMarkdown (PDF, DOCX, HTML, CSV, Excel specialized routes on Node when bytes match; structured serializer as `structuredFallback`)."
1084
1091
  )
1085
1092
  ];
1086
1093
  } else if (intent === "extractLlmContent") {
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@dragon708/docmind-node",
3
- "version": "1.13.2",
3
+ "version": "1.13.4",
4
4
  "description": "Official DocMind Node facade: analyzeFile, intent APIs, PDF/DOCX/OCR, and fs helpers.",
5
5
  "type": "module",
6
6
  "main": "./dist/index.js",
@@ -33,7 +33,7 @@
33
33
  "license": "MIT",
34
34
  "dependencies": {
35
35
  "@dragon708/docmind-docx": "^1.8.0",
36
- "@dragon708/docmind-markdown": "^1.2.6",
36
+ "@dragon708/docmind-markdown": "^1.2.8",
37
37
  "@dragon708/docmind-ocr": "^1.1.4",
38
38
  "@dragon708/docmind-pdf": "^2.2.0",
39
39
  "@dragon708/docmind-shared": "^1.2.0"