npm - @dragon708/docmind-node - Versions diffs - 1.13.2 → 1.13.4 - Mend

@dragon708/docmind-node 1.13.2 → 1.13.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (3)

package/dist/index.d.ts CHANGED Viewed

@@ -6,8 +6,8 @@ import { AnalyzeDocxIncludeFlags, DocxToHtmlOptions, ExtractStructuredDataFromDo
 export { AnalyzeDocxIncludeFlags, DocxEmbeddedImage, DocxEmbeddedImageConversionOptions, DocxEmbeddedImageConversionResult, DocxEmbeddedImageWebSlice, DocxImageExtractionMode, ExtractImagesFromDocxOptions, ExtractImagesFromDocxResult, ExtractStructuredDataFromDocxOptions, convertDocxEmbeddedImageToWeb, convertDocxImagesForWeb, docxImageIsBrowserRenderable, docxImageRequiresConversion, docxImageSuggestedTargetFormat, docxImageToDataUri, extractImagesFromDocx, extractStructuredDataFromDocx, isNodeJsRuntime } from '@dragon708/docmind-docx';
 import { PdfAnalyzeOptions, ExtractStructuredDataFromPdfOptions } from '@dragon708/docmind-pdf';
 export { ExtractStructuredDataFromPdfIncludeFlags, ExtractStructuredDataFromPdfOptions, extractStructuredDataFromPdf } from '@dragon708/docmind-pdf';
-import { RenderLlmTextOptions, RenderMarkdownOptions, ConvertDocxToMarkdownOptions, ConvertPdfToMarkdownOptions, ExtractMarkdownStrategy, RenderMarkdownSectionsOptions, MarkdownSection } from '@dragon708/docmind-markdown';
-export { MarkdownSection } from '@dragon708/docmind-markdown';
+import { RenderLlmTextOptions, RenderMarkdownOptions, ConvertDocxToMarkdownOptions, ConvertPdfToMarkdownOptions, ConvertHtmlToMarkdownOptions, ConvertCsvToMarkdownOptions, ConvertSpreadsheetToMarkdownOptions, ExtractMarkdownStrategy, ExtractMarkdownRoutingInfo, RenderMarkdownSectionsOptions, MarkdownSection } from '@dragon708/docmind-markdown';
+export { ConvertCsvToMarkdownOptions, ConvertCsvToMarkdownResult, ConvertHtmlToMarkdownOptions, ConvertHtmlToMarkdownResult, ConvertSpreadsheetToMarkdownOptions, ConvertSpreadsheetToMarkdownResult, CsvStringInputMode, DetectedBinaryFormat, ExtractMarkdownMediaHint, ExtractMarkdownRoutingInfo, ExtractMarkdownStrategy, HtmlStringInputMode, MarkdownSection, convertCsvToMarkdown, convertHtmlToMarkdown, convertSpreadsheetToMarkdown, detectBinaryFormat } from '@dragon708/docmind-markdown';
 /**
  * Options for Node public APIs (`analyzeFile`, intent methods).
@@ -54,8 +54,9 @@ interface NodeExtractStructuredDataOptions extends DocMindAnalyzeOptions {
 }
 /**
  * {@link extractMarkdown}: inherits {@link NodeExtractStructuredDataOptions}; `markdown` maps to structured-serializer
- * options passed through `extractMarkdown` in `@dragon708/docmind-markdown`. `markdownDocx` / `markdownPdf`
- * configure Mammoth→Turndown and `@cognipeer/to-markdown` (`convertPdfToMarkdown`) respectively — separate from `docx` / `pdf` used only by {@link extractStructuredData}.
+ * options passed through `extractMarkdown` in `@dragon708/docmind-markdown`. `markdownDocx` / `markdownPdf` /
+ * `markdownHtml` / `markdownCsv` / `markdownSpreadsheet` configure specialized binary routes — separate from `docx` / `pdf`
+ * used only by {@link extractStructuredData}.
  */
 interface NodeExtractMarkdownOptions extends NodeExtractStructuredDataOptions {
     readonly markdown?: RenderMarkdownOptions;
@@ -63,13 +64,20 @@ interface NodeExtractMarkdownOptions extends NodeExtractStructuredDataOptions {
     readonly markdownDocx?: ConvertDocxToMarkdownOptions;
     /** Forwarded to `extractMarkdown` → `convertPdfToMarkdown` when bytes are PDF (Node; `@cognipeer/to-markdown`). */
     readonly markdownPdf?: ConvertPdfToMarkdownOptions;
+    /** Forwarded to `extractMarkdown` → `convertHtmlToMarkdown` when bytes are detected as HTML (Node). */
+    readonly markdownHtml?: ConvertHtmlToMarkdownOptions;
+    /** Forwarded to `extractMarkdown` → `convertCsvToMarkdown` when bytes are detected as CSV (Node). */
+    readonly markdownCsv?: ConvertCsvToMarkdownOptions;
+    /** Forwarded to `extractMarkdown` → `convertSpreadsheetToMarkdown` for `.xlsx` / `.xls` (Node). */
+    readonly markdownSpreadsheet?: ConvertSpreadsheetToMarkdownOptions;
     /**
-     * Optional: receive `strategy` and merged `warnings` from `@dragon708/docmind-markdown` `extractMarkdown`
-     * (e.g. `pdf-cognipeer-specialized` vs `pdf-structured-fallback`, `docx-mammoth` vs `docx-structured-fallback`) without changing the `Promise<string>` return type.
+     * Optional: receive `strategy`, merged `warnings`, and optional {@link ExtractMarkdownRoutingInfo} from
+     * `@dragon708/docmind-markdown` `extractMarkdown` without changing the `Promise<string>` return type.
      */
     readonly onMarkdownExtract?: (info: {
         strategy: ExtractMarkdownStrategy;
         warnings: readonly string[];
+        routing?: ExtractMarkdownRoutingInfo;
     }) => void;
 }
 /**
@@ -142,23 +150,24 @@ declare function extractStructuredData(input: NodeAnalyzeInput, options?: NodeEx
  * End-to-end: {@link extractStructuredData} (for fallback + option parity) plus
  * `extractMarkdown` from `@dragon708/docmind-markdown` on `{ data, filename, mimeType }`.
  *
- * On Node, **DOCX** bytes use **Mammoth → Turndown**; **PDF** bytes use **`convertPdfToMarkdown`**
- * (`@cognipeer/to-markdown`, Node-first, no JVM). The structured envelope is always passed as `structuredFallback`
- * so specialized routes can fall back to structured → Markdown serialization with clear warnings when needed.
+ * On Node, **DOCX** bytes use **Mammoth → Turndown**; **PDF**, **HTML**, **CSV**, and **Excel** bytes use Cognipeer-backed
+ * converters in `@dragon708/docmind-markdown` (`convertPdfToMarkdown`, `convertHtmlToMarkdown`, `convertCsvToMarkdown`,
+ * `convertSpreadsheetToMarkdown`). The structured envelope is always passed as `structuredFallback` so specialized routes
+ * can fall back to structured → Markdown serialization with clear warnings when needed.
  *
- * {@link NodeExtractMarkdownOptions.onMarkdownExtract} receives `strategy` and merged `warnings` from the markdown
- * package (for example `pdf-cognipeer-specialized`, `pdf-structured-fallback`, `docx-structured-fallback`, and
- * `[docmind-markdown:extractMarkdown]` trace lines).
+ * {@link NodeExtractMarkdownOptions.onMarkdownExtract} receives `strategy`, merged `warnings`, and optional `routing`
+ * (including `routingSummary`) from the markdown package.
  *
  * @param input - Path, buffer, or {@link NodeAnalyzeInput} accepted by the Node facade.
  * @param options - Structured routing (`pdf` / `docx` / `ocr` / `normalize`), optional `markdown` serializer knobs,
- *   and optional `markdownDocx` / `markdownPdf` for the binary Markdown pipelines (distinct from structured-only `docx` / `pdf`).
+ *   and optional `markdownDocx` / `markdownPdf` / `markdownHtml` / `markdownCsv` / `markdownSpreadsheet` for binary Markdown
+ *   pipelines (distinct from structured-only `docx` / `pdf`).
  */
 declare function extractMarkdown(input: NodeAnalyzeInput, options?: NodeExtractMarkdownOptions): Promise<string>;
 /**
  * {@link extractStructuredData} then `renderLlmText` in `@dragon708/docmind-markdown` (tagged plain text for LLMs).
- * This path **does not** run the binary PDF/DOCX Markdown pipelines (no `@cognipeer/to-markdown` / Mammoth→Turndown here);
- * it linearizes the structured envelope only. Use {@link extractMarkdown} when you need specialized PDF or DOCX Markdown layout.
+ * This path **does not** run binary Cognipeer / Mammoth→Turndown Markdown pipelines (no `@cognipeer/to-markdown` here);
+ * it linearizes the structured envelope only. Use {@link extractMarkdown} when you need specialized PDF / DOCX / HTML / CSV / Excel Markdown layout on Node.
  *
  * @param options - Structured routing plus optional `llm` passed through to `renderLlmText`.
  */
@@ -275,7 +284,7 @@ type NodeExplainAnalysisPlanOptions = Omit<ExplainAnalysisPlanOptions, "intent">
 /**
  * Epic 1 — **Capabilities:** after resolving {@link NodeAnalyzeInput}, lists which of
- * `text` | `metadata` | `html` | `ocr` | `pages` | `structured-output` | `markdown` (hybrid package extract on Node) | `llm-text` | `structured-chunks` (split + Markdown sections)
+ * `text` | `metadata` | `html` | `ocr` | `pages` | `structured-output` | `markdown` (hybrid `extractMarkdown`: DOCX, PDF, HTML, CSV, Excel on Node via `@dragon708/docmind-markdown`) | `llm-text` | `structured-chunks` (split + Markdown sections)
  * (and image-specific ids such as `ocr-multipage`, `image-normalization`, `tiff`, `heic-node-only`, `bmp`, `gif-first-frame`) apply for that kind in Node (for PDF, `text` / `metadata` /
  * `pages` / `ocr` describe the v2 pdf-parse + PDF.js + OCR stack; see {@link buildNodeCapabilityReport}).
  * For **DOCX**, `docxEmbeddedImages` and `docxStructure` describe ZIP media and optional OOXML v2 extractors (`options.docx.include`).

package/dist/index.js CHANGED Viewed

@@ -10,6 +10,7 @@ import { readFile } from 'fs/promises';
 import { basename } from 'path';
 import { fileURLToPath } from 'url';
 import { extractMarkdown as extractMarkdown$1, renderLlmText, renderMarkdownSections } from '@dragon708/docmind-markdown';
+export { convertCsvToMarkdown, convertHtmlToMarkdown, convertSpreadsheetToMarkdown, detectBinaryFormat } from '@dragon708/docmind-markdown';
 // src/analyze.ts
@@ -624,6 +625,9 @@ async function extractMarkdown(input, options) {
     markdown: markdownOpts,
     markdownDocx,
     markdownPdf,
+    markdownHtml,
+    markdownCsv,
+    markdownSpreadsheet,
     onMarkdownExtract,
     ...structuredOpts
   } = options ?? {};
@@ -642,10 +646,13 @@ async function extractMarkdown(input, options) {
       ...markdownOpts ?? {},
       ...markdownDocx !== void 0 ? { docx: markdownDocx } : {},
       ...markdownPdf !== void 0 ? { pdf: markdownPdf } : {},
+      ...markdownHtml !== void 0 ? { html: markdownHtml } : {},
+      ...markdownCsv !== void 0 ? { csv: markdownCsv } : {},
+      ...markdownSpreadsheet !== void 0 ? { spreadsheet: markdownSpreadsheet } : {},
       structuredFallback: structured
     }
   );
-  onMarkdownExtract?.({ strategy: r.strategy, warnings: r.warnings });
+  onMarkdownExtract?.({ strategy: r.strategy, warnings: r.warnings, routing: r.routing });
   return r.markdown;
 }
 async function extractLlmContent(input, options) {
@@ -724,7 +731,7 @@ function buildNodeCapabilityReport(kind) {
           "extractStructuredData: analyzePdf + PDF.js per-page text, outline, links, annotations, merged via normalizeToStructuredResult; respects pdf.ocr like analyzeFile."
         ]),
         slot("markdown", true, [
-          "extractMarkdown: `@dragon708/docmind-markdown` hybrid \u2014 PDF bytes use `@cognipeer/to-markdown` (Node-first, no Java); structured PDF (`extractStructuredData`, respects pdf.ocr) is always built as fallback."
+          "extractMarkdown: `@dragon708/docmind-markdown` hybrid \u2014 PDF/HTML/CSV/Excel bytes use `@cognipeer/to-markdown` (Node-first, no Java); DOCX uses Mammoth\u2192Turndown; structured envelope from `extractStructuredData` is always `structuredFallback` (PDF respects pdf.ocr)."
         ]),
         slot("llm-text", true, [
           "extractLlmContent: structured envelope \u2192 LLM-oriented plain text (`renderLlmText` in `@dragon708/docmind-markdown`)."
@@ -811,7 +818,7 @@ function buildNodeCapabilityReport(kind) {
           "extractStructuredData wraps UTF-8 decode in normalizeToStructuredResult (rollup text only unless you add blocks upstream)."
         ]),
         slot("markdown", true, [
-          "extractMarkdown: rollup/decoded text \u2192 Markdown (mostly paragraphs; no layout without upstream blocks)."
+          "extractMarkdown: rollup/decoded text \u2192 Markdown (mostly paragraphs; no layout without upstream blocks). If filename/MIME indicate `.html`, `.csv`, `.xlsx`/`.xls`, the markdown package routes to Cognipeer specialized converters on Node."
         ]),
         slot("llm-text", true, ["extractLlmContent: rollup \u2192 LLM plain text via the same envelope."]),
         slot("structured-chunks", true, [
@@ -1080,7 +1087,7 @@ function buildNodeExplainReport(kind, intent, pdfOcr, plan, docxInclude, ocrSlic
         limitations = [
           ...limitations,
           ...lim(
-            "Output: Markdown string from `@dragon708/docmind-markdown` extractMarkdown (PDF/DOCX binary routes on Node when applicable; structured serializer as fallback)."
+            "Output: Markdown string from `@dragon708/docmind-markdown` extractMarkdown (PDF, DOCX, HTML, CSV, Excel specialized routes on Node when bytes match; structured serializer as `structuredFallback`)."
           )
         ];
       } else if (intent === "extractLlmContent") {

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@dragon708/docmind-node",
-  "version": "1.13.2",
+  "version": "1.13.4",
   "description": "Official DocMind Node facade: analyzeFile, intent APIs, PDF/DOCX/OCR, and fs helpers.",
   "type": "module",
   "main": "./dist/index.js",
@@ -33,7 +33,7 @@
   "license": "MIT",
   "dependencies": {
     "@dragon708/docmind-docx": "^1.8.0",
-    "@dragon708/docmind-markdown": "^1.2.6",
+    "@dragon708/docmind-markdown": "^1.2.8",
     "@dragon708/docmind-ocr": "^1.1.4",
     "@dragon708/docmind-pdf": "^2.2.0",
     "@dragon708/docmind-shared": "^1.2.0"