@dragon708/docmind-node 1.13.2 → 1.13.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.d.ts +25 -16
- package/dist/index.js +11 -4
- package/package.json +2 -2
package/dist/index.d.ts
CHANGED
|
@@ -6,8 +6,8 @@ import { AnalyzeDocxIncludeFlags, DocxToHtmlOptions, ExtractStructuredDataFromDo
|
|
|
6
6
|
export { AnalyzeDocxIncludeFlags, DocxEmbeddedImage, DocxEmbeddedImageConversionOptions, DocxEmbeddedImageConversionResult, DocxEmbeddedImageWebSlice, DocxImageExtractionMode, ExtractImagesFromDocxOptions, ExtractImagesFromDocxResult, ExtractStructuredDataFromDocxOptions, convertDocxEmbeddedImageToWeb, convertDocxImagesForWeb, docxImageIsBrowserRenderable, docxImageRequiresConversion, docxImageSuggestedTargetFormat, docxImageToDataUri, extractImagesFromDocx, extractStructuredDataFromDocx, isNodeJsRuntime } from '@dragon708/docmind-docx';
|
|
7
7
|
import { PdfAnalyzeOptions, ExtractStructuredDataFromPdfOptions } from '@dragon708/docmind-pdf';
|
|
8
8
|
export { ExtractStructuredDataFromPdfIncludeFlags, ExtractStructuredDataFromPdfOptions, extractStructuredDataFromPdf } from '@dragon708/docmind-pdf';
|
|
9
|
-
import { RenderLlmTextOptions, RenderMarkdownOptions, ConvertDocxToMarkdownOptions, ConvertPdfToMarkdownOptions, ExtractMarkdownStrategy, RenderMarkdownSectionsOptions, MarkdownSection } from '@dragon708/docmind-markdown';
|
|
10
|
-
export { MarkdownSection } from '@dragon708/docmind-markdown';
|
|
9
|
+
import { RenderLlmTextOptions, RenderMarkdownOptions, ConvertDocxToMarkdownOptions, ConvertPdfToMarkdownOptions, ConvertHtmlToMarkdownOptions, ConvertCsvToMarkdownOptions, ConvertSpreadsheetToMarkdownOptions, ExtractMarkdownStrategy, ExtractMarkdownRoutingInfo, RenderMarkdownSectionsOptions, MarkdownSection } from '@dragon708/docmind-markdown';
|
|
10
|
+
export { ConvertCsvToMarkdownOptions, ConvertCsvToMarkdownResult, ConvertHtmlToMarkdownOptions, ConvertHtmlToMarkdownResult, ConvertSpreadsheetToMarkdownOptions, ConvertSpreadsheetToMarkdownResult, CsvStringInputMode, DetectedBinaryFormat, ExtractMarkdownMediaHint, ExtractMarkdownRoutingInfo, ExtractMarkdownStrategy, HtmlStringInputMode, MarkdownSection, convertCsvToMarkdown, convertHtmlToMarkdown, convertSpreadsheetToMarkdown, detectBinaryFormat } from '@dragon708/docmind-markdown';
|
|
11
11
|
|
|
12
12
|
/**
|
|
13
13
|
* Options for Node public APIs (`analyzeFile`, intent methods).
|
|
@@ -54,8 +54,9 @@ interface NodeExtractStructuredDataOptions extends DocMindAnalyzeOptions {
|
|
|
54
54
|
}
|
|
55
55
|
/**
|
|
56
56
|
* {@link extractMarkdown}: inherits {@link NodeExtractStructuredDataOptions}; `markdown` maps to structured-serializer
|
|
57
|
-
* options passed through `extractMarkdown` in `@dragon708/docmind-markdown`. `markdownDocx` / `markdownPdf`
|
|
58
|
-
*
|
|
57
|
+
* options passed through `extractMarkdown` in `@dragon708/docmind-markdown`. `markdownDocx` / `markdownPdf` /
|
|
58
|
+
* `markdownHtml` / `markdownCsv` / `markdownSpreadsheet` configure specialized binary routes — separate from `docx` / `pdf`
|
|
59
|
+
* used only by {@link extractStructuredData}.
|
|
59
60
|
*/
|
|
60
61
|
interface NodeExtractMarkdownOptions extends NodeExtractStructuredDataOptions {
|
|
61
62
|
readonly markdown?: RenderMarkdownOptions;
|
|
@@ -63,13 +64,20 @@ interface NodeExtractMarkdownOptions extends NodeExtractStructuredDataOptions {
|
|
|
63
64
|
readonly markdownDocx?: ConvertDocxToMarkdownOptions;
|
|
64
65
|
/** Forwarded to `extractMarkdown` → `convertPdfToMarkdown` when bytes are PDF (Node; `@cognipeer/to-markdown`). */
|
|
65
66
|
readonly markdownPdf?: ConvertPdfToMarkdownOptions;
|
|
67
|
+
/** Forwarded to `extractMarkdown` → `convertHtmlToMarkdown` when bytes are detected as HTML (Node). */
|
|
68
|
+
readonly markdownHtml?: ConvertHtmlToMarkdownOptions;
|
|
69
|
+
/** Forwarded to `extractMarkdown` → `convertCsvToMarkdown` when bytes are detected as CSV (Node). */
|
|
70
|
+
readonly markdownCsv?: ConvertCsvToMarkdownOptions;
|
|
71
|
+
/** Forwarded to `extractMarkdown` → `convertSpreadsheetToMarkdown` for `.xlsx` / `.xls` (Node). */
|
|
72
|
+
readonly markdownSpreadsheet?: ConvertSpreadsheetToMarkdownOptions;
|
|
66
73
|
/**
|
|
67
|
-
* Optional: receive `strategy
|
|
68
|
-
*
|
|
74
|
+
* Optional: receive `strategy`, merged `warnings`, and optional {@link ExtractMarkdownRoutingInfo} from
|
|
75
|
+
* `@dragon708/docmind-markdown` `extractMarkdown` without changing the `Promise<string>` return type.
|
|
69
76
|
*/
|
|
70
77
|
readonly onMarkdownExtract?: (info: {
|
|
71
78
|
strategy: ExtractMarkdownStrategy;
|
|
72
79
|
warnings: readonly string[];
|
|
80
|
+
routing?: ExtractMarkdownRoutingInfo;
|
|
73
81
|
}) => void;
|
|
74
82
|
}
|
|
75
83
|
/**
|
|
@@ -142,23 +150,24 @@ declare function extractStructuredData(input: NodeAnalyzeInput, options?: NodeEx
|
|
|
142
150
|
* End-to-end: {@link extractStructuredData} (for fallback + option parity) plus
|
|
143
151
|
* `extractMarkdown` from `@dragon708/docmind-markdown` on `{ data, filename, mimeType }`.
|
|
144
152
|
*
|
|
145
|
-
* On Node, **DOCX** bytes use **Mammoth → Turndown**; **PDF** bytes use
|
|
146
|
-
*
|
|
147
|
-
*
|
|
153
|
+
* On Node, **DOCX** bytes use **Mammoth → Turndown**; **PDF**, **HTML**, **CSV**, and **Excel** bytes use Cognipeer-backed
|
|
154
|
+
* converters in `@dragon708/docmind-markdown` (`convertPdfToMarkdown`, `convertHtmlToMarkdown`, `convertCsvToMarkdown`,
|
|
155
|
+
* `convertSpreadsheetToMarkdown`). The structured envelope is always passed as `structuredFallback` so specialized routes
|
|
156
|
+
* can fall back to structured → Markdown serialization with clear warnings when needed.
|
|
148
157
|
*
|
|
149
|
-
* {@link NodeExtractMarkdownOptions.onMarkdownExtract} receives `strategy
|
|
150
|
-
*
|
|
151
|
-
* `[docmind-markdown:extractMarkdown]` trace lines).
|
|
158
|
+
* {@link NodeExtractMarkdownOptions.onMarkdownExtract} receives `strategy`, merged `warnings`, and optional `routing`
|
|
159
|
+
* (including `routingSummary`) from the markdown package.
|
|
152
160
|
*
|
|
153
161
|
* @param input - Path, buffer, or {@link NodeAnalyzeInput} accepted by the Node facade.
|
|
154
162
|
* @param options - Structured routing (`pdf` / `docx` / `ocr` / `normalize`), optional `markdown` serializer knobs,
|
|
155
|
-
* and optional `markdownDocx` / `markdownPdf`
|
|
163
|
+
* and optional `markdownDocx` / `markdownPdf` / `markdownHtml` / `markdownCsv` / `markdownSpreadsheet` for binary Markdown
|
|
164
|
+
* pipelines (distinct from structured-only `docx` / `pdf`).
|
|
156
165
|
*/
|
|
157
166
|
declare function extractMarkdown(input: NodeAnalyzeInput, options?: NodeExtractMarkdownOptions): Promise<string>;
|
|
158
167
|
/**
|
|
159
168
|
* {@link extractStructuredData} then `renderLlmText` in `@dragon708/docmind-markdown` (tagged plain text for LLMs).
|
|
160
|
-
* This path **does not** run
|
|
161
|
-
* it linearizes the structured envelope only. Use {@link extractMarkdown} when you need specialized PDF
|
|
169
|
+
* This path **does not** run binary Cognipeer / Mammoth→Turndown Markdown pipelines (no `@cognipeer/to-markdown` here);
|
|
170
|
+
* it linearizes the structured envelope only. Use {@link extractMarkdown} when you need specialized PDF / DOCX / HTML / CSV / Excel Markdown layout on Node.
|
|
162
171
|
*
|
|
163
172
|
* @param options - Structured routing plus optional `llm` passed through to `renderLlmText`.
|
|
164
173
|
*/
|
|
@@ -275,7 +284,7 @@ type NodeExplainAnalysisPlanOptions = Omit<ExplainAnalysisPlanOptions, "intent">
|
|
|
275
284
|
|
|
276
285
|
/**
|
|
277
286
|
* Epic 1 — **Capabilities:** after resolving {@link NodeAnalyzeInput}, lists which of
|
|
278
|
-
* `text` | `metadata` | `html` | `ocr` | `pages` | `structured-output` | `markdown` (hybrid
|
|
287
|
+
* `text` | `metadata` | `html` | `ocr` | `pages` | `structured-output` | `markdown` (hybrid `extractMarkdown`: DOCX, PDF, HTML, CSV, Excel on Node via `@dragon708/docmind-markdown`) | `llm-text` | `structured-chunks` (split + Markdown sections)
|
|
279
288
|
* (and image-specific ids such as `ocr-multipage`, `image-normalization`, `tiff`, `heic-node-only`, `bmp`, `gif-first-frame`) apply for that kind in Node (for PDF, `text` / `metadata` /
|
|
280
289
|
* `pages` / `ocr` describe the v2 pdf-parse + PDF.js + OCR stack; see {@link buildNodeCapabilityReport}).
|
|
281
290
|
* For **DOCX**, `docxEmbeddedImages` and `docxStructure` describe ZIP media and optional OOXML v2 extractors (`options.docx.include`).
|
package/dist/index.js
CHANGED
|
@@ -10,6 +10,7 @@ import { readFile } from 'fs/promises';
|
|
|
10
10
|
import { basename } from 'path';
|
|
11
11
|
import { fileURLToPath } from 'url';
|
|
12
12
|
import { extractMarkdown as extractMarkdown$1, renderLlmText, renderMarkdownSections } from '@dragon708/docmind-markdown';
|
|
13
|
+
export { convertCsvToMarkdown, convertHtmlToMarkdown, convertSpreadsheetToMarkdown, detectBinaryFormat } from '@dragon708/docmind-markdown';
|
|
13
14
|
|
|
14
15
|
// src/analyze.ts
|
|
15
16
|
|
|
@@ -624,6 +625,9 @@ async function extractMarkdown(input, options) {
|
|
|
624
625
|
markdown: markdownOpts,
|
|
625
626
|
markdownDocx,
|
|
626
627
|
markdownPdf,
|
|
628
|
+
markdownHtml,
|
|
629
|
+
markdownCsv,
|
|
630
|
+
markdownSpreadsheet,
|
|
627
631
|
onMarkdownExtract,
|
|
628
632
|
...structuredOpts
|
|
629
633
|
} = options ?? {};
|
|
@@ -642,10 +646,13 @@ async function extractMarkdown(input, options) {
|
|
|
642
646
|
...markdownOpts ?? {},
|
|
643
647
|
...markdownDocx !== void 0 ? { docx: markdownDocx } : {},
|
|
644
648
|
...markdownPdf !== void 0 ? { pdf: markdownPdf } : {},
|
|
649
|
+
...markdownHtml !== void 0 ? { html: markdownHtml } : {},
|
|
650
|
+
...markdownCsv !== void 0 ? { csv: markdownCsv } : {},
|
|
651
|
+
...markdownSpreadsheet !== void 0 ? { spreadsheet: markdownSpreadsheet } : {},
|
|
645
652
|
structuredFallback: structured
|
|
646
653
|
}
|
|
647
654
|
);
|
|
648
|
-
onMarkdownExtract?.({ strategy: r.strategy, warnings: r.warnings });
|
|
655
|
+
onMarkdownExtract?.({ strategy: r.strategy, warnings: r.warnings, routing: r.routing });
|
|
649
656
|
return r.markdown;
|
|
650
657
|
}
|
|
651
658
|
async function extractLlmContent(input, options) {
|
|
@@ -724,7 +731,7 @@ function buildNodeCapabilityReport(kind) {
|
|
|
724
731
|
"extractStructuredData: analyzePdf + PDF.js per-page text, outline, links, annotations, merged via normalizeToStructuredResult; respects pdf.ocr like analyzeFile."
|
|
725
732
|
]),
|
|
726
733
|
slot("markdown", true, [
|
|
727
|
-
"extractMarkdown: `@dragon708/docmind-markdown` hybrid \u2014 PDF bytes use `@cognipeer/to-markdown` (Node-first, no Java); structured
|
|
734
|
+
"extractMarkdown: `@dragon708/docmind-markdown` hybrid \u2014 PDF/HTML/CSV/Excel bytes use `@cognipeer/to-markdown` (Node-first, no Java); DOCX uses Mammoth\u2192Turndown; structured envelope from `extractStructuredData` is always `structuredFallback` (PDF respects pdf.ocr)."
|
|
728
735
|
]),
|
|
729
736
|
slot("llm-text", true, [
|
|
730
737
|
"extractLlmContent: structured envelope \u2192 LLM-oriented plain text (`renderLlmText` in `@dragon708/docmind-markdown`)."
|
|
@@ -811,7 +818,7 @@ function buildNodeCapabilityReport(kind) {
|
|
|
811
818
|
"extractStructuredData wraps UTF-8 decode in normalizeToStructuredResult (rollup text only unless you add blocks upstream)."
|
|
812
819
|
]),
|
|
813
820
|
slot("markdown", true, [
|
|
814
|
-
"extractMarkdown: rollup/decoded text \u2192 Markdown (mostly paragraphs; no layout without upstream blocks)."
|
|
821
|
+
"extractMarkdown: rollup/decoded text \u2192 Markdown (mostly paragraphs; no layout without upstream blocks). If filename/MIME indicate `.html`, `.csv`, `.xlsx`/`.xls`, the markdown package routes to Cognipeer specialized converters on Node."
|
|
815
822
|
]),
|
|
816
823
|
slot("llm-text", true, ["extractLlmContent: rollup \u2192 LLM plain text via the same envelope."]),
|
|
817
824
|
slot("structured-chunks", true, [
|
|
@@ -1080,7 +1087,7 @@ function buildNodeExplainReport(kind, intent, pdfOcr, plan, docxInclude, ocrSlic
|
|
|
1080
1087
|
limitations = [
|
|
1081
1088
|
...limitations,
|
|
1082
1089
|
...lim(
|
|
1083
|
-
"Output: Markdown string from `@dragon708/docmind-markdown` extractMarkdown (PDF
|
|
1090
|
+
"Output: Markdown string from `@dragon708/docmind-markdown` extractMarkdown (PDF, DOCX, HTML, CSV, Excel specialized routes on Node when bytes match; structured serializer as `structuredFallback`)."
|
|
1084
1091
|
)
|
|
1085
1092
|
];
|
|
1086
1093
|
} else if (intent === "extractLlmContent") {
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@dragon708/docmind-node",
|
|
3
|
-
"version": "1.13.2",
|
|
3
|
+
"version": "1.13.4",
|
|
4
4
|
"description": "Official DocMind Node facade: analyzeFile, intent APIs, PDF/DOCX/OCR, and fs helpers.",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"main": "./dist/index.js",
|
|
@@ -33,7 +33,7 @@
|
|
|
33
33
|
"license": "MIT",
|
|
34
34
|
"dependencies": {
|
|
35
35
|
"@dragon708/docmind-docx": "^1.8.0",
|
|
36
|
-
"@dragon708/docmind-markdown": "^1.2.
|
|
36
|
+
"@dragon708/docmind-markdown": "^1.2.8",
|
|
37
37
|
"@dragon708/docmind-ocr": "^1.1.4",
|
|
38
38
|
"@dragon708/docmind-pdf": "^2.2.0",
|
|
39
39
|
"@dragon708/docmind-shared": "^1.2.0"
|