@dragon708/docmind-node 1.11.0 → 1.12.1
This diff shows the changes between publicly available versions of this package as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
- package/dist/index.d.ts +21 -9
- package/dist/index.js +57 -15
- package/package.json +2 -2
package/dist/index.d.ts
CHANGED
|
@@ -6,7 +6,7 @@ import { AnalyzeDocxIncludeFlags, DocxToHtmlOptions, ExtractStructuredDataFromDo
|
|
|
6
6
|
export { AnalyzeDocxIncludeFlags, DocxEmbeddedImage, DocxEmbeddedImageConversionOptions, DocxEmbeddedImageConversionResult, DocxEmbeddedImageWebSlice, DocxImageExtractionMode, ExtractImagesFromDocxOptions, ExtractImagesFromDocxResult, ExtractStructuredDataFromDocxOptions, convertDocxEmbeddedImageToWeb, convertDocxImagesForWeb, docxImageIsBrowserRenderable, docxImageRequiresConversion, docxImageSuggestedTargetFormat, docxImageToDataUri, extractImagesFromDocx, extractStructuredDataFromDocx, isNodeJsRuntime } from '@dragon708/docmind-docx';
|
|
7
7
|
import { PdfAnalyzeOptions, ExtractStructuredDataFromPdfOptions } from '@dragon708/docmind-pdf';
|
|
8
8
|
export { ExtractStructuredDataFromPdfIncludeFlags, ExtractStructuredDataFromPdfOptions, extractStructuredDataFromPdf } from '@dragon708/docmind-pdf';
|
|
9
|
-
import { RenderLlmTextOptions, RenderMarkdownOptions, RenderMarkdownSectionsOptions, MarkdownSection } from '@dragon708/docmind-markdown';
|
|
9
|
+
import { RenderLlmTextOptions, RenderMarkdownOptions, ConvertDocxToMarkdownOptions, ConvertPdfToMarkdownOptions, RenderMarkdownSectionsOptions, MarkdownSection } from '@dragon708/docmind-markdown';
|
|
10
10
|
export { MarkdownSection } from '@dragon708/docmind-markdown';
|
|
11
11
|
|
|
12
12
|
/**
|
|
@@ -53,10 +53,16 @@ interface NodeExtractStructuredDataOptions extends DocMindAnalyzeOptions {
|
|
|
53
53
|
readonly normalize?: NormalizeStructuredOptions;
|
|
54
54
|
}
|
|
55
55
|
/**
|
|
56
|
-
* {@link extractMarkdown}: inherits {@link NodeExtractStructuredDataOptions};
|
|
56
|
+
* {@link extractMarkdown}: inherits {@link NodeExtractStructuredDataOptions}; `markdown` maps to structured-serializer
|
|
57
|
+
* options passed through `extractMarkdown` in `@dragon708/docmind-markdown`. `markdownDocx` / `markdownPdf`
|
|
58
|
+
* configure Mammoth→Turndown and `@opendataloader/pdf` respectively — separate from `docx` / `pdf` used only by {@link extractStructuredData}.
|
|
57
59
|
*/
|
|
58
60
|
interface NodeExtractMarkdownOptions extends NodeExtractStructuredDataOptions {
|
|
59
61
|
readonly markdown?: RenderMarkdownOptions;
|
|
62
|
+
/** Forwarded to `extractMarkdown` → `convertDocxToMarkdown` when bytes are DOCX (Node). */
|
|
63
|
+
readonly markdownDocx?: ConvertDocxToMarkdownOptions;
|
|
64
|
+
/** Forwarded to `extractMarkdown` → `convertPdfToMarkdown` when bytes are PDF. */
|
|
65
|
+
readonly markdownPdf?: ConvertPdfToMarkdownOptions;
|
|
60
66
|
}
|
|
61
67
|
/**
|
|
62
68
|
* {@link extractLlmContent}: same structured fields; `llm` forwards to `renderLlmText`.
|
|
@@ -125,20 +131,26 @@ declare function runOcr(input: NodeAnalyzeInput, options?: NodeAnalyzeOptions):
|
|
|
125
131
|
declare function extractStructuredData(input: NodeAnalyzeInput, options?: NodeExtractStructuredDataOptions): Promise<StructuredDocumentResult>;
|
|
126
132
|
|
|
127
133
|
/**
|
|
128
|
-
* End-to-end:
|
|
134
|
+
* End-to-end: {@link extractStructuredData} (for fallback + option parity) plus
|
|
135
|
+
* `extractMarkdown` from `@dragon708/docmind-markdown` on `{ data, filename, mimeType }`.
|
|
136
|
+
* On Node, PDF/DOCX bytes use specialized routes (`@opendataloader/pdf`, Mammoth+Turndown) when detection matches;
|
|
137
|
+
* the structured envelope is always passed as `structuredFallback`.
|
|
129
138
|
*
|
|
130
139
|
* @param input - Path, buffer, or {@link NodeAnalyzeInput} accepted by the Node facade.
|
|
131
|
-
* @param options - Structured routing (`pdf` / `docx` / `ocr` / `normalize`)
|
|
140
|
+
* @param options - Structured routing (`pdf` / `docx` / `ocr` / `normalize`), optional `markdown` serializer knobs,
|
|
141
|
+
* and optional `markdownDocx` / `markdownPdf` for the binary Markdown pipelines (distinct from structured-only `docx` / `pdf`).
|
|
132
142
|
*/
|
|
133
143
|
declare function extractMarkdown(input: NodeAnalyzeInput, options?: NodeExtractMarkdownOptions): Promise<string>;
|
|
134
144
|
/**
|
|
135
|
-
*
|
|
145
|
+
* {@link extractStructuredData} then `renderLlmText` in `@dragon708/docmind-markdown` (tagged plain text for LLMs).
|
|
146
|
+
* That package's `extractLlmContent` is the same transform on an in-memory structured result only.
|
|
136
147
|
*
|
|
137
|
-
* @param options -
|
|
148
|
+
* @param options - Structured routing plus optional `llm` passed through to `renderLlmText`.
|
|
138
149
|
*/
|
|
139
150
|
declare function extractLlmContent(input: NodeAnalyzeInput, options?: NodeExtractLlmContentOptions): Promise<string>;
|
|
140
151
|
/**
|
|
141
|
-
* Structured extract →
|
|
152
|
+
* Structured extract → {@link renderMarkdownSections} (`splitStructuredIntoChunks` with `includeMarkdown: true`;
|
|
153
|
+
* same layer as `extractStructuredChunks` / `splitStructuredIntoChunks` in `@dragon708/docmind-markdown`).
|
|
142
154
|
*
|
|
143
155
|
* @param options - Optional `chunks` (e.g. `maxChars`, `preferHeadings`) from `@dragon708/docmind-markdown`.
|
|
144
156
|
*/
|
|
@@ -146,7 +158,7 @@ declare function extractStructuredChunks(input: NodeAnalyzeInput, options?: Node
|
|
|
146
158
|
|
|
147
159
|
/** High-level features the user can ask DocMind for (per input kind and runtime). */
|
|
148
160
|
type PublicCapabilityId = "text" | "metadata" | "html" | "ocr" | "pages" | "structured-output"
|
|
149
|
-
/** Node: {@link extractMarkdown}
|
|
161
|
+
/** Node: {@link extractMarkdown} — hybrid `extractMarkdown` in `@dragon708/docmind-markdown` (binary PDF/DOCX routes + structured fallback). */
|
|
150
162
|
| "markdown"
|
|
151
163
|
/** Node: {@link extractLlmContent} (LLM-oriented plain text). */
|
|
152
164
|
| "llm-text"
|
|
@@ -248,7 +260,7 @@ type NodeExplainAnalysisPlanOptions = Omit<ExplainAnalysisPlanOptions, "intent">
|
|
|
248
260
|
|
|
249
261
|
/**
|
|
250
262
|
* Epic 1 — **Capabilities:** after resolving {@link NodeAnalyzeInput}, lists which of
|
|
251
|
-
* `text` | `metadata` | `html` | `ocr` | `pages` | `structured-output` | `markdown` | `llm-text` | `structured-chunks`
|
|
263
|
+
* `text` | `metadata` | `html` | `ocr` | `pages` | `structured-output` | `markdown` (hybrid package extract on Node) | `llm-text` | `structured-chunks` (split + Markdown sections)
|
|
252
264
|
* (and image-specific ids such as `ocr-multipage`, `image-normalization`, `tiff`, `heic-node-only`, `bmp`, `gif-first-frame`) apply for that kind in Node (for PDF, `text` / `metadata` /
|
|
253
265
|
* `pages` / `ocr` describe the v2 pdf-parse + PDF.js + OCR stack; see {@link buildNodeCapabilityReport}).
|
|
254
266
|
* For **DOCX**, `docxEmbeddedImages` and `docxStructure` describe ZIP media and optional OOXML v2 extractors (`options.docx.include`).
|
package/dist/index.js
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import { assertValidAnalyzeFileInput, detectFileKind, normalizeToStructuredResult, UNKNOWN_FORMAT_WARNING, analyzeText, notImplementedResult, analyzeFileRequestsStructured,
|
|
1
|
+
import { assertValidAnalyzeFileInput, detectFileKind, normalizeToStructuredResult, UNKNOWN_FORMAT_WARNING, analyzeText, notImplementedResult, analyzeFileRequestsStructured, isNamedInput, toUint8Array, isBinaryInput, isBlob, isFile, getMimeType } from '@dragon708/docmind-shared';
|
|
2
2
|
export { analyzeFileRequestsStructured, detectFileKind, isStructuredDocumentResult } from '@dragon708/docmind-shared';
|
|
3
3
|
import { extractStructuredDataFromDocx, analyzeDocx } from '@dragon708/docmind-docx';
|
|
4
4
|
export { convertDocxEmbeddedImageToWeb, convertDocxImagesForWeb, docxImageIsBrowserRenderable, docxImageRequiresConversion, docxImageSuggestedTargetFormat, docxImageToDataUri, extractImagesFromDocx, extractStructuredDataFromDocx, isNodeJsRuntime } from '@dragon708/docmind-docx';
|
|
@@ -9,7 +9,7 @@ export { extractStructuredDataFromPdf } from '@dragon708/docmind-pdf';
|
|
|
9
9
|
import { readFile } from 'fs/promises';
|
|
10
10
|
import { basename } from 'path';
|
|
11
11
|
import { fileURLToPath } from 'url';
|
|
12
|
-
import {
|
|
12
|
+
import { extractMarkdown as extractMarkdown$1, renderLlmText, renderMarkdownSections } from '@dragon708/docmind-markdown';
|
|
13
13
|
|
|
14
14
|
// src/analyze.ts
|
|
15
15
|
|
|
@@ -620,9 +620,26 @@ async function runOcr(input, options) {
|
|
|
620
620
|
}
|
|
621
621
|
async function extractMarkdown(input, options) {
|
|
622
622
|
throwIfAborted(options?.signal);
|
|
623
|
-
const { markdown: markdownOpts, ...structuredOpts } = options ?? {};
|
|
624
|
-
const
|
|
625
|
-
|
|
623
|
+
const { markdown: markdownOpts, markdownDocx, markdownPdf, ...structuredOpts } = options ?? {};
|
|
624
|
+
const resolved = await resolveNodeAnalyzeInput(input);
|
|
625
|
+
const structured = await extractStructuredData(resolved, structuredOpts);
|
|
626
|
+
const data = await bytesFromDetectInput(resolved);
|
|
627
|
+
let filename;
|
|
628
|
+
let mimeType;
|
|
629
|
+
if (isNamedInput(resolved)) {
|
|
630
|
+
filename = resolved.name;
|
|
631
|
+
mimeType = resolved.mimeType;
|
|
632
|
+
}
|
|
633
|
+
const r = await extractMarkdown$1(
|
|
634
|
+
{ data, filename, mimeType },
|
|
635
|
+
{
|
|
636
|
+
...markdownOpts ?? {},
|
|
637
|
+
...markdownDocx !== void 0 ? { docx: markdownDocx } : {},
|
|
638
|
+
...markdownPdf !== void 0 ? { pdf: markdownPdf } : {},
|
|
639
|
+
structuredFallback: structured
|
|
640
|
+
}
|
|
641
|
+
);
|
|
642
|
+
return r.markdown;
|
|
626
643
|
}
|
|
627
644
|
async function extractLlmContent(input, options) {
|
|
628
645
|
throwIfAborted(options?.signal);
|
|
@@ -700,13 +717,13 @@ function buildNodeCapabilityReport(kind) {
|
|
|
700
717
|
"extractStructuredData: analyzePdf + PDF.js per-page text, outline, links, annotations, merged via normalizeToStructuredResult; respects pdf.ocr like analyzeFile."
|
|
701
718
|
]),
|
|
702
719
|
slot("markdown", true, [
|
|
703
|
-
"extractMarkdown:
|
|
720
|
+
"extractMarkdown: `@dragon708/docmind-markdown` hybrid \u2014 PDF bytes prefer `@opendataloader/pdf` \u2192 Markdown; structured PDF (`extractStructuredData`, respects pdf.ocr) is always built as fallback and for non-binary inputs."
|
|
704
721
|
]),
|
|
705
722
|
slot("llm-text", true, [
|
|
706
|
-
"extractLlmContent: structured \u2192
|
|
723
|
+
"extractLlmContent: structured envelope \u2192 LLM-oriented plain text (`renderLlmText` in `@dragon708/docmind-markdown`)."
|
|
707
724
|
]),
|
|
708
725
|
slot("structured-chunks", true, [
|
|
709
|
-
"extractStructuredChunks: structured \u2192 Markdown
|
|
726
|
+
"extractStructuredChunks: structured \u2192 `renderMarkdownSections` (splitStructuredIntoChunks + Markdown per slice; heading-aware chunking)."
|
|
710
727
|
])
|
|
711
728
|
];
|
|
712
729
|
break;
|
|
@@ -729,7 +746,7 @@ function buildNodeCapabilityReport(kind) {
|
|
|
729
746
|
"extractStructuredData runs analyzeDocx with merged OOXML includes (blocks, tables, headings, embedded images, etc.) into StructuredDocumentResult."
|
|
730
747
|
]),
|
|
731
748
|
slot("markdown", true, [
|
|
732
|
-
"extractMarkdown:
|
|
749
|
+
"extractMarkdown: hybrid \u2014 DOCX bytes use Mammoth\u2192Turndown on Node; structured DOCX (`extractStructuredData`, options.docx.include) is always built as fallback."
|
|
733
750
|
]),
|
|
734
751
|
slot("llm-text", true, ["extractLlmContent: structured \u2192 LLM-oriented plain text."]),
|
|
735
752
|
slot("structured-chunks", true, [
|
|
@@ -1023,7 +1040,7 @@ function buildNodeExplainReport(kind, intent, pdfOcr, plan, docxInclude, ocrSlic
|
|
|
1023
1040
|
if (kind === "pdf") {
|
|
1024
1041
|
nativeExtraction = {
|
|
1025
1042
|
willAttempt: true,
|
|
1026
|
-
description: intent === "extractStructuredData" ? "extractStructuredData: analyzePdf plus PDF.js page rows, outline, links, and annotations, normalized to StructuredDocumentResult." : `${intent}: same structured PDF pipeline as extractStructuredData, then \`@dragon708/docmind-markdown
|
|
1043
|
+
description: intent === "extractStructuredData" ? "extractStructuredData: analyzePdf plus PDF.js page rows, outline, links, and annotations, normalized to StructuredDocumentResult." : intent === "extractMarkdown" ? "extractMarkdown: structured PDF extract (same as extractStructuredData) for fallback; primary Markdown from `@dragon708/docmind-markdown` tries `@opendataloader/pdf` on PDF bytes when possible." : `${intent}: same structured PDF pipeline as extractStructuredData, then \`@dragon708/docmind-markdown\` (renderLlmText or chunk sections).`
|
|
1027
1044
|
};
|
|
1028
1045
|
ocr = {
|
|
1029
1046
|
mayUse: pdfOcr !== "off",
|
|
@@ -1032,7 +1049,7 @@ function buildNodeExplainReport(kind, intent, pdfOcr, plan, docxInclude, ocrSlic
|
|
|
1032
1049
|
} else if (kind === "docx") {
|
|
1033
1050
|
nativeExtraction = {
|
|
1034
1051
|
willAttempt: true,
|
|
1035
|
-
description: intent === "extractStructuredData" ? "extractStructuredData: Mammoth plus merged OOXML extractors (structure, headings, tables, blocks, pagesApprox, embeddedImages) in one envelope." : `${intent}: same structured DOCX pipeline as extractStructuredData, then \`@dragon708/docmind-markdown\`.`
|
|
1052
|
+
description: intent === "extractStructuredData" ? "extractStructuredData: Mammoth plus merged OOXML extractors (structure, headings, tables, blocks, pagesApprox, embeddedImages) in one envelope." : intent === "extractMarkdown" ? "extractMarkdown: structured DOCX envelope for fallback; primary Markdown from Mammoth\u2192Turndown on DOCX bytes when possible (`@dragon708/docmind-markdown`)." : `${intent}: same structured DOCX pipeline as extractStructuredData, then \`@dragon708/docmind-markdown\`.`
|
|
1036
1053
|
};
|
|
1037
1054
|
ocr = { mayUse: false, description: "DOCX does not use OCR." };
|
|
1038
1055
|
} else if (kind === "image") {
|
|
@@ -1055,7 +1072,9 @@ function buildNodeExplainReport(kind, intent, pdfOcr, plan, docxInclude, ocrSlic
|
|
|
1055
1072
|
if (intent === "extractMarkdown") {
|
|
1056
1073
|
limitations = [
|
|
1057
1074
|
...limitations,
|
|
1058
|
-
...lim(
|
|
1075
|
+
...lim(
|
|
1076
|
+
"Output: Markdown string from `@dragon708/docmind-markdown` extractMarkdown (PDF/DOCX binary routes on Node when applicable; structured serializer as fallback)."
|
|
1077
|
+
)
|
|
1059
1078
|
];
|
|
1060
1079
|
} else if (intent === "extractLlmContent") {
|
|
1061
1080
|
limitations = [
|
|
@@ -1066,7 +1085,7 @@ function buildNodeExplainReport(kind, intent, pdfOcr, plan, docxInclude, ocrSlic
|
|
|
1066
1085
|
limitations = [
|
|
1067
1086
|
...limitations,
|
|
1068
1087
|
...lim(
|
|
1069
|
-
"Output: MarkdownSection[] via renderMarkdownSections (splitStructuredIntoChunks
|
|
1088
|
+
"Output: MarkdownSection[] via renderMarkdownSections (splitStructuredIntoChunks / extractStructuredChunks alias in `@dragon708/docmind-markdown`)."
|
|
1070
1089
|
)
|
|
1071
1090
|
];
|
|
1072
1091
|
}
|
|
@@ -1337,7 +1356,27 @@ function planForIntent(intentOpt, kind, pdfOcrForAnalyze, docxInclude, ocr, anal
|
|
|
1337
1356
|
};
|
|
1338
1357
|
}
|
|
1339
1358
|
}
|
|
1340
|
-
if (intent === "extractMarkdown"
|
|
1359
|
+
if (intent === "extractMarkdown") {
|
|
1360
|
+
const sub = planForIntent(
|
|
1361
|
+
"extractStructuredData",
|
|
1362
|
+
kind,
|
|
1363
|
+
pdfOcrForAnalyze,
|
|
1364
|
+
docxInclude,
|
|
1365
|
+
ocr,
|
|
1366
|
+
analyzeFileOutput
|
|
1367
|
+
);
|
|
1368
|
+
return {
|
|
1369
|
+
intent,
|
|
1370
|
+
steps: [
|
|
1371
|
+
...sub.steps ?? [],
|
|
1372
|
+
{
|
|
1373
|
+
id: "markdown_hybrid_package",
|
|
1374
|
+
status: "planned"
|
|
1375
|
+
}
|
|
1376
|
+
]
|
|
1377
|
+
};
|
|
1378
|
+
}
|
|
1379
|
+
if (intent === "extractLlmContent" || intent === "extractStructuredChunks") {
|
|
1341
1380
|
const sub = planForIntent(
|
|
1342
1381
|
"extractStructuredData",
|
|
1343
1382
|
kind,
|
|
@@ -1350,7 +1389,10 @@ function planForIntent(intentOpt, kind, pdfOcrForAnalyze, docxInclude, ocr, anal
|
|
|
1350
1389
|
intent,
|
|
1351
1390
|
steps: [
|
|
1352
1391
|
...sub.steps ?? [],
|
|
1353
|
-
{
|
|
1392
|
+
{
|
|
1393
|
+
id: intent === "extractLlmContent" ? "docmind_markdown_llm_text" : "docmind_markdown_chunk_sections",
|
|
1394
|
+
status: "planned"
|
|
1395
|
+
}
|
|
1354
1396
|
]
|
|
1355
1397
|
};
|
|
1356
1398
|
}
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@dragon708/docmind-node",
|
|
3
|
-
"version": "1.
|
|
3
|
+
"version": "1.12.1",
|
|
4
4
|
"description": "Official DocMind Node facade: analyzeFile, intent APIs, PDF/DOCX/OCR, and fs helpers.",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"main": "./dist/index.js",
|
|
@@ -33,7 +33,7 @@
|
|
|
33
33
|
"license": "MIT",
|
|
34
34
|
"dependencies": {
|
|
35
35
|
"@dragon708/docmind-docx": "^1.8.0",
|
|
36
|
-
"@dragon708/docmind-markdown": "^1.
|
|
36
|
+
"@dragon708/docmind-markdown": "^1.1.1",
|
|
37
37
|
"@dragon708/docmind-ocr": "^1.1.4",
|
|
38
38
|
"@dragon708/docmind-pdf": "^2.2.0",
|
|
39
39
|
"@dragon708/docmind-shared": "^1.2.0"
|