@dragon708/docmind-node 1.12.3 → 1.13.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.d.ts +21 -6
- package/dist/index.js +10 -3
- package/package.json +2 -2
package/dist/index.d.ts
CHANGED
|
@@ -6,7 +6,7 @@ import { AnalyzeDocxIncludeFlags, DocxToHtmlOptions, ExtractStructuredDataFromDo
|
|
|
6
6
|
export { AnalyzeDocxIncludeFlags, DocxEmbeddedImage, DocxEmbeddedImageConversionOptions, DocxEmbeddedImageConversionResult, DocxEmbeddedImageWebSlice, DocxImageExtractionMode, ExtractImagesFromDocxOptions, ExtractImagesFromDocxResult, ExtractStructuredDataFromDocxOptions, convertDocxEmbeddedImageToWeb, convertDocxImagesForWeb, docxImageIsBrowserRenderable, docxImageRequiresConversion, docxImageSuggestedTargetFormat, docxImageToDataUri, extractImagesFromDocx, extractStructuredDataFromDocx, isNodeJsRuntime } from '@dragon708/docmind-docx';
|
|
7
7
|
import { PdfAnalyzeOptions, ExtractStructuredDataFromPdfOptions } from '@dragon708/docmind-pdf';
|
|
8
8
|
export { ExtractStructuredDataFromPdfIncludeFlags, ExtractStructuredDataFromPdfOptions, extractStructuredDataFromPdf } from '@dragon708/docmind-pdf';
|
|
9
|
-
import { RenderLlmTextOptions, RenderMarkdownOptions, ConvertDocxToMarkdownOptions, ConvertPdfToMarkdownOptions, RenderMarkdownSectionsOptions, MarkdownSection } from '@dragon708/docmind-markdown';
|
|
9
|
+
import { RenderLlmTextOptions, RenderMarkdownOptions, ConvertDocxToMarkdownOptions, ConvertPdfToMarkdownOptions, ExtractMarkdownStrategy, RenderMarkdownSectionsOptions, MarkdownSection } from '@dragon708/docmind-markdown';
|
|
10
10
|
export { MarkdownSection } from '@dragon708/docmind-markdown';
|
|
11
11
|
|
|
12
12
|
/**
|
|
@@ -55,14 +55,22 @@ interface NodeExtractStructuredDataOptions extends DocMindAnalyzeOptions {
|
|
|
55
55
|
/**
|
|
56
56
|
* {@link extractMarkdown}: inherits {@link NodeExtractStructuredDataOptions}; `markdown` maps to structured-serializer
|
|
57
57
|
* options passed through `extractMarkdown` in `@dragon708/docmind-markdown`. `markdownDocx` / `markdownPdf`
|
|
58
|
-
* configure Mammoth→Turndown and `@
|
|
58
|
+
* configure Mammoth→Turndown and `@cognipeer/to-markdown` (`convertPdfToMarkdown`) respectively — separate from `docx` / `pdf` used only by {@link extractStructuredData}.
|
|
59
59
|
*/
|
|
60
60
|
interface NodeExtractMarkdownOptions extends NodeExtractStructuredDataOptions {
|
|
61
61
|
readonly markdown?: RenderMarkdownOptions;
|
|
62
62
|
/** Forwarded to `extractMarkdown` → `convertDocxToMarkdown` when bytes are DOCX (Node). */
|
|
63
63
|
readonly markdownDocx?: ConvertDocxToMarkdownOptions;
|
|
64
|
-
/** Forwarded to `extractMarkdown` → `convertPdfToMarkdown` when bytes are PDF. */
|
|
64
|
+
/** Forwarded to `extractMarkdown` → `convertPdfToMarkdown` when bytes are PDF (Node; `@cognipeer/to-markdown`). */
|
|
65
65
|
readonly markdownPdf?: ConvertPdfToMarkdownOptions;
|
|
66
|
+
/**
|
|
67
|
+
* Optional: receive `strategy` and merged `warnings` from `@dragon708/docmind-markdown` `extractMarkdown`
|
|
68
|
+
* (e.g. `pdf-cognipeer-specialized` vs `pdf-structured-fallback`, `docx-mammoth` vs `docx-structured-fallback`) without changing the `Promise<string>` return type.
|
|
69
|
+
*/
|
|
70
|
+
readonly onMarkdownExtract?: (info: {
|
|
71
|
+
strategy: ExtractMarkdownStrategy;
|
|
72
|
+
warnings: readonly string[];
|
|
73
|
+
}) => void;
|
|
66
74
|
}
|
|
67
75
|
/**
|
|
68
76
|
* {@link extractLlmContent}: same structured fields; `llm` forwards to `renderLlmText`.
|
|
@@ -133,8 +141,14 @@ declare function extractStructuredData(input: NodeAnalyzeInput, options?: NodeEx
|
|
|
133
141
|
/**
|
|
134
142
|
* End-to-end: {@link extractStructuredData} (for fallback + option parity) plus
|
|
135
143
|
* `extractMarkdown` from `@dragon708/docmind-markdown` on `{ data, filename, mimeType }`.
|
|
136
|
-
*
|
|
137
|
-
*
|
|
144
|
+
*
|
|
145
|
+
* On Node, **DOCX** bytes use **Mammoth → Turndown**; **PDF** bytes use **`convertPdfToMarkdown`**
|
|
146
|
+
* (`@cognipeer/to-markdown`, Node-first, no JVM). The structured envelope is always passed as `structuredFallback`
|
|
147
|
+
* so specialized routes can fall back to structured → Markdown serialization with clear warnings when needed.
|
|
148
|
+
*
|
|
149
|
+
* {@link NodeExtractMarkdownOptions.onMarkdownExtract} receives `strategy` and merged `warnings` from the markdown
|
|
150
|
+
* package (for example `pdf-cognipeer-specialized`, `pdf-structured-fallback`, `docx-structured-fallback`, and
|
|
151
|
+
* `[docmind-markdown:extractMarkdown]` trace lines).
|
|
138
152
|
*
|
|
139
153
|
* @param input - Path, buffer, or {@link NodeAnalyzeInput} accepted by the Node facade.
|
|
140
154
|
* @param options - Structured routing (`pdf` / `docx` / `ocr` / `normalize`), optional `markdown` serializer knobs,
|
|
@@ -143,7 +157,8 @@ declare function extractStructuredData(input: NodeAnalyzeInput, options?: NodeEx
|
|
|
143
157
|
declare function extractMarkdown(input: NodeAnalyzeInput, options?: NodeExtractMarkdownOptions): Promise<string>;
|
|
144
158
|
/**
|
|
145
159
|
* {@link extractStructuredData} then `renderLlmText` in `@dragon708/docmind-markdown` (tagged plain text for LLMs).
|
|
146
|
-
*
|
|
160
|
+
* This path **does not** run the binary PDF/DOCX Markdown pipelines (no `@cognipeer/to-markdown` / Mammoth→Turndown here);
|
|
161
|
+
* it linearizes the structured envelope only. Use {@link extractMarkdown} when you need specialized PDF or DOCX Markdown layout.
|
|
147
162
|
*
|
|
148
163
|
* @param options - Structured routing plus optional `llm` passed through to `renderLlmText`.
|
|
149
164
|
*/
|
package/dist/index.js
CHANGED
|
@@ -620,7 +620,13 @@ async function runOcr(input, options) {
|
|
|
620
620
|
}
|
|
621
621
|
async function extractMarkdown(input, options) {
|
|
622
622
|
throwIfAborted(options?.signal);
|
|
623
|
-
const {
|
|
623
|
+
const {
|
|
624
|
+
markdown: markdownOpts,
|
|
625
|
+
markdownDocx,
|
|
626
|
+
markdownPdf,
|
|
627
|
+
onMarkdownExtract,
|
|
628
|
+
...structuredOpts
|
|
629
|
+
} = options ?? {};
|
|
624
630
|
const resolved = await resolveNodeAnalyzeInput(input);
|
|
625
631
|
const structured = await extractStructuredData(resolved, structuredOpts);
|
|
626
632
|
const data = await bytesFromDetectInput(resolved);
|
|
@@ -639,6 +645,7 @@ async function extractMarkdown(input, options) {
|
|
|
639
645
|
structuredFallback: structured
|
|
640
646
|
}
|
|
641
647
|
);
|
|
648
|
+
onMarkdownExtract?.({ strategy: r.strategy, warnings: r.warnings });
|
|
642
649
|
return r.markdown;
|
|
643
650
|
}
|
|
644
651
|
async function extractLlmContent(input, options) {
|
|
@@ -717,7 +724,7 @@ function buildNodeCapabilityReport(kind) {
|
|
|
717
724
|
"extractStructuredData: analyzePdf + PDF.js per-page text, outline, links, annotations, merged via normalizeToStructuredResult; respects pdf.ocr like analyzeFile."
|
|
718
725
|
]),
|
|
719
726
|
slot("markdown", true, [
|
|
720
|
-
"extractMarkdown: `@dragon708/docmind-markdown` hybrid \u2014 PDF bytes
|
|
727
|
+
"extractMarkdown: `@dragon708/docmind-markdown` hybrid \u2014 PDF bytes use `@cognipeer/to-markdown` (Node-first, no Java); structured PDF (`extractStructuredData`, respects pdf.ocr) is always built as fallback."
|
|
721
728
|
]),
|
|
722
729
|
slot("llm-text", true, [
|
|
723
730
|
"extractLlmContent: structured envelope \u2192 LLM-oriented plain text (`renderLlmText` in `@dragon708/docmind-markdown`)."
|
|
@@ -1040,7 +1047,7 @@ function buildNodeExplainReport(kind, intent, pdfOcr, plan, docxInclude, ocrSlic
|
|
|
1040
1047
|
if (kind === "pdf") {
|
|
1041
1048
|
nativeExtraction = {
|
|
1042
1049
|
willAttempt: true,
|
|
1043
|
-
description: intent === "extractStructuredData" ? "extractStructuredData: analyzePdf plus PDF.js page rows, outline, links, and annotations, normalized to StructuredDocumentResult." : intent === "extractMarkdown" ? "extractMarkdown: structured PDF extract (same as extractStructuredData) for fallback; primary Markdown
|
|
1050
|
+
description: intent === "extractStructuredData" ? "extractStructuredData: analyzePdf plus PDF.js page rows, outline, links, and annotations, normalized to StructuredDocumentResult." : intent === "extractMarkdown" ? "extractMarkdown: structured PDF extract (same as extractStructuredData) for fallback; primary Markdown on PDF bytes uses `@cognipeer/to-markdown` via `@dragon708/docmind-markdown` (Node-first, no Java)." : `${intent}: same structured PDF pipeline as extractStructuredData, then \`@dragon708/docmind-markdown\` (renderLlmText or chunk sections).`
|
|
1044
1051
|
};
|
|
1045
1052
|
ocr = {
|
|
1046
1053
|
mayUse: pdfOcr !== "off",
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@dragon708/docmind-node",
|
|
3
|
-
"version": "1.12.3",
|
|
3
|
+
"version": "1.13.1",
|
|
4
4
|
"description": "Official DocMind Node facade: analyzeFile, intent APIs, PDF/DOCX/OCR, and fs helpers.",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"main": "./dist/index.js",
|
|
@@ -33,7 +33,7 @@
|
|
|
33
33
|
"license": "MIT",
|
|
34
34
|
"dependencies": {
|
|
35
35
|
"@dragon708/docmind-docx": "^1.8.0",
|
|
36
|
-
"@dragon708/docmind-markdown": "^1.1
|
|
36
|
+
"@dragon708/docmind-markdown": "^1.2.1",
|
|
37
37
|
"@dragon708/docmind-ocr": "^1.1.4",
|
|
38
38
|
"@dragon708/docmind-pdf": "^2.2.0",
|
|
39
39
|
"@dragon708/docmind-shared": "^1.2.0"
|