@dragon708/docmind-node 1.10.0 → 1.12.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.d.ts +67 -4
- package/dist/index.js +137 -7
- package/package.json +2 -1
package/dist/index.d.ts
CHANGED
|
@@ -6,6 +6,8 @@ import { AnalyzeDocxIncludeFlags, DocxToHtmlOptions, ExtractStructuredDataFromDo
|
|
|
6
6
|
export { AnalyzeDocxIncludeFlags, DocxEmbeddedImage, DocxEmbeddedImageConversionOptions, DocxEmbeddedImageConversionResult, DocxEmbeddedImageWebSlice, DocxImageExtractionMode, ExtractImagesFromDocxOptions, ExtractImagesFromDocxResult, ExtractStructuredDataFromDocxOptions, convertDocxEmbeddedImageToWeb, convertDocxImagesForWeb, docxImageIsBrowserRenderable, docxImageRequiresConversion, docxImageSuggestedTargetFormat, docxImageToDataUri, extractImagesFromDocx, extractStructuredDataFromDocx, isNodeJsRuntime } from '@dragon708/docmind-docx';
|
|
7
7
|
import { PdfAnalyzeOptions, ExtractStructuredDataFromPdfOptions } from '@dragon708/docmind-pdf';
|
|
8
8
|
export { ExtractStructuredDataFromPdfIncludeFlags, ExtractStructuredDataFromPdfOptions, extractStructuredDataFromPdf } from '@dragon708/docmind-pdf';
|
|
9
|
+
import { RenderLlmTextOptions, RenderMarkdownOptions, ConvertDocxToMarkdownOptions, ConvertPdfToMarkdownOptions, RenderMarkdownSectionsOptions, MarkdownSection } from '@dragon708/docmind-markdown';
|
|
10
|
+
export { MarkdownSection } from '@dragon708/docmind-markdown';
|
|
9
11
|
|
|
10
12
|
/**
|
|
11
13
|
* Options for Node public APIs (`analyzeFile`, intent methods).
|
|
@@ -50,6 +52,30 @@ interface NodeExtractStructuredDataOptions extends DocMindAnalyzeOptions {
|
|
|
50
52
|
readonly ocr?: ExtractStructuredDataFromImageOptions;
|
|
51
53
|
readonly normalize?: NormalizeStructuredOptions;
|
|
52
54
|
}
|
|
55
|
+
/**
|
|
56
|
+
* {@link extractMarkdown}: inherits {@link NodeExtractStructuredDataOptions}; `markdown` maps to structured-serializer
|
|
57
|
+
* options passed through `extractMarkdown` in `@dragon708/docmind-markdown`. `markdownDocx` / `markdownPdf`
|
|
58
|
+
* configure Mammoth→Turndown and `@opendataloader/pdf` respectively — separate from `docx` / `pdf` used only by {@link extractStructuredData}.
|
|
59
|
+
*/
|
|
60
|
+
interface NodeExtractMarkdownOptions extends NodeExtractStructuredDataOptions {
|
|
61
|
+
readonly markdown?: RenderMarkdownOptions;
|
|
62
|
+
/** Forwarded to `extractMarkdown` → `convertDocxToMarkdown` when bytes are DOCX (Node). */
|
|
63
|
+
readonly markdownDocx?: ConvertDocxToMarkdownOptions;
|
|
64
|
+
/** Forwarded to `extractMarkdown` → `convertPdfToMarkdown` when bytes are PDF. */
|
|
65
|
+
readonly markdownPdf?: ConvertPdfToMarkdownOptions;
|
|
66
|
+
}
|
|
67
|
+
/**
|
|
68
|
+
* {@link extractLlmContent}: same structured fields; `llm` forwards to `renderLlmText`.
|
|
69
|
+
*/
|
|
70
|
+
interface NodeExtractLlmContentOptions extends NodeExtractStructuredDataOptions {
|
|
71
|
+
readonly llm?: RenderLlmTextOptions;
|
|
72
|
+
}
|
|
73
|
+
/**
|
|
74
|
+
* {@link extractStructuredChunks}: `chunks` maps to split/render options (`maxChars`, `preferHeadings`, etc.).
|
|
75
|
+
*/
|
|
76
|
+
interface NodeExtractStructuredChunksOptions extends NodeExtractStructuredDataOptions {
|
|
77
|
+
readonly chunks?: RenderMarkdownSectionsOptions;
|
|
78
|
+
}
|
|
53
79
|
|
|
54
80
|
/**
|
|
55
81
|
* Inputs accepted by {@link analyzeFile} in this package.
|
|
@@ -104,8 +130,40 @@ declare function runOcr(input: NodeAnalyzeInput, options?: NodeAnalyzeOptions):
|
|
|
104
130
|
*/
|
|
105
131
|
declare function extractStructuredData(input: NodeAnalyzeInput, options?: NodeExtractStructuredDataOptions): Promise<StructuredDocumentResult>;
|
|
106
132
|
|
|
133
|
+
/**
|
|
134
|
+
* End-to-end: {@link extractStructuredData} (for fallback + option parity) plus
|
|
135
|
+
* `extractMarkdown` from `@dragon708/docmind-markdown` on `{ data, filename, mimeType }`.
|
|
136
|
+
* On Node, PDF/DOCX bytes use specialized routes (`@opendataloader/pdf`, Mammoth+Turndown) when detection matches;
|
|
137
|
+
* the structured envelope is always passed as `structuredFallback`.
|
|
138
|
+
*
|
|
139
|
+
* @param input - Path, buffer, or {@link NodeAnalyzeInput} accepted by the Node facade.
|
|
140
|
+
* @param options - Structured routing (`pdf` / `docx` / `ocr` / `normalize`), optional `markdown` serializer knobs,
|
|
141
|
+
* and optional `markdownDocx` / `markdownPdf` for the binary Markdown pipelines (distinct from structured-only `docx` / `pdf`).
|
|
142
|
+
*/
|
|
143
|
+
declare function extractMarkdown(input: NodeAnalyzeInput, options?: NodeExtractMarkdownOptions): Promise<string>;
|
|
144
|
+
/**
|
|
145
|
+
* {@link extractStructuredData} then `renderLlmText` in `@dragon708/docmind-markdown` (tagged plain text for LLMs).
|
|
146
|
+
* That package's `extractLlmContent` is the same transform on an in-memory structured result only.
|
|
147
|
+
*
|
|
148
|
+
* @param options - Structured routing plus optional `llm` passed through to `renderLlmText`.
|
|
149
|
+
*/
|
|
150
|
+
declare function extractLlmContent(input: NodeAnalyzeInput, options?: NodeExtractLlmContentOptions): Promise<string>;
|
|
151
|
+
/**
|
|
152
|
+
* Structured extract → {@link renderMarkdownSections} (`splitStructuredIntoChunks` with `includeMarkdown: true`;
|
|
153
|
+
* same layer as `extractStructuredChunks` / `splitStructuredIntoChunks` in `@dragon708/docmind-markdown`).
|
|
154
|
+
*
|
|
155
|
+
* @param options - Optional `chunks` (e.g. `maxChars`, `preferHeadings`) from `@dragon708/docmind-markdown`.
|
|
156
|
+
*/
|
|
157
|
+
declare function extractStructuredChunks(input: NodeAnalyzeInput, options?: NodeExtractStructuredChunksOptions): Promise<readonly MarkdownSection[]>;
|
|
158
|
+
|
|
107
159
|
/** High-level features the user can ask DocMind for (per input kind and runtime). */
|
|
108
|
-
type PublicCapabilityId = "text" | "metadata" | "html" | "ocr" | "pages" | "structured-output" | "ocr-multipage" | "image-normalization" | "tiff" | "heic-node-only" | "bmp" | "gif-first-frame";
|
|
160
|
+
type PublicCapabilityId = "text" | "metadata" | "html" | "ocr" | "pages" | "structured-output"
|
|
161
|
+
/** Node: {@link extractMarkdown} — hybrid `extractMarkdown` in `@dragon708/docmind-markdown` (binary PDF/DOCX routes + structured fallback). */
|
|
162
|
+
| "markdown"
|
|
163
|
+
/** Node: {@link extractLlmContent} (LLM-oriented plain text). */
|
|
164
|
+
| "llm-text"
|
|
165
|
+
/** Node: {@link extractStructuredChunks} (Markdown sections / chunking). */
|
|
166
|
+
| "structured-chunks" | "ocr-multipage" | "image-normalization" | "tiff" | "heic-node-only" | "bmp" | "gif-first-frame";
|
|
109
167
|
/**
|
|
110
168
|
* DOCX-only: what the stack can do with OOXML embedded bitmaps/vector payloads under `word/media`.
|
|
111
169
|
* Present on {@link GetCapabilitiesReport} when `kind === "docx"`.
|
|
@@ -193,12 +251,17 @@ interface ExplainAnalysisPlanReport {
|
|
|
193
251
|
readonly warnings?: readonly string[];
|
|
194
252
|
}
|
|
195
253
|
|
|
254
|
+
/** Node-only intents layered on `@dragon708/docmind-markdown` after structured extraction. */
|
|
255
|
+
type NodeMarkdownFacadeIntent = "extractMarkdown" | "extractLlmContent" | "extractStructuredChunks";
|
|
196
256
|
/** Options for {@link explainAnalysisPlan} including PDF/OCR/DOCX hints for accurate planning. */
|
|
197
|
-
type NodeExplainAnalysisPlanOptions = ExplainAnalysisPlanOptions & Pick<NodeAnalyzeOptions, "pdf" | "ocr" | "docx" | "structuredOutput" | "output">;
|
|
257
|
+
type NodeExplainAnalysisPlanOptions = Omit<ExplainAnalysisPlanOptions, "intent"> & Pick<NodeAnalyzeOptions, "pdf" | "ocr" | "docx" | "structuredOutput" | "output"> & {
|
|
258
|
+
readonly intent?: DocMindPublicIntent | NodeMarkdownFacadeIntent;
|
|
259
|
+
};
|
|
198
260
|
|
|
199
261
|
/**
|
|
200
262
|
* Epic 1 — **Capabilities:** after resolving {@link NodeAnalyzeInput}, lists which of
|
|
201
|
-
* `text` | `metadata` | `html` | `ocr` | `pages` | `structured-output` (
|
|
263
|
+
* `text` | `metadata` | `html` | `ocr` | `pages` | `structured-output` | `markdown` (hybrid package extract on Node) | `llm-text` | `structured-chunks` (split + Markdown sections)
|
|
264
|
+
* (and image-specific ids such as `ocr-multipage`, `image-normalization`, `tiff`, `heic-node-only`, `bmp`, `gif-first-frame`) apply for that kind in Node (for PDF, `text` / `metadata` /
|
|
202
265
|
* `pages` / `ocr` describe the v2 pdf-parse + PDF.js + OCR stack; see {@link buildNodeCapabilityReport}).
|
|
203
266
|
* For **DOCX**, `docxEmbeddedImages` and `docxStructure` describe ZIP media and optional OOXML v2 extractors (`options.docx.include`).
|
|
204
267
|
* Does not run Mammoth/Tesseract/PDF bodies beyond path resolution.
|
|
@@ -210,4 +273,4 @@ declare function getCapabilities(input: NodeAnalyzeInput, options?: GetCapabilit
|
|
|
210
273
|
*/
|
|
211
274
|
declare function explainAnalysisPlan(input: NodeAnalyzeInput, options?: NodeExplainAnalysisPlanOptions): Promise<ExplainAnalysisPlanReport>;
|
|
212
275
|
|
|
213
|
-
export { DOCX_EMBEDDED_IMAGE_CAPABILITIES, DOCX_STRUCTURE_CAPABILITIES, type DocxEmbeddedImageCapabilities, type DocxStructuralCapabilities, type ExplainAnalysisPlanReport, type GetCapabilitiesReport, type NativeExtractionPlan, type NodeAnalyzeDocxOptionsSlice, type NodeAnalyzeInput, type NodeAnalyzeOptions, type NodeExplainAnalysisPlanOptions, type NodeExtractStructuredDataOptions, type OcrPlan, type PublicCapabilityId, type PublicCapabilitySupport, analyzeFile, bufferToInput, convertToHtml, docxIncludeRequested, explainAnalysisPlan, extractMetadata, extractStructuredData, extractText, getCapabilities, readFileToInput, resolveNodeAnalyzeInput, runOcr };
|
|
276
|
+
export { DOCX_EMBEDDED_IMAGE_CAPABILITIES, DOCX_STRUCTURE_CAPABILITIES, type DocxEmbeddedImageCapabilities, type DocxStructuralCapabilities, type ExplainAnalysisPlanReport, type GetCapabilitiesReport, type NativeExtractionPlan, type NodeAnalyzeDocxOptionsSlice, type NodeAnalyzeInput, type NodeAnalyzeOptions, type NodeExplainAnalysisPlanOptions, type NodeExtractLlmContentOptions, type NodeExtractMarkdownOptions, type NodeExtractStructuredChunksOptions, type NodeExtractStructuredDataOptions, type NodeMarkdownFacadeIntent, type OcrPlan, type PublicCapabilityId, type PublicCapabilitySupport, analyzeFile, bufferToInput, convertToHtml, docxIncludeRequested, explainAnalysisPlan, extractLlmContent, extractMarkdown, extractMetadata, extractStructuredChunks, extractStructuredData, extractText, getCapabilities, readFileToInput, resolveNodeAnalyzeInput, runOcr };
|
package/dist/index.js
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import { assertValidAnalyzeFileInput, detectFileKind, normalizeToStructuredResult, UNKNOWN_FORMAT_WARNING, analyzeText, notImplementedResult, analyzeFileRequestsStructured,
|
|
1
|
+
import { assertValidAnalyzeFileInput, detectFileKind, normalizeToStructuredResult, UNKNOWN_FORMAT_WARNING, analyzeText, notImplementedResult, analyzeFileRequestsStructured, isNamedInput, toUint8Array, isBinaryInput, isBlob, isFile, getMimeType } from '@dragon708/docmind-shared';
|
|
2
2
|
export { analyzeFileRequestsStructured, detectFileKind, isStructuredDocumentResult } from '@dragon708/docmind-shared';
|
|
3
3
|
import { extractStructuredDataFromDocx, analyzeDocx } from '@dragon708/docmind-docx';
|
|
4
4
|
export { convertDocxEmbeddedImageToWeb, convertDocxImagesForWeb, docxImageIsBrowserRenderable, docxImageRequiresConversion, docxImageSuggestedTargetFormat, docxImageToDataUri, extractImagesFromDocx, extractStructuredDataFromDocx, isNodeJsRuntime } from '@dragon708/docmind-docx';
|
|
@@ -9,6 +9,7 @@ export { extractStructuredDataFromPdf } from '@dragon708/docmind-pdf';
|
|
|
9
9
|
import { readFile } from 'fs/promises';
|
|
10
10
|
import { basename } from 'path';
|
|
11
11
|
import { fileURLToPath } from 'url';
|
|
12
|
+
import { extractMarkdown as extractMarkdown$1, renderLlmText, renderMarkdownSections } from '@dragon708/docmind-markdown';
|
|
12
13
|
|
|
13
14
|
// src/analyze.ts
|
|
14
15
|
|
|
@@ -617,6 +618,41 @@ async function runOcr(input, options) {
|
|
|
617
618
|
return notImplementedResult(kind, "none", [UNKNOWN_FORMAT_WARNING]);
|
|
618
619
|
}
|
|
619
620
|
}
|
|
621
|
+
async function extractMarkdown(input, options) {
|
|
622
|
+
throwIfAborted(options?.signal);
|
|
623
|
+
const { markdown: markdownOpts, markdownDocx, markdownPdf, ...structuredOpts } = options ?? {};
|
|
624
|
+
const resolved = await resolveNodeAnalyzeInput(input);
|
|
625
|
+
const structured = await extractStructuredData(resolved, structuredOpts);
|
|
626
|
+
const data = await bytesFromDetectInput(resolved);
|
|
627
|
+
let filename;
|
|
628
|
+
let mimeType;
|
|
629
|
+
if (isNamedInput(resolved)) {
|
|
630
|
+
filename = resolved.name;
|
|
631
|
+
mimeType = resolved.mimeType;
|
|
632
|
+
}
|
|
633
|
+
const r = await extractMarkdown$1(
|
|
634
|
+
{ data, filename, mimeType },
|
|
635
|
+
{
|
|
636
|
+
...markdownOpts ?? {},
|
|
637
|
+
...markdownDocx !== void 0 ? { docx: markdownDocx } : {},
|
|
638
|
+
...markdownPdf !== void 0 ? { pdf: markdownPdf } : {},
|
|
639
|
+
structuredFallback: structured
|
|
640
|
+
}
|
|
641
|
+
);
|
|
642
|
+
return r.markdown;
|
|
643
|
+
}
|
|
644
|
+
async function extractLlmContent(input, options) {
|
|
645
|
+
throwIfAborted(options?.signal);
|
|
646
|
+
const { llm: llmOpts, ...structuredOpts } = options ?? {};
|
|
647
|
+
const structured = await extractStructuredData(input, structuredOpts);
|
|
648
|
+
return renderLlmText(structured, llmOpts);
|
|
649
|
+
}
|
|
650
|
+
async function extractStructuredChunks(input, options) {
|
|
651
|
+
throwIfAborted(options?.signal);
|
|
652
|
+
const { chunks: chunkOpts, ...structuredOpts } = options ?? {};
|
|
653
|
+
const structured = await extractStructuredData(input, structuredOpts);
|
|
654
|
+
return renderMarkdownSections(structured, chunkOpts);
|
|
655
|
+
}
|
|
620
656
|
|
|
621
657
|
// src/capabilityReport.ts
|
|
622
658
|
function docxIncludeRequested(flags) {
|
|
@@ -679,6 +715,15 @@ function buildNodeCapabilityReport(kind) {
|
|
|
679
715
|
slot("html", true, ["HTML is a <pre> preview of extracted text, not visual layout."]),
|
|
680
716
|
slot("structured-output", true, [
|
|
681
717
|
"extractStructuredData: analyzePdf + PDF.js per-page text, outline, links, annotations, merged via normalizeToStructuredResult; respects pdf.ocr like analyzeFile."
|
|
718
|
+
]),
|
|
719
|
+
slot("markdown", true, [
|
|
720
|
+
"extractMarkdown: `@dragon708/docmind-markdown` hybrid \u2014 PDF bytes prefer `@opendataloader/pdf` \u2192 Markdown; structured PDF (`extractStructuredData`, respects pdf.ocr) is always built as fallback and for non-binary inputs."
|
|
721
|
+
]),
|
|
722
|
+
slot("llm-text", true, [
|
|
723
|
+
"extractLlmContent: structured envelope \u2192 LLM-oriented plain text (`renderLlmText` in `@dragon708/docmind-markdown`)."
|
|
724
|
+
]),
|
|
725
|
+
slot("structured-chunks", true, [
|
|
726
|
+
"extractStructuredChunks: structured \u2192 `renderMarkdownSections` (splitStructuredIntoChunks + Markdown per slice; heading-aware chunking)."
|
|
682
727
|
])
|
|
683
728
|
];
|
|
684
729
|
break;
|
|
@@ -699,6 +744,13 @@ function buildNodeCapabilityReport(kind) {
|
|
|
699
744
|
]),
|
|
700
745
|
slot("structured-output", true, [
|
|
701
746
|
"extractStructuredData runs analyzeDocx with merged OOXML includes (blocks, tables, headings, embedded images, etc.) into StructuredDocumentResult."
|
|
747
|
+
]),
|
|
748
|
+
slot("markdown", true, [
|
|
749
|
+
"extractMarkdown: hybrid \u2014 DOCX bytes use Mammoth\u2192Turndown on Node; structured DOCX (`extractStructuredData`, options.docx.include) is always built as fallback."
|
|
750
|
+
]),
|
|
751
|
+
slot("llm-text", true, ["extractLlmContent: structured \u2192 LLM-oriented plain text."]),
|
|
752
|
+
slot("structured-chunks", true, [
|
|
753
|
+
"extractStructuredChunks: structured \u2192 sectioned Markdown chunks."
|
|
702
754
|
])
|
|
703
755
|
];
|
|
704
756
|
break;
|
|
@@ -733,7 +785,12 @@ function buildNodeCapabilityReport(kind) {
|
|
|
733
785
|
]),
|
|
734
786
|
slot("structured-output", true, [
|
|
735
787
|
"extractStructuredData: OCR + layout blocks (ocrImageDetailed / ocrTiff) normalized to StructuredDocumentResult."
|
|
736
|
-
])
|
|
788
|
+
]),
|
|
789
|
+
slot("markdown", true, [
|
|
790
|
+
"extractMarkdown: OCR structured layout \u2192 Markdown (tables/lists as GFM where blocks exist)."
|
|
791
|
+
]),
|
|
792
|
+
slot("llm-text", true, ["extractLlmContent: OCR structured \u2192 LLM plain text."]),
|
|
793
|
+
slot("structured-chunks", true, ["extractStructuredChunks: OCR structured \u2192 Markdown sections."])
|
|
737
794
|
];
|
|
738
795
|
break;
|
|
739
796
|
case "text":
|
|
@@ -745,6 +802,13 @@ function buildNodeCapabilityReport(kind) {
|
|
|
745
802
|
slot("pages", false),
|
|
746
803
|
slot("structured-output", true, [
|
|
747
804
|
"extractStructuredData wraps UTF-8 decode in normalizeToStructuredResult (rollup text only unless you add blocks upstream)."
|
|
805
|
+
]),
|
|
806
|
+
slot("markdown", true, [
|
|
807
|
+
"extractMarkdown: rollup/decoded text \u2192 Markdown (mostly paragraphs; no layout without upstream blocks)."
|
|
808
|
+
]),
|
|
809
|
+
slot("llm-text", true, ["extractLlmContent: rollup \u2192 LLM plain text via the same envelope."]),
|
|
810
|
+
slot("structured-chunks", true, [
|
|
811
|
+
"extractStructuredChunks: single-chunk Markdown is typical when only rollup text exists."
|
|
748
812
|
])
|
|
749
813
|
];
|
|
750
814
|
break;
|
|
@@ -756,7 +820,10 @@ function buildNodeCapabilityReport(kind) {
|
|
|
756
820
|
slot("html", false),
|
|
757
821
|
slot("ocr", false),
|
|
758
822
|
slot("pages", false),
|
|
759
|
-
slot("structured-output", false)
|
|
823
|
+
slot("structured-output", false),
|
|
824
|
+
slot("markdown", false),
|
|
825
|
+
slot("llm-text", false),
|
|
826
|
+
slot("structured-chunks", false)
|
|
760
827
|
];
|
|
761
828
|
}
|
|
762
829
|
return {
|
|
@@ -967,10 +1034,13 @@ function buildNodeExplainReport(kind, intent, pdfOcr, plan, docxInclude, ocrSlic
|
|
|
967
1034
|
}
|
|
968
1035
|
break;
|
|
969
1036
|
case "extractStructuredData":
|
|
1037
|
+
case "extractMarkdown":
|
|
1038
|
+
case "extractLlmContent":
|
|
1039
|
+
case "extractStructuredChunks":
|
|
970
1040
|
if (kind === "pdf") {
|
|
971
1041
|
nativeExtraction = {
|
|
972
1042
|
willAttempt: true,
|
|
973
|
-
description: "extractStructuredData: analyzePdf plus PDF.js page rows, outline, links, and annotations, normalized to StructuredDocumentResult."
|
|
1043
|
+
description: intent === "extractStructuredData" ? "extractStructuredData: analyzePdf plus PDF.js page rows, outline, links, and annotations, normalized to StructuredDocumentResult." : intent === "extractMarkdown" ? "extractMarkdown: structured PDF extract (same as extractStructuredData) for fallback; primary Markdown from `@dragon708/docmind-markdown` tries `@opendataloader/pdf` on PDF bytes when possible." : `${intent}: same structured PDF pipeline as extractStructuredData, then \`@dragon708/docmind-markdown\` (renderLlmText or chunk sections).`
|
|
974
1044
|
};
|
|
975
1045
|
ocr = {
|
|
976
1046
|
mayUse: pdfOcr !== "off",
|
|
@@ -979,7 +1049,7 @@ function buildNodeExplainReport(kind, intent, pdfOcr, plan, docxInclude, ocrSlic
|
|
|
979
1049
|
} else if (kind === "docx") {
|
|
980
1050
|
nativeExtraction = {
|
|
981
1051
|
willAttempt: true,
|
|
982
|
-
description: "extractStructuredData: Mammoth plus merged OOXML extractors (structure, headings, tables, blocks, pagesApprox, embeddedImages) in one envelope."
|
|
1052
|
+
description: intent === "extractStructuredData" ? "extractStructuredData: Mammoth plus merged OOXML extractors (structure, headings, tables, blocks, pagesApprox, embeddedImages) in one envelope." : intent === "extractMarkdown" ? "extractMarkdown: structured DOCX envelope for fallback; primary Markdown from Mammoth\u2192Turndown on DOCX bytes when possible (`@dragon708/docmind-markdown`)." : `${intent}: same structured DOCX pipeline as extractStructuredData, then \`@dragon708/docmind-markdown\`.`
|
|
983
1053
|
};
|
|
984
1054
|
ocr = { mayUse: false, description: "DOCX does not use OCR." };
|
|
985
1055
|
} else if (kind === "image") {
|
|
@@ -994,11 +1064,31 @@ function buildNodeExplainReport(kind, intent, pdfOcr, plan, docxInclude, ocrSlic
|
|
|
994
1064
|
} else {
|
|
995
1065
|
nativeExtraction = {
|
|
996
1066
|
willAttempt: true,
|
|
997
|
-
description: "UTF-8 decode with BOM handling; normalizeToStructuredResult produces the structured envelope."
|
|
1067
|
+
description: intent === "extractStructuredData" ? "UTF-8 decode with BOM handling; normalizeToStructuredResult produces the structured envelope." : `${intent}: UTF-8 structured envelope, then \`@dragon708/docmind-markdown\` export.`
|
|
998
1068
|
};
|
|
999
1069
|
ocr = { mayUse: false, description: "OCR does not apply to text files." };
|
|
1000
1070
|
limitations = lim("Plain text has no native layout blocks; `text` carries the decoded content.");
|
|
1001
1071
|
}
|
|
1072
|
+
if (intent === "extractMarkdown") {
|
|
1073
|
+
limitations = [
|
|
1074
|
+
...limitations,
|
|
1075
|
+
...lim(
|
|
1076
|
+
"Output: Markdown string from `@dragon708/docmind-markdown` extractMarkdown (PDF/DOCX binary routes on Node when applicable; structured serializer as fallback)."
|
|
1077
|
+
)
|
|
1078
|
+
];
|
|
1079
|
+
} else if (intent === "extractLlmContent") {
|
|
1080
|
+
limitations = [
|
|
1081
|
+
...limitations,
|
|
1082
|
+
...lim("Output: compact plain text via renderLlmText (prompt / embedding friendly).")
|
|
1083
|
+
];
|
|
1084
|
+
} else if (intent === "extractStructuredChunks") {
|
|
1085
|
+
limitations = [
|
|
1086
|
+
...limitations,
|
|
1087
|
+
...lim(
|
|
1088
|
+
"Output: MarkdownSection[] via renderMarkdownSections (splitStructuredIntoChunks / extractStructuredChunks alias in `@dragon708/docmind-markdown`)."
|
|
1089
|
+
)
|
|
1090
|
+
];
|
|
1091
|
+
}
|
|
1002
1092
|
break;
|
|
1003
1093
|
default:
|
|
1004
1094
|
nativeExtraction = { willAttempt: false, description: "Generic intent; see plan." };
|
|
@@ -1266,6 +1356,46 @@ function planForIntent(intentOpt, kind, pdfOcrForAnalyze, docxInclude, ocr, anal
|
|
|
1266
1356
|
};
|
|
1267
1357
|
}
|
|
1268
1358
|
}
|
|
1359
|
+
if (intent === "extractMarkdown") {
|
|
1360
|
+
const sub = planForIntent(
|
|
1361
|
+
"extractStructuredData",
|
|
1362
|
+
kind,
|
|
1363
|
+
pdfOcrForAnalyze,
|
|
1364
|
+
docxInclude,
|
|
1365
|
+
ocr,
|
|
1366
|
+
analyzeFileOutput
|
|
1367
|
+
);
|
|
1368
|
+
return {
|
|
1369
|
+
intent,
|
|
1370
|
+
steps: [
|
|
1371
|
+
...sub.steps ?? [],
|
|
1372
|
+
{
|
|
1373
|
+
id: "markdown_hybrid_package",
|
|
1374
|
+
status: "planned"
|
|
1375
|
+
}
|
|
1376
|
+
]
|
|
1377
|
+
};
|
|
1378
|
+
}
|
|
1379
|
+
if (intent === "extractLlmContent" || intent === "extractStructuredChunks") {
|
|
1380
|
+
const sub = planForIntent(
|
|
1381
|
+
"extractStructuredData",
|
|
1382
|
+
kind,
|
|
1383
|
+
pdfOcrForAnalyze,
|
|
1384
|
+
docxInclude,
|
|
1385
|
+
ocr,
|
|
1386
|
+
analyzeFileOutput
|
|
1387
|
+
);
|
|
1388
|
+
return {
|
|
1389
|
+
intent,
|
|
1390
|
+
steps: [
|
|
1391
|
+
...sub.steps ?? [],
|
|
1392
|
+
{
|
|
1393
|
+
id: intent === "extractLlmContent" ? "docmind_markdown_llm_text" : "docmind_markdown_chunk_sections",
|
|
1394
|
+
status: "planned"
|
|
1395
|
+
}
|
|
1396
|
+
]
|
|
1397
|
+
};
|
|
1398
|
+
}
|
|
1269
1399
|
return planAnalyzeFile(kind, pdfOcrForAnalyze, docxInclude, ocr);
|
|
1270
1400
|
}
|
|
1271
1401
|
async function getCapabilities(input, options) {
|
|
@@ -1291,6 +1421,6 @@ async function explainAnalysisPlan(input, options) {
|
|
|
1291
1421
|
return buildNodeExplainReport(kind, intent, pdfOcrAnalyze, plan, docxInc, ocrSlice);
|
|
1292
1422
|
}
|
|
1293
1423
|
|
|
1294
|
-
export { DOCX_EMBEDDED_IMAGE_CAPABILITIES, DOCX_STRUCTURE_CAPABILITIES, analyzeFile, bufferToInput, convertToHtml, docxIncludeRequested, explainAnalysisPlan, extractMetadata, extractStructuredData, extractText, getCapabilities, readFileToInput, resolveNodeAnalyzeInput, runOcr };
|
|
1424
|
+
export { DOCX_EMBEDDED_IMAGE_CAPABILITIES, DOCX_STRUCTURE_CAPABILITIES, analyzeFile, bufferToInput, convertToHtml, docxIncludeRequested, explainAnalysisPlan, extractLlmContent, extractMarkdown, extractMetadata, extractStructuredChunks, extractStructuredData, extractText, getCapabilities, readFileToInput, resolveNodeAnalyzeInput, runOcr };
|
|
1295
1425
|
//# sourceMappingURL=index.js.map
|
|
1296
1426
|
//# sourceMappingURL=index.js.map
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@dragon708/docmind-node",
|
|
3
|
-
"version": "1.10.0",
|
|
3
|
+
"version": "1.12.0",
|
|
4
4
|
"description": "Official DocMind Node facade: analyzeFile, intent APIs, PDF/DOCX/OCR, and fs helpers.",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"main": "./dist/index.js",
|
|
@@ -33,6 +33,7 @@
|
|
|
33
33
|
"license": "MIT",
|
|
34
34
|
"dependencies": {
|
|
35
35
|
"@dragon708/docmind-docx": "^1.8.0",
|
|
36
|
+
"@dragon708/docmind-markdown": "^1.1.0",
|
|
36
37
|
"@dragon708/docmind-ocr": "^1.1.4",
|
|
37
38
|
"@dragon708/docmind-pdf": "^2.2.0",
|
|
38
39
|
"@dragon708/docmind-shared": "^1.2.0"
|