@dragon708/docmind-node 1.2.0 → 1.7.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.d.ts +91 -24
- package/dist/index.js +258 -122
- package/package.json +3 -3
package/dist/index.d.ts
CHANGED
|
@@ -2,6 +2,8 @@ import { DocMindAnalyzeOptions, DetectFileKindInput, NamedInput, AnalysisResult,
|
|
|
2
2
|
export { AnalysisAnalyzer, AnalysisResult, CapabilityDescriptor, DetectFileKindInput, DocMindPublicIntent, DocxAnalysisCoreResult, ExplainAnalysisPlanOptions, ExplainAnalysisPlanResult, FileKind, FileKindMetadata, GenericAnalysisResult, GetCapabilitiesOptions, GetCapabilitiesResult, ImageAnalysisCoreResult, PdfAnalysisCoreResult, TextAnalysisResult, detectFileKind } from '@dragon708/docmind-shared';
|
|
3
3
|
import { OcrOptions } from '@dragon708/docmind-ocr';
|
|
4
4
|
import { PdfAnalyzeOptions } from '@dragon708/docmind-pdf';
|
|
5
|
+
import { AnalyzeDocxIncludeFlags, DocxToHtmlOptions } from '@dragon708/docmind-docx';
|
|
6
|
+
export { AnalyzeDocxIncludeFlags, DocxEmbeddedImage, DocxEmbeddedImageConversionOptions, DocxEmbeddedImageConversionResult, DocxEmbeddedImageWebSlice, DocxImageExtractionMode, ExtractImagesFromDocxOptions, ExtractImagesFromDocxResult, convertDocxEmbeddedImageToWeb, convertDocxImagesForWeb, docxImageIsBrowserRenderable, docxImageRequiresConversion, docxImageSuggestedTargetFormat, docxImageToDataUri, extractImagesFromDocx, isNodeJsRuntime } from '@dragon708/docmind-docx';
|
|
5
7
|
|
|
6
8
|
/**
|
|
7
9
|
* Options for Node public APIs (`analyzeFile`, intent methods).
|
|
@@ -9,10 +11,30 @@ import { PdfAnalyzeOptions } from '@dragon708/docmind-pdf';
|
|
|
9
11
|
* - **`pdf`**: forwarded to `@dragon708/docmind-pdf`. `analyzeFile` defaults `pdf.ocr` to `"auto"` when omitted.
|
|
10
12
|
* {@link extractText} / {@link convertToHtml} merge a default of `ocr: "off"` unless you set `pdf.ocr` explicitly.
|
|
11
13
|
* - **`ocr`**: forwarded to `@dragon708/docmind-ocr` for raster images; language string also feeds PDF OCR when `pdf.ocrLangs` is unset.
|
|
14
|
+
* - **`pdfNativeTextSource`**: when `pdf.ocr` resolves to `"off"`, chooses how native text is obtained (see {@link extractText} default).
|
|
12
15
|
*/
|
|
16
|
+
/**
|
|
17
|
+
* Opciones DOCX para el facade Node (Mammoth + inclusiones v2 opcionales de `@dragon708/docmind-docx`).
|
|
18
|
+
*/
|
|
19
|
+
interface NodeAnalyzeDocxOptionsSlice {
|
|
20
|
+
/**
|
|
21
|
+
* Pasa a `analyzeDocx` → extractores OOXML/ZIP en paralelo con Mammoth (`structure`, `headings`, `tables`, `blocks`, `pagesApprox`, `embeddedImages`).
|
|
22
|
+
*/
|
|
23
|
+
readonly include?: AnalyzeDocxIncludeFlags;
|
|
24
|
+
/** Opciones Mammoth para HTML (p. ej. `convertImage`). */
|
|
25
|
+
readonly html?: DocxToHtmlOptions;
|
|
26
|
+
}
|
|
13
27
|
interface NodeAnalyzeOptions extends DocMindAnalyzeOptions {
|
|
14
28
|
readonly pdf?: PdfAnalyzeOptions;
|
|
15
29
|
readonly ocr?: OcrOptions;
|
|
30
|
+
/** Solo DOCX: ver {@link NodeAnalyzeDocxOptionsSlice}. */
|
|
31
|
+
readonly docx?: NodeAnalyzeDocxOptionsSlice;
|
|
32
|
+
/**
|
|
33
|
+
* Native PDF text when `pdf.ocr` is `"off"`:
|
|
34
|
+
* - **`pdfjs-per-page`** (default for {@link extractText}): PDF.js text per page, merged for `text` (aligns with OCR raster engine).
|
|
35
|
+
* - **`pdf-parse`**: single pdf-parse pass (default for {@link analyzeFile} when you set `pdf.ocr: "off"` without this flag).
|
|
36
|
+
*/
|
|
37
|
+
readonly pdfNativeTextSource?: "pdf-parse" | "pdfjs-per-page";
|
|
16
38
|
}
|
|
17
39
|
|
|
18
40
|
/**
|
|
@@ -38,8 +60,9 @@ declare function resolveNodeAnalyzeInput(input: NodeAnalyzeInput): Promise<Detec
|
|
|
38
60
|
declare function analyzeFile(input: NodeAnalyzeInput, options?: NodeAnalyzeOptions): Promise<AnalysisResult>;
|
|
39
61
|
|
|
40
62
|
/**
|
|
41
|
-
* Plain-text extraction using {@link analyzeFile} routing. PDFs default to **text
|
|
42
|
-
* (`pdf.ocr: "off"`) unless you set `options.pdf.ocr` explicitly.
|
|
63
|
+
* Plain-text extraction using {@link analyzeFile} routing. PDFs default to **native text only**
|
|
64
|
+
* (`pdf.ocr: "off"`) unless you set `options.pdf.ocr` explicitly, and to **PDF.js per-page** assembly
|
|
65
|
+
* (`pdfNativeTextSource: "pdfjs-per-page"`) unless you set `options.pdfNativeTextSource` or `pdf.ocr` enables OCR.
|
|
43
66
|
*/
|
|
44
67
|
declare function extractText(input: NodeAnalyzeInput, options?: NodeAnalyzeOptions): Promise<AnalysisResult>;
|
|
45
68
|
/**
|
|
@@ -58,6 +81,62 @@ declare function convertToHtml(input: NodeAnalyzeInput, options?: NodeAnalyzeOpt
|
|
|
58
81
|
*/
|
|
59
82
|
declare function runOcr(input: NodeAnalyzeInput, options?: NodeAnalyzeOptions): Promise<AnalysisResult>;
|
|
60
83
|
|
|
84
|
+
/** High-level features the user can ask DocMind for (per input kind and runtime). */
|
|
85
|
+
type PublicCapabilityId = "text" | "metadata" | "html" | "ocr" | "pages";
|
|
86
|
+
/**
|
|
87
|
+
* DOCX-only: what the stack can do with OOXML embedded bitmaps/vector payloads under `word/media`.
|
|
88
|
+
* Present on {@link GetCapabilitiesReport} when `kind === "docx"`.
|
|
89
|
+
*/
|
|
90
|
+
interface DocxEmbeddedImageCapabilities {
|
|
91
|
+
/** Bytes under `word/media/*` can be read (see `@dragon708/docmind-docx` / facade {@link extractImagesFromDocx}). */
|
|
92
|
+
readonly canExtractEmbeddedImages: true;
|
|
93
|
+
/**
|
|
94
|
+
* DOCX files may contain EMF, WMF, HEIC, etc., which are not reliably usable in a browser `<img>` without conversion.
|
|
95
|
+
* This flag is static (kind-based); it does not inspect the open document.
|
|
96
|
+
*/
|
|
97
|
+
readonly documentsMayIncludeImagesRequiringWebConversion: true;
|
|
98
|
+
/**
|
|
99
|
+
* In-browser conversion for those formats is **not** provided by DocMind; Node helpers may attempt best-effort conversion
|
|
100
|
+
* (currently stub — see package warnings).
|
|
101
|
+
*/
|
|
102
|
+
readonly webFriendlyConversionNodeFirst: true;
|
|
103
|
+
readonly notes: readonly string[];
|
|
104
|
+
}
|
|
105
|
+
/** Shared slice for {@link GetCapabilitiesReport} and {@link ExplainAnalysisPlanReport}. */
|
|
106
|
+
/** True si `options.docx.include` solicita al menos un extractor OOXML v2. */
|
|
107
|
+
declare function docxIncludeRequested(flags?: AnalyzeDocxIncludeFlags): boolean;
|
|
108
|
+
/**
|
|
109
|
+
* Capacidades estructurales DOCX v2 en Node (vía `@dragon708/docmind-docx` + `options.docx.include`).
|
|
110
|
+
* Presente en {@link GetCapabilitiesReport} cuando `kind === "docx"`.
|
|
111
|
+
*/
|
|
112
|
+
interface DocxStructuralCapabilities {
|
|
113
|
+
readonly ooxmlExtractorsAvailable: true;
|
|
114
|
+
readonly activatedViaDocxInclude: true;
|
|
115
|
+
readonly features: readonly string[];
|
|
116
|
+
readonly notes: readonly string[];
|
|
117
|
+
}
|
|
118
|
+
declare const DOCX_STRUCTURE_CAPABILITIES: DocxStructuralCapabilities;
|
|
119
|
+
declare const DOCX_EMBEDDED_IMAGE_CAPABILITIES: DocxEmbeddedImageCapabilities;
|
|
120
|
+
/** Whether a {@link PublicCapabilityId} applies to the detected file in this runtime. */
|
|
121
|
+
interface PublicCapabilitySupport {
|
|
122
|
+
readonly id: PublicCapabilityId;
|
|
123
|
+
readonly supported: boolean;
|
|
124
|
+
readonly warnings?: readonly string[];
|
|
125
|
+
}
|
|
126
|
+
/**
|
|
127
|
+
* Result of {@link getCapabilities}: detected kind, runtime id, per-feature support for this input, and optional global warnings.
|
|
128
|
+
*/
|
|
129
|
+
interface GetCapabilitiesReport {
|
|
130
|
+
readonly kind: FileKind;
|
|
131
|
+
readonly runtime: RuntimeDescriptor;
|
|
132
|
+
readonly capabilities: readonly PublicCapabilitySupport[];
|
|
133
|
+
/** Only when {@link GetCapabilitiesReport.kind} is `"docx"`. */
|
|
134
|
+
readonly docxEmbeddedImages?: DocxEmbeddedImageCapabilities;
|
|
135
|
+
/** Only when `kind === "docx"`: extractores OOXML v2 disponibles con `options.docx.include`. */
|
|
136
|
+
readonly docxStructure?: DocxStructuralCapabilities;
|
|
137
|
+
readonly warnings?: readonly string[];
|
|
138
|
+
}
|
|
139
|
+
|
|
61
140
|
/**
|
|
62
141
|
* Whether DocMind will try a non-OCR text/HTML path (e.g. Mammoth, pdf-parse text layer, UTF-8).
|
|
63
142
|
*/
|
|
@@ -83,33 +162,21 @@ interface ExplainAnalysisPlanReport {
|
|
|
83
162
|
readonly ocr: OcrPlan;
|
|
84
163
|
readonly limitations: readonly string[];
|
|
85
164
|
readonly plan: ProcessingPlanDescriptor;
|
|
165
|
+
/** Only when `kind === "docx"` (same payload as `getCapabilities` → `docxEmbeddedImages`). */
|
|
166
|
+
readonly docxEmbeddedImages?: DocxEmbeddedImageCapabilities;
|
|
167
|
+
/** Only when `kind === "docx"` (same payload as `getCapabilities` → `docxStructure`). */
|
|
168
|
+
readonly docxStructure?: DocxStructuralCapabilities;
|
|
86
169
|
readonly warnings?: readonly string[];
|
|
87
170
|
}
|
|
88
171
|
|
|
89
|
-
/**
|
|
90
|
-
type
|
|
91
|
-
/** Whether a {@link PublicCapabilityId} applies to the detected file in this runtime. */
|
|
92
|
-
interface PublicCapabilitySupport {
|
|
93
|
-
readonly id: PublicCapabilityId;
|
|
94
|
-
readonly supported: boolean;
|
|
95
|
-
readonly warnings?: readonly string[];
|
|
96
|
-
}
|
|
97
|
-
/**
|
|
98
|
-
* Result of {@link getCapabilities}: detected kind, runtime id, per-feature support for this input, and optional global warnings.
|
|
99
|
-
*/
|
|
100
|
-
interface GetCapabilitiesReport {
|
|
101
|
-
readonly kind: FileKind;
|
|
102
|
-
readonly runtime: RuntimeDescriptor;
|
|
103
|
-
readonly capabilities: readonly PublicCapabilitySupport[];
|
|
104
|
-
readonly warnings?: readonly string[];
|
|
105
|
-
}
|
|
106
|
-
|
|
107
|
-
/** Options for {@link explainAnalysisPlan} including PDF/OCR hints for accurate planning. */
|
|
108
|
-
type NodeExplainAnalysisPlanOptions = ExplainAnalysisPlanOptions & Pick<NodeAnalyzeOptions, "pdf" | "ocr">;
|
|
172
|
+
/** Options for {@link explainAnalysisPlan} including PDF/OCR/DOCX hints for accurate planning. */
|
|
173
|
+
type NodeExplainAnalysisPlanOptions = ExplainAnalysisPlanOptions & Pick<NodeAnalyzeOptions, "pdf" | "ocr" | "docx">;
|
|
109
174
|
|
|
110
175
|
/**
|
|
111
176
|
* Epic 1 — **Capabilities:** after resolving {@link NodeAnalyzeInput}, lists which of
|
|
112
|
-
* `text` | `metadata` | `html` | `ocr` | `pages` apply for that kind in Node (PDF
|
|
177
|
+
* `text` | `metadata` | `html` | `ocr` | `pages` apply for that kind in Node (for PDF, `text` / `metadata` /
|
|
178
|
+
* `pages` / `ocr` describe the v2 pdf-parse + PDF.js + OCR stack; see {@link buildNodeCapabilityReport}).
|
|
179
|
+
* For **DOCX**, `docxEmbeddedImages` and `docxStructure` describe ZIP media and optional OOXML v2 extractors (`options.docx.include`).
|
|
113
180
|
* Does not run Mammoth/Tesseract/PDF bodies beyond path resolution.
|
|
114
181
|
*/
|
|
115
182
|
declare function getCapabilities(input: NodeAnalyzeInput, options?: GetCapabilitiesOptions): Promise<GetCapabilitiesReport>;
|
|
@@ -119,4 +186,4 @@ declare function getCapabilities(input: NodeAnalyzeInput, options?: GetCapabilit
|
|
|
119
186
|
*/
|
|
120
187
|
declare function explainAnalysisPlan(input: NodeAnalyzeInput, options?: NodeExplainAnalysisPlanOptions): Promise<ExplainAnalysisPlanReport>;
|
|
121
188
|
|
|
122
|
-
export { type ExplainAnalysisPlanReport, type GetCapabilitiesReport, type NativeExtractionPlan, type NodeAnalyzeInput, type NodeAnalyzeOptions, type NodeExplainAnalysisPlanOptions, type OcrPlan, type PublicCapabilityId, type PublicCapabilitySupport, analyzeFile, bufferToInput, convertToHtml, explainAnalysisPlan, extractMetadata, extractText, getCapabilities, readFileToInput, resolveNodeAnalyzeInput, runOcr };
|
|
189
|
+
export { DOCX_EMBEDDED_IMAGE_CAPABILITIES, DOCX_STRUCTURE_CAPABILITIES, type DocxEmbeddedImageCapabilities, type DocxStructuralCapabilities, type ExplainAnalysisPlanReport, type GetCapabilitiesReport, type NativeExtractionPlan, type NodeAnalyzeDocxOptionsSlice, type NodeAnalyzeInput, type NodeAnalyzeOptions, type NodeExplainAnalysisPlanOptions, type OcrPlan, type PublicCapabilityId, type PublicCapabilitySupport, analyzeFile, bufferToInput, convertToHtml, docxIncludeRequested, explainAnalysisPlan, extractMetadata, extractText, getCapabilities, readFileToInput, resolveNodeAnalyzeInput, runOcr };
|
package/dist/index.js
CHANGED
|
@@ -1,13 +1,45 @@
|
|
|
1
1
|
import { assertValidAnalyzeFileInput, detectFileKind, notImplementedResult, UNKNOWN_FORMAT_WARNING, analyzeText, toUint8Array, isNamedInput, isBinaryInput, isBlob, isFile } from '@dragon708/docmind-shared';
|
|
2
2
|
export { detectFileKind } from '@dragon708/docmind-shared';
|
|
3
3
|
import { analyzeDocx } from '@dragon708/docmind-docx';
|
|
4
|
+
export { convertDocxEmbeddedImageToWeb, convertDocxImagesForWeb, docxImageIsBrowserRenderable, docxImageRequiresConversion, docxImageSuggestedTargetFormat, docxImageToDataUri, extractImagesFromDocx, isNodeJsRuntime } from '@dragon708/docmind-docx';
|
|
4
5
|
import { ocr } from '@dragon708/docmind-ocr';
|
|
5
|
-
import { extractPdfMetadata, extractTextFromPdf, analyzePdf } from '@dragon708/docmind-pdf';
|
|
6
|
+
import { extractPdfMetadata, extractTextFromPdf, analyzePdf, extractPdfTextByPage } from '@dragon708/docmind-pdf';
|
|
6
7
|
import { readFile } from 'fs/promises';
|
|
7
8
|
import { basename } from 'path';
|
|
8
9
|
import { fileURLToPath } from 'url';
|
|
9
10
|
|
|
10
11
|
// src/analyze.ts
|
|
12
|
+
|
|
13
|
+
// src/docxNodeMapper.ts
|
|
14
|
+
function analyzeDocxOptionsFromNode(options) {
|
|
15
|
+
const sig = options?.signal;
|
|
16
|
+
const dx = options?.docx;
|
|
17
|
+
if (!dx?.include && !dx?.html && !sig) return void 0;
|
|
18
|
+
const out = { ...dx?.html ?? {} };
|
|
19
|
+
if (dx?.include) out.include = dx.include;
|
|
20
|
+
if (sig) out.signal = sig;
|
|
21
|
+
return out;
|
|
22
|
+
}
|
|
23
|
+
function docxPackageResultToAnalysisResult(r) {
|
|
24
|
+
const base = {
|
|
25
|
+
fileKind: "docx",
|
|
26
|
+
analyzer: "docx",
|
|
27
|
+
status: "ok",
|
|
28
|
+
kind: "docx",
|
|
29
|
+
text: r.text,
|
|
30
|
+
html: r.html,
|
|
31
|
+
warnings: [...r.warnings]
|
|
32
|
+
};
|
|
33
|
+
const v2 = {
|
|
34
|
+
...r.structure !== void 0 ? { structure: r.structure } : {},
|
|
35
|
+
...r.headings !== void 0 ? { headings: r.headings } : {},
|
|
36
|
+
...r.tables !== void 0 ? { tables: r.tables } : {},
|
|
37
|
+
...r.blocks !== void 0 ? { blocks: r.blocks } : {},
|
|
38
|
+
...r.pagesApprox !== void 0 ? { pagesApprox: r.pagesApprox } : {},
|
|
39
|
+
...r.embeddedImages !== void 0 ? { embeddedImages: r.embeddedImages } : {}
|
|
40
|
+
};
|
|
41
|
+
return { ...base, ...v2 };
|
|
42
|
+
}
|
|
11
43
|
function isByteBackedInput(input) {
|
|
12
44
|
return isNamedInput(input) || isBinaryInput(input) || isBlob(input) || isFile(input);
|
|
13
45
|
}
|
|
@@ -19,7 +51,8 @@ async function bytesFromDetectInput(input) {
|
|
|
19
51
|
}
|
|
20
52
|
|
|
21
53
|
// src/analyzers/docx.ts
|
|
22
|
-
async function analyzeDocxForNode(input,
|
|
54
|
+
async function analyzeDocxForNode(input, options) {
|
|
55
|
+
const signal = options?.signal;
|
|
23
56
|
if (signal?.aborted) {
|
|
24
57
|
const err = new Error("The operation was aborted");
|
|
25
58
|
err.name = "AbortError";
|
|
@@ -37,16 +70,9 @@ async function analyzeDocxForNode(input, signal) {
|
|
|
37
70
|
warnings: ["No document bytes were provided for analysis."]
|
|
38
71
|
};
|
|
39
72
|
}
|
|
40
|
-
const
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
analyzer: "docx",
|
|
44
|
-
status: "ok",
|
|
45
|
-
kind: "docx",
|
|
46
|
-
text: r.text,
|
|
47
|
-
html: r.html,
|
|
48
|
-
warnings: [...r.warnings]
|
|
49
|
-
};
|
|
73
|
+
const docxOpts = analyzeDocxOptionsFromNode(options);
|
|
74
|
+
const r = docxOpts !== void 0 ? await analyzeDocx(data, docxOpts) : await analyzeDocx(data);
|
|
75
|
+
return docxPackageResultToAnalysisResult(r);
|
|
50
76
|
}
|
|
51
77
|
async function analyzeImageForNode(input, options) {
|
|
52
78
|
if (options?.signal?.aborted) {
|
|
@@ -112,17 +138,45 @@ async function analyzePdfForNode(input, options) {
|
|
|
112
138
|
signal: userPdf?.signal ?? options?.signal
|
|
113
139
|
};
|
|
114
140
|
const r = await analyzePdf(data, pdfOpts);
|
|
141
|
+
const usePdfJsPerPage = pdfOpts.ocr === "off" && (options?.pdfNativeTextSource ?? "pdf-parse") === "pdfjs-per-page";
|
|
142
|
+
if (!usePdfJsPerPage) {
|
|
143
|
+
return {
|
|
144
|
+
fileKind: "pdf",
|
|
145
|
+
analyzer: "pdf",
|
|
146
|
+
status: "ok",
|
|
147
|
+
kind: "pdf",
|
|
148
|
+
text: r.text,
|
|
149
|
+
pages: r.pages,
|
|
150
|
+
metadata: r.metadata,
|
|
151
|
+
warnings: [...r.warnings],
|
|
152
|
+
needsOCR: r.needsOCR,
|
|
153
|
+
ocrUsed: r.ocrUsed
|
|
154
|
+
};
|
|
155
|
+
}
|
|
156
|
+
let text = r.text;
|
|
157
|
+
const extra = [];
|
|
158
|
+
try {
|
|
159
|
+
const rows = await extractPdfTextByPage(data, {
|
|
160
|
+
maxPages: pdfOpts.maxPages,
|
|
161
|
+
signal: pdfOpts.signal
|
|
162
|
+
});
|
|
163
|
+
text = rows.map((row) => row.text).join("\n\n");
|
|
164
|
+
} catch (e) {
|
|
165
|
+
const msg = e instanceof Error ? e.message : String(e);
|
|
166
|
+
extra.push(`warning: PDF.js per-page text failed; using pdf-parse text: ${msg}`);
|
|
167
|
+
}
|
|
168
|
+
const needsOCR = r.pages > 0 && text.trim().length === 0;
|
|
115
169
|
return {
|
|
116
170
|
fileKind: "pdf",
|
|
117
171
|
analyzer: "pdf",
|
|
118
172
|
status: "ok",
|
|
119
173
|
kind: "pdf",
|
|
120
|
-
text
|
|
174
|
+
text,
|
|
121
175
|
pages: r.pages,
|
|
122
176
|
metadata: r.metadata,
|
|
123
|
-
warnings: [...r.warnings],
|
|
124
|
-
needsOCR
|
|
125
|
-
ocrUsed:
|
|
177
|
+
warnings: [...r.warnings, ...extra],
|
|
178
|
+
needsOCR,
|
|
179
|
+
ocrUsed: false
|
|
126
180
|
};
|
|
127
181
|
}
|
|
128
182
|
function toPathString(pathOrUrl) {
|
|
@@ -160,7 +214,7 @@ async function analyzeFile(input, options) {
|
|
|
160
214
|
case "pdf":
|
|
161
215
|
return analyzePdfForNode(resolved, options);
|
|
162
216
|
case "docx":
|
|
163
|
-
return analyzeDocxForNode(resolved, options
|
|
217
|
+
return analyzeDocxForNode(resolved, options);
|
|
164
218
|
case "image":
|
|
165
219
|
return analyzeImageForNode(resolved, options);
|
|
166
220
|
case "text":
|
|
@@ -191,7 +245,7 @@ function throwIfAborted(signal) {
|
|
|
191
245
|
}
|
|
192
246
|
|
|
193
247
|
// src/publicActions.ts
|
|
194
|
-
var DOCX_METADATA_STUB = "Structured document metadata for DOCX is not
|
|
248
|
+
var DOCX_METADATA_STUB = "Structured document metadata for DOCX is not returned by extractMetadata; use analyzeFile, extractText, or convertToHtml with options.docx.include for OOXML structure, headings, tables, blocks, approximate pages, and embedded images.";
|
|
195
249
|
var IMAGE_METADATA_NOTE = "Raster images have no document metadata bundle in this API.";
|
|
196
250
|
var RUN_OCR_PDF_FORCE_SEMANTICS = 'runOcr: PDF pipeline ran with `ocr: "force"` so text may include raster OCR output even when a text layer exists.';
|
|
197
251
|
function escapeHtmlMinimal(s) {
|
|
@@ -211,7 +265,11 @@ function toExtractTextResult(full) {
|
|
|
211
265
|
}
|
|
212
266
|
async function extractText(input, options) {
|
|
213
267
|
throwIfAborted(options?.signal);
|
|
214
|
-
const
|
|
268
|
+
const merged = {
|
|
269
|
+
...withPdfOcrDefaultOff(options),
|
|
270
|
+
pdfNativeTextSource: options?.pdfNativeTextSource ?? "pdfjs-per-page"
|
|
271
|
+
};
|
|
272
|
+
const full = await analyzeFile(input, merged);
|
|
215
273
|
return toExtractTextResult(full);
|
|
216
274
|
}
|
|
217
275
|
async function extractMetadata(input, options) {
|
|
@@ -433,19 +491,16 @@ async function runOcr(input, options) {
|
|
|
433
491
|
warnings: ["No document bytes were provided for analysis."]
|
|
434
492
|
};
|
|
435
493
|
}
|
|
436
|
-
const
|
|
437
|
-
|
|
438
|
-
|
|
439
|
-
|
|
440
|
-
status: "ok",
|
|
441
|
-
kind: "docx",
|
|
442
|
-
text: r.text,
|
|
443
|
-
html: r.html,
|
|
494
|
+
const opt = analyzeDocxOptionsFromNode(options);
|
|
495
|
+
const raw = opt !== void 0 ? await analyzeDocx(data, opt) : await analyzeDocx(data);
|
|
496
|
+
const withNote = {
|
|
497
|
+
...raw,
|
|
444
498
|
warnings: [
|
|
445
|
-
...
|
|
499
|
+
...raw.warnings,
|
|
446
500
|
"OCR does not apply to DOCX; returned structured text/HTML extract."
|
|
447
501
|
]
|
|
448
502
|
};
|
|
503
|
+
return docxPackageResultToAnalysisResult(withNote);
|
|
449
504
|
}
|
|
450
505
|
case "text":
|
|
451
506
|
return analyzeText(resolved, { signal });
|
|
@@ -454,11 +509,141 @@ async function runOcr(input, options) {
|
|
|
454
509
|
}
|
|
455
510
|
}
|
|
456
511
|
|
|
512
|
+
// src/capabilityReport.ts
|
|
513
|
+
function docxIncludeRequested(flags) {
|
|
514
|
+
if (!flags) return false;
|
|
515
|
+
return !!(flags.structure || flags.headings || flags.tables || flags.blocks || flags.pagesApprox || flags.embeddedImages);
|
|
516
|
+
}
|
|
517
|
+
var DOCX_STRUCTURE_CAPABILITIES = {
|
|
518
|
+
ooxmlExtractorsAvailable: true,
|
|
519
|
+
activatedViaDocxInclude: true,
|
|
520
|
+
features: [
|
|
521
|
+
"OOXML structure (body blocks)",
|
|
522
|
+
"headings",
|
|
523
|
+
"tables",
|
|
524
|
+
"semantic blocks (RAG-friendly)",
|
|
525
|
+
"approximate pages (OOXML page-break hints)",
|
|
526
|
+
"embedded images (word/media; optional web/both modes)"
|
|
527
|
+
],
|
|
528
|
+
notes: [
|
|
529
|
+
"Use options.docx.include on analyzeFile, extractText, convertToHtml, or runOcr to merge Mammoth output with selected extractors.",
|
|
530
|
+
"extractMetadata for DOCX remains a lightweight stub and does not run these extractors."
|
|
531
|
+
]
|
|
532
|
+
};
|
|
533
|
+
var DOCX_EMBEDDED_IMAGE_CAPABILITIES = {
|
|
534
|
+
canExtractEmbeddedImages: true,
|
|
535
|
+
documentsMayIncludeImagesRequiringWebConversion: true,
|
|
536
|
+
webFriendlyConversionNodeFirst: true,
|
|
537
|
+
notes: [
|
|
538
|
+
"Use extractImagesFromDocx (re-exported from @dragon708/docmind-node) for raw ZIP media; optional mode: web | both for web-oriented bytes.",
|
|
539
|
+
"PNG, JPEG, GIF, WebP, SVG, TIFF, BMP, ICO are treated as browser-embeddable; EMF/WMF and similar may require an external Node pipeline.",
|
|
540
|
+
"@dragon708/docmind-docx does not ship a bundled EMF/WMF converter; convertDocxEmbeddedImageToWeb surfaces clear warnings until you wire ImageMagick, Sharp, or similar."
|
|
541
|
+
]
|
|
542
|
+
};
|
|
543
|
+
var DOCX_META = "Structured document metadata is not exposed separately; extractMetadata returns a stub for DOCX.";
|
|
544
|
+
var IMAGE_META = "Raster images have no document metadata bundle; extractMetadata returns a stub.";
|
|
545
|
+
var IMAGE_HTML = "No layout HTML for raster images; use extractText or runOcr for text.";
|
|
546
|
+
var TEXT_META_NOTE = "Plain text has no structured document metadata; extractMetadata still returns decoded content.";
|
|
547
|
+
var UNKNOWN_KIND = "Could not determine file kind from name, MIME, or bytes; all features are reported as unsupported until the kind is known.";
|
|
548
|
+
function slot(id, supported, warnings) {
|
|
549
|
+
return warnings?.length ? { id, supported, warnings } : { id, supported };
|
|
550
|
+
}
|
|
551
|
+
function buildNodeCapabilityReport(kind) {
|
|
552
|
+
const runtime = { id: "node" };
|
|
553
|
+
let capabilities;
|
|
554
|
+
const topWarnings = [];
|
|
555
|
+
switch (kind) {
|
|
556
|
+
case "pdf":
|
|
557
|
+
capabilities = [
|
|
558
|
+
slot("text", true, [
|
|
559
|
+
"Native text via pdf-parse and (in Node extractText) PDF.js per-page text; set pdf.ocr for raster OCR."
|
|
560
|
+
]),
|
|
561
|
+
slot("metadata", true, [
|
|
562
|
+
"Document info / XMP-style metadata via pdf-parse without running the OCR pipeline."
|
|
563
|
+
]),
|
|
564
|
+
slot("pages", true, [
|
|
565
|
+
"Page count and per-page native extraction (PDF.js) where used; OCR respects pdf.maxPages."
|
|
566
|
+
]),
|
|
567
|
+
slot("ocr", true, [
|
|
568
|
+
"Raster OCR pipeline (pdf.ocr auto with quality heuristics, force, or runOcr)."
|
|
569
|
+
]),
|
|
570
|
+
slot("html", true, ["HTML is a <pre> preview of extracted text, not visual layout."])
|
|
571
|
+
];
|
|
572
|
+
break;
|
|
573
|
+
case "docx":
|
|
574
|
+
capabilities = [
|
|
575
|
+
slot("text", true, [
|
|
576
|
+
"Mammoth plain text; extractText clears html. Optional OOXML extractors merge when options.docx.include is set."
|
|
577
|
+
]),
|
|
578
|
+
slot("metadata", false, [
|
|
579
|
+
`${DOCX_META} OOXML structure, headings, tables, blocks, approximate pages, and embedded images are available via analyzeFile-style routes with options.docx.include.`
|
|
580
|
+
]),
|
|
581
|
+
slot("html", true, [
|
|
582
|
+
"Mammoth HTML uses docxImagesAsDataUri for web-safe images; EMF/WMF and other non-web types appear as placeholders, not extracted media."
|
|
583
|
+
]),
|
|
584
|
+
slot("ocr", false, ["OCR does not apply to DOCX in DocMind."]),
|
|
585
|
+
slot("pages", false, [
|
|
586
|
+
"No PDF-style page count; approximate DOCX pages via options.docx.include.pagesApprox (OOXML hints, not print layout)."
|
|
587
|
+
])
|
|
588
|
+
];
|
|
589
|
+
break;
|
|
590
|
+
case "image":
|
|
591
|
+
capabilities = [
|
|
592
|
+
slot("text", true, ["Text is obtained via OCR."]),
|
|
593
|
+
slot("metadata", false, [IMAGE_META]),
|
|
594
|
+
slot("html", false, [IMAGE_HTML]),
|
|
595
|
+
slot("ocr", true),
|
|
596
|
+
slot("pages", false)
|
|
597
|
+
];
|
|
598
|
+
break;
|
|
599
|
+
case "text":
|
|
600
|
+
capabilities = [
|
|
601
|
+
slot("text", true),
|
|
602
|
+
slot("metadata", true, [TEXT_META_NOTE]),
|
|
603
|
+
slot("html", true),
|
|
604
|
+
slot("ocr", false, ["OCR does not apply to plain text files."]),
|
|
605
|
+
slot("pages", false)
|
|
606
|
+
];
|
|
607
|
+
break;
|
|
608
|
+
default:
|
|
609
|
+
topWarnings.push(UNKNOWN_KIND);
|
|
610
|
+
capabilities = [
|
|
611
|
+
slot("text", false),
|
|
612
|
+
slot("metadata", false),
|
|
613
|
+
slot("html", false),
|
|
614
|
+
slot("ocr", false),
|
|
615
|
+
slot("pages", false)
|
|
616
|
+
];
|
|
617
|
+
}
|
|
618
|
+
return {
|
|
619
|
+
kind,
|
|
620
|
+
runtime,
|
|
621
|
+
capabilities,
|
|
622
|
+
...kind === "docx" ? {
|
|
623
|
+
docxEmbeddedImages: DOCX_EMBEDDED_IMAGE_CAPABILITIES,
|
|
624
|
+
docxStructure: DOCX_STRUCTURE_CAPABILITIES
|
|
625
|
+
} : {},
|
|
626
|
+
warnings: topWarnings.length > 0 ? topWarnings : void 0
|
|
627
|
+
};
|
|
628
|
+
}
|
|
629
|
+
|
|
457
630
|
// src/analysisPlanReport.ts
|
|
458
631
|
function lim(...items) {
|
|
459
632
|
return items.filter(Boolean);
|
|
460
633
|
}
|
|
461
|
-
|
|
634
|
+
var DOCX_ZIP_MEDIA_PLAN_NOTE = "ZIP embedded images (word/media) use extractImagesFromDocx (re-exported from @dragon708/docmind-node); not merged into this intent pipeline.";
|
|
635
|
+
function finalizeDocxExplainReport(report) {
|
|
636
|
+
if (report.kind !== "docx") return report;
|
|
637
|
+
const limitations = report.limitations.includes(DOCX_ZIP_MEDIA_PLAN_NOTE) ? report.limitations : [...report.limitations, DOCX_ZIP_MEDIA_PLAN_NOTE];
|
|
638
|
+
return {
|
|
639
|
+
...report,
|
|
640
|
+
docxEmbeddedImages: DOCX_EMBEDDED_IMAGE_CAPABILITIES,
|
|
641
|
+
docxStructure: DOCX_STRUCTURE_CAPABILITIES,
|
|
642
|
+
limitations
|
|
643
|
+
};
|
|
644
|
+
}
|
|
645
|
+
var DOCX_MAMMOTH_PLUS_OPTIONAL = "Mammoth extracts text and HTML from OOXML; optional parallel OOXML/ZIP extractors run when options.docx.include is set (structure, headings, tables, blocks, pagesApprox, embeddedImages).";
|
|
646
|
+
function buildNodeExplainReport(kind, intent, pdfOcr, plan, docxInclude) {
|
|
462
647
|
const runtime = { id: "node" };
|
|
463
648
|
const primaryAnalyzer = kind === "pdf" ? "pdf" : kind === "docx" ? "docx" : kind === "image" ? "image" : kind === "text" ? "text" : "none";
|
|
464
649
|
let nativeExtraction;
|
|
@@ -468,7 +653,7 @@ function buildNodeExplainReport(kind, intent, pdfOcr, plan) {
|
|
|
468
653
|
limitations = lim(
|
|
469
654
|
"Could not classify the file from name, MIME, or bytes; analysis will return not_implemented until hints improve."
|
|
470
655
|
);
|
|
471
|
-
return {
|
|
656
|
+
return finalizeDocxExplainReport({
|
|
472
657
|
kind,
|
|
473
658
|
detectedKind: kind,
|
|
474
659
|
runtime,
|
|
@@ -478,23 +663,23 @@ function buildNodeExplainReport(kind, intent, pdfOcr, plan) {
|
|
|
478
663
|
ocr: { mayUse: false, description: "OCR is not used for unknown kinds." },
|
|
479
664
|
limitations,
|
|
480
665
|
plan
|
|
481
|
-
};
|
|
666
|
+
});
|
|
482
667
|
}
|
|
483
668
|
switch (intent) {
|
|
484
669
|
case "analyzeFile":
|
|
485
670
|
if (kind === "pdf") {
|
|
486
671
|
nativeExtraction = {
|
|
487
672
|
willAttempt: true,
|
|
488
|
-
description: "pdf-parse
|
|
673
|
+
description: "pdf-parse supplies embedded text, metadata, and page count; PDF.js drives raster OCR when enabled."
|
|
489
674
|
};
|
|
490
675
|
ocr3 = {
|
|
491
676
|
mayUse: pdfOcr !== "off",
|
|
492
|
-
description: pdfOcr === "off" ? "Raster OCR pipeline is off (pdf.ocr: off)." : pdfOcr === "force" ? "Raster OCR may run on all pages when pdf.ocr is force." : "Raster OCR may run when
|
|
677
|
+
description: pdfOcr === "off" ? "Raster OCR pipeline is off (pdf.ocr: off)." : pdfOcr === "force" ? "Raster OCR may run on all pages when pdf.ocr is force." : "Raster OCR may run when native text looks insufficient (pdf.ocr: auto + heuristics)."
|
|
493
678
|
};
|
|
494
679
|
} else if (kind === "docx") {
|
|
495
680
|
nativeExtraction = {
|
|
496
681
|
willAttempt: true,
|
|
497
|
-
description: "Mammoth
|
|
682
|
+
description: docxIncludeRequested(docxInclude) ? "Mammoth plus parallel OOXML extractors (per options.docx.include)." : DOCX_MAMMOTH_PLUS_OPTIONAL
|
|
498
683
|
};
|
|
499
684
|
ocr3 = { mayUse: false, description: "DOCX does not use OCR in DocMind." };
|
|
500
685
|
} else if (kind === "image") {
|
|
@@ -515,16 +700,16 @@ function buildNodeExplainReport(kind, intent, pdfOcr, plan) {
|
|
|
515
700
|
if (kind === "pdf") {
|
|
516
701
|
nativeExtraction = {
|
|
517
702
|
willAttempt: true,
|
|
518
|
-
description: "
|
|
703
|
+
description: "Node: pdf-parse for metadata/page baseline, then PDF.js per-page text merged into `text` (pdfNativeTextSource pdfjs-per-page default)."
|
|
519
704
|
};
|
|
520
705
|
ocr3 = {
|
|
521
706
|
mayUse: false,
|
|
522
|
-
description: "extractText
|
|
707
|
+
description: "extractText defaults pdf.ocr off; set pdf.ocr explicitly to allow auto/force raster OCR."
|
|
523
708
|
};
|
|
524
709
|
} else if (kind === "docx") {
|
|
525
710
|
nativeExtraction = {
|
|
526
711
|
willAttempt: true,
|
|
527
|
-
description: "Mammoth plain text; HTML cleared
|
|
712
|
+
description: docxIncludeRequested(docxInclude) ? "Same DOCX router as analyzeFile: Mammoth text + optional OOXML fields; HTML cleared in extractText." : "Mammoth plain text; HTML cleared. Optional OOXML v2 fields when options.docx.include is set."
|
|
528
713
|
};
|
|
529
714
|
ocr3 = { mayUse: false, description: "DOCX does not use OCR." };
|
|
530
715
|
} else if (kind === "image") {
|
|
@@ -552,7 +737,7 @@ function buildNodeExplainReport(kind, intent, pdfOcr, plan) {
|
|
|
552
737
|
};
|
|
553
738
|
ocr3 = { mayUse: false, description: "OCR not used for this metadata path." };
|
|
554
739
|
limitations = lim(
|
|
555
|
-
kind === "docx" ? "Structured DOCX metadata is not exposed separately." : "Raster images have no document metadata bundle."
|
|
740
|
+
kind === "docx" ? "Structured DOCX metadata is not exposed separately; use analyzeFile / extractText / convertToHtml with options.docx.include for OOXML structure." : "Raster images have no document metadata bundle."
|
|
556
741
|
);
|
|
557
742
|
} else {
|
|
558
743
|
nativeExtraction = {
|
|
@@ -574,7 +759,7 @@ function buildNodeExplainReport(kind, intent, pdfOcr, plan) {
|
|
|
574
759
|
} else if (kind === "docx") {
|
|
575
760
|
nativeExtraction = {
|
|
576
761
|
willAttempt: true,
|
|
577
|
-
description: "Mammoth HTML
|
|
762
|
+
description: docxIncludeRequested(docxInclude) ? "Mammoth HTML plus optional OOXML extractors (same router as analyzeFile)." : "Mammoth HTML via analyzeFile routing; optional OOXML v2 when options.docx.include is set."
|
|
578
763
|
};
|
|
579
764
|
ocr3 = { mayUse: false, description: "DOCX path does not use OCR." };
|
|
580
765
|
} else if (kind === "text") {
|
|
@@ -609,7 +794,7 @@ function buildNodeExplainReport(kind, intent, pdfOcr, plan) {
|
|
|
609
794
|
} else if (kind === "docx") {
|
|
610
795
|
nativeExtraction = {
|
|
611
796
|
willAttempt: true,
|
|
612
|
-
description: "Full Mammoth extract (text + HTML); not OCR."
|
|
797
|
+
description: docxIncludeRequested(docxInclude) ? "Mammoth text/HTML plus optional OOXML extractors; still not OCR." : "Full Mammoth extract (text + HTML); optional OOXML v2 via options.docx.include; not OCR."
|
|
613
798
|
};
|
|
614
799
|
ocr3 = { mayUse: false, description: "DOCX is not OCR'd." };
|
|
615
800
|
limitations = lim("Result is structured extract, not OCR output.");
|
|
@@ -625,7 +810,7 @@ function buildNodeExplainReport(kind, intent, pdfOcr, plan) {
|
|
|
625
810
|
nativeExtraction = { willAttempt: false, description: "Generic intent; see plan." };
|
|
626
811
|
ocr3 = { mayUse: false, description: "See plan steps." };
|
|
627
812
|
}
|
|
628
|
-
return {
|
|
813
|
+
return finalizeDocxExplainReport({
|
|
629
814
|
kind,
|
|
630
815
|
detectedKind: kind,
|
|
631
816
|
runtime,
|
|
@@ -635,82 +820,14 @@ function buildNodeExplainReport(kind, intent, pdfOcr, plan) {
|
|
|
635
820
|
ocr: ocr3,
|
|
636
821
|
limitations,
|
|
637
822
|
plan
|
|
638
|
-
};
|
|
639
|
-
}
|
|
640
|
-
|
|
641
|
-
// src/capabilityReport.ts
|
|
642
|
-
var DOCX_META = "Structured document metadata is not exposed separately; extractMetadata returns a stub for DOCX.";
|
|
643
|
-
var IMAGE_META = "Raster images have no document metadata bundle; extractMetadata returns a stub.";
|
|
644
|
-
var IMAGE_HTML = "No layout HTML for raster images; use extractText or runOcr for text.";
|
|
645
|
-
var TEXT_META_NOTE = "Plain text has no structured document metadata; extractMetadata still returns decoded content.";
|
|
646
|
-
var UNKNOWN_KIND = "Could not determine file kind from name, MIME, or bytes; all features are reported as unsupported until the kind is known.";
|
|
647
|
-
function slot(id, supported, warnings) {
|
|
648
|
-
return warnings?.length ? { id, supported, warnings } : { id, supported };
|
|
649
|
-
}
|
|
650
|
-
function buildNodeCapabilityReport(kind) {
|
|
651
|
-
const runtime = { id: "node" };
|
|
652
|
-
let capabilities;
|
|
653
|
-
const topWarnings = [];
|
|
654
|
-
switch (kind) {
|
|
655
|
-
case "pdf":
|
|
656
|
-
capabilities = [
|
|
657
|
-
slot("text", true, ["Includes text layer extraction; use extractText options to avoid PDF OCR."]),
|
|
658
|
-
slot("metadata", true),
|
|
659
|
-
slot("html", true, ["HTML is a <pre> preview of extracted text, not visual layout."]),
|
|
660
|
-
slot("ocr", true, ["Raster OCR is available (e.g. analyzeFile with pdf.ocr auto/force, or runOcr)."]),
|
|
661
|
-
slot("pages", true)
|
|
662
|
-
];
|
|
663
|
-
break;
|
|
664
|
-
case "docx":
|
|
665
|
-
capabilities = [
|
|
666
|
-
slot("text", true),
|
|
667
|
-
slot("metadata", false, [DOCX_META]),
|
|
668
|
-
slot("html", true),
|
|
669
|
-
slot("ocr", false, ["OCR does not apply to DOCX in DocMind."]),
|
|
670
|
-
slot("pages", false)
|
|
671
|
-
];
|
|
672
|
-
break;
|
|
673
|
-
case "image":
|
|
674
|
-
capabilities = [
|
|
675
|
-
slot("text", true, ["Text is obtained via OCR."]),
|
|
676
|
-
slot("metadata", false, [IMAGE_META]),
|
|
677
|
-
slot("html", false, [IMAGE_HTML]),
|
|
678
|
-
slot("ocr", true),
|
|
679
|
-
slot("pages", false)
|
|
680
|
-
];
|
|
681
|
-
break;
|
|
682
|
-
case "text":
|
|
683
|
-
capabilities = [
|
|
684
|
-
slot("text", true),
|
|
685
|
-
slot("metadata", true, [TEXT_META_NOTE]),
|
|
686
|
-
slot("html", true),
|
|
687
|
-
slot("ocr", false, ["OCR does not apply to plain text files."]),
|
|
688
|
-
slot("pages", false)
|
|
689
|
-
];
|
|
690
|
-
break;
|
|
691
|
-
default:
|
|
692
|
-
topWarnings.push(UNKNOWN_KIND);
|
|
693
|
-
capabilities = [
|
|
694
|
-
slot("text", false),
|
|
695
|
-
slot("metadata", false),
|
|
696
|
-
slot("html", false),
|
|
697
|
-
slot("ocr", false),
|
|
698
|
-
slot("pages", false)
|
|
699
|
-
];
|
|
700
|
-
}
|
|
701
|
-
return {
|
|
702
|
-
kind,
|
|
703
|
-
runtime,
|
|
704
|
-
capabilities,
|
|
705
|
-
warnings: topWarnings.length > 0 ? topWarnings : void 0
|
|
706
|
-
};
|
|
823
|
+
});
|
|
707
824
|
}
|
|
708
825
|
|
|
709
826
|
// src/introspection.ts
|
|
710
827
|
function resolvePdfOcrMode(pdf) {
|
|
711
828
|
return pdf?.ocr ?? "auto";
|
|
712
829
|
}
|
|
713
|
-
function planAnalyzeFile(kind, pdfOcr) {
|
|
830
|
+
function planAnalyzeFile(kind, pdfOcr, docxInclude) {
|
|
714
831
|
switch (kind) {
|
|
715
832
|
case "pdf":
|
|
716
833
|
return {
|
|
@@ -724,14 +841,17 @@ function planAnalyzeFile(kind, pdfOcr) {
|
|
|
724
841
|
}
|
|
725
842
|
]
|
|
726
843
|
};
|
|
727
|
-
case "docx":
|
|
844
|
+
case "docx": {
|
|
845
|
+
const parallel = docxIncludeRequested(docxInclude);
|
|
728
846
|
return {
|
|
729
847
|
intent: "analyzeFile",
|
|
730
848
|
steps: [
|
|
731
849
|
{ id: "detect_kind", status: "done" },
|
|
732
|
-
{ id: "docx_mammoth", status: "planned" }
|
|
850
|
+
{ id: "docx_mammoth", status: "planned" },
|
|
851
|
+
...parallel ? [{ id: "docx_ooxml_parallel", status: "planned" }] : []
|
|
733
852
|
]
|
|
734
853
|
};
|
|
854
|
+
}
|
|
735
855
|
case "image":
|
|
736
856
|
return {
|
|
737
857
|
intent: "analyzeFile",
|
|
@@ -758,11 +878,22 @@ function planAnalyzeFile(kind, pdfOcr) {
|
|
|
758
878
|
};
|
|
759
879
|
}
|
|
760
880
|
}
|
|
761
|
-
function planForIntent(intentOpt, kind, pdfOcrForAnalyze) {
|
|
881
|
+
function planForIntent(intentOpt, kind, pdfOcrForAnalyze, docxInclude) {
|
|
762
882
|
const intent = intentOpt ?? "analyzeFile";
|
|
763
|
-
if (intent === "analyzeFile") return planAnalyzeFile(kind, pdfOcrForAnalyze);
|
|
883
|
+
if (intent === "analyzeFile") return planAnalyzeFile(kind, pdfOcrForAnalyze, docxInclude);
|
|
764
884
|
if (intent === "extractText") {
|
|
765
|
-
|
|
885
|
+
if (kind === "pdf") {
|
|
886
|
+
return {
|
|
887
|
+
intent: "extractText",
|
|
888
|
+
steps: [
|
|
889
|
+
{ id: "detect_kind", status: "done" },
|
|
890
|
+
{ id: "pdf_parse", status: "planned" },
|
|
891
|
+
{ id: "pdfjs_per_page", status: "planned" },
|
|
892
|
+
{ id: "pdf_ocr", status: "skipped" }
|
|
893
|
+
]
|
|
894
|
+
};
|
|
895
|
+
}
|
|
896
|
+
const p = planAnalyzeFile(kind, "off", docxInclude);
|
|
766
897
|
return { ...p, intent: "extractText" };
|
|
767
898
|
}
|
|
768
899
|
if (intent === "extractMetadata") {
|
|
@@ -794,11 +925,13 @@ function planForIntent(intentOpt, kind, pdfOcrForAnalyze) {
|
|
|
794
925
|
}
|
|
795
926
|
if (intent === "convertToHtml") {
|
|
796
927
|
if (kind === "docx") {
|
|
928
|
+
const parallel = docxIncludeRequested(docxInclude);
|
|
797
929
|
return {
|
|
798
930
|
intent: "convertToHtml",
|
|
799
931
|
steps: [
|
|
800
932
|
{ id: "detect_kind", status: "done" },
|
|
801
|
-
{ id: "docx_mammoth_html", status: "planned" }
|
|
933
|
+
{ id: "docx_mammoth_html", status: "planned" },
|
|
934
|
+
...parallel ? [{ id: "docx_ooxml_parallel", status: "planned" }] : []
|
|
802
935
|
]
|
|
803
936
|
};
|
|
804
937
|
}
|
|
@@ -851,11 +984,13 @@ function planForIntent(intentOpt, kind, pdfOcrForAnalyze) {
|
|
|
851
984
|
};
|
|
852
985
|
}
|
|
853
986
|
if (kind === "docx") {
|
|
987
|
+
const parallel = docxIncludeRequested(docxInclude);
|
|
854
988
|
return {
|
|
855
989
|
intent: "runOcr",
|
|
856
990
|
steps: [
|
|
857
991
|
{ id: "detect_kind", status: "done" },
|
|
858
|
-
{ id: "
|
|
992
|
+
{ id: "docx_mammoth", status: "planned" },
|
|
993
|
+
...parallel ? [{ id: "docx_ooxml_parallel", status: "planned" }] : []
|
|
859
994
|
]
|
|
860
995
|
};
|
|
861
996
|
}
|
|
@@ -867,7 +1002,7 @@ function planForIntent(intentOpt, kind, pdfOcrForAnalyze) {
|
|
|
867
1002
|
]
|
|
868
1003
|
};
|
|
869
1004
|
}
|
|
870
|
-
return planAnalyzeFile(kind, pdfOcrForAnalyze);
|
|
1005
|
+
return planAnalyzeFile(kind, pdfOcrForAnalyze, docxInclude);
|
|
871
1006
|
}
|
|
872
1007
|
async function getCapabilities(input, options) {
|
|
873
1008
|
throwIfAborted(options?.signal);
|
|
@@ -883,10 +1018,11 @@ async function explainAnalysisPlan(input, options) {
|
|
|
883
1018
|
const kind = detectFileKind(resolved);
|
|
884
1019
|
const intent = options?.intent ?? "analyzeFile";
|
|
885
1020
|
const pdfOcrAnalyze = resolvePdfOcrMode(options?.pdf);
|
|
886
|
-
const
|
|
887
|
-
|
|
1021
|
+
const docxInc = options?.docx?.include;
|
|
1022
|
+
const plan = planForIntent(intent, kind, pdfOcrAnalyze, docxInc);
|
|
1023
|
+
return buildNodeExplainReport(kind, intent, pdfOcrAnalyze, plan, docxInc);
|
|
888
1024
|
}
|
|
889
1025
|
|
|
890
|
-
export { analyzeFile, bufferToInput, convertToHtml, explainAnalysisPlan, extractMetadata, extractText, getCapabilities, readFileToInput, resolveNodeAnalyzeInput, runOcr };
|
|
1026
|
+
export { DOCX_EMBEDDED_IMAGE_CAPABILITIES, DOCX_STRUCTURE_CAPABILITIES, analyzeFile, bufferToInput, convertToHtml, docxIncludeRequested, explainAnalysisPlan, extractMetadata, extractText, getCapabilities, readFileToInput, resolveNodeAnalyzeInput, runOcr };
|
|
891
1027
|
//# sourceMappingURL=index.js.map
|
|
892
1028
|
//# sourceMappingURL=index.js.map
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@dragon708/docmind-node",
|
|
3
|
-
"version": "1.
|
|
3
|
+
"version": "1.7.0",
|
|
4
4
|
"description": "Official DocMind Node facade: analyzeFile, intent APIs, PDF/DOCX/OCR, and fs helpers.",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"main": "./dist/index.js",
|
|
@@ -32,9 +32,9 @@
|
|
|
32
32
|
],
|
|
33
33
|
"license": "MIT",
|
|
34
34
|
"dependencies": {
|
|
35
|
-
"@dragon708/docmind-docx": "^1.
|
|
35
|
+
"@dragon708/docmind-docx": "^1.7.0",
|
|
36
36
|
"@dragon708/docmind-ocr": "^1.0.0",
|
|
37
|
-
"@dragon708/docmind-pdf": "^
|
|
37
|
+
"@dragon708/docmind-pdf": "^2.0.0",
|
|
38
38
|
"@dragon708/docmind-shared": "^1.1.0"
|
|
39
39
|
},
|
|
40
40
|
"devDependencies": {
|