@dragon708/docmind-node 1.8.0 → 1.9.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.d.ts +10 -6
- package/dist/index.js +163 -86
- package/package.json +5 -5
package/dist/index.d.ts
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
import { DocMindAnalyzeOptions, DetectFileKindInput, NamedInput, AnalysisResult, FileKind, RuntimeDescriptor, DocMindPublicIntent, AnalysisAnalyzer, ProcessingPlanDescriptor, ExplainAnalysisPlanOptions, GetCapabilitiesOptions } from '@dragon708/docmind-shared';
|
|
2
2
|
export { AnalysisAnalyzer, AnalysisResult, CapabilityDescriptor, DetectFileKindInput, DocMindPublicIntent, DocxAnalysisCoreResult, ExplainAnalysisPlanOptions, ExplainAnalysisPlanResult, FileKind, FileKindMetadata, GenericAnalysisResult, GetCapabilitiesOptions, GetCapabilitiesResult, ImageAnalysisCoreResult, PdfAnalysisCoreResult, TextAnalysisResult, detectFileKind } from '@dragon708/docmind-shared';
|
|
3
|
-
import { OcrOptions } from '@dragon708/docmind-ocr';
|
|
3
|
+
import { OcrOptions, OcrTiffOptions, PreprocessImageOptions } from '@dragon708/docmind-ocr';
|
|
4
4
|
import { PdfAnalyzeOptions } from '@dragon708/docmind-pdf';
|
|
5
5
|
import { AnalyzeDocxIncludeFlags, DocxToHtmlOptions } from '@dragon708/docmind-docx';
|
|
6
6
|
export { AnalyzeDocxIncludeFlags, DocxEmbeddedImage, DocxEmbeddedImageConversionOptions, DocxEmbeddedImageConversionResult, DocxEmbeddedImageWebSlice, DocxImageExtractionMode, ExtractImagesFromDocxOptions, ExtractImagesFromDocxResult, convertDocxEmbeddedImageToWeb, convertDocxImagesForWeb, docxImageIsBrowserRenderable, docxImageRequiresConversion, docxImageSuggestedTargetFormat, docxImageToDataUri, extractImagesFromDocx, isNodeJsRuntime } from '@dragon708/docmind-docx';
|
|
@@ -10,7 +10,7 @@ export { AnalyzeDocxIncludeFlags, DocxEmbeddedImage, DocxEmbeddedImageConversion
|
|
|
10
10
|
*
|
|
11
11
|
* - **`pdf`**: forwarded to `@dragon708/docmind-pdf`. `analyzeFile` defaults `pdf.ocr` to `"auto"` when omitted.
|
|
12
12
|
* {@link extractText} / {@link convertToHtml} merge a default of `ocr: "off"` unless you set `pdf.ocr` explicitly.
|
|
13
|
-
* - **`ocr`**: forwarded to `@dragon708/docmind-ocr` for raster images;
|
|
13
|
+
* - **`ocr`**: forwarded to `@dragon708/docmind-ocr` for raster images (`ocrImageDetailed` + `normalizeImageForOcr`, or `ocrTiff` for TIFF); optional `preprocess` maps to {@link PreprocessImageOptions}. `maxPages` / `pageSeparator` apply to multipage TIFF. Language also feeds PDF OCR when `pdf.ocrLangs` is unset.
|
|
14
14
|
* - **`pdfNativeTextSource`**: when `pdf.ocr` resolves to `"off"`, chooses how native text is obtained (see {@link extractText} default).
|
|
15
15
|
*/
|
|
16
16
|
/**
|
|
@@ -26,7 +26,9 @@ interface NodeAnalyzeDocxOptionsSlice {
|
|
|
26
26
|
}
|
|
27
27
|
interface NodeAnalyzeOptions extends DocMindAnalyzeOptions {
|
|
28
28
|
readonly pdf?: PdfAnalyzeOptions;
|
|
29
|
-
readonly ocr?: OcrOptions
|
|
29
|
+
readonly ocr?: OcrOptions & Pick<OcrTiffOptions, "maxPages" | "pageSeparator"> & {
|
|
30
|
+
readonly preprocess?: PreprocessImageOptions;
|
|
31
|
+
};
|
|
30
32
|
/** Solo DOCX: ver {@link NodeAnalyzeDocxOptionsSlice}. */
|
|
31
33
|
readonly docx?: NodeAnalyzeDocxOptionsSlice;
|
|
32
34
|
/**
|
|
@@ -77,12 +79,14 @@ declare function extractMetadata(input: NodeAnalyzeInput, options?: NodeAnalyzeO
|
|
|
77
79
|
declare function convertToHtml(input: NodeAnalyzeInput, options?: NodeAnalyzeOptions): Promise<AnalysisResult>;
|
|
78
80
|
/**
|
|
79
81
|
* OCR intent: PDF always runs {@link analyzePdf} with `ocr: "force"` (merged with `options.pdf`).
|
|
80
|
-
* Raster images
|
|
82
|
+
* Raster images: `ocrImageDetailed` (normalize → optional preprocess → Tesseract) for supported single-frame inputs;
|
|
83
|
+
* TIFF via multipage `ocrTiff` (`options.ocr.maxPages`, `pageSeparator`).
|
|
84
|
+
* DOCX returns structured extract with a notice.
|
|
81
85
|
*/
|
|
82
86
|
declare function runOcr(input: NodeAnalyzeInput, options?: NodeAnalyzeOptions): Promise<AnalysisResult>;
|
|
83
87
|
|
|
84
88
|
/** High-level features the user can ask DocMind for (per input kind and runtime). */
|
|
85
|
-
type PublicCapabilityId = "text" | "metadata" | "html" | "ocr" | "pages";
|
|
89
|
+
type PublicCapabilityId = "text" | "metadata" | "html" | "ocr" | "pages" | "ocr-multipage" | "image-normalization" | "tiff" | "heic-node-only" | "bmp" | "gif-first-frame";
|
|
86
90
|
/**
|
|
87
91
|
* DOCX-only: what the stack can do with OOXML embedded bitmaps/vector payloads under `word/media`.
|
|
88
92
|
* Present on {@link GetCapabilitiesReport} when `kind === "docx"`.
|
|
@@ -174,7 +178,7 @@ type NodeExplainAnalysisPlanOptions = ExplainAnalysisPlanOptions & Pick<NodeAnal
|
|
|
174
178
|
|
|
175
179
|
/**
|
|
176
180
|
* Epic 1 — **Capabilities:** after resolving {@link NodeAnalyzeInput}, lists which of
|
|
177
|
-
* `text` | `metadata` | `html` | `ocr` | `pages` apply for that kind in Node (for PDF, `text` / `metadata` /
|
|
181
|
+
* `text` | `metadata` | `html` | `ocr` | `pages` (and image-specific ids such as `ocr-multipage`, `image-normalization`, `tiff`, `heic-node-only`, `bmp`, `gif-first-frame`) apply for that kind in Node (for PDF, `text` / `metadata` /
|
|
178
182
|
* `pages` / `ocr` describe the v2 pdf-parse + PDF.js + OCR stack; see {@link buildNodeCapabilityReport}).
|
|
179
183
|
* For **DOCX**, `docxEmbeddedImages` and `docxStructure` describe ZIP media and optional OOXML v2 extractors (`options.docx.include`).
|
|
180
184
|
* Does not run Mammoth/Tesseract/PDF bodies beyond path resolution.
|
package/dist/index.js
CHANGED
|
@@ -1,8 +1,8 @@
|
|
|
1
|
-
import { assertValidAnalyzeFileInput, detectFileKind, notImplementedResult, UNKNOWN_FORMAT_WARNING, analyzeText, toUint8Array, isNamedInput, isBinaryInput, isBlob, isFile } from '@dragon708/docmind-shared';
|
|
1
|
+
import { assertValidAnalyzeFileInput, detectFileKind, notImplementedResult, UNKNOWN_FORMAT_WARNING, analyzeText, toUint8Array, isNamedInput, isBinaryInput, isBlob, isFile, getMimeType } from '@dragon708/docmind-shared';
|
|
2
2
|
export { detectFileKind } from '@dragon708/docmind-shared';
|
|
3
3
|
import { analyzeDocx } from '@dragon708/docmind-docx';
|
|
4
4
|
export { convertDocxEmbeddedImageToWeb, convertDocxImagesForWeb, docxImageIsBrowserRenderable, docxImageRequiresConversion, docxImageSuggestedTargetFormat, docxImageToDataUri, extractImagesFromDocx, isNodeJsRuntime } from '@dragon708/docmind-docx';
|
|
5
|
-
import {
|
|
5
|
+
import { preprocessHasEffect, resolveImageFormat, ocrTiff, ocrImageDetailed } from '@dragon708/docmind-ocr';
|
|
6
6
|
import { extractPdfMetadata, extractTextFromPdf, analyzePdf, extractPdfTextByPage } from '@dragon708/docmind-pdf';
|
|
7
7
|
import { readFile } from 'fs/promises';
|
|
8
8
|
import { basename } from 'path';
|
|
@@ -74,6 +74,53 @@ async function analyzeDocxForNode(input, options) {
|
|
|
74
74
|
const r = docxOpts !== void 0 ? await analyzeDocx(data, docxOpts) : await analyzeDocx(data);
|
|
75
75
|
return docxPackageResultToAnalysisResult(r);
|
|
76
76
|
}
|
|
77
|
+
function meanPageConfidence(pages) {
|
|
78
|
+
if (pages.length === 0) return 0;
|
|
79
|
+
return pages.reduce((s, p) => s + p.confidence, 0) / pages.length;
|
|
80
|
+
}
|
|
81
|
+
function mimeHintFromDetectInput(input) {
|
|
82
|
+
if (!isByteBackedInput(input)) return void 0;
|
|
83
|
+
return getMimeType(input);
|
|
84
|
+
}
|
|
85
|
+
async function runRasterOcrForNode(data, input, options) {
|
|
86
|
+
const signal = options?.ocr?.signal ?? options?.signal;
|
|
87
|
+
const langs = options?.ocr?.langs;
|
|
88
|
+
const mimeHint = mimeHintFromDetectInput(input);
|
|
89
|
+
const format = resolveImageFormat(data, mimeHint);
|
|
90
|
+
if (format === "tiff") {
|
|
91
|
+
const tiff = await ocrTiff(data, {
|
|
92
|
+
langs,
|
|
93
|
+
signal,
|
|
94
|
+
maxPages: options?.ocr?.maxPages,
|
|
95
|
+
pageSeparator: options?.ocr?.pageSeparator
|
|
96
|
+
});
|
|
97
|
+
const warnings = [...tiff.warnings];
|
|
98
|
+
return {
|
|
99
|
+
text: tiff.text.trim(),
|
|
100
|
+
confidence: meanPageConfidence(tiff.textByPage),
|
|
101
|
+
ocrUsed: true,
|
|
102
|
+
warnings,
|
|
103
|
+
pages: tiff.pagesProcessed,
|
|
104
|
+
textByPage: tiff.textByPage
|
|
105
|
+
};
|
|
106
|
+
}
|
|
107
|
+
const detailed = await ocrImageDetailed(data, {
|
|
108
|
+
langs,
|
|
109
|
+
signal,
|
|
110
|
+
preprocess: options?.ocr?.preprocess
|
|
111
|
+
});
|
|
112
|
+
return {
|
|
113
|
+
text: detailed.text.trim(),
|
|
114
|
+
confidence: detailed.confidence,
|
|
115
|
+
ocrUsed: true,
|
|
116
|
+
warnings: [...detailed.warnings],
|
|
117
|
+
pages: detailed.pages,
|
|
118
|
+
inputFormat: detailed.inputFormat,
|
|
119
|
+
normalizedFormat: detailed.normalizedFormat
|
|
120
|
+
};
|
|
121
|
+
}
|
|
122
|
+
|
|
123
|
+
// src/analyzers/image.ts
|
|
77
124
|
async function analyzeImageForNode(input, options) {
|
|
78
125
|
if (options?.signal?.aborted) {
|
|
79
126
|
const err = new Error("The operation was aborted");
|
|
@@ -93,21 +140,26 @@ async function analyzeImageForNode(input, options) {
|
|
|
93
140
|
warnings: ["No image bytes were provided for analysis."]
|
|
94
141
|
};
|
|
95
142
|
}
|
|
96
|
-
const
|
|
97
|
-
|
|
98
|
-
signal: options?.ocr?.signal ?? options?.signal
|
|
99
|
-
};
|
|
100
|
-
const r = await ocr(data, ocrOpts);
|
|
101
|
-
return {
|
|
143
|
+
const ocrPart = await runRasterOcrForNode(data, input, options);
|
|
144
|
+
const base = {
|
|
102
145
|
fileKind: "image",
|
|
103
146
|
analyzer: "image",
|
|
104
147
|
status: "ok",
|
|
105
148
|
kind: "image",
|
|
106
|
-
text:
|
|
107
|
-
confidence:
|
|
108
|
-
ocrUsed:
|
|
109
|
-
warnings:
|
|
149
|
+
text: ocrPart.text,
|
|
150
|
+
confidence: ocrPart.confidence,
|
|
151
|
+
ocrUsed: true,
|
|
152
|
+
warnings: ocrPart.warnings
|
|
110
153
|
};
|
|
154
|
+
const extra = {};
|
|
155
|
+
if (ocrPart.pages !== void 0) extra.pages = ocrPart.pages;
|
|
156
|
+
if (ocrPart.textByPage !== void 0) extra.textByPage = ocrPart.textByPage;
|
|
157
|
+
if (ocrPart.inputFormat !== void 0) extra.inputFormat = ocrPart.inputFormat;
|
|
158
|
+
if (ocrPart.normalizedFormat !== void 0) extra.normalizedFormat = ocrPart.normalizedFormat;
|
|
159
|
+
if (Object.keys(extra).length > 0) {
|
|
160
|
+
return { ...base, ...extra };
|
|
161
|
+
}
|
|
162
|
+
return base;
|
|
111
163
|
}
|
|
112
164
|
async function analyzePdfForNode(input, options) {
|
|
113
165
|
if (options?.signal?.aborted) {
|
|
@@ -449,34 +501,10 @@ async function runOcr(input, options) {
|
|
|
449
501
|
};
|
|
450
502
|
}
|
|
451
503
|
case "image": {
|
|
452
|
-
|
|
453
|
-
|
|
454
|
-
|
|
455
|
-
|
|
456
|
-
analyzer: "image",
|
|
457
|
-
status: "ok",
|
|
458
|
-
kind: "image",
|
|
459
|
-
text: "",
|
|
460
|
-
confidence: 0,
|
|
461
|
-
ocrUsed: true,
|
|
462
|
-
warnings: ["No image bytes were provided for analysis."]
|
|
463
|
-
};
|
|
464
|
-
}
|
|
465
|
-
const ocrOpts = {
|
|
466
|
-
...options?.ocr ?? {},
|
|
467
|
-
signal: options?.ocr?.signal ?? signal
|
|
468
|
-
};
|
|
469
|
-
const r = await ocr(data, ocrOpts);
|
|
470
|
-
return {
|
|
471
|
-
fileKind: "image",
|
|
472
|
-
analyzer: "image",
|
|
473
|
-
status: "ok",
|
|
474
|
-
kind: "image",
|
|
475
|
-
text: r.text,
|
|
476
|
-
confidence: r.confidence,
|
|
477
|
-
ocrUsed: r.ocrUsed,
|
|
478
|
-
warnings: []
|
|
479
|
-
};
|
|
504
|
+
return analyzeImageForNode(resolved, {
|
|
505
|
+
...options,
|
|
506
|
+
ocr: { ...options?.ocr ?? {}, signal: options?.ocr?.signal ?? signal }
|
|
507
|
+
});
|
|
480
508
|
}
|
|
481
509
|
case "docx": {
|
|
482
510
|
const data = await bytesFromDetectInput(resolved);
|
|
@@ -589,11 +617,33 @@ function buildNodeCapabilityReport(kind) {
|
|
|
589
617
|
break;
|
|
590
618
|
case "image":
|
|
591
619
|
capabilities = [
|
|
592
|
-
slot("text", true, [
|
|
620
|
+
slot("text", true, [
|
|
621
|
+
"Text via `@dragon708/docmind-ocr` after format sniff (PNG, JPEG, WebP, TIFF, BMP, GIF first frame, HEIC/HEIF in Node via conversion)."
|
|
622
|
+
]),
|
|
593
623
|
slot("metadata", false, [IMAGE_META]),
|
|
594
624
|
slot("html", false, [IMAGE_HTML]),
|
|
595
|
-
slot("ocr", true
|
|
596
|
-
|
|
625
|
+
slot("ocr", true, [
|
|
626
|
+
"Single-frame pipeline: `normalizeImageForOcr` \u2192 optional `preprocessImageForOcr` (`options.ocr.preprocess`) \u2192 Tesseract via `ocrImageDetailed`."
|
|
627
|
+
]),
|
|
628
|
+
slot("ocr-multipage", true, [
|
|
629
|
+
"Multipage TIFF: `ocrTiff` with per-page text; `text` joins pages (see `options.ocr.pageSeparator`)."
|
|
630
|
+
]),
|
|
631
|
+
slot("image-normalization", true, [
|
|
632
|
+
"Bytes are normalized to a Tesseract-friendly raster (PNG-oriented) inside the OCR package before recognition."
|
|
633
|
+
]),
|
|
634
|
+
slot("tiff", true, [
|
|
635
|
+
"TIFF detected by magic/MIME routes to `ocrTiff` (not the single-frame `ocrImageDetailed` path)."
|
|
636
|
+
]),
|
|
637
|
+
slot("heic-node-only", true, [
|
|
638
|
+
"HEIC/HEIF uses Node `sharp` conversion in `@dragon708/docmind-ocr`; not available in the browser facade."
|
|
639
|
+
]),
|
|
640
|
+
slot("bmp", true, ["BMP inputs are supported through the universal normalization path."]),
|
|
641
|
+
slot("gif-first-frame", true, [
|
|
642
|
+
"Animated GIF: only the first frame is normalized and OCR'd."
|
|
643
|
+
]),
|
|
644
|
+
slot("pages", true, [
|
|
645
|
+
"TIFF: `pages` and `textByPage` mirror frames processed. Other formats may expose `pages` when the normalizer reports it (e.g. GIF)."
|
|
646
|
+
])
|
|
597
647
|
];
|
|
598
648
|
break;
|
|
599
649
|
case "text":
|
|
@@ -643,11 +693,12 @@ function finalizeDocxExplainReport(report) {
|
|
|
643
693
|
};
|
|
644
694
|
}
|
|
645
695
|
var DOCX_MAMMOTH_PLUS_OPTIONAL = "Mammoth extracts text and HTML from OOXML; optional parallel OOXML/ZIP extractors run when options.docx.include is set (structure, headings, tables, blocks, pagesApprox, embeddedImages).";
|
|
646
|
-
|
|
696
|
+
var NODE_IMAGE_OCR_PIPELINE = "Node raster OCR: detect format (magic/MIME) \u2192 `normalizeImageForOcr` \u2192 optional `preprocessImageForOcr` when `options.ocr.preprocess` is set \u2192 Tesseract. TIFF is handled with `ocrTiff` (multipage; per-page `textByPage` and joined `text`). HEIC/HEIF is decoded via `sharp` on Node (not in the browser package). BMP and static images share the normalization path; GIF uses the first frame only.";
|
|
697
|
+
function buildNodeExplainReport(kind, intent, pdfOcr, plan, docxInclude, ocrSlice) {
|
|
647
698
|
const runtime = { id: "node" };
|
|
648
699
|
const primaryAnalyzer = kind === "pdf" ? "pdf" : kind === "docx" ? "docx" : kind === "image" ? "image" : kind === "text" ? "text" : "none";
|
|
649
700
|
let nativeExtraction;
|
|
650
|
-
let
|
|
701
|
+
let ocr;
|
|
651
702
|
let limitations = [];
|
|
652
703
|
if (kind === "unknown") {
|
|
653
704
|
limitations = lim(
|
|
@@ -672,7 +723,7 @@ function buildNodeExplainReport(kind, intent, pdfOcr, plan, docxInclude) {
|
|
|
672
723
|
willAttempt: true,
|
|
673
724
|
description: "pdf-parse supplies embedded text, metadata, and page count; PDF.js drives raster OCR when enabled."
|
|
674
725
|
};
|
|
675
|
-
|
|
726
|
+
ocr = {
|
|
676
727
|
mayUse: pdfOcr !== "off",
|
|
677
728
|
description: pdfOcr === "off" ? "Raster OCR pipeline is off (pdf.ocr: off)." : pdfOcr === "force" ? "Raster OCR may run on all pages when pdf.ocr is force." : "Raster OCR may run when native text looks insufficient (pdf.ocr: auto + heuristics)."
|
|
678
729
|
};
|
|
@@ -681,19 +732,22 @@ function buildNodeExplainReport(kind, intent, pdfOcr, plan, docxInclude) {
|
|
|
681
732
|
willAttempt: true,
|
|
682
733
|
description: docxIncludeRequested(docxInclude) ? "Mammoth plus parallel OOXML extractors (per options.docx.include)." : DOCX_MAMMOTH_PLUS_OPTIONAL
|
|
683
734
|
};
|
|
684
|
-
|
|
735
|
+
ocr = { mayUse: false, description: "DOCX does not use OCR in DocMind." };
|
|
685
736
|
} else if (kind === "image") {
|
|
686
737
|
nativeExtraction = {
|
|
687
738
|
willAttempt: false,
|
|
688
|
-
description:
|
|
739
|
+
description: NODE_IMAGE_OCR_PIPELINE
|
|
740
|
+
};
|
|
741
|
+
ocr = {
|
|
742
|
+
mayUse: true,
|
|
743
|
+
description: "Same stack as `runOcr` / `analyzeFile` for images: universal normalization, optional preprocess, then `ocrImageDetailed` or multipage `ocrTiff` for TIFF."
|
|
689
744
|
};
|
|
690
|
-
ocr3 = { mayUse: true, description: "Tesseract runs on supported raster formats." };
|
|
691
745
|
} else {
|
|
692
746
|
nativeExtraction = {
|
|
693
747
|
willAttempt: true,
|
|
694
748
|
description: "UTF-8 decode with BOM handling for plain text."
|
|
695
749
|
};
|
|
696
|
-
|
|
750
|
+
ocr = { mayUse: false, description: "OCR does not apply to text files." };
|
|
697
751
|
}
|
|
698
752
|
break;
|
|
699
753
|
case "extractText":
|
|
@@ -702,7 +756,7 @@ function buildNodeExplainReport(kind, intent, pdfOcr, plan, docxInclude) {
|
|
|
702
756
|
willAttempt: true,
|
|
703
757
|
description: "Node: pdf-parse for metadata/page baseline, then PDF.js per-page text merged into `text` (pdfNativeTextSource pdfjs-per-page default)."
|
|
704
758
|
};
|
|
705
|
-
|
|
759
|
+
ocr = {
|
|
706
760
|
mayUse: false,
|
|
707
761
|
description: "extractText defaults pdf.ocr off; set pdf.ocr explicitly to allow auto/force raster OCR."
|
|
708
762
|
};
|
|
@@ -711,16 +765,22 @@ function buildNodeExplainReport(kind, intent, pdfOcr, plan, docxInclude) {
|
|
|
711
765
|
willAttempt: true,
|
|
712
766
|
description: docxIncludeRequested(docxInclude) ? "Same DOCX router as analyzeFile: Mammoth text + optional OOXML fields; HTML cleared in extractText." : "Mammoth plain text; HTML cleared. Optional OOXML v2 fields when options.docx.include is set."
|
|
713
767
|
};
|
|
714
|
-
|
|
768
|
+
ocr = { mayUse: false, description: "DOCX does not use OCR." };
|
|
715
769
|
} else if (kind === "image") {
|
|
716
|
-
nativeExtraction = {
|
|
717
|
-
|
|
770
|
+
nativeExtraction = {
|
|
771
|
+
willAttempt: false,
|
|
772
|
+
description: NODE_IMAGE_OCR_PIPELINE
|
|
773
|
+
};
|
|
774
|
+
ocr = {
|
|
775
|
+
mayUse: true,
|
|
776
|
+
description: "Same Node image pipeline as analyzeFile (normalize \u2192 optional preprocess \u2192 `ocrImageDetailed` or `ocrTiff`)."
|
|
777
|
+
};
|
|
718
778
|
} else {
|
|
719
779
|
nativeExtraction = {
|
|
720
780
|
willAttempt: true,
|
|
721
781
|
description: "UTF-8 decode only."
|
|
722
782
|
};
|
|
723
|
-
|
|
783
|
+
ocr = { mayUse: false, description: "OCR does not apply." };
|
|
724
784
|
}
|
|
725
785
|
break;
|
|
726
786
|
case "extractMetadata":
|
|
@@ -729,13 +789,13 @@ function buildNodeExplainReport(kind, intent, pdfOcr, plan, docxInclude) {
|
|
|
729
789
|
willAttempt: true,
|
|
730
790
|
description: "Lightweight PDF info/XMP normalization without full OCR."
|
|
731
791
|
};
|
|
732
|
-
|
|
792
|
+
ocr = { mayUse: false, description: "extractMetadata does not run the OCR pipeline." };
|
|
733
793
|
} else if (kind === "docx" || kind === "image") {
|
|
734
794
|
nativeExtraction = {
|
|
735
795
|
willAttempt: false,
|
|
736
796
|
description: "Stub response; no heavy extractor."
|
|
737
797
|
};
|
|
738
|
-
|
|
798
|
+
ocr = { mayUse: false, description: "OCR not used for this metadata path." };
|
|
739
799
|
limitations = lim(
|
|
740
800
|
kind === "docx" ? "Structured DOCX metadata is not exposed separately; use analyzeFile / extractText / convertToHtml with options.docx.include for OOXML structure." : "Raster images have no document metadata bundle."
|
|
741
801
|
);
|
|
@@ -744,7 +804,7 @@ function buildNodeExplainReport(kind, intent, pdfOcr, plan, docxInclude) {
|
|
|
744
804
|
willAttempt: true,
|
|
745
805
|
description: "Decoded text only; no structured document metadata."
|
|
746
806
|
};
|
|
747
|
-
|
|
807
|
+
ocr = { mayUse: false, description: "OCR does not apply." };
|
|
748
808
|
limitations = lim("Plain text has no structured document metadata.");
|
|
749
809
|
}
|
|
750
810
|
break;
|
|
@@ -754,26 +814,26 @@ function buildNodeExplainReport(kind, intent, pdfOcr, plan, docxInclude) {
|
|
|
754
814
|
willAttempt: true,
|
|
755
815
|
description: "Text layer extracted then wrapped in <pre> (not visual layout)."
|
|
756
816
|
};
|
|
757
|
-
|
|
817
|
+
ocr = { mayUse: false, description: "convertToHtml does not run PDF OCR." };
|
|
758
818
|
limitations = lim("PDF HTML is a plain-text preview, not page layout.");
|
|
759
819
|
} else if (kind === "docx") {
|
|
760
820
|
nativeExtraction = {
|
|
761
821
|
willAttempt: true,
|
|
762
822
|
description: docxIncludeRequested(docxInclude) ? "Mammoth HTML plus optional OOXML extractors (same router as analyzeFile)." : "Mammoth HTML via analyzeFile routing; optional OOXML v2 when options.docx.include is set."
|
|
763
823
|
};
|
|
764
|
-
|
|
824
|
+
ocr = { mayUse: false, description: "DOCX path does not use OCR." };
|
|
765
825
|
} else if (kind === "text") {
|
|
766
826
|
nativeExtraction = {
|
|
767
827
|
willAttempt: true,
|
|
768
828
|
description: "UTF-8 decode then <pre> wrapper."
|
|
769
829
|
};
|
|
770
|
-
|
|
830
|
+
ocr = { mayUse: false, description: "OCR does not apply." };
|
|
771
831
|
} else {
|
|
772
832
|
nativeExtraction = {
|
|
773
833
|
willAttempt: false,
|
|
774
834
|
description: "No HTML path for raster images."
|
|
775
835
|
};
|
|
776
|
-
|
|
836
|
+
ocr = { mayUse: false, description: "OCR does not emit layout HTML here." };
|
|
777
837
|
limitations = lim("Use extractText or runOcr for image text.");
|
|
778
838
|
}
|
|
779
839
|
break;
|
|
@@ -783,32 +843,44 @@ function buildNodeExplainReport(kind, intent, pdfOcr, plan, docxInclude) {
|
|
|
783
843
|
willAttempt: true,
|
|
784
844
|
description: "pdf-parse runs first; text may be replaced by raster OCR output."
|
|
785
845
|
};
|
|
786
|
-
|
|
846
|
+
ocr = {
|
|
787
847
|
mayUse: true,
|
|
788
848
|
description: 'runOcr always sets pdf.ocr to "force" for PDFs.'
|
|
789
849
|
};
|
|
790
850
|
limitations = lim("Forced OCR may run even when a text layer exists.");
|
|
791
851
|
} else if (kind === "image") {
|
|
792
|
-
nativeExtraction = {
|
|
793
|
-
|
|
852
|
+
nativeExtraction = {
|
|
853
|
+
willAttempt: false,
|
|
854
|
+
description: NODE_IMAGE_OCR_PIPELINE
|
|
855
|
+
};
|
|
856
|
+
ocr = {
|
|
857
|
+
mayUse: true,
|
|
858
|
+
description: "Forced OCR path for rasters: TIFF \u2192 `ocrTiff` with `textByPage` when multipage; other formats \u2192 `ocrImageDetailed` after normalization (HEIC converted with `sharp` on Node)."
|
|
859
|
+
};
|
|
794
860
|
} else if (kind === "docx") {
|
|
795
861
|
nativeExtraction = {
|
|
796
862
|
willAttempt: true,
|
|
797
863
|
description: docxIncludeRequested(docxInclude) ? "Mammoth text/HTML plus optional OOXML extractors; still not OCR." : "Full Mammoth extract (text + HTML); optional OOXML v2 via options.docx.include; not OCR."
|
|
798
864
|
};
|
|
799
|
-
|
|
865
|
+
ocr = { mayUse: false, description: "DOCX is not OCR'd." };
|
|
800
866
|
limitations = lim("Result is structured extract, not OCR output.");
|
|
801
867
|
} else {
|
|
802
868
|
nativeExtraction = {
|
|
803
869
|
willAttempt: true,
|
|
804
870
|
description: "UTF-8 decode only."
|
|
805
871
|
};
|
|
806
|
-
|
|
872
|
+
ocr = { mayUse: false, description: "OCR does not apply to text files." };
|
|
807
873
|
}
|
|
808
874
|
break;
|
|
809
875
|
default:
|
|
810
876
|
nativeExtraction = { willAttempt: false, description: "Generic intent; see plan." };
|
|
811
|
-
|
|
877
|
+
ocr = { mayUse: false, description: "See plan steps." };
|
|
878
|
+
}
|
|
879
|
+
if (kind === "image" && preprocessHasEffect(ocrSlice?.preprocess)) {
|
|
880
|
+
limitations = [
|
|
881
|
+
...limitations,
|
|
882
|
+
"options.ocr.preprocess applies to the single-frame `ocrImageDetailed` path; multipage TIFF (`ocrTiff`) does not run this preprocess on each frame."
|
|
883
|
+
];
|
|
812
884
|
}
|
|
813
885
|
return finalizeDocxExplainReport({
|
|
814
886
|
kind,
|
|
@@ -817,17 +889,27 @@ function buildNodeExplainReport(kind, intent, pdfOcr, plan, docxInclude) {
|
|
|
817
889
|
intent,
|
|
818
890
|
primaryAnalyzer,
|
|
819
891
|
nativeExtraction,
|
|
820
|
-
ocr
|
|
892
|
+
ocr,
|
|
821
893
|
limitations,
|
|
822
894
|
plan
|
|
823
895
|
});
|
|
824
896
|
}
|
|
825
|
-
|
|
826
|
-
// src/introspection.ts
|
|
827
897
|
function resolvePdfOcrMode(pdf) {
|
|
828
898
|
return pdf?.ocr ?? "auto";
|
|
829
899
|
}
|
|
830
|
-
function
|
|
900
|
+
function imageOcrPlanSteps(ocr) {
|
|
901
|
+
return [
|
|
902
|
+
{ id: "detect_kind", status: "done" },
|
|
903
|
+
{ id: "image_format_detect", status: "planned" },
|
|
904
|
+
{ id: "normalize_image_for_ocr", status: "planned" },
|
|
905
|
+
{
|
|
906
|
+
id: "preprocess_image_for_ocr",
|
|
907
|
+
status: preprocessHasEffect(ocr?.preprocess) ? "planned" : "skipped"
|
|
908
|
+
},
|
|
909
|
+
{ id: "ocr_tesseract", status: "planned" }
|
|
910
|
+
];
|
|
911
|
+
}
|
|
912
|
+
function planAnalyzeFile(kind, pdfOcr, docxInclude, ocr) {
|
|
831
913
|
switch (kind) {
|
|
832
914
|
case "pdf":
|
|
833
915
|
return {
|
|
@@ -855,10 +937,7 @@ function planAnalyzeFile(kind, pdfOcr, docxInclude) {
|
|
|
855
937
|
case "image":
|
|
856
938
|
return {
|
|
857
939
|
intent: "analyzeFile",
|
|
858
|
-
steps:
|
|
859
|
-
{ id: "detect_kind", status: "done" },
|
|
860
|
-
{ id: "image_ocr", status: "planned" }
|
|
861
|
-
]
|
|
940
|
+
steps: imageOcrPlanSteps(ocr)
|
|
862
941
|
};
|
|
863
942
|
case "text":
|
|
864
943
|
return {
|
|
@@ -878,9 +957,9 @@ function planAnalyzeFile(kind, pdfOcr, docxInclude) {
|
|
|
878
957
|
};
|
|
879
958
|
}
|
|
880
959
|
}
|
|
881
|
-
function planForIntent(intentOpt, kind, pdfOcrForAnalyze, docxInclude) {
|
|
960
|
+
function planForIntent(intentOpt, kind, pdfOcrForAnalyze, docxInclude, ocr) {
|
|
882
961
|
const intent = intentOpt ?? "analyzeFile";
|
|
883
|
-
if (intent === "analyzeFile") return planAnalyzeFile(kind, pdfOcrForAnalyze, docxInclude);
|
|
962
|
+
if (intent === "analyzeFile") return planAnalyzeFile(kind, pdfOcrForAnalyze, docxInclude, ocr);
|
|
884
963
|
if (intent === "extractText") {
|
|
885
964
|
if (kind === "pdf") {
|
|
886
965
|
return {
|
|
@@ -893,7 +972,7 @@ function planForIntent(intentOpt, kind, pdfOcrForAnalyze, docxInclude) {
|
|
|
893
972
|
]
|
|
894
973
|
};
|
|
895
974
|
}
|
|
896
|
-
const p = planAnalyzeFile(kind, "off", docxInclude);
|
|
975
|
+
const p = planAnalyzeFile(kind, "off", docxInclude, ocr);
|
|
897
976
|
return { ...p, intent: "extractText" };
|
|
898
977
|
}
|
|
899
978
|
if (intent === "extractMetadata") {
|
|
@@ -977,10 +1056,7 @@ function planForIntent(intentOpt, kind, pdfOcrForAnalyze, docxInclude) {
|
|
|
977
1056
|
if (kind === "image") {
|
|
978
1057
|
return {
|
|
979
1058
|
intent: "runOcr",
|
|
980
|
-
steps:
|
|
981
|
-
{ id: "detect_kind", status: "done" },
|
|
982
|
-
{ id: "tesseract_ocr", status: "planned" }
|
|
983
|
-
]
|
|
1059
|
+
steps: imageOcrPlanSteps(ocr)
|
|
984
1060
|
};
|
|
985
1061
|
}
|
|
986
1062
|
if (kind === "docx") {
|
|
@@ -1002,7 +1078,7 @@ function planForIntent(intentOpt, kind, pdfOcrForAnalyze, docxInclude) {
|
|
|
1002
1078
|
]
|
|
1003
1079
|
};
|
|
1004
1080
|
}
|
|
1005
|
-
return planAnalyzeFile(kind, pdfOcrForAnalyze, docxInclude);
|
|
1081
|
+
return planAnalyzeFile(kind, pdfOcrForAnalyze, docxInclude, ocr);
|
|
1006
1082
|
}
|
|
1007
1083
|
async function getCapabilities(input, options) {
|
|
1008
1084
|
throwIfAborted(options?.signal);
|
|
@@ -1019,8 +1095,9 @@ async function explainAnalysisPlan(input, options) {
|
|
|
1019
1095
|
const intent = options?.intent ?? "analyzeFile";
|
|
1020
1096
|
const pdfOcrAnalyze = resolvePdfOcrMode(options?.pdf);
|
|
1021
1097
|
const docxInc = options?.docx?.include;
|
|
1022
|
-
const
|
|
1023
|
-
|
|
1098
|
+
const ocrSlice = options?.ocr;
|
|
1099
|
+
const plan = planForIntent(intent, kind, pdfOcrAnalyze, docxInc, ocrSlice);
|
|
1100
|
+
return buildNodeExplainReport(kind, intent, pdfOcrAnalyze, plan, docxInc, ocrSlice);
|
|
1024
1101
|
}
|
|
1025
1102
|
|
|
1026
1103
|
export { DOCX_EMBEDDED_IMAGE_CAPABILITIES, DOCX_STRUCTURE_CAPABILITIES, analyzeFile, bufferToInput, convertToHtml, docxIncludeRequested, explainAnalysisPlan, extractMetadata, extractText, getCapabilities, readFileToInput, resolveNodeAnalyzeInput, runOcr };
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@dragon708/docmind-node",
|
|
3
|
-
"version": "1.8.0",
|
|
3
|
+
"version": "1.9.1",
|
|
4
4
|
"description": "Official DocMind Node facade: analyzeFile, intent APIs, PDF/DOCX/OCR, and fs helpers.",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"main": "./dist/index.js",
|
|
@@ -32,10 +32,10 @@
|
|
|
32
32
|
],
|
|
33
33
|
"license": "MIT",
|
|
34
34
|
"dependencies": {
|
|
35
|
-
"@dragon708/docmind-docx": "^1.7.
|
|
36
|
-
"@dragon708/docmind-ocr": "^1.
|
|
37
|
-
"@dragon708/docmind-pdf": "^2.
|
|
38
|
-
"@dragon708/docmind-shared": "^1.1.
|
|
35
|
+
"@dragon708/docmind-docx": "^1.7.1",
|
|
36
|
+
"@dragon708/docmind-ocr": "^1.1.3",
|
|
37
|
+
"@dragon708/docmind-pdf": "^2.1.1",
|
|
38
|
+
"@dragon708/docmind-shared": "^1.1.1"
|
|
39
39
|
},
|
|
40
40
|
"devDependencies": {
|
|
41
41
|
"@types/node": "^20.19.37",
|