@dragon708/docmind-node 1.2.0 → 1.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.d.ts +12 -3
- package/dist/index.js +66 -15
- package/package.json +2 -2
package/dist/index.d.ts
CHANGED
|
@@ -9,10 +9,17 @@ import { PdfAnalyzeOptions } from '@dragon708/docmind-pdf';
|
|
|
9
9
|
* - **`pdf`**: forwarded to `@dragon708/docmind-pdf`. `analyzeFile` defaults `pdf.ocr` to `"auto"` when omitted.
|
|
10
10
|
* {@link extractText} / {@link convertToHtml} merge a default of `ocr: "off"` unless you set `pdf.ocr` explicitly.
|
|
11
11
|
* - **`ocr`**: forwarded to `@dragon708/docmind-ocr` for raster images; language string also feeds PDF OCR when `pdf.ocrLangs` is unset.
|
|
12
|
+
* - **`pdfNativeTextSource`**: when `pdf.ocr` resolves to `"off"`, chooses how native text is obtained (see {@link extractText} default).
|
|
12
13
|
*/
|
|
13
14
|
interface NodeAnalyzeOptions extends DocMindAnalyzeOptions {
|
|
14
15
|
readonly pdf?: PdfAnalyzeOptions;
|
|
15
16
|
readonly ocr?: OcrOptions;
|
|
17
|
+
/**
|
|
18
|
+
* Native PDF text when `pdf.ocr` is `"off"`:
|
|
19
|
+
* - **`pdfjs-per-page`** (default for {@link extractText}): PDF.js text per page, merged for `text` (aligns with OCR raster engine).
|
|
20
|
+
* - **`pdf-parse`**: single pdf-parse pass (default for {@link analyzeFile} when you set `pdf.ocr: "off"` without this flag).
|
|
21
|
+
*/
|
|
22
|
+
readonly pdfNativeTextSource?: "pdf-parse" | "pdfjs-per-page";
|
|
16
23
|
}
|
|
17
24
|
|
|
18
25
|
/**
|
|
@@ -38,8 +45,9 @@ declare function resolveNodeAnalyzeInput(input: NodeAnalyzeInput): Promise<Detec
|
|
|
38
45
|
declare function analyzeFile(input: NodeAnalyzeInput, options?: NodeAnalyzeOptions): Promise<AnalysisResult>;
|
|
39
46
|
|
|
40
47
|
/**
|
|
41
|
-
* Plain-text extraction using {@link analyzeFile} routing. PDFs default to **text
|
|
42
|
-
* (`pdf.ocr: "off"`) unless you set `options.pdf.ocr` explicitly.
|
|
48
|
+
* Plain-text extraction using {@link analyzeFile} routing. PDFs default to **native text only**
|
|
49
|
+
* (`pdf.ocr: "off"`) unless you set `options.pdf.ocr` explicitly, and to **PDF.js per-page** assembly
|
|
50
|
+
* (`pdfNativeTextSource: "pdfjs-per-page"`) unless you set `options.pdfNativeTextSource` or `pdf.ocr` enables OCR.
|
|
43
51
|
*/
|
|
44
52
|
declare function extractText(input: NodeAnalyzeInput, options?: NodeAnalyzeOptions): Promise<AnalysisResult>;
|
|
45
53
|
/**
|
|
@@ -109,7 +117,8 @@ type NodeExplainAnalysisPlanOptions = ExplainAnalysisPlanOptions & Pick<NodeAnal
|
|
|
109
117
|
|
|
110
118
|
/**
|
|
111
119
|
* Epic 1 — **Capabilities:** after resolving {@link NodeAnalyzeInput}, lists which of
|
|
112
|
-
* `text` | `metadata` | `html` | `ocr` | `pages` apply for that kind in Node (PDF
|
|
120
|
+
* `text` | `metadata` | `html` | `ocr` | `pages` apply for that kind in Node (for PDF, `text` / `metadata` /
|
|
121
|
+
* `pages` / `ocr` describe the v2 pdf-parse + PDF.js + OCR stack; see {@link buildNodeCapabilityReport}).
|
|
113
122
|
* Does not run Mammoth/Tesseract/PDF bodies beyond path resolution.
|
|
114
123
|
*/
|
|
115
124
|
declare function getCapabilities(input: NodeAnalyzeInput, options?: GetCapabilitiesOptions): Promise<GetCapabilitiesReport>;
|
package/dist/index.js
CHANGED
|
@@ -2,7 +2,7 @@ import { assertValidAnalyzeFileInput, detectFileKind, notImplementedResult, UNKN
|
|
|
2
2
|
export { detectFileKind } from '@dragon708/docmind-shared';
|
|
3
3
|
import { analyzeDocx } from '@dragon708/docmind-docx';
|
|
4
4
|
import { ocr } from '@dragon708/docmind-ocr';
|
|
5
|
-
import { extractPdfMetadata, extractTextFromPdf, analyzePdf } from '@dragon708/docmind-pdf';
|
|
5
|
+
import { extractPdfMetadata, extractTextFromPdf, analyzePdf, extractPdfTextByPage } from '@dragon708/docmind-pdf';
|
|
6
6
|
import { readFile } from 'fs/promises';
|
|
7
7
|
import { basename } from 'path';
|
|
8
8
|
import { fileURLToPath } from 'url';
|
|
@@ -112,17 +112,45 @@ async function analyzePdfForNode(input, options) {
|
|
|
112
112
|
signal: userPdf?.signal ?? options?.signal
|
|
113
113
|
};
|
|
114
114
|
const r = await analyzePdf(data, pdfOpts);
|
|
115
|
+
const usePdfJsPerPage = pdfOpts.ocr === "off" && (options?.pdfNativeTextSource ?? "pdf-parse") === "pdfjs-per-page";
|
|
116
|
+
if (!usePdfJsPerPage) {
|
|
117
|
+
return {
|
|
118
|
+
fileKind: "pdf",
|
|
119
|
+
analyzer: "pdf",
|
|
120
|
+
status: "ok",
|
|
121
|
+
kind: "pdf",
|
|
122
|
+
text: r.text,
|
|
123
|
+
pages: r.pages,
|
|
124
|
+
metadata: r.metadata,
|
|
125
|
+
warnings: [...r.warnings],
|
|
126
|
+
needsOCR: r.needsOCR,
|
|
127
|
+
ocrUsed: r.ocrUsed
|
|
128
|
+
};
|
|
129
|
+
}
|
|
130
|
+
let text = r.text;
|
|
131
|
+
const extra = [];
|
|
132
|
+
try {
|
|
133
|
+
const rows = await extractPdfTextByPage(data, {
|
|
134
|
+
maxPages: pdfOpts.maxPages,
|
|
135
|
+
signal: pdfOpts.signal
|
|
136
|
+
});
|
|
137
|
+
text = rows.map((row) => row.text).join("\n\n");
|
|
138
|
+
} catch (e) {
|
|
139
|
+
const msg = e instanceof Error ? e.message : String(e);
|
|
140
|
+
extra.push(`warning: PDF.js per-page text failed; using pdf-parse text: ${msg}`);
|
|
141
|
+
}
|
|
142
|
+
const needsOCR = r.pages > 0 && text.trim().length === 0;
|
|
115
143
|
return {
|
|
116
144
|
fileKind: "pdf",
|
|
117
145
|
analyzer: "pdf",
|
|
118
146
|
status: "ok",
|
|
119
147
|
kind: "pdf",
|
|
120
|
-
text: r.text,
|
|
148
|
+
text,
|
|
121
149
|
pages: r.pages,
|
|
122
150
|
metadata: r.metadata,
|
|
123
|
-
warnings: [...r.warnings],
|
|
124
|
-
needsOCR: r.needsOCR,
|
|
125
|
-
ocrUsed: r.ocrUsed
|
|
151
|
+
warnings: [...r.warnings, ...extra],
|
|
152
|
+
needsOCR,
|
|
153
|
+
ocrUsed: false
|
|
126
154
|
};
|
|
127
155
|
}
|
|
128
156
|
function toPathString(pathOrUrl) {
|
|
@@ -211,7 +239,11 @@ function toExtractTextResult(full) {
|
|
|
211
239
|
}
|
|
212
240
|
async function extractText(input, options) {
|
|
213
241
|
throwIfAborted(options?.signal);
|
|
214
|
-
const
|
|
242
|
+
const merged = {
|
|
243
|
+
...withPdfOcrDefaultOff(options),
|
|
244
|
+
pdfNativeTextSource: options?.pdfNativeTextSource ?? "pdfjs-per-page"
|
|
245
|
+
};
|
|
246
|
+
const full = await analyzeFile(input, merged);
|
|
215
247
|
return toExtractTextResult(full);
|
|
216
248
|
}
|
|
217
249
|
async function extractMetadata(input, options) {
|
|
@@ -485,11 +517,11 @@ function buildNodeExplainReport(kind, intent, pdfOcr, plan) {
|
|
|
485
517
|
if (kind === "pdf") {
|
|
486
518
|
nativeExtraction = {
|
|
487
519
|
willAttempt: true,
|
|
488
|
-
description: "pdf-parse
|
|
520
|
+
description: "pdf-parse supplies embedded text, metadata, and page count; PDF.js drives raster OCR when enabled."
|
|
489
521
|
};
|
|
490
522
|
ocr3 = {
|
|
491
523
|
mayUse: pdfOcr !== "off",
|
|
492
|
-
description: pdfOcr === "off" ? "Raster OCR pipeline is off (pdf.ocr: off)." : pdfOcr === "force" ? "Raster OCR may run on all pages when pdf.ocr is force." : "Raster OCR may run when
|
|
524
|
+
description: pdfOcr === "off" ? "Raster OCR pipeline is off (pdf.ocr: off)." : pdfOcr === "force" ? "Raster OCR may run on all pages when pdf.ocr is force." : "Raster OCR may run when native text looks insufficient (pdf.ocr: auto + heuristics)."
|
|
493
525
|
};
|
|
494
526
|
} else if (kind === "docx") {
|
|
495
527
|
nativeExtraction = {
|
|
@@ -515,11 +547,11 @@ function buildNodeExplainReport(kind, intent, pdfOcr, plan) {
|
|
|
515
547
|
if (kind === "pdf") {
|
|
516
548
|
nativeExtraction = {
|
|
517
549
|
willAttempt: true,
|
|
518
|
-
description: "
|
|
550
|
+
description: "Node: pdf-parse for metadata/page baseline, then PDF.js per-page text merged into `text` (pdfNativeTextSource pdfjs-per-page default)."
|
|
519
551
|
};
|
|
520
552
|
ocr3 = {
|
|
521
553
|
mayUse: false,
|
|
522
|
-
description: "extractText
|
|
554
|
+
description: "extractText defaults pdf.ocr off; set pdf.ocr explicitly to allow auto/force raster OCR."
|
|
523
555
|
};
|
|
524
556
|
} else if (kind === "docx") {
|
|
525
557
|
nativeExtraction = {
|
|
@@ -654,11 +686,19 @@ function buildNodeCapabilityReport(kind) {
|
|
|
654
686
|
switch (kind) {
|
|
655
687
|
case "pdf":
|
|
656
688
|
capabilities = [
|
|
657
|
-
slot("text", true, [
|
|
658
|
-
|
|
659
|
-
|
|
660
|
-
slot("
|
|
661
|
-
|
|
689
|
+
slot("text", true, [
|
|
690
|
+
"Native text via pdf-parse and (in Node extractText) PDF.js per-page text; set pdf.ocr for raster OCR."
|
|
691
|
+
]),
|
|
692
|
+
slot("metadata", true, [
|
|
693
|
+
"Document info / XMP-style metadata via pdf-parse without running the OCR pipeline."
|
|
694
|
+
]),
|
|
695
|
+
slot("pages", true, [
|
|
696
|
+
"Page count and per-page native extraction (PDF.js) where used; OCR respects pdf.maxPages."
|
|
697
|
+
]),
|
|
698
|
+
slot("ocr", true, [
|
|
699
|
+
"Raster OCR pipeline (pdf.ocr auto with quality heuristics, force, or runOcr)."
|
|
700
|
+
]),
|
|
701
|
+
slot("html", true, ["HTML is a <pre> preview of extracted text, not visual layout."])
|
|
662
702
|
];
|
|
663
703
|
break;
|
|
664
704
|
case "docx":
|
|
@@ -762,6 +802,17 @@ function planForIntent(intentOpt, kind, pdfOcrForAnalyze) {
|
|
|
762
802
|
const intent = intentOpt ?? "analyzeFile";
|
|
763
803
|
if (intent === "analyzeFile") return planAnalyzeFile(kind, pdfOcrForAnalyze);
|
|
764
804
|
if (intent === "extractText") {
|
|
805
|
+
if (kind === "pdf") {
|
|
806
|
+
return {
|
|
807
|
+
intent: "extractText",
|
|
808
|
+
steps: [
|
|
809
|
+
{ id: "detect_kind", status: "done" },
|
|
810
|
+
{ id: "pdf_parse", status: "planned" },
|
|
811
|
+
{ id: "pdfjs_per_page", status: "planned" },
|
|
812
|
+
{ id: "pdf_ocr", status: "skipped" }
|
|
813
|
+
]
|
|
814
|
+
};
|
|
815
|
+
}
|
|
765
816
|
const p = planAnalyzeFile(kind, "off");
|
|
766
817
|
return { ...p, intent: "extractText" };
|
|
767
818
|
}
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@dragon708/docmind-node",
|
|
3
|
-
"version": "1.2.0",
|
|
3
|
+
"version": "1.4.0",
|
|
4
4
|
"description": "Official DocMind Node facade: analyzeFile, intent APIs, PDF/DOCX/OCR, and fs helpers.",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"main": "./dist/index.js",
|
|
@@ -34,7 +34,7 @@
|
|
|
34
34
|
"dependencies": {
|
|
35
35
|
"@dragon708/docmind-docx": "^1.0.0",
|
|
36
36
|
"@dragon708/docmind-ocr": "^1.0.0",
|
|
37
|
-
"@dragon708/docmind-pdf": "^
|
|
37
|
+
"@dragon708/docmind-pdf": "^2.0.0",
|
|
38
38
|
"@dragon708/docmind-shared": "^1.1.0"
|
|
39
39
|
},
|
|
40
40
|
"devDependencies": {
|