@dragon708/docmind-node 1.2.0 → 1.4.0

This diff shows the changes between two publicly released versions of this package, as published to one of the supported registries. It is provided for informational purposes only and reflects the package contents exactly as they appear in their respective public registries.
package/dist/index.d.ts CHANGED
@@ -9,10 +9,17 @@ import { PdfAnalyzeOptions } from '@dragon708/docmind-pdf';
9
9
  * - **`pdf`**: forwarded to `@dragon708/docmind-pdf`. `analyzeFile` defaults `pdf.ocr` to `"auto"` when omitted.
10
10
  * {@link extractText} / {@link convertToHtml} merge a default of `ocr: "off"` unless you set `pdf.ocr` explicitly.
11
11
  * - **`ocr`**: forwarded to `@dragon708/docmind-ocr` for raster images; language string also feeds PDF OCR when `pdf.ocrLangs` is unset.
12
+ * - **`pdfNativeTextSource`**: when `pdf.ocr` resolves to `"off"`, chooses how native text is obtained (see {@link extractText} default).
12
13
  */
13
14
  interface NodeAnalyzeOptions extends DocMindAnalyzeOptions {
14
15
  readonly pdf?: PdfAnalyzeOptions;
15
16
  readonly ocr?: OcrOptions;
17
+ /**
18
+ * Native PDF text when `pdf.ocr` is `"off"`:
19
+ * - **`pdfjs-per-page`** (default for {@link extractText}): PDF.js text per page, merged for `text` (aligns with OCR raster engine).
20
+ * - **`pdf-parse`**: single pdf-parse pass (default for {@link analyzeFile} when you set `pdf.ocr: "off"` without this flag).
21
+ */
22
+ readonly pdfNativeTextSource?: "pdf-parse" | "pdfjs-per-page";
16
23
  }
17
24
 
18
25
  /**
@@ -38,8 +45,9 @@ declare function resolveNodeAnalyzeInput(input: NodeAnalyzeInput): Promise<Detec
38
45
  declare function analyzeFile(input: NodeAnalyzeInput, options?: NodeAnalyzeOptions): Promise<AnalysisResult>;
39
46
 
40
47
  /**
41
- * Plain-text extraction using {@link analyzeFile} routing. PDFs default to **text layer only**
42
- * (`pdf.ocr: "off"`) unless you set `options.pdf.ocr` explicitly.
48
+ * Plain-text extraction using {@link analyzeFile} routing. PDFs default to **native text only**
49
+ * (`pdf.ocr: "off"`) unless you set `options.pdf.ocr` explicitly, and to **PDF.js per-page** assembly
50
+ * (`pdfNativeTextSource: "pdfjs-per-page"`) unless you set `options.pdfNativeTextSource` or `pdf.ocr` enables OCR.
43
51
  */
44
52
  declare function extractText(input: NodeAnalyzeInput, options?: NodeAnalyzeOptions): Promise<AnalysisResult>;
45
53
  /**
@@ -109,7 +117,8 @@ type NodeExplainAnalysisPlanOptions = ExplainAnalysisPlanOptions & Pick<NodeAnal
109
117
 
110
118
  /**
111
119
  * Epic 1 — **Capabilities:** after resolving {@link NodeAnalyzeInput}, lists which of
112
- * `text` | `metadata` | `html` | `ocr` | `pages` apply for that kind in Node (PDF fully supported).
120
+ * `text` | `metadata` | `html` | `ocr` | `pages` apply for that kind in Node (for PDF, `text` / `metadata` /
121
+ * `pages` / `ocr` describe the v2 pdf-parse + PDF.js + OCR stack; see {@link buildNodeCapabilityReport}).
113
122
  * Does not run Mammoth/Tesseract/PDF bodies beyond path resolution.
114
123
  */
115
124
  declare function getCapabilities(input: NodeAnalyzeInput, options?: GetCapabilitiesOptions): Promise<GetCapabilitiesReport>;
package/dist/index.js CHANGED
@@ -2,7 +2,7 @@ import { assertValidAnalyzeFileInput, detectFileKind, notImplementedResult, UNKN
2
2
  export { detectFileKind } from '@dragon708/docmind-shared';
3
3
  import { analyzeDocx } from '@dragon708/docmind-docx';
4
4
  import { ocr } from '@dragon708/docmind-ocr';
5
- import { extractPdfMetadata, extractTextFromPdf, analyzePdf } from '@dragon708/docmind-pdf';
5
+ import { extractPdfMetadata, extractTextFromPdf, analyzePdf, extractPdfTextByPage } from '@dragon708/docmind-pdf';
6
6
  import { readFile } from 'fs/promises';
7
7
  import { basename } from 'path';
8
8
  import { fileURLToPath } from 'url';
@@ -112,17 +112,45 @@ async function analyzePdfForNode(input, options) {
112
112
  signal: userPdf?.signal ?? options?.signal
113
113
  };
114
114
  const r = await analyzePdf(data, pdfOpts);
115
+ const usePdfJsPerPage = pdfOpts.ocr === "off" && (options?.pdfNativeTextSource ?? "pdf-parse") === "pdfjs-per-page";
116
+ if (!usePdfJsPerPage) {
117
+ return {
118
+ fileKind: "pdf",
119
+ analyzer: "pdf",
120
+ status: "ok",
121
+ kind: "pdf",
122
+ text: r.text,
123
+ pages: r.pages,
124
+ metadata: r.metadata,
125
+ warnings: [...r.warnings],
126
+ needsOCR: r.needsOCR,
127
+ ocrUsed: r.ocrUsed
128
+ };
129
+ }
130
+ let text = r.text;
131
+ const extra = [];
132
+ try {
133
+ const rows = await extractPdfTextByPage(data, {
134
+ maxPages: pdfOpts.maxPages,
135
+ signal: pdfOpts.signal
136
+ });
137
+ text = rows.map((row) => row.text).join("\n\n");
138
+ } catch (e) {
139
+ const msg = e instanceof Error ? e.message : String(e);
140
+ extra.push(`warning: PDF.js per-page text failed; using pdf-parse text: ${msg}`);
141
+ }
142
+ const needsOCR = r.pages > 0 && text.trim().length === 0;
115
143
  return {
116
144
  fileKind: "pdf",
117
145
  analyzer: "pdf",
118
146
  status: "ok",
119
147
  kind: "pdf",
120
- text: r.text,
148
+ text,
121
149
  pages: r.pages,
122
150
  metadata: r.metadata,
123
- warnings: [...r.warnings],
124
- needsOCR: r.needsOCR,
125
- ocrUsed: r.ocrUsed
151
+ warnings: [...r.warnings, ...extra],
152
+ needsOCR,
153
+ ocrUsed: false
126
154
  };
127
155
  }
128
156
  function toPathString(pathOrUrl) {
@@ -211,7 +239,11 @@ function toExtractTextResult(full) {
211
239
  }
212
240
  async function extractText(input, options) {
213
241
  throwIfAborted(options?.signal);
214
- const full = await analyzeFile(input, withPdfOcrDefaultOff(options));
242
+ const merged = {
243
+ ...withPdfOcrDefaultOff(options),
244
+ pdfNativeTextSource: options?.pdfNativeTextSource ?? "pdfjs-per-page"
245
+ };
246
+ const full = await analyzeFile(input, merged);
215
247
  return toExtractTextResult(full);
216
248
  }
217
249
  async function extractMetadata(input, options) {
@@ -485,11 +517,11 @@ function buildNodeExplainReport(kind, intent, pdfOcr, plan) {
485
517
  if (kind === "pdf") {
486
518
  nativeExtraction = {
487
519
  willAttempt: true,
488
- description: "pdf-parse extracts embedded text and page count first."
520
+ description: "pdf-parse supplies embedded text, metadata, and page count; PDF.js drives raster OCR when enabled."
489
521
  };
490
522
  ocr3 = {
491
523
  mayUse: pdfOcr !== "off",
492
- description: pdfOcr === "off" ? "Raster OCR pipeline is off (pdf.ocr: off)." : pdfOcr === "force" ? "Raster OCR may run on all pages when pdf.ocr is force." : "Raster OCR may run when the text layer is empty (pdf.ocr: auto)."
524
+ description: pdfOcr === "off" ? "Raster OCR pipeline is off (pdf.ocr: off)." : pdfOcr === "force" ? "Raster OCR may run on all pages when pdf.ocr is force." : "Raster OCR may run when native text looks insufficient (pdf.ocr: auto + heuristics)."
493
525
  };
494
526
  } else if (kind === "docx") {
495
527
  nativeExtraction = {
@@ -515,11 +547,11 @@ function buildNodeExplainReport(kind, intent, pdfOcr, plan) {
515
547
  if (kind === "pdf") {
516
548
  nativeExtraction = {
517
549
  willAttempt: true,
518
- description: "Text layer via pdf-parse; defaults to pdf.ocr off unless you override."
550
+ description: "Node: pdf-parse for metadata/page baseline, then PDF.js per-page text merged into `text` (pdfNativeTextSource pdfjs-per-page default)."
519
551
  };
520
552
  ocr3 = {
521
553
  mayUse: false,
522
- description: "extractText merges pdf.ocr default off \u2014 no raster OCR unless you set pdf.ocr explicitly."
554
+ description: "extractText defaults pdf.ocr off; set pdf.ocr explicitly to allow auto/force raster OCR."
523
555
  };
524
556
  } else if (kind === "docx") {
525
557
  nativeExtraction = {
@@ -654,11 +686,19 @@ function buildNodeCapabilityReport(kind) {
654
686
  switch (kind) {
655
687
  case "pdf":
656
688
  capabilities = [
657
- slot("text", true, ["Includes text layer extraction; use extractText options to avoid PDF OCR."]),
658
- slot("metadata", true),
659
- slot("html", true, ["HTML is a <pre> preview of extracted text, not visual layout."]),
660
- slot("ocr", true, ["Raster OCR is available (e.g. analyzeFile with pdf.ocr auto/force, or runOcr)."]),
661
- slot("pages", true)
689
+ slot("text", true, [
690
+ "Native text via pdf-parse and (in Node extractText) PDF.js per-page text; set pdf.ocr for raster OCR."
691
+ ]),
692
+ slot("metadata", true, [
693
+ "Document info / XMP-style metadata via pdf-parse without running the OCR pipeline."
694
+ ]),
695
+ slot("pages", true, [
696
+ "Page count and per-page native extraction (PDF.js) where used; OCR respects pdf.maxPages."
697
+ ]),
698
+ slot("ocr", true, [
699
+ "Raster OCR pipeline (pdf.ocr auto with quality heuristics, force, or runOcr)."
700
+ ]),
701
+ slot("html", true, ["HTML is a <pre> preview of extracted text, not visual layout."])
662
702
  ];
663
703
  break;
664
704
  case "docx":
@@ -762,6 +802,17 @@ function planForIntent(intentOpt, kind, pdfOcrForAnalyze) {
762
802
  const intent = intentOpt ?? "analyzeFile";
763
803
  if (intent === "analyzeFile") return planAnalyzeFile(kind, pdfOcrForAnalyze);
764
804
  if (intent === "extractText") {
805
+ if (kind === "pdf") {
806
+ return {
807
+ intent: "extractText",
808
+ steps: [
809
+ { id: "detect_kind", status: "done" },
810
+ { id: "pdf_parse", status: "planned" },
811
+ { id: "pdfjs_per_page", status: "planned" },
812
+ { id: "pdf_ocr", status: "skipped" }
813
+ ]
814
+ };
815
+ }
765
816
  const p = planAnalyzeFile(kind, "off");
766
817
  return { ...p, intent: "extractText" };
767
818
  }
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@dragon708/docmind-node",
3
- "version": "1.2.0",
3
+ "version": "1.4.0",
4
4
  "description": "Official DocMind Node facade: analyzeFile, intent APIs, PDF/DOCX/OCR, and fs helpers.",
5
5
  "type": "module",
6
6
  "main": "./dist/index.js",
@@ -34,7 +34,7 @@
34
34
  "dependencies": {
35
35
  "@dragon708/docmind-docx": "^1.0.0",
36
36
  "@dragon708/docmind-ocr": "^1.0.0",
37
- "@dragon708/docmind-pdf": "^1.0.0",
37
+ "@dragon708/docmind-pdf": "^2.0.0",
38
38
  "@dragon708/docmind-shared": "^1.1.0"
39
39
  },
40
40
  "devDependencies": {