npm - @dragon708/docmind-node - Versions diffs - 1.2.0 → 1.4.0 - Mend

@dragon708/docmind-node 1.2.0 → 1.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (3)

package/dist/index.d.ts — CHANGED

@@ -9,10 +9,17 @@ import { PdfAnalyzeOptions } from '@dragon708/docmind-pdf';
  * - **`pdf`**: forwarded to `@dragon708/docmind-pdf`. `analyzeFile` defaults `pdf.ocr` to `"auto"` when omitted.
  *   {@link extractText} / {@link convertToHtml} merge a default of `ocr: "off"` unless you set `pdf.ocr` explicitly.
  * - **`ocr`**: forwarded to `@dragon708/docmind-ocr` for raster images; language string also feeds PDF OCR when `pdf.ocrLangs` is unset.
+ * - **`pdfNativeTextSource`**: when `pdf.ocr` resolves to `"off"`, chooses how native text is obtained (see {@link extractText} default).
  */
 interface NodeAnalyzeOptions extends DocMindAnalyzeOptions {
     readonly pdf?: PdfAnalyzeOptions;
     readonly ocr?: OcrOptions;
+    /**
+     * Native PDF text when `pdf.ocr` is `"off"`:
+     * - **`pdfjs-per-page`** (default for {@link extractText}): PDF.js text per page, merged for `text` (aligns with OCR raster engine).
+     * - **`pdf-parse`**: single pdf-parse pass (default for {@link analyzeFile} when you set `pdf.ocr: "off"` without this flag).
+     */
+    readonly pdfNativeTextSource?: "pdf-parse" | "pdfjs-per-page";
 }
 /**
@@ -38,8 +45,9 @@ declare function resolveNodeAnalyzeInput(input: NodeAnalyzeInput): Promise<Detec
 declare function analyzeFile(input: NodeAnalyzeInput, options?: NodeAnalyzeOptions): Promise<AnalysisResult>;
 /**
- * Plain-text extraction using {@link analyzeFile} routing. PDFs default to **text layer only**
- * (`pdf.ocr: "off"`) unless you set `options.pdf.ocr` explicitly.
+ * Plain-text extraction using {@link analyzeFile} routing. PDFs default to **native text only**
+ * (`pdf.ocr: "off"`) unless you set `options.pdf.ocr` explicitly, and to **PDF.js per-page** assembly
+ * (`pdfNativeTextSource: "pdfjs-per-page"`) unless you set `options.pdfNativeTextSource` or `pdf.ocr` enables OCR.
  */
 declare function extractText(input: NodeAnalyzeInput, options?: NodeAnalyzeOptions): Promise<AnalysisResult>;
 /**
@@ -109,7 +117,8 @@ type NodeExplainAnalysisPlanOptions = ExplainAnalysisPlanOptions & Pick<NodeAnal
 /**
  * Epic 1 — **Capabilities:** after resolving {@link NodeAnalyzeInput}, lists which of
- * `text` | `metadata` | `html` | `ocr` | `pages` apply for that kind in Node (PDF fully supported).
+ * `text` | `metadata` | `html` | `ocr` | `pages` apply for that kind in Node (for PDF, `text` / `metadata` /
+ * `pages` / `ocr` describe the v2 pdf-parse + PDF.js + OCR stack; see {@link buildNodeCapabilityReport}).
  * Does not run Mammoth/Tesseract/PDF bodies beyond path resolution.
  */
 declare function getCapabilities(input: NodeAnalyzeInput, options?: GetCapabilitiesOptions): Promise<GetCapabilitiesReport>;

package/dist/index.js — CHANGED

@@ -2,7 +2,7 @@ import { assertValidAnalyzeFileInput, detectFileKind, notImplementedResult, UNKN
 export { detectFileKind } from '@dragon708/docmind-shared';
 import { analyzeDocx } from '@dragon708/docmind-docx';
 import { ocr } from '@dragon708/docmind-ocr';
-import { extractPdfMetadata, extractTextFromPdf, analyzePdf } from '@dragon708/docmind-pdf';
+import { extractPdfMetadata, extractTextFromPdf, analyzePdf, extractPdfTextByPage } from '@dragon708/docmind-pdf';
 import { readFile } from 'fs/promises';
 import { basename } from 'path';
 import { fileURLToPath } from 'url';
@@ -112,17 +112,45 @@ async function analyzePdfForNode(input, options) {
     signal: userPdf?.signal ?? options?.signal
   };
   const r = await analyzePdf(data, pdfOpts);
+  const usePdfJsPerPage = pdfOpts.ocr === "off" && (options?.pdfNativeTextSource ?? "pdf-parse") === "pdfjs-per-page";
+  if (!usePdfJsPerPage) {
+    return {
+      fileKind: "pdf",
+      analyzer: "pdf",
+      status: "ok",
+      kind: "pdf",
+      text: r.text,
+      pages: r.pages,
+      metadata: r.metadata,
+      warnings: [...r.warnings],
+      needsOCR: r.needsOCR,
+      ocrUsed: r.ocrUsed
+    };
+  }
+  let text = r.text;
+  const extra = [];
+  try {
+    const rows = await extractPdfTextByPage(data, {
+      maxPages: pdfOpts.maxPages,
+      signal: pdfOpts.signal
+    });
+    text = rows.map((row) => row.text).join("\n\n");
+  } catch (e) {
+    const msg = e instanceof Error ? e.message : String(e);
+    extra.push(`warning: PDF.js per-page text failed; using pdf-parse text: ${msg}`);
+  }
+  const needsOCR = r.pages > 0 && text.trim().length === 0;
   return {
     fileKind: "pdf",
     analyzer: "pdf",
     status: "ok",
     kind: "pdf",
-    text: r.text,
+    text,
     pages: r.pages,
     metadata: r.metadata,
-    warnings: [...r.warnings],
-    needsOCR: r.needsOCR,
-    ocrUsed: r.ocrUsed
+    warnings: [...r.warnings, ...extra],
+    needsOCR,
+    ocrUsed: false
   };
 }
 function toPathString(pathOrUrl) {
@@ -211,7 +239,11 @@ function toExtractTextResult(full) {
 }
 async function extractText(input, options) {
   throwIfAborted(options?.signal);
-  const full = await analyzeFile(input, withPdfOcrDefaultOff(options));
+  const merged = {
+    ...withPdfOcrDefaultOff(options),
+    pdfNativeTextSource: options?.pdfNativeTextSource ?? "pdfjs-per-page"
+  };
+  const full = await analyzeFile(input, merged);
   return toExtractTextResult(full);
 }
 async function extractMetadata(input, options) {
@@ -485,11 +517,11 @@ function buildNodeExplainReport(kind, intent, pdfOcr, plan) {
       if (kind === "pdf") {
         nativeExtraction = {
           willAttempt: true,
-          description: "pdf-parse extracts embedded text and page count first."
+          description: "pdf-parse supplies embedded text, metadata, and page count; PDF.js drives raster OCR when enabled."
         };
         ocr3 = {
           mayUse: pdfOcr !== "off",
-          description: pdfOcr === "off" ? "Raster OCR pipeline is off (pdf.ocr: off)." : pdfOcr === "force" ? "Raster OCR may run on all pages when pdf.ocr is force." : "Raster OCR may run when the text layer is empty (pdf.ocr: auto)."
+          description: pdfOcr === "off" ? "Raster OCR pipeline is off (pdf.ocr: off)." : pdfOcr === "force" ? "Raster OCR may run on all pages when pdf.ocr is force." : "Raster OCR may run when native text looks insufficient (pdf.ocr: auto + heuristics)."
         };
       } else if (kind === "docx") {
         nativeExtraction = {
@@ -515,11 +547,11 @@ function buildNodeExplainReport(kind, intent, pdfOcr, plan) {
       if (kind === "pdf") {
         nativeExtraction = {
           willAttempt: true,
-          description: "Text layer via pdf-parse; defaults to pdf.ocr off unless you override."
+          description: "Node: pdf-parse for metadata/page baseline, then PDF.js per-page text merged into `text` (pdfNativeTextSource pdfjs-per-page default)."
         };
         ocr3 = {
           mayUse: false,
-          description: "extractText merges pdf.ocr default off \u2014 no raster OCR unless you set pdf.ocr explicitly."
+          description: "extractText defaults pdf.ocr off; set pdf.ocr explicitly to allow auto/force raster OCR."
         };
       } else if (kind === "docx") {
         nativeExtraction = {
@@ -654,11 +686,19 @@ function buildNodeCapabilityReport(kind) {
   switch (kind) {
     case "pdf":
       capabilities = [
-        slot("text", true, ["Includes text layer extraction; use extractText options to avoid PDF OCR."]),
-        slot("metadata", true),
-        slot("html", true, ["HTML is a <pre> preview of extracted text, not visual layout."]),
-        slot("ocr", true, ["Raster OCR is available (e.g. analyzeFile with pdf.ocr auto/force, or runOcr)."]),
-        slot("pages", true)
+        slot("text", true, [
+          "Native text via pdf-parse and (in Node extractText) PDF.js per-page text; set pdf.ocr for raster OCR."
+        ]),
+        slot("metadata", true, [
+          "Document info / XMP-style metadata via pdf-parse without running the OCR pipeline."
+        ]),
+        slot("pages", true, [
+          "Page count and per-page native extraction (PDF.js) where used; OCR respects pdf.maxPages."
+        ]),
+        slot("ocr", true, [
+          "Raster OCR pipeline (pdf.ocr auto with quality heuristics, force, or runOcr)."
+        ]),
+        slot("html", true, ["HTML is a <pre> preview of extracted text, not visual layout."])
       ];
       break;
     case "docx":
@@ -762,6 +802,17 @@ function planForIntent(intentOpt, kind, pdfOcrForAnalyze) {
   const intent = intentOpt ?? "analyzeFile";
   if (intent === "analyzeFile") return planAnalyzeFile(kind, pdfOcrForAnalyze);
   if (intent === "extractText") {
+    if (kind === "pdf") {
+      return {
+        intent: "extractText",
+        steps: [
+          { id: "detect_kind", status: "done" },
+          { id: "pdf_parse", status: "planned" },
+          { id: "pdfjs_per_page", status: "planned" },
+          { id: "pdf_ocr", status: "skipped" }
+        ]
+      };
+    }
     const p = planAnalyzeFile(kind, "off");
     return { ...p, intent: "extractText" };
   }

package/package.json — CHANGED

@@ -1,6 +1,6 @@
 {
   "name": "@dragon708/docmind-node",
-  "version": "1.2.0",
+  "version": "1.4.0",
   "description": "Official DocMind Node facade: analyzeFile, intent APIs, PDF/DOCX/OCR, and fs helpers.",
   "type": "module",
   "main": "./dist/index.js",
@@ -34,7 +34,7 @@
   "dependencies": {
     "@dragon708/docmind-docx": "^1.0.0",
     "@dragon708/docmind-ocr": "^1.0.0",
-    "@dragon708/docmind-pdf": "^1.0.0",
+    "@dragon708/docmind-pdf": "^2.0.0",
     "@dragon708/docmind-shared": "^1.1.0"
   },
   "devDependencies": {