@dragon708/docmind-node 1.7.0 → 1.9.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.d.ts CHANGED
@@ -1,6 +1,6 @@
1
1
  import { DocMindAnalyzeOptions, DetectFileKindInput, NamedInput, AnalysisResult, FileKind, RuntimeDescriptor, DocMindPublicIntent, AnalysisAnalyzer, ProcessingPlanDescriptor, ExplainAnalysisPlanOptions, GetCapabilitiesOptions } from '@dragon708/docmind-shared';
2
2
  export { AnalysisAnalyzer, AnalysisResult, CapabilityDescriptor, DetectFileKindInput, DocMindPublicIntent, DocxAnalysisCoreResult, ExplainAnalysisPlanOptions, ExplainAnalysisPlanResult, FileKind, FileKindMetadata, GenericAnalysisResult, GetCapabilitiesOptions, GetCapabilitiesResult, ImageAnalysisCoreResult, PdfAnalysisCoreResult, TextAnalysisResult, detectFileKind } from '@dragon708/docmind-shared';
3
- import { OcrOptions } from '@dragon708/docmind-ocr';
3
+ import { OcrOptions, OcrTiffOptions, PreprocessImageOptions } from '@dragon708/docmind-ocr';
4
4
  import { PdfAnalyzeOptions } from '@dragon708/docmind-pdf';
5
5
  import { AnalyzeDocxIncludeFlags, DocxToHtmlOptions } from '@dragon708/docmind-docx';
6
6
  export { AnalyzeDocxIncludeFlags, DocxEmbeddedImage, DocxEmbeddedImageConversionOptions, DocxEmbeddedImageConversionResult, DocxEmbeddedImageWebSlice, DocxImageExtractionMode, ExtractImagesFromDocxOptions, ExtractImagesFromDocxResult, convertDocxEmbeddedImageToWeb, convertDocxImagesForWeb, docxImageIsBrowserRenderable, docxImageRequiresConversion, docxImageSuggestedTargetFormat, docxImageToDataUri, extractImagesFromDocx, isNodeJsRuntime } from '@dragon708/docmind-docx';
@@ -10,7 +10,7 @@ export { AnalyzeDocxIncludeFlags, DocxEmbeddedImage, DocxEmbeddedImageConversion
10
10
  *
11
11
  * - **`pdf`**: forwarded to `@dragon708/docmind-pdf`. `analyzeFile` defaults `pdf.ocr` to `"auto"` when omitted.
12
12
  * {@link extractText} / {@link convertToHtml} merge a default of `ocr: "off"` unless you set `pdf.ocr` explicitly.
13
- * - **`ocr`**: forwarded to `@dragon708/docmind-ocr` for raster images; language string also feeds PDF OCR when `pdf.ocrLangs` is unset.
13
+ * - **`ocr`**: forwarded to `@dragon708/docmind-ocr` for raster images (`ocrImageDetailed` + `normalizeImageForOcr`, or `ocrTiff` for TIFF); optional `preprocess` maps to {@link PreprocessImageOptions}. `maxPages` / `pageSeparator` apply to multipage TIFF. Language also feeds PDF OCR when `pdf.ocrLangs` is unset.
14
14
  * - **`pdfNativeTextSource`**: when `pdf.ocr` resolves to `"off"`, chooses how native text is obtained (see {@link extractText} default).
15
15
  */
16
16
  /**
@@ -26,7 +26,9 @@ interface NodeAnalyzeDocxOptionsSlice {
26
26
  }
27
27
  interface NodeAnalyzeOptions extends DocMindAnalyzeOptions {
28
28
  readonly pdf?: PdfAnalyzeOptions;
29
- readonly ocr?: OcrOptions;
29
+ readonly ocr?: OcrOptions & Pick<OcrTiffOptions, "maxPages" | "pageSeparator"> & {
30
+ readonly preprocess?: PreprocessImageOptions;
31
+ };
30
32
  /** Solo DOCX: ver {@link NodeAnalyzeDocxOptionsSlice}. */
31
33
  readonly docx?: NodeAnalyzeDocxOptionsSlice;
32
34
  /**
@@ -77,12 +79,14 @@ declare function extractMetadata(input: NodeAnalyzeInput, options?: NodeAnalyzeO
77
79
  declare function convertToHtml(input: NodeAnalyzeInput, options?: NodeAnalyzeOptions): Promise<AnalysisResult>;
78
80
  /**
79
81
  * OCR intent: PDF always runs {@link analyzePdf} with `ocr: "force"` (merged with `options.pdf`).
80
- * Raster images run Tesseract via `options.ocr`. DOCX returns structured extract with a notice.
82
+ * Raster images: `ocrImageDetailed` (normalize → optional preprocess → Tesseract) for supported single-frame inputs;
83
+ * TIFF via multipage `ocrTiff` (`options.ocr.maxPages`, `pageSeparator`).
84
+ * DOCX returns structured extract with a notice.
81
85
  */
82
86
  declare function runOcr(input: NodeAnalyzeInput, options?: NodeAnalyzeOptions): Promise<AnalysisResult>;
83
87
 
84
88
  /** High-level features the user can ask DocMind for (per input kind and runtime). */
85
- type PublicCapabilityId = "text" | "metadata" | "html" | "ocr" | "pages";
89
+ type PublicCapabilityId = "text" | "metadata" | "html" | "ocr" | "pages" | "ocr-multipage" | "image-normalization" | "tiff" | "heic-node-only" | "bmp" | "gif-first-frame";
86
90
  /**
87
91
  * DOCX-only: what the stack can do with OOXML embedded bitmaps/vector payloads under `word/media`.
88
92
  * Present on {@link GetCapabilitiesReport} when `kind === "docx"`.
@@ -174,7 +178,7 @@ type NodeExplainAnalysisPlanOptions = ExplainAnalysisPlanOptions & Pick<NodeAnal
174
178
 
175
179
  /**
176
180
  * Epic 1 — **Capabilities:** after resolving {@link NodeAnalyzeInput}, lists which of
177
- * `text` | `metadata` | `html` | `ocr` | `pages` apply for that kind in Node (for PDF, `text` / `metadata` /
181
+ * `text` | `metadata` | `html` | `ocr` | `pages` (and image-specific ids such as `ocr-multipage`, `image-normalization`, `tiff`, `heic-node-only`, `bmp`, `gif-first-frame`) apply for that kind in Node (for PDF, `text` / `metadata` /
178
182
  * `pages` / `ocr` describe the v2 pdf-parse + PDF.js + OCR stack; see {@link buildNodeCapabilityReport}).
179
183
  * For **DOCX**, `docxEmbeddedImages` and `docxStructure` describe ZIP media and optional OOXML v2 extractors (`options.docx.include`).
180
184
  * Does not run Mammoth/Tesseract/PDF bodies beyond path resolution.
package/dist/index.js CHANGED
@@ -1,8 +1,8 @@
1
- import { assertValidAnalyzeFileInput, detectFileKind, notImplementedResult, UNKNOWN_FORMAT_WARNING, analyzeText, toUint8Array, isNamedInput, isBinaryInput, isBlob, isFile } from '@dragon708/docmind-shared';
1
+ import { assertValidAnalyzeFileInput, detectFileKind, notImplementedResult, UNKNOWN_FORMAT_WARNING, analyzeText, toUint8Array, isNamedInput, isBinaryInput, isBlob, isFile, getMimeType } from '@dragon708/docmind-shared';
2
2
  export { detectFileKind } from '@dragon708/docmind-shared';
3
3
  import { analyzeDocx } from '@dragon708/docmind-docx';
4
4
  export { convertDocxEmbeddedImageToWeb, convertDocxImagesForWeb, docxImageIsBrowserRenderable, docxImageRequiresConversion, docxImageSuggestedTargetFormat, docxImageToDataUri, extractImagesFromDocx, isNodeJsRuntime } from '@dragon708/docmind-docx';
5
- import { ocr } from '@dragon708/docmind-ocr';
5
+ import { preprocessHasEffect, resolveImageFormat, ocrTiff, ocrImageDetailed } from '@dragon708/docmind-ocr';
6
6
  import { extractPdfMetadata, extractTextFromPdf, analyzePdf, extractPdfTextByPage } from '@dragon708/docmind-pdf';
7
7
  import { readFile } from 'fs/promises';
8
8
  import { basename } from 'path';
@@ -74,6 +74,53 @@ async function analyzeDocxForNode(input, options) {
74
74
  const r = docxOpts !== void 0 ? await analyzeDocx(data, docxOpts) : await analyzeDocx(data);
75
75
  return docxPackageResultToAnalysisResult(r);
76
76
  }
77
+ function meanPageConfidence(pages) {
78
+ if (pages.length === 0) return 0;
79
+ return pages.reduce((s, p) => s + p.confidence, 0) / pages.length;
80
+ }
81
+ function mimeHintFromDetectInput(input) {
82
+ if (!isByteBackedInput(input)) return void 0;
83
+ return getMimeType(input);
84
+ }
85
+ async function runRasterOcrForNode(data, input, options) {
86
+ const signal = options?.ocr?.signal ?? options?.signal;
87
+ const langs = options?.ocr?.langs;
88
+ const mimeHint = mimeHintFromDetectInput(input);
89
+ const format = resolveImageFormat(data, mimeHint);
90
+ if (format === "tiff") {
91
+ const tiff = await ocrTiff(data, {
92
+ langs,
93
+ signal,
94
+ maxPages: options?.ocr?.maxPages,
95
+ pageSeparator: options?.ocr?.pageSeparator
96
+ });
97
+ const warnings = [...tiff.warnings];
98
+ return {
99
+ text: tiff.text.trim(),
100
+ confidence: meanPageConfidence(tiff.textByPage),
101
+ ocrUsed: true,
102
+ warnings,
103
+ pages: tiff.pagesProcessed,
104
+ textByPage: tiff.textByPage
105
+ };
106
+ }
107
+ const detailed = await ocrImageDetailed(data, {
108
+ langs,
109
+ signal,
110
+ preprocess: options?.ocr?.preprocess
111
+ });
112
+ return {
113
+ text: detailed.text.trim(),
114
+ confidence: detailed.confidence,
115
+ ocrUsed: true,
116
+ warnings: [...detailed.warnings],
117
+ pages: detailed.pages,
118
+ inputFormat: detailed.inputFormat,
119
+ normalizedFormat: detailed.normalizedFormat
120
+ };
121
+ }
122
+
123
+ // src/analyzers/image.ts
77
124
  async function analyzeImageForNode(input, options) {
78
125
  if (options?.signal?.aborted) {
79
126
  const err = new Error("The operation was aborted");
@@ -93,21 +140,26 @@ async function analyzeImageForNode(input, options) {
93
140
  warnings: ["No image bytes were provided for analysis."]
94
141
  };
95
142
  }
96
- const ocrOpts = {
97
- ...options?.ocr ?? {},
98
- signal: options?.ocr?.signal ?? options?.signal
99
- };
100
- const r = await ocr(data, ocrOpts);
101
- return {
143
+ const ocrPart = await runRasterOcrForNode(data, input, options);
144
+ const base = {
102
145
  fileKind: "image",
103
146
  analyzer: "image",
104
147
  status: "ok",
105
148
  kind: "image",
106
- text: r.text,
107
- confidence: r.confidence,
108
- ocrUsed: r.ocrUsed,
109
- warnings: []
149
+ text: ocrPart.text,
150
+ confidence: ocrPart.confidence,
151
+ ocrUsed: true,
152
+ warnings: ocrPart.warnings
110
153
  };
154
+ const extra = {};
155
+ if (ocrPart.pages !== void 0) extra.pages = ocrPart.pages;
156
+ if (ocrPart.textByPage !== void 0) extra.textByPage = ocrPart.textByPage;
157
+ if (ocrPart.inputFormat !== void 0) extra.inputFormat = ocrPart.inputFormat;
158
+ if (ocrPart.normalizedFormat !== void 0) extra.normalizedFormat = ocrPart.normalizedFormat;
159
+ if (Object.keys(extra).length > 0) {
160
+ return { ...base, ...extra };
161
+ }
162
+ return base;
111
163
  }
112
164
  async function analyzePdfForNode(input, options) {
113
165
  if (options?.signal?.aborted) {
@@ -301,7 +353,7 @@ async function extractMetadata(input, options) {
301
353
  status: "ok",
302
354
  kind: "pdf",
303
355
  text: "",
304
- pages: 0,
356
+ pages: r.pages,
305
357
  metadata: r.metadata,
306
358
  warnings: r.warnings,
307
359
  needsOCR: false,
@@ -449,34 +501,10 @@ async function runOcr(input, options) {
449
501
  };
450
502
  }
451
503
  case "image": {
452
- const data = await bytesFromDetectInput(resolved);
453
- if (data.byteLength === 0) {
454
- return {
455
- fileKind: "image",
456
- analyzer: "image",
457
- status: "ok",
458
- kind: "image",
459
- text: "",
460
- confidence: 0,
461
- ocrUsed: true,
462
- warnings: ["No image bytes were provided for analysis."]
463
- };
464
- }
465
- const ocrOpts = {
466
- ...options?.ocr ?? {},
467
- signal: options?.ocr?.signal ?? signal
468
- };
469
- const r = await ocr(data, ocrOpts);
470
- return {
471
- fileKind: "image",
472
- analyzer: "image",
473
- status: "ok",
474
- kind: "image",
475
- text: r.text,
476
- confidence: r.confidence,
477
- ocrUsed: r.ocrUsed,
478
- warnings: []
479
- };
504
+ return analyzeImageForNode(resolved, {
505
+ ...options,
506
+ ocr: { ...options?.ocr ?? {}, signal: options?.ocr?.signal ?? signal }
507
+ });
480
508
  }
481
509
  case "docx": {
482
510
  const data = await bytesFromDetectInput(resolved);
@@ -589,11 +617,33 @@ function buildNodeCapabilityReport(kind) {
589
617
  break;
590
618
  case "image":
591
619
  capabilities = [
592
- slot("text", true, ["Text is obtained via OCR."]),
620
+ slot("text", true, [
621
+ "Text via `@dragon708/docmind-ocr` after format sniff (PNG, JPEG, WebP, TIFF, BMP, GIF first frame, HEIC/HEIF in Node via conversion)."
622
+ ]),
593
623
  slot("metadata", false, [IMAGE_META]),
594
624
  slot("html", false, [IMAGE_HTML]),
595
- slot("ocr", true),
596
- slot("pages", false)
625
+ slot("ocr", true, [
626
+ "Single-frame pipeline: `normalizeImageForOcr` \u2192 optional `preprocessImageForOcr` (`options.ocr.preprocess`) \u2192 Tesseract via `ocrImageDetailed`."
627
+ ]),
628
+ slot("ocr-multipage", true, [
629
+ "Multipage TIFF: `ocrTiff` with per-page text; `text` joins pages (see `options.ocr.pageSeparator`)."
630
+ ]),
631
+ slot("image-normalization", true, [
632
+ "Bytes are normalized to a Tesseract-friendly raster (PNG-oriented) inside the OCR package before recognition."
633
+ ]),
634
+ slot("tiff", true, [
635
+ "TIFF detected by magic/MIME routes to `ocrTiff` (not the single-frame `ocrImageDetailed` path)."
636
+ ]),
637
+ slot("heic-node-only", true, [
638
+ "HEIC/HEIF uses Node `sharp` conversion in `@dragon708/docmind-ocr`; not available in the browser facade."
639
+ ]),
640
+ slot("bmp", true, ["BMP inputs are supported through the universal normalization path."]),
641
+ slot("gif-first-frame", true, [
642
+ "Animated GIF: only the first frame is normalized and OCR'd."
643
+ ]),
644
+ slot("pages", true, [
645
+ "TIFF: `pages` and `textByPage` mirror frames processed. Other formats may expose `pages` when the normalizer reports it (e.g. GIF)."
646
+ ])
597
647
  ];
598
648
  break;
599
649
  case "text":
@@ -643,11 +693,12 @@ function finalizeDocxExplainReport(report) {
643
693
  };
644
694
  }
645
695
  var DOCX_MAMMOTH_PLUS_OPTIONAL = "Mammoth extracts text and HTML from OOXML; optional parallel OOXML/ZIP extractors run when options.docx.include is set (structure, headings, tables, blocks, pagesApprox, embeddedImages).";
646
- function buildNodeExplainReport(kind, intent, pdfOcr, plan, docxInclude) {
696
+ var NODE_IMAGE_OCR_PIPELINE = "Node raster OCR: detect format (magic/MIME) \u2192 `normalizeImageForOcr` \u2192 optional `preprocessImageForOcr` when `options.ocr.preprocess` is set \u2192 Tesseract. TIFF is handled with `ocrTiff` (multipage; per-page `textByPage` and joined `text`). HEIC/HEIF is decoded via `sharp` on Node (not in the browser package). BMP and static images share the normalization path; GIF uses the first frame only.";
697
+ function buildNodeExplainReport(kind, intent, pdfOcr, plan, docxInclude, ocrSlice) {
647
698
  const runtime = { id: "node" };
648
699
  const primaryAnalyzer = kind === "pdf" ? "pdf" : kind === "docx" ? "docx" : kind === "image" ? "image" : kind === "text" ? "text" : "none";
649
700
  let nativeExtraction;
650
- let ocr3;
701
+ let ocr;
651
702
  let limitations = [];
652
703
  if (kind === "unknown") {
653
704
  limitations = lim(
@@ -672,7 +723,7 @@ function buildNodeExplainReport(kind, intent, pdfOcr, plan, docxInclude) {
672
723
  willAttempt: true,
673
724
  description: "pdf-parse supplies embedded text, metadata, and page count; PDF.js drives raster OCR when enabled."
674
725
  };
675
- ocr3 = {
726
+ ocr = {
676
727
  mayUse: pdfOcr !== "off",
677
728
  description: pdfOcr === "off" ? "Raster OCR pipeline is off (pdf.ocr: off)." : pdfOcr === "force" ? "Raster OCR may run on all pages when pdf.ocr is force." : "Raster OCR may run when native text looks insufficient (pdf.ocr: auto + heuristics)."
678
729
  };
@@ -681,19 +732,22 @@ function buildNodeExplainReport(kind, intent, pdfOcr, plan, docxInclude) {
681
732
  willAttempt: true,
682
733
  description: docxIncludeRequested(docxInclude) ? "Mammoth plus parallel OOXML extractors (per options.docx.include)." : DOCX_MAMMOTH_PLUS_OPTIONAL
683
734
  };
684
- ocr3 = { mayUse: false, description: "DOCX does not use OCR in DocMind." };
735
+ ocr = { mayUse: false, description: "DOCX does not use OCR in DocMind." };
685
736
  } else if (kind === "image") {
686
737
  nativeExtraction = {
687
738
  willAttempt: false,
688
- description: "Images have no native text layer; text comes from OCR only."
739
+ description: NODE_IMAGE_OCR_PIPELINE
740
+ };
741
+ ocr = {
742
+ mayUse: true,
743
+ description: "Same stack as `runOcr` / `analyzeFile` for images: universal normalization, optional preprocess, then `ocrImageDetailed` or multipage `ocrTiff` for TIFF."
689
744
  };
690
- ocr3 = { mayUse: true, description: "Tesseract runs on supported raster formats." };
691
745
  } else {
692
746
  nativeExtraction = {
693
747
  willAttempt: true,
694
748
  description: "UTF-8 decode with BOM handling for plain text."
695
749
  };
696
- ocr3 = { mayUse: false, description: "OCR does not apply to text files." };
750
+ ocr = { mayUse: false, description: "OCR does not apply to text files." };
697
751
  }
698
752
  break;
699
753
  case "extractText":
@@ -702,7 +756,7 @@ function buildNodeExplainReport(kind, intent, pdfOcr, plan, docxInclude) {
702
756
  willAttempt: true,
703
757
  description: "Node: pdf-parse for metadata/page baseline, then PDF.js per-page text merged into `text` (pdfNativeTextSource pdfjs-per-page default)."
704
758
  };
705
- ocr3 = {
759
+ ocr = {
706
760
  mayUse: false,
707
761
  description: "extractText defaults pdf.ocr off; set pdf.ocr explicitly to allow auto/force raster OCR."
708
762
  };
@@ -711,16 +765,22 @@ function buildNodeExplainReport(kind, intent, pdfOcr, plan, docxInclude) {
711
765
  willAttempt: true,
712
766
  description: docxIncludeRequested(docxInclude) ? "Same DOCX router as analyzeFile: Mammoth text + optional OOXML fields; HTML cleared in extractText." : "Mammoth plain text; HTML cleared. Optional OOXML v2 fields when options.docx.include is set."
713
767
  };
714
- ocr3 = { mayUse: false, description: "DOCX does not use OCR." };
768
+ ocr = { mayUse: false, description: "DOCX does not use OCR." };
715
769
  } else if (kind === "image") {
716
- nativeExtraction = { willAttempt: false, description: "No embedded text layer." };
717
- ocr3 = { mayUse: true, description: "OCR produces text for images." };
770
+ nativeExtraction = {
771
+ willAttempt: false,
772
+ description: NODE_IMAGE_OCR_PIPELINE
773
+ };
774
+ ocr = {
775
+ mayUse: true,
776
+ description: "Same Node image pipeline as analyzeFile (normalize \u2192 optional preprocess \u2192 `ocrImageDetailed` or `ocrTiff`)."
777
+ };
718
778
  } else {
719
779
  nativeExtraction = {
720
780
  willAttempt: true,
721
781
  description: "UTF-8 decode only."
722
782
  };
723
- ocr3 = { mayUse: false, description: "OCR does not apply." };
783
+ ocr = { mayUse: false, description: "OCR does not apply." };
724
784
  }
725
785
  break;
726
786
  case "extractMetadata":
@@ -729,13 +789,13 @@ function buildNodeExplainReport(kind, intent, pdfOcr, plan, docxInclude) {
729
789
  willAttempt: true,
730
790
  description: "Lightweight PDF info/XMP normalization without full OCR."
731
791
  };
732
- ocr3 = { mayUse: false, description: "extractMetadata does not run the OCR pipeline." };
792
+ ocr = { mayUse: false, description: "extractMetadata does not run the OCR pipeline." };
733
793
  } else if (kind === "docx" || kind === "image") {
734
794
  nativeExtraction = {
735
795
  willAttempt: false,
736
796
  description: "Stub response; no heavy extractor."
737
797
  };
738
- ocr3 = { mayUse: false, description: "OCR not used for this metadata path." };
798
+ ocr = { mayUse: false, description: "OCR not used for this metadata path." };
739
799
  limitations = lim(
740
800
  kind === "docx" ? "Structured DOCX metadata is not exposed separately; use analyzeFile / extractText / convertToHtml with options.docx.include for OOXML structure." : "Raster images have no document metadata bundle."
741
801
  );
@@ -744,7 +804,7 @@ function buildNodeExplainReport(kind, intent, pdfOcr, plan, docxInclude) {
744
804
  willAttempt: true,
745
805
  description: "Decoded text only; no structured document metadata."
746
806
  };
747
- ocr3 = { mayUse: false, description: "OCR does not apply." };
807
+ ocr = { mayUse: false, description: "OCR does not apply." };
748
808
  limitations = lim("Plain text has no structured document metadata.");
749
809
  }
750
810
  break;
@@ -754,26 +814,26 @@ function buildNodeExplainReport(kind, intent, pdfOcr, plan, docxInclude) {
754
814
  willAttempt: true,
755
815
  description: "Text layer extracted then wrapped in <pre> (not visual layout)."
756
816
  };
757
- ocr3 = { mayUse: false, description: "convertToHtml does not run PDF OCR." };
817
+ ocr = { mayUse: false, description: "convertToHtml does not run PDF OCR." };
758
818
  limitations = lim("PDF HTML is a plain-text preview, not page layout.");
759
819
  } else if (kind === "docx") {
760
820
  nativeExtraction = {
761
821
  willAttempt: true,
762
822
  description: docxIncludeRequested(docxInclude) ? "Mammoth HTML plus optional OOXML extractors (same router as analyzeFile)." : "Mammoth HTML via analyzeFile routing; optional OOXML v2 when options.docx.include is set."
763
823
  };
764
- ocr3 = { mayUse: false, description: "DOCX path does not use OCR." };
824
+ ocr = { mayUse: false, description: "DOCX path does not use OCR." };
765
825
  } else if (kind === "text") {
766
826
  nativeExtraction = {
767
827
  willAttempt: true,
768
828
  description: "UTF-8 decode then <pre> wrapper."
769
829
  };
770
- ocr3 = { mayUse: false, description: "OCR does not apply." };
830
+ ocr = { mayUse: false, description: "OCR does not apply." };
771
831
  } else {
772
832
  nativeExtraction = {
773
833
  willAttempt: false,
774
834
  description: "No HTML path for raster images."
775
835
  };
776
- ocr3 = { mayUse: false, description: "OCR does not emit layout HTML here." };
836
+ ocr = { mayUse: false, description: "OCR does not emit layout HTML here." };
777
837
  limitations = lim("Use extractText or runOcr for image text.");
778
838
  }
779
839
  break;
@@ -783,32 +843,44 @@ function buildNodeExplainReport(kind, intent, pdfOcr, plan, docxInclude) {
783
843
  willAttempt: true,
784
844
  description: "pdf-parse runs first; text may be replaced by raster OCR output."
785
845
  };
786
- ocr3 = {
846
+ ocr = {
787
847
  mayUse: true,
788
848
  description: 'runOcr always sets pdf.ocr to "force" for PDFs.'
789
849
  };
790
850
  limitations = lim("Forced OCR may run even when a text layer exists.");
791
851
  } else if (kind === "image") {
792
- nativeExtraction = { willAttempt: false, description: "No native text layer." };
793
- ocr3 = { mayUse: true, description: "Tesseract OCR on the image bytes." };
852
+ nativeExtraction = {
853
+ willAttempt: false,
854
+ description: NODE_IMAGE_OCR_PIPELINE
855
+ };
856
+ ocr = {
857
+ mayUse: true,
858
+ description: "Forced OCR path for rasters: TIFF \u2192 `ocrTiff` with `textByPage` when multipage; other formats \u2192 `ocrImageDetailed` after normalization (HEIC converted with `sharp` on Node)."
859
+ };
794
860
  } else if (kind === "docx") {
795
861
  nativeExtraction = {
796
862
  willAttempt: true,
797
863
  description: docxIncludeRequested(docxInclude) ? "Mammoth text/HTML plus optional OOXML extractors; still not OCR." : "Full Mammoth extract (text + HTML); optional OOXML v2 via options.docx.include; not OCR."
798
864
  };
799
- ocr3 = { mayUse: false, description: "DOCX is not OCR'd." };
865
+ ocr = { mayUse: false, description: "DOCX is not OCR'd." };
800
866
  limitations = lim("Result is structured extract, not OCR output.");
801
867
  } else {
802
868
  nativeExtraction = {
803
869
  willAttempt: true,
804
870
  description: "UTF-8 decode only."
805
871
  };
806
- ocr3 = { mayUse: false, description: "OCR does not apply to text files." };
872
+ ocr = { mayUse: false, description: "OCR does not apply to text files." };
807
873
  }
808
874
  break;
809
875
  default:
810
876
  nativeExtraction = { willAttempt: false, description: "Generic intent; see plan." };
811
- ocr3 = { mayUse: false, description: "See plan steps." };
877
+ ocr = { mayUse: false, description: "See plan steps." };
878
+ }
879
+ if (kind === "image" && preprocessHasEffect(ocrSlice?.preprocess)) {
880
+ limitations = [
881
+ ...limitations,
882
+ "options.ocr.preprocess applies to the single-frame `ocrImageDetailed` path; multipage TIFF (`ocrTiff`) does not run this preprocess on each frame."
883
+ ];
812
884
  }
813
885
  return finalizeDocxExplainReport({
814
886
  kind,
@@ -817,17 +889,27 @@ function buildNodeExplainReport(kind, intent, pdfOcr, plan, docxInclude) {
817
889
  intent,
818
890
  primaryAnalyzer,
819
891
  nativeExtraction,
820
- ocr: ocr3,
892
+ ocr,
821
893
  limitations,
822
894
  plan
823
895
  });
824
896
  }
825
-
826
- // src/introspection.ts
827
897
  function resolvePdfOcrMode(pdf) {
828
898
  return pdf?.ocr ?? "auto";
829
899
  }
830
- function planAnalyzeFile(kind, pdfOcr, docxInclude) {
900
+ function imageOcrPlanSteps(ocr) {
901
+ return [
902
+ { id: "detect_kind", status: "done" },
903
+ { id: "image_format_detect", status: "planned" },
904
+ { id: "normalize_image_for_ocr", status: "planned" },
905
+ {
906
+ id: "preprocess_image_for_ocr",
907
+ status: preprocessHasEffect(ocr?.preprocess) ? "planned" : "skipped"
908
+ },
909
+ { id: "ocr_tesseract", status: "planned" }
910
+ ];
911
+ }
912
+ function planAnalyzeFile(kind, pdfOcr, docxInclude, ocr) {
831
913
  switch (kind) {
832
914
  case "pdf":
833
915
  return {
@@ -855,10 +937,7 @@ function planAnalyzeFile(kind, pdfOcr, docxInclude) {
855
937
  case "image":
856
938
  return {
857
939
  intent: "analyzeFile",
858
- steps: [
859
- { id: "detect_kind", status: "done" },
860
- { id: "image_ocr", status: "planned" }
861
- ]
940
+ steps: imageOcrPlanSteps(ocr)
862
941
  };
863
942
  case "text":
864
943
  return {
@@ -878,9 +957,9 @@ function planAnalyzeFile(kind, pdfOcr, docxInclude) {
878
957
  };
879
958
  }
880
959
  }
881
- function planForIntent(intentOpt, kind, pdfOcrForAnalyze, docxInclude) {
960
+ function planForIntent(intentOpt, kind, pdfOcrForAnalyze, docxInclude, ocr) {
882
961
  const intent = intentOpt ?? "analyzeFile";
883
- if (intent === "analyzeFile") return planAnalyzeFile(kind, pdfOcrForAnalyze, docxInclude);
962
+ if (intent === "analyzeFile") return planAnalyzeFile(kind, pdfOcrForAnalyze, docxInclude, ocr);
884
963
  if (intent === "extractText") {
885
964
  if (kind === "pdf") {
886
965
  return {
@@ -893,7 +972,7 @@ function planForIntent(intentOpt, kind, pdfOcrForAnalyze, docxInclude) {
893
972
  ]
894
973
  };
895
974
  }
896
- const p = planAnalyzeFile(kind, "off", docxInclude);
975
+ const p = planAnalyzeFile(kind, "off", docxInclude, ocr);
897
976
  return { ...p, intent: "extractText" };
898
977
  }
899
978
  if (intent === "extractMetadata") {
@@ -977,10 +1056,7 @@ function planForIntent(intentOpt, kind, pdfOcrForAnalyze, docxInclude) {
977
1056
  if (kind === "image") {
978
1057
  return {
979
1058
  intent: "runOcr",
980
- steps: [
981
- { id: "detect_kind", status: "done" },
982
- { id: "tesseract_ocr", status: "planned" }
983
- ]
1059
+ steps: imageOcrPlanSteps(ocr)
984
1060
  };
985
1061
  }
986
1062
  if (kind === "docx") {
@@ -1002,7 +1078,7 @@ function planForIntent(intentOpt, kind, pdfOcrForAnalyze, docxInclude) {
1002
1078
  ]
1003
1079
  };
1004
1080
  }
1005
- return planAnalyzeFile(kind, pdfOcrForAnalyze, docxInclude);
1081
+ return planAnalyzeFile(kind, pdfOcrForAnalyze, docxInclude, ocr);
1006
1082
  }
1007
1083
  async function getCapabilities(input, options) {
1008
1084
  throwIfAborted(options?.signal);
@@ -1019,8 +1095,9 @@ async function explainAnalysisPlan(input, options) {
1019
1095
  const intent = options?.intent ?? "analyzeFile";
1020
1096
  const pdfOcrAnalyze = resolvePdfOcrMode(options?.pdf);
1021
1097
  const docxInc = options?.docx?.include;
1022
- const plan = planForIntent(intent, kind, pdfOcrAnalyze, docxInc);
1023
- return buildNodeExplainReport(kind, intent, pdfOcrAnalyze, plan, docxInc);
1098
+ const ocrSlice = options?.ocr;
1099
+ const plan = planForIntent(intent, kind, pdfOcrAnalyze, docxInc, ocrSlice);
1100
+ return buildNodeExplainReport(kind, intent, pdfOcrAnalyze, plan, docxInc, ocrSlice);
1024
1101
  }
1025
1102
 
1026
1103
  export { DOCX_EMBEDDED_IMAGE_CAPABILITIES, DOCX_STRUCTURE_CAPABILITIES, analyzeFile, bufferToInput, convertToHtml, docxIncludeRequested, explainAnalysisPlan, extractMetadata, extractText, getCapabilities, readFileToInput, resolveNodeAnalyzeInput, runOcr };
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@dragon708/docmind-node",
3
- "version": "1.7.0",
3
+ "version": "1.9.0",
4
4
  "description": "Official DocMind Node facade: analyzeFile, intent APIs, PDF/DOCX/OCR, and fs helpers.",
5
5
  "type": "module",
6
6
  "main": "./dist/index.js",
@@ -33,8 +33,8 @@
33
33
  "license": "MIT",
34
34
  "dependencies": {
35
35
  "@dragon708/docmind-docx": "^1.7.0",
36
- "@dragon708/docmind-ocr": "^1.0.0",
37
- "@dragon708/docmind-pdf": "^2.0.0",
36
+ "@dragon708/docmind-ocr": "^1.1.0",
37
+ "@dragon708/docmind-pdf": "^2.1.0",
38
38
  "@dragon708/docmind-shared": "^1.1.0"
39
39
  },
40
40
  "devDependencies": {