@dragon708/docmind-node 1.1.0 → 1.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.d.ts +91 -15
- package/dist/index.js +578 -186
- package/package.json +6 -5
- package/dist/index.js.map +0 -1
package/dist/index.d.ts
CHANGED
|
@@ -1,13 +1,25 @@
|
|
|
1
|
-
import { DetectFileKindInput, NamedInput, AnalysisResult } from '@dragon708/docmind-shared';
|
|
2
|
-
export { AnalysisAnalyzer, AnalysisResult, DetectFileKindInput, DocxAnalysisCoreResult, FileKind, FileKindMetadata, GenericAnalysisResult, ImageAnalysisCoreResult, PdfAnalysisCoreResult, TextAnalysisResult, detectFileKind } from '@dragon708/docmind-shared';
|
|
1
|
+
import { DocMindAnalyzeOptions, DetectFileKindInput, NamedInput, AnalysisResult, FileKind, RuntimeDescriptor, DocMindPublicIntent, AnalysisAnalyzer, ProcessingPlanDescriptor, ExplainAnalysisPlanOptions, GetCapabilitiesOptions } from '@dragon708/docmind-shared';
|
|
2
|
+
export { AnalysisAnalyzer, AnalysisResult, CapabilityDescriptor, DetectFileKindInput, DocMindPublicIntent, DocxAnalysisCoreResult, ExplainAnalysisPlanOptions, ExplainAnalysisPlanResult, FileKind, FileKindMetadata, GenericAnalysisResult, GetCapabilitiesOptions, GetCapabilitiesResult, ImageAnalysisCoreResult, PdfAnalysisCoreResult, TextAnalysisResult, detectFileKind } from '@dragon708/docmind-shared';
|
|
3
3
|
import { OcrOptions } from '@dragon708/docmind-ocr';
|
|
4
4
|
import { PdfAnalyzeOptions } from '@dragon708/docmind-pdf';
|
|
5
5
|
|
|
6
|
-
/**
|
|
7
|
-
|
|
8
|
-
|
|
6
|
+
/**
 * Options for Node public APIs (`analyzeFile`, intent methods).
 *
 * - **`pdf`**: forwarded to `@dragon708/docmind-pdf`. `analyzeFile` defaults `pdf.ocr` to `"auto"` when omitted.
 *   {@link extractText} / {@link convertToHtml} merge a default of `ocr: "off"` unless you set `pdf.ocr` explicitly.
 * - **`ocr`**: forwarded to `@dragon708/docmind-ocr` for raster images; language string also feeds PDF OCR when `pdf.ocrLangs` is unset.
 * - **`pdfNativeTextSource`**: when `pdf.ocr` resolves to `"off"`, chooses how native text is obtained (see {@link extractText} default).
 */
interface NodeAnalyzeOptions extends DocMindAnalyzeOptions {
    /** PDF pipeline options, forwarded to `@dragon708/docmind-pdf`. */
    readonly pdf?: PdfAnalyzeOptions;
    /** OCR options, forwarded to `@dragon708/docmind-ocr` for raster images. */
    readonly ocr?: OcrOptions;
    /**
     * Native PDF text when `pdf.ocr` is `"off"`:
     * - **`pdfjs-per-page`** (default for {@link extractText}): PDF.js text per page, merged for `text` (aligns with OCR raster engine).
     * - **`pdf-parse`**: single pdf-parse pass (default for {@link analyzeFile} when you set `pdf.ocr: "off"` without this flag).
     */
    readonly pdfNativeTextSource?: "pdf-parse" | "pdfjs-per-page";
}
|
|
12
24
|
|
|
13
25
|
/**
|
|
@@ -27,29 +39,93 @@ declare function bufferToInput(buffer: Buffer, name?: string): NamedInput<Buffer
|
|
|
27
39
|
declare function resolveNodeAnalyzeInput(input: NodeAnalyzeInput): Promise<DetectFileKindInput>;
|
|
28
40
|
|
|
29
41
|
/**
|
|
30
|
-
*
|
|
42
|
+
* Resolves {@link NodeAnalyzeInput} (paths read from disk), classifies with {@link detectFileKind}, then runs
|
|
43
|
+
* the PDF, DOCX, image, or text pipeline. PDF OCR defaults to `"auto"` when `options.pdf.ocr` is omitted.
|
|
31
44
|
*/
|
|
32
45
|
declare function analyzeFile(input: NodeAnalyzeInput, options?: NodeAnalyzeOptions): Promise<AnalysisResult>;
|
|
33
46
|
|
|
34
47
|
/**
|
|
35
|
-
*
|
|
36
|
-
*
|
|
48
|
+
* Plain-text extraction using {@link analyzeFile} routing. PDFs default to **native text only**
|
|
49
|
+
* (`pdf.ocr: "off"`) unless you set `options.pdf.ocr` explicitly, and to **PDF.js per-page** assembly
|
|
50
|
+
* (`pdfNativeTextSource: "pdfjs-per-page"`) unless you set `options.pdfNativeTextSource` or `pdf.ocr` enables OCR.
|
|
37
51
|
*/
|
|
38
52
|
declare function extractText(input: NodeAnalyzeInput, options?: NodeAnalyzeOptions): Promise<AnalysisResult>;
|
|
39
53
|
/**
|
|
40
|
-
*
|
|
41
|
-
*
|
|
54
|
+
* Metadata: PDF uses lightweight metadata extraction; DOCX/images return stubs; plain text uses the same
|
|
55
|
+
* router as {@link extractText} (`analyzeFile` with PDF OCR off by default).
|
|
42
56
|
*/
|
|
43
57
|
declare function extractMetadata(input: NodeAnalyzeInput, options?: NodeAnalyzeOptions): Promise<AnalysisResult>;
|
|
44
58
|
/**
|
|
45
|
-
* HTML: DOCX
|
|
46
|
-
*
|
|
59
|
+
* HTML: DOCX and plain text go through {@link analyzeFile} (then `<pre>` for text). PDF uses the text layer
|
|
60
|
+
* only wrapped in `<pre>` (no OCR). Images return a stub without running OCR.
|
|
47
61
|
*/
|
|
48
62
|
declare function convertToHtml(input: NodeAnalyzeInput, options?: NodeAnalyzeOptions): Promise<AnalysisResult>;
|
|
49
63
|
/**
|
|
50
|
-
* OCR: PDF
|
|
51
|
-
*
|
|
64
|
+
* OCR intent: PDF always runs {@link analyzePdf} with `ocr: "force"` (merged with `options.pdf`).
|
|
65
|
+
* Raster images run Tesseract via `options.ocr`. DOCX returns structured extract with a notice.
|
|
52
66
|
*/
|
|
53
67
|
declare function runOcr(input: NodeAnalyzeInput, options?: NodeAnalyzeOptions): Promise<AnalysisResult>;
|
|
54
68
|
|
|
55
|
-
|
|
69
|
+
/**
 * Whether DocMind will try a non-OCR text/HTML path (e.g. Mammoth, pdf-parse text layer, UTF-8).
 */
interface NativeExtractionPlan {
    /** `true` when a native (non-OCR) extraction step is part of the plan. */
    readonly willAttempt: boolean;
    /** Human-readable summary of the native extraction step, or of why there is none. */
    readonly description: string;
}
|
|
76
|
+
/** Whether OCR (raster or PDF pipeline) may run for this intent + kind. */
interface OcrPlan {
    /** `true` when the OCR pipeline may be invoked; `false` when it is never used on this path. */
    readonly mayUse: boolean;
    /** Human-readable summary of when (or why not) OCR runs. */
    readonly description: string;
}
|
|
81
|
+
/**
 * Structured explanation of what DocMind would do for a public intent in Node (no heavy I/O).
 */
interface ExplainAnalysisPlanReport {
    /** File kind the plan was built for. */
    readonly kind: FileKind;
    /** File kind reported by detection. */
    readonly detectedKind: FileKind;
    /** Runtime the plan applies to. */
    readonly runtime: RuntimeDescriptor;
    /** Public intent being explained; arbitrary strings are accepted in addition to the known intents. */
    readonly intent: DocMindPublicIntent | (string & {});
    /** Analyzer that would handle the input. */
    readonly primaryAnalyzer: AnalysisAnalyzer;
    /** Non-OCR extraction step of the plan. */
    readonly nativeExtraction: NativeExtractionPlan;
    /** OCR step of the plan. */
    readonly ocr: OcrPlan;
    /** Known limitations of this intent for this kind; empty when there are none. */
    readonly limitations: readonly string[];
    /** Processing plan descriptor carried alongside the explanation. */
    readonly plan: ProcessingPlanDescriptor;
    /** Optional warnings attached to the report. */
    readonly warnings?: readonly string[];
}
|
|
96
|
+
|
|
97
|
+
/** High-level features the user can ask DocMind for (per input kind and runtime). */
type PublicCapabilityId = "text" | "metadata" | "html" | "ocr" | "pages";
|
|
99
|
+
/** Whether a {@link PublicCapabilityId} applies to the detected file in this runtime. */
interface PublicCapabilitySupport {
    /** Capability this entry describes. */
    readonly id: PublicCapabilityId;
    /** `true` when the capability applies to the detected file kind. */
    readonly supported: boolean;
    /** Optional notes or caveats for this capability. */
    readonly warnings?: readonly string[];
}
|
|
105
|
+
/**
 * Result of {@link getCapabilities}: detected kind, runtime id, per-feature support for this input, and optional global warnings.
 */
interface GetCapabilitiesReport {
    /** Detected file kind. */
    readonly kind: FileKind;
    /** Runtime the report applies to. */
    readonly runtime: RuntimeDescriptor;
    /** One entry per public capability for this input. */
    readonly capabilities: readonly PublicCapabilitySupport[];
    /** Optional report-level warnings. */
    readonly warnings?: readonly string[];
}
|
|
114
|
+
|
|
115
|
+
/** Options for {@link explainAnalysisPlan} including PDF/OCR hints for accurate planning. */
type NodeExplainAnalysisPlanOptions = ExplainAnalysisPlanOptions & Pick<NodeAnalyzeOptions, "pdf" | "ocr">;
|
|
117
|
+
|
|
118
|
+
/**
 * Epic 1 — **Capabilities:** after resolving {@link NodeAnalyzeInput}, lists which of
 * `text` | `metadata` | `html` | `ocr` | `pages` apply for that kind in Node (for PDF, `text` / `metadata` /
 * `pages` / `ocr` describe the v2 pdf-parse + PDF.js + OCR stack; see {@link buildNodeCapabilityReport}).
 * Does not run Mammoth/Tesseract/PDF bodies beyond path resolution.
 *
 * @param input - File path, URL, or in-memory input to classify.
 * @param options - Optional capability-query options.
 * @returns The capability report for the detected file kind.
 */
declare function getCapabilities(input: NodeAnalyzeInput, options?: GetCapabilitiesOptions): Promise<GetCapabilitiesReport>;
|
|
125
|
+
/**
 * Epic 1 — **Plan preview:** same shape as browser; PDF branches include `pdf.ocr` from options (`off` | `auto` | `force`).
 * No full document parse unless resolving a path reads the file.
 *
 * @param input - File path, URL, or in-memory input to plan for.
 * @param options - Plan options plus the `pdf` / `ocr` hints used for planning.
 * @returns The structured plan explanation.
 */
declare function explainAnalysisPlan(input: NodeAnalyzeInput, options?: NodeExplainAnalysisPlanOptions): Promise<ExplainAnalysisPlanReport>;
|
|
130
|
+
|
|
131
|
+
export { type ExplainAnalysisPlanReport, type GetCapabilitiesReport, type NativeExtractionPlan, type NodeAnalyzeInput, type NodeAnalyzeOptions, type NodeExplainAnalysisPlanOptions, type OcrPlan, type PublicCapabilityId, type PublicCapabilitySupport, analyzeFile, bufferToInput, convertToHtml, explainAnalysisPlan, extractMetadata, extractText, getCapabilities, readFileToInput, resolveNodeAnalyzeInput, runOcr };
|
package/dist/index.js
CHANGED
|
@@ -1,8 +1,8 @@
|
|
|
1
1
|
import { assertValidAnalyzeFileInput, detectFileKind, notImplementedResult, UNKNOWN_FORMAT_WARNING, analyzeText, toUint8Array, isNamedInput, isBinaryInput, isBlob, isFile } from '@dragon708/docmind-shared';
|
|
2
2
|
export { detectFileKind } from '@dragon708/docmind-shared';
|
|
3
|
-
import {
|
|
3
|
+
import { analyzeDocx } from '@dragon708/docmind-docx';
|
|
4
4
|
import { ocr } from '@dragon708/docmind-ocr';
|
|
5
|
-
import {
|
|
5
|
+
import { extractPdfMetadata, extractTextFromPdf, analyzePdf, extractPdfTextByPage } from '@dragon708/docmind-pdf';
|
|
6
6
|
import { readFile } from 'fs/promises';
|
|
7
7
|
import { basename } from 'path';
|
|
8
8
|
import { fileURLToPath } from 'url';
|
|
@@ -112,17 +112,45 @@ async function analyzePdfForNode(input, options) {
|
|
|
112
112
|
signal: userPdf?.signal ?? options?.signal
|
|
113
113
|
};
|
|
114
114
|
const r = await analyzePdf(data, pdfOpts);
|
|
115
|
+
const usePdfJsPerPage = pdfOpts.ocr === "off" && (options?.pdfNativeTextSource ?? "pdf-parse") === "pdfjs-per-page";
|
|
116
|
+
if (!usePdfJsPerPage) {
|
|
117
|
+
return {
|
|
118
|
+
fileKind: "pdf",
|
|
119
|
+
analyzer: "pdf",
|
|
120
|
+
status: "ok",
|
|
121
|
+
kind: "pdf",
|
|
122
|
+
text: r.text,
|
|
123
|
+
pages: r.pages,
|
|
124
|
+
metadata: r.metadata,
|
|
125
|
+
warnings: [...r.warnings],
|
|
126
|
+
needsOCR: r.needsOCR,
|
|
127
|
+
ocrUsed: r.ocrUsed
|
|
128
|
+
};
|
|
129
|
+
}
|
|
130
|
+
let text = r.text;
|
|
131
|
+
const extra = [];
|
|
132
|
+
try {
|
|
133
|
+
const rows = await extractPdfTextByPage(data, {
|
|
134
|
+
maxPages: pdfOpts.maxPages,
|
|
135
|
+
signal: pdfOpts.signal
|
|
136
|
+
});
|
|
137
|
+
text = rows.map((row) => row.text).join("\n\n");
|
|
138
|
+
} catch (e) {
|
|
139
|
+
const msg = e instanceof Error ? e.message : String(e);
|
|
140
|
+
extra.push(`warning: PDF.js per-page text failed; using pdf-parse text: ${msg}`);
|
|
141
|
+
}
|
|
142
|
+
const needsOCR = r.pages > 0 && text.trim().length === 0;
|
|
115
143
|
return {
|
|
116
144
|
fileKind: "pdf",
|
|
117
145
|
analyzer: "pdf",
|
|
118
146
|
status: "ok",
|
|
119
147
|
kind: "pdf",
|
|
120
|
-
text
|
|
148
|
+
text,
|
|
121
149
|
pages: r.pages,
|
|
122
150
|
metadata: r.metadata,
|
|
123
|
-
warnings: [...r.warnings],
|
|
124
|
-
needsOCR
|
|
125
|
-
ocrUsed:
|
|
151
|
+
warnings: [...r.warnings, ...extra],
|
|
152
|
+
needsOCR,
|
|
153
|
+
ocrUsed: false
|
|
126
154
|
};
|
|
127
155
|
}
|
|
128
156
|
function toPathString(pathOrUrl) {
|
|
@@ -169,6 +197,19 @@ async function analyzeFile(input, options) {
|
|
|
169
197
|
return notImplementedResult(fileKind, "none", [UNKNOWN_FORMAT_WARNING]);
|
|
170
198
|
}
|
|
171
199
|
}
|
|
200
|
+
|
|
201
|
+
// src/intentPdfOptions.ts
|
|
202
|
+
/**
 * Returns a shallow copy of `options` whose `pdf.ocr` falls back to `"off"`.
 *
 * An explicit `options.pdf.ocr` is kept; only a missing (`null`/`undefined`) value is
 * replaced. All other top-level and `pdf` properties are copied through unchanged, and
 * the input object is not mutated. `options` itself may be `undefined`.
 */
function withPdfOcrDefaultOff(options) {
  return {
    ...options,
    pdf: {
      ...options?.pdf,
      ocr: options?.pdf?.ocr ?? "off"
    }
  };
}
|
|
211
|
+
|
|
212
|
+
// src/internal/abort.ts
|
|
172
213
|
function throwIfAborted(signal) {
|
|
173
214
|
if (signal?.aborted) {
|
|
174
215
|
const err = new Error("The operation was aborted");
|
|
@@ -176,117 +217,40 @@ function throwIfAborted(signal) {
|
|
|
176
217
|
throw err;
|
|
177
218
|
}
|
|
178
219
|
}
|
|
220
|
+
|
|
221
|
+
// src/publicActions.ts
|
|
222
|
+
// Warning text for metadata requests on DOCX input.
var DOCX_METADATA_STUB = "Structured document metadata for DOCX is not exposed as a separate API; use extractText or analyzeFile.";
// Warning text for metadata requests on raster images.
var IMAGE_METADATA_NOTE = "Raster images have no document metadata bundle in this API.";
// Warning prepended to runOcr results for PDFs, which run with forced OCR.
var RUN_OCR_PDF_FORCE_SEMANTICS = 'runOcr: PDF pipeline ran with `ocr: "force"` so text may include raster OCR output even when a text layer exists.';
|
|
225
|
+
/**
 * Escapes the four characters that are unsafe inside HTML text and double-quoted
 * attribute values: `&`, `<`, `>` and `"`.
 *
 * `&` is replaced first so the ampersands introduced by the later replacements are
 * not escaped a second time.
 */
function escapeHtmlMinimal(s) {
  return s.replace(/&/g, "&amp;").replace(/</g, "&lt;").replace(/>/g, "&gt;").replace(/"/g, "&quot;");
}
|
|
179
228
|
/**
 * Resolves a Node analyze input and validates it before any analyzer runs.
 *
 * The input is passed through `resolveNodeAnalyzeInput`, then checked with
 * `assertValidAnalyzeFileInput` (presumably throws on invalid input — confirm in
 * `@dragon708/docmind-shared`). Returns the resolved input.
 */
async function prepare(input) {
  const resolved = await resolveNodeAnalyzeInput(input);
  assertValidAnalyzeFileInput(resolved);
  return resolved;
}
|
|
184
|
-
function
|
|
185
|
-
|
|
233
|
+
/**
 * Shapes a full analysis result into the `extractText` response.
 *
 * Results whose status is not `"ok"` are returned as-is. For successful DOCX results a
 * copy is returned with `html` cleared to an empty string; every other successful
 * result is returned unchanged.
 */
function toExtractTextResult(full) {
  if (full.status !== "ok") return full;
  if (full.fileKind === "docx") {
    return { ...full, html: "" };
  }
  return full;
}
|
|
187
|
-
var DOCX_METADATA_STUB = "Structured document metadata for DOCX is not exposed as a separate API; use extractText or analyzeFile.";
|
|
188
|
-
var IMAGE_METADATA_NOTE = "Raster images have no document metadata bundle in this API.";
|
|
189
240
|
async function extractText(input, options) {
|
|
190
241
|
throwIfAborted(options?.signal);
|
|
191
|
-
const
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
if (data.byteLength === 0) {
|
|
198
|
-
return {
|
|
199
|
-
fileKind: "pdf",
|
|
200
|
-
analyzer: "pdf",
|
|
201
|
-
status: "ok",
|
|
202
|
-
kind: "pdf",
|
|
203
|
-
text: "",
|
|
204
|
-
pages: 0,
|
|
205
|
-
metadata: { info: {} },
|
|
206
|
-
warnings: ["No document bytes were provided for analysis."],
|
|
207
|
-
needsOCR: false,
|
|
208
|
-
ocrUsed: false
|
|
209
|
-
};
|
|
210
|
-
}
|
|
211
|
-
const r = await extractTextFromPdf(data);
|
|
212
|
-
return {
|
|
213
|
-
fileKind: "pdf",
|
|
214
|
-
analyzer: "pdf",
|
|
215
|
-
status: "ok",
|
|
216
|
-
kind: "pdf",
|
|
217
|
-
text: r.text,
|
|
218
|
-
pages: r.pages,
|
|
219
|
-
metadata: { info: {} },
|
|
220
|
-
warnings: r.warnings,
|
|
221
|
-
needsOCR: false,
|
|
222
|
-
ocrUsed: false
|
|
223
|
-
};
|
|
224
|
-
}
|
|
225
|
-
case "docx": {
|
|
226
|
-
const data = await bytesFromDetectInput(resolved);
|
|
227
|
-
if (data.byteLength === 0) {
|
|
228
|
-
return {
|
|
229
|
-
fileKind: "docx",
|
|
230
|
-
analyzer: "docx",
|
|
231
|
-
status: "ok",
|
|
232
|
-
kind: "docx",
|
|
233
|
-
text: "",
|
|
234
|
-
html: "",
|
|
235
|
-
warnings: ["No document bytes were provided for analysis."]
|
|
236
|
-
};
|
|
237
|
-
}
|
|
238
|
-
const r = await extractTextFromDocx(data);
|
|
239
|
-
return {
|
|
240
|
-
fileKind: "docx",
|
|
241
|
-
analyzer: "docx",
|
|
242
|
-
status: "ok",
|
|
243
|
-
kind: "docx",
|
|
244
|
-
text: r.text,
|
|
245
|
-
html: "",
|
|
246
|
-
warnings: r.warnings
|
|
247
|
-
};
|
|
248
|
-
}
|
|
249
|
-
case "image": {
|
|
250
|
-
const data = await bytesFromDetectInput(resolved);
|
|
251
|
-
if (data.byteLength === 0) {
|
|
252
|
-
return {
|
|
253
|
-
fileKind: "image",
|
|
254
|
-
analyzer: "image",
|
|
255
|
-
status: "ok",
|
|
256
|
-
kind: "image",
|
|
257
|
-
text: "",
|
|
258
|
-
confidence: 0,
|
|
259
|
-
ocrUsed: true,
|
|
260
|
-
warnings: ["No image bytes were provided for analysis."]
|
|
261
|
-
};
|
|
262
|
-
}
|
|
263
|
-
const ocrOpts = {
|
|
264
|
-
...options?.ocr ?? {},
|
|
265
|
-
signal: options?.ocr?.signal ?? signal
|
|
266
|
-
};
|
|
267
|
-
const r = await ocr(data, ocrOpts);
|
|
268
|
-
return {
|
|
269
|
-
fileKind: "image",
|
|
270
|
-
analyzer: "image",
|
|
271
|
-
status: "ok",
|
|
272
|
-
kind: "image",
|
|
273
|
-
text: r.text,
|
|
274
|
-
confidence: r.confidence,
|
|
275
|
-
ocrUsed: r.ocrUsed,
|
|
276
|
-
warnings: []
|
|
277
|
-
};
|
|
278
|
-
}
|
|
279
|
-
case "text":
|
|
280
|
-
return analyzeText(resolved, { signal });
|
|
281
|
-
default:
|
|
282
|
-
return notImplementedResult(kind, "none", [UNKNOWN_FORMAT_WARNING]);
|
|
283
|
-
}
|
|
242
|
+
const merged = {
|
|
243
|
+
...withPdfOcrDefaultOff(options),
|
|
244
|
+
pdfNativeTextSource: options?.pdfNativeTextSource ?? "pdfjs-per-page"
|
|
245
|
+
};
|
|
246
|
+
const full = await analyzeFile(input, merged);
|
|
247
|
+
return toExtractTextResult(full);
|
|
284
248
|
}
|
|
285
249
|
async function extractMetadata(input, options) {
|
|
286
250
|
throwIfAborted(options?.signal);
|
|
287
251
|
const resolved = await prepare(input);
|
|
288
252
|
const kind = detectFileKind(resolved);
|
|
289
|
-
|
|
253
|
+
options?.signal;
|
|
290
254
|
switch (kind) {
|
|
291
255
|
case "pdf": {
|
|
292
256
|
const data = await bytesFromDetectInput(resolved);
|
|
@@ -340,7 +304,7 @@ async function extractMetadata(input, options) {
|
|
|
340
304
|
warnings: [IMAGE_METADATA_NOTE]
|
|
341
305
|
};
|
|
342
306
|
case "text":
|
|
343
|
-
return
|
|
307
|
+
return analyzeFile(input, withPdfOcrDefaultOff(options));
|
|
344
308
|
default:
|
|
345
309
|
return notImplementedResult(kind, "none", [UNKNOWN_FORMAT_WARNING]);
|
|
346
310
|
}
|
|
@@ -349,96 +313,72 @@ async function convertToHtml(input, options) {
|
|
|
349
313
|
throwIfAborted(options?.signal);
|
|
350
314
|
const resolved = await prepare(input);
|
|
351
315
|
const kind = detectFileKind(resolved);
|
|
352
|
-
|
|
353
|
-
|
|
354
|
-
|
|
355
|
-
|
|
356
|
-
|
|
357
|
-
|
|
358
|
-
|
|
359
|
-
|
|
360
|
-
|
|
361
|
-
|
|
362
|
-
|
|
363
|
-
|
|
364
|
-
|
|
365
|
-
|
|
366
|
-
|
|
367
|
-
|
|
368
|
-
|
|
369
|
-
|
|
370
|
-
|
|
371
|
-
|
|
372
|
-
|
|
373
|
-
analyzer: "docx",
|
|
374
|
-
status: "ok",
|
|
375
|
-
kind: "docx",
|
|
376
|
-
text: textPart.text,
|
|
377
|
-
html: htmlPart.html,
|
|
378
|
-
warnings: [...textPart.warnings, ...htmlPart.warnings]
|
|
379
|
-
};
|
|
380
|
-
}
|
|
381
|
-
case "pdf": {
|
|
382
|
-
const data = await bytesFromDetectInput(resolved);
|
|
383
|
-
if (data.byteLength === 0) {
|
|
384
|
-
return {
|
|
385
|
-
fileKind: "pdf",
|
|
386
|
-
analyzer: "pdf",
|
|
387
|
-
status: "ok",
|
|
388
|
-
kind: "pdf",
|
|
389
|
-
text: "",
|
|
390
|
-
pages: 0,
|
|
391
|
-
metadata: { info: {} },
|
|
392
|
-
warnings: ["No document bytes were provided for analysis."],
|
|
393
|
-
needsOCR: false,
|
|
394
|
-
ocrUsed: false
|
|
395
|
-
};
|
|
396
|
-
}
|
|
397
|
-
const r = await extractTextFromPdf(data);
|
|
398
|
-
const html = `<pre>${escapeHtmlMinimal(r.text)}</pre>`;
|
|
316
|
+
options?.signal;
|
|
317
|
+
if (kind === "docx") {
|
|
318
|
+
return analyzeFile(input, withPdfOcrDefaultOff(options));
|
|
319
|
+
}
|
|
320
|
+
if (kind === "text") {
|
|
321
|
+
const r = await analyzeFile(input, withPdfOcrDefaultOff(options));
|
|
322
|
+
if (r.status !== "ok") return r;
|
|
323
|
+
if (r.fileKind !== "text") return r;
|
|
324
|
+
const html = `<pre>${escapeHtmlMinimal(r.text)}</pre>`;
|
|
325
|
+
return {
|
|
326
|
+
...r,
|
|
327
|
+
html,
|
|
328
|
+
warnings: [
|
|
329
|
+
...r.warnings,
|
|
330
|
+
"HTML for plain text is a <pre> wrapper around decoded UTF-8 content."
|
|
331
|
+
]
|
|
332
|
+
};
|
|
333
|
+
}
|
|
334
|
+
if (kind === "pdf") {
|
|
335
|
+
const data = await bytesFromDetectInput(resolved);
|
|
336
|
+
if (data.byteLength === 0) {
|
|
399
337
|
return {
|
|
400
338
|
fileKind: "pdf",
|
|
401
339
|
analyzer: "pdf",
|
|
402
340
|
status: "ok",
|
|
403
341
|
kind: "pdf",
|
|
404
|
-
text:
|
|
405
|
-
pages:
|
|
342
|
+
text: "",
|
|
343
|
+
pages: 0,
|
|
406
344
|
metadata: { info: {} },
|
|
407
|
-
|
|
408
|
-
warnings: [
|
|
409
|
-
...r.warnings,
|
|
410
|
-
"PDF HTML is a plain-text preview wrapped in <pre> (not a visual layout)."
|
|
411
|
-
],
|
|
345
|
+
warnings: ["No document bytes were provided for analysis."],
|
|
412
346
|
needsOCR: false,
|
|
413
347
|
ocrUsed: false
|
|
414
348
|
};
|
|
415
349
|
}
|
|
416
|
-
|
|
417
|
-
|
|
418
|
-
|
|
419
|
-
|
|
420
|
-
|
|
421
|
-
|
|
422
|
-
|
|
423
|
-
|
|
424
|
-
|
|
425
|
-
|
|
426
|
-
|
|
427
|
-
|
|
428
|
-
|
|
429
|
-
|
|
430
|
-
|
|
431
|
-
|
|
432
|
-
|
|
433
|
-
|
|
434
|
-
text: "",
|
|
435
|
-
confidence: 0,
|
|
436
|
-
ocrUsed: true,
|
|
437
|
-
warnings: ["No HTML representation for raster images; use extractText / runOcr."]
|
|
438
|
-
};
|
|
439
|
-
default:
|
|
440
|
-
return notImplementedResult(kind, "none", [UNKNOWN_FORMAT_WARNING]);
|
|
350
|
+
const r = await extractTextFromPdf(data);
|
|
351
|
+
const html = `<pre>${escapeHtmlMinimal(r.text)}</pre>`;
|
|
352
|
+
return {
|
|
353
|
+
fileKind: "pdf",
|
|
354
|
+
analyzer: "pdf",
|
|
355
|
+
status: "ok",
|
|
356
|
+
kind: "pdf",
|
|
357
|
+
text: r.text,
|
|
358
|
+
pages: r.pages,
|
|
359
|
+
metadata: { info: {} },
|
|
360
|
+
html,
|
|
361
|
+
warnings: [
|
|
362
|
+
...r.warnings,
|
|
363
|
+
"PDF HTML is a plain-text preview wrapped in <pre> (not a visual layout)."
|
|
364
|
+
],
|
|
365
|
+
needsOCR: false,
|
|
366
|
+
ocrUsed: false
|
|
367
|
+
};
|
|
441
368
|
}
|
|
369
|
+
if (kind === "image") {
|
|
370
|
+
return {
|
|
371
|
+
fileKind: "image",
|
|
372
|
+
analyzer: "image",
|
|
373
|
+
status: "ok",
|
|
374
|
+
kind: "image",
|
|
375
|
+
text: "",
|
|
376
|
+
confidence: 0,
|
|
377
|
+
ocrUsed: true,
|
|
378
|
+
warnings: ["No HTML representation for raster images; use extractText / runOcr."]
|
|
379
|
+
};
|
|
380
|
+
}
|
|
381
|
+
return notImplementedResult(kind, "none", [UNKNOWN_FORMAT_WARNING]);
|
|
442
382
|
}
|
|
443
383
|
async function runOcr(input, options) {
|
|
444
384
|
throwIfAborted(options?.signal);
|
|
@@ -477,7 +417,7 @@ async function runOcr(input, options) {
|
|
|
477
417
|
text: r.text,
|
|
478
418
|
pages: r.pages,
|
|
479
419
|
metadata: r.metadata,
|
|
480
|
-
warnings: [...r.warnings],
|
|
420
|
+
warnings: [RUN_OCR_PDF_FORCE_SEMANTICS, ...r.warnings],
|
|
481
421
|
needsOCR: r.needsOCR,
|
|
482
422
|
ocrUsed: r.ocrUsed
|
|
483
423
|
};
|
|
@@ -546,6 +486,458 @@ async function runOcr(input, options) {
|
|
|
546
486
|
}
|
|
547
487
|
}
|
|
548
488
|
|
|
549
|
-
|
|
489
|
+
// src/analysisPlanReport.ts
|
|
490
|
+
/**
 * Collects its arguments into an array, dropping falsy entries
 * (`""`, `null`, `undefined`, `false`, `0`).
 */
function lim(...items) {
  return items.filter(Boolean);
}
|
|
493
|
+
/**
 * Builds the Node explain-plan report for a detected file kind and a public intent.
 *
 * The function only assembles static descriptions; it performs no I/O.
 *
 * @param kind   Detected file kind (`"pdf"`, `"docx"`, `"image"`, `"text"` or `"unknown"`).
 * @param intent Public intent name (`"analyzeFile"`, `"extractText"`, `"extractMetadata"`,
 *               `"convertToHtml"`, `"runOcr"`); any other value gets a generic entry.
 * @param pdfOcr PDF OCR mode (`"off"`, `"auto"` or `"force"`). It is only consulted for
 *               the `analyzeFile` + PDF combination.
 * @param plan   Processing plan descriptor, passed through unchanged.
 * @returns Report with `kind`, `detectedKind` (same value as `kind`), `runtime`,
 *          `intent`, `primaryAnalyzer`, `nativeExtraction`, `ocr`, `limitations` and `plan`.
 */
function buildNodeExplainReport(kind, intent, pdfOcr, plan) {
  const runtime = { id: "node" };
  const primaryAnalyzer = kind === "pdf" ? "pdf" : kind === "docx" ? "docx" : kind === "image" ? "image" : kind === "text" ? "text" : "none";
  let nativeExtraction;
  let ocrPlan;
  let limitations = [];
  // Unknown kinds short-circuit: no analyzer and no OCR, regardless of intent.
  if (kind === "unknown") {
    limitations = [
      "Could not classify the file from name, MIME, or bytes; analysis will return not_implemented until hints improve."
    ];
    return {
      kind,
      detectedKind: kind,
      runtime,
      intent,
      primaryAnalyzer: "none",
      nativeExtraction: { willAttempt: false, description: "No analyzer without a known file kind." },
      ocr: { mayUse: false, description: "OCR is not used for unknown kinds." },
      limitations,
      plan
    };
  }
  // In every intent below, the trailing `else` branch handles the kinds not named
  // before it (plain text, except in convertToHtml where it is the image branch).
  switch (intent) {
    case "analyzeFile":
      if (kind === "pdf") {
        nativeExtraction = {
          willAttempt: true,
          description: "pdf-parse supplies embedded text, metadata, and page count; PDF.js drives raster OCR when enabled."
        };
        ocrPlan = {
          mayUse: pdfOcr !== "off",
          description: pdfOcr === "off" ? "Raster OCR pipeline is off (pdf.ocr: off)." : pdfOcr === "force" ? "Raster OCR may run on all pages when pdf.ocr is force." : "Raster OCR may run when native text looks insufficient (pdf.ocr: auto + heuristics)."
        };
      } else if (kind === "docx") {
        nativeExtraction = {
          willAttempt: true,
          description: "Mammoth extracts text and HTML from OOXML."
        };
        ocrPlan = { mayUse: false, description: "DOCX does not use OCR in DocMind." };
      } else if (kind === "image") {
        nativeExtraction = {
          willAttempt: false,
          description: "Images have no native text layer; text comes from OCR only."
        };
        ocrPlan = { mayUse: true, description: "Tesseract runs on supported raster formats." };
      } else {
        nativeExtraction = {
          willAttempt: true,
          description: "UTF-8 decode with BOM handling for plain text."
        };
        ocrPlan = { mayUse: false, description: "OCR does not apply to text files." };
      }
      break;
    case "extractText":
      if (kind === "pdf") {
        nativeExtraction = {
          willAttempt: true,
          description: "Node: pdf-parse for metadata/page baseline, then PDF.js per-page text merged into `text` (pdfNativeTextSource pdfjs-per-page default)."
        };
        ocrPlan = {
          mayUse: false,
          description: "extractText defaults pdf.ocr off; set pdf.ocr explicitly to allow auto/force raster OCR."
        };
      } else if (kind === "docx") {
        nativeExtraction = {
          willAttempt: true,
          description: "Mammoth plain text; HTML cleared in the extractText response."
        };
        ocrPlan = { mayUse: false, description: "DOCX does not use OCR." };
      } else if (kind === "image") {
        nativeExtraction = { willAttempt: false, description: "No embedded text layer." };
        ocrPlan = { mayUse: true, description: "OCR produces text for images." };
      } else {
        nativeExtraction = {
          willAttempt: true,
          description: "UTF-8 decode only."
        };
        ocrPlan = { mayUse: false, description: "OCR does not apply." };
      }
      break;
    case "extractMetadata":
      if (kind === "pdf") {
        nativeExtraction = {
          willAttempt: true,
          description: "Lightweight PDF info/XMP normalization without full OCR."
        };
        ocrPlan = { mayUse: false, description: "extractMetadata does not run the OCR pipeline." };
      } else if (kind === "docx" || kind === "image") {
        nativeExtraction = {
          willAttempt: false,
          description: "Stub response; no heavy extractor."
        };
        ocrPlan = { mayUse: false, description: "OCR not used for this metadata path." };
        limitations = [
          kind === "docx" ? "Structured DOCX metadata is not exposed separately." : "Raster images have no document metadata bundle."
        ];
      } else {
        nativeExtraction = {
          willAttempt: true,
          description: "Decoded text only; no structured document metadata."
        };
        ocrPlan = { mayUse: false, description: "OCR does not apply." };
        limitations = ["Plain text has no structured document metadata."];
      }
      break;
    case "convertToHtml":
      if (kind === "pdf") {
        nativeExtraction = {
          willAttempt: true,
          description: "Text layer extracted then wrapped in <pre> (not visual layout)."
        };
        ocrPlan = { mayUse: false, description: "convertToHtml does not run PDF OCR." };
        limitations = ["PDF HTML is a plain-text preview, not page layout."];
      } else if (kind === "docx") {
        nativeExtraction = {
          willAttempt: true,
          description: "Mammoth HTML output via analyzeFile routing."
        };
        ocrPlan = { mayUse: false, description: "DOCX path does not use OCR." };
      } else if (kind === "text") {
        nativeExtraction = {
          willAttempt: true,
          description: "UTF-8 decode then <pre> wrapper."
        };
        ocrPlan = { mayUse: false, description: "OCR does not apply." };
      } else {
        nativeExtraction = {
          willAttempt: false,
          description: "No HTML path for raster images."
        };
        ocrPlan = { mayUse: false, description: "OCR does not emit layout HTML here." };
        limitations = ["Use extractText or runOcr for image text."];
      }
      break;
    case "runOcr":
      if (kind === "pdf") {
        nativeExtraction = {
          willAttempt: true,
          description: "pdf-parse runs first; text may be replaced by raster OCR output."
        };
        ocrPlan = {
          mayUse: true,
          description: 'runOcr always sets pdf.ocr to "force" for PDFs.'
        };
        limitations = ["Forced OCR may run even when a text layer exists."];
      } else if (kind === "image") {
        nativeExtraction = { willAttempt: false, description: "No native text layer." };
        ocrPlan = { mayUse: true, description: "Tesseract OCR on the image bytes." };
      } else if (kind === "docx") {
        nativeExtraction = {
          willAttempt: true,
          description: "Full Mammoth extract (text + HTML); not OCR."
        };
        ocrPlan = { mayUse: false, description: "DOCX is not OCR'd." };
        limitations = ["Result is structured extract, not OCR output."];
      } else {
        nativeExtraction = {
          willAttempt: true,
          description: "UTF-8 decode only."
        };
        ocrPlan = { mayUse: false, description: "OCR does not apply to text files." };
      }
      break;
    default:
      // Unrecognized intent: defer to the plan descriptor for details.
      nativeExtraction = { willAttempt: false, description: "Generic intent; see plan." };
      ocrPlan = { mayUse: false, description: "See plan steps." };
  }
  return {
    kind,
    detectedKind: kind,
    runtime,
    intent,
    primaryAnalyzer,
    nativeExtraction,
    ocr: ocrPlan,
    limitations,
    plan
  };
}
|
|
672
|
+
|
|
673
|
+
// src/capabilityReport.ts
|
|
674
|
+
// Capability-report warning texts, one per file kind / feature combination.
var DOCX_META = "Structured document metadata is not exposed separately; extractMetadata returns a stub for DOCX.";
var IMAGE_META = "Raster images have no document metadata bundle; extractMetadata returns a stub.";
var IMAGE_HTML = "No layout HTML for raster images; use extractText or runOcr for text.";
var TEXT_META_NOTE = "Plain text has no structured document metadata; extractMetadata still returns decoded content.";
// Report-level warning used when the file kind could not be detected.
var UNKNOWN_KIND = "Could not determine file kind from name, MIME, or bytes; all features are reported as unsupported until the kind is known.";
|
|
679
|
+
/**
 * Builds one capability entry.
 *
 * The `warnings` key is only present when a non-empty array is supplied, so entries
 * without notes stay minimal (`{ id, supported }`).
 */
function slot(id, supported, warnings) {
  return warnings?.length ? { id, supported, warnings } : { id, supported };
}
|
|
682
|
+
/**
 * Assemble the capability report for a detected file kind on the Node runtime.
 *
 * Each recognised kind ("pdf", "docx", "image", "text") gets a fixed list of
 * feature slots (text, metadata, html, ocr, pages) with per-slot caveats.
 * Any other kind yields all-unsupported slots and a top-level warning.
 *
 * @param kind Detected file kind.
 * @returns `{ kind, runtime, capabilities, warnings }`; `warnings` is
 *          `undefined` unless the kind was not recognised.
 */
function buildNodeCapabilityReport(kind) {
  const reportWarnings = [];
  let features;
  if (kind === "pdf") {
    features = [
      slot("text", true, [
        "Native text via pdf-parse and (in Node extractText) PDF.js per-page text; set pdf.ocr for raster OCR."
      ]),
      slot("metadata", true, [
        "Document info / XMP-style metadata via pdf-parse without running the OCR pipeline."
      ]),
      slot("pages", true, [
        "Page count and per-page native extraction (PDF.js) where used; OCR respects pdf.maxPages."
      ]),
      slot("ocr", true, [
        "Raster OCR pipeline (pdf.ocr auto with quality heuristics, force, or runOcr)."
      ]),
      slot("html", true, ["HTML is a <pre> preview of extracted text, not visual layout."])
    ];
  } else if (kind === "docx") {
    features = [
      slot("text", true),
      slot("metadata", false, [DOCX_META]),
      slot("html", true),
      slot("ocr", false, ["OCR does not apply to DOCX in DocMind."]),
      slot("pages", false)
    ];
  } else if (kind === "image") {
    features = [
      slot("text", true, ["Text is obtained via OCR."]),
      slot("metadata", false, [IMAGE_META]),
      slot("html", false, [IMAGE_HTML]),
      slot("ocr", true),
      slot("pages", false)
    ];
  } else if (kind === "text") {
    features = [
      slot("text", true),
      slot("metadata", true, [TEXT_META_NOTE]),
      slot("html", true),
      slot("ocr", false, ["OCR does not apply to plain text files."]),
      slot("pages", false)
    ];
  } else {
    // Unrecognised kind: flag it once at report level and mark every
    // feature as unsupported, without per-slot warnings.
    reportWarnings.push(UNKNOWN_KIND);
    features = ["text", "metadata", "html", "ocr", "pages"].map((featureId) => slot(featureId, false));
  }
  return {
    kind,
    runtime: { id: "node" },
    capabilities: features,
    // The key is always present; it is undefined when nothing was flagged.
    warnings: reportWarnings.length > 0 ? reportWarnings : void 0
  };
}
|
|
748
|
+
|
|
749
|
+
// src/introspection.ts
|
|
750
|
+
/**
 * Resolve the effective PDF OCR mode from the optional `pdf` options object.
 *
 * @param pdf PDF options, possibly null/undefined.
 * @returns `pdf.ocr` when it is set to a non-nullish value, otherwise "auto".
 */
function resolvePdfOcrMode(pdf) {
  const requested = pdf == null ? void 0 : pdf.ocr;
  return requested == null ? "auto" : requested;
}
|
|
753
|
+
/**
 * Describe the processing steps `analyzeFile` would take for a file kind.
 *
 * Every plan starts with a completed "detect_kind" step, followed by the
 * kind-specific steps. For PDFs the OCR step is "skipped" only when
 * `pdfOcr` is exactly "off". Unrecognised kinds get a failed "route" step.
 *
 * @param kind   Detected file kind.
 * @param pdfOcr Effective PDF OCR mode; only consulted for PDFs.
 * @returns `{ intent: "analyzeFile", steps }` built fresh on every call.
 */
function planAnalyzeFile(kind, pdfOcr) {
  let followUps;
  if (kind === "pdf") {
    const ocrStatus = pdfOcr === "off" ? "skipped" : "planned";
    followUps = [
      { id: "pdf_parse", status: "planned" },
      { id: "pdf_ocr", status: ocrStatus }
    ];
  } else if (kind === "docx") {
    followUps = [{ id: "docx_mammoth", status: "planned" }];
  } else if (kind === "image") {
    followUps = [{ id: "image_ocr", status: "planned" }];
  } else if (kind === "text") {
    followUps = [{ id: "utf8_decode", status: "planned" }];
  } else {
    // No analyzer is registered for this kind, so routing cannot succeed.
    followUps = [{ id: "route", status: "failed" }];
  }
  return {
    intent: "analyzeFile",
    steps: [{ id: "detect_kind", status: "done" }, ...followUps]
  };
}
|
|
801
|
+
/**
 * Describe the processing steps for a public intent applied to a file kind.
 *
 * A missing intent is treated as "analyzeFile". "analyzeFile" and any intent
 * not listed below defer to `planAnalyzeFile(kind, pdfOcrForAnalyze)`.
 * All other plans begin with a completed "detect_kind" step.
 *
 * @param intentOpt        Requested intent, or null/undefined for the default.
 * @param kind             Detected file kind.
 * @param pdfOcrForAnalyze PDF OCR mode, used only on the analyzeFile path.
 * @returns `{ intent, steps }` plan descriptor.
 */
function planForIntent(intentOpt, kind, pdfOcrForAnalyze) {
  const step = (id, status) => ({ id, status });
  const planned = (id) => step(id, "planned");
  const skipped = (id) => step(id, "skipped");
  const planOf = (intent, ...tail) => ({
    intent,
    steps: [step("detect_kind", "done"), ...tail]
  });

  const intent = intentOpt ?? "analyzeFile";
  switch (intent) {
    case "extractText": {
      if (kind === "pdf") {
        return planOf(intent, planned("pdf_parse"), planned("pdfjs_per_page"), skipped("pdf_ocr"));
      }
      // Non-PDF kinds reuse the analyzeFile steps under the extractText label.
      const analyzePlan = planAnalyzeFile(kind, "off");
      return { ...analyzePlan, intent: "extractText" };
    }
    case "extractMetadata": {
      if (kind === "pdf") return planOf(intent, planned("pdf_metadata"));
      if (kind === "text") return planOf(intent, planned("utf8_decode"));
      // DOCX and images get a stub; anything else skips the stub step.
      const stubStatus = kind === "docx" || kind === "image" ? "planned" : "skipped";
      return planOf(intent, step("metadata_stub", stubStatus));
    }
    case "convertToHtml": {
      if (kind === "docx") return planOf(intent, planned("docx_mammoth_html"));
      if (kind === "text") return planOf(intent, planned("utf8_decode"), planned("wrap_pre"));
      if (kind === "pdf") return planOf(intent, planned("pdf_text_layer"), planned("wrap_pre"));
      return planOf(intent, skipped("rich_html"));
    }
    case "runOcr": {
      if (kind === "pdf") return planOf(intent, planned("pdf_parse"), planned("pdf_ocr_forced"));
      if (kind === "image") return planOf(intent, planned("tesseract_ocr"));
      if (kind === "docx") return planOf(intent, planned("docx_structured_extract"));
      return planOf(intent, skipped("ocr"));
    }
    default:
      // Covers "analyzeFile" as well as any unrecognised intent value.
      return planAnalyzeFile(kind, pdfOcrForAnalyze);
  }
}
|
|
923
|
+
/**
 * Report which features (text, metadata, html, ocr, pages) are available
 * for the given input on the Node runtime.
 *
 * The input is resolved and validated, its kind is detected, and the
 * capability report for that kind is returned. No analysis is performed.
 *
 * @param input   Anything accepted by `resolveNodeAnalyzeInput`.
 * @param options Optional settings; only `options.signal` is read here.
 * @returns The capability report produced by `buildNodeCapabilityReport`.
 * @throws Whatever `throwIfAborted`, `resolveNodeAnalyzeInput` or
 *         `assertValidAnalyzeFileInput` throw (presumably an abort error
 *         when the signal is aborted; confirm against `throwIfAborted`).
 */
async function getCapabilities(input, options) {
  const signal = options?.signal;
  throwIfAborted(signal);
  const resolved = await resolveNodeAnalyzeInput(input);
  // The await above is the only suspension point; an abort requested while
  // the input was being resolved must not be silently ignored.
  throwIfAborted(signal);
  assertValidAnalyzeFileInput(resolved);
  const kind = detectFileKind(resolved);
  return buildNodeCapabilityReport(kind);
}
|
|
930
|
+
/**
 * Explain, without running any analysis, what a public intent would do for
 * the given input on the Node runtime.
 *
 * The input is resolved and validated, its kind is detected, and a step plan
 * for the requested intent is built and wrapped into an explain report.
 *
 * @param input   Anything accepted by `resolveNodeAnalyzeInput`.
 * @param options Optional settings: `signal` (abort), `intent` (defaults to
 *                "analyzeFile") and `pdf` (its `ocr` mode defaults to "auto").
 * @returns The explain report produced by `buildNodeExplainReport`.
 * @throws Whatever `throwIfAborted`, `resolveNodeAnalyzeInput` or
 *         `assertValidAnalyzeFileInput` throw (presumably an abort error
 *         when the signal is aborted; confirm against `throwIfAborted`).
 */
async function explainAnalysisPlan(input, options) {
  const signal = options?.signal;
  throwIfAborted(signal);
  const resolved = await resolveNodeAnalyzeInput(input);
  // The await above is the only suspension point; an abort requested while
  // the input was being resolved must not be silently ignored.
  throwIfAborted(signal);
  assertValidAnalyzeFileInput(resolved);
  const kind = detectFileKind(resolved);
  const intent = options?.intent ?? "analyzeFile";
  const pdfOcrAnalyze = resolvePdfOcrMode(options?.pdf);
  const plan = planForIntent(intent, kind, pdfOcrAnalyze);
  return buildNodeExplainReport(kind, intent, pdfOcrAnalyze, plan);
}
|
|
940
|
+
|
|
941
|
+
export { analyzeFile, bufferToInput, convertToHtml, explainAnalysisPlan, extractMetadata, extractText, getCapabilities, readFileToInput, resolveNodeAnalyzeInput, runOcr };
|
|
550
942
|
//# sourceMappingURL=index.js.map
|
|
551
943
|
//# sourceMappingURL=index.js.map
|
package/package.json
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@dragon708/docmind-node",
|
|
3
|
-
"version": "1.
|
|
4
|
-
"description": "
|
|
3
|
+
"version": "1.4.0",
|
|
4
|
+
"description": "Official DocMind Node facade: analyzeFile, intent APIs, PDF/DOCX/OCR, and fs helpers.",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"main": "./dist/index.js",
|
|
7
7
|
"module": "./dist/index.js",
|
|
@@ -14,7 +14,8 @@
|
|
|
14
14
|
}
|
|
15
15
|
},
|
|
16
16
|
"files": [
|
|
17
|
-
"dist"
|
|
17
|
+
"dist/**/*.js",
|
|
18
|
+
"dist/**/*.d.ts"
|
|
18
19
|
],
|
|
19
20
|
"publishConfig": {
|
|
20
21
|
"access": "public"
|
|
@@ -33,8 +34,8 @@
|
|
|
33
34
|
"dependencies": {
|
|
34
35
|
"@dragon708/docmind-docx": "^1.0.0",
|
|
35
36
|
"@dragon708/docmind-ocr": "^1.0.0",
|
|
36
|
-
"@dragon708/docmind-pdf": "^
|
|
37
|
-
"@dragon708/docmind-shared": "^1.
|
|
37
|
+
"@dragon708/docmind-pdf": "^2.0.0",
|
|
38
|
+
"@dragon708/docmind-shared": "^1.1.0"
|
|
38
39
|
},
|
|
39
40
|
"devDependencies": {
|
|
40
41
|
"@types/node": "^20.19.37",
|
package/dist/index.js.map
DELETED
|
@@ -1 +0,0 @@
|
|
|
1
|
-
{"version":3,"sources":["../src/inputBytes.ts","../src/analyzers/docx.ts","../src/analyzers/image.ts","../src/analyzers/pdf.ts","../src/resolveNodeInput.ts","../src/analyze.ts","../src/publicActions.ts"],"names":["extractDocx","runPdf","assertValidAnalyzeFileInput","detectFileKind","ocr","analyzeText","notImplementedResult","UNKNOWN_FORMAT_WARNING"],"mappings":";;;;;;;;;;AASO,SAAS,kBAAkB,KAAA,EAAoD;AACpF,EAAA,OAAO,YAAA,CAAa,KAAK,CAAA,IAAK,aAAA,CAAc,KAAK,KAAK,MAAA,CAAO,KAAK,CAAA,IAAK,MAAA,CAAO,KAAK,CAAA;AACrF;AAGA,eAAsB,qBAAqB,KAAA,EAAiD;AAC1F,EAAA,IAAI,CAAC,iBAAA,CAAkB,KAAK,CAAA,EAAG;AAC7B,IAAA,OAAO,IAAI,WAAW,CAAC,CAAA;AAAA,EACzB;AACA,EAAA,OAAO,aAAa,KAAK,CAAA;AAC3B;;;ACZA,eAAsB,kBAAA,CACpB,OACA,MAAA,EACyB;AACzB,EAAA,IAAI,QAAQ,OAAA,EAAS;AACnB,IAAA,MAAM,GAAA,GAAM,IAAI,KAAA,CAAM,2BAA2B,CAAA;AACjD,IAAA,GAAA,CAAI,IAAA,GAAO,YAAA;AACX,IAAA,MAAM,GAAA;AAAA,EACR;AAEA,EAAA,MAAM,IAAA,GAAO,MAAM,oBAAA,CAAqB,KAAK,CAAA;AAC7C,EAAA,IAAI,IAAA,CAAK,eAAe,CAAA,EAAG;AACzB,IAAA,OAAO;AAAA,MACL,QAAA,EAAU,MAAA;AAAA,MACV,QAAA,EAAU,MAAA;AAAA,MACV,MAAA,EAAQ,IAAA;AAAA,MACR,IAAA,EAAM,MAAA;AAAA,MACN,IAAA,EAAM,EAAA;AAAA,MACN,IAAA,EAAM,EAAA;AAAA,MACN,QAAA,EAAU,CAAC,+CAA+C;AAAA,KAC5D;AAAA,EACF;AAEA,EAAA,MAAM,CAAA,GAAI,MAAMA,WAAA,CAAY,IAAI,CAAA;AAChC,EAAA,OAAO;AAAA,IACL,QAAA,EAAU,MAAA;AAAA,IACV,QAAA,EAAU,MAAA;AAAA,IACV,MAAA,EAAQ,IAAA;AAAA,IACR,IAAA,EAAM,MAAA;AAAA,IACN,MAAM,CAAA,CAAE,IAAA;AAAA,IACR,MAAM,CAAA,CAAE,IAAA;AAAA,IACR,QAAA,EAAU,CAAC,GAAG,CAAA,CAAE,QAAQ;AAAA,GAC1B;AACF;AChCA,eAAsB,mBAAA,CACpB,OACA,OAAA,EACyB;AACzB,EAAA,IAAI,OAAA,EAAS,QAAQ,OAAA,EAAS;AAC5B,IAAA,MAAM,GAAA,GAAM,IAAI,KAAA,CAAM,2BAA2B,CAAA;AACjD,IAAA,GAAA,CAAI,IAAA,GAAO,YAAA;AACX,IAAA,MAAM,GAAA;AAAA,EACR;AAEA,EAAA,MAAM,IAAA,GAAO,MAAM,oBAAA,CAAqB,KAAK,CAAA;AAC7C,EAAA,IAAI,IAAA,CAAK,eAAe,CAAA,EAAG;AACzB,IAAA,OAAO;AAAA,MACL,QAAA,EAAU,OAAA;AAAA,MACV,QAAA,EAAU,OAAA;AAAA,MACV,MAAA,EAAQ,IAAA;AAAA,MACR,IAAA,EAAM,OAAA;AAAA,MACN,IAAA,EAAM,EAAA;AAAA,MACN,UAAA,EAAY,CAAA;AAAA,MACZ,OAAA,EAAS,IAAA;AAAA,MACT,QAAA,EAAU,CAAC,4CAA4C;AAAA,KACzD;AAAA,EAC
F;AAEA,EAAA,MAAM,OAAA,GAAU;AAAA,IACd,GAAI,OAAA,EAAS,GAAA,IAAO,EAAC;AAAA,IACrB,MAAA,EAAQ,OAAA,EAAS,GAAA,EAAK,MAAA,IAAU,OAAA,EAAS;AAAA,GAC3C;AAEA,EAAA,MAAM,CAAA,GAAI,MAAM,GAAA,CAAI,IAAA,EAAM,OAAO,CAAA;AACjC,EAAA,OAAO;AAAA,IACL,QAAA,EAAU,OAAA;AAAA,IACV,QAAA,EAAU,OAAA;AAAA,IACV,MAAA,EAAQ,IAAA;AAAA,IACR,IAAA,EAAM,OAAA;AAAA,IACN,MAAM,CAAA,CAAE,IAAA;AAAA,IACR,YAAY,CAAA,CAAE,UAAA;AAAA,IACd,SAAS,CAAA,CAAE,OAAA;AAAA,IACX,UAAU;AAAC,GACb;AACF;AClCA,eAAsB,iBAAA,CACpB,OACA,OAAA,EACyB;AACzB,EAAA,IAAI,OAAA,EAAS,QAAQ,OAAA,EAAS;AAC5B,IAAA,MAAM,GAAA,GAAM,IAAI,KAAA,CAAM,2BAA2B,CAAA;AACjD,IAAA,GAAA,CAAI,IAAA,GAAO,YAAA;AACX,IAAA,MAAM,GAAA;AAAA,EACR;AAEA,EAAA,MAAM,IAAA,GAAO,MAAM,oBAAA,CAAqB,KAAK,CAAA;AAC7C,EAAA,IAAI,IAAA,CAAK,eAAe,CAAA,EAAG;AACzB,IAAA,OAAO;AAAA,MACL,QAAA,EAAU,KAAA;AAAA,MACV,QAAA,EAAU,KAAA;AAAA,MACV,MAAA,EAAQ,IAAA;AAAA,MACR,IAAA,EAAM,KAAA;AAAA,MACN,IAAA,EAAM,EAAA;AAAA,MACN,KAAA,EAAO,CAAA;AAAA,MACP,QAAA,EAAU,EAAE,IAAA,EAAM,EAAC,EAAE;AAAA,MACrB,QAAA,EAAU,CAAC,+CAA+C,CAAA;AAAA,MAC1D,QAAA,EAAU,KAAA;AAAA,MACV,OAAA,EAAS;AAAA,KACX;AAAA,EACF;AAEA,EAAA,MAAM,UAAU,OAAA,EAAS,GAAA;AACzB,EAAA,MAAM,OAAA,GAA6B;AAAA,IACjC,GAAG,OAAA;AAAA,IACH,GAAA,EAAK,SAAS,GAAA,IAAO,MAAA;AAAA,IACrB,QAAA,EAAU,OAAA,EAAS,QAAA,IAAY,OAAA,EAAS,GAAA,EAAK,KAAA;AAAA,IAC7C,MAAA,EAAQ,OAAA,EAAS,MAAA,IAAU,OAAA,EAAS;AAAA,GACtC;AAEA,EAAA,MAAM,CAAA,GAAI,MAAMC,UAAA,CAAO,IAAA,EAAM,OAAO,CAAA;AACpC,EAAA,OAAO;AAAA,IACL,QAAA,EAAU,KAAA;AAAA,IACV,QAAA,EAAU,KAAA;AAAA,IACV,MAAA,EAAQ,IAAA;AAAA,IACR,IAAA,EAAM,KAAA;AAAA,IACN,MAAM,CAAA,CAAE,IAAA;AAAA,IACR,OAAO,CAAA,CAAE,KAAA;AAAA,IACT,UAAU,CAAA,CAAE,QAAA;AAAA,IACZ,QAAA,EAAU,CAAC,GAAG,CAAA,CAAE,QAAQ,CAAA;AAAA,IACxB,UAAU,CAAA,CAAE,QAAA;AAAA,IACZ,SAAS,CAAA,CAAE;AAAA,GACb;AACF;AClDA,SAAS,aAAa,SAAA,EAAiC;AACrD,EAAA,OAAO,SAAA,YAAqB,GAAA,GAAM,aAAA,CAAc,SAAS,CAAA,GAAI,SAAA;AAC/D;AAKA,eAAsB,gBAAgB,IAAA,EAAiD;AACrF,EAAA,MAAM,MAAA,GAAS,aAAa,IAAI,CAAA;AAChC,EAAA,MAAM,IAAA,GAAO,MAAM,QAAA,CAAS,MAAM,CAAA;AAClC,EAAA,OAAO;AAAA,IACL,IAAA;AAAA,IACA,IAAA,EAAM,SAAS,MAAM;AAAA,GACvB;AACF;AAGO,SAAS,aAAA,CAAc,
QAAgB,IAAA,EAAmC;AAC/E,EAAA,OAAO,IAAA,KAAS,SAAY,EAAE,IAAA,EAAM,QAAQ,IAAA,EAAK,GAAI,EAAE,IAAA,EAAM,MAAA,EAAO;AACtE;AAKA,eAAsB,wBAAwB,KAAA,EAAuD;AACnG,EAAA,IAAI,OAAO,KAAA,KAAU,QAAA,IAAY,KAAA,YAAiB,GAAA,EAAK;AACrD,IAAA,OAAO,gBAAgB,KAAK,CAAA;AAAA,EAC9B;AACA,EAAA,OAAO,KAAA;AACT;;;ACvBA,eAAsB,WAAA,CACpB,OACA,OAAA,EACyB;AACzB,EAAA,IAAI,OAAA,EAAS,QAAQ,OAAA,EAAS;AAC5B,IAAA,MAAM,GAAA,GAAM,IAAI,KAAA,CAAM,2BAA2B,CAAA;AACjD,IAAA,GAAA,CAAI,IAAA,GAAO,YAAA;AACX,IAAA,MAAM,GAAA;AAAA,EACR;AAEA,EAAA,MAAM,QAAA,GAAW,MAAM,uBAAA,CAAwB,KAAK,CAAA;AACpD,EAAA,2BAAA,CAA4B,QAAQ,CAAA;AAEpC,EAAA,MAAM,QAAA,GAAW,eAAe,QAAQ,CAAA;AAExC,EAAA,QAAQ,QAAA;AAAU,IAChB,KAAK,KAAA;AACH,MAAA,OAAO,iBAAA,CAAkB,UAAiC,OAAO,CAAA;AAAA,IACnE,KAAK,MAAA;AACH,MAAA,OAAO,kBAAA,CAAmB,QAAA,EAAiC,OAAA,EAAS,MAAM,CAAA;AAAA,IAC5E,KAAK,OAAA;AACH,MAAA,OAAO,mBAAA,CAAoB,UAAiC,OAAO,CAAA;AAAA,IACrE,KAAK,MAAA;AACH,MAAA,OAAO,YAAY,QAAA,EAAiC,EAAE,MAAA,EAAQ,OAAA,EAAS,QAAQ,CAAA;AAAA,IACjF;AACE,MAAA,OAAO,oBAAA,CAAqB,QAAA,EAAU,MAAA,EAAQ,CAAC,sBAAsB,CAAC,CAAA;AAAA;AAE5E;ACrBA,SAAS,eAAe,MAAA,EAA4B;AAClD,EAAA,IAAI,QAAQ,OAAA,EAAS;AACnB,IAAA,MAAM,GAAA,GAAM,IAAI,KAAA,CAAM,2BAA2B,CAAA;AACjD,IAAA,GAAA,CAAI,IAAA,GAAO,YAAA;AACX,IAAA,MAAM,GAAA;AAAA,EACR;AACF;AAEA,eAAe,QAAQ,KAAA,EAAuD;AAC5E,EAAA,MAAM,QAAA,GAAW,MAAM,uBAAA,CAAwB,KAAK,CAAA;AACpD,EAAAC,4BAA4B,QAAQ,CAAA;AACpC,EAAA,OAAO,QAAA;AACT;AAEA,SAAS,kBAAkB,CAAA,EAAmB;AAC5C,EAAA,OAAO,CAAA,CACJ,OAAA,CAAQ,IAAA,EAAM,OAAO,EACrB,OAAA,CAAQ,IAAA,EAAM,MAAM,CAAA,CACpB,QAAQ,IAAA,EAAM,MAAM,CAAA,CACpB,OAAA,CAAQ,MAAM,QAAQ,CAAA;AAC3B;AAEA,IAAM,kBAAA,GACJ,yGAAA;AAEF,IAAM,mBAAA,GACJ,6DAAA;AAMF,eAAsB,WAAA,CACpB,OACA,OAAA,EACyB;AACzB,EAAA,cAAA,CAAe,SAAS,MAAM,CAAA;AAC9B,EAAA,MAAM,QAAA,GAAW,MAAM,OAAA,CAAQ,KAAK,CAAA;AACpC,EAAA,MAAM,IAAA,GAAOC,eAAe,QAAQ,CAAA;AACpC,EAAA,MAAM,SAAS,OAAA,EAAS,MAAA;AAExB,EAAA,QAAQ,IAAA;AAAM,IACZ,KAAK,KAAA,EAAO;AACV,MAAA,MAAM,IAAA,GAAO,MAAM,oBAAA,CAAqB,QAAQ,CAAA;AAChD,MAAA,IAAI,IAAA,CAAK,eAAe,CAAA,EAAG;AACzB,QAAA,OAAO;AAAA,UACL,QAAA,EAAU,KAAA;AAAA,UACV,QAAA,EAAU,KAAA;AAAA,UACV,MAAA,EAAQ,IAAA;AAAA,UA
CR,IAAA,EAAM,KAAA;AAAA,UACN,IAAA,EAAM,EAAA;AAAA,UACN,KAAA,EAAO,CAAA;AAAA,UACP,QAAA,EAAU,EAAE,IAAA,EAAM,EAAC,EAAE;AAAA,UACrB,QAAA,EAAU,CAAC,+CAA+C,CAAA;AAAA,UAC1D,QAAA,EAAU,KAAA;AAAA,UACV,OAAA,EAAS;AAAA,SACX;AAAA,MACF;AACA,MAAA,MAAM,CAAA,GAAI,MAAM,kBAAA,CAAmB,IAAI,CAAA;AACvC,MAAA,OAAO;AAAA,QACL,QAAA,EAAU,KAAA;AAAA,QACV,QAAA,EAAU,KAAA;AAAA,QACV,MAAA,EAAQ,IAAA;AAAA,QACR,IAAA,EAAM,KAAA;AAAA,QACN,MAAM,CAAA,CAAE,IAAA;AAAA,QACR,OAAO,CAAA,CAAE,KAAA;AAAA,QACT,QAAA,EAAU,EAAE,IAAA,EAAM,EAAC,EAAE;AAAA,QACrB,UAAU,CAAA,CAAE,QAAA;AAAA,QACZ,QAAA,EAAU,KAAA;AAAA,QACV,OAAA,EAAS;AAAA,OACX;AAAA,IACF;AAAA,IACA,KAAK,MAAA,EAAQ;AACX,MAAA,MAAM,IAAA,GAAO,MAAM,oBAAA,CAAqB,QAAQ,CAAA;AAChD,MAAA,IAAI,IAAA,CAAK,eAAe,CAAA,EAAG;AACzB,QAAA,OAAO;AAAA,UACL,QAAA,EAAU,MAAA;AAAA,UACV,QAAA,EAAU,MAAA;AAAA,UACV,MAAA,EAAQ,IAAA;AAAA,UACR,IAAA,EAAM,MAAA;AAAA,UACN,IAAA,EAAM,EAAA;AAAA,UACN,IAAA,EAAM,EAAA;AAAA,UACN,QAAA,EAAU,CAAC,+CAA+C;AAAA,SAC5D;AAAA,MACF;AACA,MAAA,MAAM,CAAA,GAAI,MAAM,mBAAA,CAAoB,IAAI,CAAA;AACxC,MAAA,OAAO;AAAA,QACL,QAAA,EAAU,MAAA;AAAA,QACV,QAAA,EAAU,MAAA;AAAA,QACV,MAAA,EAAQ,IAAA;AAAA,QACR,IAAA,EAAM,MAAA;AAAA,QACN,MAAM,CAAA,CAAE,IAAA;AAAA,QACR,IAAA,EAAM,EAAA;AAAA,QACN,UAAU,CAAA,CAAE;AAAA,OACd;AAAA,IACF;AAAA,IACA,KAAK,OAAA,EAAS;AACZ,MAAA,MAAM,IAAA,GAAO,MAAM,oBAAA,CAAqB,QAAQ,CAAA;AAChD,MAAA,IAAI,IAAA,CAAK,eAAe,CAAA,EAAG;AACzB,QAAA,OAAO;AAAA,UACL,QAAA,EAAU,OAAA;AAAA,UACV,QAAA,EAAU,OAAA;AAAA,UACV,MAAA,EAAQ,IAAA;AAAA,UACR,IAAA,EAAM,OAAA;AAAA,UACN,IAAA,EAAM,EAAA;AAAA,UACN,UAAA,EAAY,CAAA;AAAA,UACZ,OAAA,EAAS,IAAA;AAAA,UACT,QAAA,EAAU,CAAC,4CAA4C;AAAA,SACzD;AAAA,MACF;AACA,MAAA,MAAM,OAAA,GAAU;AAAA,QACd,GAAI,OAAA,EAAS,GAAA,IAAO,EAAC;AAAA,QACrB,MAAA,EAAQ,OAAA,EAAS,GAAA,EAAK,MAAA,IAAU;AAAA,OAClC;AACA,MAAA,MAAM,CAAA,GAAI,MAAMC,GAAAA,CAAI,IAAA,EAAM,OAAO,CAAA;AACjC,MAAA,OAAO;AAAA,QACL,QAAA,EAAU,OAAA;AAAA,QACV,QAAA,EAAU,OAAA;AAAA,QACV,MAAA,EAAQ,IAAA;AAAA,QACR,IAAA,EAAM,OAAA;AAAA,QACN,MAAM,CAAA,CAAE,IAAA;AAAA,QACR,YAAY,CAAA,CAAE,UAAA;AAAA,QACd,SAAS,CAAA,CAAE,OAAA;AAAA,QACX,UAAU;AAAC,OACb;AAAA,IACF;AAAA,IACA,KAAK,MA
AA;AACH,MAAA,OAAOC,WAAAA,CAAY,QAAA,EAAiC,EAAE,MAAA,EAAQ,CAAA;AAAA,IAChE;AACE,MAAA,OAAOC,oBAAAA,CAAqB,IAAA,EAAM,MAAA,EAAQ,CAACC,sBAAsB,CAAC,CAAA;AAAA;AAExE;AAMA,eAAsB,eAAA,CACpB,OACA,OAAA,EACyB;AACzB,EAAA,cAAA,CAAe,SAAS,MAAM,CAAA;AAC9B,EAAA,MAAM,QAAA,GAAW,MAAM,OAAA,CAAQ,KAAK,CAAA;AACpC,EAAA,MAAM,IAAA,GAAOJ,eAAe,QAAQ,CAAA;AACpC,EAAA,MAAM,SAAS,OAAA,EAAS,MAAA;AAExB,EAAA,QAAQ,IAAA;AAAM,IACZ,KAAK,KAAA,EAAO;AACV,MAAA,MAAM,IAAA,GAAO,MAAM,oBAAA,CAAqB,QAAQ,CAAA;AAChD,MAAA,IAAI,IAAA,CAAK,eAAe,CAAA,EAAG;AACzB,QAAA,OAAO;AAAA,UACL,QAAA,EAAU,KAAA;AAAA,UACV,QAAA,EAAU,KAAA;AAAA,UACV,MAAA,EAAQ,IAAA;AAAA,UACR,IAAA,EAAM,KAAA;AAAA,UACN,IAAA,EAAM,EAAA;AAAA,UACN,KAAA,EAAO,CAAA;AAAA,UACP,QAAA,EAAU,EAAE,IAAA,EAAM,EAAC,EAAE;AAAA,UACrB,QAAA,EAAU,CAAC,+CAA+C,CAAA;AAAA,UAC1D,QAAA,EAAU,KAAA;AAAA,UACV,OAAA,EAAS;AAAA,SACX;AAAA,MACF;AACA,MAAA,MAAM,CAAA,GAAI,MAAM,kBAAA,CAAmB,IAAI,CAAA;AACvC,MAAA,OAAO;AAAA,QACL,QAAA,EAAU,KAAA;AAAA,QACV,QAAA,EAAU,KAAA;AAAA,QACV,MAAA,EAAQ,IAAA;AAAA,QACR,IAAA,EAAM,KAAA;AAAA,QACN,IAAA,EAAM,EAAA;AAAA,QACN,KAAA,EAAO,CAAA;AAAA,QACP,UAAU,CAAA,CAAE,QAAA;AAAA,QACZ,UAAU,CAAA,CAAE,QAAA;AAAA,QACZ,QAAA,EAAU,KAAA;AAAA,QACV,OAAA,EAAS;AAAA,OACX;AAAA,IACF;AAAA,IACA,KAAK,MAAA;AACH,MAAA,OAAO;AAAA,QACL,QAAA,EAAU,MAAA;AAAA,QACV,QAAA,EAAU,MAAA;AAAA,QACV,MAAA,EAAQ,IAAA;AAAA,QACR,IAAA,EAAM,MAAA;AAAA,QACN,IAAA,EAAM,EAAA;AAAA,QACN,IAAA,EAAM,EAAA;AAAA,QACN,QAAA,EAAU,CAAC,kBAAkB;AAAA,OAC/B;AAAA,IACF,KAAK,OAAA;AACH,MAAA,OAAO;AAAA,QACL,QAAA,EAAU,OAAA;AAAA,QACV,QAAA,EAAU,OAAA;AAAA,QACV,MAAA,EAAQ,IAAA;AAAA,QACR,IAAA,EAAM,OAAA;AAAA,QACN,IAAA,EAAM,EAAA;AAAA,QACN,UAAA,EAAY,CAAA;AAAA,QACZ,OAAA,EAAS,IAAA;AAAA,QACT,QAAA,EAAU,CAAC,mBAAmB;AAAA,OAChC;AAAA,IACF,KAAK,MAAA;AACH,MAAA,OAAOE,WAAAA,CAAY,QAAA,EAAiC,EAAE,MAAA,EAAQ,CAAA;AAAA,IAChE;AACE,MAAA,OAAOC,oBAAAA,CAAqB,IAAA,EAAM,MAAA,EAAQ,CAACC,sBAAsB,CAAC,CAAA;AAAA;AAExE;AAMA,eAAsB,aAAA,CACpB,OACA,OAAA,EACyB;AACzB,EAAA,cAAA,CAAe,SAAS,MAAM,CAAA;AAC9B,EAAA,MAAM,QAAA,GAAW,MAAM,OAAA,CAAQ,KAAK,CAAA;AACpC,EAAA,MAAM,IAAA,GAAOJ,eAAe,QAAQ,CAAA;AACpC,EAAA,MAAM,SAAS,OAAA
,EAAS,MAAA;AAExB,EAAA,QAAQ,IAAA;AAAM,IACZ,KAAK,MAAA,EAAQ;AACX,MAAA,MAAM,IAAA,GAAO,MAAM,oBAAA,CAAqB,QAAQ,CAAA;AAChD,MAAA,IAAI,IAAA,CAAK,eAAe,CAAA,EAAG;AACzB,QAAA,OAAO;AAAA,UACL,QAAA,EAAU,MAAA;AAAA,UACV,QAAA,EAAU,MAAA;AAAA,UACV,MAAA,EAAQ,IAAA;AAAA,UACR,IAAA,EAAM,MAAA;AAAA,UACN,IAAA,EAAM,EAAA;AAAA,UACN,IAAA,EAAM,EAAA;AAAA,UACN,QAAA,EAAU,CAAC,+CAA+C;AAAA,SAC5D;AAAA,MACF;AACA,MAAA,MAAM,CAAC,QAAA,EAAU,QAAQ,CAAA,GAAI,MAAM,QAAQ,GAAA,CAAI;AAAA,QAC7C,oBAAoB,IAAI,CAAA;AAAA,QACxB,kBAAkB,IAAI;AAAA,OACvB,CAAA;AACD,MAAA,OAAO;AAAA,QACL,QAAA,EAAU,MAAA;AAAA,QACV,QAAA,EAAU,MAAA;AAAA,QACV,MAAA,EAAQ,IAAA;AAAA,QACR,IAAA,EAAM,MAAA;AAAA,QACN,MAAM,QAAA,CAAS,IAAA;AAAA,QACf,MAAM,QAAA,CAAS,IAAA;AAAA,QACf,UAAU,CAAC,GAAG,SAAS,QAAA,EAAU,GAAG,SAAS,QAAQ;AAAA,OACvD;AAAA,IACF;AAAA,IACA,KAAK,KAAA,EAAO;AACV,MAAA,MAAM,IAAA,GAAO,MAAM,oBAAA,CAAqB,QAAQ,CAAA;AAChD,MAAA,IAAI,IAAA,CAAK,eAAe,CAAA,EAAG;AACzB,QAAA,OAAO;AAAA,UACL,QAAA,EAAU,KAAA;AAAA,UACV,QAAA,EAAU,KAAA;AAAA,UACV,MAAA,EAAQ,IAAA;AAAA,UACR,IAAA,EAAM,KAAA;AAAA,UACN,IAAA,EAAM,EAAA;AAAA,UACN,KAAA,EAAO,CAAA;AAAA,UACP,QAAA,EAAU,EAAE,IAAA,EAAM,EAAC,EAAE;AAAA,UACrB,QAAA,EAAU,CAAC,+CAA+C,CAAA;AAAA,UAC1D,QAAA,EAAU,KAAA;AAAA,UACV,OAAA,EAAS;AAAA,SACX;AAAA,MACF;AACA,MAAA,MAAM,CAAA,GAAI,MAAM,kBAAA,CAAmB,IAAI,CAAA;AACvC,MAAA,MAAM,IAAA,GAAO,CAAA,KAAA,EAAQ,iBAAA,CAAkB,CAAA,CAAE,IAAI,CAAC,CAAA,MAAA,CAAA;AAC9C,MAAA,OAAO;AAAA,QACL,QAAA,EAAU,KAAA;AAAA,QACV,QAAA,EAAU,KAAA;AAAA,QACV,MAAA,EAAQ,IAAA;AAAA,QACR,IAAA,EAAM,KAAA;AAAA,QACN,MAAM,CAAA,CAAE,IAAA;AAAA,QACR,OAAO,CAAA,CAAE,KAAA;AAAA,QACT,QAAA,EAAU,EAAE,IAAA,EAAM,EAAC,EAAE;AAAA,QACrB,IAAA;AAAA,QACA,QAAA,EAAU;AAAA,UACR,GAAG,CAAA,CAAE,QAAA;AAAA,UACL;AAAA,SACF;AAAA,QACA,QAAA,EAAU,KAAA;AAAA,QACV,OAAA,EAAS;AAAA,OACX;AAAA,IACF;AAAA,IACA,KAAK,MAAA,EAAQ;AACX,MAAA,MAAM,IAAI,MAAME,WAAAA,CAAY,QAAA,EAAiC,EAAE,QAAQ,CAAA;AACvE,MAAA,MAAM,IAAA,GAAO,CAAA,KAAA,EAAQ,iBAAA,CAAkB,CAAA,CAAE,IAAI,CAAC,CAAA,MAAA,CAAA;AAC9C,MAAA,OAAO;AAAA,QACL,GAAG,CAAA;AAAA,QACH,IAAA;AAAA,QACA,QAAA,EAAU;AAAA,UACR,GAAG,CAAA,CAAE,QAAA;AAAA,UACL;AAAA;AACF,OACF;A
AAA,IACF;AAAA,IACA,KAAK,OAAA;AACH,MAAA,OAAO;AAAA,QACL,QAAA,EAAU,OAAA;AAAA,QACV,QAAA,EAAU,OAAA;AAAA,QACV,MAAA,EAAQ,IAAA;AAAA,QACR,IAAA,EAAM,OAAA;AAAA,QACN,IAAA,EAAM,EAAA;AAAA,QACN,UAAA,EAAY,CAAA;AAAA,QACZ,OAAA,EAAS,IAAA;AAAA,QACT,QAAA,EAAU,CAAC,qEAAqE;AAAA,OAClF;AAAA,IACF;AACE,MAAA,OAAOC,oBAAAA,CAAqB,IAAA,EAAM,MAAA,EAAQ,CAACC,sBAAsB,CAAC,CAAA;AAAA;AAExE;AAMA,eAAsB,MAAA,CACpB,OACA,OAAA,EACyB;AACzB,EAAA,cAAA,CAAe,SAAS,MAAM,CAAA;AAC9B,EAAA,MAAM,QAAA,GAAW,MAAM,OAAA,CAAQ,KAAK,CAAA;AACpC,EAAA,MAAM,IAAA,GAAOJ,eAAe,QAAQ,CAAA;AACpC,EAAA,MAAM,SAAS,OAAA,EAAS,MAAA;AACxB,EAAA,MAAM,IAAA,GAAO,OAAA,EAAS,GAAA,EAAK,KAAA,IAAS,SAAS,GAAA,EAAK,QAAA;AAElD,EAAA,QAAQ,IAAA;AAAM,IACZ,KAAK,KAAA,EAAO;AACV,MAAA,MAAM,IAAA,GAAO,MAAM,oBAAA,CAAqB,QAAQ,CAAA;AAChD,MAAA,IAAI,IAAA,CAAK,eAAe,CAAA,EAAG;AACzB,QAAA,OAAO;AAAA,UACL,QAAA,EAAU,KAAA;AAAA,UACV,QAAA,EAAU,KAAA;AAAA,UACV,MAAA,EAAQ,IAAA;AAAA,UACR,IAAA,EAAM,KAAA;AAAA,UACN,IAAA,EAAM,EAAA;AAAA,UACN,KAAA,EAAO,CAAA;AAAA,UACP,QAAA,EAAU,EAAE,IAAA,EAAM,EAAC,EAAE;AAAA,UACrB,QAAA,EAAU,CAAC,+CAA+C,CAAA;AAAA,UAC1D,QAAA,EAAU,KAAA;AAAA,UACV,OAAA,EAAS;AAAA,SACX;AAAA,MACF;AACA,MAAA,MAAM,CAAA,GAAI,MAAM,UAAA,CAAW,IAAA,EAAM;AAAA,QAC/B,GAAG,OAAA,EAAS,GAAA;AAAA,QACZ,GAAA,EAAK,OAAA;AAAA,QACL,QAAA,EAAU,IAAA,IAAQ,OAAA,EAAS,GAAA,EAAK,QAAA;AAAA,QAChC,MAAA,EAAQ,OAAA,EAAS,GAAA,EAAK,MAAA,IAAU;AAAA,OACjC,CAAA;AACD,MAAA,OAAO;AAAA,QACL,QAAA,EAAU,KAAA;AAAA,QACV,QAAA,EAAU,KAAA;AAAA,QACV,MAAA,EAAQ,IAAA;AAAA,QACR,IAAA,EAAM,KAAA;AAAA,QACN,MAAM,CAAA,CAAE,IAAA;AAAA,QACR,OAAO,CAAA,CAAE,KAAA;AAAA,QACT,UAAU,CAAA,CAAE,QAAA;AAAA,QACZ,QAAA,EAAU,CAAC,GAAG,CAAA,CAAE,QAAQ,CAAA;AAAA,QACxB,UAAU,CAAA,CAAE,QAAA;AAAA,QACZ,SAAS,CAAA,CAAE;AAAA,OACb;AAAA,IACF;AAAA,IACA,KAAK,OAAA,EAAS;AACZ,MAAA,MAAM,IAAA,GAAO,MAAM,oBAAA,CAAqB,QAAQ,CAAA;AAChD,MAAA,IAAI,IAAA,CAAK,eAAe,CAAA,EAAG;AACzB,QAAA,OAAO;AAAA,UACL,QAAA,EAAU,OAAA;AAAA,UACV,QAAA,EAAU,OAAA;AAAA,UACV,MAAA,EAAQ,IAAA;AAAA,UACR,IAAA,EAAM,OAAA;AAAA,UACN,IAAA,EAAM,EAAA;AAAA,UACN,UAAA,EAAY,CAAA;AAAA,UACZ,OAAA,EAAS,IAAA;AAAA,UACT,QAAA,EAAU,CAAC,4CAA4C;AAAA,SACzD;A
AAA,MACF;AACA,MAAA,MAAM,OAAA,GAAU;AAAA,QACd,GAAI,OAAA,EAAS,GAAA,IAAO,EAAC;AAAA,QACrB,MAAA,EAAQ,OAAA,EAAS,GAAA,EAAK,MAAA,IAAU;AAAA,OAClC;AACA,MAAA,MAAM,CAAA,GAAI,MAAMC,GAAAA,CAAI,IAAA,EAAM,OAAO,CAAA;AACjC,MAAA,OAAO;AAAA,QACL,QAAA,EAAU,OAAA;AAAA,QACV,QAAA,EAAU,OAAA;AAAA,QACV,MAAA,EAAQ,IAAA;AAAA,QACR,IAAA,EAAM,OAAA;AAAA,QACN,MAAM,CAAA,CAAE,IAAA;AAAA,QACR,YAAY,CAAA,CAAE,UAAA;AAAA,QACd,SAAS,CAAA,CAAE,OAAA;AAAA,QACX,UAAU;AAAC,OACb;AAAA,IACF;AAAA,IACA,KAAK,MAAA,EAAQ;AACX,MAAA,MAAM,IAAA,GAAO,MAAM,oBAAA,CAAqB,QAAQ,CAAA;AAChD,MAAA,IAAI,IAAA,CAAK,eAAe,CAAA,EAAG;AACzB,QAAA,OAAO;AAAA,UACL,QAAA,EAAU,MAAA;AAAA,UACV,QAAA,EAAU,MAAA;AAAA,UACV,MAAA,EAAQ,IAAA;AAAA,UACR,IAAA,EAAM,MAAA;AAAA,UACN,IAAA,EAAM,EAAA;AAAA,UACN,IAAA,EAAM,EAAA;AAAA,UACN,QAAA,EAAU,CAAC,+CAA+C;AAAA,SAC5D;AAAA,MACF;AACA,MAAA,MAAM,CAAA,GAAI,MAAM,WAAA,CAAY,IAAI,CAAA;AAChC,MAAA,OAAO;AAAA,QACL,QAAA,EAAU,MAAA;AAAA,QACV,QAAA,EAAU,MAAA;AAAA,QACV,MAAA,EAAQ,IAAA;AAAA,QACR,IAAA,EAAM,MAAA;AAAA,QACN,MAAM,CAAA,CAAE,IAAA;AAAA,QACR,MAAM,CAAA,CAAE,IAAA;AAAA,QACR,QAAA,EAAU;AAAA,UACR,GAAG,CAAA,CAAE,QAAA;AAAA,UACL;AAAA;AACF,OACF;AAAA,IACF;AAAA,IACA,KAAK,MAAA;AACH,MAAA,OAAOC,WAAAA,CAAY,QAAA,EAAiC,EAAE,MAAA,EAAQ,CAAA;AAAA,IAChE;AACE,MAAA,OAAOC,oBAAAA,CAAqB,IAAA,EAAM,MAAA,EAAQ,CAACC,sBAAsB,CAAC,CAAA;AAAA;AAExE","file":"index.js","sourcesContent":["import type { DetectFileKindInput, FileLikeInput } from \"@dragon708/docmind-shared\";\nimport {\n isBinaryInput,\n isBlob,\n isFile,\n isNamedInput,\n toUint8Array,\n} from \"@dragon708/docmind-shared\";\n\nexport function isByteBackedInput(input: DetectFileKindInput): input is FileLikeInput {\n return isNamedInput(input) || isBinaryInput(input) || isBlob(input) || isFile(input);\n}\n\n/** Resolves bytes when the input carries a body; otherwise an empty `Uint8Array`. 
*/\nexport async function bytesFromDetectInput(input: DetectFileKindInput): Promise<Uint8Array> {\n if (!isByteBackedInput(input)) {\n return new Uint8Array(0);\n }\n return toUint8Array(input);\n}\n","import { analyzeDocx as extractDocx } from \"@dragon708/docmind-docx\";\nimport type { AnalysisResult, DetectFileKindInput } from \"@dragon708/docmind-shared\";\nimport { bytesFromDetectInput } from \"../inputBytes.js\";\n\n/**\n * DOCX → `@dragon708/docmind-docx`.\n */\nexport async function analyzeDocxForNode(\n input: DetectFileKindInput,\n signal?: AbortSignal,\n): Promise<AnalysisResult> {\n if (signal?.aborted) {\n const err = new Error(\"The operation was aborted\");\n err.name = \"AbortError\";\n throw err;\n }\n\n const data = await bytesFromDetectInput(input);\n if (data.byteLength === 0) {\n return {\n fileKind: \"docx\",\n analyzer: \"docx\",\n status: \"ok\",\n kind: \"docx\",\n text: \"\",\n html: \"\",\n warnings: [\"No document bytes were provided for analysis.\"],\n };\n }\n\n const r = await extractDocx(data);\n return {\n fileKind: \"docx\",\n analyzer: \"docx\",\n status: \"ok\",\n kind: \"docx\",\n text: r.text,\n html: r.html,\n warnings: [...r.warnings],\n };\n}\n","import { ocr } from \"@dragon708/docmind-ocr\";\nimport type { AnalysisResult, DetectFileKindInput } from \"@dragon708/docmind-shared\";\nimport type { NodeAnalyzeOptions } from \"../nodeAnalyzeOptions.js\";\nimport { bytesFromDetectInput } from \"../inputBytes.js\";\n\n/**\n * Image → `@dragon708/docmind-ocr`.\n */\nexport async function analyzeImageForNode(\n input: DetectFileKindInput,\n options?: NodeAnalyzeOptions,\n): Promise<AnalysisResult> {\n if (options?.signal?.aborted) {\n const err = new Error(\"The operation was aborted\");\n err.name = \"AbortError\";\n throw err;\n }\n\n const data = await bytesFromDetectInput(input);\n if (data.byteLength === 0) {\n return {\n fileKind: \"image\",\n analyzer: \"image\",\n status: \"ok\",\n kind: \"image\",\n text: \"\",\n 
confidence: 0,\n ocrUsed: true,\n warnings: [\"No image bytes were provided for analysis.\"],\n };\n }\n\n const ocrOpts = {\n ...(options?.ocr ?? {}),\n signal: options?.ocr?.signal ?? options?.signal,\n };\n\n const r = await ocr(data, ocrOpts);\n return {\n fileKind: \"image\",\n analyzer: \"image\",\n status: \"ok\",\n kind: \"image\",\n text: r.text,\n confidence: r.confidence,\n ocrUsed: r.ocrUsed,\n warnings: [],\n };\n}\n","import { analyzePdf as runPdf } from \"@dragon708/docmind-pdf\";\nimport type { PdfAnalyzeOptions } from \"@dragon708/docmind-pdf\";\nimport type { AnalysisResult, DetectFileKindInput } from \"@dragon708/docmind-shared\";\nimport type { NodeAnalyzeOptions } from \"../nodeAnalyzeOptions.js\";\nimport { bytesFromDetectInput } from \"../inputBytes.js\";\n\n/**\n * PDF → `@dragon708/docmind-pdf` (Node / pdf-parse + OCR).\n *\n * Unlike `analyzePdf` from `@dragon708/docmind-pdf` (OCR off unless set), `analyzeFile` defaults\n * to `pdf.ocr: \"auto\"`: when the PDF has\n * pages but almost no extractable text (typical scan), the raster OCR pipeline runs. Pass\n * `pdf: { ocr: \"off\" }` to skip OCR for speed.\n */\nexport async function analyzePdfForNode(\n input: DetectFileKindInput,\n options?: NodeAnalyzeOptions,\n): Promise<AnalysisResult> {\n if (options?.signal?.aborted) {\n const err = new Error(\"The operation was aborted\");\n err.name = \"AbortError\";\n throw err;\n }\n\n const data = await bytesFromDetectInput(input);\n if (data.byteLength === 0) {\n return {\n fileKind: \"pdf\",\n analyzer: \"pdf\",\n status: \"ok\",\n kind: \"pdf\",\n text: \"\",\n pages: 0,\n metadata: { info: {} },\n warnings: [\"No document bytes were provided for analysis.\"],\n needsOCR: false,\n ocrUsed: false,\n };\n }\n\n const userPdf = options?.pdf;\n const pdfOpts: PdfAnalyzeOptions = {\n ...userPdf,\n ocr: userPdf?.ocr ?? \"auto\",\n ocrLangs: userPdf?.ocrLangs ?? options?.ocr?.langs,\n signal: userPdf?.signal ?? 
options?.signal,\n };\n\n const r = await runPdf(data, pdfOpts);\n return {\n fileKind: \"pdf\",\n analyzer: \"pdf\",\n status: \"ok\",\n kind: \"pdf\",\n text: r.text,\n pages: r.pages,\n metadata: r.metadata,\n warnings: [...r.warnings],\n needsOCR: r.needsOCR,\n ocrUsed: r.ocrUsed,\n };\n}\n","import type { DetectFileKindInput, NamedInput } from \"@dragon708/docmind-shared\";\nimport { readFile } from \"node:fs/promises\";\nimport { basename } from \"node:path\";\nimport { fileURLToPath } from \"node:url\";\n\n/**\n * Inputs accepted by {@link analyzeFile} in this package.\n * Paths and `file:` URLs are read with `fs`; other values pass through as {@link DetectFileKindInput}.\n */\nexport type NodeAnalyzeInput = string | URL | DetectFileKindInput;\n\nfunction toPathString(pathOrUrl: string | URL): string {\n return pathOrUrl instanceof URL ? fileURLToPath(pathOrUrl) : pathOrUrl;\n}\n\n/**\n * Reads a file from disk into a {@link NamedInput} (binary `Buffer`, basename as `name` for hints).\n */\nexport async function readFileToInput(path: string | URL): Promise<NamedInput<Buffer>> {\n const fsPath = toPathString(path);\n const data = await readFile(fsPath);\n return {\n data,\n name: basename(fsPath),\n };\n}\n\n/** Wraps a `Buffer` as a named payload when you already know the filename. */\nexport function bufferToInput(buffer: Buffer, name?: string): NamedInput<Buffer> {\n return name !== undefined ? 
{ data: buffer, name } : { data: buffer };\n}\n\n/**\n * Resolves paths / `file:` URLs to a {@link DetectFileKindInput}; leaves other values untouched.\n */\nexport async function resolveNodeAnalyzeInput(input: NodeAnalyzeInput): Promise<DetectFileKindInput> {\n if (typeof input === \"string\" || input instanceof URL) {\n return readFileToInput(input);\n }\n return input;\n}\n","import type { AnalysisResult, DetectFileKindInput } from \"@dragon708/docmind-shared\";\nimport {\n analyzeText,\n assertValidAnalyzeFileInput,\n detectFileKind,\n notImplementedResult,\n UNKNOWN_FORMAT_WARNING,\n} from \"@dragon708/docmind-shared\";\nimport { analyzeDocxForNode } from \"./analyzers/docx.js\";\nimport { analyzeImageForNode } from \"./analyzers/image.js\";\nimport { analyzePdfForNode } from \"./analyzers/pdf.js\";\nimport type { NodeAnalyzeOptions } from \"./nodeAnalyzeOptions.js\";\nimport { resolveNodeAnalyzeInput, type NodeAnalyzeInput } from \"./resolveNodeInput.js\";\n\n/**\n * Node router: PDF, DOCX, images (OCR), and text. 
Paths and `file:` URLs are read via `fs`.\n */\nexport async function analyzeFile(\n input: NodeAnalyzeInput,\n options?: NodeAnalyzeOptions,\n): Promise<AnalysisResult> {\n if (options?.signal?.aborted) {\n const err = new Error(\"The operation was aborted\");\n err.name = \"AbortError\";\n throw err;\n }\n\n const resolved = await resolveNodeAnalyzeInput(input);\n assertValidAnalyzeFileInput(resolved);\n\n const fileKind = detectFileKind(resolved);\n\n switch (fileKind) {\n case \"pdf\":\n return analyzePdfForNode(resolved as DetectFileKindInput, options);\n case \"docx\":\n return analyzeDocxForNode(resolved as DetectFileKindInput, options?.signal);\n case \"image\":\n return analyzeImageForNode(resolved as DetectFileKindInput, options);\n case \"text\":\n return analyzeText(resolved as DetectFileKindInput, { signal: options?.signal });\n default:\n return notImplementedResult(fileKind, \"none\", [UNKNOWN_FORMAT_WARNING]);\n }\n}\n","import type { AnalysisResult, DetectFileKindInput } from \"@dragon708/docmind-shared\";\nimport {\n analyzeText,\n assertValidAnalyzeFileInput,\n detectFileKind,\n notImplementedResult,\n UNKNOWN_FORMAT_WARNING,\n} from \"@dragon708/docmind-shared\";\nimport {\n analyzePdf,\n extractPdfMetadata,\n extractTextFromPdf,\n} from \"@dragon708/docmind-pdf\";\nimport {\n analyzeDocx,\n convertDocxToHtml,\n extractTextFromDocx,\n} from \"@dragon708/docmind-docx\";\nimport { ocr } from \"@dragon708/docmind-ocr\";\nimport { bytesFromDetectInput } from \"./inputBytes.js\";\nimport type { NodeAnalyzeOptions } from \"./nodeAnalyzeOptions.js\";\nimport { resolveNodeAnalyzeInput, type NodeAnalyzeInput } from \"./resolveNodeInput.js\";\n\nfunction throwIfAborted(signal?: AbortSignal): void {\n if (signal?.aborted) {\n const err = new Error(\"The operation was aborted\");\n err.name = \"AbortError\";\n throw err;\n }\n}\n\nasync function prepare(input: NodeAnalyzeInput): Promise<DetectFileKindInput> {\n const resolved = await 
resolveNodeAnalyzeInput(input);\n assertValidAnalyzeFileInput(resolved);\n return resolved;\n}\n\nfunction escapeHtmlMinimal(s: string): string {\n return s\n .replace(/&/g, \"&amp;\")\n .replace(/</g, \"&lt;\")\n .replace(/>/g, \"&gt;\")\n .replace(/\"/g, \"&quot;\");\n}\n\nconst DOCX_METADATA_STUB =\n \"Structured document metadata for DOCX is not exposed as a separate API; use extractText or analyzeFile.\";\n\nconst IMAGE_METADATA_NOTE =\n \"Raster images have no document metadata bundle in this API.\";\n\n/**\n * Text only: PDF → `extractTextFromPdf` (capa de texto, sin OCR); DOCX → `extractTextFromDocx`;\n * imagen → `ocr`; texto → `analyzeText`.\n */\nexport async function extractText(\n input: NodeAnalyzeInput,\n options?: NodeAnalyzeOptions,\n): Promise<AnalysisResult> {\n throwIfAborted(options?.signal);\n const resolved = await prepare(input);\n const kind = detectFileKind(resolved);\n const signal = options?.signal;\n\n switch (kind) {\n case \"pdf\": {\n const data = await bytesFromDetectInput(resolved);\n if (data.byteLength === 0) {\n return {\n fileKind: \"pdf\",\n analyzer: \"pdf\",\n status: \"ok\",\n kind: \"pdf\",\n text: \"\",\n pages: 0,\n metadata: { info: {} },\n warnings: [\"No document bytes were provided for analysis.\"],\n needsOCR: false,\n ocrUsed: false,\n };\n }\n const r = await extractTextFromPdf(data);\n return {\n fileKind: \"pdf\",\n analyzer: \"pdf\",\n status: \"ok\",\n kind: \"pdf\",\n text: r.text,\n pages: r.pages,\n metadata: { info: {} },\n warnings: r.warnings,\n needsOCR: false,\n ocrUsed: false,\n };\n }\n case \"docx\": {\n const data = await bytesFromDetectInput(resolved);\n if (data.byteLength === 0) {\n return {\n fileKind: \"docx\",\n analyzer: \"docx\",\n status: \"ok\",\n kind: \"docx\",\n 
text: r.text,\n html: \"\",\n warnings: r.warnings,\n };\n }\n case \"image\": {\n const data = await bytesFromDetectInput(resolved);\n if (data.byteLength === 0) {\n return {\n fileKind: \"image\",\n analyzer: \"image\",\n status: \"ok\",\n kind: \"image\",\n text: \"\",\n confidence: 0,\n ocrUsed: true,\n warnings: [\"No image bytes were provided for analysis.\"],\n };\n }\n const ocrOpts = {\n ...(options?.ocr ?? {}),\n signal: options?.ocr?.signal ?? signal,\n };\n const r = await ocr(data, ocrOpts);\n return {\n fileKind: \"image\",\n analyzer: \"image\",\n status: \"ok\",\n kind: \"image\",\n text: r.text,\n confidence: r.confidence,\n ocrUsed: r.ocrUsed,\n warnings: [],\n };\n }\n case \"text\":\n return analyzeText(resolved as DetectFileKindInput, { signal });\n default:\n return notImplementedResult(kind, \"none\", [UNKNOWN_FORMAT_WARNING]);\n }\n}\n\n/**\n * Metadatos: PDF → `extractPdfMetadata`; resto con mejor esfuerzo o aviso.\n * El resultado sigue siendo `AnalysisResult` (PDF rellena `metadata` en forma `PdfAnalysisCoreResult`).\n */\nexport async function extractMetadata(\n input: NodeAnalyzeInput,\n options?: NodeAnalyzeOptions,\n): Promise<AnalysisResult> {\n throwIfAborted(options?.signal);\n const resolved = await prepare(input);\n const kind = detectFileKind(resolved);\n const signal = options?.signal;\n\n switch (kind) {\n case \"pdf\": {\n const data = await bytesFromDetectInput(resolved);\n if (data.byteLength === 0) {\n return {\n fileKind: \"pdf\",\n analyzer: \"pdf\",\n status: \"ok\",\n kind: \"pdf\",\n text: \"\",\n pages: 0,\n metadata: { info: {} },\n warnings: [\"No document bytes were provided for analysis.\"],\n needsOCR: false,\n ocrUsed: false,\n };\n }\n const r = await extractPdfMetadata(data);\n return {\n fileKind: \"pdf\",\n analyzer: \"pdf\",\n status: \"ok\",\n kind: \"pdf\",\n text: \"\",\n pages: 0,\n metadata: r.metadata,\n warnings: r.warnings,\n needsOCR: false,\n ocrUsed: false,\n };\n }\n case \"docx\":\n return {\n 
fileKind: \"docx\",\n analyzer: \"docx\",\n status: \"ok\",\n kind: \"docx\",\n text: \"\",\n html: \"\",\n warnings: [DOCX_METADATA_STUB],\n };\n case \"image\":\n return {\n fileKind: \"image\",\n analyzer: \"image\",\n status: \"ok\",\n kind: \"image\",\n text: \"\",\n confidence: 0,\n ocrUsed: true,\n warnings: [IMAGE_METADATA_NOTE],\n };\n case \"text\":\n return analyzeText(resolved as DetectFileKindInput, { signal });\n default:\n return notImplementedResult(kind, \"none\", [UNKNOWN_FORMAT_WARNING]);\n }\n}\n\n/**\n * HTML: DOCX → `convertDocxToHtml`; PDF/texto → `<pre>` a partir de texto extraído;\n * imágenes → vacío con aviso.\n */\nexport async function convertToHtml(\n input: NodeAnalyzeInput,\n options?: NodeAnalyzeOptions,\n): Promise<AnalysisResult> {\n throwIfAborted(options?.signal);\n const resolved = await prepare(input);\n const kind = detectFileKind(resolved);\n const signal = options?.signal;\n\n switch (kind) {\n case \"docx\": {\n const data = await bytesFromDetectInput(resolved);\n if (data.byteLength === 0) {\n return {\n fileKind: \"docx\",\n analyzer: \"docx\",\n status: \"ok\",\n kind: \"docx\",\n text: \"\",\n html: \"\",\n warnings: [\"No document bytes were provided for analysis.\"],\n };\n }\n const [textPart, htmlPart] = await Promise.all([\n extractTextFromDocx(data),\n convertDocxToHtml(data),\n ]);\n return {\n fileKind: \"docx\",\n analyzer: \"docx\",\n status: \"ok\",\n kind: \"docx\",\n text: textPart.text,\n html: htmlPart.html,\n warnings: [...textPart.warnings, ...htmlPart.warnings],\n };\n }\n case \"pdf\": {\n const data = await bytesFromDetectInput(resolved);\n if (data.byteLength === 0) {\n return {\n fileKind: \"pdf\",\n analyzer: \"pdf\",\n status: \"ok\",\n kind: \"pdf\",\n text: \"\",\n pages: 0,\n metadata: { info: {} },\n warnings: [\"No document bytes were provided for analysis.\"],\n needsOCR: false,\n ocrUsed: false,\n };\n }\n const r = await extractTextFromPdf(data);\n const html = 
`<pre>${escapeHtmlMinimal(r.text)}</pre>`;\n return {\n fileKind: \"pdf\",\n analyzer: \"pdf\",\n status: \"ok\",\n kind: \"pdf\",\n text: r.text,\n pages: r.pages,\n metadata: { info: {} },\n html,\n warnings: [\n ...r.warnings,\n \"PDF HTML is a plain-text preview wrapped in <pre> (not a visual layout).\",\n ],\n needsOCR: false,\n ocrUsed: false,\n } as AnalysisResult;\n }\n case \"text\": {\n const t = await analyzeText(resolved as DetectFileKindInput, { signal });\n const html = `<pre>${escapeHtmlMinimal(t.text)}</pre>`;\n return {\n ...t,\n html,\n warnings: [\n ...t.warnings,\n \"HTML for plain text is a <pre> wrapper around decoded UTF-8 content.\",\n ],\n } as AnalysisResult;\n }\n case \"image\":\n return {\n fileKind: \"image\",\n analyzer: \"image\",\n status: \"ok\",\n kind: \"image\",\n text: \"\",\n confidence: 0,\n ocrUsed: true,\n warnings: [\"No HTML representation for raster images; use extractText / runOcr.\"],\n };\n default:\n return notImplementedResult(kind, \"none\", [UNKNOWN_FORMAT_WARNING]);\n }\n}\n\n/**\n * OCR: PDF → `analyzePdf` con `ocr: \"force\"`; imagen → `ocr`; DOCX → texto estructurado con aviso\n * (sin OCR); texto → `analyzeText`.\n */\nexport async function runOcr(\n input: NodeAnalyzeInput,\n options?: NodeAnalyzeOptions,\n): Promise<AnalysisResult> {\n throwIfAborted(options?.signal);\n const resolved = await prepare(input);\n const kind = detectFileKind(resolved);\n const signal = options?.signal;\n const lang = options?.ocr?.langs ?? options?.pdf?.ocrLangs;\n\n switch (kind) {\n case \"pdf\": {\n const data = await bytesFromDetectInput(resolved);\n if (data.byteLength === 0) {\n return {\n fileKind: \"pdf\",\n analyzer: \"pdf\",\n status: \"ok\",\n kind: \"pdf\",\n text: \"\",\n pages: 0,\n metadata: { info: {} },\n warnings: [\"No document bytes were provided for analysis.\"],\n needsOCR: false,\n ocrUsed: false,\n };\n }\n const r = await analyzePdf(data, {\n ...options?.pdf,\n ocr: \"force\",\n ocrLangs: lang ?? 
options?.pdf?.ocrLangs,\n signal: options?.pdf?.signal ?? signal,\n });\n return {\n fileKind: \"pdf\",\n analyzer: \"pdf\",\n status: \"ok\",\n kind: \"pdf\",\n text: r.text,\n pages: r.pages,\n metadata: r.metadata,\n warnings: [...r.warnings],\n needsOCR: r.needsOCR,\n ocrUsed: r.ocrUsed,\n };\n }\n case \"image\": {\n const data = await bytesFromDetectInput(resolved);\n if (data.byteLength === 0) {\n return {\n fileKind: \"image\",\n analyzer: \"image\",\n status: \"ok\",\n kind: \"image\",\n text: \"\",\n confidence: 0,\n ocrUsed: true,\n warnings: [\"No image bytes were provided for analysis.\"],\n };\n }\n const ocrOpts = {\n ...(options?.ocr ?? {}),\n signal: options?.ocr?.signal ?? signal,\n };\n const r = await ocr(data, ocrOpts);\n return {\n fileKind: \"image\",\n analyzer: \"image\",\n status: \"ok\",\n kind: \"image\",\n text: r.text,\n confidence: r.confidence,\n ocrUsed: r.ocrUsed,\n warnings: [],\n };\n }\n case \"docx\": {\n const data = await bytesFromDetectInput(resolved);\n if (data.byteLength === 0) {\n return {\n fileKind: \"docx\",\n analyzer: \"docx\",\n status: \"ok\",\n kind: \"docx\",\n text: \"\",\n html: \"\",\n warnings: [\"No document bytes were provided for analysis.\"],\n };\n }\n const r = await analyzeDocx(data);\n return {\n fileKind: \"docx\",\n analyzer: \"docx\",\n status: \"ok\",\n kind: \"docx\",\n text: r.text,\n html: r.html,\n warnings: [\n ...r.warnings,\n \"OCR does not apply to DOCX; returned structured text/HTML extract.\",\n ],\n };\n }\n case \"text\":\n return analyzeText(resolved as DetectFileKindInput, { signal });\n default:\n return notImplementedResult(kind, \"none\", [UNKNOWN_FORMAT_WARNING]);\n }\n}\n"]}
|