@dragon708/docmind-node 1.1.0 → 1.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.d.ts +82 -15
- package/dist/index.js +523 -182
- package/package.json +5 -4
- package/dist/index.js.map +0 -1
package/dist/index.d.ts
CHANGED
|
@@ -1,11 +1,16 @@
|
|
|
1
|
-
import { DetectFileKindInput, NamedInput, AnalysisResult } from '@dragon708/docmind-shared';
|
|
2
|
-
export { AnalysisAnalyzer, AnalysisResult, DetectFileKindInput, DocxAnalysisCoreResult, FileKind, FileKindMetadata, GenericAnalysisResult, ImageAnalysisCoreResult, PdfAnalysisCoreResult, TextAnalysisResult, detectFileKind } from '@dragon708/docmind-shared';
|
|
1
|
+
import { DocMindAnalyzeOptions, DetectFileKindInput, NamedInput, AnalysisResult, FileKind, RuntimeDescriptor, DocMindPublicIntent, AnalysisAnalyzer, ProcessingPlanDescriptor, ExplainAnalysisPlanOptions, GetCapabilitiesOptions } from '@dragon708/docmind-shared';
|
|
2
|
+
export { AnalysisAnalyzer, AnalysisResult, CapabilityDescriptor, DetectFileKindInput, DocMindPublicIntent, DocxAnalysisCoreResult, ExplainAnalysisPlanOptions, ExplainAnalysisPlanResult, FileKind, FileKindMetadata, GenericAnalysisResult, GetCapabilitiesOptions, GetCapabilitiesResult, ImageAnalysisCoreResult, PdfAnalysisCoreResult, TextAnalysisResult, detectFileKind } from '@dragon708/docmind-shared';
|
|
3
3
|
import { OcrOptions } from '@dragon708/docmind-ocr';
|
|
4
4
|
import { PdfAnalyzeOptions } from '@dragon708/docmind-pdf';
|
|
5
5
|
|
|
6
|
-
/**
|
|
7
|
-
|
|
8
|
-
|
|
6
|
+
/**
|
|
7
|
+
* Options for Node public APIs (`analyzeFile`, intent methods).
|
|
8
|
+
*
|
|
9
|
+
* - **`pdf`**: forwarded to `@dragon708/docmind-pdf`. `analyzeFile` defaults `pdf.ocr` to `"auto"` when omitted.
|
|
10
|
+
* {@link extractText} / {@link convertToHtml} merge a default of `ocr: "off"` unless you set `pdf.ocr` explicitly.
|
|
11
|
+
* - **`ocr`**: forwarded to `@dragon708/docmind-ocr` for raster images; language string also feeds PDF OCR when `pdf.ocrLangs` is unset.
|
|
12
|
+
*/
|
|
13
|
+
interface NodeAnalyzeOptions extends DocMindAnalyzeOptions {
|
|
9
14
|
readonly pdf?: PdfAnalyzeOptions;
|
|
10
15
|
readonly ocr?: OcrOptions;
|
|
11
16
|
}
|
|
@@ -27,29 +32,91 @@ declare function bufferToInput(buffer: Buffer, name?: string): NamedInput<Buffer
|
|
|
27
32
|
declare function resolveNodeAnalyzeInput(input: NodeAnalyzeInput): Promise<DetectFileKindInput>;
|
|
28
33
|
|
|
29
34
|
/**
|
|
30
|
-
*
|
|
35
|
+
* Resolves {@link NodeAnalyzeInput} (paths read from disk), classifies with {@link detectFileKind}, then runs
|
|
36
|
+
* the PDF, DOCX, image, or text pipeline. PDF OCR defaults to `"auto"` when `options.pdf.ocr` is omitted.
|
|
31
37
|
*/
|
|
32
38
|
declare function analyzeFile(input: NodeAnalyzeInput, options?: NodeAnalyzeOptions): Promise<AnalysisResult>;
|
|
33
39
|
|
|
34
40
|
/**
|
|
35
|
-
*
|
|
36
|
-
*
|
|
41
|
+
* Plain-text extraction using {@link analyzeFile} routing. PDFs default to **text layer only**
|
|
42
|
+
* (`pdf.ocr: "off"`) unless you set `options.pdf.ocr` explicitly.
|
|
37
43
|
*/
|
|
38
44
|
declare function extractText(input: NodeAnalyzeInput, options?: NodeAnalyzeOptions): Promise<AnalysisResult>;
|
|
39
45
|
/**
|
|
40
|
-
*
|
|
41
|
-
*
|
|
46
|
+
* Metadata: PDF uses lightweight metadata extraction; DOCX/images return stubs; plain text uses the same
|
|
47
|
+
* router as {@link extractText} (`analyzeFile` with PDF OCR off by default).
|
|
42
48
|
*/
|
|
43
49
|
declare function extractMetadata(input: NodeAnalyzeInput, options?: NodeAnalyzeOptions): Promise<AnalysisResult>;
|
|
44
50
|
/**
|
|
45
|
-
* HTML: DOCX
|
|
46
|
-
*
|
|
51
|
+
* HTML: DOCX and plain text go through {@link analyzeFile} (then `<pre>` for text). PDF uses the text layer
|
|
52
|
+
* only wrapped in `<pre>` (no OCR). Images return a stub without running OCR.
|
|
47
53
|
*/
|
|
48
54
|
declare function convertToHtml(input: NodeAnalyzeInput, options?: NodeAnalyzeOptions): Promise<AnalysisResult>;
|
|
49
55
|
/**
|
|
50
|
-
* OCR: PDF
|
|
51
|
-
*
|
|
56
|
+
* OCR intent: PDF always runs {@link analyzePdf} with `ocr: "force"` (merged with `options.pdf`).
|
|
57
|
+
* Raster images run Tesseract via `options.ocr`. DOCX returns structured extract with a notice.
|
|
52
58
|
*/
|
|
53
59
|
declare function runOcr(input: NodeAnalyzeInput, options?: NodeAnalyzeOptions): Promise<AnalysisResult>;
|
|
54
60
|
|
|
55
|
-
|
|
61
|
+
/**
|
|
62
|
+
* Whether DocMind will try a non-OCR text/HTML path (e.g. Mammoth, pdf-parse text layer, UTF-8).
|
|
63
|
+
*/
|
|
64
|
+
interface NativeExtractionPlan {
|
|
65
|
+
readonly willAttempt: boolean;
|
|
66
|
+
readonly description: string;
|
|
67
|
+
}
|
|
68
|
+
/** Whether OCR (raster or PDF pipeline) may run for this intent + kind. */
|
|
69
|
+
interface OcrPlan {
|
|
70
|
+
readonly mayUse: boolean;
|
|
71
|
+
readonly description: string;
|
|
72
|
+
}
|
|
73
|
+
/**
|
|
74
|
+
* Structured explanation of what DocMind would do for a public intent in Node (no heavy I/O).
|
|
75
|
+
*/
|
|
76
|
+
interface ExplainAnalysisPlanReport {
|
|
77
|
+
readonly kind: FileKind;
|
|
78
|
+
readonly detectedKind: FileKind;
|
|
79
|
+
readonly runtime: RuntimeDescriptor;
|
|
80
|
+
readonly intent: DocMindPublicIntent | (string & {});
|
|
81
|
+
readonly primaryAnalyzer: AnalysisAnalyzer;
|
|
82
|
+
readonly nativeExtraction: NativeExtractionPlan;
|
|
83
|
+
readonly ocr: OcrPlan;
|
|
84
|
+
readonly limitations: readonly string[];
|
|
85
|
+
readonly plan: ProcessingPlanDescriptor;
|
|
86
|
+
readonly warnings?: readonly string[];
|
|
87
|
+
}
|
|
88
|
+
|
|
89
|
+
/** High-level features the user can ask DocMind for (per input kind and runtime). */
|
|
90
|
+
type PublicCapabilityId = "text" | "metadata" | "html" | "ocr" | "pages";
|
|
91
|
+
/** Whether a {@link PublicCapabilityId} applies to the detected file in this runtime. */
|
|
92
|
+
interface PublicCapabilitySupport {
|
|
93
|
+
readonly id: PublicCapabilityId;
|
|
94
|
+
readonly supported: boolean;
|
|
95
|
+
readonly warnings?: readonly string[];
|
|
96
|
+
}
|
|
97
|
+
/**
|
|
98
|
+
* Result of {@link getCapabilities}: detected kind, runtime id, per-feature support for this input, and optional global warnings.
|
|
99
|
+
*/
|
|
100
|
+
interface GetCapabilitiesReport {
|
|
101
|
+
readonly kind: FileKind;
|
|
102
|
+
readonly runtime: RuntimeDescriptor;
|
|
103
|
+
readonly capabilities: readonly PublicCapabilitySupport[];
|
|
104
|
+
readonly warnings?: readonly string[];
|
|
105
|
+
}
|
|
106
|
+
|
|
107
|
+
/** Options for {@link explainAnalysisPlan} including PDF/OCR hints for accurate planning. */
|
|
108
|
+
type NodeExplainAnalysisPlanOptions = ExplainAnalysisPlanOptions & Pick<NodeAnalyzeOptions, "pdf" | "ocr">;
|
|
109
|
+
|
|
110
|
+
/**
|
|
111
|
+
* Epic 1 — **Capabilities:** after resolving {@link NodeAnalyzeInput}, lists which of
|
|
112
|
+
* `text` | `metadata` | `html` | `ocr` | `pages` apply for that kind in Node (PDF fully supported).
|
|
113
|
+
* Does not run Mammoth/Tesseract/PDF bodies beyond path resolution.
|
|
114
|
+
*/
|
|
115
|
+
declare function getCapabilities(input: NodeAnalyzeInput, options?: GetCapabilitiesOptions): Promise<GetCapabilitiesReport>;
|
|
116
|
+
/**
|
|
117
|
+
* Epic 1 — **Plan preview:** same shape as browser; PDF branches include `pdf.ocr` from options (`off` | `auto` | `force`).
|
|
118
|
+
* No full document parse unless resolving a path reads the file.
|
|
119
|
+
*/
|
|
120
|
+
declare function explainAnalysisPlan(input: NodeAnalyzeInput, options?: NodeExplainAnalysisPlanOptions): Promise<ExplainAnalysisPlanReport>;
|
|
121
|
+
|
|
122
|
+
export { type ExplainAnalysisPlanReport, type GetCapabilitiesReport, type NativeExtractionPlan, type NodeAnalyzeInput, type NodeAnalyzeOptions, type NodeExplainAnalysisPlanOptions, type OcrPlan, type PublicCapabilityId, type PublicCapabilitySupport, analyzeFile, bufferToInput, convertToHtml, explainAnalysisPlan, extractMetadata, extractText, getCapabilities, readFileToInput, resolveNodeAnalyzeInput, runOcr };
|
package/dist/index.js
CHANGED
|
@@ -1,8 +1,8 @@
|
|
|
1
1
|
import { assertValidAnalyzeFileInput, detectFileKind, notImplementedResult, UNKNOWN_FORMAT_WARNING, analyzeText, toUint8Array, isNamedInput, isBinaryInput, isBlob, isFile } from '@dragon708/docmind-shared';
|
|
2
2
|
export { detectFileKind } from '@dragon708/docmind-shared';
|
|
3
|
-
import {
|
|
3
|
+
import { analyzeDocx } from '@dragon708/docmind-docx';
|
|
4
4
|
import { ocr } from '@dragon708/docmind-ocr';
|
|
5
|
-
import {
|
|
5
|
+
import { extractPdfMetadata, extractTextFromPdf, analyzePdf } from '@dragon708/docmind-pdf';
|
|
6
6
|
import { readFile } from 'fs/promises';
|
|
7
7
|
import { basename } from 'path';
|
|
8
8
|
import { fileURLToPath } from 'url';
|
|
@@ -169,6 +169,19 @@ async function analyzeFile(input, options) {
|
|
|
169
169
|
return notImplementedResult(fileKind, "none", [UNKNOWN_FORMAT_WARNING]);
|
|
170
170
|
}
|
|
171
171
|
}
|
|
172
|
+
|
|
173
|
+
// src/intentPdfOptions.ts
|
|
174
|
+
function withPdfOcrDefaultOff(options) {
|
|
175
|
+
return {
|
|
176
|
+
...options,
|
|
177
|
+
pdf: {
|
|
178
|
+
...options?.pdf,
|
|
179
|
+
ocr: options?.pdf?.ocr ?? "off"
|
|
180
|
+
}
|
|
181
|
+
};
|
|
182
|
+
}
|
|
183
|
+
|
|
184
|
+
// src/internal/abort.ts
|
|
172
185
|
function throwIfAborted(signal) {
|
|
173
186
|
if (signal?.aborted) {
|
|
174
187
|
const err = new Error("The operation was aborted");
|
|
@@ -176,117 +189,36 @@ function throwIfAborted(signal) {
|
|
|
176
189
|
throw err;
|
|
177
190
|
}
|
|
178
191
|
}
|
|
192
|
+
|
|
193
|
+
// src/publicActions.ts
|
|
194
|
+
var DOCX_METADATA_STUB = "Structured document metadata for DOCX is not exposed as a separate API; use extractText or analyzeFile.";
|
|
195
|
+
var IMAGE_METADATA_NOTE = "Raster images have no document metadata bundle in this API.";
|
|
196
|
+
var RUN_OCR_PDF_FORCE_SEMANTICS = 'runOcr: PDF pipeline ran with `ocr: "force"` so text may include raster OCR output even when a text layer exists.';
|
|
197
|
+
function escapeHtmlMinimal(s) {
|
|
198
|
+
return s.replace(/&/g, "&amp;").replace(/</g, "&lt;").replace(/>/g, "&gt;").replace(/"/g, "&quot;");
|
|
199
|
+
}
|
|
179
200
|
async function prepare(input) {
|
|
180
201
|
const resolved = await resolveNodeAnalyzeInput(input);
|
|
181
202
|
assertValidAnalyzeFileInput(resolved);
|
|
182
203
|
return resolved;
|
|
183
204
|
}
|
|
184
|
-
function
|
|
185
|
-
|
|
205
|
+
function toExtractTextResult(full) {
|
|
206
|
+
if (full.status !== "ok") return full;
|
|
207
|
+
if (full.fileKind === "docx") {
|
|
208
|
+
return { ...full, html: "" };
|
|
209
|
+
}
|
|
210
|
+
return full;
|
|
186
211
|
}
|
|
187
|
-
var DOCX_METADATA_STUB = "Structured document metadata for DOCX is not exposed as a separate API; use extractText or analyzeFile.";
|
|
188
|
-
var IMAGE_METADATA_NOTE = "Raster images have no document metadata bundle in this API.";
|
|
189
212
|
async function extractText(input, options) {
|
|
190
213
|
throwIfAborted(options?.signal);
|
|
191
|
-
const
|
|
192
|
-
|
|
193
|
-
const signal = options?.signal;
|
|
194
|
-
switch (kind) {
|
|
195
|
-
case "pdf": {
|
|
196
|
-
const data = await bytesFromDetectInput(resolved);
|
|
197
|
-
if (data.byteLength === 0) {
|
|
198
|
-
return {
|
|
199
|
-
fileKind: "pdf",
|
|
200
|
-
analyzer: "pdf",
|
|
201
|
-
status: "ok",
|
|
202
|
-
kind: "pdf",
|
|
203
|
-
text: "",
|
|
204
|
-
pages: 0,
|
|
205
|
-
metadata: { info: {} },
|
|
206
|
-
warnings: ["No document bytes were provided for analysis."],
|
|
207
|
-
needsOCR: false,
|
|
208
|
-
ocrUsed: false
|
|
209
|
-
};
|
|
210
|
-
}
|
|
211
|
-
const r = await extractTextFromPdf(data);
|
|
212
|
-
return {
|
|
213
|
-
fileKind: "pdf",
|
|
214
|
-
analyzer: "pdf",
|
|
215
|
-
status: "ok",
|
|
216
|
-
kind: "pdf",
|
|
217
|
-
text: r.text,
|
|
218
|
-
pages: r.pages,
|
|
219
|
-
metadata: { info: {} },
|
|
220
|
-
warnings: r.warnings,
|
|
221
|
-
needsOCR: false,
|
|
222
|
-
ocrUsed: false
|
|
223
|
-
};
|
|
224
|
-
}
|
|
225
|
-
case "docx": {
|
|
226
|
-
const data = await bytesFromDetectInput(resolved);
|
|
227
|
-
if (data.byteLength === 0) {
|
|
228
|
-
return {
|
|
229
|
-
fileKind: "docx",
|
|
230
|
-
analyzer: "docx",
|
|
231
|
-
status: "ok",
|
|
232
|
-
kind: "docx",
|
|
233
|
-
text: "",
|
|
234
|
-
html: "",
|
|
235
|
-
warnings: ["No document bytes were provided for analysis."]
|
|
236
|
-
};
|
|
237
|
-
}
|
|
238
|
-
const r = await extractTextFromDocx(data);
|
|
239
|
-
return {
|
|
240
|
-
fileKind: "docx",
|
|
241
|
-
analyzer: "docx",
|
|
242
|
-
status: "ok",
|
|
243
|
-
kind: "docx",
|
|
244
|
-
text: r.text,
|
|
245
|
-
html: "",
|
|
246
|
-
warnings: r.warnings
|
|
247
|
-
};
|
|
248
|
-
}
|
|
249
|
-
case "image": {
|
|
250
|
-
const data = await bytesFromDetectInput(resolved);
|
|
251
|
-
if (data.byteLength === 0) {
|
|
252
|
-
return {
|
|
253
|
-
fileKind: "image",
|
|
254
|
-
analyzer: "image",
|
|
255
|
-
status: "ok",
|
|
256
|
-
kind: "image",
|
|
257
|
-
text: "",
|
|
258
|
-
confidence: 0,
|
|
259
|
-
ocrUsed: true,
|
|
260
|
-
warnings: ["No image bytes were provided for analysis."]
|
|
261
|
-
};
|
|
262
|
-
}
|
|
263
|
-
const ocrOpts = {
|
|
264
|
-
...options?.ocr ?? {},
|
|
265
|
-
signal: options?.ocr?.signal ?? signal
|
|
266
|
-
};
|
|
267
|
-
const r = await ocr(data, ocrOpts);
|
|
268
|
-
return {
|
|
269
|
-
fileKind: "image",
|
|
270
|
-
analyzer: "image",
|
|
271
|
-
status: "ok",
|
|
272
|
-
kind: "image",
|
|
273
|
-
text: r.text,
|
|
274
|
-
confidence: r.confidence,
|
|
275
|
-
ocrUsed: r.ocrUsed,
|
|
276
|
-
warnings: []
|
|
277
|
-
};
|
|
278
|
-
}
|
|
279
|
-
case "text":
|
|
280
|
-
return analyzeText(resolved, { signal });
|
|
281
|
-
default:
|
|
282
|
-
return notImplementedResult(kind, "none", [UNKNOWN_FORMAT_WARNING]);
|
|
283
|
-
}
|
|
214
|
+
const full = await analyzeFile(input, withPdfOcrDefaultOff(options));
|
|
215
|
+
return toExtractTextResult(full);
|
|
284
216
|
}
|
|
285
217
|
async function extractMetadata(input, options) {
|
|
286
218
|
throwIfAborted(options?.signal);
|
|
287
219
|
const resolved = await prepare(input);
|
|
288
220
|
const kind = detectFileKind(resolved);
|
|
289
|
-
|
|
221
|
+
options?.signal;
|
|
290
222
|
switch (kind) {
|
|
291
223
|
case "pdf": {
|
|
292
224
|
const data = await bytesFromDetectInput(resolved);
|
|
@@ -340,7 +272,7 @@ async function extractMetadata(input, options) {
|
|
|
340
272
|
warnings: [IMAGE_METADATA_NOTE]
|
|
341
273
|
};
|
|
342
274
|
case "text":
|
|
343
|
-
return
|
|
275
|
+
return analyzeFile(input, withPdfOcrDefaultOff(options));
|
|
344
276
|
default:
|
|
345
277
|
return notImplementedResult(kind, "none", [UNKNOWN_FORMAT_WARNING]);
|
|
346
278
|
}
|
|
@@ -349,96 +281,72 @@ async function convertToHtml(input, options) {
|
|
|
349
281
|
throwIfAborted(options?.signal);
|
|
350
282
|
const resolved = await prepare(input);
|
|
351
283
|
const kind = detectFileKind(resolved);
|
|
352
|
-
|
|
353
|
-
|
|
354
|
-
|
|
355
|
-
|
|
356
|
-
|
|
357
|
-
|
|
358
|
-
|
|
359
|
-
|
|
360
|
-
|
|
361
|
-
|
|
362
|
-
|
|
363
|
-
|
|
364
|
-
|
|
365
|
-
|
|
366
|
-
|
|
367
|
-
|
|
368
|
-
|
|
369
|
-
|
|
370
|
-
|
|
371
|
-
|
|
372
|
-
|
|
373
|
-
analyzer: "docx",
|
|
374
|
-
status: "ok",
|
|
375
|
-
kind: "docx",
|
|
376
|
-
text: textPart.text,
|
|
377
|
-
html: htmlPart.html,
|
|
378
|
-
warnings: [...textPart.warnings, ...htmlPart.warnings]
|
|
379
|
-
};
|
|
380
|
-
}
|
|
381
|
-
case "pdf": {
|
|
382
|
-
const data = await bytesFromDetectInput(resolved);
|
|
383
|
-
if (data.byteLength === 0) {
|
|
384
|
-
return {
|
|
385
|
-
fileKind: "pdf",
|
|
386
|
-
analyzer: "pdf",
|
|
387
|
-
status: "ok",
|
|
388
|
-
kind: "pdf",
|
|
389
|
-
text: "",
|
|
390
|
-
pages: 0,
|
|
391
|
-
metadata: { info: {} },
|
|
392
|
-
warnings: ["No document bytes were provided for analysis."],
|
|
393
|
-
needsOCR: false,
|
|
394
|
-
ocrUsed: false
|
|
395
|
-
};
|
|
396
|
-
}
|
|
397
|
-
const r = await extractTextFromPdf(data);
|
|
398
|
-
const html = `<pre>${escapeHtmlMinimal(r.text)}</pre>`;
|
|
284
|
+
options?.signal;
|
|
285
|
+
if (kind === "docx") {
|
|
286
|
+
return analyzeFile(input, withPdfOcrDefaultOff(options));
|
|
287
|
+
}
|
|
288
|
+
if (kind === "text") {
|
|
289
|
+
const r = await analyzeFile(input, withPdfOcrDefaultOff(options));
|
|
290
|
+
if (r.status !== "ok") return r;
|
|
291
|
+
if (r.fileKind !== "text") return r;
|
|
292
|
+
const html = `<pre>${escapeHtmlMinimal(r.text)}</pre>`;
|
|
293
|
+
return {
|
|
294
|
+
...r,
|
|
295
|
+
html,
|
|
296
|
+
warnings: [
|
|
297
|
+
...r.warnings,
|
|
298
|
+
"HTML for plain text is a <pre> wrapper around decoded UTF-8 content."
|
|
299
|
+
]
|
|
300
|
+
};
|
|
301
|
+
}
|
|
302
|
+
if (kind === "pdf") {
|
|
303
|
+
const data = await bytesFromDetectInput(resolved);
|
|
304
|
+
if (data.byteLength === 0) {
|
|
399
305
|
return {
|
|
400
306
|
fileKind: "pdf",
|
|
401
307
|
analyzer: "pdf",
|
|
402
308
|
status: "ok",
|
|
403
309
|
kind: "pdf",
|
|
404
|
-
text:
|
|
405
|
-
pages:
|
|
310
|
+
text: "",
|
|
311
|
+
pages: 0,
|
|
406
312
|
metadata: { info: {} },
|
|
407
|
-
|
|
408
|
-
warnings: [
|
|
409
|
-
...r.warnings,
|
|
410
|
-
"PDF HTML is a plain-text preview wrapped in <pre> (not a visual layout)."
|
|
411
|
-
],
|
|
313
|
+
warnings: ["No document bytes were provided for analysis."],
|
|
412
314
|
needsOCR: false,
|
|
413
315
|
ocrUsed: false
|
|
414
316
|
};
|
|
415
317
|
}
|
|
416
|
-
|
|
417
|
-
|
|
418
|
-
|
|
419
|
-
|
|
420
|
-
|
|
421
|
-
|
|
422
|
-
|
|
423
|
-
|
|
424
|
-
|
|
425
|
-
|
|
426
|
-
|
|
427
|
-
|
|
428
|
-
|
|
429
|
-
|
|
430
|
-
|
|
431
|
-
|
|
432
|
-
|
|
433
|
-
|
|
434
|
-
|
|
435
|
-
|
|
436
|
-
|
|
437
|
-
|
|
438
|
-
|
|
439
|
-
|
|
440
|
-
|
|
318
|
+
const r = await extractTextFromPdf(data);
|
|
319
|
+
const html = `<pre>${escapeHtmlMinimal(r.text)}</pre>`;
|
|
320
|
+
return {
|
|
321
|
+
fileKind: "pdf",
|
|
322
|
+
analyzer: "pdf",
|
|
323
|
+
status: "ok",
|
|
324
|
+
kind: "pdf",
|
|
325
|
+
text: r.text,
|
|
326
|
+
pages: r.pages,
|
|
327
|
+
metadata: { info: {} },
|
|
328
|
+
html,
|
|
329
|
+
warnings: [
|
|
330
|
+
...r.warnings,
|
|
331
|
+
"PDF HTML is a plain-text preview wrapped in <pre> (not a visual layout)."
|
|
332
|
+
],
|
|
333
|
+
needsOCR: false,
|
|
334
|
+
ocrUsed: false
|
|
335
|
+
};
|
|
336
|
+
}
|
|
337
|
+
if (kind === "image") {
|
|
338
|
+
return {
|
|
339
|
+
fileKind: "image",
|
|
340
|
+
analyzer: "image",
|
|
341
|
+
status: "ok",
|
|
342
|
+
kind: "image",
|
|
343
|
+
text: "",
|
|
344
|
+
confidence: 0,
|
|
345
|
+
ocrUsed: true,
|
|
346
|
+
warnings: ["No HTML representation for raster images; use extractText / runOcr."]
|
|
347
|
+
};
|
|
441
348
|
}
|
|
349
|
+
return notImplementedResult(kind, "none", [UNKNOWN_FORMAT_WARNING]);
|
|
442
350
|
}
|
|
443
351
|
async function runOcr(input, options) {
|
|
444
352
|
throwIfAborted(options?.signal);
|
|
@@ -477,7 +385,7 @@ async function runOcr(input, options) {
|
|
|
477
385
|
text: r.text,
|
|
478
386
|
pages: r.pages,
|
|
479
387
|
metadata: r.metadata,
|
|
480
|
-
warnings: [...r.warnings],
|
|
388
|
+
warnings: [RUN_OCR_PDF_FORCE_SEMANTICS, ...r.warnings],
|
|
481
389
|
needsOCR: r.needsOCR,
|
|
482
390
|
ocrUsed: r.ocrUsed
|
|
483
391
|
};
|
|
@@ -546,6 +454,439 @@ async function runOcr(input, options) {
|
|
|
546
454
|
}
|
|
547
455
|
}
|
|
548
456
|
|
|
549
|
-
|
|
457
|
+
// src/analysisPlanReport.ts
|
|
458
|
+
function lim(...items) {
|
|
459
|
+
return items.filter(Boolean);
|
|
460
|
+
}
|
|
461
|
+
function buildNodeExplainReport(kind, intent, pdfOcr, plan) {
|
|
462
|
+
const runtime = { id: "node" };
|
|
463
|
+
const primaryAnalyzer = kind === "pdf" ? "pdf" : kind === "docx" ? "docx" : kind === "image" ? "image" : kind === "text" ? "text" : "none";
|
|
464
|
+
let nativeExtraction;
|
|
465
|
+
let ocr3;
|
|
466
|
+
let limitations = [];
|
|
467
|
+
if (kind === "unknown") {
|
|
468
|
+
limitations = lim(
|
|
469
|
+
"Could not classify the file from name, MIME, or bytes; analysis will return not_implemented until hints improve."
|
|
470
|
+
);
|
|
471
|
+
return {
|
|
472
|
+
kind,
|
|
473
|
+
detectedKind: kind,
|
|
474
|
+
runtime,
|
|
475
|
+
intent,
|
|
476
|
+
primaryAnalyzer: "none",
|
|
477
|
+
nativeExtraction: { willAttempt: false, description: "No analyzer without a known file kind." },
|
|
478
|
+
ocr: { mayUse: false, description: "OCR is not used for unknown kinds." },
|
|
479
|
+
limitations,
|
|
480
|
+
plan
|
|
481
|
+
};
|
|
482
|
+
}
|
|
483
|
+
switch (intent) {
|
|
484
|
+
case "analyzeFile":
|
|
485
|
+
if (kind === "pdf") {
|
|
486
|
+
nativeExtraction = {
|
|
487
|
+
willAttempt: true,
|
|
488
|
+
description: "pdf-parse extracts embedded text and page count first."
|
|
489
|
+
};
|
|
490
|
+
ocr3 = {
|
|
491
|
+
mayUse: pdfOcr !== "off",
|
|
492
|
+
description: pdfOcr === "off" ? "Raster OCR pipeline is off (pdf.ocr: off)." : pdfOcr === "force" ? "Raster OCR may run on all pages when pdf.ocr is force." : "Raster OCR may run when the text layer is empty (pdf.ocr: auto)."
|
|
493
|
+
};
|
|
494
|
+
} else if (kind === "docx") {
|
|
495
|
+
nativeExtraction = {
|
|
496
|
+
willAttempt: true,
|
|
497
|
+
description: "Mammoth extracts text and HTML from OOXML."
|
|
498
|
+
};
|
|
499
|
+
ocr3 = { mayUse: false, description: "DOCX does not use OCR in DocMind." };
|
|
500
|
+
} else if (kind === "image") {
|
|
501
|
+
nativeExtraction = {
|
|
502
|
+
willAttempt: false,
|
|
503
|
+
description: "Images have no native text layer; text comes from OCR only."
|
|
504
|
+
};
|
|
505
|
+
ocr3 = { mayUse: true, description: "Tesseract runs on supported raster formats." };
|
|
506
|
+
} else {
|
|
507
|
+
nativeExtraction = {
|
|
508
|
+
willAttempt: true,
|
|
509
|
+
description: "UTF-8 decode with BOM handling for plain text."
|
|
510
|
+
};
|
|
511
|
+
ocr3 = { mayUse: false, description: "OCR does not apply to text files." };
|
|
512
|
+
}
|
|
513
|
+
break;
|
|
514
|
+
case "extractText":
|
|
515
|
+
if (kind === "pdf") {
|
|
516
|
+
nativeExtraction = {
|
|
517
|
+
willAttempt: true,
|
|
518
|
+
description: "Text layer via pdf-parse; defaults to pdf.ocr off unless you override."
|
|
519
|
+
};
|
|
520
|
+
ocr3 = {
|
|
521
|
+
mayUse: false,
|
|
522
|
+
description: "extractText merges pdf.ocr default off \u2014 no raster OCR unless you set pdf.ocr explicitly."
|
|
523
|
+
};
|
|
524
|
+
} else if (kind === "docx") {
|
|
525
|
+
nativeExtraction = {
|
|
526
|
+
willAttempt: true,
|
|
527
|
+
description: "Mammoth plain text; HTML cleared in the extractText response."
|
|
528
|
+
};
|
|
529
|
+
ocr3 = { mayUse: false, description: "DOCX does not use OCR." };
|
|
530
|
+
} else if (kind === "image") {
|
|
531
|
+
nativeExtraction = { willAttempt: false, description: "No embedded text layer." };
|
|
532
|
+
ocr3 = { mayUse: true, description: "OCR produces text for images." };
|
|
533
|
+
} else {
|
|
534
|
+
nativeExtraction = {
|
|
535
|
+
willAttempt: true,
|
|
536
|
+
description: "UTF-8 decode only."
|
|
537
|
+
};
|
|
538
|
+
ocr3 = { mayUse: false, description: "OCR does not apply." };
|
|
539
|
+
}
|
|
540
|
+
break;
|
|
541
|
+
case "extractMetadata":
|
|
542
|
+
if (kind === "pdf") {
|
|
543
|
+
nativeExtraction = {
|
|
544
|
+
willAttempt: true,
|
|
545
|
+
description: "Lightweight PDF info/XMP normalization without full OCR."
|
|
546
|
+
};
|
|
547
|
+
ocr3 = { mayUse: false, description: "extractMetadata does not run the OCR pipeline." };
|
|
548
|
+
} else if (kind === "docx" || kind === "image") {
|
|
549
|
+
nativeExtraction = {
|
|
550
|
+
willAttempt: false,
|
|
551
|
+
description: "Stub response; no heavy extractor."
|
|
552
|
+
};
|
|
553
|
+
ocr3 = { mayUse: false, description: "OCR not used for this metadata path." };
|
|
554
|
+
limitations = lim(
|
|
555
|
+
kind === "docx" ? "Structured DOCX metadata is not exposed separately." : "Raster images have no document metadata bundle."
|
|
556
|
+
);
|
|
557
|
+
} else {
|
|
558
|
+
nativeExtraction = {
|
|
559
|
+
willAttempt: true,
|
|
560
|
+
description: "Decoded text only; no structured document metadata."
|
|
561
|
+
};
|
|
562
|
+
ocr3 = { mayUse: false, description: "OCR does not apply." };
|
|
563
|
+
limitations = lim("Plain text has no structured document metadata.");
|
|
564
|
+
}
|
|
565
|
+
break;
|
|
566
|
+
case "convertToHtml":
|
|
567
|
+
if (kind === "pdf") {
|
|
568
|
+
nativeExtraction = {
|
|
569
|
+
willAttempt: true,
|
|
570
|
+
description: "Text layer extracted then wrapped in <pre> (not visual layout)."
|
|
571
|
+
};
|
|
572
|
+
ocr3 = { mayUse: false, description: "convertToHtml does not run PDF OCR." };
|
|
573
|
+
limitations = lim("PDF HTML is a plain-text preview, not page layout.");
|
|
574
|
+
} else if (kind === "docx") {
|
|
575
|
+
nativeExtraction = {
|
|
576
|
+
willAttempt: true,
|
|
577
|
+
description: "Mammoth HTML output via analyzeFile routing."
|
|
578
|
+
};
|
|
579
|
+
ocr3 = { mayUse: false, description: "DOCX path does not use OCR." };
|
|
580
|
+
} else if (kind === "text") {
|
|
581
|
+
nativeExtraction = {
|
|
582
|
+
willAttempt: true,
|
|
583
|
+
description: "UTF-8 decode then <pre> wrapper."
|
|
584
|
+
};
|
|
585
|
+
ocr3 = { mayUse: false, description: "OCR does not apply." };
|
|
586
|
+
} else {
|
|
587
|
+
nativeExtraction = {
|
|
588
|
+
willAttempt: false,
|
|
589
|
+
description: "No HTML path for raster images."
|
|
590
|
+
};
|
|
591
|
+
ocr3 = { mayUse: false, description: "OCR does not emit layout HTML here." };
|
|
592
|
+
limitations = lim("Use extractText or runOcr for image text.");
|
|
593
|
+
}
|
|
594
|
+
break;
|
|
595
|
+
case "runOcr":
|
|
596
|
+
if (kind === "pdf") {
|
|
597
|
+
nativeExtraction = {
|
|
598
|
+
willAttempt: true,
|
|
599
|
+
description: "pdf-parse runs first; text may be replaced by raster OCR output."
|
|
600
|
+
};
|
|
601
|
+
ocr3 = {
|
|
602
|
+
mayUse: true,
|
|
603
|
+
description: 'runOcr always sets pdf.ocr to "force" for PDFs.'
|
|
604
|
+
};
|
|
605
|
+
limitations = lim("Forced OCR may run even when a text layer exists.");
|
|
606
|
+
} else if (kind === "image") {
|
|
607
|
+
nativeExtraction = { willAttempt: false, description: "No native text layer." };
|
|
608
|
+
ocr3 = { mayUse: true, description: "Tesseract OCR on the image bytes." };
|
|
609
|
+
} else if (kind === "docx") {
|
|
610
|
+
nativeExtraction = {
|
|
611
|
+
willAttempt: true,
|
|
612
|
+
description: "Full Mammoth extract (text + HTML); not OCR."
|
|
613
|
+
};
|
|
614
|
+
ocr3 = { mayUse: false, description: "DOCX is not OCR'd." };
|
|
615
|
+
limitations = lim("Result is structured extract, not OCR output.");
|
|
616
|
+
} else {
|
|
617
|
+
nativeExtraction = {
|
|
618
|
+
willAttempt: true,
|
|
619
|
+
description: "UTF-8 decode only."
|
|
620
|
+
};
|
|
621
|
+
ocr3 = { mayUse: false, description: "OCR does not apply to text files." };
|
|
622
|
+
}
|
|
623
|
+
break;
|
|
624
|
+
default:
|
|
625
|
+
nativeExtraction = { willAttempt: false, description: "Generic intent; see plan." };
|
|
626
|
+
ocr3 = { mayUse: false, description: "See plan steps." };
|
|
627
|
+
}
|
|
628
|
+
return {
|
|
629
|
+
kind,
|
|
630
|
+
detectedKind: kind,
|
|
631
|
+
runtime,
|
|
632
|
+
intent,
|
|
633
|
+
primaryAnalyzer,
|
|
634
|
+
nativeExtraction,
|
|
635
|
+
ocr: ocr3,
|
|
636
|
+
limitations,
|
|
637
|
+
plan
|
|
638
|
+
};
|
|
639
|
+
}
|
|
640
|
+
|
|
641
|
+
// src/capabilityReport.ts
|
|
642
|
+
var DOCX_META = "Structured document metadata is not exposed separately; extractMetadata returns a stub for DOCX.";
|
|
643
|
+
var IMAGE_META = "Raster images have no document metadata bundle; extractMetadata returns a stub.";
|
|
644
|
+
var IMAGE_HTML = "No layout HTML for raster images; use extractText or runOcr for text.";
|
|
645
|
+
var TEXT_META_NOTE = "Plain text has no structured document metadata; extractMetadata still returns decoded content.";
|
|
646
|
+
var UNKNOWN_KIND = "Could not determine file kind from name, MIME, or bytes; all features are reported as unsupported until the kind is known.";
|
|
647
|
+
function slot(id, supported, warnings) {
|
|
648
|
+
return warnings?.length ? { id, supported, warnings } : { id, supported };
|
|
649
|
+
}
|
|
650
|
+
function buildNodeCapabilityReport(kind) {
|
|
651
|
+
const runtime = { id: "node" };
|
|
652
|
+
let capabilities;
|
|
653
|
+
const topWarnings = [];
|
|
654
|
+
switch (kind) {
|
|
655
|
+
case "pdf":
|
|
656
|
+
capabilities = [
|
|
657
|
+
slot("text", true, ["Includes text layer extraction; use extractText options to avoid PDF OCR."]),
|
|
658
|
+
slot("metadata", true),
|
|
659
|
+
slot("html", true, ["HTML is a <pre> preview of extracted text, not visual layout."]),
|
|
660
|
+
slot("ocr", true, ["Raster OCR is available (e.g. analyzeFile with pdf.ocr auto/force, or runOcr)."]),
|
|
661
|
+
slot("pages", true)
|
|
662
|
+
];
|
|
663
|
+
break;
|
|
664
|
+
case "docx":
|
|
665
|
+
capabilities = [
|
|
666
|
+
slot("text", true),
|
|
667
|
+
slot("metadata", false, [DOCX_META]),
|
|
668
|
+
slot("html", true),
|
|
669
|
+
slot("ocr", false, ["OCR does not apply to DOCX in DocMind."]),
|
|
670
|
+
slot("pages", false)
|
|
671
|
+
];
|
|
672
|
+
break;
|
|
673
|
+
case "image":
|
|
674
|
+
capabilities = [
|
|
675
|
+
slot("text", true, ["Text is obtained via OCR."]),
|
|
676
|
+
slot("metadata", false, [IMAGE_META]),
|
|
677
|
+
slot("html", false, [IMAGE_HTML]),
|
|
678
|
+
slot("ocr", true),
|
|
679
|
+
slot("pages", false)
|
|
680
|
+
];
|
|
681
|
+
break;
|
|
682
|
+
case "text":
|
|
683
|
+
capabilities = [
|
|
684
|
+
slot("text", true),
|
|
685
|
+
slot("metadata", true, [TEXT_META_NOTE]),
|
|
686
|
+
slot("html", true),
|
|
687
|
+
slot("ocr", false, ["OCR does not apply to plain text files."]),
|
|
688
|
+
slot("pages", false)
|
|
689
|
+
];
|
|
690
|
+
break;
|
|
691
|
+
default:
|
|
692
|
+
topWarnings.push(UNKNOWN_KIND);
|
|
693
|
+
capabilities = [
|
|
694
|
+
slot("text", false),
|
|
695
|
+
slot("metadata", false),
|
|
696
|
+
slot("html", false),
|
|
697
|
+
slot("ocr", false),
|
|
698
|
+
slot("pages", false)
|
|
699
|
+
];
|
|
700
|
+
}
|
|
701
|
+
return {
|
|
702
|
+
kind,
|
|
703
|
+
runtime,
|
|
704
|
+
capabilities,
|
|
705
|
+
warnings: topWarnings.length > 0 ? topWarnings : void 0
|
|
706
|
+
};
|
|
707
|
+
}
|
|
708
|
+
|
|
709
|
+
// src/introspection.ts
|
|
710
|
+
function resolvePdfOcrMode(pdf) {
|
|
711
|
+
return pdf?.ocr ?? "auto";
|
|
712
|
+
}
|
|
713
|
+
function planAnalyzeFile(kind, pdfOcr) {
|
|
714
|
+
switch (kind) {
|
|
715
|
+
case "pdf":
|
|
716
|
+
return {
|
|
717
|
+
intent: "analyzeFile",
|
|
718
|
+
steps: [
|
|
719
|
+
{ id: "detect_kind", status: "done" },
|
|
720
|
+
{ id: "pdf_parse", status: "planned" },
|
|
721
|
+
{
|
|
722
|
+
id: "pdf_ocr",
|
|
723
|
+
status: pdfOcr === "off" ? "skipped" : "planned"
|
|
724
|
+
}
|
|
725
|
+
]
|
|
726
|
+
};
|
|
727
|
+
case "docx":
|
|
728
|
+
return {
|
|
729
|
+
intent: "analyzeFile",
|
|
730
|
+
steps: [
|
|
731
|
+
{ id: "detect_kind", status: "done" },
|
|
732
|
+
{ id: "docx_mammoth", status: "planned" }
|
|
733
|
+
]
|
|
734
|
+
};
|
|
735
|
+
case "image":
|
|
736
|
+
return {
|
|
737
|
+
intent: "analyzeFile",
|
|
738
|
+
steps: [
|
|
739
|
+
{ id: "detect_kind", status: "done" },
|
|
740
|
+
{ id: "image_ocr", status: "planned" }
|
|
741
|
+
]
|
|
742
|
+
};
|
|
743
|
+
case "text":
|
|
744
|
+
return {
|
|
745
|
+
intent: "analyzeFile",
|
|
746
|
+
steps: [
|
|
747
|
+
{ id: "detect_kind", status: "done" },
|
|
748
|
+
{ id: "utf8_decode", status: "planned" }
|
|
749
|
+
]
|
|
750
|
+
};
|
|
751
|
+
default:
|
|
752
|
+
return {
|
|
753
|
+
intent: "analyzeFile",
|
|
754
|
+
steps: [
|
|
755
|
+
{ id: "detect_kind", status: "done" },
|
|
756
|
+
{ id: "route", status: "failed" }
|
|
757
|
+
]
|
|
758
|
+
};
|
|
759
|
+
}
|
|
760
|
+
}
|
|
761
|
+
/**
 * Builds the step plan for a public intent and a detected file kind.
 *
 * @param intentOpt Requested intent; `null`/`undefined` is treated as `"analyzeFile"`.
 * @param kind Detected file kind.
 * @param pdfOcrForAnalyze PDF OCR mode, forwarded to `planAnalyzeFile` for the
 *   `"analyzeFile"` intent and for any intent not handled explicitly below.
 * @returns An object `{ intent, steps }` whose first step is always a completed
 *   `detect_kind`.
 */
function planForIntent(intentOpt, kind, pdfOcrForAnalyze) {
  const intent = intentOpt ?? "analyzeFile";

  // Small builders for the literals every branch produces.
  const step = (id, status = "planned") => ({ id, status });
  const planOf = (...tail) => ({
    intent,
    steps: [step("detect_kind", "done"), ...tail]
  });

  switch (intent) {
    case "extractText":
      // Same steps as analyzeFile with the PDF OCR mode fixed to "off"; only the label changes.
      return { ...planAnalyzeFile(kind, "off"), intent: "extractText" };

    case "extractMetadata":
      switch (kind) {
        case "pdf":
          return planOf(step("pdf_metadata"));
        case "text":
          return planOf(step("utf8_decode"));
        default: {
          const hasStub = kind === "docx" || kind === "image";
          return planOf(step("metadata_stub", hasStub ? "planned" : "skipped"));
        }
      }

    case "convertToHtml":
      switch (kind) {
        case "docx":
          return planOf(step("docx_mammoth_html"));
        case "text":
          return planOf(step("utf8_decode"), step("wrap_pre"));
        case "pdf":
          return planOf(step("pdf_text_layer"), step("wrap_pre"));
        default:
          return planOf(step("rich_html", "skipped"));
      }

    case "runOcr":
      switch (kind) {
        case "pdf":
          return planOf(step("pdf_parse"), step("pdf_ocr_forced"));
        case "image":
          return planOf(step("tesseract_ocr"));
        case "docx":
          return planOf(step("docx_structured_extract"));
        default:
          return planOf(step("ocr", "skipped"));
      }

    case "analyzeFile":
    default:
      // Unrecognized intents reuse the analyzeFile plan unchanged.
      return planAnalyzeFile(kind, pdfOcrForAnalyze);
  }
}
|
|
872
|
+
/**
 * Reports the capabilities available for the given input's detected file kind.
 *
 * @param input Input accepted by `resolveNodeAnalyzeInput` (helper defined elsewhere in this file).
 * @param options Optional settings; only `options.signal` is read here.
 * @returns A promise for whatever `buildNodeCapabilityReport` returns for the detected kind.
 */
async function getCapabilities(input, options) {
  // The signal is checked once up front, before the input is resolved.
  throwIfAborted(options?.signal);
  const analyzeInput = await resolveNodeAnalyzeInput(input);
  assertValidAnalyzeFileInput(analyzeInput);
  return buildNodeCapabilityReport(detectFileKind(analyzeInput));
}
|
|
879
|
+
/**
 * Describes the processing plan for an input without running the analysis steps.
 *
 * @param input Input accepted by `resolveNodeAnalyzeInput` (helper defined elsewhere in this file).
 * @param options Optional settings; reads `signal`, `intent` (defaults to
 *   `"analyzeFile"`) and `pdf` (its `ocr` mode defaults to `"auto"`).
 * @returns A promise for whatever `buildNodeExplainReport` returns for the detected
 *   kind, the intent, the PDF OCR mode and the computed plan.
 */
async function explainAnalysisPlan(input, options) {
  // The signal is checked once up front, before the input is resolved.
  throwIfAborted(options?.signal);
  const analyzeInput = await resolveNodeAnalyzeInput(input);
  assertValidAnalyzeFileInput(analyzeInput);

  const detectedKind = detectFileKind(analyzeInput);
  const requestedIntent = options?.intent ?? "analyzeFile";
  const ocrMode = resolvePdfOcrMode(options?.pdf);

  return buildNodeExplainReport(
    detectedKind,
    requestedIntent,
    ocrMode,
    planForIntent(requestedIntent, detectedKind, ocrMode)
  );
}
|
|
889
|
+
|
|
890
|
+
export { analyzeFile, bufferToInput, convertToHtml, explainAnalysisPlan, extractMetadata, extractText, getCapabilities, readFileToInput, resolveNodeAnalyzeInput, runOcr };
|
|
550
891
|
//# sourceMappingURL=index.js.map
|
|
551
892
|
//# sourceMappingURL=index.js.map
|
package/package.json
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@dragon708/docmind-node",
|
|
3
|
-
"version": "1.
|
|
4
|
-
"description": "
|
|
3
|
+
"version": "1.2.0",
|
|
4
|
+
"description": "Official DocMind Node facade: analyzeFile, intent APIs, PDF/DOCX/OCR, and fs helpers.",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"main": "./dist/index.js",
|
|
7
7
|
"module": "./dist/index.js",
|
|
@@ -14,7 +14,8 @@
|
|
|
14
14
|
}
|
|
15
15
|
},
|
|
16
16
|
"files": [
|
|
17
|
-
"dist"
|
|
17
|
+
"dist/**/*.js",
|
|
18
|
+
"dist/**/*.d.ts"
|
|
18
19
|
],
|
|
19
20
|
"publishConfig": {
|
|
20
21
|
"access": "public"
|
|
@@ -34,7 +35,7 @@
|
|
|
34
35
|
"@dragon708/docmind-docx": "^1.0.0",
|
|
35
36
|
"@dragon708/docmind-ocr": "^1.0.0",
|
|
36
37
|
"@dragon708/docmind-pdf": "^1.0.0",
|
|
37
|
-
"@dragon708/docmind-shared": "^1.
|
|
38
|
+
"@dragon708/docmind-shared": "^1.1.0"
|
|
38
39
|
},
|
|
39
40
|
"devDependencies": {
|
|
40
41
|
"@types/node": "^20.19.37",
|
package/dist/index.js.map
DELETED
|
@@ -1 +0,0 @@
|
|
|
1
|
-
{"version":3,"sources":["../src/inputBytes.ts","../src/analyzers/docx.ts","../src/analyzers/image.ts","../src/analyzers/pdf.ts","../src/resolveNodeInput.ts","../src/analyze.ts","../src/publicActions.ts"],"names":["extractDocx","runPdf","assertValidAnalyzeFileInput","detectFileKind","ocr","analyzeText","notImplementedResult","UNKNOWN_FORMAT_WARNING"],"mappings":";;;;;;;;;;AASO,SAAS,kBAAkB,KAAA,EAAoD;AACpF,EAAA,OAAO,YAAA,CAAa,KAAK,CAAA,IAAK,aAAA,CAAc,KAAK,KAAK,MAAA,CAAO,KAAK,CAAA,IAAK,MAAA,CAAO,KAAK,CAAA;AACrF;AAGA,eAAsB,qBAAqB,KAAA,EAAiD;AAC1F,EAAA,IAAI,CAAC,iBAAA,CAAkB,KAAK,CAAA,EAAG;AAC7B,IAAA,OAAO,IAAI,WAAW,CAAC,CAAA;AAAA,EACzB;AACA,EAAA,OAAO,aAAa,KAAK,CAAA;AAC3B;;;ACZA,eAAsB,kBAAA,CACpB,OACA,MAAA,EACyB;AACzB,EAAA,IAAI,QAAQ,OAAA,EAAS;AACnB,IAAA,MAAM,GAAA,GAAM,IAAI,KAAA,CAAM,2BAA2B,CAAA;AACjD,IAAA,GAAA,CAAI,IAAA,GAAO,YAAA;AACX,IAAA,MAAM,GAAA;AAAA,EACR;AAEA,EAAA,MAAM,IAAA,GAAO,MAAM,oBAAA,CAAqB,KAAK,CAAA;AAC7C,EAAA,IAAI,IAAA,CAAK,eAAe,CAAA,EAAG;AACzB,IAAA,OAAO;AAAA,MACL,QAAA,EAAU,MAAA;AAAA,MACV,QAAA,EAAU,MAAA;AAAA,MACV,MAAA,EAAQ,IAAA;AAAA,MACR,IAAA,EAAM,MAAA;AAAA,MACN,IAAA,EAAM,EAAA;AAAA,MACN,IAAA,EAAM,EAAA;AAAA,MACN,QAAA,EAAU,CAAC,+CAA+C;AAAA,KAC5D;AAAA,EACF;AAEA,EAAA,MAAM,CAAA,GAAI,MAAMA,WAAA,CAAY,IAAI,CAAA;AAChC,EAAA,OAAO;AAAA,IACL,QAAA,EAAU,MAAA;AAAA,IACV,QAAA,EAAU,MAAA;AAAA,IACV,MAAA,EAAQ,IAAA;AAAA,IACR,IAAA,EAAM,MAAA;AAAA,IACN,MAAM,CAAA,CAAE,IAAA;AAAA,IACR,MAAM,CAAA,CAAE,IAAA;AAAA,IACR,QAAA,EAAU,CAAC,GAAG,CAAA,CAAE,QAAQ;AAAA,GAC1B;AACF;AChCA,eAAsB,mBAAA,CACpB,OACA,OAAA,EACyB;AACzB,EAAA,IAAI,OAAA,EAAS,QAAQ,OAAA,EAAS;AAC5B,IAAA,MAAM,GAAA,GAAM,IAAI,KAAA,CAAM,2BAA2B,CAAA;AACjD,IAAA,GAAA,CAAI,IAAA,GAAO,YAAA;AACX,IAAA,MAAM,GAAA;AAAA,EACR;AAEA,EAAA,MAAM,IAAA,GAAO,MAAM,oBAAA,CAAqB,KAAK,CAAA;AAC7C,EAAA,IAAI,IAAA,CAAK,eAAe,CAAA,EAAG;AACzB,IAAA,OAAO;AAAA,MACL,QAAA,EAAU,OAAA;AAAA,MACV,QAAA,EAAU,OAAA;AAAA,MACV,MAAA,EAAQ,IAAA;AAAA,MACR,IAAA,EAAM,OAAA;AAAA,MACN,IAAA,EAAM,EAAA;AAAA,MACN,UAAA,EAAY,CAAA;AAAA,MACZ,OAAA,EAAS,IAAA;AAAA,MACT,QAAA,EAAU,CAAC,4CAA4C;AAAA,KACzD;AAAA,EAC
F;AAEA,EAAA,MAAM,OAAA,GAAU;AAAA,IACd,GAAI,OAAA,EAAS,GAAA,IAAO,EAAC;AAAA,IACrB,MAAA,EAAQ,OAAA,EAAS,GAAA,EAAK,MAAA,IAAU,OAAA,EAAS;AAAA,GAC3C;AAEA,EAAA,MAAM,CAAA,GAAI,MAAM,GAAA,CAAI,IAAA,EAAM,OAAO,CAAA;AACjC,EAAA,OAAO;AAAA,IACL,QAAA,EAAU,OAAA;AAAA,IACV,QAAA,EAAU,OAAA;AAAA,IACV,MAAA,EAAQ,IAAA;AAAA,IACR,IAAA,EAAM,OAAA;AAAA,IACN,MAAM,CAAA,CAAE,IAAA;AAAA,IACR,YAAY,CAAA,CAAE,UAAA;AAAA,IACd,SAAS,CAAA,CAAE,OAAA;AAAA,IACX,UAAU;AAAC,GACb;AACF;AClCA,eAAsB,iBAAA,CACpB,OACA,OAAA,EACyB;AACzB,EAAA,IAAI,OAAA,EAAS,QAAQ,OAAA,EAAS;AAC5B,IAAA,MAAM,GAAA,GAAM,IAAI,KAAA,CAAM,2BAA2B,CAAA;AACjD,IAAA,GAAA,CAAI,IAAA,GAAO,YAAA;AACX,IAAA,MAAM,GAAA;AAAA,EACR;AAEA,EAAA,MAAM,IAAA,GAAO,MAAM,oBAAA,CAAqB,KAAK,CAAA;AAC7C,EAAA,IAAI,IAAA,CAAK,eAAe,CAAA,EAAG;AACzB,IAAA,OAAO;AAAA,MACL,QAAA,EAAU,KAAA;AAAA,MACV,QAAA,EAAU,KAAA;AAAA,MACV,MAAA,EAAQ,IAAA;AAAA,MACR,IAAA,EAAM,KAAA;AAAA,MACN,IAAA,EAAM,EAAA;AAAA,MACN,KAAA,EAAO,CAAA;AAAA,MACP,QAAA,EAAU,EAAE,IAAA,EAAM,EAAC,EAAE;AAAA,MACrB,QAAA,EAAU,CAAC,+CAA+C,CAAA;AAAA,MAC1D,QAAA,EAAU,KAAA;AAAA,MACV,OAAA,EAAS;AAAA,KACX;AAAA,EACF;AAEA,EAAA,MAAM,UAAU,OAAA,EAAS,GAAA;AACzB,EAAA,MAAM,OAAA,GAA6B;AAAA,IACjC,GAAG,OAAA;AAAA,IACH,GAAA,EAAK,SAAS,GAAA,IAAO,MAAA;AAAA,IACrB,QAAA,EAAU,OAAA,EAAS,QAAA,IAAY,OAAA,EAAS,GAAA,EAAK,KAAA;AAAA,IAC7C,MAAA,EAAQ,OAAA,EAAS,MAAA,IAAU,OAAA,EAAS;AAAA,GACtC;AAEA,EAAA,MAAM,CAAA,GAAI,MAAMC,UAAA,CAAO,IAAA,EAAM,OAAO,CAAA;AACpC,EAAA,OAAO;AAAA,IACL,QAAA,EAAU,KAAA;AAAA,IACV,QAAA,EAAU,KAAA;AAAA,IACV,MAAA,EAAQ,IAAA;AAAA,IACR,IAAA,EAAM,KAAA;AAAA,IACN,MAAM,CAAA,CAAE,IAAA;AAAA,IACR,OAAO,CAAA,CAAE,KAAA;AAAA,IACT,UAAU,CAAA,CAAE,QAAA;AAAA,IACZ,QAAA,EAAU,CAAC,GAAG,CAAA,CAAE,QAAQ,CAAA;AAAA,IACxB,UAAU,CAAA,CAAE,QAAA;AAAA,IACZ,SAAS,CAAA,CAAE;AAAA,GACb;AACF;AClDA,SAAS,aAAa,SAAA,EAAiC;AACrD,EAAA,OAAO,SAAA,YAAqB,GAAA,GAAM,aAAA,CAAc,SAAS,CAAA,GAAI,SAAA;AAC/D;AAKA,eAAsB,gBAAgB,IAAA,EAAiD;AACrF,EAAA,MAAM,MAAA,GAAS,aAAa,IAAI,CAAA;AAChC,EAAA,MAAM,IAAA,GAAO,MAAM,QAAA,CAAS,MAAM,CAAA;AAClC,EAAA,OAAO;AAAA,IACL,IAAA;AAAA,IACA,IAAA,EAAM,SAAS,MAAM;AAAA,GACvB;AACF;AAGO,SAAS,aAAA,CAAc,
QAAgB,IAAA,EAAmC;AAC/E,EAAA,OAAO,IAAA,KAAS,SAAY,EAAE,IAAA,EAAM,QAAQ,IAAA,EAAK,GAAI,EAAE,IAAA,EAAM,MAAA,EAAO;AACtE;AAKA,eAAsB,wBAAwB,KAAA,EAAuD;AACnG,EAAA,IAAI,OAAO,KAAA,KAAU,QAAA,IAAY,KAAA,YAAiB,GAAA,EAAK;AACrD,IAAA,OAAO,gBAAgB,KAAK,CAAA;AAAA,EAC9B;AACA,EAAA,OAAO,KAAA;AACT;;;ACvBA,eAAsB,WAAA,CACpB,OACA,OAAA,EACyB;AACzB,EAAA,IAAI,OAAA,EAAS,QAAQ,OAAA,EAAS;AAC5B,IAAA,MAAM,GAAA,GAAM,IAAI,KAAA,CAAM,2BAA2B,CAAA;AACjD,IAAA,GAAA,CAAI,IAAA,GAAO,YAAA;AACX,IAAA,MAAM,GAAA;AAAA,EACR;AAEA,EAAA,MAAM,QAAA,GAAW,MAAM,uBAAA,CAAwB,KAAK,CAAA;AACpD,EAAA,2BAAA,CAA4B,QAAQ,CAAA;AAEpC,EAAA,MAAM,QAAA,GAAW,eAAe,QAAQ,CAAA;AAExC,EAAA,QAAQ,QAAA;AAAU,IAChB,KAAK,KAAA;AACH,MAAA,OAAO,iBAAA,CAAkB,UAAiC,OAAO,CAAA;AAAA,IACnE,KAAK,MAAA;AACH,MAAA,OAAO,kBAAA,CAAmB,QAAA,EAAiC,OAAA,EAAS,MAAM,CAAA;AAAA,IAC5E,KAAK,OAAA;AACH,MAAA,OAAO,mBAAA,CAAoB,UAAiC,OAAO,CAAA;AAAA,IACrE,KAAK,MAAA;AACH,MAAA,OAAO,YAAY,QAAA,EAAiC,EAAE,MAAA,EAAQ,OAAA,EAAS,QAAQ,CAAA;AAAA,IACjF;AACE,MAAA,OAAO,oBAAA,CAAqB,QAAA,EAAU,MAAA,EAAQ,CAAC,sBAAsB,CAAC,CAAA;AAAA;AAE5E;ACrBA,SAAS,eAAe,MAAA,EAA4B;AAClD,EAAA,IAAI,QAAQ,OAAA,EAAS;AACnB,IAAA,MAAM,GAAA,GAAM,IAAI,KAAA,CAAM,2BAA2B,CAAA;AACjD,IAAA,GAAA,CAAI,IAAA,GAAO,YAAA;AACX,IAAA,MAAM,GAAA;AAAA,EACR;AACF;AAEA,eAAe,QAAQ,KAAA,EAAuD;AAC5E,EAAA,MAAM,QAAA,GAAW,MAAM,uBAAA,CAAwB,KAAK,CAAA;AACpD,EAAAC,4BAA4B,QAAQ,CAAA;AACpC,EAAA,OAAO,QAAA;AACT;AAEA,SAAS,kBAAkB,CAAA,EAAmB;AAC5C,EAAA,OAAO,CAAA,CACJ,OAAA,CAAQ,IAAA,EAAM,OAAO,EACrB,OAAA,CAAQ,IAAA,EAAM,MAAM,CAAA,CACpB,QAAQ,IAAA,EAAM,MAAM,CAAA,CACpB,OAAA,CAAQ,MAAM,QAAQ,CAAA;AAC3B;AAEA,IAAM,kBAAA,GACJ,yGAAA;AAEF,IAAM,mBAAA,GACJ,6DAAA;AAMF,eAAsB,WAAA,CACpB,OACA,OAAA,EACyB;AACzB,EAAA,cAAA,CAAe,SAAS,MAAM,CAAA;AAC9B,EAAA,MAAM,QAAA,GAAW,MAAM,OAAA,CAAQ,KAAK,CAAA;AACpC,EAAA,MAAM,IAAA,GAAOC,eAAe,QAAQ,CAAA;AACpC,EAAA,MAAM,SAAS,OAAA,EAAS,MAAA;AAExB,EAAA,QAAQ,IAAA;AAAM,IACZ,KAAK,KAAA,EAAO;AACV,MAAA,MAAM,IAAA,GAAO,MAAM,oBAAA,CAAqB,QAAQ,CAAA;AAChD,MAAA,IAAI,IAAA,CAAK,eAAe,CAAA,EAAG;AACzB,QAAA,OAAO;AAAA,UACL,QAAA,EAAU,KAAA;AAAA,UACV,QAAA,EAAU,KAAA;AAAA,UACV,MAAA,EAAQ,IAAA;AAAA,UA
CR,IAAA,EAAM,KAAA;AAAA,UACN,IAAA,EAAM,EAAA;AAAA,UACN,KAAA,EAAO,CAAA;AAAA,UACP,QAAA,EAAU,EAAE,IAAA,EAAM,EAAC,EAAE;AAAA,UACrB,QAAA,EAAU,CAAC,+CAA+C,CAAA;AAAA,UAC1D,QAAA,EAAU,KAAA;AAAA,UACV,OAAA,EAAS;AAAA,SACX;AAAA,MACF;AACA,MAAA,MAAM,CAAA,GAAI,MAAM,kBAAA,CAAmB,IAAI,CAAA;AACvC,MAAA,OAAO;AAAA,QACL,QAAA,EAAU,KAAA;AAAA,QACV,QAAA,EAAU,KAAA;AAAA,QACV,MAAA,EAAQ,IAAA;AAAA,QACR,IAAA,EAAM,KAAA;AAAA,QACN,MAAM,CAAA,CAAE,IAAA;AAAA,QACR,OAAO,CAAA,CAAE,KAAA;AAAA,QACT,QAAA,EAAU,EAAE,IAAA,EAAM,EAAC,EAAE;AAAA,QACrB,UAAU,CAAA,CAAE,QAAA;AAAA,QACZ,QAAA,EAAU,KAAA;AAAA,QACV,OAAA,EAAS;AAAA,OACX;AAAA,IACF;AAAA,IACA,KAAK,MAAA,EAAQ;AACX,MAAA,MAAM,IAAA,GAAO,MAAM,oBAAA,CAAqB,QAAQ,CAAA;AAChD,MAAA,IAAI,IAAA,CAAK,eAAe,CAAA,EAAG;AACzB,QAAA,OAAO;AAAA,UACL,QAAA,EAAU,MAAA;AAAA,UACV,QAAA,EAAU,MAAA;AAAA,UACV,MAAA,EAAQ,IAAA;AAAA,UACR,IAAA,EAAM,MAAA;AAAA,UACN,IAAA,EAAM,EAAA;AAAA,UACN,IAAA,EAAM,EAAA;AAAA,UACN,QAAA,EAAU,CAAC,+CAA+C;AAAA,SAC5D;AAAA,MACF;AACA,MAAA,MAAM,CAAA,GAAI,MAAM,mBAAA,CAAoB,IAAI,CAAA;AACxC,MAAA,OAAO;AAAA,QACL,QAAA,EAAU,MAAA;AAAA,QACV,QAAA,EAAU,MAAA;AAAA,QACV,MAAA,EAAQ,IAAA;AAAA,QACR,IAAA,EAAM,MAAA;AAAA,QACN,MAAM,CAAA,CAAE,IAAA;AAAA,QACR,IAAA,EAAM,EAAA;AAAA,QACN,UAAU,CAAA,CAAE;AAAA,OACd;AAAA,IACF;AAAA,IACA,KAAK,OAAA,EAAS;AACZ,MAAA,MAAM,IAAA,GAAO,MAAM,oBAAA,CAAqB,QAAQ,CAAA;AAChD,MAAA,IAAI,IAAA,CAAK,eAAe,CAAA,EAAG;AACzB,QAAA,OAAO;AAAA,UACL,QAAA,EAAU,OAAA;AAAA,UACV,QAAA,EAAU,OAAA;AAAA,UACV,MAAA,EAAQ,IAAA;AAAA,UACR,IAAA,EAAM,OAAA;AAAA,UACN,IAAA,EAAM,EAAA;AAAA,UACN,UAAA,EAAY,CAAA;AAAA,UACZ,OAAA,EAAS,IAAA;AAAA,UACT,QAAA,EAAU,CAAC,4CAA4C;AAAA,SACzD;AAAA,MACF;AACA,MAAA,MAAM,OAAA,GAAU;AAAA,QACd,GAAI,OAAA,EAAS,GAAA,IAAO,EAAC;AAAA,QACrB,MAAA,EAAQ,OAAA,EAAS,GAAA,EAAK,MAAA,IAAU;AAAA,OAClC;AACA,MAAA,MAAM,CAAA,GAAI,MAAMC,GAAAA,CAAI,IAAA,EAAM,OAAO,CAAA;AACjC,MAAA,OAAO;AAAA,QACL,QAAA,EAAU,OAAA;AAAA,QACV,QAAA,EAAU,OAAA;AAAA,QACV,MAAA,EAAQ,IAAA;AAAA,QACR,IAAA,EAAM,OAAA;AAAA,QACN,MAAM,CAAA,CAAE,IAAA;AAAA,QACR,YAAY,CAAA,CAAE,UAAA;AAAA,QACd,SAAS,CAAA,CAAE,OAAA;AAAA,QACX,UAAU;AAAC,OACb;AAAA,IACF;AAAA,IACA,KAAK,MA
AA;AACH,MAAA,OAAOC,WAAAA,CAAY,QAAA,EAAiC,EAAE,MAAA,EAAQ,CAAA;AAAA,IAChE;AACE,MAAA,OAAOC,oBAAAA,CAAqB,IAAA,EAAM,MAAA,EAAQ,CAACC,sBAAsB,CAAC,CAAA;AAAA;AAExE;AAMA,eAAsB,eAAA,CACpB,OACA,OAAA,EACyB;AACzB,EAAA,cAAA,CAAe,SAAS,MAAM,CAAA;AAC9B,EAAA,MAAM,QAAA,GAAW,MAAM,OAAA,CAAQ,KAAK,CAAA;AACpC,EAAA,MAAM,IAAA,GAAOJ,eAAe,QAAQ,CAAA;AACpC,EAAA,MAAM,SAAS,OAAA,EAAS,MAAA;AAExB,EAAA,QAAQ,IAAA;AAAM,IACZ,KAAK,KAAA,EAAO;AACV,MAAA,MAAM,IAAA,GAAO,MAAM,oBAAA,CAAqB,QAAQ,CAAA;AAChD,MAAA,IAAI,IAAA,CAAK,eAAe,CAAA,EAAG;AACzB,QAAA,OAAO;AAAA,UACL,QAAA,EAAU,KAAA;AAAA,UACV,QAAA,EAAU,KAAA;AAAA,UACV,MAAA,EAAQ,IAAA;AAAA,UACR,IAAA,EAAM,KAAA;AAAA,UACN,IAAA,EAAM,EAAA;AAAA,UACN,KAAA,EAAO,CAAA;AAAA,UACP,QAAA,EAAU,EAAE,IAAA,EAAM,EAAC,EAAE;AAAA,UACrB,QAAA,EAAU,CAAC,+CAA+C,CAAA;AAAA,UAC1D,QAAA,EAAU,KAAA;AAAA,UACV,OAAA,EAAS;AAAA,SACX;AAAA,MACF;AACA,MAAA,MAAM,CAAA,GAAI,MAAM,kBAAA,CAAmB,IAAI,CAAA;AACvC,MAAA,OAAO;AAAA,QACL,QAAA,EAAU,KAAA;AAAA,QACV,QAAA,EAAU,KAAA;AAAA,QACV,MAAA,EAAQ,IAAA;AAAA,QACR,IAAA,EAAM,KAAA;AAAA,QACN,IAAA,EAAM,EAAA;AAAA,QACN,KAAA,EAAO,CAAA;AAAA,QACP,UAAU,CAAA,CAAE,QAAA;AAAA,QACZ,UAAU,CAAA,CAAE,QAAA;AAAA,QACZ,QAAA,EAAU,KAAA;AAAA,QACV,OAAA,EAAS;AAAA,OACX;AAAA,IACF;AAAA,IACA,KAAK,MAAA;AACH,MAAA,OAAO;AAAA,QACL,QAAA,EAAU,MAAA;AAAA,QACV,QAAA,EAAU,MAAA;AAAA,QACV,MAAA,EAAQ,IAAA;AAAA,QACR,IAAA,EAAM,MAAA;AAAA,QACN,IAAA,EAAM,EAAA;AAAA,QACN,IAAA,EAAM,EAAA;AAAA,QACN,QAAA,EAAU,CAAC,kBAAkB;AAAA,OAC/B;AAAA,IACF,KAAK,OAAA;AACH,MAAA,OAAO;AAAA,QACL,QAAA,EAAU,OAAA;AAAA,QACV,QAAA,EAAU,OAAA;AAAA,QACV,MAAA,EAAQ,IAAA;AAAA,QACR,IAAA,EAAM,OAAA;AAAA,QACN,IAAA,EAAM,EAAA;AAAA,QACN,UAAA,EAAY,CAAA;AAAA,QACZ,OAAA,EAAS,IAAA;AAAA,QACT,QAAA,EAAU,CAAC,mBAAmB;AAAA,OAChC;AAAA,IACF,KAAK,MAAA;AACH,MAAA,OAAOE,WAAAA,CAAY,QAAA,EAAiC,EAAE,MAAA,EAAQ,CAAA;AAAA,IAChE;AACE,MAAA,OAAOC,oBAAAA,CAAqB,IAAA,EAAM,MAAA,EAAQ,CAACC,sBAAsB,CAAC,CAAA;AAAA;AAExE;AAMA,eAAsB,aAAA,CACpB,OACA,OAAA,EACyB;AACzB,EAAA,cAAA,CAAe,SAAS,MAAM,CAAA;AAC9B,EAAA,MAAM,QAAA,GAAW,MAAM,OAAA,CAAQ,KAAK,CAAA;AACpC,EAAA,MAAM,IAAA,GAAOJ,eAAe,QAAQ,CAAA;AACpC,EAAA,MAAM,SAAS,OAAA
,EAAS,MAAA;AAExB,EAAA,QAAQ,IAAA;AAAM,IACZ,KAAK,MAAA,EAAQ;AACX,MAAA,MAAM,IAAA,GAAO,MAAM,oBAAA,CAAqB,QAAQ,CAAA;AAChD,MAAA,IAAI,IAAA,CAAK,eAAe,CAAA,EAAG;AACzB,QAAA,OAAO;AAAA,UACL,QAAA,EAAU,MAAA;AAAA,UACV,QAAA,EAAU,MAAA;AAAA,UACV,MAAA,EAAQ,IAAA;AAAA,UACR,IAAA,EAAM,MAAA;AAAA,UACN,IAAA,EAAM,EAAA;AAAA,UACN,IAAA,EAAM,EAAA;AAAA,UACN,QAAA,EAAU,CAAC,+CAA+C;AAAA,SAC5D;AAAA,MACF;AACA,MAAA,MAAM,CAAC,QAAA,EAAU,QAAQ,CAAA,GAAI,MAAM,QAAQ,GAAA,CAAI;AAAA,QAC7C,oBAAoB,IAAI,CAAA;AAAA,QACxB,kBAAkB,IAAI;AAAA,OACvB,CAAA;AACD,MAAA,OAAO;AAAA,QACL,QAAA,EAAU,MAAA;AAAA,QACV,QAAA,EAAU,MAAA;AAAA,QACV,MAAA,EAAQ,IAAA;AAAA,QACR,IAAA,EAAM,MAAA;AAAA,QACN,MAAM,QAAA,CAAS,IAAA;AAAA,QACf,MAAM,QAAA,CAAS,IAAA;AAAA,QACf,UAAU,CAAC,GAAG,SAAS,QAAA,EAAU,GAAG,SAAS,QAAQ;AAAA,OACvD;AAAA,IACF;AAAA,IACA,KAAK,KAAA,EAAO;AACV,MAAA,MAAM,IAAA,GAAO,MAAM,oBAAA,CAAqB,QAAQ,CAAA;AAChD,MAAA,IAAI,IAAA,CAAK,eAAe,CAAA,EAAG;AACzB,QAAA,OAAO;AAAA,UACL,QAAA,EAAU,KAAA;AAAA,UACV,QAAA,EAAU,KAAA;AAAA,UACV,MAAA,EAAQ,IAAA;AAAA,UACR,IAAA,EAAM,KAAA;AAAA,UACN,IAAA,EAAM,EAAA;AAAA,UACN,KAAA,EAAO,CAAA;AAAA,UACP,QAAA,EAAU,EAAE,IAAA,EAAM,EAAC,EAAE;AAAA,UACrB,QAAA,EAAU,CAAC,+CAA+C,CAAA;AAAA,UAC1D,QAAA,EAAU,KAAA;AAAA,UACV,OAAA,EAAS;AAAA,SACX;AAAA,MACF;AACA,MAAA,MAAM,CAAA,GAAI,MAAM,kBAAA,CAAmB,IAAI,CAAA;AACvC,MAAA,MAAM,IAAA,GAAO,CAAA,KAAA,EAAQ,iBAAA,CAAkB,CAAA,CAAE,IAAI,CAAC,CAAA,MAAA,CAAA;AAC9C,MAAA,OAAO;AAAA,QACL,QAAA,EAAU,KAAA;AAAA,QACV,QAAA,EAAU,KAAA;AAAA,QACV,MAAA,EAAQ,IAAA;AAAA,QACR,IAAA,EAAM,KAAA;AAAA,QACN,MAAM,CAAA,CAAE,IAAA;AAAA,QACR,OAAO,CAAA,CAAE,KAAA;AAAA,QACT,QAAA,EAAU,EAAE,IAAA,EAAM,EAAC,EAAE;AAAA,QACrB,IAAA;AAAA,QACA,QAAA,EAAU;AAAA,UACR,GAAG,CAAA,CAAE,QAAA;AAAA,UACL;AAAA,SACF;AAAA,QACA,QAAA,EAAU,KAAA;AAAA,QACV,OAAA,EAAS;AAAA,OACX;AAAA,IACF;AAAA,IACA,KAAK,MAAA,EAAQ;AACX,MAAA,MAAM,IAAI,MAAME,WAAAA,CAAY,QAAA,EAAiC,EAAE,QAAQ,CAAA;AACvE,MAAA,MAAM,IAAA,GAAO,CAAA,KAAA,EAAQ,iBAAA,CAAkB,CAAA,CAAE,IAAI,CAAC,CAAA,MAAA,CAAA;AAC9C,MAAA,OAAO;AAAA,QACL,GAAG,CAAA;AAAA,QACH,IAAA;AAAA,QACA,QAAA,EAAU;AAAA,UACR,GAAG,CAAA,CAAE,QAAA;AAAA,UACL;AAAA;AACF,OACF;A
AAA,IACF;AAAA,IACA,KAAK,OAAA;AACH,MAAA,OAAO;AAAA,QACL,QAAA,EAAU,OAAA;AAAA,QACV,QAAA,EAAU,OAAA;AAAA,QACV,MAAA,EAAQ,IAAA;AAAA,QACR,IAAA,EAAM,OAAA;AAAA,QACN,IAAA,EAAM,EAAA;AAAA,QACN,UAAA,EAAY,CAAA;AAAA,QACZ,OAAA,EAAS,IAAA;AAAA,QACT,QAAA,EAAU,CAAC,qEAAqE;AAAA,OAClF;AAAA,IACF;AACE,MAAA,OAAOC,oBAAAA,CAAqB,IAAA,EAAM,MAAA,EAAQ,CAACC,sBAAsB,CAAC,CAAA;AAAA;AAExE;AAMA,eAAsB,MAAA,CACpB,OACA,OAAA,EACyB;AACzB,EAAA,cAAA,CAAe,SAAS,MAAM,CAAA;AAC9B,EAAA,MAAM,QAAA,GAAW,MAAM,OAAA,CAAQ,KAAK,CAAA;AACpC,EAAA,MAAM,IAAA,GAAOJ,eAAe,QAAQ,CAAA;AACpC,EAAA,MAAM,SAAS,OAAA,EAAS,MAAA;AACxB,EAAA,MAAM,IAAA,GAAO,OAAA,EAAS,GAAA,EAAK,KAAA,IAAS,SAAS,GAAA,EAAK,QAAA;AAElD,EAAA,QAAQ,IAAA;AAAM,IACZ,KAAK,KAAA,EAAO;AACV,MAAA,MAAM,IAAA,GAAO,MAAM,oBAAA,CAAqB,QAAQ,CAAA;AAChD,MAAA,IAAI,IAAA,CAAK,eAAe,CAAA,EAAG;AACzB,QAAA,OAAO;AAAA,UACL,QAAA,EAAU,KAAA;AAAA,UACV,QAAA,EAAU,KAAA;AAAA,UACV,MAAA,EAAQ,IAAA;AAAA,UACR,IAAA,EAAM,KAAA;AAAA,UACN,IAAA,EAAM,EAAA;AAAA,UACN,KAAA,EAAO,CAAA;AAAA,UACP,QAAA,EAAU,EAAE,IAAA,EAAM,EAAC,EAAE;AAAA,UACrB,QAAA,EAAU,CAAC,+CAA+C,CAAA;AAAA,UAC1D,QAAA,EAAU,KAAA;AAAA,UACV,OAAA,EAAS;AAAA,SACX;AAAA,MACF;AACA,MAAA,MAAM,CAAA,GAAI,MAAM,UAAA,CAAW,IAAA,EAAM;AAAA,QAC/B,GAAG,OAAA,EAAS,GAAA;AAAA,QACZ,GAAA,EAAK,OAAA;AAAA,QACL,QAAA,EAAU,IAAA,IAAQ,OAAA,EAAS,GAAA,EAAK,QAAA;AAAA,QAChC,MAAA,EAAQ,OAAA,EAAS,GAAA,EAAK,MAAA,IAAU;AAAA,OACjC,CAAA;AACD,MAAA,OAAO;AAAA,QACL,QAAA,EAAU,KAAA;AAAA,QACV,QAAA,EAAU,KAAA;AAAA,QACV,MAAA,EAAQ,IAAA;AAAA,QACR,IAAA,EAAM,KAAA;AAAA,QACN,MAAM,CAAA,CAAE,IAAA;AAAA,QACR,OAAO,CAAA,CAAE,KAAA;AAAA,QACT,UAAU,CAAA,CAAE,QAAA;AAAA,QACZ,QAAA,EAAU,CAAC,GAAG,CAAA,CAAE,QAAQ,CAAA;AAAA,QACxB,UAAU,CAAA,CAAE,QAAA;AAAA,QACZ,SAAS,CAAA,CAAE;AAAA,OACb;AAAA,IACF;AAAA,IACA,KAAK,OAAA,EAAS;AACZ,MAAA,MAAM,IAAA,GAAO,MAAM,oBAAA,CAAqB,QAAQ,CAAA;AAChD,MAAA,IAAI,IAAA,CAAK,eAAe,CAAA,EAAG;AACzB,QAAA,OAAO;AAAA,UACL,QAAA,EAAU,OAAA;AAAA,UACV,QAAA,EAAU,OAAA;AAAA,UACV,MAAA,EAAQ,IAAA;AAAA,UACR,IAAA,EAAM,OAAA;AAAA,UACN,IAAA,EAAM,EAAA;AAAA,UACN,UAAA,EAAY,CAAA;AAAA,UACZ,OAAA,EAAS,IAAA;AAAA,UACT,QAAA,EAAU,CAAC,4CAA4C;AAAA,SACzD;A
AAA,MACF;AACA,MAAA,MAAM,OAAA,GAAU;AAAA,QACd,GAAI,OAAA,EAAS,GAAA,IAAO,EAAC;AAAA,QACrB,MAAA,EAAQ,OAAA,EAAS,GAAA,EAAK,MAAA,IAAU;AAAA,OAClC;AACA,MAAA,MAAM,CAAA,GAAI,MAAMC,GAAAA,CAAI,IAAA,EAAM,OAAO,CAAA;AACjC,MAAA,OAAO;AAAA,QACL,QAAA,EAAU,OAAA;AAAA,QACV,QAAA,EAAU,OAAA;AAAA,QACV,MAAA,EAAQ,IAAA;AAAA,QACR,IAAA,EAAM,OAAA;AAAA,QACN,MAAM,CAAA,CAAE,IAAA;AAAA,QACR,YAAY,CAAA,CAAE,UAAA;AAAA,QACd,SAAS,CAAA,CAAE,OAAA;AAAA,QACX,UAAU;AAAC,OACb;AAAA,IACF;AAAA,IACA,KAAK,MAAA,EAAQ;AACX,MAAA,MAAM,IAAA,GAAO,MAAM,oBAAA,CAAqB,QAAQ,CAAA;AAChD,MAAA,IAAI,IAAA,CAAK,eAAe,CAAA,EAAG;AACzB,QAAA,OAAO;AAAA,UACL,QAAA,EAAU,MAAA;AAAA,UACV,QAAA,EAAU,MAAA;AAAA,UACV,MAAA,EAAQ,IAAA;AAAA,UACR,IAAA,EAAM,MAAA;AAAA,UACN,IAAA,EAAM,EAAA;AAAA,UACN,IAAA,EAAM,EAAA;AAAA,UACN,QAAA,EAAU,CAAC,+CAA+C;AAAA,SAC5D;AAAA,MACF;AACA,MAAA,MAAM,CAAA,GAAI,MAAM,WAAA,CAAY,IAAI,CAAA;AAChC,MAAA,OAAO;AAAA,QACL,QAAA,EAAU,MAAA;AAAA,QACV,QAAA,EAAU,MAAA;AAAA,QACV,MAAA,EAAQ,IAAA;AAAA,QACR,IAAA,EAAM,MAAA;AAAA,QACN,MAAM,CAAA,CAAE,IAAA;AAAA,QACR,MAAM,CAAA,CAAE,IAAA;AAAA,QACR,QAAA,EAAU;AAAA,UACR,GAAG,CAAA,CAAE,QAAA;AAAA,UACL;AAAA;AACF,OACF;AAAA,IACF;AAAA,IACA,KAAK,MAAA;AACH,MAAA,OAAOC,WAAAA,CAAY,QAAA,EAAiC,EAAE,MAAA,EAAQ,CAAA;AAAA,IAChE;AACE,MAAA,OAAOC,oBAAAA,CAAqB,IAAA,EAAM,MAAA,EAAQ,CAACC,sBAAsB,CAAC,CAAA;AAAA;AAExE","file":"index.js","sourcesContent":["import type { DetectFileKindInput, FileLikeInput } from \"@dragon708/docmind-shared\";\nimport {\n isBinaryInput,\n isBlob,\n isFile,\n isNamedInput,\n toUint8Array,\n} from \"@dragon708/docmind-shared\";\n\nexport function isByteBackedInput(input: DetectFileKindInput): input is FileLikeInput {\n return isNamedInput(input) || isBinaryInput(input) || isBlob(input) || isFile(input);\n}\n\n/** Resolves bytes when the input carries a body; otherwise an empty `Uint8Array`. 
*/\nexport async function bytesFromDetectInput(input: DetectFileKindInput): Promise<Uint8Array> {\n if (!isByteBackedInput(input)) {\n return new Uint8Array(0);\n }\n return toUint8Array(input);\n}\n","import { analyzeDocx as extractDocx } from \"@dragon708/docmind-docx\";\nimport type { AnalysisResult, DetectFileKindInput } from \"@dragon708/docmind-shared\";\nimport { bytesFromDetectInput } from \"../inputBytes.js\";\n\n/**\n * DOCX → `@dragon708/docmind-docx`.\n */\nexport async function analyzeDocxForNode(\n input: DetectFileKindInput,\n signal?: AbortSignal,\n): Promise<AnalysisResult> {\n if (signal?.aborted) {\n const err = new Error(\"The operation was aborted\");\n err.name = \"AbortError\";\n throw err;\n }\n\n const data = await bytesFromDetectInput(input);\n if (data.byteLength === 0) {\n return {\n fileKind: \"docx\",\n analyzer: \"docx\",\n status: \"ok\",\n kind: \"docx\",\n text: \"\",\n html: \"\",\n warnings: [\"No document bytes were provided for analysis.\"],\n };\n }\n\n const r = await extractDocx(data);\n return {\n fileKind: \"docx\",\n analyzer: \"docx\",\n status: \"ok\",\n kind: \"docx\",\n text: r.text,\n html: r.html,\n warnings: [...r.warnings],\n };\n}\n","import { ocr } from \"@dragon708/docmind-ocr\";\nimport type { AnalysisResult, DetectFileKindInput } from \"@dragon708/docmind-shared\";\nimport type { NodeAnalyzeOptions } from \"../nodeAnalyzeOptions.js\";\nimport { bytesFromDetectInput } from \"../inputBytes.js\";\n\n/**\n * Image → `@dragon708/docmind-ocr`.\n */\nexport async function analyzeImageForNode(\n input: DetectFileKindInput,\n options?: NodeAnalyzeOptions,\n): Promise<AnalysisResult> {\n if (options?.signal?.aborted) {\n const err = new Error(\"The operation was aborted\");\n err.name = \"AbortError\";\n throw err;\n }\n\n const data = await bytesFromDetectInput(input);\n if (data.byteLength === 0) {\n return {\n fileKind: \"image\",\n analyzer: \"image\",\n status: \"ok\",\n kind: \"image\",\n text: \"\",\n 
confidence: 0,\n ocrUsed: true,\n warnings: [\"No image bytes were provided for analysis.\"],\n };\n }\n\n const ocrOpts = {\n ...(options?.ocr ?? {}),\n signal: options?.ocr?.signal ?? options?.signal,\n };\n\n const r = await ocr(data, ocrOpts);\n return {\n fileKind: \"image\",\n analyzer: \"image\",\n status: \"ok\",\n kind: \"image\",\n text: r.text,\n confidence: r.confidence,\n ocrUsed: r.ocrUsed,\n warnings: [],\n };\n}\n","import { analyzePdf as runPdf } from \"@dragon708/docmind-pdf\";\nimport type { PdfAnalyzeOptions } from \"@dragon708/docmind-pdf\";\nimport type { AnalysisResult, DetectFileKindInput } from \"@dragon708/docmind-shared\";\nimport type { NodeAnalyzeOptions } from \"../nodeAnalyzeOptions.js\";\nimport { bytesFromDetectInput } from \"../inputBytes.js\";\n\n/**\n * PDF → `@dragon708/docmind-pdf` (Node / pdf-parse + OCR).\n *\n * Unlike `analyzePdf` from `@dragon708/docmind-pdf` (OCR off unless set), `analyzeFile` defaults\n * to `pdf.ocr: \"auto\"`: when the PDF has\n * pages but almost no extractable text (typical scan), the raster OCR pipeline runs. Pass\n * `pdf: { ocr: \"off\" }` to skip OCR for speed.\n */\nexport async function analyzePdfForNode(\n input: DetectFileKindInput,\n options?: NodeAnalyzeOptions,\n): Promise<AnalysisResult> {\n if (options?.signal?.aborted) {\n const err = new Error(\"The operation was aborted\");\n err.name = \"AbortError\";\n throw err;\n }\n\n const data = await bytesFromDetectInput(input);\n if (data.byteLength === 0) {\n return {\n fileKind: \"pdf\",\n analyzer: \"pdf\",\n status: \"ok\",\n kind: \"pdf\",\n text: \"\",\n pages: 0,\n metadata: { info: {} },\n warnings: [\"No document bytes were provided for analysis.\"],\n needsOCR: false,\n ocrUsed: false,\n };\n }\n\n const userPdf = options?.pdf;\n const pdfOpts: PdfAnalyzeOptions = {\n ...userPdf,\n ocr: userPdf?.ocr ?? \"auto\",\n ocrLangs: userPdf?.ocrLangs ?? options?.ocr?.langs,\n signal: userPdf?.signal ?? 
options?.signal,\n };\n\n const r = await runPdf(data, pdfOpts);\n return {\n fileKind: \"pdf\",\n analyzer: \"pdf\",\n status: \"ok\",\n kind: \"pdf\",\n text: r.text,\n pages: r.pages,\n metadata: r.metadata,\n warnings: [...r.warnings],\n needsOCR: r.needsOCR,\n ocrUsed: r.ocrUsed,\n };\n}\n","import type { DetectFileKindInput, NamedInput } from \"@dragon708/docmind-shared\";\nimport { readFile } from \"node:fs/promises\";\nimport { basename } from \"node:path\";\nimport { fileURLToPath } from \"node:url\";\n\n/**\n * Inputs accepted by {@link analyzeFile} in this package.\n * Paths and `file:` URLs are read with `fs`; other values pass through as {@link DetectFileKindInput}.\n */\nexport type NodeAnalyzeInput = string | URL | DetectFileKindInput;\n\nfunction toPathString(pathOrUrl: string | URL): string {\n return pathOrUrl instanceof URL ? fileURLToPath(pathOrUrl) : pathOrUrl;\n}\n\n/**\n * Reads a file from disk into a {@link NamedInput} (binary `Buffer`, basename as `name` for hints).\n */\nexport async function readFileToInput(path: string | URL): Promise<NamedInput<Buffer>> {\n const fsPath = toPathString(path);\n const data = await readFile(fsPath);\n return {\n data,\n name: basename(fsPath),\n };\n}\n\n/** Wraps a `Buffer` as a named payload when you already know the filename. */\nexport function bufferToInput(buffer: Buffer, name?: string): NamedInput<Buffer> {\n return name !== undefined ? 
{ data: buffer, name } : { data: buffer };\n}\n\n/**\n * Resolves paths / `file:` URLs to a {@link DetectFileKindInput}; leaves other values untouched.\n */\nexport async function resolveNodeAnalyzeInput(input: NodeAnalyzeInput): Promise<DetectFileKindInput> {\n if (typeof input === \"string\" || input instanceof URL) {\n return readFileToInput(input);\n }\n return input;\n}\n","import type { AnalysisResult, DetectFileKindInput } from \"@dragon708/docmind-shared\";\nimport {\n analyzeText,\n assertValidAnalyzeFileInput,\n detectFileKind,\n notImplementedResult,\n UNKNOWN_FORMAT_WARNING,\n} from \"@dragon708/docmind-shared\";\nimport { analyzeDocxForNode } from \"./analyzers/docx.js\";\nimport { analyzeImageForNode } from \"./analyzers/image.js\";\nimport { analyzePdfForNode } from \"./analyzers/pdf.js\";\nimport type { NodeAnalyzeOptions } from \"./nodeAnalyzeOptions.js\";\nimport { resolveNodeAnalyzeInput, type NodeAnalyzeInput } from \"./resolveNodeInput.js\";\n\n/**\n * Node router: PDF, DOCX, images (OCR), and text. 
Paths and `file:` URLs are read via `fs`.\n */\nexport async function analyzeFile(\n input: NodeAnalyzeInput,\n options?: NodeAnalyzeOptions,\n): Promise<AnalysisResult> {\n if (options?.signal?.aborted) {\n const err = new Error(\"The operation was aborted\");\n err.name = \"AbortError\";\n throw err;\n }\n\n const resolved = await resolveNodeAnalyzeInput(input);\n assertValidAnalyzeFileInput(resolved);\n\n const fileKind = detectFileKind(resolved);\n\n switch (fileKind) {\n case \"pdf\":\n return analyzePdfForNode(resolved as DetectFileKindInput, options);\n case \"docx\":\n return analyzeDocxForNode(resolved as DetectFileKindInput, options?.signal);\n case \"image\":\n return analyzeImageForNode(resolved as DetectFileKindInput, options);\n case \"text\":\n return analyzeText(resolved as DetectFileKindInput, { signal: options?.signal });\n default:\n return notImplementedResult(fileKind, \"none\", [UNKNOWN_FORMAT_WARNING]);\n }\n}\n","import type { AnalysisResult, DetectFileKindInput } from \"@dragon708/docmind-shared\";\nimport {\n analyzeText,\n assertValidAnalyzeFileInput,\n detectFileKind,\n notImplementedResult,\n UNKNOWN_FORMAT_WARNING,\n} from \"@dragon708/docmind-shared\";\nimport {\n analyzePdf,\n extractPdfMetadata,\n extractTextFromPdf,\n} from \"@dragon708/docmind-pdf\";\nimport {\n analyzeDocx,\n convertDocxToHtml,\n extractTextFromDocx,\n} from \"@dragon708/docmind-docx\";\nimport { ocr } from \"@dragon708/docmind-ocr\";\nimport { bytesFromDetectInput } from \"./inputBytes.js\";\nimport type { NodeAnalyzeOptions } from \"./nodeAnalyzeOptions.js\";\nimport { resolveNodeAnalyzeInput, type NodeAnalyzeInput } from \"./resolveNodeInput.js\";\n\nfunction throwIfAborted(signal?: AbortSignal): void {\n if (signal?.aborted) {\n const err = new Error(\"The operation was aborted\");\n err.name = \"AbortError\";\n throw err;\n }\n}\n\nasync function prepare(input: NodeAnalyzeInput): Promise<DetectFileKindInput> {\n const resolved = await 
resolveNodeAnalyzeInput(input);\n assertValidAnalyzeFileInput(resolved);\n return resolved;\n}\n\nfunction escapeHtmlMinimal(s: string): string {\n return s\n .replace(/&/g, \"&\")\n .replace(/</g, \"<\")\n .replace(/>/g, \">\")\n .replace(/\"/g, \""\");\n}\n\nconst DOCX_METADATA_STUB =\n \"Structured document metadata for DOCX is not exposed as a separate API; use extractText or analyzeFile.\";\n\nconst IMAGE_METADATA_NOTE =\n \"Raster images have no document metadata bundle in this API.\";\n\n/**\n * Text only: PDF → `extractTextFromPdf` (capa de texto, sin OCR); DOCX → `extractTextFromDocx`;\n * imagen → `ocr`; texto → `analyzeText`.\n */\nexport async function extractText(\n input: NodeAnalyzeInput,\n options?: NodeAnalyzeOptions,\n): Promise<AnalysisResult> {\n throwIfAborted(options?.signal);\n const resolved = await prepare(input);\n const kind = detectFileKind(resolved);\n const signal = options?.signal;\n\n switch (kind) {\n case \"pdf\": {\n const data = await bytesFromDetectInput(resolved);\n if (data.byteLength === 0) {\n return {\n fileKind: \"pdf\",\n analyzer: \"pdf\",\n status: \"ok\",\n kind: \"pdf\",\n text: \"\",\n pages: 0,\n metadata: { info: {} },\n warnings: [\"No document bytes were provided for analysis.\"],\n needsOCR: false,\n ocrUsed: false,\n };\n }\n const r = await extractTextFromPdf(data);\n return {\n fileKind: \"pdf\",\n analyzer: \"pdf\",\n status: \"ok\",\n kind: \"pdf\",\n text: r.text,\n pages: r.pages,\n metadata: { info: {} },\n warnings: r.warnings,\n needsOCR: false,\n ocrUsed: false,\n };\n }\n case \"docx\": {\n const data = await bytesFromDetectInput(resolved);\n if (data.byteLength === 0) {\n return {\n fileKind: \"docx\",\n analyzer: \"docx\",\n status: \"ok\",\n kind: \"docx\",\n text: \"\",\n html: \"\",\n warnings: [\"No document bytes were provided for analysis.\"],\n };\n }\n const r = await extractTextFromDocx(data);\n return {\n fileKind: \"docx\",\n analyzer: \"docx\",\n status: \"ok\",\n kind: \"docx\",\n 
text: r.text,\n html: \"\",\n warnings: r.warnings,\n };\n }\n case \"image\": {\n const data = await bytesFromDetectInput(resolved);\n if (data.byteLength === 0) {\n return {\n fileKind: \"image\",\n analyzer: \"image\",\n status: \"ok\",\n kind: \"image\",\n text: \"\",\n confidence: 0,\n ocrUsed: true,\n warnings: [\"No image bytes were provided for analysis.\"],\n };\n }\n const ocrOpts = {\n ...(options?.ocr ?? {}),\n signal: options?.ocr?.signal ?? signal,\n };\n const r = await ocr(data, ocrOpts);\n return {\n fileKind: \"image\",\n analyzer: \"image\",\n status: \"ok\",\n kind: \"image\",\n text: r.text,\n confidence: r.confidence,\n ocrUsed: r.ocrUsed,\n warnings: [],\n };\n }\n case \"text\":\n return analyzeText(resolved as DetectFileKindInput, { signal });\n default:\n return notImplementedResult(kind, \"none\", [UNKNOWN_FORMAT_WARNING]);\n }\n}\n\n/**\n * Metadatos: PDF → `extractPdfMetadata`; resto con mejor esfuerzo o aviso.\n * El resultado sigue siendo `AnalysisResult` (PDF rellena `metadata` en forma `PdfAnalysisCoreResult`).\n */\nexport async function extractMetadata(\n input: NodeAnalyzeInput,\n options?: NodeAnalyzeOptions,\n): Promise<AnalysisResult> {\n throwIfAborted(options?.signal);\n const resolved = await prepare(input);\n const kind = detectFileKind(resolved);\n const signal = options?.signal;\n\n switch (kind) {\n case \"pdf\": {\n const data = await bytesFromDetectInput(resolved);\n if (data.byteLength === 0) {\n return {\n fileKind: \"pdf\",\n analyzer: \"pdf\",\n status: \"ok\",\n kind: \"pdf\",\n text: \"\",\n pages: 0,\n metadata: { info: {} },\n warnings: [\"No document bytes were provided for analysis.\"],\n needsOCR: false,\n ocrUsed: false,\n };\n }\n const r = await extractPdfMetadata(data);\n return {\n fileKind: \"pdf\",\n analyzer: \"pdf\",\n status: \"ok\",\n kind: \"pdf\",\n text: \"\",\n pages: 0,\n metadata: r.metadata,\n warnings: r.warnings,\n needsOCR: false,\n ocrUsed: false,\n };\n }\n case \"docx\":\n return {\n 
fileKind: \"docx\",\n analyzer: \"docx\",\n status: \"ok\",\n kind: \"docx\",\n text: \"\",\n html: \"\",\n warnings: [DOCX_METADATA_STUB],\n };\n case \"image\":\n return {\n fileKind: \"image\",\n analyzer: \"image\",\n status: \"ok\",\n kind: \"image\",\n text: \"\",\n confidence: 0,\n ocrUsed: true,\n warnings: [IMAGE_METADATA_NOTE],\n };\n case \"text\":\n return analyzeText(resolved as DetectFileKindInput, { signal });\n default:\n return notImplementedResult(kind, \"none\", [UNKNOWN_FORMAT_WARNING]);\n }\n}\n\n/**\n * HTML: DOCX → `convertDocxToHtml`; PDF/texto → `<pre>` a partir de texto extraído;\n * imágenes → vacío con aviso.\n */\nexport async function convertToHtml(\n input: NodeAnalyzeInput,\n options?: NodeAnalyzeOptions,\n): Promise<AnalysisResult> {\n throwIfAborted(options?.signal);\n const resolved = await prepare(input);\n const kind = detectFileKind(resolved);\n const signal = options?.signal;\n\n switch (kind) {\n case \"docx\": {\n const data = await bytesFromDetectInput(resolved);\n if (data.byteLength === 0) {\n return {\n fileKind: \"docx\",\n analyzer: \"docx\",\n status: \"ok\",\n kind: \"docx\",\n text: \"\",\n html: \"\",\n warnings: [\"No document bytes were provided for analysis.\"],\n };\n }\n const [textPart, htmlPart] = await Promise.all([\n extractTextFromDocx(data),\n convertDocxToHtml(data),\n ]);\n return {\n fileKind: \"docx\",\n analyzer: \"docx\",\n status: \"ok\",\n kind: \"docx\",\n text: textPart.text,\n html: htmlPart.html,\n warnings: [...textPart.warnings, ...htmlPart.warnings],\n };\n }\n case \"pdf\": {\n const data = await bytesFromDetectInput(resolved);\n if (data.byteLength === 0) {\n return {\n fileKind: \"pdf\",\n analyzer: \"pdf\",\n status: \"ok\",\n kind: \"pdf\",\n text: \"\",\n pages: 0,\n metadata: { info: {} },\n warnings: [\"No document bytes were provided for analysis.\"],\n needsOCR: false,\n ocrUsed: false,\n };\n }\n const r = await extractTextFromPdf(data);\n const html = 
`<pre>${escapeHtmlMinimal(r.text)}</pre>`;\n return {\n fileKind: \"pdf\",\n analyzer: \"pdf\",\n status: \"ok\",\n kind: \"pdf\",\n text: r.text,\n pages: r.pages,\n metadata: { info: {} },\n html,\n warnings: [\n ...r.warnings,\n \"PDF HTML is a plain-text preview wrapped in <pre> (not a visual layout).\",\n ],\n needsOCR: false,\n ocrUsed: false,\n } as AnalysisResult;\n }\n case \"text\": {\n const t = await analyzeText(resolved as DetectFileKindInput, { signal });\n const html = `<pre>${escapeHtmlMinimal(t.text)}</pre>`;\n return {\n ...t,\n html,\n warnings: [\n ...t.warnings,\n \"HTML for plain text is a <pre> wrapper around decoded UTF-8 content.\",\n ],\n } as AnalysisResult;\n }\n case \"image\":\n return {\n fileKind: \"image\",\n analyzer: \"image\",\n status: \"ok\",\n kind: \"image\",\n text: \"\",\n confidence: 0,\n ocrUsed: true,\n warnings: [\"No HTML representation for raster images; use extractText / runOcr.\"],\n };\n default:\n return notImplementedResult(kind, \"none\", [UNKNOWN_FORMAT_WARNING]);\n }\n}\n\n/**\n * OCR: PDF → `analyzePdf` con `ocr: \"force\"`; imagen → `ocr`; DOCX → texto estructurado con aviso\n * (sin OCR); texto → `analyzeText`.\n */\nexport async function runOcr(\n input: NodeAnalyzeInput,\n options?: NodeAnalyzeOptions,\n): Promise<AnalysisResult> {\n throwIfAborted(options?.signal);\n const resolved = await prepare(input);\n const kind = detectFileKind(resolved);\n const signal = options?.signal;\n const lang = options?.ocr?.langs ?? options?.pdf?.ocrLangs;\n\n switch (kind) {\n case \"pdf\": {\n const data = await bytesFromDetectInput(resolved);\n if (data.byteLength === 0) {\n return {\n fileKind: \"pdf\",\n analyzer: \"pdf\",\n status: \"ok\",\n kind: \"pdf\",\n text: \"\",\n pages: 0,\n metadata: { info: {} },\n warnings: [\"No document bytes were provided for analysis.\"],\n needsOCR: false,\n ocrUsed: false,\n };\n }\n const r = await analyzePdf(data, {\n ...options?.pdf,\n ocr: \"force\",\n ocrLangs: lang ?? 
options?.pdf?.ocrLangs,\n signal: options?.pdf?.signal ?? signal,\n });\n return {\n fileKind: \"pdf\",\n analyzer: \"pdf\",\n status: \"ok\",\n kind: \"pdf\",\n text: r.text,\n pages: r.pages,\n metadata: r.metadata,\n warnings: [...r.warnings],\n needsOCR: r.needsOCR,\n ocrUsed: r.ocrUsed,\n };\n }\n case \"image\": {\n const data = await bytesFromDetectInput(resolved);\n if (data.byteLength === 0) {\n return {\n fileKind: \"image\",\n analyzer: \"image\",\n status: \"ok\",\n kind: \"image\",\n text: \"\",\n confidence: 0,\n ocrUsed: true,\n warnings: [\"No image bytes were provided for analysis.\"],\n };\n }\n const ocrOpts = {\n ...(options?.ocr ?? {}),\n signal: options?.ocr?.signal ?? signal,\n };\n const r = await ocr(data, ocrOpts);\n return {\n fileKind: \"image\",\n analyzer: \"image\",\n status: \"ok\",\n kind: \"image\",\n text: r.text,\n confidence: r.confidence,\n ocrUsed: r.ocrUsed,\n warnings: [],\n };\n }\n case \"docx\": {\n const data = await bytesFromDetectInput(resolved);\n if (data.byteLength === 0) {\n return {\n fileKind: \"docx\",\n analyzer: \"docx\",\n status: \"ok\",\n kind: \"docx\",\n text: \"\",\n html: \"\",\n warnings: [\"No document bytes were provided for analysis.\"],\n };\n }\n const r = await analyzeDocx(data);\n return {\n fileKind: \"docx\",\n analyzer: \"docx\",\n status: \"ok\",\n kind: \"docx\",\n text: r.text,\n html: r.html,\n warnings: [\n ...r.warnings,\n \"OCR does not apply to DOCX; returned structured text/HTML extract.\",\n ],\n };\n }\n case \"text\":\n return analyzeText(resolved as DetectFileKindInput, { signal });\n default:\n return notImplementedResult(kind, \"none\", [UNKNOWN_FORMAT_WARNING]);\n }\n}\n"]}
|