@dragon708/docmind-node 1.0.0 → 1.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.d.ts CHANGED
@@ -1,11 +1,16 @@
1
- import { DetectFileKindInput, NamedInput, AnalysisResult } from '@dragon708/docmind-shared';
2
- export { AnalysisAnalyzer, AnalysisResult, DetectFileKindInput, DocxAnalysisCoreResult, FileKind, FileKindMetadata, GenericAnalysisResult, ImageAnalysisCoreResult, PdfAnalysisCoreResult, TextAnalysisResult } from '@dragon708/docmind-shared';
1
+ import { DocMindAnalyzeOptions, DetectFileKindInput, NamedInput, AnalysisResult, FileKind, RuntimeDescriptor, DocMindPublicIntent, AnalysisAnalyzer, ProcessingPlanDescriptor, ExplainAnalysisPlanOptions, GetCapabilitiesOptions } from '@dragon708/docmind-shared';
2
+ export { AnalysisAnalyzer, AnalysisResult, CapabilityDescriptor, DetectFileKindInput, DocMindPublicIntent, DocxAnalysisCoreResult, ExplainAnalysisPlanOptions, ExplainAnalysisPlanResult, FileKind, FileKindMetadata, GenericAnalysisResult, GetCapabilitiesOptions, GetCapabilitiesResult, ImageAnalysisCoreResult, PdfAnalysisCoreResult, TextAnalysisResult, detectFileKind } from '@dragon708/docmind-shared';
3
3
  import { OcrOptions } from '@dragon708/docmind-ocr';
4
4
  import { PdfAnalyzeOptions } from '@dragon708/docmind-pdf';
5
5
 
6
- /** Options for {@link analyzeFile} in the Node entry (`pdf` / `ocr` forwarded to format packages). */
7
- interface NodeAnalyzeOptions {
8
- readonly signal?: AbortSignal;
6
+ /**
7
+ * Options for Node public APIs (`analyzeFile`, intent methods).
8
+ *
9
+ * - **`pdf`**: forwarded to `@dragon708/docmind-pdf`. `analyzeFile` defaults `pdf.ocr` to `"auto"` when omitted.
10
+ * {@link extractText} / {@link convertToHtml} merge a default of `ocr: "off"` unless you set `pdf.ocr` explicitly.
11
+ * - **`ocr`**: forwarded to `@dragon708/docmind-ocr` for raster images; language string also feeds PDF OCR when `pdf.ocrLangs` is unset.
12
+ */
13
+ interface NodeAnalyzeOptions extends DocMindAnalyzeOptions {
9
14
  readonly pdf?: PdfAnalyzeOptions;
10
15
  readonly ocr?: OcrOptions;
11
16
  }
@@ -27,8 +32,91 @@ declare function bufferToInput(buffer: Buffer, name?: string): NamedInput<Buffer
27
32
  declare function resolveNodeAnalyzeInput(input: NodeAnalyzeInput): Promise<DetectFileKindInput>;
28
33
 
29
34
  /**
30
- * Node router: PDF, DOCX, images (OCR), and text. Paths and `file:` URLs are read via `fs`.
35
+ * Resolves {@link NodeAnalyzeInput} (paths read from disk), classifies with {@link detectFileKind}, then runs
36
+ * the PDF, DOCX, image, or text pipeline. PDF OCR defaults to `"auto"` when `options.pdf.ocr` is omitted.
31
37
  */
32
38
  declare function analyzeFile(input: NodeAnalyzeInput, options?: NodeAnalyzeOptions): Promise<AnalysisResult>;
33
39
 
34
- export { type NodeAnalyzeInput, type NodeAnalyzeOptions, analyzeFile, bufferToInput, readFileToInput, resolveNodeAnalyzeInput };
40
+ /**
41
+ * Plain-text extraction using {@link analyzeFile} routing. PDFs default to **text layer only**
42
+ * (`pdf.ocr: "off"`) unless you set `options.pdf.ocr` explicitly.
43
+ */
44
+ declare function extractText(input: NodeAnalyzeInput, options?: NodeAnalyzeOptions): Promise<AnalysisResult>;
45
+ /**
46
+ * Metadata: PDF uses lightweight metadata extraction; DOCX/images return stubs; plain text uses the same
47
+ * router as {@link extractText} (`analyzeFile` with PDF OCR off by default).
48
+ */
49
+ declare function extractMetadata(input: NodeAnalyzeInput, options?: NodeAnalyzeOptions): Promise<AnalysisResult>;
50
+ /**
51
+ * HTML: DOCX and plain text go through {@link analyzeFile} (then `<pre>` for text). PDF uses the text layer
52
+ * only wrapped in `<pre>` (no OCR). Images return a stub without running OCR.
53
+ */
54
+ declare function convertToHtml(input: NodeAnalyzeInput, options?: NodeAnalyzeOptions): Promise<AnalysisResult>;
55
+ /**
56
+ * OCR intent: PDF always runs {@link analyzePdf} with `ocr: "force"` (merged with `options.pdf`).
57
+ * Raster images run Tesseract via `options.ocr`. DOCX returns structured extract with a notice.
58
+ */
59
+ declare function runOcr(input: NodeAnalyzeInput, options?: NodeAnalyzeOptions): Promise<AnalysisResult>;
60
+
61
+ /**
62
+ * Whether DocMind will try a non-OCR text/HTML path (e.g. Mammoth, pdf-parse text layer, UTF-8).
63
+ */
64
+ interface NativeExtractionPlan {
65
+ readonly willAttempt: boolean;
66
+ readonly description: string;
67
+ }
68
+ /** Whether OCR (raster or PDF pipeline) may run for this intent + kind. */
69
+ interface OcrPlan {
70
+ readonly mayUse: boolean;
71
+ readonly description: string;
72
+ }
73
+ /**
74
+ * Structured explanation of what DocMind would do for a public intent in Node (no heavy I/O).
75
+ */
76
+ interface ExplainAnalysisPlanReport {
77
+ readonly kind: FileKind;
78
+ readonly detectedKind: FileKind;
79
+ readonly runtime: RuntimeDescriptor;
80
+ readonly intent: DocMindPublicIntent | (string & {});
81
+ readonly primaryAnalyzer: AnalysisAnalyzer;
82
+ readonly nativeExtraction: NativeExtractionPlan;
83
+ readonly ocr: OcrPlan;
84
+ readonly limitations: readonly string[];
85
+ readonly plan: ProcessingPlanDescriptor;
86
+ readonly warnings?: readonly string[];
87
+ }
88
+
89
+ /** High-level features the user can ask DocMind for (per input kind and runtime). */
90
+ type PublicCapabilityId = "text" | "metadata" | "html" | "ocr" | "pages";
91
+ /** Whether a {@link PublicCapabilityId} applies to the detected file in this runtime. */
92
+ interface PublicCapabilitySupport {
93
+ readonly id: PublicCapabilityId;
94
+ readonly supported: boolean;
95
+ readonly warnings?: readonly string[];
96
+ }
97
+ /**
98
+ * Result of {@link getCapabilities}: detected kind, runtime id, per-feature support for this input, and optional global warnings.
99
+ */
100
+ interface GetCapabilitiesReport {
101
+ readonly kind: FileKind;
102
+ readonly runtime: RuntimeDescriptor;
103
+ readonly capabilities: readonly PublicCapabilitySupport[];
104
+ readonly warnings?: readonly string[];
105
+ }
106
+
107
+ /** Options for {@link explainAnalysisPlan} including PDF/OCR hints for accurate planning. */
108
+ type NodeExplainAnalysisPlanOptions = ExplainAnalysisPlanOptions & Pick<NodeAnalyzeOptions, "pdf" | "ocr">;
109
+
110
+ /**
111
+ * Epic 1 — **Capabilities:** after resolving {@link NodeAnalyzeInput}, lists which of
112
+ * `text` | `metadata` | `html` | `ocr` | `pages` apply for that kind in Node (PDF fully supported).
113
+ * Does not run Mammoth/Tesseract/PDF bodies beyond path resolution.
114
+ */
115
+ declare function getCapabilities(input: NodeAnalyzeInput, options?: GetCapabilitiesOptions): Promise<GetCapabilitiesReport>;
116
+ /**
117
+ * Epic 1 — **Plan preview:** same shape as browser; PDF branches include `pdf.ocr` from options (`off` | `auto` | `force`).
118
+ * No full document parse unless resolving a path reads the file.
119
+ */
120
+ declare function explainAnalysisPlan(input: NodeAnalyzeInput, options?: NodeExplainAnalysisPlanOptions): Promise<ExplainAnalysisPlanReport>;
121
+
122
+ export { type ExplainAnalysisPlanReport, type GetCapabilitiesReport, type NativeExtractionPlan, type NodeAnalyzeInput, type NodeAnalyzeOptions, type NodeExplainAnalysisPlanOptions, type OcrPlan, type PublicCapabilityId, type PublicCapabilitySupport, analyzeFile, bufferToInput, convertToHtml, explainAnalysisPlan, extractMetadata, extractText, getCapabilities, readFileToInput, resolveNodeAnalyzeInput, runOcr };
package/dist/index.js CHANGED
@@ -1,7 +1,8 @@
1
1
  import { assertValidAnalyzeFileInput, detectFileKind, notImplementedResult, UNKNOWN_FORMAT_WARNING, analyzeText, toUint8Array, isNamedInput, isBinaryInput, isBlob, isFile } from '@dragon708/docmind-shared';
2
+ export { detectFileKind } from '@dragon708/docmind-shared';
2
3
  import { analyzeDocx } from '@dragon708/docmind-docx';
3
4
  import { ocr } from '@dragon708/docmind-ocr';
4
- import { analyzePdf } from '@dragon708/docmind-pdf';
5
+ import { extractPdfMetadata, extractTextFromPdf, analyzePdf } from '@dragon708/docmind-pdf';
5
6
  import { readFile } from 'fs/promises';
6
7
  import { basename } from 'path';
7
8
  import { fileURLToPath } from 'url';
@@ -169,6 +170,723 @@ async function analyzeFile(input, options) {
169
170
  }
170
171
  }
171
172
 
172
- export { analyzeFile, bufferToInput, readFileToInput, resolveNodeAnalyzeInput };
173
+ // src/intentPdfOptions.ts
174
/**
 * Builds a shallow copy of `options` whose `pdf.ocr` is `"off"` unless the
 * caller already supplied a value for it. All other top-level and `pdf`
 * fields are carried over unchanged.
 *
 * @param options Analyze options, possibly undefined.
 * @returns A new options object that always has a `pdf` entry with `ocr` set.
 */
function withPdfOcrDefaultOff(options) {
  const callerPdf = options?.pdf;
  const pdfWithDefault = { ...callerPdf, ocr: callerPdf?.ocr ?? "off" };
  return { ...options, pdf: pdfWithDefault };
}
183
+
184
+ // src/internal/abort.ts
185
+ function throwIfAborted(signal) {
186
+ if (signal?.aborted) {
187
+ const err = new Error("The operation was aborted");
188
+ err.name = "AbortError";
189
+ throw err;
190
+ }
191
+ }
192
+
193
+ // src/publicActions.ts
194
+ var DOCX_METADATA_STUB = "Structured document metadata for DOCX is not exposed as a separate API; use extractText or analyzeFile.";
195
+ var IMAGE_METADATA_NOTE = "Raster images have no document metadata bundle in this API.";
196
+ var RUN_OCR_PDF_FORCE_SEMANTICS = 'runOcr: PDF pipeline ran with `ocr: "force"` so text may include raster OCR output even when a text layer exists.';
197
/**
 * Escapes the five HTML-significant characters (`&`, `<`, `>`, `"`, `'`) so
 * the text can be embedded in element content or in a quoted attribute.
 *
 * The apostrophe is escaped as well as the double quote, so the result stays
 * safe inside single-quoted attributes. The replacement is done in a single
 * pass, which also guarantees that already-produced entities are never
 * re-escaped within the same call.
 *
 * @param s Raw text.
 * @returns The escaped text.
 */
function escapeHtmlMinimal(s) {
  const entities = {
    "&": "&amp;",
    "<": "&lt;",
    ">": "&gt;",
    '"': "&quot;",
    "'": "&#39;"
  };
  return s.replace(/[&<>"']/g, (ch) => entities[ch]);
}
200
/**
 * Resolves a Node analyze input via `resolveNodeAnalyzeInput`, validates the
 * outcome with `assertValidAnalyzeFileInput`, and hands the resolved value back.
 *
 * @param input Value accepted by `resolveNodeAnalyzeInput`.
 * @returns The resolved, validated input.
 */
async function prepare(input) {
  const detectInput = await resolveNodeAnalyzeInput(input);
  assertValidAnalyzeFileInput(detectInput);
  return detectInput;
}
205
/**
 * Shapes a full analysis result for the `extractText` intent: a successful
 * DOCX result is copied with its `html` field blanked; every other result is
 * returned as the same object.
 *
 * @param full Result produced by the analysis pipeline.
 * @returns `full` itself, or a copy with `html: ""` for successful DOCX results.
 */
function toExtractTextResult(full) {
  const isSuccessfulDocx = full.status === "ok" && full.fileKind === "docx";
  return isSuccessfulDocx ? { ...full, html: "" } : full;
}
212
/**
 * Text-extraction intent: runs `analyzeFile` with `pdf.ocr` defaulting to
 * `"off"` (see `withPdfOcrDefaultOff`) and post-processes the result with
 * `toExtractTextResult`.
 *
 * @param input Node analyze input.
 * @param options Optional analyze options; `options.signal` is checked up front.
 * @returns The analysis result shaped for text extraction.
 */
async function extractText(input, options) {
  throwIfAborted(options?.signal);
  const textLayerOptions = withPdfOcrDefaultOff(options);
  return toExtractTextResult(await analyzeFile(input, textLayerOptions));
}
217
/**
 * Metadata intent.
 *
 * - PDF: reads the bytes and calls `extractPdfMetadata`; `text` is empty and
 *   `pages` is reported as 0 because no text pass is run here.
 * - DOCX / image: returns a stub result carrying an explanatory warning.
 * - text: delegates to `analyzeFile` with PDF OCR defaulting to off.
 * - anything else: `notImplementedResult` with the unknown-format warning.
 *
 * @param input Node analyze input.
 * @param options Optional analyze options; `options.signal` aborts the call.
 * @returns The analysis result for the detected kind.
 * @throws An `AbortError`-named error when `options.signal` is aborted before
 *   the call or while the input is being resolved.
 */
async function extractMetadata(input, options) {
  const signal = options?.signal;
  throwIfAborted(signal);
  const resolved = await prepare(input);
  // Resolving is asynchronous; honor an abort that arrived in the meantime
  // (the original had a no-op `options?.signal;` statement here).
  throwIfAborted(signal);
  const kind = detectFileKind(resolved);
  switch (kind) {
    case "pdf": {
      const data = await bytesFromDetectInput(resolved);
      if (data.byteLength === 0) {
        return {
          fileKind: "pdf",
          analyzer: "pdf",
          status: "ok",
          kind: "pdf",
          text: "",
          pages: 0,
          metadata: { info: {} },
          warnings: ["No document bytes were provided for analysis."],
          needsOCR: false,
          ocrUsed: false
        };
      }
      const r = await extractPdfMetadata(data);
      return {
        fileKind: "pdf",
        analyzer: "pdf",
        status: "ok",
        kind: "pdf",
        text: "",
        pages: 0,
        metadata: r.metadata,
        warnings: r.warnings,
        needsOCR: false,
        ocrUsed: false
      };
    }
    case "docx":
      // Stub: no DOCX metadata extractor is invoked.
      return {
        fileKind: "docx",
        analyzer: "docx",
        status: "ok",
        kind: "docx",
        text: "",
        html: "",
        warnings: [DOCX_METADATA_STUB]
      };
    case "image":
      // Stub: no OCR is run for the metadata intent.
      return {
        fileKind: "image",
        analyzer: "image",
        status: "ok",
        kind: "image",
        text: "",
        confidence: 0,
        ocrUsed: true,
        warnings: [IMAGE_METADATA_NOTE]
      };
    case "text":
      return analyzeFile(input, withPdfOcrDefaultOff(options));
    default:
      return notImplementedResult(kind, "none", [UNKNOWN_FORMAT_WARNING]);
  }
}
280
/**
 * HTML intent.
 *
 * - DOCX: returns the `analyzeFile` result unchanged.
 * - text: runs `analyzeFile`, then wraps the escaped text in `<pre>`.
 * - PDF: extracts text with `extractTextFromPdf` (no OCR) and wraps the
 *   escaped text in `<pre>`.
 * - image: returns a stub with a warning; no OCR is run.
 * - anything else: `notImplementedResult` with the unknown-format warning.
 *
 * @param input Node analyze input.
 * @param options Optional analyze options; `options.signal` aborts the call.
 * @returns The analysis result, with `html` populated where applicable.
 * @throws An `AbortError`-named error when `options.signal` is aborted before
 *   the call or while the input is being resolved.
 */
async function convertToHtml(input, options) {
  const signal = options?.signal;
  throwIfAborted(signal);
  const resolved = await prepare(input);
  // Resolving is asynchronous; honor an abort that arrived in the meantime
  // (the original had a no-op `options?.signal;` statement here).
  throwIfAborted(signal);
  const kind = detectFileKind(resolved);
  if (kind === "docx") {
    return analyzeFile(input, withPdfOcrDefaultOff(options));
  }
  if (kind === "text") {
    const r = await analyzeFile(input, withPdfOcrDefaultOff(options));
    // Pass through anything that is not a successful text result.
    if (r.status !== "ok") return r;
    if (r.fileKind !== "text") return r;
    const html = `<pre>${escapeHtmlMinimal(r.text)}</pre>`;
    return {
      ...r,
      html,
      warnings: [
        ...r.warnings,
        "HTML for plain text is a <pre> wrapper around decoded UTF-8 content."
      ]
    };
  }
  if (kind === "pdf") {
    const data = await bytesFromDetectInput(resolved);
    if (data.byteLength === 0) {
      return {
        fileKind: "pdf",
        analyzer: "pdf",
        status: "ok",
        kind: "pdf",
        text: "",
        pages: 0,
        metadata: { info: {} },
        warnings: ["No document bytes were provided for analysis."],
        needsOCR: false,
        ocrUsed: false
      };
    }
    const r = await extractTextFromPdf(data);
    const html = `<pre>${escapeHtmlMinimal(r.text)}</pre>`;
    return {
      fileKind: "pdf",
      analyzer: "pdf",
      status: "ok",
      kind: "pdf",
      text: r.text,
      pages: r.pages,
      metadata: { info: {} },
      html,
      warnings: [
        ...r.warnings,
        "PDF HTML is a plain-text preview wrapped in <pre> (not a visual layout)."
      ],
      needsOCR: false,
      ocrUsed: false
    };
  }
  if (kind === "image") {
    return {
      fileKind: "image",
      analyzer: "image",
      status: "ok",
      kind: "image",
      text: "",
      confidence: 0,
      ocrUsed: true,
      warnings: ["No HTML representation for raster images; use extractText / runOcr."]
    };
  }
  return notImplementedResult(kind, "none", [UNKNOWN_FORMAT_WARNING]);
}
351
/**
 * OCR intent.
 *
 * - PDF: calls `analyzePdf` with `ocr: "force"` merged over `options.pdf`.
 *   OCR languages come from `options.pdf.ocrLangs`, falling back to
 *   `options.ocr.langs` only when the PDF-specific value is unset.
 * - image: calls `ocr` with `options.ocr`.
 * - DOCX: calls `analyzeDocx` and appends a notice that OCR does not apply.
 * - text: delegates to `analyzeText`.
 * - anything else: `notImplementedResult` with the unknown-format warning.
 *
 * A format-specific `signal` (`options.pdf.signal` / `options.ocr.signal`)
 * takes precedence over the top-level `options.signal` when forwarding.
 *
 * @param input Node analyze input.
 * @param options Optional analyze options.
 * @returns The analysis result for the detected kind.
 * @throws An `AbortError`-named error when `options.signal` is aborted before
 *   the call or while the input is being resolved.
 */
async function runOcr(input, options) {
  const signal = options?.signal;
  throwIfAborted(signal);
  const resolved = await prepare(input);
  // Resolving is asynchronous; do not start OCR work after an abort.
  throwIfAborted(signal);
  const kind = detectFileKind(resolved);
  switch (kind) {
    case "pdf": {
      const data = await bytesFromDetectInput(resolved);
      if (data.byteLength === 0) {
        return {
          fileKind: "pdf",
          analyzer: "pdf",
          status: "ok",
          kind: "pdf",
          text: "",
          pages: 0,
          metadata: { info: {} },
          warnings: ["No document bytes were provided for analysis."],
          needsOCR: false,
          ocrUsed: false
        };
      }
      const r = await analyzePdf(data, {
        ...options?.pdf,
        ocr: "force",
        // An explicit pdf.ocrLangs wins; the generic OCR languages are only a fallback.
        ocrLangs: options?.pdf?.ocrLangs ?? options?.ocr?.langs,
        signal: options?.pdf?.signal ?? signal
      });
      return {
        fileKind: "pdf",
        analyzer: "pdf",
        status: "ok",
        kind: "pdf",
        text: r.text,
        pages: r.pages,
        metadata: r.metadata,
        warnings: [RUN_OCR_PDF_FORCE_SEMANTICS, ...r.warnings],
        needsOCR: r.needsOCR,
        ocrUsed: r.ocrUsed
      };
    }
    case "image": {
      const data = await bytesFromDetectInput(resolved);
      if (data.byteLength === 0) {
        return {
          fileKind: "image",
          analyzer: "image",
          status: "ok",
          kind: "image",
          text: "",
          confidence: 0,
          ocrUsed: true,
          warnings: ["No image bytes were provided for analysis."]
        };
      }
      const ocrOpts = {
        ...options?.ocr ?? {},
        signal: options?.ocr?.signal ?? signal
      };
      const r = await ocr(data, ocrOpts);
      return {
        fileKind: "image",
        analyzer: "image",
        status: "ok",
        kind: "image",
        text: r.text,
        confidence: r.confidence,
        ocrUsed: r.ocrUsed,
        warnings: []
      };
    }
    case "docx": {
      const data = await bytesFromDetectInput(resolved);
      if (data.byteLength === 0) {
        return {
          fileKind: "docx",
          analyzer: "docx",
          status: "ok",
          kind: "docx",
          text: "",
          html: "",
          warnings: ["No document bytes were provided for analysis."]
        };
      }
      const r = await analyzeDocx(data);
      return {
        fileKind: "docx",
        analyzer: "docx",
        status: "ok",
        kind: "docx",
        text: r.text,
        html: r.html,
        warnings: [
          ...r.warnings,
          "OCR does not apply to DOCX; returned structured text/HTML extract."
        ]
      };
    }
    case "text":
      return analyzeText(resolved, { signal });
    default:
      return notImplementedResult(kind, "none", [UNKNOWN_FORMAT_WARNING]);
  }
}
456
+
457
+ // src/analysisPlanReport.ts
458
/**
 * Collects the truthy arguments, in order, into a new array.
 *
 * @param items Candidate limitation strings; falsy entries are dropped.
 * @returns The retained entries.
 */
function lim(...items) {
  const kept = [];
  for (const item of items) {
    if (item) {
      kept.push(item);
    }
  }
  return kept;
}
461
/**
 * Assembles the explain-plan report for the Node runtime.
 *
 * For an `"unknown"` kind a fixed "no analyzer" report is returned. Otherwise
 * the native-extraction / OCR descriptions and limitations are looked up by
 * intent and kind; intents without an entry get a generic description.
 *
 * @param kind Detected file kind.
 * @param intent Requested public intent.
 * @param pdfOcr PDF OCR mode; only consulted for `analyzeFile` on PDFs.
 * @param plan Processing plan to embed in the report as-is.
 * @returns The report object (`kind`, `detectedKind`, `runtime`, `intent`,
 *   `primaryAnalyzer`, `nativeExtraction`, `ocr`, `limitations`, `plan`).
 */
function buildNodeExplainReport(kind, intent, pdfOcr, plan) {
  const runtime = { id: "node" };
  const native = (willAttempt, description) => ({ willAttempt, description });
  const ocrUse = (mayUse, description) => ({ mayUse, description });
  // Bundles one branch's outcome; trailing arguments become the limitations list.
  const outline = (nativeExtraction, ocrSection, ...notes) => ({
    nativeExtraction,
    ocrSection,
    limitations: lim(...notes)
  });

  if (kind === "unknown") {
    return {
      kind,
      detectedKind: kind,
      runtime,
      intent,
      primaryAnalyzer: "none",
      nativeExtraction: native(false, "No analyzer without a known file kind."),
      ocr: ocrUse(false, "OCR is not used for unknown kinds."),
      limitations: lim(
        "Could not classify the file from name, MIME, or bytes; analysis will return not_implemented until hints improve."
      ),
      plan
    };
  }

  // One describer per public intent; the last return of each is the catch-all branch.
  const byIntent = {
    analyzeFile() {
      if (kind === "pdf") {
        let ocrDescription = "Raster OCR may run when the text layer is empty (pdf.ocr: auto).";
        if (pdfOcr === "off") {
          ocrDescription = "Raster OCR pipeline is off (pdf.ocr: off).";
        } else if (pdfOcr === "force") {
          ocrDescription = "Raster OCR may run on all pages when pdf.ocr is force.";
        }
        return outline(
          native(true, "pdf-parse extracts embedded text and page count first."),
          ocrUse(pdfOcr !== "off", ocrDescription)
        );
      }
      if (kind === "docx") {
        return outline(
          native(true, "Mammoth extracts text and HTML from OOXML."),
          ocrUse(false, "DOCX does not use OCR in DocMind.")
        );
      }
      if (kind === "image") {
        return outline(
          native(false, "Images have no native text layer; text comes from OCR only."),
          ocrUse(true, "Tesseract runs on supported raster formats.")
        );
      }
      return outline(
        native(true, "UTF-8 decode with BOM handling for plain text."),
        ocrUse(false, "OCR does not apply to text files.")
      );
    },
    extractText() {
      if (kind === "pdf") {
        return outline(
          native(true, "Text layer via pdf-parse; defaults to pdf.ocr off unless you override."),
          ocrUse(
            false,
            "extractText merges pdf.ocr default off \u2014 no raster OCR unless you set pdf.ocr explicitly."
          )
        );
      }
      if (kind === "docx") {
        return outline(
          native(true, "Mammoth plain text; HTML cleared in the extractText response."),
          ocrUse(false, "DOCX does not use OCR.")
        );
      }
      if (kind === "image") {
        return outline(
          native(false, "No embedded text layer."),
          ocrUse(true, "OCR produces text for images.")
        );
      }
      return outline(native(true, "UTF-8 decode only."), ocrUse(false, "OCR does not apply."));
    },
    extractMetadata() {
      if (kind === "pdf") {
        return outline(
          native(true, "Lightweight PDF info/XMP normalization without full OCR."),
          ocrUse(false, "extractMetadata does not run the OCR pipeline.")
        );
      }
      if (kind === "docx" || kind === "image") {
        return outline(
          native(false, "Stub response; no heavy extractor."),
          ocrUse(false, "OCR not used for this metadata path."),
          kind === "docx"
            ? "Structured DOCX metadata is not exposed separately."
            : "Raster images have no document metadata bundle."
        );
      }
      return outline(
        native(true, "Decoded text only; no structured document metadata."),
        ocrUse(false, "OCR does not apply."),
        "Plain text has no structured document metadata."
      );
    },
    convertToHtml() {
      if (kind === "pdf") {
        return outline(
          native(true, "Text layer extracted then wrapped in <pre> (not visual layout)."),
          ocrUse(false, "convertToHtml does not run PDF OCR."),
          "PDF HTML is a plain-text preview, not page layout."
        );
      }
      if (kind === "docx") {
        return outline(
          native(true, "Mammoth HTML output via analyzeFile routing."),
          ocrUse(false, "DOCX path does not use OCR.")
        );
      }
      if (kind === "text") {
        return outline(
          native(true, "UTF-8 decode then <pre> wrapper."),
          ocrUse(false, "OCR does not apply.")
        );
      }
      return outline(
        native(false, "No HTML path for raster images."),
        ocrUse(false, "OCR does not emit layout HTML here."),
        "Use extractText or runOcr for image text."
      );
    },
    runOcr() {
      if (kind === "pdf") {
        return outline(
          native(true, "pdf-parse runs first; text may be replaced by raster OCR output."),
          ocrUse(true, 'runOcr always sets pdf.ocr to "force" for PDFs.'),
          "Forced OCR may run even when a text layer exists."
        );
      }
      if (kind === "image") {
        return outline(
          native(false, "No native text layer."),
          ocrUse(true, "Tesseract OCR on the image bytes.")
        );
      }
      if (kind === "docx") {
        return outline(
          native(true, "Full Mammoth extract (text + HTML); not OCR."),
          ocrUse(false, "DOCX is not OCR'd."),
          "Result is structured extract, not OCR output."
        );
      }
      return outline(
        native(true, "UTF-8 decode only."),
        ocrUse(false, "OCR does not apply to text files.")
      );
    }
  };

  // Only exact string matches on own keys count, mirroring strict `switch` matching.
  const isKnownIntent =
    typeof intent === "string" && Object.prototype.hasOwnProperty.call(byIntent, intent);
  const details = isKnownIntent
    ? byIntent[intent]()
    : outline(native(false, "Generic intent; see plan."), ocrUse(false, "See plan steps."));

  const analyzerKinds = ["pdf", "docx", "image", "text"];
  return {
    kind,
    detectedKind: kind,
    runtime,
    intent,
    primaryAnalyzer: analyzerKinds.includes(kind) ? kind : "none",
    nativeExtraction: details.nativeExtraction,
    ocr: details.ocrSection,
    limitations: details.limitations,
    plan
  };
}
640
+
641
+ // src/capabilityReport.ts
642
+ var DOCX_META = "Structured document metadata is not exposed separately; extractMetadata returns a stub for DOCX.";
643
+ var IMAGE_META = "Raster images have no document metadata bundle; extractMetadata returns a stub.";
644
+ var IMAGE_HTML = "No layout HTML for raster images; use extractText or runOcr for text.";
645
+ var TEXT_META_NOTE = "Plain text has no structured document metadata; extractMetadata still returns decoded content.";
646
+ var UNKNOWN_KIND = "Could not determine file kind from name, MIME, or bytes; all features are reported as unsupported until the kind is known.";
647
/**
 * Creates one capability entry. The `warnings` key is present only when a
 * non-empty warnings list is supplied.
 *
 * @param id Capability identifier.
 * @param supported Whether the capability applies.
 * @param warnings Optional notes attached to the capability.
 * @returns `{ id, supported }`, plus `warnings` when non-empty.
 */
function slot(id, supported, warnings) {
  if (!warnings?.length) {
    return { id, supported };
  }
  return { id, supported, warnings };
}
650
/**
 * Produces the capability report for a detected file kind in the Node runtime.
 *
 * Known kinds (`pdf`, `docx`, `image`, `text`) get their per-feature support
 * list; any other kind gets every feature marked unsupported plus a top-level
 * warning.
 *
 * @param kind Detected file kind.
 * @returns `{ kind, runtime, capabilities, warnings }`, where `warnings` is
 *   `undefined` for known kinds.
 */
function buildNodeCapabilityReport(kind) {
  // Factories (rather than shared arrays) so each call returns fresh objects.
  const capabilityFactories = {
    pdf: () => [
      slot("text", true, ["Includes text layer extraction; use extractText options to avoid PDF OCR."]),
      slot("metadata", true),
      slot("html", true, ["HTML is a <pre> preview of extracted text, not visual layout."]),
      slot("ocr", true, ["Raster OCR is available (e.g. analyzeFile with pdf.ocr auto/force, or runOcr)."]),
      slot("pages", true)
    ],
    docx: () => [
      slot("text", true),
      slot("metadata", false, [DOCX_META]),
      slot("html", true),
      slot("ocr", false, ["OCR does not apply to DOCX in DocMind."]),
      slot("pages", false)
    ],
    image: () => [
      slot("text", true, ["Text is obtained via OCR."]),
      slot("metadata", false, [IMAGE_META]),
      slot("html", false, [IMAGE_HTML]),
      slot("ocr", true),
      slot("pages", false)
    ],
    text: () => [
      slot("text", true),
      slot("metadata", true, [TEXT_META_NOTE]),
      slot("html", true),
      slot("ocr", false, ["OCR does not apply to plain text files."]),
      slot("pages", false)
    ]
  };

  // Only exact string matches on own keys count, mirroring strict `switch` matching.
  const isKnownKind =
    typeof kind === "string" && Object.prototype.hasOwnProperty.call(capabilityFactories, kind);

  const capabilities = isKnownKind
    ? capabilityFactories[kind]()
    : ["text", "metadata", "html", "ocr", "pages"].map((id) => slot(id, false));

  return {
    kind,
    runtime: { id: "node" },
    capabilities,
    warnings: isKnownKind ? void 0 : [UNKNOWN_KIND]
  };
}
708
+
709
+ // src/introspection.ts
710
/**
 * Reads the OCR mode from PDF options, defaulting to `"auto"` when the options
 * object or its `ocr` field is null/undefined.
 *
 * @param pdf Optional PDF options.
 * @returns The configured `ocr` value or `"auto"`.
 */
function resolvePdfOcrMode(pdf) {
  const configured = pdf?.ocr;
  if (configured === undefined || configured === null) {
    return "auto";
  }
  return configured;
}
713
/**
 * Builds the `analyzeFile` plan for a file kind: a completed `detect_kind`
 * step followed by the kind-specific steps. For PDFs the `pdf_ocr` step is
 * `"skipped"` when `pdfOcr` is `"off"` and `"planned"` otherwise. Kinds other
 * than pdf/docx/image/text end in a failed `route` step.
 *
 * @param kind Detected file kind.
 * @param pdfOcr PDF OCR mode; only consulted for PDFs.
 * @returns `{ intent: "analyzeFile", steps }`.
 */
function planAnalyzeFile(kind, pdfOcr) {
  const step = (id, status) => ({ id, status });

  let followUp;
  if (kind === "pdf") {
    followUp = [
      step("pdf_parse", "planned"),
      step("pdf_ocr", pdfOcr === "off" ? "skipped" : "planned")
    ];
  } else if (kind === "docx") {
    followUp = [step("docx_mammoth", "planned")];
  } else if (kind === "image") {
    followUp = [step("image_ocr", "planned")];
  } else if (kind === "text") {
    followUp = [step("utf8_decode", "planned")];
  } else {
    followUp = [step("route", "failed")];
  }

  return {
    intent: "analyzeFile",
    steps: [step("detect_kind", "done"), ...followUp]
  };
}
761
/**
 * Builds the processing plan for a public intent and file kind.
 *
 * - `analyzeFile` (also the default when `intentOpt` is null/undefined, and
 *   the fallback for intents without an entry): `planAnalyzeFile` with
 *   `pdfOcrForAnalyze`.
 * - `extractText`: the `analyzeFile` plan computed with PDF OCR `"off"`,
 *   relabelled as `extractText`.
 * - `extractMetadata`, `convertToHtml`, `runOcr`: a completed `detect_kind`
 *   step followed by the intent/kind specific steps listed below.
 *
 * @param intentOpt Requested intent, possibly undefined.
 * @param kind Detected file kind.
 * @param pdfOcrForAnalyze PDF OCR mode used for the `analyzeFile` plan.
 * @returns `{ intent, steps }`.
 */
function planForIntent(intentOpt, kind, pdfOcrForAnalyze) {
  const intent = intentOpt ?? "analyzeFile";

  // Builds a plan for `label` from [id, status] pairs, prefixed by the done detect step.
  const outline = (label, ...pairs) => ({
    intent: label,
    steps: [
      { id: "detect_kind", status: "done" },
      ...pairs.map(([id, status]) => ({ id, status }))
    ]
  });

  if (intent === "extractText") {
    return { ...planAnalyzeFile(kind, "off"), intent: "extractText" };
  }

  if (intent === "extractMetadata") {
    switch (kind) {
      case "pdf":
        return outline("extractMetadata", ["pdf_metadata", "planned"]);
      case "text":
        return outline("extractMetadata", ["utf8_decode", "planned"]);
      default: {
        const stubApplies = kind === "docx" || kind === "image";
        return outline("extractMetadata", ["metadata_stub", stubApplies ? "planned" : "skipped"]);
      }
    }
  }

  if (intent === "convertToHtml") {
    switch (kind) {
      case "docx":
        return outline("convertToHtml", ["docx_mammoth_html", "planned"]);
      case "text":
        return outline("convertToHtml", ["utf8_decode", "planned"], ["wrap_pre", "planned"]);
      case "pdf":
        return outline("convertToHtml", ["pdf_text_layer", "planned"], ["wrap_pre", "planned"]);
      default:
        return outline("convertToHtml", ["rich_html", "skipped"]);
    }
  }

  if (intent === "runOcr") {
    switch (kind) {
      case "pdf":
        return outline("runOcr", ["pdf_parse", "planned"], ["pdf_ocr_forced", "planned"]);
      case "image":
        return outline("runOcr", ["tesseract_ocr", "planned"]);
      case "docx":
        return outline("runOcr", ["docx_structured_extract", "planned"]);
      default:
        return outline("runOcr", ["ocr", "skipped"]);
    }
  }

  // `analyzeFile` itself and any unrecognized intent share the analyzeFile plan.
  return planAnalyzeFile(kind, pdfOcrForAnalyze);
}
872
/**
 * Reports which public capabilities apply to the given input in Node.
 * Only resolves, validates and classifies the input, then builds the report
 * from the detected kind.
 *
 * @param input Node analyze input.
 * @param options Optional options; `options.signal` aborts the call.
 * @returns The capability report for the detected kind.
 * @throws An `AbortError`-named error when `options.signal` is aborted before
 *   the call or while the input is being resolved.
 */
async function getCapabilities(input, options) {
  const signal = options?.signal;
  throwIfAborted(signal);
  const resolved = await resolveNodeAnalyzeInput(input);
  // Resolving is asynchronous; honor an abort that arrived in the meantime.
  throwIfAborted(signal);
  assertValidAnalyzeFileInput(resolved);
  return buildNodeCapabilityReport(detectFileKind(resolved));
}
879
/**
 * Previews what DocMind would do for an intent (default `"analyzeFile"`) on
 * the given input, without running the analyzers.
 *
 * `extractText` only *defaults* `pdf.ocr` to `"off"`; an explicit
 * `options.pdf.ocr` of `"auto"` or `"force"` is honoured when the intent
 * actually runs. The preview therefore reflects that explicit mode for PDFs
 * instead of always reporting OCR as skipped.
 *
 * @param input Node analyze input.
 * @param options Optional intent, PDF/OCR hints and abort signal.
 * @returns The explain-plan report for the detected kind and intent.
 * @throws An `AbortError`-named error when `options.signal` is aborted before
 *   the call or while the input is being resolved.
 */
async function explainAnalysisPlan(input, options) {
  const signal = options?.signal;
  throwIfAborted(signal);
  const resolved = await resolveNodeAnalyzeInput(input);
  // Resolving is asynchronous; honor an abort that arrived in the meantime.
  throwIfAborted(signal);
  assertValidAnalyzeFileInput(resolved);
  const kind = detectFileKind(resolved);
  const intent = options?.intent ?? "analyzeFile";
  const pdfOcrAnalyze = resolvePdfOcrMode(options?.pdf);

  const explicitPdfOcr = options?.pdf?.ocr;
  const extractTextUsesOcr =
    intent === "extractText" &&
    kind === "pdf" &&
    (explicitPdfOcr === "auto" || explicitPdfOcr === "force");

  if (!extractTextUsesOcr) {
    const plan = planForIntent(intent, kind, pdfOcrAnalyze);
    return buildNodeExplainReport(kind, intent, pdfOcrAnalyze, plan);
  }

  // Same steps analyzeFile would take with this OCR mode, labelled as extractText.
  const plan = { ...planAnalyzeFile(kind, explicitPdfOcr), intent: "extractText" };
  const report = buildNodeExplainReport(kind, intent, explicitPdfOcr, plan);
  return {
    ...report,
    ocr: {
      mayUse: true,
      description:
        explicitPdfOcr === "force"
          ? "Raster OCR may run on all pages when pdf.ocr is force."
          : "Raster OCR may run when the text layer is empty (pdf.ocr: auto)."
    }
  };
}
889
+
890
+ export { analyzeFile, bufferToInput, convertToHtml, explainAnalysisPlan, extractMetadata, extractText, getCapabilities, readFileToInput, resolveNodeAnalyzeInput, runOcr };
173
891
  //# sourceMappingURL=index.js.map
174
892
  //# sourceMappingURL=index.js.map
package/package.json CHANGED
@@ -1,7 +1,7 @@
1
1
  {
2
2
  "name": "@dragon708/docmind-node",
3
- "version": "1.0.0",
4
- "description": "Node.js DocMind entry: PDF, DOCX, OCR, text, and fs helpers.",
3
+ "version": "1.2.0",
4
+ "description": "Official DocMind Node facade: analyzeFile, intent APIs, PDF/DOCX/OCR, and fs helpers.",
5
5
  "type": "module",
6
6
  "main": "./dist/index.js",
7
7
  "module": "./dist/index.js",
@@ -14,7 +14,8 @@
14
14
  }
15
15
  },
16
16
  "files": [
17
- "dist"
17
+ "dist/**/*.js",
18
+ "dist/**/*.d.ts"
18
19
  ],
19
20
  "publishConfig": {
20
21
  "access": "public"
@@ -34,7 +35,7 @@
34
35
  "@dragon708/docmind-docx": "^1.0.0",
35
36
  "@dragon708/docmind-ocr": "^1.0.0",
36
37
  "@dragon708/docmind-pdf": "^1.0.0",
37
- "@dragon708/docmind-shared": "^1.0.0"
38
+ "@dragon708/docmind-shared": "^1.1.0"
38
39
  },
39
40
  "devDependencies": {
40
41
  "@types/node": "^20.19.37",
package/dist/index.js.map DELETED
@@ -1 +0,0 @@
1
- {"version":3,"sources":["../src/inputBytes.ts","../src/analyzers/docx.ts","../src/analyzers/image.ts","../src/analyzers/pdf.ts","../src/resolveNodeInput.ts","../src/analyze.ts"],"names":["extractDocx","runPdf"],"mappings":";;;;;;;;;AASO,SAAS,kBAAkB,KAAA,EAAoD;AACpF,EAAA,OAAO,YAAA,CAAa,KAAK,CAAA,IAAK,aAAA,CAAc,KAAK,KAAK,MAAA,CAAO,KAAK,CAAA,IAAK,MAAA,CAAO,KAAK,CAAA;AACrF;AAGA,eAAsB,qBAAqB,KAAA,EAAiD;AAC1F,EAAA,IAAI,CAAC,iBAAA,CAAkB,KAAK,CAAA,EAAG;AAC7B,IAAA,OAAO,IAAI,WAAW,CAAC,CAAA;AAAA,EACzB;AACA,EAAA,OAAO,aAAa,KAAK,CAAA;AAC3B;;;ACZA,eAAsB,kBAAA,CACpB,OACA,MAAA,EACyB;AACzB,EAAA,IAAI,QAAQ,OAAA,EAAS;AACnB,IAAA,MAAM,GAAA,GAAM,IAAI,KAAA,CAAM,2BAA2B,CAAA;AACjD,IAAA,GAAA,CAAI,IAAA,GAAO,YAAA;AACX,IAAA,MAAM,GAAA;AAAA,EACR;AAEA,EAAA,MAAM,IAAA,GAAO,MAAM,oBAAA,CAAqB,KAAK,CAAA;AAC7C,EAAA,IAAI,IAAA,CAAK,eAAe,CAAA,EAAG;AACzB,IAAA,OAAO;AAAA,MACL,QAAA,EAAU,MAAA;AAAA,MACV,QAAA,EAAU,MAAA;AAAA,MACV,MAAA,EAAQ,IAAA;AAAA,MACR,IAAA,EAAM,MAAA;AAAA,MACN,IAAA,EAAM,EAAA;AAAA,MACN,IAAA,EAAM,EAAA;AAAA,MACN,QAAA,EAAU,CAAC,+CAA+C;AAAA,KAC5D;AAAA,EACF;AAEA,EAAA,MAAM,CAAA,GAAI,MAAMA,WAAA,CAAY,IAAI,CAAA;AAChC,EAAA,OAAO;AAAA,IACL,QAAA,EAAU,MAAA;AAAA,IACV,QAAA,EAAU,MAAA;AAAA,IACV,MAAA,EAAQ,IAAA;AAAA,IACR,IAAA,EAAM,MAAA;AAAA,IACN,MAAM,CAAA,CAAE,IAAA;AAAA,IACR,MAAM,CAAA,CAAE,IAAA;AAAA,IACR,QAAA,EAAU,CAAC,GAAG,CAAA,CAAE,QAAQ;AAAA,GAC1B;AACF;AChCA,eAAsB,mBAAA,CACpB,OACA,OAAA,EACyB;AACzB,EAAA,IAAI,OAAA,EAAS,QAAQ,OAAA,EAAS;AAC5B,IAAA,MAAM,GAAA,GAAM,IAAI,KAAA,CAAM,2BAA2B,CAAA;AACjD,IAAA,GAAA,CAAI,IAAA,GAAO,YAAA;AACX,IAAA,MAAM,GAAA;AAAA,EACR;AAEA,EAAA,MAAM,IAAA,GAAO,MAAM,oBAAA,CAAqB,KAAK,CAAA;AAC7C,EAAA,IAAI,IAAA,CAAK,eAAe,CAAA,EAAG;AACzB,IAAA,OAAO;AAAA,MACL,QAAA,EAAU,OAAA;AAAA,MACV,QAAA,EAAU,OAAA;AAAA,MACV,MAAA,EAAQ,IAAA;AAAA,MACR,IAAA,EAAM,OAAA;AAAA,MACN,IAAA,EAAM,EAAA;AAAA,MACN,UAAA,EAAY,CAAA;AAAA,MACZ,OAAA,EAAS,IAAA;AAAA,MACT,QAAA,EAAU,CAAC,4CAA4C;AAAA,KACzD;AAAA,EACF;AAEA,EAAA,MAAM,OAAA,GAAU;AAAA,IACd,GAAI,OAAA,EAAS,GAAA,IAAO,EAAC;AAAA,IACrB,MAAA,EAAQ,OAAA,EAAS,GAAA,EAAK,MAAA,IAAU,OAAA,EAAS;AAAA,GAC3C;A
AEA,EAAA,MAAM,CAAA,GAAI,MAAM,GAAA,CAAI,IAAA,EAAM,OAAO,CAAA;AACjC,EAAA,OAAO;AAAA,IACL,QAAA,EAAU,OAAA;AAAA,IACV,QAAA,EAAU,OAAA;AAAA,IACV,MAAA,EAAQ,IAAA;AAAA,IACR,IAAA,EAAM,OAAA;AAAA,IACN,MAAM,CAAA,CAAE,IAAA;AAAA,IACR,YAAY,CAAA,CAAE,UAAA;AAAA,IACd,SAAS,CAAA,CAAE,OAAA;AAAA,IACX,UAAU;AAAC,GACb;AACF;AClCA,eAAsB,iBAAA,CACpB,OACA,OAAA,EACyB;AACzB,EAAA,IAAI,OAAA,EAAS,QAAQ,OAAA,EAAS;AAC5B,IAAA,MAAM,GAAA,GAAM,IAAI,KAAA,CAAM,2BAA2B,CAAA;AACjD,IAAA,GAAA,CAAI,IAAA,GAAO,YAAA;AACX,IAAA,MAAM,GAAA;AAAA,EACR;AAEA,EAAA,MAAM,IAAA,GAAO,MAAM,oBAAA,CAAqB,KAAK,CAAA;AAC7C,EAAA,IAAI,IAAA,CAAK,eAAe,CAAA,EAAG;AACzB,IAAA,OAAO;AAAA,MACL,QAAA,EAAU,KAAA;AAAA,MACV,QAAA,EAAU,KAAA;AAAA,MACV,MAAA,EAAQ,IAAA;AAAA,MACR,IAAA,EAAM,KAAA;AAAA,MACN,IAAA,EAAM,EAAA;AAAA,MACN,KAAA,EAAO,CAAA;AAAA,MACP,QAAA,EAAU,EAAE,IAAA,EAAM,EAAC,EAAE;AAAA,MACrB,QAAA,EAAU,CAAC,+CAA+C,CAAA;AAAA,MAC1D,QAAA,EAAU,KAAA;AAAA,MACV,OAAA,EAAS;AAAA,KACX;AAAA,EACF;AAEA,EAAA,MAAM,UAAU,OAAA,EAAS,GAAA;AACzB,EAAA,MAAM,OAAA,GAA6B;AAAA,IACjC,GAAG,OAAA;AAAA,IACH,GAAA,EAAK,SAAS,GAAA,IAAO,MAAA;AAAA,IACrB,QAAA,EAAU,OAAA,EAAS,QAAA,IAAY,OAAA,EAAS,GAAA,EAAK,KAAA;AAAA,IAC7C,MAAA,EAAQ,OAAA,EAAS,MAAA,IAAU,OAAA,EAAS;AAAA,GACtC;AAEA,EAAA,MAAM,CAAA,GAAI,MAAMC,UAAA,CAAO,IAAA,EAAM,OAAO,CAAA;AACpC,EAAA,OAAO;AAAA,IACL,QAAA,EAAU,KAAA;AAAA,IACV,QAAA,EAAU,KAAA;AAAA,IACV,MAAA,EAAQ,IAAA;AAAA,IACR,IAAA,EAAM,KAAA;AAAA,IACN,MAAM,CAAA,CAAE,IAAA;AAAA,IACR,OAAO,CAAA,CAAE,KAAA;AAAA,IACT,UAAU,CAAA,CAAE,QAAA;AAAA,IACZ,QAAA,EAAU,CAAC,GAAG,CAAA,CAAE,QAAQ,CAAA;AAAA,IACxB,UAAU,CAAA,CAAE,QAAA;AAAA,IACZ,SAAS,CAAA,CAAE;AAAA,GACb;AACF;AClDA,SAAS,aAAa,SAAA,EAAiC;AACrD,EAAA,OAAO,SAAA,YAAqB,GAAA,GAAM,aAAA,CAAc,SAAS,CAAA,GAAI,SAAA;AAC/D;AAKA,eAAsB,gBAAgB,IAAA,EAAiD;AACrF,EAAA,MAAM,MAAA,GAAS,aAAa,IAAI,CAAA;AAChC,EAAA,MAAM,IAAA,GAAO,MAAM,QAAA,CAAS,MAAM,CAAA;AAClC,EAAA,OAAO;AAAA,IACL,IAAA;AAAA,IACA,IAAA,EAAM,SAAS,MAAM;AAAA,GACvB;AACF;AAGO,SAAS,aAAA,CAAc,QAAgB,IAAA,EAAmC;AAC/E,EAAA,OAAO,IAAA,KAAS,SAAY,EAAE,IAAA,EAAM,QAAQ,IAAA,EAAK,GAAI,EAAE,IAAA,EAAM,MAAA,EAAO;AACtE;AAKA,eAAsB,wBAAwB,KAAA,EAA
uD;AACnG,EAAA,IAAI,OAAO,KAAA,KAAU,QAAA,IAAY,KAAA,YAAiB,GAAA,EAAK;AACrD,IAAA,OAAO,gBAAgB,KAAK,CAAA;AAAA,EAC9B;AACA,EAAA,OAAO,KAAA;AACT;;;ACvBA,eAAsB,WAAA,CACpB,OACA,OAAA,EACyB;AACzB,EAAA,IAAI,OAAA,EAAS,QAAQ,OAAA,EAAS;AAC5B,IAAA,MAAM,GAAA,GAAM,IAAI,KAAA,CAAM,2BAA2B,CAAA;AACjD,IAAA,GAAA,CAAI,IAAA,GAAO,YAAA;AACX,IAAA,MAAM,GAAA;AAAA,EACR;AAEA,EAAA,MAAM,QAAA,GAAW,MAAM,uBAAA,CAAwB,KAAK,CAAA;AACpD,EAAA,2BAAA,CAA4B,QAAQ,CAAA;AAEpC,EAAA,MAAM,QAAA,GAAW,eAAe,QAAQ,CAAA;AAExC,EAAA,QAAQ,QAAA;AAAU,IAChB,KAAK,KAAA;AACH,MAAA,OAAO,iBAAA,CAAkB,UAAiC,OAAO,CAAA;AAAA,IACnE,KAAK,MAAA;AACH,MAAA,OAAO,kBAAA,CAAmB,QAAA,EAAiC,OAAA,EAAS,MAAM,CAAA;AAAA,IAC5E,KAAK,OAAA;AACH,MAAA,OAAO,mBAAA,CAAoB,UAAiC,OAAO,CAAA;AAAA,IACrE,KAAK,MAAA;AACH,MAAA,OAAO,YAAY,QAAA,EAAiC,EAAE,MAAA,EAAQ,OAAA,EAAS,QAAQ,CAAA;AAAA,IACjF;AACE,MAAA,OAAO,oBAAA,CAAqB,QAAA,EAAU,MAAA,EAAQ,CAAC,sBAAsB,CAAC,CAAA;AAAA;AAE5E","file":"index.js","sourcesContent":["import type { DetectFileKindInput, FileLikeInput } from \"@dragon708/docmind-shared\";\nimport {\n isBinaryInput,\n isBlob,\n isFile,\n isNamedInput,\n toUint8Array,\n} from \"@dragon708/docmind-shared\";\n\nexport function isByteBackedInput(input: DetectFileKindInput): input is FileLikeInput {\n return isNamedInput(input) || isBinaryInput(input) || isBlob(input) || isFile(input);\n}\n\n/** Resolves bytes when the input carries a body; otherwise an empty `Uint8Array`. 
*/\nexport async function bytesFromDetectInput(input: DetectFileKindInput): Promise<Uint8Array> {\n if (!isByteBackedInput(input)) {\n return new Uint8Array(0);\n }\n return toUint8Array(input);\n}\n","import { analyzeDocx as extractDocx } from \"@dragon708/docmind-docx\";\nimport type { AnalysisResult, DetectFileKindInput } from \"@dragon708/docmind-shared\";\nimport { bytesFromDetectInput } from \"../inputBytes.js\";\n\n/**\n * DOCX → `@dragon708/docmind-docx`.\n */\nexport async function analyzeDocxForNode(\n input: DetectFileKindInput,\n signal?: AbortSignal,\n): Promise<AnalysisResult> {\n if (signal?.aborted) {\n const err = new Error(\"The operation was aborted\");\n err.name = \"AbortError\";\n throw err;\n }\n\n const data = await bytesFromDetectInput(input);\n if (data.byteLength === 0) {\n return {\n fileKind: \"docx\",\n analyzer: \"docx\",\n status: \"ok\",\n kind: \"docx\",\n text: \"\",\n html: \"\",\n warnings: [\"No document bytes were provided for analysis.\"],\n };\n }\n\n const r = await extractDocx(data);\n return {\n fileKind: \"docx\",\n analyzer: \"docx\",\n status: \"ok\",\n kind: \"docx\",\n text: r.text,\n html: r.html,\n warnings: [...r.warnings],\n };\n}\n","import { ocr } from \"@dragon708/docmind-ocr\";\nimport type { AnalysisResult, DetectFileKindInput } from \"@dragon708/docmind-shared\";\nimport type { NodeAnalyzeOptions } from \"../nodeAnalyzeOptions.js\";\nimport { bytesFromDetectInput } from \"../inputBytes.js\";\n\n/**\n * Image → `@dragon708/docmind-ocr`.\n */\nexport async function analyzeImageForNode(\n input: DetectFileKindInput,\n options?: NodeAnalyzeOptions,\n): Promise<AnalysisResult> {\n if (options?.signal?.aborted) {\n const err = new Error(\"The operation was aborted\");\n err.name = \"AbortError\";\n throw err;\n }\n\n const data = await bytesFromDetectInput(input);\n if (data.byteLength === 0) {\n return {\n fileKind: \"image\",\n analyzer: \"image\",\n status: \"ok\",\n kind: \"image\",\n text: \"\",\n 
confidence: 0,\n ocrUsed: true,\n warnings: [\"No image bytes were provided for analysis.\"],\n };\n }\n\n const ocrOpts = {\n ...(options?.ocr ?? {}),\n signal: options?.ocr?.signal ?? options?.signal,\n };\n\n const r = await ocr(data, ocrOpts);\n return {\n fileKind: \"image\",\n analyzer: \"image\",\n status: \"ok\",\n kind: \"image\",\n text: r.text,\n confidence: r.confidence,\n ocrUsed: r.ocrUsed,\n warnings: [],\n };\n}\n","import { analyzePdf as runPdf } from \"@dragon708/docmind-pdf\";\nimport type { PdfAnalyzeOptions } from \"@dragon708/docmind-pdf\";\nimport type { AnalysisResult, DetectFileKindInput } from \"@dragon708/docmind-shared\";\nimport type { NodeAnalyzeOptions } from \"../nodeAnalyzeOptions.js\";\nimport { bytesFromDetectInput } from \"../inputBytes.js\";\n\n/**\n * PDF → `@dragon708/docmind-pdf` (Node / pdf-parse + OCR).\n *\n * Unlike `analyzePdf` from `@dragon708/docmind-pdf` (OCR off unless set), `analyzeFile` defaults\n * to `pdf.ocr: \"auto\"`: when the PDF has\n * pages but almost no extractable text (typical scan), the raster OCR pipeline runs. Pass\n * `pdf: { ocr: \"off\" }` to skip OCR for speed.\n */\nexport async function analyzePdfForNode(\n input: DetectFileKindInput,\n options?: NodeAnalyzeOptions,\n): Promise<AnalysisResult> {\n if (options?.signal?.aborted) {\n const err = new Error(\"The operation was aborted\");\n err.name = \"AbortError\";\n throw err;\n }\n\n const data = await bytesFromDetectInput(input);\n if (data.byteLength === 0) {\n return {\n fileKind: \"pdf\",\n analyzer: \"pdf\",\n status: \"ok\",\n kind: \"pdf\",\n text: \"\",\n pages: 0,\n metadata: { info: {} },\n warnings: [\"No document bytes were provided for analysis.\"],\n needsOCR: false,\n ocrUsed: false,\n };\n }\n\n const userPdf = options?.pdf;\n const pdfOpts: PdfAnalyzeOptions = {\n ...userPdf,\n ocr: userPdf?.ocr ?? \"auto\",\n ocrLangs: userPdf?.ocrLangs ?? options?.ocr?.langs,\n signal: userPdf?.signal ?? 
options?.signal,\n };\n\n const r = await runPdf(data, pdfOpts);\n return {\n fileKind: \"pdf\",\n analyzer: \"pdf\",\n status: \"ok\",\n kind: \"pdf\",\n text: r.text,\n pages: r.pages,\n metadata: r.metadata,\n warnings: [...r.warnings],\n needsOCR: r.needsOCR,\n ocrUsed: r.ocrUsed,\n };\n}\n","import type { DetectFileKindInput, NamedInput } from \"@dragon708/docmind-shared\";\nimport { readFile } from \"node:fs/promises\";\nimport { basename } from \"node:path\";\nimport { fileURLToPath } from \"node:url\";\n\n/**\n * Inputs accepted by {@link analyzeFile} in this package.\n * Paths and `file:` URLs are read with `fs`; other values pass through as {@link DetectFileKindInput}.\n */\nexport type NodeAnalyzeInput = string | URL | DetectFileKindInput;\n\nfunction toPathString(pathOrUrl: string | URL): string {\n return pathOrUrl instanceof URL ? fileURLToPath(pathOrUrl) : pathOrUrl;\n}\n\n/**\n * Reads a file from disk into a {@link NamedInput} (binary `Buffer`, basename as `name` for hints).\n */\nexport async function readFileToInput(path: string | URL): Promise<NamedInput<Buffer>> {\n const fsPath = toPathString(path);\n const data = await readFile(fsPath);\n return {\n data,\n name: basename(fsPath),\n };\n}\n\n/** Wraps a `Buffer` as a named payload when you already know the filename. */\nexport function bufferToInput(buffer: Buffer, name?: string): NamedInput<Buffer> {\n return name !== undefined ? 
{ data: buffer, name } : { data: buffer };\n}\n\n/**\n * Resolves paths / `file:` URLs to a {@link DetectFileKindInput}; leaves other values untouched.\n */\nexport async function resolveNodeAnalyzeInput(input: NodeAnalyzeInput): Promise<DetectFileKindInput> {\n if (typeof input === \"string\" || input instanceof URL) {\n return readFileToInput(input);\n }\n return input;\n}\n","import type { AnalysisResult, DetectFileKindInput } from \"@dragon708/docmind-shared\";\nimport {\n analyzeText,\n assertValidAnalyzeFileInput,\n detectFileKind,\n notImplementedResult,\n UNKNOWN_FORMAT_WARNING,\n} from \"@dragon708/docmind-shared\";\nimport { analyzeDocxForNode } from \"./analyzers/docx.js\";\nimport { analyzeImageForNode } from \"./analyzers/image.js\";\nimport { analyzePdfForNode } from \"./analyzers/pdf.js\";\nimport type { NodeAnalyzeOptions } from \"./nodeAnalyzeOptions.js\";\nimport { resolveNodeAnalyzeInput, type NodeAnalyzeInput } from \"./resolveNodeInput.js\";\n\n/**\n * Node router: PDF, DOCX, images (OCR), and text. Paths and `file:` URLs are read via `fs`.\n */\nexport async function analyzeFile(\n input: NodeAnalyzeInput,\n options?: NodeAnalyzeOptions,\n): Promise<AnalysisResult> {\n if (options?.signal?.aborted) {\n const err = new Error(\"The operation was aborted\");\n err.name = \"AbortError\";\n throw err;\n }\n\n const resolved = await resolveNodeAnalyzeInput(input);\n assertValidAnalyzeFileInput(resolved);\n\n const fileKind = detectFileKind(resolved);\n\n switch (fileKind) {\n case \"pdf\":\n return analyzePdfForNode(resolved as DetectFileKindInput, options);\n case \"docx\":\n return analyzeDocxForNode(resolved as DetectFileKindInput, options?.signal);\n case \"image\":\n return analyzeImageForNode(resolved as DetectFileKindInput, options);\n case \"text\":\n return analyzeText(resolved as DetectFileKindInput, { signal: options?.signal });\n default:\n return notImplementedResult(fileKind, \"none\", [UNKNOWN_FORMAT_WARNING]);\n }\n}\n"]}