@dragon708/docmind-markdown 1.1.3 → 1.2.1
This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects those changes as the package versions appear in their respective public registries.
- package/dist/index.d.ts +66 -17
- package/dist/index.js +168 -55
- package/package.json +2 -2
package/dist/index.d.ts
CHANGED
|
@@ -39,8 +39,8 @@ type StructuredToMarkdownOptions = ConvertStructuredToMarkdownOptions;
|
|
|
39
39
|
/**
|
|
40
40
|
* Converts a {@link StructuredDocumentResult} to readable, semantic Markdown (GFM-style tables).
|
|
41
41
|
*
|
|
42
|
-
* **Universal fallback** for DocMind: use when a format-specific pipeline (DOCX Mammoth, PDF
|
|
43
|
-
* does not apply or fails, and for OCR / image / plain-text flows that already populate this shape.
|
|
42
|
+
* **Universal fallback** for DocMind: use when a format-specific pipeline (DOCX Mammoth→Turndown, specialized PDF→Markdown
|
|
43
|
+
* via `@cognipeer/to-markdown` on Node, …) does not apply or fails, and for OCR / image / plain-text flows that already populate this shape.
|
|
44
44
|
*
|
|
45
45
|
* Uses `blocks` in order; resolves `table` / `image-ref` via `tables` and `images`. When blocks are
|
|
46
46
|
* empty or yield no output, falls back to the rollup `text`. Optional sections use `pages`, `metadata`,
|
|
@@ -213,6 +213,13 @@ type RenderMarkdownSectionsOptions = SplitStructuredIntoChunksOptions;
|
|
|
213
213
|
*/
|
|
214
214
|
declare function renderMarkdownSections(result: StructuredDocumentResult, options?: RenderMarkdownSectionsOptions): MarkdownSection[];
|
|
215
215
|
|
|
216
|
+
/**
|
|
217
|
+
* DOCX → Markdown: **Mammoth** (`convertToHtml` semantic HTML) → **Turndown** (ATX headings, lists, fenced code;
|
|
218
|
+
* **GFM tables** when {@link ConvertDocxToMarkdownOptions.includeTables} is true via `turndown-plugin-gfm`).
|
|
219
|
+
* Page breaks map through Mammoth `styleMap` → `<hr class="page-break">` → Turndown horizontal rules.
|
|
220
|
+
*
|
|
221
|
+
* Independent of the PDF → Markdown pipeline in this package (`pdf-markdown.ts`, `@cognipeer/to-markdown`).
|
|
222
|
+
*/
|
|
216
223
|
/**
|
|
217
224
|
* Binary `.docx` payload accepted by {@link convertDocxToMarkdown}.
|
|
218
225
|
*/
|
|
@@ -316,8 +323,10 @@ declare function convertDocxToMarkdown(input: DocxToMarkdownInput, options?: Con
|
|
|
316
323
|
declare function convertDocxBufferToMarkdown(input: DocxToMarkdownInput, options?: ConvertDocxToMarkdownOptions): Promise<DocxMarkdownResult>;
|
|
317
324
|
|
|
318
325
|
/**
|
|
319
|
-
*
|
|
320
|
-
*
|
|
326
|
+
* Legacy option bag: **ignored** by {@link convertPdfToMarkdown}. Retained so existing TypeScript callers and object
|
|
327
|
+
* literals that still spread old shapes remain assignable to {@link ConvertPdfToMarkdownOptions}.
|
|
328
|
+
*
|
|
329
|
+
* @deprecated PDF → Markdown uses `@cognipeer/to-markdown` only; these keys are not read.
|
|
321
330
|
*/
|
|
322
331
|
interface OpenDataLoaderPdfConvertOptions {
|
|
323
332
|
outputDir?: string;
|
|
@@ -346,11 +355,13 @@ interface OpenDataLoaderPdfConvertOptions {
|
|
|
346
355
|
hybridFallback?: boolean;
|
|
347
356
|
}
|
|
348
357
|
/**
|
|
349
|
-
* Options for {@link convertPdfToMarkdown}.
|
|
358
|
+
* Options for {@link convertPdfToMarkdown}. Properties from {@link OpenDataLoaderPdfConvertOptions} are accepted for
|
|
359
|
+
* compatibility but **ignored**. `fileName`, `forceExtension`, and `url` are forwarded to `@cognipeer/to-markdown`
|
|
360
|
+
* where applicable (see that package’s `ConverterOptions`).
|
|
350
361
|
*/
|
|
351
362
|
type ConvertPdfToMarkdownOptions = OpenDataLoaderPdfConvertOptions & {
|
|
352
363
|
/**
|
|
353
|
-
* When the
|
|
364
|
+
* When the Cognipeer path fails, returns empty output, or `@cognipeer/to-markdown` cannot load,
|
|
354
365
|
* call this to obtain {@link StructuredDocumentResult} (e.g. from another extractor) and serialize with
|
|
355
366
|
* {@link convertStructuredToMarkdown}. Does not import other DocMind packages.
|
|
356
367
|
*/
|
|
@@ -362,15 +373,25 @@ type ConvertPdfToMarkdownOptions = OpenDataLoaderPdfConvertOptions & {
|
|
|
362
373
|
* @default true
|
|
363
374
|
*/
|
|
364
375
|
readonly cleanMarkdown?: boolean;
|
|
376
|
+
/** Forwarded to `@cognipeer/to-markdown` (useful for buffer inputs). */
|
|
377
|
+
readonly fileName?: string;
|
|
378
|
+
/** Forwarded to `@cognipeer/to-markdown`. */
|
|
379
|
+
readonly forceExtension?: string;
|
|
380
|
+
/** Forwarded to `@cognipeer/to-markdown`. */
|
|
381
|
+
readonly url?: string;
|
|
365
382
|
};
|
|
366
383
|
/** Input for {@link convertPdfToMarkdown}: filesystem path (Node) or PDF bytes. */
|
|
367
384
|
type PdfToMarkdownInput = string | Buffer | Uint8Array | ArrayBuffer;
|
|
368
385
|
/** Which pipeline produced {@link ConvertPdfToMarkdownResult.markdown}. */
|
|
369
|
-
type PdfToMarkdownSource = "
|
|
386
|
+
type PdfToMarkdownSource = "cognipeer" | "structured-fallback" | "unsupported-runtime"
|
|
387
|
+
/** `@cognipeer/to-markdown` missing or failed to load (install dep; check bundler externals). */
|
|
388
|
+
| "cognipeer-unavailable"
|
|
389
|
+
/** Engine ran or was attempted but produced no usable Markdown (error, empty output, missing file, etc.). */
|
|
390
|
+
| "cognipeer-failed";
|
|
370
391
|
type PdfToMarkdownFallbackReason = "unsupported-runtime" | "error" | "empty" | "module-not-found";
|
|
371
392
|
interface ConvertPdfToMarkdownResult {
|
|
372
393
|
readonly markdown: string;
|
|
373
|
-
/** Human-readable issues (runtime, missing module,
|
|
394
|
+
/** Human-readable issues (runtime, missing module, conversion errors, empty output, fallback errors). */
|
|
374
395
|
readonly warnings: readonly string[];
|
|
375
396
|
readonly source: PdfToMarkdownSource;
|
|
376
397
|
readonly fallbackReason?: PdfToMarkdownFallbackReason;
|
|
@@ -380,21 +401,25 @@ interface PdfMarkdownResult {
|
|
|
380
401
|
readonly markdown: string;
|
|
381
402
|
}
|
|
382
403
|
/**
|
|
383
|
-
* Primary API: PDF path or bytes → Markdown via `@
|
|
404
|
+
* Primary API: PDF path or bytes → Markdown via `@cognipeer/to-markdown` on Node, with clear warnings and optional
|
|
384
405
|
* structured fallback. In non-Node runtimes returns an empty `markdown` and {@link PdfToMarkdownSource} `unsupported-runtime`
|
|
385
|
-
* without loading `@
|
|
406
|
+
* without loading `@cognipeer/to-markdown`.
|
|
407
|
+
*
|
|
408
|
+
* If the specialized conversion fails or returns empty Markdown and {@link ConvertPdfToMarkdownOptions.resolveStructured} /
|
|
409
|
+
* `extractMarkdown`’s `structuredFallback` is set, output comes from {@link convertStructuredToMarkdown} instead
|
|
410
|
+
* (`extractMarkdown` reports strategy `pdf-structured-fallback`; this result uses `source` `structured-fallback`).
|
|
386
411
|
*/
|
|
387
412
|
declare function convertPdfToMarkdown(input: PdfToMarkdownInput, options?: ConvertPdfToMarkdownOptions): Promise<ConvertPdfToMarkdownResult>;
|
|
388
413
|
/**
|
|
389
|
-
* **Node only.** PDF file path → Markdown via `@
|
|
414
|
+
* **Node only.** PDF file path → Markdown via `@cognipeer/to-markdown`.
|
|
390
415
|
*
|
|
391
|
-
* Throws if not Node, if conversion yields no Markdown (and no structured fallback), or on
|
|
416
|
+
* Throws if not Node, if conversion yields no Markdown (and no structured fallback), or on conversion errors
|
|
392
417
|
* when no fallback is configured — same contract as before {@link convertPdfToMarkdown} existed.
|
|
393
418
|
*/
|
|
394
419
|
declare function convertPdfPathToMarkdown(inputPath: string, options?: ConvertPdfToMarkdownOptions): Promise<PdfMarkdownResult>;
|
|
395
420
|
/**
|
|
396
421
|
* **Node only.** Same pipeline as {@link convertPdfPathToMarkdown}, but writes bytes to a temporary `.pdf`
|
|
397
|
-
*
|
|
422
|
+
* then converts with `@cognipeer/to-markdown`.
|
|
398
423
|
*/
|
|
399
424
|
declare function convertPdfBufferToMarkdown(input: Buffer | Uint8Array | ArrayBuffer, options?: ConvertPdfToMarkdownOptions): Promise<PdfMarkdownResult>;
|
|
400
425
|
|
|
@@ -433,17 +458,38 @@ type ExtractMarkdownOptions = ConvertStructuredToMarkdownOptions & {
|
|
|
433
458
|
/** Overrides merged into {@link convertPdfToMarkdown} when the input is identified as `.pdf`. */
|
|
434
459
|
readonly pdf?: ConvertPdfToMarkdownOptions;
|
|
435
460
|
};
|
|
461
|
+
/** @see {@link detectBinaryFormat} */
|
|
462
|
+
type DetectedBinaryFormat = "docx" | "pdf" | "unknown";
|
|
436
463
|
/** Which branch produced {@link ExtractMarkdownResult.markdown}. */
|
|
437
|
-
type ExtractMarkdownStrategy = "structured" | "docx-mammoth" | "docx-structured-fallback" | "pdf-
|
|
464
|
+
type ExtractMarkdownStrategy = "structured" | "docx-mammoth" | "docx-structured-fallback" | "pdf-cognipeer-specialized"
|
|
465
|
+
/** `@cognipeer/to-markdown` not loadable (missing package, bundler, etc.). */
|
|
466
|
+
| "pdf-cognipeer-unavailable"
|
|
467
|
+
/** Engine failed or returned empty Markdown; no structured fallback or fallback also failed. */
|
|
468
|
+
| "pdf-cognipeer-failed" | "pdf-structured-fallback" | "pdf-unsupported-runtime" | "docx-requires-node" | "path-requires-node" | "binary-unidentified" | "binary-unidentified-structured-fallback";
|
|
469
|
+
/**
|
|
470
|
+
* Format detection and fallback bookkeeping for file/path inputs to {@link extractMarkdown}.
|
|
471
|
+
* Omitted when `input` is a {@link StructuredDocumentResult} (already structured).
|
|
472
|
+
*/
|
|
473
|
+
interface ExtractMarkdownRoutingInfo {
|
|
474
|
+
readonly detectedFormat: DetectedBinaryFormat;
|
|
475
|
+
/** Specialized binary pipeline invoked first on Node (`none` if not applicable). */
|
|
476
|
+
readonly specializedPipeline: "docx" | "pdf" | "none";
|
|
477
|
+
/** Final Markdown came from {@link ExtractMarkdownOptions.structuredFallback} (or the same snapshot via `resolveStructured`). */
|
|
478
|
+
readonly usedStructuredFallback: boolean;
|
|
479
|
+
}
|
|
438
480
|
interface ExtractMarkdownResult {
|
|
439
481
|
readonly markdown: string;
|
|
440
482
|
/** Merged pipeline warnings (conversion, runtime, and optional {@link StructuredDocumentResult.warnings}). */
|
|
441
483
|
readonly warnings: readonly string[];
|
|
442
484
|
readonly strategy: ExtractMarkdownStrategy;
|
|
485
|
+
/**
|
|
486
|
+
* Present for `{ data, … }` / `{ path, … }` flows after sniffing bytes (or when returning early without reading).
|
|
487
|
+
* Absent for direct structured input.
|
|
488
|
+
*/
|
|
489
|
+
readonly routing?: ExtractMarkdownRoutingInfo;
|
|
443
490
|
}
|
|
444
491
|
/** Type guard: file-like `{ data: … }` input for {@link extractMarkdown}. */
|
|
445
492
|
declare function isExtractMarkdownFileInput(value: unknown): value is ExtractMarkdownFileInput;
|
|
446
|
-
type DetectedBinaryFormat = "docx" | "pdf" | "unknown";
|
|
447
493
|
/** Detect PDF / OOXML zip (DOCX) from magic bytes and optional filename / MIME. */
|
|
448
494
|
declare function detectBinaryFormat(data: Buffer | Uint8Array | ArrayBuffer, filename?: string, mimeType?: string): DetectedBinaryFormat;
|
|
449
495
|
/**
|
|
@@ -452,8 +498,11 @@ declare function detectBinaryFormat(data: Buffer | Uint8Array | ArrayBuffer, fil
|
|
|
452
498
|
*
|
|
453
499
|
* - **Structured input** — always uses the structured serializer (image/OCR/text/PDF/DOCX blocks already normalized).
|
|
454
500
|
* - **DOCX bytes / path** — {@link convertDocxToMarkdown} on Node; otherwise warns and uses {@link ExtractMarkdownOptions.structuredFallback} if provided.
|
|
455
|
-
* - **PDF bytes / path** — {@link convertPdfToMarkdown} (`@
|
|
501
|
+
* - **PDF bytes / path** — {@link convertPdfToMarkdown} (`@cognipeer/to-markdown` on Node); in non-Node runtimes returns empty Markdown with warnings unless {@link ExtractMarkdownOptions.structuredFallback} supplies content.
|
|
502
|
+
*
|
|
503
|
+
* Any use of `structuredFallback` (or the same snapshot through `resolveStructured`) adds a tagged line in {@link ExtractMarkdownResult.warnings}
|
|
504
|
+
* and sets {@link ExtractMarkdownRoutingInfo.usedStructuredFallback} when {@link ExtractMarkdownResult.routing} is present.
|
|
456
505
|
*/
|
|
457
506
|
declare function extractMarkdown(input: ExtractMarkdownInput, options?: ExtractMarkdownOptions): Promise<ExtractMarkdownResult>;
|
|
458
507
|
|
|
459
|
-
export { type ConvertDocxToMarkdownOptions, type ConvertDocxToMarkdownResult, type ConvertPdfToMarkdownOptions, type ConvertPdfToMarkdownResult, type ConvertStructuredToLlmTextOptions, type ConvertStructuredToMarkdownOptions, type DetectedBinaryFormat, type DocxMarkdownMessage, type DocxMarkdownResult, type DocxToMarkdownInput, type DocxToMarkdownSource, type ExtractMarkdownFileInput, type ExtractMarkdownInput, type ExtractMarkdownOptions, type ExtractMarkdownPathInput, type ExtractMarkdownResult, type ExtractMarkdownStrategy, type MammothConvertToHtmlOptions, type MarkdownSection, type OpenDataLoaderPdfConvertOptions, type PdfMarkdownResult, type PdfToMarkdownFallbackReason, type PdfToMarkdownInput, type PdfToMarkdownSource, type RenderLlmTextOptions, type RenderMarkdownOptions, type RenderMarkdownSectionsOptions, type SplitStructuredIntoChunksOptions, type StructuredChunk, type StructuredToLlmTextOptions, type StructuredToMarkdownOptions, type TurndownServiceOptions, convertDocxBufferToMarkdown, convertDocxToMarkdown, convertPdfBufferToMarkdown, convertPdfPathToMarkdown, convertPdfToMarkdown, convertStructuredToLlmText, convertStructuredToMarkdown, detectBinaryFormat, extractLlmContent, extractMarkdown, extractStructuredChunks, isExtractMarkdownFileInput, renderLlmText, renderMarkdown, renderMarkdownSections, splitStructuredIntoChunks, structuredDocumentToLlmText, structuredDocumentToMarkdown };
|
|
508
|
+
export { type ConvertDocxToMarkdownOptions, type ConvertDocxToMarkdownResult, type ConvertPdfToMarkdownOptions, type ConvertPdfToMarkdownResult, type ConvertStructuredToLlmTextOptions, type ConvertStructuredToMarkdownOptions, type DetectedBinaryFormat, type DocxMarkdownMessage, type DocxMarkdownResult, type DocxToMarkdownInput, type DocxToMarkdownSource, type ExtractMarkdownFileInput, type ExtractMarkdownInput, type ExtractMarkdownOptions, type ExtractMarkdownPathInput, type ExtractMarkdownResult, type ExtractMarkdownRoutingInfo, type ExtractMarkdownStrategy, type MammothConvertToHtmlOptions, type MarkdownSection, type OpenDataLoaderPdfConvertOptions, type PdfMarkdownResult, type PdfToMarkdownFallbackReason, type PdfToMarkdownInput, type PdfToMarkdownSource, type RenderLlmTextOptions, type RenderMarkdownOptions, type RenderMarkdownSectionsOptions, type SplitStructuredIntoChunksOptions, type StructuredChunk, type StructuredToLlmTextOptions, type StructuredToMarkdownOptions, type TurndownServiceOptions, convertDocxBufferToMarkdown, convertDocxToMarkdown, convertPdfBufferToMarkdown, convertPdfPathToMarkdown, convertPdfToMarkdown, convertStructuredToLlmText, convertStructuredToMarkdown, detectBinaryFormat, extractLlmContent, extractMarkdown, extractStructuredChunks, isExtractMarkdownFileInput, renderLlmText, renderMarkdown, renderMarkdownSections, splitStructuredIntoChunks, structuredDocumentToLlmText, structuredDocumentToMarkdown };
|
package/dist/index.js
CHANGED
|
@@ -993,21 +993,27 @@ async function convertDocxBufferToMarkdown(input, options) {
|
|
|
993
993
|
}
|
|
994
994
|
|
|
995
995
|
// src/pdf-markdown.ts
|
|
996
|
-
var BROWSER_WARNING = "@dragon708/docmind-markdown: PDF \u2192 Markdown via @
|
|
996
|
+
var BROWSER_WARNING = "@dragon708/docmind-markdown: PDF \u2192 Markdown via @cognipeer/to-markdown requires Node.js. In the browser, use a server-side conversion or supply structured text/Markdown from your backend.";
|
|
997
|
+
var COGNIPEER_WARN_TAG = "[docmind-markdown:pdf] pdf-cognipeer-specialized:";
|
|
997
998
|
function normalizePdfMarkdown(markdown, clean) {
|
|
998
999
|
const t = markdown.trim();
|
|
999
1000
|
if (!clean) return t;
|
|
1000
1001
|
return t.replace(/\n{3,}/g, "\n\n");
|
|
1001
1002
|
}
|
|
1002
|
-
|
|
1003
|
+
var FALLBACK_WARN_PREFIX = "[docmind-markdown:pdf] pdf-structured-fallback: serializing StructuredDocumentResult to Markdown because";
|
|
1004
|
+
function structuredFallbackWarnings(reason, detail) {
|
|
1005
|
+
const tail = reason === "module-not-found" ? "@cognipeer/to-markdown could not be loaded." : reason === "error" ? "the specialized PDF engine raised an error or rejected the input." : reason === "empty" ? "the specialized engine returned empty Markdown." : "the specialized path is unavailable in this runtime.";
|
|
1006
|
+
const extra = detail ? ` (${detail})` : "";
|
|
1007
|
+
return [`${FALLBACK_WARN_PREFIX} ${tail}${extra}`];
|
|
1008
|
+
}
|
|
1009
|
+
function cognipeerConverterOptions(options) {
|
|
1003
1010
|
if (!options) return {};
|
|
1004
|
-
const {
|
|
1005
|
-
|
|
1006
|
-
|
|
1007
|
-
|
|
1008
|
-
|
|
1009
|
-
|
|
1010
|
-
return rest;
|
|
1011
|
+
const { fileName, forceExtension, url } = options;
|
|
1012
|
+
const o = {};
|
|
1013
|
+
if (fileName !== void 0) o.fileName = fileName;
|
|
1014
|
+
if (forceExtension !== void 0) o.forceExtension = forceExtension;
|
|
1015
|
+
if (url !== void 0) o.url = url;
|
|
1016
|
+
return o;
|
|
1011
1017
|
}
|
|
1012
1018
|
async function toNodeBuffer2(input) {
|
|
1013
1019
|
const { Buffer: Buffer2 } = await importEsm("node:buffer");
|
|
@@ -1015,11 +1021,17 @@ async function toNodeBuffer2(input) {
|
|
|
1015
1021
|
if (input instanceof ArrayBuffer) return Buffer2.from(input);
|
|
1016
1022
|
return Buffer2.from(input);
|
|
1017
1023
|
}
|
|
1024
|
+
async function loadCognipeerConvertToMarkdown() {
|
|
1025
|
+
const { createRequire } = await importEsm("node:module");
|
|
1026
|
+
const require2 = createRequire(import.meta.url);
|
|
1027
|
+
const mod = require2("@cognipeer/to-markdown");
|
|
1028
|
+
return mod.convertToMarkdown;
|
|
1029
|
+
}
|
|
1018
1030
|
async function convertPdfToMarkdown(input, options) {
|
|
1019
1031
|
const clean = options?.cleanMarkdown !== false;
|
|
1020
1032
|
const resolveStructured = options?.resolveStructured;
|
|
1021
1033
|
const structuredMdOpts = options?.structuredMarkdown;
|
|
1022
|
-
const
|
|
1034
|
+
const cognipeerOpts = cognipeerConverterOptions(options);
|
|
1023
1035
|
if (!isNodeRuntime()) {
|
|
1024
1036
|
return {
|
|
1025
1037
|
markdown: "",
|
|
@@ -1046,15 +1058,13 @@ async function convertPdfToMarkdown(input, options) {
|
|
|
1046
1058
|
await writeFile(inputPath, buffer);
|
|
1047
1059
|
cleanup = async () => rm(dir, { recursive: true, force: true });
|
|
1048
1060
|
}
|
|
1049
|
-
let
|
|
1061
|
+
let convertToMarkdown;
|
|
1050
1062
|
try {
|
|
1051
|
-
|
|
1052
|
-
"@opendataloader/pdf"
|
|
1053
|
-
));
|
|
1063
|
+
convertToMarkdown = await loadCognipeerConvertToMarkdown();
|
|
1054
1064
|
} catch (e) {
|
|
1055
|
-
const hint = e instanceof Error && /Cannot find module|MODULE_NOT_FOUND/i.test(e.message) ? " Install `@
|
|
1065
|
+
const hint = e instanceof Error && /Cannot find module|MODULE_NOT_FOUND/i.test(e.message) ? " Install `@cognipeer/to-markdown` in your project." : "";
|
|
1056
1066
|
warnings.push(
|
|
1057
|
-
|
|
1067
|
+
`${COGNIPEER_WARN_TAG} package could not be loaded (${e instanceof Error ? e.message : String(e)}).${hint}`
|
|
1058
1068
|
);
|
|
1059
1069
|
if (resolveStructured) {
|
|
1060
1070
|
try {
|
|
@@ -1065,7 +1075,10 @@ async function convertPdfToMarkdown(input, options) {
|
|
|
1065
1075
|
);
|
|
1066
1076
|
return {
|
|
1067
1077
|
markdown: md,
|
|
1068
|
-
warnings
|
|
1078
|
+
warnings: [
|
|
1079
|
+
...structuredFallbackWarnings("module-not-found"),
|
|
1080
|
+
...warnings
|
|
1081
|
+
],
|
|
1069
1082
|
source: "structured-fallback",
|
|
1070
1083
|
fallbackReason: "module-not-found"
|
|
1071
1084
|
};
|
|
@@ -1078,20 +1091,16 @@ async function convertPdfToMarkdown(input, options) {
|
|
|
1078
1091
|
return {
|
|
1079
1092
|
markdown: "",
|
|
1080
1093
|
warnings,
|
|
1081
|
-
source: "
|
|
1094
|
+
source: "cognipeer-unavailable",
|
|
1082
1095
|
fallbackReason: "module-not-found"
|
|
1083
1096
|
};
|
|
1084
1097
|
}
|
|
1085
1098
|
let rawMarkdown;
|
|
1086
1099
|
try {
|
|
1087
|
-
rawMarkdown = await
|
|
1088
|
-
...eng,
|
|
1089
|
-
format: "markdown",
|
|
1090
|
-
toStdout: true,
|
|
1091
|
-
quiet: eng.quiet !== false
|
|
1092
|
-
}).then((s) => String(s));
|
|
1100
|
+
rawMarkdown = await convertToMarkdown(inputPath, cognipeerOpts);
|
|
1093
1101
|
} catch (e) {
|
|
1094
|
-
|
|
1102
|
+
const msg = e instanceof Error ? e.message : String(e);
|
|
1103
|
+
warnings.push(`${COGNIPEER_WARN_TAG} ${msg}`);
|
|
1095
1104
|
if (resolveStructured) {
|
|
1096
1105
|
try {
|
|
1097
1106
|
const structured = await resolveStructured();
|
|
@@ -1101,7 +1110,10 @@ async function convertPdfToMarkdown(input, options) {
|
|
|
1101
1110
|
);
|
|
1102
1111
|
return {
|
|
1103
1112
|
markdown: md,
|
|
1104
|
-
warnings
|
|
1113
|
+
warnings: [
|
|
1114
|
+
...structuredFallbackWarnings("error", msg.slice(0, 500)),
|
|
1115
|
+
...warnings
|
|
1116
|
+
],
|
|
1105
1117
|
source: "structured-fallback",
|
|
1106
1118
|
fallbackReason: "error"
|
|
1107
1119
|
};
|
|
@@ -1114,13 +1126,18 @@ async function convertPdfToMarkdown(input, options) {
|
|
|
1114
1126
|
return {
|
|
1115
1127
|
markdown: "",
|
|
1116
1128
|
warnings,
|
|
1117
|
-
source: "
|
|
1129
|
+
source: "cognipeer-failed",
|
|
1118
1130
|
fallbackReason: "error"
|
|
1119
1131
|
};
|
|
1120
1132
|
}
|
|
1121
|
-
let markdown = normalizePdfMarkdown(
|
|
1133
|
+
let markdown = normalizePdfMarkdown(
|
|
1134
|
+
typeof rawMarkdown === "string" ? rawMarkdown : String(rawMarkdown ?? ""),
|
|
1135
|
+
clean
|
|
1136
|
+
);
|
|
1122
1137
|
if (markdown.length === 0) {
|
|
1123
|
-
warnings.push(
|
|
1138
|
+
warnings.push(
|
|
1139
|
+
`${COGNIPEER_WARN_TAG} returned empty Markdown for this PDF (whitespace-only after normalize).`
|
|
1140
|
+
);
|
|
1124
1141
|
if (resolveStructured) {
|
|
1125
1142
|
try {
|
|
1126
1143
|
const structured = await resolveStructured();
|
|
@@ -1130,7 +1147,7 @@ async function convertPdfToMarkdown(input, options) {
|
|
|
1130
1147
|
);
|
|
1131
1148
|
return {
|
|
1132
1149
|
markdown,
|
|
1133
|
-
warnings,
|
|
1150
|
+
warnings: [...structuredFallbackWarnings("empty"), ...warnings],
|
|
1134
1151
|
source: "structured-fallback",
|
|
1135
1152
|
fallbackReason: "empty"
|
|
1136
1153
|
};
|
|
@@ -1143,11 +1160,11 @@ async function convertPdfToMarkdown(input, options) {
|
|
|
1143
1160
|
return {
|
|
1144
1161
|
markdown: "",
|
|
1145
1162
|
warnings,
|
|
1146
|
-
source: "
|
|
1163
|
+
source: "cognipeer-failed",
|
|
1147
1164
|
fallbackReason: "empty"
|
|
1148
1165
|
};
|
|
1149
1166
|
}
|
|
1150
|
-
return { markdown, warnings, source: "
|
|
1167
|
+
return { markdown, warnings, source: "cognipeer" };
|
|
1151
1168
|
} finally {
|
|
1152
1169
|
if (cleanup) {
|
|
1153
1170
|
await cleanup().catch(() => {
|
|
@@ -1166,13 +1183,13 @@ function throwIfLegacyFailure(r) {
|
|
|
1166
1183
|
}
|
|
1167
1184
|
}
|
|
1168
1185
|
async function convertPdfPathToMarkdown(inputPath, options) {
|
|
1169
|
-
assertNodeRuntime("PDF \u2192 Markdown (@
|
|
1186
|
+
assertNodeRuntime("PDF \u2192 Markdown (@cognipeer/to-markdown)");
|
|
1170
1187
|
const r = await convertPdfToMarkdown(inputPath, options);
|
|
1171
1188
|
throwIfLegacyFailure(r);
|
|
1172
1189
|
return { markdown: r.markdown };
|
|
1173
1190
|
}
|
|
1174
1191
|
async function convertPdfBufferToMarkdown(input, options) {
|
|
1175
|
-
assertNodeRuntime("PDF \u2192 Markdown (@
|
|
1192
|
+
assertNodeRuntime("PDF \u2192 Markdown (@cognipeer/to-markdown)");
|
|
1176
1193
|
const r = await convertPdfToMarkdown(input, options);
|
|
1177
1194
|
throwIfLegacyFailure(r);
|
|
1178
1195
|
return { markdown: r.markdown };
|
|
@@ -1241,9 +1258,19 @@ function docxStrategyFromSource(source) {
|
|
|
1241
1258
|
return source === "structured-fallback" ? "docx-structured-fallback" : "docx-mammoth";
|
|
1242
1259
|
}
|
|
1243
1260
|
function pdfStrategyFromResult(r) {
|
|
1244
|
-
|
|
1245
|
-
|
|
1246
|
-
|
|
1261
|
+
switch (r.source) {
|
|
1262
|
+
case "structured-fallback":
|
|
1263
|
+
return "pdf-structured-fallback";
|
|
1264
|
+
case "unsupported-runtime":
|
|
1265
|
+
return "pdf-unsupported-runtime";
|
|
1266
|
+
case "cognipeer-unavailable":
|
|
1267
|
+
return "pdf-cognipeer-unavailable";
|
|
1268
|
+
case "cognipeer-failed":
|
|
1269
|
+
return "pdf-cognipeer-failed";
|
|
1270
|
+
case "cognipeer":
|
|
1271
|
+
default:
|
|
1272
|
+
return "pdf-cognipeer-specialized";
|
|
1273
|
+
}
|
|
1247
1274
|
}
|
|
1248
1275
|
function mergeWarnings(base, ...more) {
|
|
1249
1276
|
const out = [...base];
|
|
@@ -1252,6 +1279,22 @@ function mergeWarnings(base, ...more) {
|
|
|
1252
1279
|
}
|
|
1253
1280
|
return out;
|
|
1254
1281
|
}
|
|
1282
|
+
var EXTRACT_WARN = "[docmind-markdown:extractMarkdown]";
|
|
1283
|
+
function traceUsedStructuredFallback(context) {
|
|
1284
|
+
return `${EXTRACT_WARN} ${context}: final Markdown from structuredFallback (specialized route unavailable, failed, or insufficient).`;
|
|
1285
|
+
}
|
|
1286
|
+
function tracePdfStructuredAfterUnsupportedRuntime() {
|
|
1287
|
+
return `${EXTRACT_WARN} pdf-unsupported-runtime: final Markdown from structuredFallback \u2014 @cognipeer/to-markdown cannot run in this environment.`;
|
|
1288
|
+
}
|
|
1289
|
+
function traceDocxStructuredAfterMammoth() {
|
|
1290
|
+
return `${EXTRACT_WARN} docx-structured-fallback: final Markdown from structured envelope after Mammoth/Turndown did not yield the result.`;
|
|
1291
|
+
}
|
|
1292
|
+
function tracePdfStructuredAfterCognipeer() {
|
|
1293
|
+
return `${EXTRACT_WARN} pdf-structured-fallback: final Markdown from structured envelope after Cognipeer PDF path did not yield the result.`;
|
|
1294
|
+
}
|
|
1295
|
+
function tracePdfSpecializedDeadEnd() {
|
|
1296
|
+
return `${EXTRACT_WARN} pdf: Cognipeer specialized route did not produce Markdown and structuredFallback was not provided.`;
|
|
1297
|
+
}
|
|
1255
1298
|
async function extractMarkdown(input, options) {
|
|
1256
1299
|
const smOpts = pickStructuredMarkdownOptions(options);
|
|
1257
1300
|
const fb = options?.structuredFallback;
|
|
@@ -1278,11 +1321,25 @@ async function extractMarkdown(input, options) {
|
|
|
1278
1321
|
if (fb) {
|
|
1279
1322
|
return {
|
|
1280
1323
|
markdown: convertStructuredToMarkdown(fb, smOpts),
|
|
1281
|
-
warnings: mergeWarnings(warnings, fb.warnings),
|
|
1282
|
-
strategy: "path-requires-node"
|
|
1324
|
+
warnings: mergeWarnings(warnings, fb.warnings, [traceUsedStructuredFallback("path-requires-node")]),
|
|
1325
|
+
strategy: "path-requires-node",
|
|
1326
|
+
routing: {
|
|
1327
|
+
detectedFormat: "unknown",
|
|
1328
|
+
specializedPipeline: "none",
|
|
1329
|
+
usedStructuredFallback: true
|
|
1330
|
+
}
|
|
1283
1331
|
};
|
|
1284
1332
|
}
|
|
1285
|
-
return {
|
|
1333
|
+
return {
|
|
1334
|
+
markdown: "",
|
|
1335
|
+
warnings,
|
|
1336
|
+
strategy: "path-requires-node",
|
|
1337
|
+
routing: {
|
|
1338
|
+
detectedFormat: "unknown",
|
|
1339
|
+
specializedPipeline: "none",
|
|
1340
|
+
usedStructuredFallback: false
|
|
1341
|
+
}
|
|
1342
|
+
};
|
|
1286
1343
|
}
|
|
1287
1344
|
const { readFile } = await importEsm(
|
|
1288
1345
|
"node:fs/promises"
|
|
@@ -1302,7 +1359,7 @@ async function extractMarkdown(input, options) {
|
|
|
1302
1359
|
if (fb) {
|
|
1303
1360
|
return {
|
|
1304
1361
|
markdown: convertStructuredToMarkdown(fb, smOpts),
|
|
1305
|
-
warnings: mergeWarnings(warnings, fb.warnings),
|
|
1362
|
+
warnings: mergeWarnings(warnings, fb.warnings, [traceUsedStructuredFallback("invalid-input-shape")]),
|
|
1306
1363
|
strategy: "binary-unidentified-structured-fallback"
|
|
1307
1364
|
};
|
|
1308
1365
|
}
|
|
@@ -1317,37 +1374,79 @@ async function extractMarkdown(input, options) {
|
|
|
1317
1374
|
if (fb) {
|
|
1318
1375
|
return {
|
|
1319
1376
|
markdown: convertStructuredToMarkdown(fb, smOpts),
|
|
1320
|
-
warnings: mergeWarnings(warnings, fb.warnings),
|
|
1321
|
-
strategy: "docx-requires-node"
|
|
1377
|
+
warnings: mergeWarnings(warnings, fb.warnings, [traceUsedStructuredFallback("docx-requires-node")]),
|
|
1378
|
+
strategy: "docx-requires-node",
|
|
1379
|
+
routing: {
|
|
1380
|
+
detectedFormat: "docx",
|
|
1381
|
+
specializedPipeline: "none",
|
|
1382
|
+
usedStructuredFallback: true
|
|
1383
|
+
}
|
|
1322
1384
|
};
|
|
1323
1385
|
}
|
|
1324
|
-
return {
|
|
1386
|
+
return {
|
|
1387
|
+
markdown: "",
|
|
1388
|
+
warnings,
|
|
1389
|
+
strategy: "docx-requires-node",
|
|
1390
|
+
routing: {
|
|
1391
|
+
detectedFormat: "docx",
|
|
1392
|
+
specializedPipeline: "none",
|
|
1393
|
+
usedStructuredFallback: false
|
|
1394
|
+
}
|
|
1395
|
+
};
|
|
1325
1396
|
}
|
|
1326
1397
|
const r = await convertDocxToMarkdown(data, buildDocxOptions(options));
|
|
1398
|
+
const strategy = docxStrategyFromSource(r.source);
|
|
1327
1399
|
const w = mergeWarnings(
|
|
1328
1400
|
warnings,
|
|
1329
|
-
r.messages.map((m) => m.message)
|
|
1401
|
+
r.messages.map((m) => m.message),
|
|
1402
|
+
strategy === "docx-structured-fallback" ? [traceDocxStructuredAfterMammoth()] : []
|
|
1330
1403
|
);
|
|
1331
1404
|
return {
|
|
1332
1405
|
markdown: r.markdown,
|
|
1333
1406
|
warnings: w,
|
|
1334
|
-
strategy
|
|
1407
|
+
strategy,
|
|
1408
|
+
routing: {
|
|
1409
|
+
detectedFormat: "docx",
|
|
1410
|
+
specializedPipeline: "docx",
|
|
1411
|
+
usedStructuredFallback: strategy === "docx-structured-fallback"
|
|
1412
|
+
}
|
|
1335
1413
|
};
|
|
1336
1414
|
}
|
|
1337
1415
|
if (fmt === "pdf") {
|
|
1338
1416
|
const r = await convertPdfToMarkdown(data, buildPdfOptions(options));
|
|
1339
1417
|
const strategy = pdfStrategyFromResult(r);
|
|
1340
|
-
|
|
1418
|
+
let w = mergeWarnings(warnings, r.warnings);
|
|
1341
1419
|
if (strategy === "pdf-unsupported-runtime" && r.markdown === "" && fb) {
|
|
1420
|
+
w = mergeWarnings(w, fb.warnings, [
|
|
1421
|
+
tracePdfStructuredAfterUnsupportedRuntime()
|
|
1422
|
+
]);
|
|
1342
1423
|
return {
|
|
1343
1424
|
markdown: convertStructuredToMarkdown(fb, smOpts),
|
|
1344
|
-
warnings:
|
|
1345
|
-
|
|
1346
|
-
|
|
1347
|
-
|
|
1425
|
+
warnings: w,
|
|
1426
|
+
strategy: "pdf-structured-fallback",
|
|
1427
|
+
routing: {
|
|
1428
|
+
detectedFormat: "pdf",
|
|
1429
|
+
specializedPipeline: "pdf",
|
|
1430
|
+
usedStructuredFallback: true
|
|
1431
|
+
}
|
|
1348
1432
|
};
|
|
1349
1433
|
}
|
|
1350
|
-
|
|
1434
|
+
if (strategy === "pdf-structured-fallback") {
|
|
1435
|
+
w = mergeWarnings(w, [tracePdfStructuredAfterCognipeer()]);
|
|
1436
|
+
}
|
|
1437
|
+
if ((strategy === "pdf-cognipeer-failed" || strategy === "pdf-cognipeer-unavailable") && r.markdown.trim() === "" && !fb) {
|
|
1438
|
+
w = mergeWarnings(w, [tracePdfSpecializedDeadEnd()]);
|
|
1439
|
+
}
|
|
1440
|
+
return {
|
|
1441
|
+
markdown: r.markdown,
|
|
1442
|
+
warnings: w,
|
|
1443
|
+
strategy,
|
|
1444
|
+
routing: {
|
|
1445
|
+
detectedFormat: "pdf",
|
|
1446
|
+
specializedPipeline: "pdf",
|
|
1447
|
+
usedStructuredFallback: strategy === "pdf-structured-fallback"
|
|
1448
|
+
}
|
|
1449
|
+
};
|
|
1351
1450
|
}
|
|
1352
1451
|
warnings.push(
|
|
1353
1452
|
"@dragon708/docmind-markdown: Unidentified binary format (expected PDF magic or ZIP/DOCX). Using structured fallback if provided."
|
|
@@ -1355,11 +1454,25 @@ async function extractMarkdown(input, options) {
|
|
|
1355
1454
|
if (fb) {
|
|
1356
1455
|
return {
|
|
1357
1456
|
markdown: convertStructuredToMarkdown(fb, smOpts),
|
|
1358
|
-
warnings: mergeWarnings(warnings, fb.warnings),
|
|
1359
|
-
strategy: "binary-unidentified-structured-fallback"
|
|
1457
|
+
warnings: mergeWarnings(warnings, fb.warnings, [traceUsedStructuredFallback("binary-unidentified")]),
|
|
1458
|
+
strategy: "binary-unidentified-structured-fallback",
|
|
1459
|
+
routing: {
|
|
1460
|
+
detectedFormat: "unknown",
|
|
1461
|
+
specializedPipeline: "none",
|
|
1462
|
+
usedStructuredFallback: true
|
|
1463
|
+
}
|
|
1360
1464
|
};
|
|
1361
1465
|
}
|
|
1362
|
-
return {
|
|
1466
|
+
return {
|
|
1467
|
+
markdown: "",
|
|
1468
|
+
warnings,
|
|
1469
|
+
strategy: "binary-unidentified",
|
|
1470
|
+
routing: {
|
|
1471
|
+
detectedFormat: "unknown",
|
|
1472
|
+
specializedPipeline: "none",
|
|
1473
|
+
usedStructuredFallback: false
|
|
1474
|
+
}
|
|
1475
|
+
};
|
|
1363
1476
|
}
|
|
1364
1477
|
|
|
1365
1478
|
export { convertDocxBufferToMarkdown, convertDocxToMarkdown, convertPdfBufferToMarkdown, convertPdfPathToMarkdown, convertPdfToMarkdown, convertStructuredToLlmText, convertStructuredToMarkdown, detectBinaryFormat, extractLlmContent, extractMarkdown, extractStructuredChunks, isExtractMarkdownFileInput, renderLlmText, renderMarkdown, renderMarkdownSections, splitStructuredIntoChunks, structuredDocumentToLlmText, structuredDocumentToMarkdown };
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@dragon708/docmind-markdown",
|
|
3
|
-
"version": "1.1
|
|
3
|
+
"version": "1.2.1",
|
|
4
4
|
"description": "StructuredDocumentResult → Markdown and LLM-oriented plain text for DocMind.",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"sideEffects": false,
|
|
@@ -35,8 +35,8 @@
|
|
|
35
35
|
],
|
|
36
36
|
"license": "MIT",
|
|
37
37
|
"dependencies": {
|
|
38
|
+
"@cognipeer/to-markdown": "^2.0.1",
|
|
38
39
|
"@dragon708/docmind-shared": "^1.2.0",
|
|
39
|
-
"@opendataloader/pdf": "^2.2.1",
|
|
40
40
|
"mammoth": "^1.6.0",
|
|
41
41
|
"turndown": "^7.0.0",
|
|
42
42
|
"turndown-plugin-gfm": "^1.0.2"
|