@dragon708/docmind-markdown 1.2.0 → 1.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.d.ts +65 -24
- package/dist/index.js +145 -69
- package/package.json +2 -2
package/dist/index.d.ts
CHANGED
|
@@ -39,8 +39,8 @@ type StructuredToMarkdownOptions = ConvertStructuredToMarkdownOptions;
|
|
|
39
39
|
/**
|
|
40
40
|
* Converts a {@link StructuredDocumentResult} to readable, semantic Markdown (GFM-style tables).
|
|
41
41
|
*
|
|
42
|
-
* **Universal fallback** for DocMind: use when a format-specific pipeline (DOCX Mammoth, PDF
|
|
43
|
-
* does not apply or fails, and for OCR / image / plain-text flows that already populate this shape.
|
|
42
|
+
* **Universal fallback** for DocMind: use when a format-specific pipeline (DOCX Mammoth→Turndown, specialized PDF→Markdown
|
|
43
|
+
* via `@cognipeer/to-markdown` on Node, …) does not apply or fails, and for OCR / image / plain-text flows that already populate this shape.
|
|
44
44
|
*
|
|
45
45
|
* Uses `blocks` in order; resolves `table` / `image-ref` via `tables` and `images`. When blocks are
|
|
46
46
|
* empty or yield no output, falls back to the rollup `text`. Optional sections use `pages`, `metadata`,
|
|
@@ -213,6 +213,13 @@ type RenderMarkdownSectionsOptions = SplitStructuredIntoChunksOptions;
|
|
|
213
213
|
*/
|
|
214
214
|
declare function renderMarkdownSections(result: StructuredDocumentResult, options?: RenderMarkdownSectionsOptions): MarkdownSection[];
|
|
215
215
|
|
|
216
|
+
/**
|
|
217
|
+
* DOCX → Markdown: **Mammoth** (`convertToHtml` semantic HTML) → **Turndown** (ATX headings, lists, fenced code;
|
|
218
|
+
* **GFM tables** when {@link ConvertDocxToMarkdownOptions.includeTables} is true via `turndown-plugin-gfm`).
|
|
219
|
+
* Page breaks map through Mammoth `styleMap` → `<hr class="page-break">` → Turndown horizontal rules.
|
|
220
|
+
*
|
|
221
|
+
* Independent of the PDF → Markdown pipeline in this package (`pdf-markdown.ts`, `@cognipeer/to-markdown`).
|
|
222
|
+
*/
|
|
216
223
|
/**
|
|
217
224
|
* Binary `.docx` payload accepted by {@link convertDocxToMarkdown}.
|
|
218
225
|
*/
|
|
@@ -316,8 +323,10 @@ declare function convertDocxToMarkdown(input: DocxToMarkdownInput, options?: Con
|
|
|
316
323
|
declare function convertDocxBufferToMarkdown(input: DocxToMarkdownInput, options?: ConvertDocxToMarkdownOptions): Promise<DocxMarkdownResult>;
|
|
317
324
|
|
|
318
325
|
/**
|
|
319
|
-
*
|
|
320
|
-
*
|
|
326
|
+
* Legacy option bag: **ignored** by {@link convertPdfToMarkdown}. Retained so existing TypeScript callers and object
|
|
327
|
+
* literals that still spread old shapes remain assignable to {@link ConvertPdfToMarkdownOptions}.
|
|
328
|
+
*
|
|
329
|
+
* @deprecated PDF → Markdown uses `@cognipeer/to-markdown` only; these keys are not read.
|
|
321
330
|
*/
|
|
322
331
|
interface OpenDataLoaderPdfConvertOptions {
|
|
323
332
|
outputDir?: string;
|
|
@@ -346,11 +355,13 @@ interface OpenDataLoaderPdfConvertOptions {
|
|
|
346
355
|
hybridFallback?: boolean;
|
|
347
356
|
}
|
|
348
357
|
/**
|
|
349
|
-
* Options for {@link convertPdfToMarkdown}.
|
|
358
|
+
* Options for {@link convertPdfToMarkdown}. Properties from {@link OpenDataLoaderPdfConvertOptions} are accepted for
|
|
359
|
+
* compatibility but **ignored**. `fileName`, `forceExtension`, and `url` are forwarded to `@cognipeer/to-markdown`
|
|
360
|
+
* where applicable (see that package’s `ConverterOptions`).
|
|
350
361
|
*/
|
|
351
362
|
type ConvertPdfToMarkdownOptions = OpenDataLoaderPdfConvertOptions & {
|
|
352
363
|
/**
|
|
353
|
-
* When the
|
|
364
|
+
* When the Cognipeer path fails, returns empty output, or `@cognipeer/to-markdown` cannot load,
|
|
354
365
|
* call this to obtain {@link StructuredDocumentResult} (e.g. from another extractor) and serialize with
|
|
355
366
|
* {@link convertStructuredToMarkdown}. Does not import other DocMind packages.
|
|
356
367
|
*/
|
|
@@ -362,19 +373,25 @@ type ConvertPdfToMarkdownOptions = OpenDataLoaderPdfConvertOptions & {
|
|
|
362
373
|
* @default true
|
|
363
374
|
*/
|
|
364
375
|
readonly cleanMarkdown?: boolean;
|
|
376
|
+
/** Forwarded to `@cognipeer/to-markdown` (useful for buffer inputs). */
|
|
377
|
+
readonly fileName?: string;
|
|
378
|
+
/** Forwarded to `@cognipeer/to-markdown`. */
|
|
379
|
+
readonly forceExtension?: string;
|
|
380
|
+
/** Forwarded to `@cognipeer/to-markdown`. */
|
|
381
|
+
readonly url?: string;
|
|
365
382
|
};
|
|
366
383
|
/** Input for {@link convertPdfToMarkdown}: filesystem path (Node) or PDF bytes. */
|
|
367
384
|
type PdfToMarkdownInput = string | Buffer | Uint8Array | ArrayBuffer;
|
|
368
385
|
/** Which pipeline produced {@link ConvertPdfToMarkdownResult.markdown}. */
|
|
369
|
-
type PdfToMarkdownSource = "
|
|
370
|
-
/** `@
|
|
371
|
-
| "
|
|
372
|
-
/** Engine ran or was attempted but produced no usable Markdown (
|
|
373
|
-
| "
|
|
386
|
+
type PdfToMarkdownSource = "cognipeer" | "structured-fallback" | "unsupported-runtime"
|
|
387
|
+
/** `@cognipeer/to-markdown` missing or failed to load (install dep; check bundler externals). */
|
|
388
|
+
| "cognipeer-unavailable"
|
|
389
|
+
/** Engine ran or was attempted but produced no usable Markdown (error, empty output, missing file, etc.). */
|
|
390
|
+
| "cognipeer-failed";
|
|
374
391
|
type PdfToMarkdownFallbackReason = "unsupported-runtime" | "error" | "empty" | "module-not-found";
|
|
375
392
|
interface ConvertPdfToMarkdownResult {
|
|
376
393
|
readonly markdown: string;
|
|
377
|
-
/** Human-readable issues (runtime, missing module,
|
|
394
|
+
/** Human-readable issues (runtime, missing module, conversion errors, empty output, fallback errors). */
|
|
378
395
|
readonly warnings: readonly string[];
|
|
379
396
|
readonly source: PdfToMarkdownSource;
|
|
380
397
|
readonly fallbackReason?: PdfToMarkdownFallbackReason;
|
|
@@ -384,21 +401,25 @@ interface PdfMarkdownResult {
|
|
|
384
401
|
readonly markdown: string;
|
|
385
402
|
}
|
|
386
403
|
/**
|
|
387
|
-
* Primary API: PDF path or bytes → Markdown via `@
|
|
404
|
+
* Primary API: PDF path or bytes → Markdown via `@cognipeer/to-markdown` on Node, with clear warnings and optional
|
|
388
405
|
* structured fallback. In non-Node runtimes returns an empty `markdown` and {@link PdfToMarkdownSource} `unsupported-runtime`
|
|
389
|
-
* without loading `@
|
|
406
|
+
* without loading `@cognipeer/to-markdown`.
|
|
407
|
+
*
|
|
408
|
+
* If the specialized conversion fails or returns empty Markdown and {@link ConvertPdfToMarkdownOptions.resolveStructured} /
|
|
409
|
+
* `extractMarkdown`’s `structuredFallback` is set, output comes from {@link convertStructuredToMarkdown} instead
|
|
410
|
+
* (`extractMarkdown` reports strategy `pdf-structured-fallback`; this result uses `source` `structured-fallback`).
|
|
390
411
|
*/
|
|
391
412
|
declare function convertPdfToMarkdown(input: PdfToMarkdownInput, options?: ConvertPdfToMarkdownOptions): Promise<ConvertPdfToMarkdownResult>;
|
|
392
413
|
/**
|
|
393
|
-
* **Node only.** PDF file path → Markdown via `@
|
|
414
|
+
* **Node only.** PDF file path → Markdown via `@cognipeer/to-markdown`.
|
|
394
415
|
*
|
|
395
|
-
* Throws if not Node, if conversion yields no Markdown (and no structured fallback), or on
|
|
416
|
+
* Throws if not Node, if conversion yields no Markdown (and no structured fallback), or on conversion errors
|
|
396
417
|
* when no fallback is configured — same contract as before {@link convertPdfToMarkdown} existed.
|
|
397
418
|
*/
|
|
398
419
|
declare function convertPdfPathToMarkdown(inputPath: string, options?: ConvertPdfToMarkdownOptions): Promise<PdfMarkdownResult>;
|
|
399
420
|
/**
|
|
400
421
|
* **Node only.** Same pipeline as {@link convertPdfPathToMarkdown}, but writes bytes to a temporary `.pdf`
|
|
401
|
-
*
|
|
422
|
+
* then converts with `@cognipeer/to-markdown`.
|
|
402
423
|
*/
|
|
403
424
|
declare function convertPdfBufferToMarkdown(input: Buffer | Uint8Array | ArrayBuffer, options?: ConvertPdfToMarkdownOptions): Promise<PdfMarkdownResult>;
|
|
404
425
|
|
|
@@ -437,21 +458,38 @@ type ExtractMarkdownOptions = ConvertStructuredToMarkdownOptions & {
|
|
|
437
458
|
/** Overrides merged into {@link convertPdfToMarkdown} when the input is identified as `.pdf`. */
|
|
438
459
|
readonly pdf?: ConvertPdfToMarkdownOptions;
|
|
439
460
|
};
|
|
461
|
+
/** @see {@link detectBinaryFormat} */
|
|
462
|
+
type DetectedBinaryFormat = "docx" | "pdf" | "unknown";
|
|
440
463
|
/** Which branch produced {@link ExtractMarkdownResult.markdown}. */
|
|
441
|
-
type ExtractMarkdownStrategy = "structured" | "docx-mammoth" | "docx-structured-fallback" | "pdf-
|
|
442
|
-
/** `@
|
|
443
|
-
| "pdf-
|
|
464
|
+
type ExtractMarkdownStrategy = "structured" | "docx-mammoth" | "docx-structured-fallback" | "pdf-cognipeer-specialized"
|
|
465
|
+
/** `@cognipeer/to-markdown` not loadable (missing package, bundler, etc.). */
|
|
466
|
+
| "pdf-cognipeer-unavailable"
|
|
444
467
|
/** Engine failed or returned empty Markdown; no structured fallback or fallback also failed. */
|
|
445
|
-
| "pdf-
|
|
468
|
+
| "pdf-cognipeer-failed" | "pdf-structured-fallback" | "pdf-unsupported-runtime" | "docx-requires-node" | "path-requires-node" | "binary-unidentified" | "binary-unidentified-structured-fallback";
|
|
469
|
+
/**
|
|
470
|
+
* Format detection and fallback bookkeeping for file/path inputs to {@link extractMarkdown}.
|
|
471
|
+
* Omitted when `input` is a {@link StructuredDocumentResult} (already structured).
|
|
472
|
+
*/
|
|
473
|
+
interface ExtractMarkdownRoutingInfo {
|
|
474
|
+
readonly detectedFormat: DetectedBinaryFormat;
|
|
475
|
+
/** Specialized binary pipeline invoked first on Node (`none` if not applicable). */
|
|
476
|
+
readonly specializedPipeline: "docx" | "pdf" | "none";
|
|
477
|
+
/** Final Markdown came from {@link ExtractMarkdownOptions.structuredFallback} (or the same snapshot via `resolveStructured`). */
|
|
478
|
+
readonly usedStructuredFallback: boolean;
|
|
479
|
+
}
|
|
446
480
|
interface ExtractMarkdownResult {
|
|
447
481
|
readonly markdown: string;
|
|
448
482
|
/** Merged pipeline warnings (conversion, runtime, and optional {@link StructuredDocumentResult.warnings}). */
|
|
449
483
|
readonly warnings: readonly string[];
|
|
450
484
|
readonly strategy: ExtractMarkdownStrategy;
|
|
485
|
+
/**
|
|
486
|
+
* Present for `{ data, … }` / `{ path, … }` flows after sniffing bytes (or when returning early without reading).
|
|
487
|
+
* Absent for direct structured input.
|
|
488
|
+
*/
|
|
489
|
+
readonly routing?: ExtractMarkdownRoutingInfo;
|
|
451
490
|
}
|
|
452
491
|
/** Type guard: file-like `{ data: … }` input for {@link extractMarkdown}. */
|
|
453
492
|
declare function isExtractMarkdownFileInput(value: unknown): value is ExtractMarkdownFileInput;
|
|
454
|
-
type DetectedBinaryFormat = "docx" | "pdf" | "unknown";
|
|
455
493
|
/** Detect PDF / OOXML zip (DOCX) from magic bytes and optional filename / MIME. */
|
|
456
494
|
declare function detectBinaryFormat(data: Buffer | Uint8Array | ArrayBuffer, filename?: string, mimeType?: string): DetectedBinaryFormat;
|
|
457
495
|
/**
|
|
@@ -460,8 +498,11 @@ declare function detectBinaryFormat(data: Buffer | Uint8Array | ArrayBuffer, fil
|
|
|
460
498
|
*
|
|
461
499
|
* - **Structured input** — always uses the structured serializer (image/OCR/text/PDF/DOCX blocks already normalized).
|
|
462
500
|
* - **DOCX bytes / path** — {@link convertDocxToMarkdown} on Node; otherwise warns and uses {@link ExtractMarkdownOptions.structuredFallback} if provided.
|
|
463
|
-
* - **PDF bytes / path** — {@link convertPdfToMarkdown} (`@
|
|
501
|
+
* - **PDF bytes / path** — {@link convertPdfToMarkdown} (`@cognipeer/to-markdown` on Node); in non-Node runtimes returns empty Markdown with warnings unless {@link ExtractMarkdownOptions.structuredFallback} supplies content.
|
|
502
|
+
*
|
|
503
|
+
* Any use of `structuredFallback` (or the same snapshot through `resolveStructured`) adds a tagged line in {@link ExtractMarkdownResult.warnings}
|
|
504
|
+
* and sets {@link ExtractMarkdownRoutingInfo.usedStructuredFallback} when {@link ExtractMarkdownResult.routing} is present.
|
|
464
505
|
*/
|
|
465
506
|
declare function extractMarkdown(input: ExtractMarkdownInput, options?: ExtractMarkdownOptions): Promise<ExtractMarkdownResult>;
|
|
466
507
|
|
|
467
|
-
export { type ConvertDocxToMarkdownOptions, type ConvertDocxToMarkdownResult, type ConvertPdfToMarkdownOptions, type ConvertPdfToMarkdownResult, type ConvertStructuredToLlmTextOptions, type ConvertStructuredToMarkdownOptions, type DetectedBinaryFormat, type DocxMarkdownMessage, type DocxMarkdownResult, type DocxToMarkdownInput, type DocxToMarkdownSource, type ExtractMarkdownFileInput, type ExtractMarkdownInput, type ExtractMarkdownOptions, type ExtractMarkdownPathInput, type ExtractMarkdownResult, type ExtractMarkdownStrategy, type MammothConvertToHtmlOptions, type MarkdownSection, type OpenDataLoaderPdfConvertOptions, type PdfMarkdownResult, type PdfToMarkdownFallbackReason, type PdfToMarkdownInput, type PdfToMarkdownSource, type RenderLlmTextOptions, type RenderMarkdownOptions, type RenderMarkdownSectionsOptions, type SplitStructuredIntoChunksOptions, type StructuredChunk, type StructuredToLlmTextOptions, type StructuredToMarkdownOptions, type TurndownServiceOptions, convertDocxBufferToMarkdown, convertDocxToMarkdown, convertPdfBufferToMarkdown, convertPdfPathToMarkdown, convertPdfToMarkdown, convertStructuredToLlmText, convertStructuredToMarkdown, detectBinaryFormat, extractLlmContent, extractMarkdown, extractStructuredChunks, isExtractMarkdownFileInput, renderLlmText, renderMarkdown, renderMarkdownSections, splitStructuredIntoChunks, structuredDocumentToLlmText, structuredDocumentToMarkdown };
|
|
508
|
+
export { type ConvertDocxToMarkdownOptions, type ConvertDocxToMarkdownResult, type ConvertPdfToMarkdownOptions, type ConvertPdfToMarkdownResult, type ConvertStructuredToLlmTextOptions, type ConvertStructuredToMarkdownOptions, type DetectedBinaryFormat, type DocxMarkdownMessage, type DocxMarkdownResult, type DocxToMarkdownInput, type DocxToMarkdownSource, type ExtractMarkdownFileInput, type ExtractMarkdownInput, type ExtractMarkdownOptions, type ExtractMarkdownPathInput, type ExtractMarkdownResult, type ExtractMarkdownRoutingInfo, type ExtractMarkdownStrategy, type MammothConvertToHtmlOptions, type MarkdownSection, type OpenDataLoaderPdfConvertOptions, type PdfMarkdownResult, type PdfToMarkdownFallbackReason, type PdfToMarkdownInput, type PdfToMarkdownSource, type RenderLlmTextOptions, type RenderMarkdownOptions, type RenderMarkdownSectionsOptions, type SplitStructuredIntoChunksOptions, type StructuredChunk, type StructuredToLlmTextOptions, type StructuredToMarkdownOptions, type TurndownServiceOptions, convertDocxBufferToMarkdown, convertDocxToMarkdown, convertPdfBufferToMarkdown, convertPdfPathToMarkdown, convertPdfToMarkdown, convertStructuredToLlmText, convertStructuredToMarkdown, detectBinaryFormat, extractLlmContent, extractMarkdown, extractStructuredChunks, isExtractMarkdownFileInput, renderLlmText, renderMarkdown, renderMarkdownSections, splitStructuredIntoChunks, structuredDocumentToLlmText, structuredDocumentToMarkdown };
|
package/dist/index.js
CHANGED
|
@@ -993,38 +993,27 @@ async function convertDocxBufferToMarkdown(input, options) {
|
|
|
993
993
|
}
|
|
994
994
|
|
|
995
995
|
// src/pdf-markdown.ts
|
|
996
|
-
var BROWSER_WARNING = "@dragon708/docmind-markdown: PDF \u2192 Markdown via @
|
|
996
|
+
var BROWSER_WARNING = "@dragon708/docmind-markdown: PDF \u2192 Markdown via @cognipeer/to-markdown requires Node.js. In the browser, use a server-side conversion or supply structured text/Markdown from your backend.";
|
|
997
|
+
var COGNIPEER_WARN_TAG = "[docmind-markdown:pdf] pdf-cognipeer-specialized:";
|
|
997
998
|
function normalizePdfMarkdown(markdown, clean) {
|
|
998
999
|
const t = markdown.trim();
|
|
999
1000
|
if (!clean) return t;
|
|
1000
1001
|
return t.replace(/\n{3,}/g, "\n\n");
|
|
1001
1002
|
}
|
|
1002
|
-
|
|
1003
|
-
if (typeof out === "string") return out;
|
|
1004
|
-
if (typeof Buffer !== "undefined" && Buffer.isBuffer(out)) {
|
|
1005
|
-
return out.toString("utf8");
|
|
1006
|
-
}
|
|
1007
|
-
if (out instanceof Uint8Array) {
|
|
1008
|
-
return typeof Buffer !== "undefined" ? Buffer.from(out).toString("utf8") : new TextDecoder("utf8", { fatal: false }).decode(out);
|
|
1009
|
-
}
|
|
1010
|
-
if (out == null) return "";
|
|
1011
|
-
return String(out);
|
|
1012
|
-
}
|
|
1013
|
-
var FALLBACK_WARN_PREFIX = "[docmind-markdown:pdf] structured-fallback: serializing StructuredDocumentResult to Markdown because";
|
|
1003
|
+
var FALLBACK_WARN_PREFIX = "[docmind-markdown:pdf] pdf-structured-fallback: serializing StructuredDocumentResult to Markdown because";
|
|
1014
1004
|
function structuredFallbackWarnings(reason, detail) {
|
|
1015
|
-
const tail = reason === "module-not-found" ? "@
|
|
1005
|
+
const tail = reason === "module-not-found" ? "@cognipeer/to-markdown could not be loaded." : reason === "error" ? "the specialized PDF engine raised an error or rejected the input." : reason === "empty" ? "the specialized engine returned empty Markdown." : "the specialized path is unavailable in this runtime.";
|
|
1016
1006
|
const extra = detail ? ` (${detail})` : "";
|
|
1017
1007
|
return [`${FALLBACK_WARN_PREFIX} ${tail}${extra}`];
|
|
1018
1008
|
}
|
|
1019
|
-
function
|
|
1009
|
+
function cognipeerConverterOptions(options) {
|
|
1020
1010
|
if (!options) return {};
|
|
1021
|
-
const {
|
|
1022
|
-
|
|
1023
|
-
|
|
1024
|
-
|
|
1025
|
-
|
|
1026
|
-
|
|
1027
|
-
return rest;
|
|
1011
|
+
const { fileName, forceExtension, url } = options;
|
|
1012
|
+
const o = {};
|
|
1013
|
+
if (fileName !== void 0) o.fileName = fileName;
|
|
1014
|
+
if (forceExtension !== void 0) o.forceExtension = forceExtension;
|
|
1015
|
+
if (url !== void 0) o.url = url;
|
|
1016
|
+
return o;
|
|
1028
1017
|
}
|
|
1029
1018
|
async function toNodeBuffer2(input) {
|
|
1030
1019
|
const { Buffer: Buffer2 } = await importEsm("node:buffer");
|
|
@@ -1032,11 +1021,17 @@ async function toNodeBuffer2(input) {
|
|
|
1032
1021
|
if (input instanceof ArrayBuffer) return Buffer2.from(input);
|
|
1033
1022
|
return Buffer2.from(input);
|
|
1034
1023
|
}
|
|
1024
|
+
async function loadCognipeerConvertToMarkdown() {
|
|
1025
|
+
const { createRequire } = await importEsm("node:module");
|
|
1026
|
+
const require2 = createRequire(import.meta.url);
|
|
1027
|
+
const mod = require2("@cognipeer/to-markdown");
|
|
1028
|
+
return mod.convertToMarkdown;
|
|
1029
|
+
}
|
|
1035
1030
|
async function convertPdfToMarkdown(input, options) {
|
|
1036
1031
|
const clean = options?.cleanMarkdown !== false;
|
|
1037
1032
|
const resolveStructured = options?.resolveStructured;
|
|
1038
1033
|
const structuredMdOpts = options?.structuredMarkdown;
|
|
1039
|
-
const
|
|
1034
|
+
const cognipeerOpts = cognipeerConverterOptions(options);
|
|
1040
1035
|
if (!isNodeRuntime()) {
|
|
1041
1036
|
return {
|
|
1042
1037
|
markdown: "",
|
|
@@ -1063,15 +1058,13 @@ async function convertPdfToMarkdown(input, options) {
|
|
|
1063
1058
|
await writeFile(inputPath, buffer);
|
|
1064
1059
|
cleanup = async () => rm(dir, { recursive: true, force: true });
|
|
1065
1060
|
}
|
|
1066
|
-
let
|
|
1061
|
+
let convertToMarkdown;
|
|
1067
1062
|
try {
|
|
1068
|
-
|
|
1069
|
-
"@opendataloader/pdf"
|
|
1070
|
-
));
|
|
1063
|
+
convertToMarkdown = await loadCognipeerConvertToMarkdown();
|
|
1071
1064
|
} catch (e) {
|
|
1072
|
-
const hint = e instanceof Error && /Cannot find module|MODULE_NOT_FOUND/i.test(e.message) ? " Install `@
|
|
1065
|
+
const hint = e instanceof Error && /Cannot find module|MODULE_NOT_FOUND/i.test(e.message) ? " Install `@cognipeer/to-markdown` in your project." : "";
|
|
1073
1066
|
warnings.push(
|
|
1074
|
-
|
|
1067
|
+
`${COGNIPEER_WARN_TAG} package could not be loaded (${e instanceof Error ? e.message : String(e)}).${hint}`
|
|
1075
1068
|
);
|
|
1076
1069
|
if (resolveStructured) {
|
|
1077
1070
|
try {
|
|
@@ -1098,22 +1091,16 @@ async function convertPdfToMarkdown(input, options) {
|
|
|
1098
1091
|
return {
|
|
1099
1092
|
markdown: "",
|
|
1100
1093
|
warnings,
|
|
1101
|
-
source: "
|
|
1094
|
+
source: "cognipeer-unavailable",
|
|
1102
1095
|
fallbackReason: "module-not-found"
|
|
1103
1096
|
};
|
|
1104
1097
|
}
|
|
1105
1098
|
let rawMarkdown;
|
|
1106
1099
|
try {
|
|
1107
|
-
|
|
1108
|
-
...eng,
|
|
1109
|
-
format: "markdown",
|
|
1110
|
-
toStdout: true,
|
|
1111
|
-
quiet: eng.quiet !== false
|
|
1112
|
-
});
|
|
1113
|
-
rawMarkdown = normalizeConvertStdout(out);
|
|
1100
|
+
rawMarkdown = await convertToMarkdown(inputPath, cognipeerOpts);
|
|
1114
1101
|
} catch (e) {
|
|
1115
1102
|
const msg = e instanceof Error ? e.message : String(e);
|
|
1116
|
-
warnings.push(
|
|
1103
|
+
warnings.push(`${COGNIPEER_WARN_TAG} ${msg}`);
|
|
1117
1104
|
if (resolveStructured) {
|
|
1118
1105
|
try {
|
|
1119
1106
|
const structured = await resolveStructured();
|
|
@@ -1139,14 +1126,17 @@ async function convertPdfToMarkdown(input, options) {
|
|
|
1139
1126
|
return {
|
|
1140
1127
|
markdown: "",
|
|
1141
1128
|
warnings,
|
|
1142
|
-
source: "
|
|
1129
|
+
source: "cognipeer-failed",
|
|
1143
1130
|
fallbackReason: "error"
|
|
1144
1131
|
};
|
|
1145
1132
|
}
|
|
1146
|
-
let markdown = normalizePdfMarkdown(
|
|
1133
|
+
let markdown = normalizePdfMarkdown(
|
|
1134
|
+
typeof rawMarkdown === "string" ? rawMarkdown : String(rawMarkdown ?? ""),
|
|
1135
|
+
clean
|
|
1136
|
+
);
|
|
1147
1137
|
if (markdown.length === 0) {
|
|
1148
1138
|
warnings.push(
|
|
1149
|
-
|
|
1139
|
+
`${COGNIPEER_WARN_TAG} returned empty Markdown for this PDF (whitespace-only after normalize).`
|
|
1150
1140
|
);
|
|
1151
1141
|
if (resolveStructured) {
|
|
1152
1142
|
try {
|
|
@@ -1170,11 +1160,11 @@ async function convertPdfToMarkdown(input, options) {
|
|
|
1170
1160
|
return {
|
|
1171
1161
|
markdown: "",
|
|
1172
1162
|
warnings,
|
|
1173
|
-
source: "
|
|
1163
|
+
source: "cognipeer-failed",
|
|
1174
1164
|
fallbackReason: "empty"
|
|
1175
1165
|
};
|
|
1176
1166
|
}
|
|
1177
|
-
return { markdown, warnings, source: "
|
|
1167
|
+
return { markdown, warnings, source: "cognipeer" };
|
|
1178
1168
|
} finally {
|
|
1179
1169
|
if (cleanup) {
|
|
1180
1170
|
await cleanup().catch(() => {
|
|
@@ -1193,13 +1183,13 @@ function throwIfLegacyFailure(r) {
|
|
|
1193
1183
|
}
|
|
1194
1184
|
}
|
|
1195
1185
|
async function convertPdfPathToMarkdown(inputPath, options) {
|
|
1196
|
-
assertNodeRuntime("PDF \u2192 Markdown (@
|
|
1186
|
+
assertNodeRuntime("PDF \u2192 Markdown (@cognipeer/to-markdown)");
|
|
1197
1187
|
const r = await convertPdfToMarkdown(inputPath, options);
|
|
1198
1188
|
throwIfLegacyFailure(r);
|
|
1199
1189
|
return { markdown: r.markdown };
|
|
1200
1190
|
}
|
|
1201
1191
|
async function convertPdfBufferToMarkdown(input, options) {
|
|
1202
|
-
assertNodeRuntime("PDF \u2192 Markdown (@
|
|
1192
|
+
assertNodeRuntime("PDF \u2192 Markdown (@cognipeer/to-markdown)");
|
|
1203
1193
|
const r = await convertPdfToMarkdown(input, options);
|
|
1204
1194
|
throwIfLegacyFailure(r);
|
|
1205
1195
|
return { markdown: r.markdown };
|
|
@@ -1273,13 +1263,13 @@ function pdfStrategyFromResult(r) {
|
|
|
1273
1263
|
return "pdf-structured-fallback";
|
|
1274
1264
|
case "unsupported-runtime":
|
|
1275
1265
|
return "pdf-unsupported-runtime";
|
|
1276
|
-
case "
|
|
1277
|
-
return "pdf-
|
|
1278
|
-
case "
|
|
1279
|
-
return "pdf-
|
|
1280
|
-
case "
|
|
1266
|
+
case "cognipeer-unavailable":
|
|
1267
|
+
return "pdf-cognipeer-unavailable";
|
|
1268
|
+
case "cognipeer-failed":
|
|
1269
|
+
return "pdf-cognipeer-failed";
|
|
1270
|
+
case "cognipeer":
|
|
1281
1271
|
default:
|
|
1282
|
-
return "pdf-
|
|
1272
|
+
return "pdf-cognipeer-specialized";
|
|
1283
1273
|
}
|
|
1284
1274
|
}
|
|
1285
1275
|
function mergeWarnings(base, ...more) {
|
|
@@ -1289,6 +1279,22 @@ function mergeWarnings(base, ...more) {
|
|
|
1289
1279
|
}
|
|
1290
1280
|
return out;
|
|
1291
1281
|
}
|
|
1282
|
+
var EXTRACT_WARN = "[docmind-markdown:extractMarkdown]";
|
|
1283
|
+
function traceUsedStructuredFallback(context) {
|
|
1284
|
+
return `${EXTRACT_WARN} ${context}: final Markdown from structuredFallback (specialized route unavailable, failed, or insufficient).`;
|
|
1285
|
+
}
|
|
1286
|
+
function tracePdfStructuredAfterUnsupportedRuntime() {
|
|
1287
|
+
return `${EXTRACT_WARN} pdf-unsupported-runtime: final Markdown from structuredFallback \u2014 @cognipeer/to-markdown cannot run in this environment.`;
|
|
1288
|
+
}
|
|
1289
|
+
function traceDocxStructuredAfterMammoth() {
|
|
1290
|
+
return `${EXTRACT_WARN} docx-structured-fallback: final Markdown from structured envelope after Mammoth/Turndown did not yield the result.`;
|
|
1291
|
+
}
|
|
1292
|
+
function tracePdfStructuredAfterCognipeer() {
|
|
1293
|
+
return `${EXTRACT_WARN} pdf-structured-fallback: final Markdown from structured envelope after Cognipeer PDF path did not yield the result.`;
|
|
1294
|
+
}
|
|
1295
|
+
function tracePdfSpecializedDeadEnd() {
|
|
1296
|
+
return `${EXTRACT_WARN} pdf: Cognipeer specialized route did not produce Markdown and structuredFallback was not provided.`;
|
|
1297
|
+
}
|
|
1292
1298
|
async function extractMarkdown(input, options) {
|
|
1293
1299
|
const smOpts = pickStructuredMarkdownOptions(options);
|
|
1294
1300
|
const fb = options?.structuredFallback;
|
|
@@ -1315,11 +1321,25 @@ async function extractMarkdown(input, options) {
|
|
|
1315
1321
|
if (fb) {
|
|
1316
1322
|
return {
|
|
1317
1323
|
markdown: convertStructuredToMarkdown(fb, smOpts),
|
|
1318
|
-
warnings: mergeWarnings(warnings, fb.warnings),
|
|
1319
|
-
strategy: "path-requires-node"
|
|
1324
|
+
warnings: mergeWarnings(warnings, fb.warnings, [traceUsedStructuredFallback("path-requires-node")]),
|
|
1325
|
+
strategy: "path-requires-node",
|
|
1326
|
+
routing: {
|
|
1327
|
+
detectedFormat: "unknown",
|
|
1328
|
+
specializedPipeline: "none",
|
|
1329
|
+
usedStructuredFallback: true
|
|
1330
|
+
}
|
|
1320
1331
|
};
|
|
1321
1332
|
}
|
|
1322
|
-
return {
|
|
1333
|
+
return {
|
|
1334
|
+
markdown: "",
|
|
1335
|
+
warnings,
|
|
1336
|
+
strategy: "path-requires-node",
|
|
1337
|
+
routing: {
|
|
1338
|
+
detectedFormat: "unknown",
|
|
1339
|
+
specializedPipeline: "none",
|
|
1340
|
+
usedStructuredFallback: false
|
|
1341
|
+
}
|
|
1342
|
+
};
|
|
1323
1343
|
}
|
|
1324
1344
|
const { readFile } = await importEsm(
|
|
1325
1345
|
"node:fs/promises"
|
|
@@ -1339,7 +1359,7 @@ async function extractMarkdown(input, options) {
|
|
|
1339
1359
|
if (fb) {
|
|
1340
1360
|
return {
|
|
1341
1361
|
markdown: convertStructuredToMarkdown(fb, smOpts),
|
|
1342
|
-
warnings: mergeWarnings(warnings, fb.warnings),
|
|
1362
|
+
warnings: mergeWarnings(warnings, fb.warnings, [traceUsedStructuredFallback("invalid-input-shape")]),
|
|
1343
1363
|
strategy: "binary-unidentified-structured-fallback"
|
|
1344
1364
|
};
|
|
1345
1365
|
}
|
|
@@ -1354,37 +1374,79 @@ async function extractMarkdown(input, options) {
|
|
|
1354
1374
|
if (fb) {
|
|
1355
1375
|
return {
|
|
1356
1376
|
markdown: convertStructuredToMarkdown(fb, smOpts),
|
|
1357
|
-
warnings: mergeWarnings(warnings, fb.warnings),
|
|
1358
|
-
strategy: "docx-requires-node"
|
|
1377
|
+
warnings: mergeWarnings(warnings, fb.warnings, [traceUsedStructuredFallback("docx-requires-node")]),
|
|
1378
|
+
strategy: "docx-requires-node",
|
|
1379
|
+
routing: {
|
|
1380
|
+
detectedFormat: "docx",
|
|
1381
|
+
specializedPipeline: "none",
|
|
1382
|
+
usedStructuredFallback: true
|
|
1383
|
+
}
|
|
1359
1384
|
};
|
|
1360
1385
|
}
|
|
1361
|
-
return {
|
|
1386
|
+
return {
|
|
1387
|
+
markdown: "",
|
|
1388
|
+
warnings,
|
|
1389
|
+
strategy: "docx-requires-node",
|
|
1390
|
+
routing: {
|
|
1391
|
+
detectedFormat: "docx",
|
|
1392
|
+
specializedPipeline: "none",
|
|
1393
|
+
usedStructuredFallback: false
|
|
1394
|
+
}
|
|
1395
|
+
};
|
|
1362
1396
|
}
|
|
1363
1397
|
const r = await convertDocxToMarkdown(data, buildDocxOptions(options));
|
|
1398
|
+
const strategy = docxStrategyFromSource(r.source);
|
|
1364
1399
|
const w = mergeWarnings(
|
|
1365
1400
|
warnings,
|
|
1366
|
-
r.messages.map((m) => m.message)
|
|
1401
|
+
r.messages.map((m) => m.message),
|
|
1402
|
+
strategy === "docx-structured-fallback" ? [traceDocxStructuredAfterMammoth()] : []
|
|
1367
1403
|
);
|
|
1368
1404
|
return {
|
|
1369
1405
|
markdown: r.markdown,
|
|
1370
1406
|
warnings: w,
|
|
1371
|
-
strategy
|
|
1407
|
+
strategy,
|
|
1408
|
+
routing: {
|
|
1409
|
+
detectedFormat: "docx",
|
|
1410
|
+
specializedPipeline: "docx",
|
|
1411
|
+
usedStructuredFallback: strategy === "docx-structured-fallback"
|
|
1412
|
+
}
|
|
1372
1413
|
};
|
|
1373
1414
|
}
|
|
1374
1415
|
if (fmt === "pdf") {
|
|
1375
1416
|
const r = await convertPdfToMarkdown(data, buildPdfOptions(options));
|
|
1376
1417
|
const strategy = pdfStrategyFromResult(r);
|
|
1377
|
-
|
|
1418
|
+
let w = mergeWarnings(warnings, r.warnings);
|
|
1378
1419
|
if (strategy === "pdf-unsupported-runtime" && r.markdown === "" && fb) {
|
|
1420
|
+
w = mergeWarnings(w, fb.warnings, [
|
|
1421
|
+
tracePdfStructuredAfterUnsupportedRuntime()
|
|
1422
|
+
]);
|
|
1379
1423
|
return {
|
|
1380
1424
|
markdown: convertStructuredToMarkdown(fb, smOpts),
|
|
1381
|
-
warnings:
|
|
1382
|
-
|
|
1383
|
-
|
|
1384
|
-
|
|
1425
|
+
warnings: w,
|
|
1426
|
+
strategy: "pdf-structured-fallback",
|
|
1427
|
+
routing: {
|
|
1428
|
+
detectedFormat: "pdf",
|
|
1429
|
+
specializedPipeline: "pdf",
|
|
1430
|
+
usedStructuredFallback: true
|
|
1431
|
+
}
|
|
1385
1432
|
};
|
|
1386
1433
|
}
|
|
1387
|
-
|
|
1434
|
+
if (strategy === "pdf-structured-fallback") {
|
|
1435
|
+
w = mergeWarnings(w, [tracePdfStructuredAfterCognipeer()]);
|
|
1436
|
+
}
|
|
1437
|
+
if ((strategy === "pdf-cognipeer-failed" || strategy === "pdf-cognipeer-unavailable") && r.markdown.trim() === "" && !fb) {
|
|
1438
|
+
w = mergeWarnings(w, [tracePdfSpecializedDeadEnd()]);
|
|
1439
|
+
}
|
|
1440
|
+
return {
|
|
1441
|
+
markdown: r.markdown,
|
|
1442
|
+
warnings: w,
|
|
1443
|
+
strategy,
|
|
1444
|
+
routing: {
|
|
1445
|
+
detectedFormat: "pdf",
|
|
1446
|
+
specializedPipeline: "pdf",
|
|
1447
|
+
usedStructuredFallback: strategy === "pdf-structured-fallback"
|
|
1448
|
+
}
|
|
1449
|
+
};
|
|
1388
1450
|
}
|
|
1389
1451
|
warnings.push(
|
|
1390
1452
|
"@dragon708/docmind-markdown: Unidentified binary format (expected PDF magic or ZIP/DOCX). Using structured fallback if provided."
|
|
@@ -1392,11 +1454,25 @@ async function extractMarkdown(input, options) {
|
|
|
1392
1454
|
if (fb) {
|
|
1393
1455
|
return {
|
|
1394
1456
|
markdown: convertStructuredToMarkdown(fb, smOpts),
|
|
1395
|
-
warnings: mergeWarnings(warnings, fb.warnings),
|
|
1396
|
-
strategy: "binary-unidentified-structured-fallback"
|
|
1457
|
+
warnings: mergeWarnings(warnings, fb.warnings, [traceUsedStructuredFallback("binary-unidentified")]),
|
|
1458
|
+
strategy: "binary-unidentified-structured-fallback",
|
|
1459
|
+
routing: {
|
|
1460
|
+
detectedFormat: "unknown",
|
|
1461
|
+
specializedPipeline: "none",
|
|
1462
|
+
usedStructuredFallback: true
|
|
1463
|
+
}
|
|
1397
1464
|
};
|
|
1398
1465
|
}
|
|
1399
|
-
return {
|
|
1466
|
+
return {
|
|
1467
|
+
markdown: "",
|
|
1468
|
+
warnings,
|
|
1469
|
+
strategy: "binary-unidentified",
|
|
1470
|
+
routing: {
|
|
1471
|
+
detectedFormat: "unknown",
|
|
1472
|
+
specializedPipeline: "none",
|
|
1473
|
+
usedStructuredFallback: false
|
|
1474
|
+
}
|
|
1475
|
+
};
|
|
1400
1476
|
}
|
|
1401
1477
|
|
|
1402
1478
|
export { convertDocxBufferToMarkdown, convertDocxToMarkdown, convertPdfBufferToMarkdown, convertPdfPathToMarkdown, convertPdfToMarkdown, convertStructuredToLlmText, convertStructuredToMarkdown, detectBinaryFormat, extractLlmContent, extractMarkdown, extractStructuredChunks, isExtractMarkdownFileInput, renderLlmText, renderMarkdown, renderMarkdownSections, splitStructuredIntoChunks, structuredDocumentToLlmText, structuredDocumentToMarkdown };
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@dragon708/docmind-markdown",
|
|
3
|
-
"version": "1.2.
|
|
3
|
+
"version": "1.2.1",
|
|
4
4
|
"description": "StructuredDocumentResult → Markdown and LLM-oriented plain text for DocMind.",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"sideEffects": false,
|
|
@@ -35,8 +35,8 @@
|
|
|
35
35
|
],
|
|
36
36
|
"license": "MIT",
|
|
37
37
|
"dependencies": {
|
|
38
|
+
"@cognipeer/to-markdown": "^2.0.1",
|
|
38
39
|
"@dragon708/docmind-shared": "^1.2.0",
|
|
39
|
-
"@opendataloader/pdf": "^2.2.1",
|
|
40
40
|
"mammoth": "^1.6.0",
|
|
41
41
|
"turndown": "^7.0.0",
|
|
42
42
|
"turndown-plugin-gfm": "^1.0.2"
|