@dragon708/docmind-markdown 1.2.0 → 1.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.d.ts CHANGED
@@ -39,8 +39,8 @@ type StructuredToMarkdownOptions = ConvertStructuredToMarkdownOptions;
39
39
  /**
40
40
  * Converts a {@link StructuredDocumentResult} to readable, semantic Markdown (GFM-style tables).
41
41
  *
42
- * **Universal fallback** for DocMind: use when a format-specific pipeline (DOCX Mammoth, PDF OpenDataLoader, …)
43
- * does not apply or fails, and for OCR / image / plain-text flows that already populate this shape.
42
+ * **Universal fallback** for DocMind: use when a format-specific pipeline (DOCX Mammoth→Turndown, specialized PDF→Markdown
43
+ * via `@cognipeer/to-markdown` on Node, …) does not apply or fails, and for OCR / image / plain-text flows that already populate this shape.
44
44
  *
45
45
  * Uses `blocks` in order; resolves `table` / `image-ref` via `tables` and `images`. When blocks are
46
46
  * empty or yield no output, falls back to the rollup `text`. Optional sections use `pages`, `metadata`,
@@ -213,6 +213,13 @@ type RenderMarkdownSectionsOptions = SplitStructuredIntoChunksOptions;
213
213
  */
214
214
  declare function renderMarkdownSections(result: StructuredDocumentResult, options?: RenderMarkdownSectionsOptions): MarkdownSection[];
215
215
 
216
+ /**
217
+ * DOCX → Markdown: **Mammoth** (`convertToHtml` semantic HTML) → **Turndown** (ATX headings, lists, fenced code;
218
+ * **GFM tables** when {@link ConvertDocxToMarkdownOptions.includeTables} is true via `turndown-plugin-gfm`).
219
+ * Page breaks map through Mammoth `styleMap` → `<hr class="page-break">` → Turndown horizontal rules.
220
+ *
221
+ * Independent of the PDF → Markdown pipeline in this package (`pdf-markdown.ts`, `@cognipeer/to-markdown`).
222
+ */
216
223
  /**
217
224
  * Binary `.docx` payload accepted by {@link convertDocxToMarkdown}.
218
225
  */
@@ -316,8 +323,10 @@ declare function convertDocxToMarkdown(input: DocxToMarkdownInput, options?: Con
316
323
  declare function convertDocxBufferToMarkdown(input: DocxToMarkdownInput, options?: ConvertDocxToMarkdownOptions): Promise<DocxMarkdownResult>;
317
324
 
318
325
  /**
319
- * Options forwarded to `@opendataloader/pdf` `convert()`, except `format` and `toStdout` (set internally).
320
- * Shaped to match `ConvertOptions` from `@opendataloader/pdf` v2.x without a static type import.
326
+ * Legacy option bag: **ignored** by {@link convertPdfToMarkdown}. Retained so existing TypeScript callers and object
327
+ * literals that still spread old shapes remain assignable to {@link ConvertPdfToMarkdownOptions}.
328
+ *
329
+ * @deprecated PDF → Markdown uses `@cognipeer/to-markdown` only; these keys are not read.
321
330
  */
322
331
  interface OpenDataLoaderPdfConvertOptions {
323
332
  outputDir?: string;
@@ -346,11 +355,13 @@ interface OpenDataLoaderPdfConvertOptions {
346
355
  hybridFallback?: boolean;
347
356
  }
348
357
  /**
349
- * Options for {@link convertPdfToMarkdown}. OpenDataLoader fields are passed through; structured fields are local.
358
+ * Options for {@link convertPdfToMarkdown}. Properties from {@link OpenDataLoaderPdfConvertOptions} are accepted for
359
+ * compatibility but **ignored**. `fileName`, `forceExtension`, and `url` are forwarded to `@cognipeer/to-markdown`
360
+ * where applicable (see that package’s `ConverterOptions`).
350
361
  */
351
362
  type ConvertPdfToMarkdownOptions = OpenDataLoaderPdfConvertOptions & {
352
363
  /**
353
- * When the OpenDataLoader path fails, returns empty output, or `@opendataloader/pdf` cannot load,
364
+ * When the Cognipeer path fails, returns empty output, or `@cognipeer/to-markdown` cannot load,
354
365
  * call this to obtain {@link StructuredDocumentResult} (e.g. from another extractor) and serialize with
355
366
  * {@link convertStructuredToMarkdown}. Does not import other DocMind packages.
356
367
  */
@@ -362,19 +373,25 @@ type ConvertPdfToMarkdownOptions = OpenDataLoaderPdfConvertOptions & {
362
373
  * @default true
363
374
  */
364
375
  readonly cleanMarkdown?: boolean;
376
+ /** Forwarded to `@cognipeer/to-markdown` (useful for buffer inputs). */
377
+ readonly fileName?: string;
378
+ /** Forwarded to `@cognipeer/to-markdown`. */
379
+ readonly forceExtension?: string;
380
+ /** Forwarded to `@cognipeer/to-markdown`. */
381
+ readonly url?: string;
365
382
  };
366
383
  /** Input for {@link convertPdfToMarkdown}: filesystem path (Node) or PDF bytes. */
367
384
  type PdfToMarkdownInput = string | Buffer | Uint8Array | ArrayBuffer;
368
385
  /** Which pipeline produced {@link ConvertPdfToMarkdownResult.markdown}. */
369
- type PdfToMarkdownSource = "opendataloader" | "structured-fallback" | "unsupported-runtime"
370
- /** `@opendataloader/pdf` missing or failed to load (install dep; check bundler externals). */
371
- | "opendataloader-unavailable"
372
- /** Engine ran or was attempted but produced no usable Markdown (Java error, empty stdout, missing file, etc.). */
373
- | "opendataloader-failed";
386
+ type PdfToMarkdownSource = "cognipeer" | "structured-fallback" | "unsupported-runtime"
387
+ /** `@cognipeer/to-markdown` missing or failed to load (install dep; check bundler externals). */
388
+ | "cognipeer-unavailable"
389
+ /** Engine ran or was attempted but produced no usable Markdown (error, empty output, missing file, etc.). */
390
+ | "cognipeer-failed";
374
391
  type PdfToMarkdownFallbackReason = "unsupported-runtime" | "error" | "empty" | "module-not-found";
375
392
  interface ConvertPdfToMarkdownResult {
376
393
  readonly markdown: string;
377
- /** Human-readable issues (runtime, missing module, Java/PDF errors, empty output, fallback errors). */
394
+ /** Human-readable issues (runtime, missing module, conversion errors, empty output, fallback errors). */
378
395
  readonly warnings: readonly string[];
379
396
  readonly source: PdfToMarkdownSource;
380
397
  readonly fallbackReason?: PdfToMarkdownFallbackReason;
@@ -384,21 +401,25 @@ interface PdfMarkdownResult {
384
401
  readonly markdown: string;
385
402
  }
386
403
  /**
387
- * Primary API: PDF path or bytes → Markdown via `@opendataloader/pdf` on Node, with clear warnings and optional
404
+ * Primary API: PDF path or bytes → Markdown via `@cognipeer/to-markdown` on Node, with clear warnings and optional
388
405
  * structured fallback. In non-Node runtimes returns an empty `markdown` and {@link PdfToMarkdownSource} `unsupported-runtime`
389
- * without loading `@opendataloader/pdf`.
406
+ * without loading `@cognipeer/to-markdown`.
407
+ *
408
+ * If the specialized conversion fails or returns empty Markdown and {@link ConvertPdfToMarkdownOptions.resolveStructured} /
409
+ * `extractMarkdown`’s `structuredFallback` is set, output comes from {@link convertStructuredToMarkdown} instead
410
+ * (`extractMarkdown` reports strategy `pdf-structured-fallback`; this result uses `source` `structured-fallback`).
390
411
  */
391
412
  declare function convertPdfToMarkdown(input: PdfToMarkdownInput, options?: ConvertPdfToMarkdownOptions): Promise<ConvertPdfToMarkdownResult>;
392
413
  /**
393
- * **Node only.** PDF file path → Markdown via `@opendataloader/pdf` (`format: "markdown"`, `toStdout: true`).
414
+ * **Node only.** PDF file path → Markdown via `@cognipeer/to-markdown`.
394
415
  *
395
- * Throws if not Node, if conversion yields no Markdown (and no structured fallback), or on OpenDataLoader errors
416
+ * Throws if not Node, if conversion yields no Markdown (and no structured fallback), or on conversion errors
396
417
  * when no fallback is configured — same contract as before {@link convertPdfToMarkdown} existed.
397
418
  */
398
419
  declare function convertPdfPathToMarkdown(inputPath: string, options?: ConvertPdfToMarkdownOptions): Promise<PdfMarkdownResult>;
399
420
  /**
400
421
  * **Node only.** Same pipeline as {@link convertPdfPathToMarkdown}, but writes bytes to a temporary `.pdf`
401
- * under the system temp directory (OpenDataLoader expects a file path).
422
+ * then converts with `@cognipeer/to-markdown`.
402
423
  */
403
424
  declare function convertPdfBufferToMarkdown(input: Buffer | Uint8Array | ArrayBuffer, options?: ConvertPdfToMarkdownOptions): Promise<PdfMarkdownResult>;
404
425
 
@@ -437,21 +458,38 @@ type ExtractMarkdownOptions = ConvertStructuredToMarkdownOptions & {
437
458
  /** Overrides merged into {@link convertPdfToMarkdown} when the input is identified as `.pdf`. */
438
459
  readonly pdf?: ConvertPdfToMarkdownOptions;
439
460
  };
461
+ /** @see {@link detectBinaryFormat} */
462
+ type DetectedBinaryFormat = "docx" | "pdf" | "unknown";
440
463
  /** Which branch produced {@link ExtractMarkdownResult.markdown}. */
441
- type ExtractMarkdownStrategy = "structured" | "docx-mammoth" | "docx-structured-fallback" | "pdf-opendataloader"
442
- /** `@opendataloader/pdf` not loadable (missing package, bundler, etc.). */
443
- | "pdf-opendataloader-unavailable"
464
+ type ExtractMarkdownStrategy = "structured" | "docx-mammoth" | "docx-structured-fallback" | "pdf-cognipeer-specialized"
465
+ /** `@cognipeer/to-markdown` not loadable (missing package, bundler, etc.). */
466
+ | "pdf-cognipeer-unavailable"
444
467
  /** Engine failed or returned empty Markdown; no structured fallback or fallback also failed. */
445
- | "pdf-opendataloader-failed" | "pdf-structured-fallback" | "pdf-unsupported-runtime" | "docx-requires-node" | "path-requires-node" | "binary-unidentified" | "binary-unidentified-structured-fallback";
468
+ | "pdf-cognipeer-failed" | "pdf-structured-fallback" | "pdf-unsupported-runtime" | "docx-requires-node" | "path-requires-node" | "binary-unidentified" | "binary-unidentified-structured-fallback";
469
+ /**
470
+ * Format detection and fallback bookkeeping for file/path inputs to {@link extractMarkdown}.
471
+ * Omitted when `input` is a {@link StructuredDocumentResult} (already structured).
472
+ */
473
+ interface ExtractMarkdownRoutingInfo {
474
+ readonly detectedFormat: DetectedBinaryFormat;
475
+ /** Specialized binary pipeline invoked first on Node (`none` if not applicable). */
476
+ readonly specializedPipeline: "docx" | "pdf" | "none";
477
+ /** Final Markdown came from {@link ExtractMarkdownOptions.structuredFallback} (or the same snapshot via `resolveStructured`). */
478
+ readonly usedStructuredFallback: boolean;
479
+ }
446
480
  interface ExtractMarkdownResult {
447
481
  readonly markdown: string;
448
482
  /** Merged pipeline warnings (conversion, runtime, and optional {@link StructuredDocumentResult.warnings}). */
449
483
  readonly warnings: readonly string[];
450
484
  readonly strategy: ExtractMarkdownStrategy;
485
+ /**
486
+ * Present for `{ data, … }` / `{ path, … }` flows after sniffing bytes (or when returning early without reading).
487
+ * Absent for direct structured input.
488
+ */
489
+ readonly routing?: ExtractMarkdownRoutingInfo;
451
490
  }
452
491
  /** Type guard: file-like `{ data: … }` input for {@link extractMarkdown}. */
453
492
  declare function isExtractMarkdownFileInput(value: unknown): value is ExtractMarkdownFileInput;
454
- type DetectedBinaryFormat = "docx" | "pdf" | "unknown";
455
493
  /** Detect PDF / OOXML zip (DOCX) from magic bytes and optional filename / MIME. */
456
494
  declare function detectBinaryFormat(data: Buffer | Uint8Array | ArrayBuffer, filename?: string, mimeType?: string): DetectedBinaryFormat;
457
495
  /**
@@ -460,8 +498,11 @@ declare function detectBinaryFormat(data: Buffer | Uint8Array | ArrayBuffer, fil
460
498
  *
461
499
  * - **Structured input** — always uses the structured serializer (image/OCR/text/PDF/DOCX blocks already normalized).
462
500
  * - **DOCX bytes / path** — {@link convertDocxToMarkdown} on Node; otherwise warns and uses {@link ExtractMarkdownOptions.structuredFallback} if provided.
463
- * - **PDF bytes / path** — {@link convertPdfToMarkdown} (`@opendataloader/pdf` on Node when Java is available); in non-Node runtimes returns empty Markdown with warnings unless {@link ExtractMarkdownOptions.structuredFallback} supplies content.
501
+ * - **PDF bytes / path** — {@link convertPdfToMarkdown} (`@cognipeer/to-markdown` on Node); in non-Node runtimes returns empty Markdown with warnings unless {@link ExtractMarkdownOptions.structuredFallback} supplies content.
502
+ *
503
+ * Any use of `structuredFallback` (or the same snapshot through `resolveStructured`) adds a tagged line in {@link ExtractMarkdownResult.warnings}
504
+ * and sets {@link ExtractMarkdownRoutingInfo.usedStructuredFallback} when {@link ExtractMarkdownResult.routing} is present.
464
505
  */
465
506
  declare function extractMarkdown(input: ExtractMarkdownInput, options?: ExtractMarkdownOptions): Promise<ExtractMarkdownResult>;
466
507
 
467
- export { type ConvertDocxToMarkdownOptions, type ConvertDocxToMarkdownResult, type ConvertPdfToMarkdownOptions, type ConvertPdfToMarkdownResult, type ConvertStructuredToLlmTextOptions, type ConvertStructuredToMarkdownOptions, type DetectedBinaryFormat, type DocxMarkdownMessage, type DocxMarkdownResult, type DocxToMarkdownInput, type DocxToMarkdownSource, type ExtractMarkdownFileInput, type ExtractMarkdownInput, type ExtractMarkdownOptions, type ExtractMarkdownPathInput, type ExtractMarkdownResult, type ExtractMarkdownStrategy, type MammothConvertToHtmlOptions, type MarkdownSection, type OpenDataLoaderPdfConvertOptions, type PdfMarkdownResult, type PdfToMarkdownFallbackReason, type PdfToMarkdownInput, type PdfToMarkdownSource, type RenderLlmTextOptions, type RenderMarkdownOptions, type RenderMarkdownSectionsOptions, type SplitStructuredIntoChunksOptions, type StructuredChunk, type StructuredToLlmTextOptions, type StructuredToMarkdownOptions, type TurndownServiceOptions, convertDocxBufferToMarkdown, convertDocxToMarkdown, convertPdfBufferToMarkdown, convertPdfPathToMarkdown, convertPdfToMarkdown, convertStructuredToLlmText, convertStructuredToMarkdown, detectBinaryFormat, extractLlmContent, extractMarkdown, extractStructuredChunks, isExtractMarkdownFileInput, renderLlmText, renderMarkdown, renderMarkdownSections, splitStructuredIntoChunks, structuredDocumentToLlmText, structuredDocumentToMarkdown };
508
+ export { type ConvertDocxToMarkdownOptions, type ConvertDocxToMarkdownResult, type ConvertPdfToMarkdownOptions, type ConvertPdfToMarkdownResult, type ConvertStructuredToLlmTextOptions, type ConvertStructuredToMarkdownOptions, type DetectedBinaryFormat, type DocxMarkdownMessage, type DocxMarkdownResult, type DocxToMarkdownInput, type DocxToMarkdownSource, type ExtractMarkdownFileInput, type ExtractMarkdownInput, type ExtractMarkdownOptions, type ExtractMarkdownPathInput, type ExtractMarkdownResult, type ExtractMarkdownRoutingInfo, type ExtractMarkdownStrategy, type MammothConvertToHtmlOptions, type MarkdownSection, type OpenDataLoaderPdfConvertOptions, type PdfMarkdownResult, type PdfToMarkdownFallbackReason, type PdfToMarkdownInput, type PdfToMarkdownSource, type RenderLlmTextOptions, type RenderMarkdownOptions, type RenderMarkdownSectionsOptions, type SplitStructuredIntoChunksOptions, type StructuredChunk, type StructuredToLlmTextOptions, type StructuredToMarkdownOptions, type TurndownServiceOptions, convertDocxBufferToMarkdown, convertDocxToMarkdown, convertPdfBufferToMarkdown, convertPdfPathToMarkdown, convertPdfToMarkdown, convertStructuredToLlmText, convertStructuredToMarkdown, detectBinaryFormat, extractLlmContent, extractMarkdown, extractStructuredChunks, isExtractMarkdownFileInput, renderLlmText, renderMarkdown, renderMarkdownSections, splitStructuredIntoChunks, structuredDocumentToLlmText, structuredDocumentToMarkdown };
package/dist/index.js CHANGED
@@ -993,38 +993,27 @@ async function convertDocxBufferToMarkdown(input, options) {
993
993
  }
994
994
 
995
995
  // src/pdf-markdown.ts
996
- var BROWSER_WARNING = "@dragon708/docmind-markdown: PDF \u2192 Markdown via @opendataloader/pdf requires Node.js. In the browser, use a server-side conversion or supply structured text/Markdown from your backend.";
996
+ var BROWSER_WARNING = "@dragon708/docmind-markdown: PDF \u2192 Markdown via @cognipeer/to-markdown requires Node.js. In the browser, use a server-side conversion or supply structured text/Markdown from your backend.";
997
+ var COGNIPEER_WARN_TAG = "[docmind-markdown:pdf] pdf-cognipeer-specialized:";
997
998
  function normalizePdfMarkdown(markdown, clean) {
998
999
  const t = markdown.trim();
999
1000
  if (!clean) return t;
1000
1001
  return t.replace(/\n{3,}/g, "\n\n");
1001
1002
  }
1002
- function normalizeConvertStdout(out) {
1003
- if (typeof out === "string") return out;
1004
- if (typeof Buffer !== "undefined" && Buffer.isBuffer(out)) {
1005
- return out.toString("utf8");
1006
- }
1007
- if (out instanceof Uint8Array) {
1008
- return typeof Buffer !== "undefined" ? Buffer.from(out).toString("utf8") : new TextDecoder("utf8", { fatal: false }).decode(out);
1009
- }
1010
- if (out == null) return "";
1011
- return String(out);
1012
- }
1013
- var FALLBACK_WARN_PREFIX = "[docmind-markdown:pdf] structured-fallback: serializing StructuredDocumentResult to Markdown because";
1003
+ var FALLBACK_WARN_PREFIX = "[docmind-markdown:pdf] pdf-structured-fallback: serializing StructuredDocumentResult to Markdown because";
1014
1004
  function structuredFallbackWarnings(reason, detail) {
1015
- const tail = reason === "module-not-found" ? "@opendataloader/pdf could not be loaded." : reason === "error" ? "the specialized PDF engine raised an error or rejected the input." : reason === "empty" ? "the specialized engine returned empty Markdown." : "the specialized path is unavailable in this runtime.";
1005
+ const tail = reason === "module-not-found" ? "@cognipeer/to-markdown could not be loaded." : reason === "error" ? "the specialized PDF engine raised an error or rejected the input." : reason === "empty" ? "the specialized engine returned empty Markdown." : "the specialized path is unavailable in this runtime.";
1016
1006
  const extra = detail ? ` (${detail})` : "";
1017
1007
  return [`${FALLBACK_WARN_PREFIX} ${tail}${extra}`];
1018
1008
  }
1019
- function engineOptions(options) {
1009
+ function cognipeerConverterOptions(options) {
1020
1010
  if (!options) return {};
1021
- const {
1022
- resolveStructured: _r,
1023
- structuredMarkdown: _s,
1024
- cleanMarkdown: _c,
1025
- ...rest
1026
- } = options;
1027
- return rest;
1011
+ const { fileName, forceExtension, url } = options;
1012
+ const o = {};
1013
+ if (fileName !== void 0) o.fileName = fileName;
1014
+ if (forceExtension !== void 0) o.forceExtension = forceExtension;
1015
+ if (url !== void 0) o.url = url;
1016
+ return o;
1028
1017
  }
1029
1018
  async function toNodeBuffer2(input) {
1030
1019
  const { Buffer: Buffer2 } = await importEsm("node:buffer");
@@ -1032,11 +1021,17 @@ async function toNodeBuffer2(input) {
1032
1021
  if (input instanceof ArrayBuffer) return Buffer2.from(input);
1033
1022
  return Buffer2.from(input);
1034
1023
  }
1024
+ async function loadCognipeerConvertToMarkdown() {
1025
+ const { createRequire } = await importEsm("node:module");
1026
+ const require2 = createRequire(import.meta.url);
1027
+ const mod = require2("@cognipeer/to-markdown");
1028
+ return mod.convertToMarkdown;
1029
+ }
1035
1030
  async function convertPdfToMarkdown(input, options) {
1036
1031
  const clean = options?.cleanMarkdown !== false;
1037
1032
  const resolveStructured = options?.resolveStructured;
1038
1033
  const structuredMdOpts = options?.structuredMarkdown;
1039
- const eng = engineOptions(options);
1034
+ const cognipeerOpts = cognipeerConverterOptions(options);
1040
1035
  if (!isNodeRuntime()) {
1041
1036
  return {
1042
1037
  markdown: "",
@@ -1063,15 +1058,13 @@ async function convertPdfToMarkdown(input, options) {
1063
1058
  await writeFile(inputPath, buffer);
1064
1059
  cleanup = async () => rm(dir, { recursive: true, force: true });
1065
1060
  }
1066
- let convert;
1061
+ let convertToMarkdown;
1067
1062
  try {
1068
- ({ convert } = await importEsm(
1069
- "@opendataloader/pdf"
1070
- ));
1063
+ convertToMarkdown = await loadCognipeerConvertToMarkdown();
1071
1064
  } catch (e) {
1072
- const hint = e instanceof Error && /Cannot find module|MODULE_NOT_FOUND/i.test(e.message) ? " Install `@opendataloader/pdf` in your project." : "";
1065
+ const hint = e instanceof Error && /Cannot find module|MODULE_NOT_FOUND/i.test(e.message) ? " Install `@cognipeer/to-markdown` in your project." : "";
1073
1066
  warnings.push(
1074
- `@opendataloader/pdf could not be loaded (${e instanceof Error ? e.message : String(e)}).${hint}`
1067
+ `${COGNIPEER_WARN_TAG} package could not be loaded (${e instanceof Error ? e.message : String(e)}).${hint}`
1075
1068
  );
1076
1069
  if (resolveStructured) {
1077
1070
  try {
@@ -1098,22 +1091,16 @@ async function convertPdfToMarkdown(input, options) {
1098
1091
  return {
1099
1092
  markdown: "",
1100
1093
  warnings,
1101
- source: "opendataloader-unavailable",
1094
+ source: "cognipeer-unavailable",
1102
1095
  fallbackReason: "module-not-found"
1103
1096
  };
1104
1097
  }
1105
1098
  let rawMarkdown;
1106
1099
  try {
1107
- const out = await convert(inputPath, {
1108
- ...eng,
1109
- format: "markdown",
1110
- toStdout: true,
1111
- quiet: eng.quiet !== false
1112
- });
1113
- rawMarkdown = normalizeConvertStdout(out);
1100
+ rawMarkdown = await convertToMarkdown(inputPath, cognipeerOpts);
1114
1101
  } catch (e) {
1115
1102
  const msg = e instanceof Error ? e.message : String(e);
1116
- warnings.push(`PDF conversion failed (@opendataloader/pdf): ${msg}`);
1103
+ warnings.push(`${COGNIPEER_WARN_TAG} ${msg}`);
1117
1104
  if (resolveStructured) {
1118
1105
  try {
1119
1106
  const structured = await resolveStructured();
@@ -1139,14 +1126,17 @@ async function convertPdfToMarkdown(input, options) {
1139
1126
  return {
1140
1127
  markdown: "",
1141
1128
  warnings,
1142
- source: "opendataloader-failed",
1129
+ source: "cognipeer-failed",
1143
1130
  fallbackReason: "error"
1144
1131
  };
1145
1132
  }
1146
- let markdown = normalizePdfMarkdown(rawMarkdown, clean);
1133
+ let markdown = normalizePdfMarkdown(
1134
+ typeof rawMarkdown === "string" ? rawMarkdown : String(rawMarkdown ?? ""),
1135
+ clean
1136
+ );
1147
1137
  if (markdown.length === 0) {
1148
1138
  warnings.push(
1149
- "OpenDataLoader returned empty Markdown for this PDF (stdout empty or whitespace-only after normalize)."
1139
+ `${COGNIPEER_WARN_TAG} returned empty Markdown for this PDF (whitespace-only after normalize).`
1150
1140
  );
1151
1141
  if (resolveStructured) {
1152
1142
  try {
@@ -1170,11 +1160,11 @@ async function convertPdfToMarkdown(input, options) {
1170
1160
  return {
1171
1161
  markdown: "",
1172
1162
  warnings,
1173
- source: "opendataloader-failed",
1163
+ source: "cognipeer-failed",
1174
1164
  fallbackReason: "empty"
1175
1165
  };
1176
1166
  }
1177
- return { markdown, warnings, source: "opendataloader" };
1167
+ return { markdown, warnings, source: "cognipeer" };
1178
1168
  } finally {
1179
1169
  if (cleanup) {
1180
1170
  await cleanup().catch(() => {
@@ -1193,13 +1183,13 @@ function throwIfLegacyFailure(r) {
1193
1183
  }
1194
1184
  }
1195
1185
  async function convertPdfPathToMarkdown(inputPath, options) {
1196
- assertNodeRuntime("PDF \u2192 Markdown (@opendataloader/pdf)");
1186
+ assertNodeRuntime("PDF \u2192 Markdown (@cognipeer/to-markdown)");
1197
1187
  const r = await convertPdfToMarkdown(inputPath, options);
1198
1188
  throwIfLegacyFailure(r);
1199
1189
  return { markdown: r.markdown };
1200
1190
  }
1201
1191
  async function convertPdfBufferToMarkdown(input, options) {
1202
- assertNodeRuntime("PDF \u2192 Markdown (@opendataloader/pdf)");
1192
+ assertNodeRuntime("PDF \u2192 Markdown (@cognipeer/to-markdown)");
1203
1193
  const r = await convertPdfToMarkdown(input, options);
1204
1194
  throwIfLegacyFailure(r);
1205
1195
  return { markdown: r.markdown };
@@ -1273,13 +1263,13 @@ function pdfStrategyFromResult(r) {
1273
1263
  return "pdf-structured-fallback";
1274
1264
  case "unsupported-runtime":
1275
1265
  return "pdf-unsupported-runtime";
1276
- case "opendataloader-unavailable":
1277
- return "pdf-opendataloader-unavailable";
1278
- case "opendataloader-failed":
1279
- return "pdf-opendataloader-failed";
1280
- case "opendataloader":
1266
+ case "cognipeer-unavailable":
1267
+ return "pdf-cognipeer-unavailable";
1268
+ case "cognipeer-failed":
1269
+ return "pdf-cognipeer-failed";
1270
+ case "cognipeer":
1281
1271
  default:
1282
- return "pdf-opendataloader";
1272
+ return "pdf-cognipeer-specialized";
1283
1273
  }
1284
1274
  }
1285
1275
  function mergeWarnings(base, ...more) {
@@ -1289,6 +1279,22 @@ function mergeWarnings(base, ...more) {
1289
1279
  }
1290
1280
  return out;
1291
1281
  }
1282
+ var EXTRACT_WARN = "[docmind-markdown:extractMarkdown]";
1283
+ function traceUsedStructuredFallback(context) {
1284
+ return `${EXTRACT_WARN} ${context}: final Markdown from structuredFallback (specialized route unavailable, failed, or insufficient).`;
1285
+ }
1286
+ function tracePdfStructuredAfterUnsupportedRuntime() {
1287
+ return `${EXTRACT_WARN} pdf-unsupported-runtime: final Markdown from structuredFallback \u2014 @cognipeer/to-markdown cannot run in this environment.`;
1288
+ }
1289
+ function traceDocxStructuredAfterMammoth() {
1290
+ return `${EXTRACT_WARN} docx-structured-fallback: final Markdown from structured envelope after Mammoth/Turndown did not yield the result.`;
1291
+ }
1292
+ function tracePdfStructuredAfterCognipeer() {
1293
+ return `${EXTRACT_WARN} pdf-structured-fallback: final Markdown from structured envelope after Cognipeer PDF path did not yield the result.`;
1294
+ }
1295
+ function tracePdfSpecializedDeadEnd() {
1296
+ return `${EXTRACT_WARN} pdf: Cognipeer specialized route did not produce Markdown and structuredFallback was not provided.`;
1297
+ }
1292
1298
  async function extractMarkdown(input, options) {
1293
1299
  const smOpts = pickStructuredMarkdownOptions(options);
1294
1300
  const fb = options?.structuredFallback;
@@ -1315,11 +1321,25 @@ async function extractMarkdown(input, options) {
1315
1321
  if (fb) {
1316
1322
  return {
1317
1323
  markdown: convertStructuredToMarkdown(fb, smOpts),
1318
- warnings: mergeWarnings(warnings, fb.warnings),
1319
- strategy: "path-requires-node"
1324
+ warnings: mergeWarnings(warnings, fb.warnings, [traceUsedStructuredFallback("path-requires-node")]),
1325
+ strategy: "path-requires-node",
1326
+ routing: {
1327
+ detectedFormat: "unknown",
1328
+ specializedPipeline: "none",
1329
+ usedStructuredFallback: true
1330
+ }
1320
1331
  };
1321
1332
  }
1322
- return { markdown: "", warnings, strategy: "path-requires-node" };
1333
+ return {
1334
+ markdown: "",
1335
+ warnings,
1336
+ strategy: "path-requires-node",
1337
+ routing: {
1338
+ detectedFormat: "unknown",
1339
+ specializedPipeline: "none",
1340
+ usedStructuredFallback: false
1341
+ }
1342
+ };
1323
1343
  }
1324
1344
  const { readFile } = await importEsm(
1325
1345
  "node:fs/promises"
@@ -1339,7 +1359,7 @@ async function extractMarkdown(input, options) {
1339
1359
  if (fb) {
1340
1360
  return {
1341
1361
  markdown: convertStructuredToMarkdown(fb, smOpts),
1342
- warnings: mergeWarnings(warnings, fb.warnings),
1362
+ warnings: mergeWarnings(warnings, fb.warnings, [traceUsedStructuredFallback("invalid-input-shape")]),
1343
1363
  strategy: "binary-unidentified-structured-fallback"
1344
1364
  };
1345
1365
  }
@@ -1354,37 +1374,79 @@ async function extractMarkdown(input, options) {
1354
1374
  if (fb) {
1355
1375
  return {
1356
1376
  markdown: convertStructuredToMarkdown(fb, smOpts),
1357
- warnings: mergeWarnings(warnings, fb.warnings),
1358
- strategy: "docx-requires-node"
1377
+ warnings: mergeWarnings(warnings, fb.warnings, [traceUsedStructuredFallback("docx-requires-node")]),
1378
+ strategy: "docx-requires-node",
1379
+ routing: {
1380
+ detectedFormat: "docx",
1381
+ specializedPipeline: "none",
1382
+ usedStructuredFallback: true
1383
+ }
1359
1384
  };
1360
1385
  }
1361
- return { markdown: "", warnings, strategy: "docx-requires-node" };
1386
+ return {
1387
+ markdown: "",
1388
+ warnings,
1389
+ strategy: "docx-requires-node",
1390
+ routing: {
1391
+ detectedFormat: "docx",
1392
+ specializedPipeline: "none",
1393
+ usedStructuredFallback: false
1394
+ }
1395
+ };
1362
1396
  }
1363
1397
  const r = await convertDocxToMarkdown(data, buildDocxOptions(options));
1398
+ const strategy = docxStrategyFromSource(r.source);
1364
1399
  const w = mergeWarnings(
1365
1400
  warnings,
1366
- r.messages.map((m) => m.message)
1401
+ r.messages.map((m) => m.message),
1402
+ strategy === "docx-structured-fallback" ? [traceDocxStructuredAfterMammoth()] : []
1367
1403
  );
1368
1404
  return {
1369
1405
  markdown: r.markdown,
1370
1406
  warnings: w,
1371
- strategy: docxStrategyFromSource(r.source)
1407
+ strategy,
1408
+ routing: {
1409
+ detectedFormat: "docx",
1410
+ specializedPipeline: "docx",
1411
+ usedStructuredFallback: strategy === "docx-structured-fallback"
1412
+ }
1372
1413
  };
1373
1414
  }
1374
1415
  if (fmt === "pdf") {
1375
1416
  const r = await convertPdfToMarkdown(data, buildPdfOptions(options));
1376
1417
  const strategy = pdfStrategyFromResult(r);
1377
- const w = mergeWarnings(warnings, r.warnings);
1418
+ let w = mergeWarnings(warnings, r.warnings);
1378
1419
  if (strategy === "pdf-unsupported-runtime" && r.markdown === "" && fb) {
1420
+ w = mergeWarnings(w, fb.warnings, [
1421
+ tracePdfStructuredAfterUnsupportedRuntime()
1422
+ ]);
1379
1423
  return {
1380
1424
  markdown: convertStructuredToMarkdown(fb, smOpts),
1381
- warnings: mergeWarnings(w, fb.warnings, [
1382
- "extractMarkdown: PDF specialized pipeline is unavailable in this runtime; used structuredFallback (same envelope as extractStructuredData)."
1383
- ]),
1384
- strategy: "pdf-structured-fallback"
1425
+ warnings: w,
1426
+ strategy: "pdf-structured-fallback",
1427
+ routing: {
1428
+ detectedFormat: "pdf",
1429
+ specializedPipeline: "pdf",
1430
+ usedStructuredFallback: true
1431
+ }
1385
1432
  };
1386
1433
  }
1387
- return { markdown: r.markdown, warnings: w, strategy };
1434
+ if (strategy === "pdf-structured-fallback") {
1435
+ w = mergeWarnings(w, [tracePdfStructuredAfterCognipeer()]);
1436
+ }
1437
+ if ((strategy === "pdf-cognipeer-failed" || strategy === "pdf-cognipeer-unavailable") && r.markdown.trim() === "" && !fb) {
1438
+ w = mergeWarnings(w, [tracePdfSpecializedDeadEnd()]);
1439
+ }
1440
+ return {
1441
+ markdown: r.markdown,
1442
+ warnings: w,
1443
+ strategy,
1444
+ routing: {
1445
+ detectedFormat: "pdf",
1446
+ specializedPipeline: "pdf",
1447
+ usedStructuredFallback: strategy === "pdf-structured-fallback"
1448
+ }
1449
+ };
1388
1450
  }
1389
1451
  warnings.push(
1390
1452
  "@dragon708/docmind-markdown: Unidentified binary format (expected PDF magic or ZIP/DOCX). Using structured fallback if provided."
@@ -1392,11 +1454,25 @@ async function extractMarkdown(input, options) {
1392
1454
  if (fb) {
1393
1455
  return {
1394
1456
  markdown: convertStructuredToMarkdown(fb, smOpts),
1395
- warnings: mergeWarnings(warnings, fb.warnings),
1396
- strategy: "binary-unidentified-structured-fallback"
1457
+ warnings: mergeWarnings(warnings, fb.warnings, [traceUsedStructuredFallback("binary-unidentified")]),
1458
+ strategy: "binary-unidentified-structured-fallback",
1459
+ routing: {
1460
+ detectedFormat: "unknown",
1461
+ specializedPipeline: "none",
1462
+ usedStructuredFallback: true
1463
+ }
1397
1464
  };
1398
1465
  }
1399
- return { markdown: "", warnings, strategy: "binary-unidentified" };
1466
+ return {
1467
+ markdown: "",
1468
+ warnings,
1469
+ strategy: "binary-unidentified",
1470
+ routing: {
1471
+ detectedFormat: "unknown",
1472
+ specializedPipeline: "none",
1473
+ usedStructuredFallback: false
1474
+ }
1475
+ };
1400
1476
  }
1401
1477
 
1402
1478
  export { convertDocxBufferToMarkdown, convertDocxToMarkdown, convertPdfBufferToMarkdown, convertPdfPathToMarkdown, convertPdfToMarkdown, convertStructuredToLlmText, convertStructuredToMarkdown, detectBinaryFormat, extractLlmContent, extractMarkdown, extractStructuredChunks, isExtractMarkdownFileInput, renderLlmText, renderMarkdown, renderMarkdownSections, splitStructuredIntoChunks, structuredDocumentToLlmText, structuredDocumentToMarkdown };
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@dragon708/docmind-markdown",
3
- "version": "1.2.0",
3
+ "version": "1.2.1",
4
4
  "description": "StructuredDocumentResult → Markdown and LLM-oriented plain text for DocMind.",
5
5
  "type": "module",
6
6
  "sideEffects": false,
@@ -35,8 +35,8 @@
35
35
  ],
36
36
  "license": "MIT",
37
37
  "dependencies": {
38
+ "@cognipeer/to-markdown": "^2.0.1",
38
39
  "@dragon708/docmind-shared": "^1.2.0",
39
- "@opendataloader/pdf": "^2.2.1",
40
40
  "mammoth": "^1.6.0",
41
41
  "turndown": "^7.0.0",
42
42
  "turndown-plugin-gfm": "^1.0.2"