@dragon708/docmind-markdown 1.1.3 → 1.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.d.ts CHANGED
@@ -39,8 +39,8 @@ type StructuredToMarkdownOptions = ConvertStructuredToMarkdownOptions;
39
39
  /**
40
40
  * Converts a {@link StructuredDocumentResult} to readable, semantic Markdown (GFM-style tables).
41
41
  *
42
- * **Universal fallback** for DocMind: use when a format-specific pipeline (DOCX Mammoth, PDF OpenDataLoader, …)
43
- * does not apply or fails, and for OCR / image / plain-text flows that already populate this shape.
42
+ * **Universal fallback** for DocMind: use when a format-specific pipeline (DOCX Mammoth→Turndown, specialized PDF→Markdown
43
+ * via `@cognipeer/to-markdown` on Node, …) does not apply or fails, and for OCR / image / plain-text flows that already populate this shape.
44
44
  *
45
45
  * Uses `blocks` in order; resolves `table` / `image-ref` via `tables` and `images`. When blocks are
46
46
  * empty or yield no output, falls back to the rollup `text`. Optional sections use `pages`, `metadata`,
@@ -213,6 +213,13 @@ type RenderMarkdownSectionsOptions = SplitStructuredIntoChunksOptions;
213
213
  */
214
214
  declare function renderMarkdownSections(result: StructuredDocumentResult, options?: RenderMarkdownSectionsOptions): MarkdownSection[];
215
215
 
216
+ /**
217
+ * DOCX → Markdown: **Mammoth** (`convertToHtml` semantic HTML) → **Turndown** (ATX headings, lists, fenced code;
218
+ * **GFM tables** when {@link ConvertDocxToMarkdownOptions.includeTables} is true via `turndown-plugin-gfm`).
219
+ * Page breaks map through Mammoth `styleMap` → `<hr class="page-break">` → Turndown horizontal rules.
220
+ *
221
+ * Independent of the PDF → Markdown pipeline in this package (`pdf-markdown.ts`, `@cognipeer/to-markdown`).
222
+ */
216
223
  /**
217
224
  * Binary `.docx` payload accepted by {@link convertDocxToMarkdown}.
218
225
  */
@@ -316,8 +323,10 @@ declare function convertDocxToMarkdown(input: DocxToMarkdownInput, options?: Con
316
323
  declare function convertDocxBufferToMarkdown(input: DocxToMarkdownInput, options?: ConvertDocxToMarkdownOptions): Promise<DocxMarkdownResult>;
317
324
 
318
325
  /**
319
- * Options forwarded to `@opendataloader/pdf` `convert()`, except `format` and `toStdout` (set internally).
320
- * Shaped to match `ConvertOptions` from `@opendataloader/pdf` v2.x without a static type import.
326
+ * Legacy option bag: **ignored** by {@link convertPdfToMarkdown}. Retained so existing TypeScript callers and object
327
+ * literals that still spread old shapes remain assignable to {@link ConvertPdfToMarkdownOptions}.
328
+ *
329
+ * @deprecated PDF → Markdown uses `@cognipeer/to-markdown` only; these keys are not read.
321
330
  */
322
331
  interface OpenDataLoaderPdfConvertOptions {
323
332
  outputDir?: string;
@@ -346,11 +355,13 @@ interface OpenDataLoaderPdfConvertOptions {
346
355
  hybridFallback?: boolean;
347
356
  }
348
357
  /**
349
- * Options for {@link convertPdfToMarkdown}. OpenDataLoader fields are passed through; structured fields are local.
358
+ * Options for {@link convertPdfToMarkdown}. Properties from {@link OpenDataLoaderPdfConvertOptions} are accepted for
359
+ * compatibility but **ignored**. `fileName`, `forceExtension`, and `url` are forwarded to `@cognipeer/to-markdown`
360
+ * where applicable (see that package’s `ConverterOptions`).
350
361
  */
351
362
  type ConvertPdfToMarkdownOptions = OpenDataLoaderPdfConvertOptions & {
352
363
  /**
353
- * When the OpenDataLoader path fails, returns empty output, or `@opendataloader/pdf` cannot load,
364
+ * When the Cognipeer path fails, returns empty output, or `@cognipeer/to-markdown` cannot load,
354
365
  * call this to obtain {@link StructuredDocumentResult} (e.g. from another extractor) and serialize with
355
366
  * {@link convertStructuredToMarkdown}. Does not import other DocMind packages.
356
367
  */
@@ -362,15 +373,25 @@ type ConvertPdfToMarkdownOptions = OpenDataLoaderPdfConvertOptions & {
362
373
  * @default true
363
374
  */
364
375
  readonly cleanMarkdown?: boolean;
376
+ /** Forwarded to `@cognipeer/to-markdown` (useful for buffer inputs). */
377
+ readonly fileName?: string;
378
+ /** Forwarded to `@cognipeer/to-markdown`. */
379
+ readonly forceExtension?: string;
380
+ /** Forwarded to `@cognipeer/to-markdown`. */
381
+ readonly url?: string;
365
382
  };
366
383
  /** Input for {@link convertPdfToMarkdown}: filesystem path (Node) or PDF bytes. */
367
384
  type PdfToMarkdownInput = string | Buffer | Uint8Array | ArrayBuffer;
368
385
  /** Which pipeline produced {@link ConvertPdfToMarkdownResult.markdown}. */
369
- type PdfToMarkdownSource = "opendataloader" | "structured-fallback" | "unsupported-runtime";
386
+ type PdfToMarkdownSource = "cognipeer" | "structured-fallback" | "unsupported-runtime"
387
+ /** `@cognipeer/to-markdown` missing or failed to load (install dep; check bundler externals). */
388
+ | "cognipeer-unavailable"
389
+ /** Engine ran or was attempted but produced no usable Markdown (error, empty output, missing file, etc.). */
390
+ | "cognipeer-failed";
370
391
  type PdfToMarkdownFallbackReason = "unsupported-runtime" | "error" | "empty" | "module-not-found";
371
392
  interface ConvertPdfToMarkdownResult {
372
393
  readonly markdown: string;
373
- /** Human-readable issues (runtime, missing module, Java/PDF errors, empty output, fallback errors). */
394
+ /** Human-readable issues (runtime, missing module, conversion errors, empty output, fallback errors). */
374
395
  readonly warnings: readonly string[];
375
396
  readonly source: PdfToMarkdownSource;
376
397
  readonly fallbackReason?: PdfToMarkdownFallbackReason;
@@ -380,21 +401,25 @@ interface PdfMarkdownResult {
380
401
  readonly markdown: string;
381
402
  }
382
403
  /**
383
- * Primary API: PDF path or bytes → Markdown via `@opendataloader/pdf` on Node, with clear warnings and optional
404
+ * Primary API: PDF path or bytes → Markdown via `@cognipeer/to-markdown` on Node, with clear warnings and optional
384
405
  * structured fallback. In non-Node runtimes returns an empty `markdown` and {@link PdfToMarkdownSource} `unsupported-runtime`
385
- * without loading `@opendataloader/pdf`.
406
+ * without loading `@cognipeer/to-markdown`.
407
+ *
408
+ * If the specialized conversion fails or returns empty Markdown and {@link ConvertPdfToMarkdownOptions.resolveStructured} /
409
+ * `extractMarkdown`’s `structuredFallback` is set, output comes from {@link convertStructuredToMarkdown} instead
410
+ * (`extractMarkdown` reports strategy `pdf-structured-fallback`; this result uses `source` `structured-fallback`).
386
411
  */
387
412
  declare function convertPdfToMarkdown(input: PdfToMarkdownInput, options?: ConvertPdfToMarkdownOptions): Promise<ConvertPdfToMarkdownResult>;
388
413
  /**
389
- * **Node only.** PDF file path → Markdown via `@opendataloader/pdf` (`format: "markdown"`, `toStdout: true`).
414
+ * **Node only.** PDF file path → Markdown via `@cognipeer/to-markdown`.
390
415
  *
391
- * Throws if not Node, if conversion yields no Markdown (and no structured fallback), or on OpenDataLoader errors
416
+ * Throws if not Node, if conversion yields no Markdown (and no structured fallback), or on conversion errors
392
417
  * when no fallback is configured — same contract as before {@link convertPdfToMarkdown} existed.
393
418
  */
394
419
  declare function convertPdfPathToMarkdown(inputPath: string, options?: ConvertPdfToMarkdownOptions): Promise<PdfMarkdownResult>;
395
420
  /**
396
421
  * **Node only.** Same pipeline as {@link convertPdfPathToMarkdown}, but writes bytes to a temporary `.pdf`
397
- * under the system temp directory (OpenDataLoader expects a file path).
422
+ * then converts with `@cognipeer/to-markdown`.
398
423
  */
399
424
  declare function convertPdfBufferToMarkdown(input: Buffer | Uint8Array | ArrayBuffer, options?: ConvertPdfToMarkdownOptions): Promise<PdfMarkdownResult>;
400
425
 
@@ -433,17 +458,38 @@ type ExtractMarkdownOptions = ConvertStructuredToMarkdownOptions & {
433
458
  /** Overrides merged into {@link convertPdfToMarkdown} when the input is identified as `.pdf`. */
434
459
  readonly pdf?: ConvertPdfToMarkdownOptions;
435
460
  };
461
+ /** @see {@link detectBinaryFormat} */
462
+ type DetectedBinaryFormat = "docx" | "pdf" | "unknown";
436
463
  /** Which branch produced {@link ExtractMarkdownResult.markdown}. */
437
- type ExtractMarkdownStrategy = "structured" | "docx-mammoth" | "docx-structured-fallback" | "pdf-opendataloader" | "pdf-structured-fallback" | "pdf-unsupported-runtime" | "docx-requires-node" | "path-requires-node" | "binary-unidentified" | "binary-unidentified-structured-fallback";
464
+ type ExtractMarkdownStrategy = "structured" | "docx-mammoth" | "docx-structured-fallback" | "pdf-cognipeer-specialized"
465
+ /** `@cognipeer/to-markdown` not loadable (missing package, bundler, etc.). */
466
+ | "pdf-cognipeer-unavailable"
467
+ /** Engine failed or returned empty Markdown; no structured fallback or fallback also failed. */
468
+ | "pdf-cognipeer-failed" | "pdf-structured-fallback" | "pdf-unsupported-runtime" | "docx-requires-node" | "path-requires-node" | "binary-unidentified" | "binary-unidentified-structured-fallback";
469
+ /**
470
+ * Format detection and fallback bookkeeping for file/path inputs to {@link extractMarkdown}.
471
+ * Omitted when `input` is a {@link StructuredDocumentResult} (already structured).
472
+ */
473
+ interface ExtractMarkdownRoutingInfo {
474
+ readonly detectedFormat: DetectedBinaryFormat;
475
+ /** Specialized binary pipeline invoked first on Node (`none` if not applicable). */
476
+ readonly specializedPipeline: "docx" | "pdf" | "none";
477
+ /** Final Markdown came from {@link ExtractMarkdownOptions.structuredFallback} (or the same snapshot via `resolveStructured`). */
478
+ readonly usedStructuredFallback: boolean;
479
+ }
438
480
  interface ExtractMarkdownResult {
439
481
  readonly markdown: string;
440
482
  /** Merged pipeline warnings (conversion, runtime, and optional {@link StructuredDocumentResult.warnings}). */
441
483
  readonly warnings: readonly string[];
442
484
  readonly strategy: ExtractMarkdownStrategy;
485
+ /**
486
+ * Present for `{ data, … }` / `{ path, … }` flows after sniffing bytes (or when returning early without reading).
487
+ * Absent for direct structured input.
488
+ */
489
+ readonly routing?: ExtractMarkdownRoutingInfo;
443
490
  }
444
491
  /** Type guard: file-like `{ data: … }` input for {@link extractMarkdown}. */
445
492
  declare function isExtractMarkdownFileInput(value: unknown): value is ExtractMarkdownFileInput;
446
- type DetectedBinaryFormat = "docx" | "pdf" | "unknown";
447
493
  /** Detect PDF / OOXML zip (DOCX) from magic bytes and optional filename / MIME. */
448
494
  declare function detectBinaryFormat(data: Buffer | Uint8Array | ArrayBuffer, filename?: string, mimeType?: string): DetectedBinaryFormat;
449
495
  /**
@@ -452,8 +498,11 @@ declare function detectBinaryFormat(data: Buffer | Uint8Array | ArrayBuffer, fil
452
498
  *
453
499
  * - **Structured input** — always uses the structured serializer (image/OCR/text/PDF/DOCX blocks already normalized).
454
500
  * - **DOCX bytes / path** — {@link convertDocxToMarkdown} on Node; otherwise warns and uses {@link ExtractMarkdownOptions.structuredFallback} if provided.
455
- * - **PDF bytes / path** — {@link convertPdfToMarkdown} (`@opendataloader/pdf` on Node when Java is available); in non-Node runtimes returns empty Markdown with warnings unless {@link ExtractMarkdownOptions.structuredFallback} supplies content.
501
+ * - **PDF bytes / path** — {@link convertPdfToMarkdown} (`@cognipeer/to-markdown` on Node); in non-Node runtimes returns empty Markdown with warnings unless {@link ExtractMarkdownOptions.structuredFallback} supplies content.
502
+ *
503
+ * Any use of `structuredFallback` (or the same snapshot through `resolveStructured`) adds a tagged line in {@link ExtractMarkdownResult.warnings}
504
+ * and sets {@link ExtractMarkdownRoutingInfo.usedStructuredFallback} when {@link ExtractMarkdownResult.routing} is present.
456
505
  */
457
506
  declare function extractMarkdown(input: ExtractMarkdownInput, options?: ExtractMarkdownOptions): Promise<ExtractMarkdownResult>;
458
507
 
459
- export { type ConvertDocxToMarkdownOptions, type ConvertDocxToMarkdownResult, type ConvertPdfToMarkdownOptions, type ConvertPdfToMarkdownResult, type ConvertStructuredToLlmTextOptions, type ConvertStructuredToMarkdownOptions, type DetectedBinaryFormat, type DocxMarkdownMessage, type DocxMarkdownResult, type DocxToMarkdownInput, type DocxToMarkdownSource, type ExtractMarkdownFileInput, type ExtractMarkdownInput, type ExtractMarkdownOptions, type ExtractMarkdownPathInput, type ExtractMarkdownResult, type ExtractMarkdownStrategy, type MammothConvertToHtmlOptions, type MarkdownSection, type OpenDataLoaderPdfConvertOptions, type PdfMarkdownResult, type PdfToMarkdownFallbackReason, type PdfToMarkdownInput, type PdfToMarkdownSource, type RenderLlmTextOptions, type RenderMarkdownOptions, type RenderMarkdownSectionsOptions, type SplitStructuredIntoChunksOptions, type StructuredChunk, type StructuredToLlmTextOptions, type StructuredToMarkdownOptions, type TurndownServiceOptions, convertDocxBufferToMarkdown, convertDocxToMarkdown, convertPdfBufferToMarkdown, convertPdfPathToMarkdown, convertPdfToMarkdown, convertStructuredToLlmText, convertStructuredToMarkdown, detectBinaryFormat, extractLlmContent, extractMarkdown, extractStructuredChunks, isExtractMarkdownFileInput, renderLlmText, renderMarkdown, renderMarkdownSections, splitStructuredIntoChunks, structuredDocumentToLlmText, structuredDocumentToMarkdown };
508
+ export { type ConvertDocxToMarkdownOptions, type ConvertDocxToMarkdownResult, type ConvertPdfToMarkdownOptions, type ConvertPdfToMarkdownResult, type ConvertStructuredToLlmTextOptions, type ConvertStructuredToMarkdownOptions, type DetectedBinaryFormat, type DocxMarkdownMessage, type DocxMarkdownResult, type DocxToMarkdownInput, type DocxToMarkdownSource, type ExtractMarkdownFileInput, type ExtractMarkdownInput, type ExtractMarkdownOptions, type ExtractMarkdownPathInput, type ExtractMarkdownResult, type ExtractMarkdownRoutingInfo, type ExtractMarkdownStrategy, type MammothConvertToHtmlOptions, type MarkdownSection, type OpenDataLoaderPdfConvertOptions, type PdfMarkdownResult, type PdfToMarkdownFallbackReason, type PdfToMarkdownInput, type PdfToMarkdownSource, type RenderLlmTextOptions, type RenderMarkdownOptions, type RenderMarkdownSectionsOptions, type SplitStructuredIntoChunksOptions, type StructuredChunk, type StructuredToLlmTextOptions, type StructuredToMarkdownOptions, type TurndownServiceOptions, convertDocxBufferToMarkdown, convertDocxToMarkdown, convertPdfBufferToMarkdown, convertPdfPathToMarkdown, convertPdfToMarkdown, convertStructuredToLlmText, convertStructuredToMarkdown, detectBinaryFormat, extractLlmContent, extractMarkdown, extractStructuredChunks, isExtractMarkdownFileInput, renderLlmText, renderMarkdown, renderMarkdownSections, splitStructuredIntoChunks, structuredDocumentToLlmText, structuredDocumentToMarkdown };
package/dist/index.js CHANGED
@@ -993,21 +993,27 @@ async function convertDocxBufferToMarkdown(input, options) {
993
993
  }
994
994
 
995
995
  // src/pdf-markdown.ts
996
- var BROWSER_WARNING = "@dragon708/docmind-markdown: PDF \u2192 Markdown via @opendataloader/pdf requires Node.js. In the browser, use a server-side conversion or supply structured text/Markdown from your backend.";
996
+ var BROWSER_WARNING = "@dragon708/docmind-markdown: PDF \u2192 Markdown via @cognipeer/to-markdown requires Node.js. In the browser, use a server-side conversion or supply structured text/Markdown from your backend.";
997
+ var COGNIPEER_WARN_TAG = "[docmind-markdown:pdf] pdf-cognipeer-specialized:";
997
998
  function normalizePdfMarkdown(markdown, clean) {
998
999
  const t = markdown.trim();
999
1000
  if (!clean) return t;
1000
1001
  return t.replace(/\n{3,}/g, "\n\n");
1001
1002
  }
1002
- function engineOptions(options) {
1003
+ var FALLBACK_WARN_PREFIX = "[docmind-markdown:pdf] pdf-structured-fallback: serializing StructuredDocumentResult to Markdown because";
1004
+ function structuredFallbackWarnings(reason, detail) {
1005
+ const tail = reason === "module-not-found" ? "@cognipeer/to-markdown could not be loaded." : reason === "error" ? "the specialized PDF engine raised an error or rejected the input." : reason === "empty" ? "the specialized engine returned empty Markdown." : "the specialized path is unavailable in this runtime.";
1006
+ const extra = detail ? ` (${detail})` : "";
1007
+ return [`${FALLBACK_WARN_PREFIX} ${tail}${extra}`];
1008
+ }
1009
+ function cognipeerConverterOptions(options) {
1003
1010
  if (!options) return {};
1004
- const {
1005
- resolveStructured: _r,
1006
- structuredMarkdown: _s,
1007
- cleanMarkdown: _c,
1008
- ...rest
1009
- } = options;
1010
- return rest;
1011
+ const { fileName, forceExtension, url } = options;
1012
+ const o = {};
1013
+ if (fileName !== void 0) o.fileName = fileName;
1014
+ if (forceExtension !== void 0) o.forceExtension = forceExtension;
1015
+ if (url !== void 0) o.url = url;
1016
+ return o;
1011
1017
  }
1012
1018
  async function toNodeBuffer2(input) {
1013
1019
  const { Buffer: Buffer2 } = await importEsm("node:buffer");
@@ -1015,11 +1021,17 @@ async function toNodeBuffer2(input) {
1015
1021
  if (input instanceof ArrayBuffer) return Buffer2.from(input);
1016
1022
  return Buffer2.from(input);
1017
1023
  }
1024
+ async function loadCognipeerConvertToMarkdown() {
1025
+ const { createRequire } = await importEsm("node:module");
1026
+ const require2 = createRequire(import.meta.url);
1027
+ const mod = require2("@cognipeer/to-markdown");
1028
+ return mod.convertToMarkdown;
1029
+ }
1018
1030
  async function convertPdfToMarkdown(input, options) {
1019
1031
  const clean = options?.cleanMarkdown !== false;
1020
1032
  const resolveStructured = options?.resolveStructured;
1021
1033
  const structuredMdOpts = options?.structuredMarkdown;
1022
- const eng = engineOptions(options);
1034
+ const cognipeerOpts = cognipeerConverterOptions(options);
1023
1035
  if (!isNodeRuntime()) {
1024
1036
  return {
1025
1037
  markdown: "",
@@ -1046,15 +1058,13 @@ async function convertPdfToMarkdown(input, options) {
1046
1058
  await writeFile(inputPath, buffer);
1047
1059
  cleanup = async () => rm(dir, { recursive: true, force: true });
1048
1060
  }
1049
- let convert;
1061
+ let convertToMarkdown;
1050
1062
  try {
1051
- ({ convert } = await importEsm(
1052
- "@opendataloader/pdf"
1053
- ));
1063
+ convertToMarkdown = await loadCognipeerConvertToMarkdown();
1054
1064
  } catch (e) {
1055
- const hint = e instanceof Error && /Cannot find module|MODULE_NOT_FOUND/i.test(e.message) ? " Install `@opendataloader/pdf` in your project." : "";
1065
+ const hint = e instanceof Error && /Cannot find module|MODULE_NOT_FOUND/i.test(e.message) ? " Install `@cognipeer/to-markdown` in your project." : "";
1056
1066
  warnings.push(
1057
- `@opendataloader/pdf could not be loaded (${e instanceof Error ? e.message : String(e)}).${hint}`
1067
+ `${COGNIPEER_WARN_TAG} package could not be loaded (${e instanceof Error ? e.message : String(e)}).${hint}`
1058
1068
  );
1059
1069
  if (resolveStructured) {
1060
1070
  try {
@@ -1065,7 +1075,10 @@ async function convertPdfToMarkdown(input, options) {
1065
1075
  );
1066
1076
  return {
1067
1077
  markdown: md,
1068
- warnings,
1078
+ warnings: [
1079
+ ...structuredFallbackWarnings("module-not-found"),
1080
+ ...warnings
1081
+ ],
1069
1082
  source: "structured-fallback",
1070
1083
  fallbackReason: "module-not-found"
1071
1084
  };
@@ -1078,20 +1091,16 @@ async function convertPdfToMarkdown(input, options) {
1078
1091
  return {
1079
1092
  markdown: "",
1080
1093
  warnings,
1081
- source: "opendataloader",
1094
+ source: "cognipeer-unavailable",
1082
1095
  fallbackReason: "module-not-found"
1083
1096
  };
1084
1097
  }
1085
1098
  let rawMarkdown;
1086
1099
  try {
1087
- rawMarkdown = await convert(inputPath, {
1088
- ...eng,
1089
- format: "markdown",
1090
- toStdout: true,
1091
- quiet: eng.quiet !== false
1092
- }).then((s) => String(s));
1100
+ rawMarkdown = await convertToMarkdown(inputPath, cognipeerOpts);
1093
1101
  } catch (e) {
1094
- warnings.push(`PDF conversion failed: ${e instanceof Error ? e.message : String(e)}`);
1102
+ const msg = e instanceof Error ? e.message : String(e);
1103
+ warnings.push(`${COGNIPEER_WARN_TAG} ${msg}`);
1095
1104
  if (resolveStructured) {
1096
1105
  try {
1097
1106
  const structured = await resolveStructured();
@@ -1101,7 +1110,10 @@ async function convertPdfToMarkdown(input, options) {
1101
1110
  );
1102
1111
  return {
1103
1112
  markdown: md,
1104
- warnings,
1113
+ warnings: [
1114
+ ...structuredFallbackWarnings("error", msg.slice(0, 500)),
1115
+ ...warnings
1116
+ ],
1105
1117
  source: "structured-fallback",
1106
1118
  fallbackReason: "error"
1107
1119
  };
@@ -1114,13 +1126,18 @@ async function convertPdfToMarkdown(input, options) {
1114
1126
  return {
1115
1127
  markdown: "",
1116
1128
  warnings,
1117
- source: "opendataloader",
1129
+ source: "cognipeer-failed",
1118
1130
  fallbackReason: "error"
1119
1131
  };
1120
1132
  }
1121
- let markdown = normalizePdfMarkdown(rawMarkdown, clean);
1133
+ let markdown = normalizePdfMarkdown(
1134
+ typeof rawMarkdown === "string" ? rawMarkdown : String(rawMarkdown ?? ""),
1135
+ clean
1136
+ );
1122
1137
  if (markdown.length === 0) {
1123
- warnings.push("OpenDataLoader returned empty Markdown for this PDF.");
1138
+ warnings.push(
1139
+ `${COGNIPEER_WARN_TAG} returned empty Markdown for this PDF (whitespace-only after normalize).`
1140
+ );
1124
1141
  if (resolveStructured) {
1125
1142
  try {
1126
1143
  const structured = await resolveStructured();
@@ -1130,7 +1147,7 @@ async function convertPdfToMarkdown(input, options) {
1130
1147
  );
1131
1148
  return {
1132
1149
  markdown,
1133
- warnings,
1150
+ warnings: [...structuredFallbackWarnings("empty"), ...warnings],
1134
1151
  source: "structured-fallback",
1135
1152
  fallbackReason: "empty"
1136
1153
  };
@@ -1143,11 +1160,11 @@ async function convertPdfToMarkdown(input, options) {
1143
1160
  return {
1144
1161
  markdown: "",
1145
1162
  warnings,
1146
- source: "opendataloader",
1163
+ source: "cognipeer-failed",
1147
1164
  fallbackReason: "empty"
1148
1165
  };
1149
1166
  }
1150
- return { markdown, warnings, source: "opendataloader" };
1167
+ return { markdown, warnings, source: "cognipeer" };
1151
1168
  } finally {
1152
1169
  if (cleanup) {
1153
1170
  await cleanup().catch(() => {
@@ -1166,13 +1183,13 @@ function throwIfLegacyFailure(r) {
1166
1183
  }
1167
1184
  }
1168
1185
  async function convertPdfPathToMarkdown(inputPath, options) {
1169
- assertNodeRuntime("PDF \u2192 Markdown (@opendataloader/pdf)");
1186
+ assertNodeRuntime("PDF \u2192 Markdown (@cognipeer/to-markdown)");
1170
1187
  const r = await convertPdfToMarkdown(inputPath, options);
1171
1188
  throwIfLegacyFailure(r);
1172
1189
  return { markdown: r.markdown };
1173
1190
  }
1174
1191
  async function convertPdfBufferToMarkdown(input, options) {
1175
- assertNodeRuntime("PDF \u2192 Markdown (@opendataloader/pdf)");
1192
+ assertNodeRuntime("PDF \u2192 Markdown (@cognipeer/to-markdown)");
1176
1193
  const r = await convertPdfToMarkdown(input, options);
1177
1194
  throwIfLegacyFailure(r);
1178
1195
  return { markdown: r.markdown };
@@ -1241,9 +1258,19 @@ function docxStrategyFromSource(source) {
1241
1258
  return source === "structured-fallback" ? "docx-structured-fallback" : "docx-mammoth";
1242
1259
  }
1243
1260
  function pdfStrategyFromResult(r) {
1244
- if (r.source === "structured-fallback") return "pdf-structured-fallback";
1245
- if (r.source === "unsupported-runtime") return "pdf-unsupported-runtime";
1246
- return "pdf-opendataloader";
1261
+ switch (r.source) {
1262
+ case "structured-fallback":
1263
+ return "pdf-structured-fallback";
1264
+ case "unsupported-runtime":
1265
+ return "pdf-unsupported-runtime";
1266
+ case "cognipeer-unavailable":
1267
+ return "pdf-cognipeer-unavailable";
1268
+ case "cognipeer-failed":
1269
+ return "pdf-cognipeer-failed";
1270
+ case "cognipeer":
1271
+ default:
1272
+ return "pdf-cognipeer-specialized";
1273
+ }
1247
1274
  }
1248
1275
  function mergeWarnings(base, ...more) {
1249
1276
  const out = [...base];
@@ -1252,6 +1279,22 @@ function mergeWarnings(base, ...more) {
1252
1279
  }
1253
1280
  return out;
1254
1281
  }
1282
+ var EXTRACT_WARN = "[docmind-markdown:extractMarkdown]";
1283
+ function traceUsedStructuredFallback(context) {
1284
+ return `${EXTRACT_WARN} ${context}: final Markdown from structuredFallback (specialized route unavailable, failed, or insufficient).`;
1285
+ }
1286
+ function tracePdfStructuredAfterUnsupportedRuntime() {
1287
+ return `${EXTRACT_WARN} pdf-unsupported-runtime: final Markdown from structuredFallback \u2014 @cognipeer/to-markdown cannot run in this environment.`;
1288
+ }
1289
+ function traceDocxStructuredAfterMammoth() {
1290
+ return `${EXTRACT_WARN} docx-structured-fallback: final Markdown from structured envelope after Mammoth/Turndown did not yield the result.`;
1291
+ }
1292
+ function tracePdfStructuredAfterCognipeer() {
1293
+ return `${EXTRACT_WARN} pdf-structured-fallback: final Markdown from structured envelope after Cognipeer PDF path did not yield the result.`;
1294
+ }
1295
+ function tracePdfSpecializedDeadEnd() {
1296
+ return `${EXTRACT_WARN} pdf: Cognipeer specialized route did not produce Markdown and structuredFallback was not provided.`;
1297
+ }
1255
1298
  async function extractMarkdown(input, options) {
1256
1299
  const smOpts = pickStructuredMarkdownOptions(options);
1257
1300
  const fb = options?.structuredFallback;
@@ -1278,11 +1321,25 @@ async function extractMarkdown(input, options) {
1278
1321
  if (fb) {
1279
1322
  return {
1280
1323
  markdown: convertStructuredToMarkdown(fb, smOpts),
1281
- warnings: mergeWarnings(warnings, fb.warnings),
1282
- strategy: "path-requires-node"
1324
+ warnings: mergeWarnings(warnings, fb.warnings, [traceUsedStructuredFallback("path-requires-node")]),
1325
+ strategy: "path-requires-node",
1326
+ routing: {
1327
+ detectedFormat: "unknown",
1328
+ specializedPipeline: "none",
1329
+ usedStructuredFallback: true
1330
+ }
1283
1331
  };
1284
1332
  }
1285
- return { markdown: "", warnings, strategy: "path-requires-node" };
1333
+ return {
1334
+ markdown: "",
1335
+ warnings,
1336
+ strategy: "path-requires-node",
1337
+ routing: {
1338
+ detectedFormat: "unknown",
1339
+ specializedPipeline: "none",
1340
+ usedStructuredFallback: false
1341
+ }
1342
+ };
1286
1343
  }
1287
1344
  const { readFile } = await importEsm(
1288
1345
  "node:fs/promises"
@@ -1302,7 +1359,7 @@ async function extractMarkdown(input, options) {
1302
1359
  if (fb) {
1303
1360
  return {
1304
1361
  markdown: convertStructuredToMarkdown(fb, smOpts),
1305
- warnings: mergeWarnings(warnings, fb.warnings),
1362
+ warnings: mergeWarnings(warnings, fb.warnings, [traceUsedStructuredFallback("invalid-input-shape")]),
1306
1363
  strategy: "binary-unidentified-structured-fallback"
1307
1364
  };
1308
1365
  }
@@ -1317,37 +1374,79 @@ async function extractMarkdown(input, options) {
1317
1374
  if (fb) {
1318
1375
  return {
1319
1376
  markdown: convertStructuredToMarkdown(fb, smOpts),
1320
- warnings: mergeWarnings(warnings, fb.warnings),
1321
- strategy: "docx-requires-node"
1377
+ warnings: mergeWarnings(warnings, fb.warnings, [traceUsedStructuredFallback("docx-requires-node")]),
1378
+ strategy: "docx-requires-node",
1379
+ routing: {
1380
+ detectedFormat: "docx",
1381
+ specializedPipeline: "none",
1382
+ usedStructuredFallback: true
1383
+ }
1322
1384
  };
1323
1385
  }
1324
- return { markdown: "", warnings, strategy: "docx-requires-node" };
1386
+ return {
1387
+ markdown: "",
1388
+ warnings,
1389
+ strategy: "docx-requires-node",
1390
+ routing: {
1391
+ detectedFormat: "docx",
1392
+ specializedPipeline: "none",
1393
+ usedStructuredFallback: false
1394
+ }
1395
+ };
1325
1396
  }
1326
1397
  const r = await convertDocxToMarkdown(data, buildDocxOptions(options));
1398
+ const strategy = docxStrategyFromSource(r.source);
1327
1399
  const w = mergeWarnings(
1328
1400
  warnings,
1329
- r.messages.map((m) => m.message)
1401
+ r.messages.map((m) => m.message),
1402
+ strategy === "docx-structured-fallback" ? [traceDocxStructuredAfterMammoth()] : []
1330
1403
  );
1331
1404
  return {
1332
1405
  markdown: r.markdown,
1333
1406
  warnings: w,
1334
- strategy: docxStrategyFromSource(r.source)
1407
+ strategy,
1408
+ routing: {
1409
+ detectedFormat: "docx",
1410
+ specializedPipeline: "docx",
1411
+ usedStructuredFallback: strategy === "docx-structured-fallback"
1412
+ }
1335
1413
  };
1336
1414
  }
1337
1415
  if (fmt === "pdf") {
1338
1416
  const r = await convertPdfToMarkdown(data, buildPdfOptions(options));
1339
1417
  const strategy = pdfStrategyFromResult(r);
1340
- const w = mergeWarnings(warnings, r.warnings);
1418
+ let w = mergeWarnings(warnings, r.warnings);
1341
1419
  if (strategy === "pdf-unsupported-runtime" && r.markdown === "" && fb) {
1420
+ w = mergeWarnings(w, fb.warnings, [
1421
+ tracePdfStructuredAfterUnsupportedRuntime()
1422
+ ]);
1342
1423
  return {
1343
1424
  markdown: convertStructuredToMarkdown(fb, smOpts),
1344
- warnings: mergeWarnings(w, fb.warnings, [
1345
- "extractMarkdown: PDF route unavailable in this runtime; used structuredFallback."
1346
- ]),
1347
- strategy: "pdf-structured-fallback"
1425
+ warnings: w,
1426
+ strategy: "pdf-structured-fallback",
1427
+ routing: {
1428
+ detectedFormat: "pdf",
1429
+ specializedPipeline: "pdf",
1430
+ usedStructuredFallback: true
1431
+ }
1348
1432
  };
1349
1433
  }
1350
- return { markdown: r.markdown, warnings: w, strategy };
1434
+ if (strategy === "pdf-structured-fallback") {
1435
+ w = mergeWarnings(w, [tracePdfStructuredAfterCognipeer()]);
1436
+ }
1437
+ if ((strategy === "pdf-cognipeer-failed" || strategy === "pdf-cognipeer-unavailable") && r.markdown.trim() === "" && !fb) {
1438
+ w = mergeWarnings(w, [tracePdfSpecializedDeadEnd()]);
1439
+ }
1440
+ return {
1441
+ markdown: r.markdown,
1442
+ warnings: w,
1443
+ strategy,
1444
+ routing: {
1445
+ detectedFormat: "pdf",
1446
+ specializedPipeline: "pdf",
1447
+ usedStructuredFallback: strategy === "pdf-structured-fallback"
1448
+ }
1449
+ };
1351
1450
  }
1352
1451
  warnings.push(
1353
1452
  "@dragon708/docmind-markdown: Unidentified binary format (expected PDF magic or ZIP/DOCX). Using structured fallback if provided."
@@ -1355,11 +1454,25 @@ async function extractMarkdown(input, options) {
1355
1454
  if (fb) {
1356
1455
  return {
1357
1456
  markdown: convertStructuredToMarkdown(fb, smOpts),
1358
- warnings: mergeWarnings(warnings, fb.warnings),
1359
- strategy: "binary-unidentified-structured-fallback"
1457
+ warnings: mergeWarnings(warnings, fb.warnings, [traceUsedStructuredFallback("binary-unidentified")]),
1458
+ strategy: "binary-unidentified-structured-fallback",
1459
+ routing: {
1460
+ detectedFormat: "unknown",
1461
+ specializedPipeline: "none",
1462
+ usedStructuredFallback: true
1463
+ }
1360
1464
  };
1361
1465
  }
1362
- return { markdown: "", warnings, strategy: "binary-unidentified" };
1466
+ return {
1467
+ markdown: "",
1468
+ warnings,
1469
+ strategy: "binary-unidentified",
1470
+ routing: {
1471
+ detectedFormat: "unknown",
1472
+ specializedPipeline: "none",
1473
+ usedStructuredFallback: false
1474
+ }
1475
+ };
1363
1476
  }
1364
1477
 
1365
1478
  export { convertDocxBufferToMarkdown, convertDocxToMarkdown, convertPdfBufferToMarkdown, convertPdfPathToMarkdown, convertPdfToMarkdown, convertStructuredToLlmText, convertStructuredToMarkdown, detectBinaryFormat, extractLlmContent, extractMarkdown, extractStructuredChunks, isExtractMarkdownFileInput, renderLlmText, renderMarkdown, renderMarkdownSections, splitStructuredIntoChunks, structuredDocumentToLlmText, structuredDocumentToMarkdown };
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@dragon708/docmind-markdown",
3
- "version": "1.1.3",
3
+ "version": "1.2.1",
4
4
  "description": "StructuredDocumentResult → Markdown and LLM-oriented plain text for DocMind.",
5
5
  "type": "module",
6
6
  "sideEffects": false,
@@ -35,8 +35,8 @@
35
35
  ],
36
36
  "license": "MIT",
37
37
  "dependencies": {
38
+ "@cognipeer/to-markdown": "^2.0.1",
38
39
  "@dragon708/docmind-shared": "^1.2.0",
39
- "@opendataloader/pdf": "^2.2.1",
40
40
  "mammoth": "^1.6.0",
41
41
  "turndown": "^7.0.0",
42
42
  "turndown-plugin-gfm": "^1.0.2"