npm - @dragon708/docmind-markdown - Version diff - 1.1.3 → 1.2.1 - Mend

@dragon708/docmind-markdown 1.1.3 → 1.2.1

This diff compares the contents of two publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.

Files changed (3)

package/dist/index.d.ts CHANGED Viewed

@@ -39,8 +39,8 @@ type StructuredToMarkdownOptions = ConvertStructuredToMarkdownOptions;
 /**
  * Converts a {@link StructuredDocumentResult} to readable, semantic Markdown (GFM-style tables).
  *
- * **Universal fallback** for DocMind: use when a format-specific pipeline (DOCX Mammoth, PDF OpenDataLoader, …)
- * does not apply or fails, and for OCR / image / plain-text flows that already populate this shape.
+ * **Universal fallback** for DocMind: use when a format-specific pipeline (DOCX Mammoth→Turndown, specialized PDF→Markdown
+ * via `@cognipeer/to-markdown` on Node, …) does not apply or fails, and for OCR / image / plain-text flows that already populate this shape.
  *
  * Uses `blocks` in order; resolves `table` / `image-ref` via `tables` and `images`. When blocks are
  * empty or yield no output, falls back to the rollup `text`. Optional sections use `pages`, `metadata`,
@@ -213,6 +213,13 @@ type RenderMarkdownSectionsOptions = SplitStructuredIntoChunksOptions;
  */
 declare function renderMarkdownSections(result: StructuredDocumentResult, options?: RenderMarkdownSectionsOptions): MarkdownSection[];
+/**
+ * DOCX → Markdown: **Mammoth** (`convertToHtml` semantic HTML) → **Turndown** (ATX headings, lists, fenced code;
+ * **GFM tables** when {@link ConvertDocxToMarkdownOptions.includeTables} is true via `turndown-plugin-gfm`).
+ * Page breaks map through Mammoth `styleMap` → `<hr class="page-break">` → Turndown horizontal rules.
+ *
+ * Independent of the PDF → Markdown pipeline in this package (`pdf-markdown.ts`, `@cognipeer/to-markdown`).
+ */
 /**
  * Binary `.docx` payload accepted by {@link convertDocxToMarkdown}.
  */
@@ -316,8 +323,10 @@ declare function convertDocxToMarkdown(input: DocxToMarkdownInput, options?: Con
 declare function convertDocxBufferToMarkdown(input: DocxToMarkdownInput, options?: ConvertDocxToMarkdownOptions): Promise<DocxMarkdownResult>;
 /**
- * Options forwarded to `@opendataloader/pdf` `convert()`, except `format` and `toStdout` (set internally).
- * Shaped to match `ConvertOptions` from `@opendataloader/pdf` v2.x without a static type import.
+ * Legacy option bag: **ignored** by {@link convertPdfToMarkdown}. Retained so existing TypeScript callers and object
+ * literals that still spread old shapes remain assignable to {@link ConvertPdfToMarkdownOptions}.
+ *
+ * @deprecated PDF → Markdown uses `@cognipeer/to-markdown` only; these keys are not read.
  */
 interface OpenDataLoaderPdfConvertOptions {
     outputDir?: string;
@@ -346,11 +355,13 @@ interface OpenDataLoaderPdfConvertOptions {
     hybridFallback?: boolean;
 }
 /**
- * Options for {@link convertPdfToMarkdown}. OpenDataLoader fields are passed through; structured fields are local.
+ * Options for {@link convertPdfToMarkdown}. Properties from {@link OpenDataLoaderPdfConvertOptions} are accepted for
+ * compatibility but **ignored**. `fileName`, `forceExtension`, and `url` are forwarded to `@cognipeer/to-markdown`
+ * where applicable (see that package’s `ConverterOptions`).
  */
 type ConvertPdfToMarkdownOptions = OpenDataLoaderPdfConvertOptions & {
     /**
-     * When the OpenDataLoader path fails, returns empty output, or `@opendataloader/pdf` cannot load,
+     * When the Cognipeer path fails, returns empty output, or `@cognipeer/to-markdown` cannot load,
      * call this to obtain {@link StructuredDocumentResult} (e.g. from another extractor) and serialize with
      * {@link convertStructuredToMarkdown}. Does not import other DocMind packages.
      */
@@ -362,15 +373,25 @@ type ConvertPdfToMarkdownOptions = OpenDataLoaderPdfConvertOptions & {
      * @default true
      */
     readonly cleanMarkdown?: boolean;
+    /** Forwarded to `@cognipeer/to-markdown` (useful for buffer inputs). */
+    readonly fileName?: string;
+    /** Forwarded to `@cognipeer/to-markdown`. */
+    readonly forceExtension?: string;
+    /** Forwarded to `@cognipeer/to-markdown`. */
+    readonly url?: string;
 };
 /** Input for {@link convertPdfToMarkdown}: filesystem path (Node) or PDF bytes. */
 type PdfToMarkdownInput = string | Buffer | Uint8Array | ArrayBuffer;
 /** Which pipeline produced {@link ConvertPdfToMarkdownResult.markdown}. */
-type PdfToMarkdownSource = "opendataloader" | "structured-fallback" | "unsupported-runtime";
+type PdfToMarkdownSource = "cognipeer" | "structured-fallback" | "unsupported-runtime"
+/** `@cognipeer/to-markdown` missing or failed to load (install dep; check bundler externals). */
+ | "cognipeer-unavailable"
+/** Engine ran or was attempted but produced no usable Markdown (error, empty output, missing file, etc.). */
+ | "cognipeer-failed";
 type PdfToMarkdownFallbackReason = "unsupported-runtime" | "error" | "empty" | "module-not-found";
 interface ConvertPdfToMarkdownResult {
     readonly markdown: string;
-    /** Human-readable issues (runtime, missing module, Java/PDF errors, empty output, fallback errors). */
+    /** Human-readable issues (runtime, missing module, conversion errors, empty output, fallback errors). */
     readonly warnings: readonly string[];
     readonly source: PdfToMarkdownSource;
     readonly fallbackReason?: PdfToMarkdownFallbackReason;
@@ -380,21 +401,25 @@ interface PdfMarkdownResult {
     readonly markdown: string;
 }
 /**
- * Primary API: PDF path or bytes → Markdown via `@opendataloader/pdf` on Node, with clear warnings and optional
+ * Primary API: PDF path or bytes → Markdown via `@cognipeer/to-markdown` on Node, with clear warnings and optional
  * structured fallback. In non-Node runtimes returns an empty `markdown` and {@link PdfToMarkdownSource} `unsupported-runtime`
- * without loading `@opendataloader/pdf`.
+ * without loading `@cognipeer/to-markdown`.
+ *
+ * If the specialized conversion fails or returns empty Markdown and {@link ConvertPdfToMarkdownOptions.resolveStructured} /
+ * `extractMarkdown`’s `structuredFallback` is set, output comes from {@link convertStructuredToMarkdown} instead
+ * (`extractMarkdown` reports strategy `pdf-structured-fallback`; this result uses `source` `structured-fallback`).
  */
 declare function convertPdfToMarkdown(input: PdfToMarkdownInput, options?: ConvertPdfToMarkdownOptions): Promise<ConvertPdfToMarkdownResult>;
 /**
- * **Node only.** PDF file path → Markdown via `@opendataloader/pdf` (`format: "markdown"`, `toStdout: true`).
+ * **Node only.** PDF file path → Markdown via `@cognipeer/to-markdown`.
  *
- * Throws if not Node, if conversion yields no Markdown (and no structured fallback), or on OpenDataLoader errors
+ * Throws if not Node, if conversion yields no Markdown (and no structured fallback), or on conversion errors
  * when no fallback is configured — same contract as before {@link convertPdfToMarkdown} existed.
  */
 declare function convertPdfPathToMarkdown(inputPath: string, options?: ConvertPdfToMarkdownOptions): Promise<PdfMarkdownResult>;
 /**
  * **Node only.** Same pipeline as {@link convertPdfPathToMarkdown}, but writes bytes to a temporary `.pdf`
- * under the system temp directory (OpenDataLoader expects a file path).
+ * then converts with `@cognipeer/to-markdown`.
  */
 declare function convertPdfBufferToMarkdown(input: Buffer | Uint8Array | ArrayBuffer, options?: ConvertPdfToMarkdownOptions): Promise<PdfMarkdownResult>;
@@ -433,17 +458,38 @@ type ExtractMarkdownOptions = ConvertStructuredToMarkdownOptions & {
     /** Overrides merged into {@link convertPdfToMarkdown} when the input is identified as `.pdf`. */
     readonly pdf?: ConvertPdfToMarkdownOptions;
 };
+/** @see {@link detectBinaryFormat} */
+type DetectedBinaryFormat = "docx" | "pdf" | "unknown";
 /** Which branch produced {@link ExtractMarkdownResult.markdown}. */
-type ExtractMarkdownStrategy = "structured" | "docx-mammoth" | "docx-structured-fallback" | "pdf-opendataloader" | "pdf-structured-fallback" | "pdf-unsupported-runtime" | "docx-requires-node" | "path-requires-node" | "binary-unidentified" | "binary-unidentified-structured-fallback";
+type ExtractMarkdownStrategy = "structured" | "docx-mammoth" | "docx-structured-fallback" | "pdf-cognipeer-specialized"
+/** `@cognipeer/to-markdown` not loadable (missing package, bundler, etc.). */
+ | "pdf-cognipeer-unavailable"
+/** Engine failed or returned empty Markdown; no structured fallback or fallback also failed. */
+ | "pdf-cognipeer-failed" | "pdf-structured-fallback" | "pdf-unsupported-runtime" | "docx-requires-node" | "path-requires-node" | "binary-unidentified" | "binary-unidentified-structured-fallback";
+/**
+ * Format detection and fallback bookkeeping for file/path inputs to {@link extractMarkdown}.
+ * Omitted when `input` is a {@link StructuredDocumentResult} (already structured).
+ */
+interface ExtractMarkdownRoutingInfo {
+    readonly detectedFormat: DetectedBinaryFormat;
+    /** Specialized binary pipeline invoked first on Node (`none` if not applicable). */
+    readonly specializedPipeline: "docx" | "pdf" | "none";
+    /** Final Markdown came from {@link ExtractMarkdownOptions.structuredFallback} (or the same snapshot via `resolveStructured`). */
+    readonly usedStructuredFallback: boolean;
+}
 interface ExtractMarkdownResult {
     readonly markdown: string;
     /** Merged pipeline warnings (conversion, runtime, and optional {@link StructuredDocumentResult.warnings}). */
     readonly warnings: readonly string[];
     readonly strategy: ExtractMarkdownStrategy;
+    /**
+     * Present for `{ data, … }` / `{ path, … }` flows after sniffing bytes (or when returning early without reading).
+     * Absent for direct structured input.
+     */
+    readonly routing?: ExtractMarkdownRoutingInfo;
 }
 /** Type guard: file-like `{ data: … }` input for {@link extractMarkdown}. */
 declare function isExtractMarkdownFileInput(value: unknown): value is ExtractMarkdownFileInput;
-type DetectedBinaryFormat = "docx" | "pdf" | "unknown";
 /** Detect PDF / OOXML zip (DOCX) from magic bytes and optional filename / MIME. */
 declare function detectBinaryFormat(data: Buffer | Uint8Array | ArrayBuffer, filename?: string, mimeType?: string): DetectedBinaryFormat;
 /**
@@ -452,8 +498,11 @@ declare function detectBinaryFormat(data: Buffer | Uint8Array | ArrayBuffer, fil
  *
  * - **Structured input** — always uses the structured serializer (image/OCR/text/PDF/DOCX blocks already normalized).
  * - **DOCX bytes / path** — {@link convertDocxToMarkdown} on Node; otherwise warns and uses {@link ExtractMarkdownOptions.structuredFallback} if provided.
- * - **PDF bytes / path** — {@link convertPdfToMarkdown} (`@opendataloader/pdf` on Node when Java is available); in non-Node runtimes returns empty Markdown with warnings unless {@link ExtractMarkdownOptions.structuredFallback} supplies content.
+ * - **PDF bytes / path** — {@link convertPdfToMarkdown} (`@cognipeer/to-markdown` on Node); in non-Node runtimes returns empty Markdown with warnings unless {@link ExtractMarkdownOptions.structuredFallback} supplies content.
+ *
+ * Any use of `structuredFallback` (or the same snapshot through `resolveStructured`) adds a tagged line in {@link ExtractMarkdownResult.warnings}
+ * and sets {@link ExtractMarkdownRoutingInfo.usedStructuredFallback} when {@link ExtractMarkdownResult.routing} is present.
  */
 declare function extractMarkdown(input: ExtractMarkdownInput, options?: ExtractMarkdownOptions): Promise<ExtractMarkdownResult>;
-export { type ConvertDocxToMarkdownOptions, type ConvertDocxToMarkdownResult, type ConvertPdfToMarkdownOptions, type ConvertPdfToMarkdownResult, type ConvertStructuredToLlmTextOptions, type ConvertStructuredToMarkdownOptions, type DetectedBinaryFormat, type DocxMarkdownMessage, type DocxMarkdownResult, type DocxToMarkdownInput, type DocxToMarkdownSource, type ExtractMarkdownFileInput, type ExtractMarkdownInput, type ExtractMarkdownOptions, type ExtractMarkdownPathInput, type ExtractMarkdownResult, type ExtractMarkdownStrategy, type MammothConvertToHtmlOptions, type MarkdownSection, type OpenDataLoaderPdfConvertOptions, type PdfMarkdownResult, type PdfToMarkdownFallbackReason, type PdfToMarkdownInput, type PdfToMarkdownSource, type RenderLlmTextOptions, type RenderMarkdownOptions, type RenderMarkdownSectionsOptions, type SplitStructuredIntoChunksOptions, type StructuredChunk, type StructuredToLlmTextOptions, type StructuredToMarkdownOptions, type TurndownServiceOptions, convertDocxBufferToMarkdown, convertDocxToMarkdown, convertPdfBufferToMarkdown, convertPdfPathToMarkdown, convertPdfToMarkdown, convertStructuredToLlmText, convertStructuredToMarkdown, detectBinaryFormat, extractLlmContent, extractMarkdown, extractStructuredChunks, isExtractMarkdownFileInput, renderLlmText, renderMarkdown, renderMarkdownSections, splitStructuredIntoChunks, structuredDocumentToLlmText, structuredDocumentToMarkdown };
+export { type ConvertDocxToMarkdownOptions, type ConvertDocxToMarkdownResult, type ConvertPdfToMarkdownOptions, type ConvertPdfToMarkdownResult, type ConvertStructuredToLlmTextOptions, type ConvertStructuredToMarkdownOptions, type DetectedBinaryFormat, type DocxMarkdownMessage, type DocxMarkdownResult, type DocxToMarkdownInput, type DocxToMarkdownSource, type ExtractMarkdownFileInput, type ExtractMarkdownInput, type ExtractMarkdownOptions, type ExtractMarkdownPathInput, type ExtractMarkdownResult, type ExtractMarkdownRoutingInfo, type ExtractMarkdownStrategy, type MammothConvertToHtmlOptions, type MarkdownSection, type OpenDataLoaderPdfConvertOptions, type PdfMarkdownResult, type PdfToMarkdownFallbackReason, type PdfToMarkdownInput, type PdfToMarkdownSource, type RenderLlmTextOptions, type RenderMarkdownOptions, type RenderMarkdownSectionsOptions, type SplitStructuredIntoChunksOptions, type StructuredChunk, type StructuredToLlmTextOptions, type StructuredToMarkdownOptions, type TurndownServiceOptions, convertDocxBufferToMarkdown, convertDocxToMarkdown, convertPdfBufferToMarkdown, convertPdfPathToMarkdown, convertPdfToMarkdown, convertStructuredToLlmText, convertStructuredToMarkdown, detectBinaryFormat, extractLlmContent, extractMarkdown, extractStructuredChunks, isExtractMarkdownFileInput, renderLlmText, renderMarkdown, renderMarkdownSections, splitStructuredIntoChunks, structuredDocumentToLlmText, structuredDocumentToMarkdown };

package/dist/index.js CHANGED Viewed

@@ -993,21 +993,27 @@ async function convertDocxBufferToMarkdown(input, options) {
 }
 // src/pdf-markdown.ts
-var BROWSER_WARNING = "@dragon708/docmind-markdown: PDF \u2192 Markdown via @opendataloader/pdf requires Node.js. In the browser, use a server-side conversion or supply structured text/Markdown from your backend.";
+var BROWSER_WARNING = "@dragon708/docmind-markdown: PDF \u2192 Markdown via @cognipeer/to-markdown requires Node.js. In the browser, use a server-side conversion or supply structured text/Markdown from your backend.";
+var COGNIPEER_WARN_TAG = "[docmind-markdown:pdf] pdf-cognipeer-specialized:";
 function normalizePdfMarkdown(markdown, clean) {
   const t = markdown.trim();
   if (!clean) return t;
   return t.replace(/\n{3,}/g, "\n\n");
 }
-function engineOptions(options) {
+var FALLBACK_WARN_PREFIX = "[docmind-markdown:pdf] pdf-structured-fallback: serializing StructuredDocumentResult to Markdown because";
+function structuredFallbackWarnings(reason, detail) {
+  const tail = reason === "module-not-found" ? "@cognipeer/to-markdown could not be loaded." : reason === "error" ? "the specialized PDF engine raised an error or rejected the input." : reason === "empty" ? "the specialized engine returned empty Markdown." : "the specialized path is unavailable in this runtime.";
+  const extra = detail ? ` (${detail})` : "";
+  return [`${FALLBACK_WARN_PREFIX} ${tail}${extra}`];
+}
+function cognipeerConverterOptions(options) {
   if (!options) return {};
-  const {
-    resolveStructured: _r,
-    structuredMarkdown: _s,
-    cleanMarkdown: _c,
-    ...rest
-  } = options;
-  return rest;
+  const { fileName, forceExtension, url } = options;
+  const o = {};
+  if (fileName !== void 0) o.fileName = fileName;
+  if (forceExtension !== void 0) o.forceExtension = forceExtension;
+  if (url !== void 0) o.url = url;
+  return o;
 }
 async function toNodeBuffer2(input) {
   const { Buffer: Buffer2 } = await importEsm("node:buffer");
@@ -1015,11 +1021,17 @@ async function toNodeBuffer2(input) {
   if (input instanceof ArrayBuffer) return Buffer2.from(input);
   return Buffer2.from(input);
 }
+async function loadCognipeerConvertToMarkdown() {
+  const { createRequire } = await importEsm("node:module");
+  const require2 = createRequire(import.meta.url);
+  const mod = require2("@cognipeer/to-markdown");
+  return mod.convertToMarkdown;
+}
 async function convertPdfToMarkdown(input, options) {
   const clean = options?.cleanMarkdown !== false;
   const resolveStructured = options?.resolveStructured;
   const structuredMdOpts = options?.structuredMarkdown;
-  const eng = engineOptions(options);
+  const cognipeerOpts = cognipeerConverterOptions(options);
   if (!isNodeRuntime()) {
     return {
       markdown: "",
@@ -1046,15 +1058,13 @@ async function convertPdfToMarkdown(input, options) {
       await writeFile(inputPath, buffer);
       cleanup = async () => rm(dir, { recursive: true, force: true });
     }
-    let convert;
+    let convertToMarkdown;
     try {
-      ({ convert } = await importEsm(
-        "@opendataloader/pdf"
-      ));
+      convertToMarkdown = await loadCognipeerConvertToMarkdown();
     } catch (e) {
-      const hint = e instanceof Error && /Cannot find module|MODULE_NOT_FOUND/i.test(e.message) ? " Install `@opendataloader/pdf` in your project." : "";
+      const hint = e instanceof Error && /Cannot find module|MODULE_NOT_FOUND/i.test(e.message) ? " Install `@cognipeer/to-markdown` in your project." : "";
       warnings.push(
-        `@opendataloader/pdf could not be loaded (${e instanceof Error ? e.message : String(e)}).${hint}`
+        `${COGNIPEER_WARN_TAG} package could not be loaded (${e instanceof Error ? e.message : String(e)}).${hint}`
       );
       if (resolveStructured) {
         try {
@@ -1065,7 +1075,10 @@ async function convertPdfToMarkdown(input, options) {
           );
           return {
             markdown: md,
-            warnings,
+            warnings: [
+              ...structuredFallbackWarnings("module-not-found"),
+              ...warnings
+            ],
             source: "structured-fallback",
             fallbackReason: "module-not-found"
           };
@@ -1078,20 +1091,16 @@ async function convertPdfToMarkdown(input, options) {
       return {
         markdown: "",
         warnings,
-        source: "opendataloader",
+        source: "cognipeer-unavailable",
         fallbackReason: "module-not-found"
       };
     }
     let rawMarkdown;
     try {
-      rawMarkdown = await convert(inputPath, {
-        ...eng,
-        format: "markdown",
-        toStdout: true,
-        quiet: eng.quiet !== false
-      }).then((s) => String(s));
+      rawMarkdown = await convertToMarkdown(inputPath, cognipeerOpts);
     } catch (e) {
-      warnings.push(`PDF conversion failed: ${e instanceof Error ? e.message : String(e)}`);
+      const msg = e instanceof Error ? e.message : String(e);
+      warnings.push(`${COGNIPEER_WARN_TAG} ${msg}`);
       if (resolveStructured) {
         try {
           const structured = await resolveStructured();
@@ -1101,7 +1110,10 @@ async function convertPdfToMarkdown(input, options) {
           );
           return {
             markdown: md,
-            warnings,
+            warnings: [
+              ...structuredFallbackWarnings("error", msg.slice(0, 500)),
+              ...warnings
+            ],
             source: "structured-fallback",
             fallbackReason: "error"
           };
@@ -1114,13 +1126,18 @@ async function convertPdfToMarkdown(input, options) {
       return {
         markdown: "",
         warnings,
-        source: "opendataloader",
+        source: "cognipeer-failed",
         fallbackReason: "error"
       };
     }
-    let markdown = normalizePdfMarkdown(rawMarkdown, clean);
+    let markdown = normalizePdfMarkdown(
+      typeof rawMarkdown === "string" ? rawMarkdown : String(rawMarkdown ?? ""),
+      clean
+    );
     if (markdown.length === 0) {
-      warnings.push("OpenDataLoader returned empty Markdown for this PDF.");
+      warnings.push(
+        `${COGNIPEER_WARN_TAG} returned empty Markdown for this PDF (whitespace-only after normalize).`
+      );
       if (resolveStructured) {
         try {
           const structured = await resolveStructured();
@@ -1130,7 +1147,7 @@ async function convertPdfToMarkdown(input, options) {
           );
           return {
             markdown,
-            warnings,
+            warnings: [...structuredFallbackWarnings("empty"), ...warnings],
             source: "structured-fallback",
             fallbackReason: "empty"
           };
@@ -1143,11 +1160,11 @@ async function convertPdfToMarkdown(input, options) {
       return {
         markdown: "",
         warnings,
-        source: "opendataloader",
+        source: "cognipeer-failed",
         fallbackReason: "empty"
       };
     }
-    return { markdown, warnings, source: "opendataloader" };
+    return { markdown, warnings, source: "cognipeer" };
   } finally {
     if (cleanup) {
       await cleanup().catch(() => {
@@ -1166,13 +1183,13 @@ function throwIfLegacyFailure(r) {
   }
 }
 async function convertPdfPathToMarkdown(inputPath, options) {
-  assertNodeRuntime("PDF \u2192 Markdown (@opendataloader/pdf)");
+  assertNodeRuntime("PDF \u2192 Markdown (@cognipeer/to-markdown)");
   const r = await convertPdfToMarkdown(inputPath, options);
   throwIfLegacyFailure(r);
   return { markdown: r.markdown };
 }
 async function convertPdfBufferToMarkdown(input, options) {
-  assertNodeRuntime("PDF \u2192 Markdown (@opendataloader/pdf)");
+  assertNodeRuntime("PDF \u2192 Markdown (@cognipeer/to-markdown)");
   const r = await convertPdfToMarkdown(input, options);
   throwIfLegacyFailure(r);
   return { markdown: r.markdown };
@@ -1241,9 +1258,19 @@ function docxStrategyFromSource(source) {
   return source === "structured-fallback" ? "docx-structured-fallback" : "docx-mammoth";
 }
 function pdfStrategyFromResult(r) {
-  if (r.source === "structured-fallback") return "pdf-structured-fallback";
-  if (r.source === "unsupported-runtime") return "pdf-unsupported-runtime";
-  return "pdf-opendataloader";
+  switch (r.source) {
+    case "structured-fallback":
+      return "pdf-structured-fallback";
+    case "unsupported-runtime":
+      return "pdf-unsupported-runtime";
+    case "cognipeer-unavailable":
+      return "pdf-cognipeer-unavailable";
+    case "cognipeer-failed":
+      return "pdf-cognipeer-failed";
+    case "cognipeer":
+    default:
+      return "pdf-cognipeer-specialized";
+  }
 }
 function mergeWarnings(base, ...more) {
   const out = [...base];
@@ -1252,6 +1279,22 @@ function mergeWarnings(base, ...more) {
   }
   return out;
 }
+var EXTRACT_WARN = "[docmind-markdown:extractMarkdown]";
+function traceUsedStructuredFallback(context) {
+  return `${EXTRACT_WARN} ${context}: final Markdown from structuredFallback (specialized route unavailable, failed, or insufficient).`;
+}
+function tracePdfStructuredAfterUnsupportedRuntime() {
+  return `${EXTRACT_WARN} pdf-unsupported-runtime: final Markdown from structuredFallback \u2014 @cognipeer/to-markdown cannot run in this environment.`;
+}
+function traceDocxStructuredAfterMammoth() {
+  return `${EXTRACT_WARN} docx-structured-fallback: final Markdown from structured envelope after Mammoth/Turndown did not yield the result.`;
+}
+function tracePdfStructuredAfterCognipeer() {
+  return `${EXTRACT_WARN} pdf-structured-fallback: final Markdown from structured envelope after Cognipeer PDF path did not yield the result.`;
+}
+function tracePdfSpecializedDeadEnd() {
+  return `${EXTRACT_WARN} pdf: Cognipeer specialized route did not produce Markdown and structuredFallback was not provided.`;
+}
 async function extractMarkdown(input, options) {
   const smOpts = pickStructuredMarkdownOptions(options);
   const fb = options?.structuredFallback;
@@ -1278,11 +1321,25 @@ async function extractMarkdown(input, options) {
       if (fb) {
         return {
           markdown: convertStructuredToMarkdown(fb, smOpts),
-          warnings: mergeWarnings(warnings, fb.warnings),
-          strategy: "path-requires-node"
+          warnings: mergeWarnings(warnings, fb.warnings, [traceUsedStructuredFallback("path-requires-node")]),
+          strategy: "path-requires-node",
+          routing: {
+            detectedFormat: "unknown",
+            specializedPipeline: "none",
+            usedStructuredFallback: true
+          }
         };
       }
-      return { markdown: "", warnings, strategy: "path-requires-node" };
+      return {
+        markdown: "",
+        warnings,
+        strategy: "path-requires-node",
+        routing: {
+          detectedFormat: "unknown",
+          specializedPipeline: "none",
+          usedStructuredFallback: false
+        }
+      };
     }
     const { readFile } = await importEsm(
       "node:fs/promises"
@@ -1302,7 +1359,7 @@ async function extractMarkdown(input, options) {
     if (fb) {
       return {
         markdown: convertStructuredToMarkdown(fb, smOpts),
-        warnings: mergeWarnings(warnings, fb.warnings),
+        warnings: mergeWarnings(warnings, fb.warnings, [traceUsedStructuredFallback("invalid-input-shape")]),
         strategy: "binary-unidentified-structured-fallback"
       };
     }
@@ -1317,37 +1374,79 @@ async function extractMarkdown(input, options) {
       if (fb) {
         return {
           markdown: convertStructuredToMarkdown(fb, smOpts),
-          warnings: mergeWarnings(warnings, fb.warnings),
-          strategy: "docx-requires-node"
+          warnings: mergeWarnings(warnings, fb.warnings, [traceUsedStructuredFallback("docx-requires-node")]),
+          strategy: "docx-requires-node",
+          routing: {
+            detectedFormat: "docx",
+            specializedPipeline: "none",
+            usedStructuredFallback: true
+          }
         };
       }
-      return { markdown: "", warnings, strategy: "docx-requires-node" };
+      return {
+        markdown: "",
+        warnings,
+        strategy: "docx-requires-node",
+        routing: {
+          detectedFormat: "docx",
+          specializedPipeline: "none",
+          usedStructuredFallback: false
+        }
+      };
     }
     const r = await convertDocxToMarkdown(data, buildDocxOptions(options));
+    const strategy = docxStrategyFromSource(r.source);
     const w = mergeWarnings(
       warnings,
-      r.messages.map((m) => m.message)
+      r.messages.map((m) => m.message),
+      strategy === "docx-structured-fallback" ? [traceDocxStructuredAfterMammoth()] : []
     );
     return {
       markdown: r.markdown,
       warnings: w,
-      strategy: docxStrategyFromSource(r.source)
+      strategy,
+      routing: {
+        detectedFormat: "docx",
+        specializedPipeline: "docx",
+        usedStructuredFallback: strategy === "docx-structured-fallback"
+      }
     };
   }
   if (fmt === "pdf") {
     const r = await convertPdfToMarkdown(data, buildPdfOptions(options));
     const strategy = pdfStrategyFromResult(r);
-    const w = mergeWarnings(warnings, r.warnings);
+    let w = mergeWarnings(warnings, r.warnings);
     if (strategy === "pdf-unsupported-runtime" && r.markdown === "" && fb) {
+      w = mergeWarnings(w, fb.warnings, [
+        tracePdfStructuredAfterUnsupportedRuntime()
+      ]);
       return {
         markdown: convertStructuredToMarkdown(fb, smOpts),
-        warnings: mergeWarnings(w, fb.warnings, [
-          "extractMarkdown: PDF route unavailable in this runtime; used structuredFallback."
-        ]),
-        strategy: "pdf-structured-fallback"
+        warnings: w,
+        strategy: "pdf-structured-fallback",
+        routing: {
+          detectedFormat: "pdf",
+          specializedPipeline: "pdf",
+          usedStructuredFallback: true
+        }
       };
     }
-    return { markdown: r.markdown, warnings: w, strategy };
+    if (strategy === "pdf-structured-fallback") {
+      w = mergeWarnings(w, [tracePdfStructuredAfterCognipeer()]);
+    }
+    if ((strategy === "pdf-cognipeer-failed" || strategy === "pdf-cognipeer-unavailable") && r.markdown.trim() === "" && !fb) {
+      w = mergeWarnings(w, [tracePdfSpecializedDeadEnd()]);
+    }
+    return {
+      markdown: r.markdown,
+      warnings: w,
+      strategy,
+      routing: {
+        detectedFormat: "pdf",
+        specializedPipeline: "pdf",
+        usedStructuredFallback: strategy === "pdf-structured-fallback"
+      }
+    };
   }
   warnings.push(
     "@dragon708/docmind-markdown: Unidentified binary format (expected PDF magic or ZIP/DOCX). Using structured fallback if provided."
@@ -1355,11 +1454,25 @@ async function extractMarkdown(input, options) {
   if (fb) {
     return {
       markdown: convertStructuredToMarkdown(fb, smOpts),
-      warnings: mergeWarnings(warnings, fb.warnings),
-      strategy: "binary-unidentified-structured-fallback"
+      warnings: mergeWarnings(warnings, fb.warnings, [traceUsedStructuredFallback("binary-unidentified")]),
+      strategy: "binary-unidentified-structured-fallback",
+      routing: {
+        detectedFormat: "unknown",
+        specializedPipeline: "none",
+        usedStructuredFallback: true
+      }
     };
   }
-  return { markdown: "", warnings, strategy: "binary-unidentified" };
+  return {
+    markdown: "",
+    warnings,
+    strategy: "binary-unidentified",
+    routing: {
+      detectedFormat: "unknown",
+      specializedPipeline: "none",
+      usedStructuredFallback: false
+    }
+  };
 }
 export { convertDocxBufferToMarkdown, convertDocxToMarkdown, convertPdfBufferToMarkdown, convertPdfPathToMarkdown, convertPdfToMarkdown, convertStructuredToLlmText, convertStructuredToMarkdown, detectBinaryFormat, extractLlmContent, extractMarkdown, extractStructuredChunks, isExtractMarkdownFileInput, renderLlmText, renderMarkdown, renderMarkdownSections, splitStructuredIntoChunks, structuredDocumentToLlmText, structuredDocumentToMarkdown };

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@dragon708/docmind-markdown",
-  "version": "1.1.3",
+  "version": "1.2.1",
   "description": "StructuredDocumentResult → Markdown and LLM-oriented plain text for DocMind.",
   "type": "module",
   "sideEffects": false,
@@ -35,8 +35,8 @@
   ],
   "license": "MIT",
   "dependencies": {
+    "@cognipeer/to-markdown": "^2.0.1",
     "@dragon708/docmind-shared": "^1.2.0",
-    "@opendataloader/pdf": "^2.2.1",
     "mammoth": "^1.6.0",
     "turndown": "^7.0.0",
     "turndown-plugin-gfm": "^1.0.2"