npm - @dragon708/docmind-markdown - Versions diffs - 1.2.6 → 1.2.8 - Mend

@dragon708/docmind-markdown 1.2.6 → 1.2.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (11) hide show

package/dist/index.d.ts +139 -12
package/dist/index.js +781 -48
package/node_modules/turndown-plugin-gfm/LICENSE +21 -0
package/node_modules/turndown-plugin-gfm/README.md +50 -0
package/node_modules/turndown-plugin-gfm/dist/turndown-plugin-gfm.js +165 -0
package/node_modules/turndown-plugin-gfm/lib/turndown-plugin-gfm.browser.cjs.js +162 -0
package/node_modules/turndown-plugin-gfm/lib/turndown-plugin-gfm.browser.es.js +154 -0
package/node_modules/turndown-plugin-gfm/lib/turndown-plugin-gfm.cjs.js +162 -0
package/node_modules/turndown-plugin-gfm/lib/turndown-plugin-gfm.es.js +154 -0
package/node_modules/turndown-plugin-gfm/package.json +43 -0
package/package.json +5 -1

package/dist/index.js CHANGED Viewed

@@ -992,6 +992,20 @@ async function convertDocxBufferToMarkdown(input, options) {
   return { markdown: r.markdown, messages: r.messages };
 }
+// src/cognipeer-runtime.ts
+async function loadCognipeerConvertToMarkdown() {
+  const { createRequire } = await importEsm("node:module");
+  const require2 = createRequire(import.meta.url);
+  const mod = require2("@cognipeer/to-markdown");
+  return mod.convertToMarkdown;
+}
+async function toNodeBuffer2(input) {
+  const { Buffer: Buffer2 } = await importEsm("node:buffer");
+  if (Buffer2.isBuffer(input)) return input;
+  if (input instanceof ArrayBuffer) return Buffer2.from(input);
+  return Buffer2.from(input);
+}
 // src/pdf-markdown.ts
 var BROWSER_WARNING = "@dragon708/docmind-markdown: PDF \u2192 Markdown via @cognipeer/to-markdown requires Node.js. In the browser, use a server-side conversion or supply structured text/Markdown from your backend.";
 var COGNIPEER_WARN_TAG = "[docmind-markdown:pdf] pdf-cognipeer-specialized:";
@@ -1015,23 +1029,11 @@ function cognipeerConverterOptions(options) {
   if (url !== void 0) o.url = url;
   return o;
 }
-async function toNodeBuffer2(input) {
-  const { Buffer: Buffer2 } = await importEsm("node:buffer");
-  if (Buffer2.isBuffer(input)) return input;
-  if (input instanceof ArrayBuffer) return Buffer2.from(input);
-  return Buffer2.from(input);
-}
-async function loadCognipeerConvertToMarkdown() {
-  const { createRequire } = await importEsm("node:module");
-  const require2 = createRequire(import.meta.url);
-  const mod = require2("@cognipeer/to-markdown");
-  return mod.convertToMarkdown;
-}
 async function convertPdfToMarkdown(input, options) {
   const clean = options?.cleanMarkdown !== false;
   const resolveStructured = options?.resolveStructured;
   const structuredMdOpts = options?.structuredMarkdown;
-  const cognipeerOpts = cognipeerConverterOptions(options);
+  const cognipeerOpts2 = cognipeerConverterOptions(options);
   if (!isNodeRuntime()) {
     return {
       markdown: "",
@@ -1097,7 +1099,7 @@ async function convertPdfToMarkdown(input, options) {
     }
     let rawMarkdown;
     try {
-      rawMarkdown = await convertToMarkdown(inputPath, cognipeerOpts);
+      rawMarkdown = await convertToMarkdown(inputPath, cognipeerOpts2);
     } catch (e) {
       const msg = e instanceof Error ? e.message : String(e);
       warnings.push(`${COGNIPEER_WARN_TAG} ${msg}`);
@@ -1194,6 +1196,473 @@ async function convertPdfBufferToMarkdown(input, options) {
   throwIfLegacyFailure(r);
   return { markdown: r.markdown };
 }
+// src/cognipeer-file-markdown.ts
+var BROWSER = (label) => `@dragon708/docmind-markdown: ${label} \u2192 Markdown via @cognipeer/to-markdown requires Node.js. In the browser, use a server-side conversion or supply structured input / structuredFallback.`;
+function cognipeerOpts(options) {
+  if (!options) return {};
+  const { fileName, forceExtension, url } = options;
+  const o = {};
+  if (fileName !== void 0) o.fileName = fileName;
+  if (forceExtension !== void 0) o.forceExtension = forceExtension;
+  if (url !== void 0) o.url = url;
+  return o;
+}
+function normalizeMarkdown(markdown, clean) {
+  const t = markdown.trim();
+  if (!clean) return t;
+  return t.replace(/\n{3,}/g, "\n\n");
+}
+function structuredFallbackWarnings2(format, reason, detail) {
+  const tag = `[docmind-markdown:${format}] ${format}-structured-fallback:`;
+  const tail = reason === "module-not-found" ? "@cognipeer/to-markdown could not be loaded." : reason === "error" ? "the specialized engine raised an error or rejected the input." : reason === "empty" ? "the specialized engine returned empty Markdown." : "the specialized path is unavailable in this runtime.";
+  const extra = detail ? ` (${detail})` : "";
+  return [`${tag} serializing StructuredDocumentResult to Markdown because ${tail}${extra}`];
+}
+function warnTag(format) {
+  return `[docmind-markdown:${format}] ${format}-cognipeer-specialized:`;
+}
+async function convertCognipeerFileToMarkdown(format, defaultTempFile, input, options) {
+  const clean = options?.cleanMarkdown !== false;
+  const resolveStructured = options?.resolveStructured;
+  const structuredMdOpts = options?.structuredMarkdown;
+  const browserLabel = format === "html" ? "HTML" : format === "csv" ? "CSV" : "Spreadsheet";
+  if (!isNodeRuntime()) {
+    return {
+      markdown: "",
+      warnings: [BROWSER(browserLabel)],
+      source: "unsupported-runtime",
+      fallbackReason: "unsupported-runtime"
+    };
+  }
+  const warnings = [];
+  let cleanup;
+  try {
+    let inputPath;
+    if (typeof input === "string") {
+      inputPath = input;
+    } else {
+      const [{ mkdtemp, writeFile, rm }, { join }, { tmpdir }, buffer] = await Promise.all([
+        importEsm("node:fs/promises"),
+        importEsm("node:path"),
+        importEsm("node:os"),
+        toNodeBuffer2(input)
+      ]);
+      const dir = await mkdtemp(join(tmpdir(), `docmind-markdown-${format}-`));
+      inputPath = join(dir, defaultTempFile);
+      await writeFile(inputPath, buffer);
+      cleanup = async () => rm(dir, { recursive: true, force: true });
+    }
+    let convertToMarkdown;
+    try {
+      convertToMarkdown = await loadCognipeerConvertToMarkdown();
+    } catch (e) {
+      const hint = e instanceof Error && /Cannot find module|MODULE_NOT_FOUND/i.test(e.message) ? " Install `@cognipeer/to-markdown` in your project." : "";
+      warnings.push(
+        `${warnTag(format)} package could not be loaded (${e instanceof Error ? e.message : String(e)}).${hint}`
+      );
+      if (resolveStructured) {
+        try {
+          const structured = await resolveStructured();
+          const md = normalizeMarkdown(
+            convertStructuredToMarkdown(structured, structuredMdOpts),
+            clean
+          );
+          return {
+            markdown: md,
+            warnings: [
+              ...structuredFallbackWarnings2(format, "module-not-found"),
+              ...warnings
+            ],
+            source: "structured-fallback",
+            fallbackReason: "module-not-found"
+          };
+        } catch (e2) {
+          warnings.push(
+            `Structured fallback failed: ${e2 instanceof Error ? e2.message : String(e2)}`
+          );
+        }
+      }
+      return {
+        markdown: "",
+        warnings,
+        source: "cognipeer-unavailable",
+        fallbackReason: "module-not-found"
+      };
+    }
+    let rawMarkdown;
+    try {
+      rawMarkdown = await convertToMarkdown(inputPath, cognipeerOpts(options));
+    } catch (e) {
+      const msg = e instanceof Error ? e.message : String(e);
+      warnings.push(`${warnTag(format)} ${msg}`);
+      if (resolveStructured) {
+        try {
+          const structured = await resolveStructured();
+          const md = normalizeMarkdown(
+            convertStructuredToMarkdown(structured, structuredMdOpts),
+            clean
+          );
+          return {
+            markdown: md,
+            warnings: [
+              ...structuredFallbackWarnings2(format, "error", msg.slice(0, 500)),
+              ...warnings
+            ],
+            source: "structured-fallback",
+            fallbackReason: "error"
+          };
+        } catch (e2) {
+          warnings.push(
+            `Structured fallback failed: ${e2 instanceof Error ? e2.message : String(e2)}`
+          );
+        }
+      }
+      return {
+        markdown: "",
+        warnings,
+        source: "cognipeer-failed",
+        fallbackReason: "error"
+      };
+    }
+    let markdown = normalizeMarkdown(
+      typeof rawMarkdown === "string" ? rawMarkdown : String(rawMarkdown ?? ""),
+      clean
+    );
+    if (markdown.length === 0) {
+      warnings.push(
+        `${warnTag(format)} returned empty Markdown for this input (whitespace-only after normalize).`
+      );
+      if (resolveStructured) {
+        try {
+          const structured = await resolveStructured();
+          markdown = normalizeMarkdown(
+            convertStructuredToMarkdown(structured, structuredMdOpts),
+            clean
+          );
+          return {
+            markdown,
+            warnings: [...structuredFallbackWarnings2(format, "empty"), ...warnings],
+            source: "structured-fallback",
+            fallbackReason: "empty"
+          };
+        } catch (e2) {
+          warnings.push(
+            `Structured fallback failed: ${e2 instanceof Error ? e2.message : String(e2)}`
+          );
+        }
+      }
+      return {
+        markdown: "",
+        warnings,
+        source: "cognipeer-failed",
+        fallbackReason: "empty"
+      };
+    }
+    return { markdown, warnings, source: "cognipeer" };
+  } finally {
+    if (cleanup) {
+      await cleanup().catch(() => {
+      });
+    }
+  }
+}
+// src/node-is-regular-file.ts
+async function isExistingRegularFile(path) {
+  try {
+    const { stat } = await importEsm("node:fs/promises");
+    const s = await stat(path);
+    return s.isFile();
+  } catch {
+    return false;
+  }
+}
+// src/html-markdown.ts
+function looksLikeHtmlString(s) {
+  const t = s.trimStart();
+  if (t.length === 0) return false;
+  if (/^<!DOCTYPE\s+html/i.test(t)) return true;
+  if (/^<html[\s>]/i.test(t)) return true;
+  if (/^<head[\s>]/i.test(t)) return true;
+  if (/^<!--/.test(t)) return true;
+  const c0 = t[0];
+  const c1 = t[1] ?? "";
+  if (c0 === "<" && /[a-zA-Z!?]/.test(c1)) return true;
+  return false;
+}
+async function resolveHtmlStringInput(s, mode) {
+  const { Buffer: Buffer2 } = await importEsm("node:buffer");
+  if (mode === "html") {
+    return { kind: "buffer", buffer: Buffer2.from(s, "utf8") };
+  }
+  if (mode === "path") {
+    return { kind: "path", path: s };
+  }
+  if (isNodeRuntime() && await isExistingRegularFile(s)) {
+    return { kind: "path", path: s };
+  }
+  if (looksLikeHtmlString(s)) {
+    return { kind: "buffer", buffer: Buffer2.from(s, "utf8") };
+  }
+  return { kind: "path", path: s };
+}
+async function convertHtmlToMarkdown(input, options) {
+  const mode = options?.inputMode ?? "auto";
+  const { inputMode: _omit, ...cognipeerOptions } = options ?? {};
+  if (typeof input === "string") {
+    if (!isNodeRuntime()) {
+      return convertCognipeerFileToMarkdown("html", "document.html", input, cognipeerOptions);
+    }
+    const resolved = await resolveHtmlStringInput(input, mode);
+    if (resolved.kind === "path") {
+      return convertCognipeerFileToMarkdown("html", "document.html", resolved.path, cognipeerOptions);
+    }
+    return convertCognipeerFileToMarkdown("html", "document.html", resolved.buffer, {
+      ...cognipeerOptions,
+      forceExtension: cognipeerOptions.forceExtension ?? ".html",
+      fileName: cognipeerOptions.fileName ?? "document.html"
+    });
+  }
+  return convertCognipeerFileToMarkdown("html", "document.html", input, cognipeerOptions);
+}
+// src/tabular-markdown-postprocess.ts
+function compactMarkdownOutput(markdown) {
+  return markdown.replace(/\n{3,}/g, "\n\n").split("\n").map((l) => l.trimEnd()).join("\n").trim();
+}
+function countCsvColumns(firstLine) {
+  let n = 1;
+  let inQuotes = false;
+  for (let i = 0; i < firstLine.length; i++) {
+    const c = firstLine[i];
+    if (c === '"') {
+      inQuotes = !inQuotes;
+    } else if (c === "," && !inQuotes) {
+      n++;
+    }
+  }
+  return n;
+}
+function prepareCsvTextForCognipeer(text, options) {
+  const warnings = [];
+  const includeHeader = options?.includeHeader !== false;
+  const maxRows = options?.maxRows;
+  let lines = text.split(/\r?\n/).filter((l) => l.length > 0);
+  if (lines.length === 0) {
+    return { text, warnings };
+  }
+  if (!includeHeader) {
+    const colCount = Math.max(1, countCsvColumns(lines[0]));
+    const synth = Array.from({ length: colCount }, (_, i) => `Column ${i + 1}`).join(",");
+    lines = [synth, ...lines];
+    warnings.push(
+      "[docmind-markdown:csv] includeHeader:false: prepended synthetic header row so the first CSV row appears as table data."
+    );
+  }
+  if (maxRows != null && maxRows >= 0) {
+    const header = lines[0];
+    const rest = lines.slice(1);
+    const data = rest.slice(0, maxRows);
+    if (rest.length > maxRows) {
+      warnings.push(
+        `[docmind-markdown:csv] maxRows:${maxRows}: truncated data rows before conversion (line-based; quoted newlines inside fields may skew counts).`
+      );
+    }
+    lines = [header, ...data];
+  }
+  return { text: lines.join("\n"), warnings };
+}
+function stripSpreadsheetSheetHeadings(markdown) {
+  return markdown.replace(/^##[^\n]+\n+/gm, "");
+}
+function limitSpreadsheetMarkdownRowsPerSheet(markdown, maxRowsPerSheet) {
+  const warnings = [];
+  if (maxRowsPerSheet < 0) return { markdown, warnings };
+  const lines = markdown.split("\n");
+  const out = [];
+  let i = 0;
+  let truncatedAny = false;
+  const emitLimitedTable = (tableLines) => {
+    if (tableLines.length >= 3) {
+      const header = tableLines[0];
+      const sep = tableLines[1];
+      const body = tableLines.slice(2, 2 + maxRowsPerSheet);
+      if (tableLines.length - 2 > maxRowsPerSheet) truncatedAny = true;
+      out.push(header, sep, ...body);
+    } else {
+      out.push(...tableLines);
+    }
+  };
+  while (i < lines.length) {
+    const line = lines[i];
+    const isSheetTitle = /^##\s+.+$/.test(line);
+    if (isSheetTitle) {
+      out.push(line);
+      i++;
+      while (i < lines.length && lines[i].trim() === "") {
+        out.push(lines[i]);
+        i++;
+      }
+      const tableStart = i;
+      while (i < lines.length && lines[i].trim().startsWith("|")) {
+        i++;
+      }
+      emitLimitedTable(lines.slice(tableStart, i));
+      continue;
+    }
+    if (line.trim().startsWith("|")) {
+      const tableStart = i;
+      while (i < lines.length && lines[i].trim().startsWith("|")) {
+        i++;
+      }
+      emitLimitedTable(lines.slice(tableStart, i));
+      continue;
+    }
+    out.push(line);
+    i++;
+  }
+  if (truncatedAny) {
+    warnings.push(
+      `[docmind-markdown:spreadsheet] maxRowsPerSheet:${maxRowsPerSheet}: truncated data rows in one or more sheet tables.`
+    );
+  }
+  return { markdown: out.join("\n"), warnings };
+}
+// src/csv-markdown.ts
+function looksLikeCsvContent(s) {
+  return s.includes(",") && /[\r\n]/.test(s);
+}
+function stripCsvOptionKeys(o) {
+  if (!o) return {};
+  const {
+    inputMode: _im,
+    includeHeader: _ih,
+    compactMode: _cm,
+    maxRows: _mr,
+    ...rest
+  } = o;
+  return rest;
+}
+function finishCsvResult(r, prependWarnings, options) {
+  const markdown = options?.compactMode === true ? compactMarkdownOutput(r.markdown) : r.markdown;
+  if (prependWarnings.length === 0 && markdown === r.markdown) return r;
+  return {
+    ...r,
+    markdown,
+    warnings: [...prependWarnings, ...r.warnings]
+  };
+}
+async function readUtf8File(path) {
+  const { readFile } = await importEsm("node:fs/promises");
+  return readFile(path, "utf8");
+}
+async function resolveCsvStringInput(s, mode) {
+  if (mode === "content") return { kind: "text", text: s };
+  if (mode === "path") return { kind: "path", path: s };
+  if (isNodeRuntime() && await isExistingRegularFile(s)) return { kind: "path", path: s };
+  if (looksLikeCsvContent(s)) return { kind: "text", text: s };
+  return { kind: "path", path: s };
+}
+function csvNeedsPreprocess(options) {
+  return options?.maxRows != null || options?.includeHeader === false;
+}
+async function convertCsvToMarkdown(input, options) {
+  const cognipeerOptions = stripCsvOptionKeys(options);
+  const prepArgs = { includeHeader: options?.includeHeader, maxRows: options?.maxRows };
+  const needsPrep = csvNeedsPreprocess(options);
+  const strMode = options?.inputMode ?? "auto";
+  if (typeof input === "string") {
+    if (!isNodeRuntime()) {
+      const r3 = await convertCognipeerFileToMarkdown("csv", "document.csv", input, cognipeerOptions);
+      return finishCsvResult(r3, [], options);
+    }
+    const resolved = await resolveCsvStringInput(input, strMode);
+    if (resolved.kind === "path") {
+      if (needsPrep) {
+        const raw = await readUtf8File(resolved.path);
+        const { text: text3, warnings: w3 } = prepareCsvTextForCognipeer(raw, prepArgs);
+        const r4 = await convertCognipeerFileToMarkdown(
+          "csv",
+          "document.csv",
+          Buffer.from(text3, "utf8"),
+          cognipeerOptions
+        );
+        return finishCsvResult(r4, w3, options);
+      }
+      const r3 = await convertCognipeerFileToMarkdown(
+        "csv",
+        "document.csv",
+        resolved.path,
+        cognipeerOptions
+      );
+      return finishCsvResult(r3, [], options);
+    }
+    const { text: text2, warnings: w2 } = prepareCsvTextForCognipeer(resolved.text, prepArgs);
+    const r2 = await convertCognipeerFileToMarkdown(
+      "csv",
+      "document.csv",
+      Buffer.from(text2, "utf8"),
+      cognipeerOptions
+    );
+    return finishCsvResult(r2, w2, options);
+  }
+  if (!needsPrep) {
+    const r2 = await convertCognipeerFileToMarkdown("csv", "document.csv", input, cognipeerOptions);
+    return finishCsvResult(r2, [], options);
+  }
+  const buf = await toNodeBuffer2(input);
+  const { text, warnings: w } = prepareCsvTextForCognipeer(buf.toString("utf8"), prepArgs);
+  const r = await convertCognipeerFileToMarkdown(
+    "csv",
+    "document.csv",
+    Buffer.from(text, "utf8"),
+    cognipeerOptions
+  );
+  return finishCsvResult(r, w, options);
+}
+// src/spreadsheet-markdown.ts
+function stripSpreadsheetOptionKeys(o) {
+  if (!o) return {};
+  const { includeSheetNames: _isn, compactMode: _cm, maxRowsPerSheet: _mr, ...rest } = o;
+  return rest;
+}
+function finishSpreadsheetResult(r, options) {
+  if (!options) return r;
+  let markdown = r.markdown;
+  const warnings = [...r.warnings];
+  if (r.source === "cognipeer") {
+    if (options.maxRowsPerSheet != null) {
+      const lim = limitSpreadsheetMarkdownRowsPerSheet(markdown, options.maxRowsPerSheet);
+      markdown = lim.markdown;
+      warnings.push(...lim.warnings);
+    }
+    if (options.includeSheetNames === false) {
+      markdown = stripSpreadsheetSheetHeadings(markdown);
+      warnings.push(
+        "[docmind-markdown:spreadsheet] includeSheetNames:false: removed ## sheet title lines from specialized output."
+      );
+    }
+  }
+  if (options.compactMode === true) {
+    markdown = compactMarkdownOutput(markdown);
+  }
+  if (markdown === r.markdown && warnings.length === r.warnings.length) return r;
+  return { ...r, markdown, warnings };
+}
+async function convertSpreadsheetToMarkdown(input, options) {
+  const cognipeerOptions = stripSpreadsheetOptionKeys(options);
+  const r = await convertCognipeerFileToMarkdown(
+    "spreadsheet",
+    "document.xlsx",
+    input,
+    cognipeerOptions
+  );
+  return finishSpreadsheetResult(r, options);
+}
 function isArrayBufferLike(data) {
   if (data instanceof ArrayBuffer) return true;
   if (typeof Uint8Array !== "undefined" && data instanceof Uint8Array) return true;
@@ -1210,7 +1679,15 @@ function isExtractMarkdownPathInput(value) {
 }
 function pickStructuredMarkdownOptions(options) {
   if (!options) return {};
-  const { structuredFallback: _a, docx: _b, pdf: _c, ...rest } = options;
+  const {
+    structuredFallback: _a,
+    docx: _b,
+    pdf: _c,
+    html: _h,
+    csv: _csv,
+    spreadsheet: _s,
+    ...rest
+  } = options;
   return rest;
 }
 function buildDocxOptions(extract) {
@@ -1233,25 +1710,109 @@ function buildPdfOptions(extract) {
     structuredMarkdown: { ...sm, ...pdf?.structuredMarkdown }
   };
 }
+function buildHtmlOptions(extract) {
+  const html = extract?.html;
+  const fb = extract?.structuredFallback;
+  const sm = pickStructuredMarkdownOptions(extract);
+  return {
+    ...html,
+    resolveStructured: html?.resolveStructured ?? (fb ? () => Promise.resolve(fb) : void 0),
+    structuredMarkdown: { ...sm, ...html?.structuredMarkdown }
+  };
+}
+function buildCsvOptions(extract) {
+  const csv = extract?.csv;
+  const fb = extract?.structuredFallback;
+  const sm = pickStructuredMarkdownOptions(extract);
+  return {
+    ...csv,
+    resolveStructured: csv?.resolveStructured ?? (fb ? () => Promise.resolve(fb) : void 0),
+    structuredMarkdown: { ...sm, ...csv?.structuredMarkdown }
+  };
+}
+function buildSpreadsheetOptions(extract) {
+  const spreadsheet = extract?.spreadsheet;
+  const fb = extract?.structuredFallback;
+  const sm = pickStructuredMarkdownOptions(extract);
+  return {
+    ...spreadsheet,
+    resolveStructured: spreadsheet?.resolveStructured ?? (fb ? () => Promise.resolve(fb) : void 0),
+    structuredMarkdown: { ...sm, ...spreadsheet?.structuredMarkdown }
+  };
+}
 function toUint8View(data) {
   if (data instanceof Uint8Array) return data;
   if (data instanceof ArrayBuffer) return new Uint8Array(data);
   return new Uint8Array(data);
 }
+var XLS_OLE_MAGIC = new Uint8Array([208, 207, 17, 224, 161, 177, 26, 225]);
+function uint8ArraysEqual(a, b) {
+  if (a.length !== b.length) return false;
+  for (let i = 0; i < a.length; i++) if (a[i] !== b[i]) return false;
+  return true;
+}
+function containsUtf8Substring(haystack, needle) {
+  const bytes = new TextEncoder().encode(needle);
+  if (bytes.length === 0 || haystack.length < bytes.length) return false;
+  outer: for (let i = 0; i <= haystack.length - bytes.length; i++) {
+    for (let j = 0; j < bytes.length; j++) {
+      if (haystack[i + j] !== bytes[j]) continue outer;
+    }
+    return true;
+  }
+  return false;
+}
+function isZipLocalHeader(u) {
+  return u.length >= 4 && u[0] === 80 && u[1] === 75 && (u[2] === 3 || u[2] === 5 || u[2] === 7);
+}
+function looksLikeUtf8HtmlPrefix(u) {
+  if (u.length === 0) return false;
+  let start = 0;
+  if (u.length >= 3 && u[0] === 239 && u[1] === 187 && u[2] === 191) start = 3;
+  let s = "";
+  const n = Math.min(u.length, 256);
+  for (let i = start; i < n; i++) {
+    const c = u[i];
+    if (c === 0 || c > 127) return false;
+    s += String.fromCharCode(c);
+  }
+  const t = s.trimStart().slice(0, 96).toLowerCase();
+  return t.startsWith("<!doctype html") || t.startsWith("<html") || t.startsWith("<head") || t.startsWith("<!--");
+}
 function detectBinaryFormat(data, filename, mimeType) {
   const u = toUint8View(data);
   const lower = filename?.toLowerCase() ?? "";
   const mime = mimeType?.toLowerCase() ?? "";
   if (mime.includes("pdf") || lower.endsWith(".pdf")) return "pdf";
+  if (mime.includes("text/html") || mime.includes("application/xhtml+xml") || lower.endsWith(".html") || lower.endsWith(".htm")) {
+    return "html";
+  }
+  if (mime.includes("text/csv") || mime.includes("application/csv") || lower.endsWith(".csv")) {
+    return "csv";
+  }
+  if (mime.includes("spreadsheetml") || mime.includes("officedocument.spreadsheetml") || mime.includes("application/vnd.ms-excel") || lower.endsWith(".xlsx") || lower.endsWith(".xls")) {
+    return "spreadsheet";
+  }
   if (mime.includes("wordprocessingml") || mime.includes("officedocument.wordprocessingml.document") || lower.endsWith(".docx")) {
     return "docx";
   }
   if (u.length >= 4 && u[0] === 37 && u[1] === 80 && u[2] === 68 && u[3] === 70) {
     return "pdf";
   }
-  if (u.length >= 4 && u[0] === 80 && u[1] === 75 && (u[2] === 3 || u[2] === 5 || u[2] === 7)) {
-    return "docx";
+  if (u.length >= XLS_OLE_MAGIC.length && uint8ArraysEqual(u.subarray(0, XLS_OLE_MAGIC.length), XLS_OLE_MAGIC)) {
+    return "spreadsheet";
   }
+  if (isZipLocalHeader(u)) {
+    const hasWordDoc = containsUtf8Substring(u, "word/document");
+    const hasXlWorkbook = containsUtf8Substring(u, "xl/workbook");
+    if (hasWordDoc && !hasXlWorkbook) return "docx";
+    if (hasXlWorkbook && !hasWordDoc) return "spreadsheet";
+    if (hasWordDoc && hasXlWorkbook) return "docx";
+    if (lower.endsWith(".docx")) return "docx";
+    if (lower.endsWith(".xlsx")) return "spreadsheet";
+    return "unknown";
+  }
+  if (looksLikeUtf8HtmlPrefix(u)) return "html";
   return "unknown";
 }
 function docxStrategyFromSource(source) {
@@ -1272,6 +1833,21 @@ function pdfStrategyFromResult(r) {
       return "pdf-cognipeer-specialized";
   }
 }
+function cognipeerFileStrategyFromResult(format, r) {
+  switch (r.source) {
+    case "structured-fallback":
+      return format === "html" ? "html-structured-fallback" : format === "csv" ? "csv-structured-fallback" : "spreadsheet-structured-fallback";
+    case "unsupported-runtime":
+      return format === "html" ? "html-unsupported-runtime" : format === "csv" ? "csv-unsupported-runtime" : "spreadsheet-unsupported-runtime";
+    case "cognipeer-unavailable":
+      return format === "html" ? "html-cognipeer-unavailable" : format === "csv" ? "csv-cognipeer-unavailable" : "spreadsheet-cognipeer-unavailable";
+    case "cognipeer-failed":
+      return format === "html" ? "html-cognipeer-failed" : format === "csv" ? "csv-cognipeer-failed" : "spreadsheet-cognipeer-failed";
+    case "cognipeer":
+    default:
+      return format === "html" ? "html-cognipeer-specialized" : format === "csv" ? "csv-cognipeer-specialized" : "spreadsheet-cognipeer-specialized";
+  }
+}
 function mergeWarnings(base, ...more) {
   const out = [...base];
   for (const m of more) {
@@ -1280,6 +1856,30 @@ function mergeWarnings(base, ...more) {
   return out;
 }
 var EXTRACT_WARN = "[docmind-markdown:extractMarkdown]";
+var ROUTING_TAG = "[docmind-markdown:extractMarkdown:routing]";
+function inferMediaHint(mimeType, filename) {
+  const m = mimeType?.toLowerCase().trim() ?? "";
+  const f = filename?.toLowerCase() ?? "";
+  if (m.startsWith("image/") || /\.(png|jpe?g|gif|webp|bmp|ico|svg|tiff?)$/i.test(f)) {
+    return "image";
+  }
+  if (m.startsWith("text/") || m === "application/json" || /\.(txt|md|json|log)$/i.test(f)) {
+    return "text";
+  }
+  if (m.startsWith("audio/")) return "audio";
+  if (m.startsWith("video/")) return "video";
+  return void 0;
+}
+function buildRouting(p) {
+  const hintPart = p.mediaHint ? ` mediaHint=${p.mediaHint}` : "";
+  return {
+    detectedFormat: p.detectedFormat,
+    specializedPipeline: p.specializedPipeline,
+    usedStructuredFallback: p.usedStructuredFallback,
+    mediaHint: p.mediaHint,
+    routingSummary: `${ROUTING_TAG} strategy=${p.strategy} format=${p.detectedFormat} pipeline=${p.specializedPipeline} structuredFallback=${p.usedStructuredFallback}${hintPart}`
+  };
+}
 function traceUsedStructuredFallback(context) {
   return `${EXTRACT_WARN} ${context}: final Markdown from structuredFallback (specialized route unavailable, failed, or insufficient).`;
 }
@@ -1295,6 +1895,75 @@ function tracePdfStructuredAfterCognipeer() {
 function tracePdfSpecializedDeadEnd() {
   return `${EXTRACT_WARN} pdf: Cognipeer specialized route did not produce Markdown and structuredFallback was not provided.`;
 }
+function traceCognipeerFileStructuredAfterUnsupportedRuntime(label) {
+  return `${EXTRACT_WARN} ${label}-unsupported-runtime: final Markdown from structuredFallback \u2014 @cognipeer/to-markdown cannot run in this environment.`;
+}
+function traceCognipeerFileStructuredAfterCognipeer(label) {
+  return `${EXTRACT_WARN} ${label}-structured-fallback: final Markdown from structured envelope after Cognipeer ${label} path did not yield the result.`;
+}
+function traceCognipeerFileSpecializedDeadEnd(label) {
+  return `${EXTRACT_WARN} ${label}: Cognipeer specialized route did not produce Markdown and structuredFallback was not provided.`;
+}
+function traceCognipeerFileExtractLayerFallback(label) {
+  return `${EXTRACT_WARN} ${label}-extract-layer-fallback: specialized route returned empty Markdown but structuredFallback is set \u2014 applying convertStructuredToMarkdown at extractMarkdown layer.`;
+}
+function tracePdfStructuredExtractLayerFallback() {
+  return `${EXTRACT_WARN} pdf-extract-layer-fallback: specialized route returned empty Markdown but structuredFallback is set \u2014 applying convertStructuredToMarkdown at extractMarkdown layer.`;
+}
+async function extractCognipeerFileMarkdownBranch(format, data, options, baseWarnings, smOpts, fb) {
+  const r = format === "html" ? await convertHtmlToMarkdown(data, buildHtmlOptions(options)) : format === "csv" ? await convertCsvToMarkdown(data, buildCsvOptions(options)) : await convertSpreadsheetToMarkdown(data, buildSpreadsheetOptions(options));
+  const strategy = cognipeerFileStrategyFromResult(format, r);
+  let w = mergeWarnings(baseWarnings, r.warnings);
+  const unsupported = format === "html" ? "html-unsupported-runtime" : format === "csv" ? "csv-unsupported-runtime" : "spreadsheet-unsupported-runtime";
+  const structuredFb = format === "html" ? "html-structured-fallback" : format === "csv" ? "csv-structured-fallback" : "spreadsheet-structured-fallback";
+  const failed = format === "html" ? "html-cognipeer-failed" : format === "csv" ? "csv-cognipeer-failed" : "spreadsheet-cognipeer-failed";
+  const unavailable = format === "html" ? "html-cognipeer-unavailable" : format === "csv" ? "csv-cognipeer-unavailable" : "spreadsheet-cognipeer-unavailable";
+  if (strategy === unsupported && r.markdown === "" && fb) {
+    w = mergeWarnings(w, fb.warnings, [traceCognipeerFileStructuredAfterUnsupportedRuntime(format)]);
+    return {
+      markdown: convertStructuredToMarkdown(fb, smOpts),
+      warnings: w,
+      strategy: structuredFb,
+      routing: buildRouting({
+        detectedFormat: format,
+        specializedPipeline: format,
+        usedStructuredFallback: true,
+        strategy: structuredFb
+      })
+    };
+  }
+  if ((strategy === failed || strategy === unavailable) && r.markdown.trim() === "" && fb) {
+    w = mergeWarnings(w, fb.warnings, [traceCognipeerFileExtractLayerFallback(format)]);
+    return {
+      markdown: convertStructuredToMarkdown(fb, smOpts),
+      warnings: w,
+      strategy: structuredFb,
+      routing: buildRouting({
+        detectedFormat: format,
+        specializedPipeline: format,
+        usedStructuredFallback: true,
+        strategy: structuredFb
+      })
+    };
+  }
+  if (strategy === structuredFb) {
+    w = mergeWarnings(w, [traceCognipeerFileStructuredAfterCognipeer(format)]);
+  }
+  if ((strategy === failed || strategy === unavailable) && r.markdown.trim() === "" && !fb) {
+    w = mergeWarnings(w, [traceCognipeerFileSpecializedDeadEnd(format)]);
+  }
+  return {
+    markdown: r.markdown,
+    warnings: w,
+    strategy,
+    routing: buildRouting({
+      detectedFormat: format,
+      specializedPipeline: format,
+      usedStructuredFallback: strategy === structuredFb,
+      strategy
+    })
+  };
+}
 async function extractMarkdown(input, options) {
   const smOpts = pickStructuredMarkdownOptions(options);
   const fb = options?.structuredFallback;
@@ -1323,22 +1992,24 @@ async function extractMarkdown(input, options) {
           markdown: convertStructuredToMarkdown(fb, smOpts),
           warnings: mergeWarnings(warnings, fb.warnings, [traceUsedStructuredFallback("path-requires-node")]),
           strategy: "path-requires-node",
-          routing: {
+          routing: buildRouting({
             detectedFormat: "unknown",
             specializedPipeline: "none",
-            usedStructuredFallback: true
-          }
+            usedStructuredFallback: true,
+            strategy: "path-requires-node"
+          })
         };
       }
       return {
         markdown: "",
         warnings,
         strategy: "path-requires-node",
-        routing: {
+        routing: buildRouting({
           detectedFormat: "unknown",
           specializedPipeline: "none",
-          usedStructuredFallback: false
-        }
+          usedStructuredFallback: false,
+          strategy: "path-requires-node"
+        })
       };
     }
     const { readFile } = await importEsm(
@@ -1360,10 +2031,26 @@ async function extractMarkdown(input, options) {
       return {
         markdown: convertStructuredToMarkdown(fb, smOpts),
         warnings: mergeWarnings(warnings, fb.warnings, [traceUsedStructuredFallback("invalid-input-shape")]),
-        strategy: "binary-unidentified-structured-fallback"
+        strategy: "binary-unidentified-structured-fallback",
+        routing: buildRouting({
+          detectedFormat: "unknown",
+          specializedPipeline: "none",
+          usedStructuredFallback: true,
+          strategy: "binary-unidentified-structured-fallback"
+        })
       };
     }
-    return { markdown: "", warnings, strategy: "binary-unidentified" };
+    return {
+      markdown: "",
+      warnings,
+      strategy: "binary-unidentified",
+      routing: buildRouting({
+        detectedFormat: "unknown",
+        specializedPipeline: "none",
+        usedStructuredFallback: false,
+        strategy: "binary-unidentified"
+      })
+    };
   }
   const fmt = detectBinaryFormat(data, filename, mimeType);
   if (fmt === "docx") {
@@ -1376,22 +2063,24 @@ async function extractMarkdown(input, options) {
           markdown: convertStructuredToMarkdown(fb, smOpts),
           warnings: mergeWarnings(warnings, fb.warnings, [traceUsedStructuredFallback("docx-requires-node")]),
           strategy: "docx-requires-node",
-          routing: {
+          routing: buildRouting({
             detectedFormat: "docx",
             specializedPipeline: "none",
-            usedStructuredFallback: true
-          }
+            usedStructuredFallback: true,
+            strategy: "docx-requires-node"
+          })
         };
       }
       return {
         markdown: "",
         warnings,
         strategy: "docx-requires-node",
-        routing: {
+        routing: buildRouting({
           detectedFormat: "docx",
           specializedPipeline: "none",
-          usedStructuredFallback: false
-        }
+          usedStructuredFallback: false,
+          strategy: "docx-requires-node"
+        })
       };
     }
     const r = await convertDocxToMarkdown(data, buildDocxOptions(options));
@@ -1405,11 +2094,12 @@ async function extractMarkdown(input, options) {
       markdown: r.markdown,
       warnings: w,
       strategy,
-      routing: {
+      routing: buildRouting({
         detectedFormat: "docx",
         specializedPipeline: "docx",
-        usedStructuredFallback: strategy === "docx-structured-fallback"
-      }
+        usedStructuredFallback: strategy === "docx-structured-fallback",
+        strategy
+      })
     };
   }
   if (fmt === "pdf") {
@@ -1424,11 +2114,26 @@ async function extractMarkdown(input, options) {
         markdown: convertStructuredToMarkdown(fb, smOpts),
         warnings: w,
         strategy: "pdf-structured-fallback",
-        routing: {
+        routing: buildRouting({
           detectedFormat: "pdf",
           specializedPipeline: "pdf",
-          usedStructuredFallback: true
-        }
+          usedStructuredFallback: true,
+          strategy: "pdf-structured-fallback"
+        })
+      };
+    }
+    if ((strategy === "pdf-cognipeer-failed" || strategy === "pdf-cognipeer-unavailable") && r.markdown.trim() === "" && fb) {
+      w = mergeWarnings(w, fb.warnings, [tracePdfStructuredExtractLayerFallback()]);
+      return {
+        markdown: convertStructuredToMarkdown(fb, smOpts),
+        warnings: w,
+        strategy: "pdf-structured-fallback",
+        routing: buildRouting({
+          detectedFormat: "pdf",
+          specializedPipeline: "pdf",
+          usedStructuredFallback: true,
+          strategy: "pdf-structured-fallback"
+        })
       };
     }
     if (strategy === "pdf-structured-fallback") {
@@ -1441,40 +2146,68 @@ async function extractMarkdown(input, options) {
       markdown: r.markdown,
       warnings: w,
       strategy,
-      routing: {
+      routing: buildRouting({
         detectedFormat: "pdf",
         specializedPipeline: "pdf",
-        usedStructuredFallback: strategy === "pdf-structured-fallback"
-      }
+        usedStructuredFallback: strategy === "pdf-structured-fallback",
+        strategy
+      })
     };
   }
+  if (fmt === "html") {
+    return extractCognipeerFileMarkdownBranch("html", data, options, warnings, smOpts, fb);
+  }
+  if (fmt === "csv") {
+    return extractCognipeerFileMarkdownBranch("csv", data, options, warnings, smOpts, fb);
+  }
+  if (fmt === "spreadsheet") {
+    return extractCognipeerFileMarkdownBranch("spreadsheet", data, options, warnings, smOpts, fb);
+  }
+  const mediaHint = inferMediaHint(mimeType, filename);
   warnings.push(
-    "@dragon708/docmind-markdown: Unidentified binary format (expected PDF magic or ZIP/DOCX). Using structured fallback if provided."
+    "@dragon708/docmind-markdown: Unidentified binary format (expected PDF, OOXML Word/Excel, HTML, CSV, or related MIME/filename hints). Using structured fallback if provided."
   );
+  if (mediaHint === "image") {
+    warnings.push(
+      `${EXTRACT_WARN} image hint (MIME/filename): raw images are not converted by specialized file routes; pass a StructuredDocumentResult (e.g. after OCR) or structuredFallback.`
+    );
+  } else if (mediaHint === "text") {
+    warnings.push(
+      `${EXTRACT_WARN} text hint (MIME/filename): plain text / JSON bytes are not auto-routed to Markdown here; pass StructuredDocumentResult or structuredFallback for normalized text/OCR pipelines.`
+    );
+  } else if (mediaHint === "audio" || mediaHint === "video") {
+    warnings.push(
+      `${EXTRACT_WARN} ${mediaHint} hint (MIME): no specialized ${mediaHint}\u2192Markdown route in extractMarkdown; use StructuredDocumentResult or structuredFallback.`
+    );
+  }
   if (fb) {
     return {
       markdown: convertStructuredToMarkdown(fb, smOpts),
       warnings: mergeWarnings(warnings, fb.warnings, [traceUsedStructuredFallback("binary-unidentified")]),
       strategy: "binary-unidentified-structured-fallback",
-      routing: {
+      routing: buildRouting({
         detectedFormat: "unknown",
         specializedPipeline: "none",
-        usedStructuredFallback: true
-      }
+        usedStructuredFallback: true,
+        strategy: "binary-unidentified-structured-fallback",
+        mediaHint
+      })
     };
   }
   return {
     markdown: "",
     warnings,
     strategy: "binary-unidentified",
-    routing: {
+    routing: buildRouting({
       detectedFormat: "unknown",
       specializedPipeline: "none",
-      usedStructuredFallback: false
-    }
+      usedStructuredFallback: false,
+      strategy: "binary-unidentified",
+      mediaHint
+    })
   };
 }
-export { convertDocxBufferToMarkdown, convertDocxToMarkdown, convertPdfBufferToMarkdown, convertPdfPathToMarkdown, convertPdfToMarkdown, convertStructuredToLlmText, convertStructuredToMarkdown, detectBinaryFormat, extractLlmContent, extractMarkdown, extractStructuredChunks, isExtractMarkdownFileInput, renderLlmText, renderMarkdown, renderMarkdownSections, splitStructuredIntoChunks, structuredDocumentToLlmText, structuredDocumentToMarkdown };
+export { convertCsvToMarkdown, convertDocxBufferToMarkdown, convertDocxToMarkdown, convertHtmlToMarkdown, convertPdfBufferToMarkdown, convertPdfPathToMarkdown, convertPdfToMarkdown, convertSpreadsheetToMarkdown, convertStructuredToLlmText, convertStructuredToMarkdown, detectBinaryFormat, extractLlmContent, extractMarkdown, extractStructuredChunks, isExtractMarkdownFileInput, looksLikeHtmlString, renderLlmText, renderMarkdown, renderMarkdownSections, splitStructuredIntoChunks, structuredDocumentToLlmText, structuredDocumentToMarkdown };
 //# sourceMappingURL=index.js.map
 //# sourceMappingURL=index.js.map