@dragon708/docmind-markdown 1.1.2 → 1.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.d.ts CHANGED
@@ -304,7 +304,7 @@ interface ConvertDocxToMarkdownResult {
304
304
  /**
305
305
  * **Node only.** Primary API: `.docx` bytes → semantic HTML (Mammoth) → LLM-friendly Markdown (Turndown + optional GFM).
306
306
  *
307
- * Optional peers: `mammoth`, `turndown`. Runtime dependency: `turndown-plugin-gfm` (declared on this package) when
307
+ * Dependencies: `mammoth`, `turndown`, and `turndown-plugin-gfm` (all declared on this package). GFM is loaded when
308
308
  * {@link ConvertDocxToMarkdownOptions.includeTables} is true.
309
309
  *
310
310
  * @see {@link convertDocxBufferToMarkdown} for a thin wrapper that only returns `markdown` and `messages`.
@@ -366,7 +366,11 @@ type ConvertPdfToMarkdownOptions = OpenDataLoaderPdfConvertOptions & {
366
366
  /** Input for {@link convertPdfToMarkdown}: filesystem path (Node) or PDF bytes. */
367
367
  type PdfToMarkdownInput = string | Buffer | Uint8Array | ArrayBuffer;
368
368
  /** Which pipeline produced {@link ConvertPdfToMarkdownResult.markdown}. */
369
- type PdfToMarkdownSource = "opendataloader" | "structured-fallback" | "unsupported-runtime";
369
+ type PdfToMarkdownSource = "opendataloader" | "structured-fallback" | "unsupported-runtime"
370
+ /** `@opendataloader/pdf` missing or failed to load (install dep; check bundler externals). */
371
+ | "opendataloader-unavailable"
372
+ /** Engine ran or was attempted but produced no usable Markdown (Java error, empty stdout, missing file, etc.). */
373
+ | "opendataloader-failed";
370
374
  type PdfToMarkdownFallbackReason = "unsupported-runtime" | "error" | "empty" | "module-not-found";
371
375
  interface ConvertPdfToMarkdownResult {
372
376
  readonly markdown: string;
@@ -434,7 +438,11 @@ type ExtractMarkdownOptions = ConvertStructuredToMarkdownOptions & {
434
438
  readonly pdf?: ConvertPdfToMarkdownOptions;
435
439
  };
436
440
  /** Which branch produced {@link ExtractMarkdownResult.markdown}. */
437
- type ExtractMarkdownStrategy = "structured" | "docx-mammoth" | "docx-structured-fallback" | "pdf-opendataloader" | "pdf-structured-fallback" | "pdf-unsupported-runtime" | "docx-requires-node" | "path-requires-node" | "binary-unidentified" | "binary-unidentified-structured-fallback";
441
+ type ExtractMarkdownStrategy = "structured" | "docx-mammoth" | "docx-structured-fallback" | "pdf-opendataloader"
442
+ /** `@opendataloader/pdf` not loadable (missing package, bundler, etc.). */
443
+ | "pdf-opendataloader-unavailable"
444
+ /** Engine failed or returned empty Markdown; no structured fallback or fallback also failed. */
445
+ | "pdf-opendataloader-failed" | "pdf-structured-fallback" | "pdf-unsupported-runtime" | "docx-requires-node" | "path-requires-node" | "binary-unidentified" | "binary-unidentified-structured-fallback";
438
446
  interface ExtractMarkdownResult {
439
447
  readonly markdown: string;
440
448
  /** Merged pipeline warnings (conversion, runtime, and optional {@link StructuredDocumentResult.warnings}). */
package/dist/index.js CHANGED
@@ -999,6 +999,23 @@ function normalizePdfMarkdown(markdown, clean) {
999
999
  if (!clean) return t;
1000
1000
  return t.replace(/\n{3,}/g, "\n\n");
1001
1001
  }
1002
+ function normalizeConvertStdout(out) {
1003
+ if (typeof out === "string") return out;
1004
+ if (typeof Buffer !== "undefined" && Buffer.isBuffer(out)) {
1005
+ return out.toString("utf8");
1006
+ }
1007
+ if (out instanceof Uint8Array) {
1008
+ return typeof Buffer !== "undefined" ? Buffer.from(out).toString("utf8") : new TextDecoder("utf8", { fatal: false }).decode(out);
1009
+ }
1010
+ if (out == null) return "";
1011
+ return String(out);
1012
+ }
1013
+ var FALLBACK_WARN_PREFIX = "[docmind-markdown:pdf] structured-fallback: serializing StructuredDocumentResult to Markdown because";
1014
+ function structuredFallbackWarnings(reason, detail) {
1015
+ const tail = reason === "module-not-found" ? "@opendataloader/pdf could not be loaded." : reason === "error" ? "the specialized PDF engine raised an error or rejected the input." : reason === "empty" ? "the specialized engine returned empty Markdown." : "the specialized path is unavailable in this runtime.";
1016
+ const extra = detail ? ` (${detail})` : "";
1017
+ return [`${FALLBACK_WARN_PREFIX} ${tail}${extra}`];
1018
+ }
1002
1019
  function engineOptions(options) {
1003
1020
  if (!options) return {};
1004
1021
  const {
@@ -1065,7 +1082,10 @@ async function convertPdfToMarkdown(input, options) {
1065
1082
  );
1066
1083
  return {
1067
1084
  markdown: md,
1068
- warnings,
1085
+ warnings: [
1086
+ ...structuredFallbackWarnings("module-not-found"),
1087
+ ...warnings
1088
+ ],
1069
1089
  source: "structured-fallback",
1070
1090
  fallbackReason: "module-not-found"
1071
1091
  };
@@ -1078,20 +1098,22 @@ async function convertPdfToMarkdown(input, options) {
1078
1098
  return {
1079
1099
  markdown: "",
1080
1100
  warnings,
1081
- source: "opendataloader",
1101
+ source: "opendataloader-unavailable",
1082
1102
  fallbackReason: "module-not-found"
1083
1103
  };
1084
1104
  }
1085
1105
  let rawMarkdown;
1086
1106
  try {
1087
- rawMarkdown = await convert(inputPath, {
1107
+ const out = await convert(inputPath, {
1088
1108
  ...eng,
1089
1109
  format: "markdown",
1090
1110
  toStdout: true,
1091
1111
  quiet: eng.quiet !== false
1092
- }).then((s) => String(s));
1112
+ });
1113
+ rawMarkdown = normalizeConvertStdout(out);
1093
1114
  } catch (e) {
1094
- warnings.push(`PDF conversion failed: ${e instanceof Error ? e.message : String(e)}`);
1115
+ const msg = e instanceof Error ? e.message : String(e);
1116
+ warnings.push(`PDF conversion failed (@opendataloader/pdf): ${msg}`);
1095
1117
  if (resolveStructured) {
1096
1118
  try {
1097
1119
  const structured = await resolveStructured();
@@ -1101,7 +1123,10 @@ async function convertPdfToMarkdown(input, options) {
1101
1123
  );
1102
1124
  return {
1103
1125
  markdown: md,
1104
- warnings,
1126
+ warnings: [
1127
+ ...structuredFallbackWarnings("error", msg.slice(0, 500)),
1128
+ ...warnings
1129
+ ],
1105
1130
  source: "structured-fallback",
1106
1131
  fallbackReason: "error"
1107
1132
  };
@@ -1114,13 +1139,15 @@ async function convertPdfToMarkdown(input, options) {
1114
1139
  return {
1115
1140
  markdown: "",
1116
1141
  warnings,
1117
- source: "opendataloader",
1142
+ source: "opendataloader-failed",
1118
1143
  fallbackReason: "error"
1119
1144
  };
1120
1145
  }
1121
1146
  let markdown = normalizePdfMarkdown(rawMarkdown, clean);
1122
1147
  if (markdown.length === 0) {
1123
- warnings.push("OpenDataLoader returned empty Markdown for this PDF.");
1148
+ warnings.push(
1149
+ "OpenDataLoader returned empty Markdown for this PDF (stdout empty or whitespace-only after normalize)."
1150
+ );
1124
1151
  if (resolveStructured) {
1125
1152
  try {
1126
1153
  const structured = await resolveStructured();
@@ -1130,7 +1157,7 @@ async function convertPdfToMarkdown(input, options) {
1130
1157
  );
1131
1158
  return {
1132
1159
  markdown,
1133
- warnings,
1160
+ warnings: [...structuredFallbackWarnings("empty"), ...warnings],
1134
1161
  source: "structured-fallback",
1135
1162
  fallbackReason: "empty"
1136
1163
  };
@@ -1143,7 +1170,7 @@ async function convertPdfToMarkdown(input, options) {
1143
1170
  return {
1144
1171
  markdown: "",
1145
1172
  warnings,
1146
- source: "opendataloader",
1173
+ source: "opendataloader-failed",
1147
1174
  fallbackReason: "empty"
1148
1175
  };
1149
1176
  }
@@ -1241,9 +1268,19 @@ function docxStrategyFromSource(source) {
1241
1268
  return source === "structured-fallback" ? "docx-structured-fallback" : "docx-mammoth";
1242
1269
  }
1243
1270
  function pdfStrategyFromResult(r) {
1244
- if (r.source === "structured-fallback") return "pdf-structured-fallback";
1245
- if (r.source === "unsupported-runtime") return "pdf-unsupported-runtime";
1246
- return "pdf-opendataloader";
1271
+ switch (r.source) {
1272
+ case "structured-fallback":
1273
+ return "pdf-structured-fallback";
1274
+ case "unsupported-runtime":
1275
+ return "pdf-unsupported-runtime";
1276
+ case "opendataloader-unavailable":
1277
+ return "pdf-opendataloader-unavailable";
1278
+ case "opendataloader-failed":
1279
+ return "pdf-opendataloader-failed";
1280
+ case "opendataloader":
1281
+ default:
1282
+ return "pdf-opendataloader";
1283
+ }
1247
1284
  }
1248
1285
  function mergeWarnings(base, ...more) {
1249
1286
  const out = [...base];
@@ -1342,7 +1379,7 @@ async function extractMarkdown(input, options) {
1342
1379
  return {
1343
1380
  markdown: convertStructuredToMarkdown(fb, smOpts),
1344
1381
  warnings: mergeWarnings(w, fb.warnings, [
1345
- "extractMarkdown: PDF route unavailable in this runtime; used structuredFallback."
1382
+ "extractMarkdown: PDF specialized pipeline is unavailable in this runtime; used structuredFallback (same envelope as extractStructuredData)."
1346
1383
  ]),
1347
1384
  strategy: "pdf-structured-fallback"
1348
1385
  };
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@dragon708/docmind-markdown",
3
- "version": "1.1.2",
3
+ "version": "1.2.0",
4
4
  "description": "StructuredDocumentResult → Markdown and LLM-oriented plain text for DocMind.",
5
5
  "type": "module",
6
6
  "sideEffects": false,
@@ -37,26 +37,14 @@
37
37
  "dependencies": {
38
38
  "@dragon708/docmind-shared": "^1.2.0",
39
39
  "@opendataloader/pdf": "^2.2.1",
40
- "turndown-plugin-gfm": "^1.0.2"
41
- },
42
- "peerDependencies": {
43
40
  "mammoth": "^1.6.0",
44
- "turndown": "^7.0.0"
45
- },
46
- "peerDependenciesMeta": {
47
- "mammoth": {
48
- "optional": true
49
- },
50
- "turndown": {
51
- "optional": true
52
- }
41
+ "turndown": "^7.0.0",
42
+ "turndown-plugin-gfm": "^1.0.2"
53
43
  },
54
44
  "devDependencies": {
55
45
  "@types/node": "^20.19.37",
56
46
  "jszip": "^3.10.1",
57
- "mammoth": "^1.12.0",
58
47
  "tsup": "^8.5.1",
59
- "turndown": "^7.2.4",
60
48
  "typescript": "^5.9.3",
61
49
  "vitest": "^1.6.1"
62
50
  }