@dragon708/docmind-markdown 1.1.3 → 1.2.0
This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
- package/dist/index.d.ts +10 -2
- package/dist/index.js +51 -14
- package/package.json +1 -1
package/dist/index.d.ts
CHANGED
|
@@ -366,7 +366,11 @@ type ConvertPdfToMarkdownOptions = OpenDataLoaderPdfConvertOptions & {
|
|
|
366
366
|
/** Input for {@link convertPdfToMarkdown}: filesystem path (Node) or PDF bytes. */
|
|
367
367
|
type PdfToMarkdownInput = string | Buffer | Uint8Array | ArrayBuffer;
|
|
368
368
|
/** Which pipeline produced {@link ConvertPdfToMarkdownResult.markdown}. */
|
|
369
|
-
type PdfToMarkdownSource = "opendataloader" | "structured-fallback" | "unsupported-runtime"
|
|
369
|
+
type PdfToMarkdownSource = "opendataloader" | "structured-fallback" | "unsupported-runtime"
|
|
370
|
+
/** `@opendataloader/pdf` missing or failed to load (install dep; check bundler externals). */
|
|
371
|
+
| "opendataloader-unavailable"
|
|
372
|
+
/** Engine ran or was attempted but produced no usable Markdown (Java error, empty stdout, missing file, etc.). */
|
|
373
|
+
| "opendataloader-failed";
|
|
370
374
|
type PdfToMarkdownFallbackReason = "unsupported-runtime" | "error" | "empty" | "module-not-found";
|
|
371
375
|
interface ConvertPdfToMarkdownResult {
|
|
372
376
|
readonly markdown: string;
|
|
@@ -434,7 +438,11 @@ type ExtractMarkdownOptions = ConvertStructuredToMarkdownOptions & {
|
|
|
434
438
|
readonly pdf?: ConvertPdfToMarkdownOptions;
|
|
435
439
|
};
|
|
436
440
|
/** Which branch produced {@link ExtractMarkdownResult.markdown}. */
|
|
437
|
-
type ExtractMarkdownStrategy = "structured" | "docx-mammoth" | "docx-structured-fallback" | "pdf-opendataloader"
|
|
441
|
+
type ExtractMarkdownStrategy = "structured" | "docx-mammoth" | "docx-structured-fallback" | "pdf-opendataloader"
|
|
442
|
+
/** `@opendataloader/pdf` not loadable (missing package, bundler, etc.). */
|
|
443
|
+
| "pdf-opendataloader-unavailable"
|
|
444
|
+
/** Engine failed or returned empty Markdown; no structured fallback or fallback also failed. */
|
|
445
|
+
| "pdf-opendataloader-failed" | "pdf-structured-fallback" | "pdf-unsupported-runtime" | "docx-requires-node" | "path-requires-node" | "binary-unidentified" | "binary-unidentified-structured-fallback";
|
|
438
446
|
interface ExtractMarkdownResult {
|
|
439
447
|
readonly markdown: string;
|
|
440
448
|
/** Merged pipeline warnings (conversion, runtime, and optional {@link StructuredDocumentResult.warnings}). */
|
package/dist/index.js
CHANGED
|
@@ -999,6 +999,23 @@ function normalizePdfMarkdown(markdown, clean) {
|
|
|
999
999
|
if (!clean) return t;
|
|
1000
1000
|
return t.replace(/\n{3,}/g, "\n\n");
|
|
1001
1001
|
}
|
|
1002
|
+
function normalizeConvertStdout(out) {
|
|
1003
|
+
if (typeof out === "string") return out;
|
|
1004
|
+
if (typeof Buffer !== "undefined" && Buffer.isBuffer(out)) {
|
|
1005
|
+
return out.toString("utf8");
|
|
1006
|
+
}
|
|
1007
|
+
if (out instanceof Uint8Array) {
|
|
1008
|
+
return typeof Buffer !== "undefined" ? Buffer.from(out).toString("utf8") : new TextDecoder("utf8", { fatal: false }).decode(out);
|
|
1009
|
+
}
|
|
1010
|
+
if (out == null) return "";
|
|
1011
|
+
return String(out);
|
|
1012
|
+
}
|
|
1013
|
+
var FALLBACK_WARN_PREFIX = "[docmind-markdown:pdf] structured-fallback: serializing StructuredDocumentResult to Markdown because";
|
|
1014
|
+
function structuredFallbackWarnings(reason, detail) {
|
|
1015
|
+
const tail = reason === "module-not-found" ? "@opendataloader/pdf could not be loaded." : reason === "error" ? "the specialized PDF engine raised an error or rejected the input." : reason === "empty" ? "the specialized engine returned empty Markdown." : "the specialized path is unavailable in this runtime.";
|
|
1016
|
+
const extra = detail ? ` (${detail})` : "";
|
|
1017
|
+
return [`${FALLBACK_WARN_PREFIX} ${tail}${extra}`];
|
|
1018
|
+
}
|
|
1002
1019
|
function engineOptions(options) {
|
|
1003
1020
|
if (!options) return {};
|
|
1004
1021
|
const {
|
|
@@ -1065,7 +1082,10 @@ async function convertPdfToMarkdown(input, options) {
|
|
|
1065
1082
|
);
|
|
1066
1083
|
return {
|
|
1067
1084
|
markdown: md,
|
|
1068
|
-
warnings
|
|
1085
|
+
warnings: [
|
|
1086
|
+
...structuredFallbackWarnings("module-not-found"),
|
|
1087
|
+
...warnings
|
|
1088
|
+
],
|
|
1069
1089
|
source: "structured-fallback",
|
|
1070
1090
|
fallbackReason: "module-not-found"
|
|
1071
1091
|
};
|
|
@@ -1078,20 +1098,22 @@ async function convertPdfToMarkdown(input, options) {
|
|
|
1078
1098
|
return {
|
|
1079
1099
|
markdown: "",
|
|
1080
1100
|
warnings,
|
|
1081
|
-
source: "opendataloader",
|
|
1101
|
+
source: "opendataloader-unavailable",
|
|
1082
1102
|
fallbackReason: "module-not-found"
|
|
1083
1103
|
};
|
|
1084
1104
|
}
|
|
1085
1105
|
let rawMarkdown;
|
|
1086
1106
|
try {
|
|
1087
|
-
|
|
1107
|
+
const out = await convert(inputPath, {
|
|
1088
1108
|
...eng,
|
|
1089
1109
|
format: "markdown",
|
|
1090
1110
|
toStdout: true,
|
|
1091
1111
|
quiet: eng.quiet !== false
|
|
1092
|
-
})
|
|
1112
|
+
});
|
|
1113
|
+
rawMarkdown = normalizeConvertStdout(out);
|
|
1093
1114
|
} catch (e) {
|
|
1094
|
-
|
|
1115
|
+
const msg = e instanceof Error ? e.message : String(e);
|
|
1116
|
+
warnings.push(`PDF conversion failed (@opendataloader/pdf): ${msg}`);
|
|
1095
1117
|
if (resolveStructured) {
|
|
1096
1118
|
try {
|
|
1097
1119
|
const structured = await resolveStructured();
|
|
@@ -1101,7 +1123,10 @@ async function convertPdfToMarkdown(input, options) {
|
|
|
1101
1123
|
);
|
|
1102
1124
|
return {
|
|
1103
1125
|
markdown: md,
|
|
1104
|
-
warnings
|
|
1126
|
+
warnings: [
|
|
1127
|
+
...structuredFallbackWarnings("error", msg.slice(0, 500)),
|
|
1128
|
+
...warnings
|
|
1129
|
+
],
|
|
1105
1130
|
source: "structured-fallback",
|
|
1106
1131
|
fallbackReason: "error"
|
|
1107
1132
|
};
|
|
@@ -1114,13 +1139,15 @@ async function convertPdfToMarkdown(input, options) {
|
|
|
1114
1139
|
return {
|
|
1115
1140
|
markdown: "",
|
|
1116
1141
|
warnings,
|
|
1117
|
-
source: "opendataloader",
|
|
1142
|
+
source: "opendataloader-failed",
|
|
1118
1143
|
fallbackReason: "error"
|
|
1119
1144
|
};
|
|
1120
1145
|
}
|
|
1121
1146
|
let markdown = normalizePdfMarkdown(rawMarkdown, clean);
|
|
1122
1147
|
if (markdown.length === 0) {
|
|
1123
|
-
warnings.push(
|
|
1148
|
+
warnings.push(
|
|
1149
|
+
"OpenDataLoader returned empty Markdown for this PDF (stdout empty or whitespace-only after normalize)."
|
|
1150
|
+
);
|
|
1124
1151
|
if (resolveStructured) {
|
|
1125
1152
|
try {
|
|
1126
1153
|
const structured = await resolveStructured();
|
|
@@ -1130,7 +1157,7 @@ async function convertPdfToMarkdown(input, options) {
|
|
|
1130
1157
|
);
|
|
1131
1158
|
return {
|
|
1132
1159
|
markdown,
|
|
1133
|
-
warnings,
|
|
1160
|
+
warnings: [...structuredFallbackWarnings("empty"), ...warnings],
|
|
1134
1161
|
source: "structured-fallback",
|
|
1135
1162
|
fallbackReason: "empty"
|
|
1136
1163
|
};
|
|
@@ -1143,7 +1170,7 @@ async function convertPdfToMarkdown(input, options) {
|
|
|
1143
1170
|
return {
|
|
1144
1171
|
markdown: "",
|
|
1145
1172
|
warnings,
|
|
1146
|
-
source: "opendataloader",
|
|
1173
|
+
source: "opendataloader-failed",
|
|
1147
1174
|
fallbackReason: "empty"
|
|
1148
1175
|
};
|
|
1149
1176
|
}
|
|
@@ -1241,9 +1268,19 @@ function docxStrategyFromSource(source) {
|
|
|
1241
1268
|
return source === "structured-fallback" ? "docx-structured-fallback" : "docx-mammoth";
|
|
1242
1269
|
}
|
|
1243
1270
|
function pdfStrategyFromResult(r) {
|
|
1244
|
-
|
|
1245
|
-
|
|
1246
|
-
|
|
1271
|
+
switch (r.source) {
|
|
1272
|
+
case "structured-fallback":
|
|
1273
|
+
return "pdf-structured-fallback";
|
|
1274
|
+
case "unsupported-runtime":
|
|
1275
|
+
return "pdf-unsupported-runtime";
|
|
1276
|
+
case "opendataloader-unavailable":
|
|
1277
|
+
return "pdf-opendataloader-unavailable";
|
|
1278
|
+
case "opendataloader-failed":
|
|
1279
|
+
return "pdf-opendataloader-failed";
|
|
1280
|
+
case "opendataloader":
|
|
1281
|
+
default:
|
|
1282
|
+
return "pdf-opendataloader";
|
|
1283
|
+
}
|
|
1247
1284
|
}
|
|
1248
1285
|
function mergeWarnings(base, ...more) {
|
|
1249
1286
|
const out = [...base];
|
|
@@ -1342,7 +1379,7 @@ async function extractMarkdown(input, options) {
|
|
|
1342
1379
|
return {
|
|
1343
1380
|
markdown: convertStructuredToMarkdown(fb, smOpts),
|
|
1344
1381
|
warnings: mergeWarnings(w, fb.warnings, [
|
|
1345
|
-
"extractMarkdown: PDF
|
|
1382
|
+
"extractMarkdown: PDF specialized pipeline is unavailable in this runtime; used structuredFallback (same envelope as extractStructuredData)."
|
|
1346
1383
|
]),
|
|
1347
1384
|
strategy: "pdf-structured-fallback"
|
|
1348
1385
|
};
|