@dragon708/docmind-markdown 1.2.6 → 1.2.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.d.ts +139 -12
- package/dist/index.js +781 -48
- package/node_modules/turndown-plugin-gfm/LICENSE +21 -0
- package/node_modules/turndown-plugin-gfm/README.md +50 -0
- package/node_modules/turndown-plugin-gfm/dist/turndown-plugin-gfm.js +165 -0
- package/node_modules/turndown-plugin-gfm/lib/turndown-plugin-gfm.browser.cjs.js +162 -0
- package/node_modules/turndown-plugin-gfm/lib/turndown-plugin-gfm.browser.es.js +154 -0
- package/node_modules/turndown-plugin-gfm/lib/turndown-plugin-gfm.cjs.js +162 -0
- package/node_modules/turndown-plugin-gfm/lib/turndown-plugin-gfm.es.js +154 -0
- package/node_modules/turndown-plugin-gfm/package.json +43 -0
- package/package.json +5 -1
package/dist/index.js
CHANGED
|
@@ -992,6 +992,20 @@ async function convertDocxBufferToMarkdown(input, options) {
|
|
|
992
992
|
return { markdown: r.markdown, messages: r.messages };
|
|
993
993
|
}
|
|
994
994
|
|
|
995
|
+
// src/cognipeer-runtime.ts
|
|
996
|
+
async function loadCognipeerConvertToMarkdown() {
|
|
997
|
+
const { createRequire } = await importEsm("node:module");
|
|
998
|
+
const require2 = createRequire(import.meta.url);
|
|
999
|
+
const mod = require2("@cognipeer/to-markdown");
|
|
1000
|
+
return mod.convertToMarkdown;
|
|
1001
|
+
}
|
|
1002
|
+
async function toNodeBuffer2(input) {
|
|
1003
|
+
const { Buffer: Buffer2 } = await importEsm("node:buffer");
|
|
1004
|
+
if (Buffer2.isBuffer(input)) return input;
|
|
1005
|
+
if (input instanceof ArrayBuffer) return Buffer2.from(input);
|
|
1006
|
+
return Buffer2.from(input);
|
|
1007
|
+
}
|
|
1008
|
+
|
|
995
1009
|
// src/pdf-markdown.ts
|
|
996
1010
|
var BROWSER_WARNING = "@dragon708/docmind-markdown: PDF \u2192 Markdown via @cognipeer/to-markdown requires Node.js. In the browser, use a server-side conversion or supply structured text/Markdown from your backend.";
|
|
997
1011
|
var COGNIPEER_WARN_TAG = "[docmind-markdown:pdf] pdf-cognipeer-specialized:";
|
|
@@ -1015,23 +1029,11 @@ function cognipeerConverterOptions(options) {
|
|
|
1015
1029
|
if (url !== void 0) o.url = url;
|
|
1016
1030
|
return o;
|
|
1017
1031
|
}
|
|
1018
|
-
async function toNodeBuffer2(input) {
|
|
1019
|
-
const { Buffer: Buffer2 } = await importEsm("node:buffer");
|
|
1020
|
-
if (Buffer2.isBuffer(input)) return input;
|
|
1021
|
-
if (input instanceof ArrayBuffer) return Buffer2.from(input);
|
|
1022
|
-
return Buffer2.from(input);
|
|
1023
|
-
}
|
|
1024
|
-
async function loadCognipeerConvertToMarkdown() {
|
|
1025
|
-
const { createRequire } = await importEsm("node:module");
|
|
1026
|
-
const require2 = createRequire(import.meta.url);
|
|
1027
|
-
const mod = require2("@cognipeer/to-markdown");
|
|
1028
|
-
return mod.convertToMarkdown;
|
|
1029
|
-
}
|
|
1030
1032
|
async function convertPdfToMarkdown(input, options) {
|
|
1031
1033
|
const clean = options?.cleanMarkdown !== false;
|
|
1032
1034
|
const resolveStructured = options?.resolveStructured;
|
|
1033
1035
|
const structuredMdOpts = options?.structuredMarkdown;
|
|
1034
|
-
const
|
|
1036
|
+
const cognipeerOpts2 = cognipeerConverterOptions(options);
|
|
1035
1037
|
if (!isNodeRuntime()) {
|
|
1036
1038
|
return {
|
|
1037
1039
|
markdown: "",
|
|
@@ -1097,7 +1099,7 @@ async function convertPdfToMarkdown(input, options) {
|
|
|
1097
1099
|
}
|
|
1098
1100
|
let rawMarkdown;
|
|
1099
1101
|
try {
|
|
1100
|
-
rawMarkdown = await convertToMarkdown(inputPath,
|
|
1102
|
+
rawMarkdown = await convertToMarkdown(inputPath, cognipeerOpts2);
|
|
1101
1103
|
} catch (e) {
|
|
1102
1104
|
const msg = e instanceof Error ? e.message : String(e);
|
|
1103
1105
|
warnings.push(`${COGNIPEER_WARN_TAG} ${msg}`);
|
|
@@ -1194,6 +1196,473 @@ async function convertPdfBufferToMarkdown(input, options) {
|
|
|
1194
1196
|
throwIfLegacyFailure(r);
|
|
1195
1197
|
return { markdown: r.markdown };
|
|
1196
1198
|
}
|
|
1199
|
+
|
|
1200
|
+
// src/cognipeer-file-markdown.ts
|
|
1201
|
+
var BROWSER = (label) => `@dragon708/docmind-markdown: ${label} \u2192 Markdown via @cognipeer/to-markdown requires Node.js. In the browser, use a server-side conversion or supply structured input / structuredFallback.`;
|
|
1202
|
+
function cognipeerOpts(options) {
|
|
1203
|
+
if (!options) return {};
|
|
1204
|
+
const { fileName, forceExtension, url } = options;
|
|
1205
|
+
const o = {};
|
|
1206
|
+
if (fileName !== void 0) o.fileName = fileName;
|
|
1207
|
+
if (forceExtension !== void 0) o.forceExtension = forceExtension;
|
|
1208
|
+
if (url !== void 0) o.url = url;
|
|
1209
|
+
return o;
|
|
1210
|
+
}
|
|
1211
|
+
function normalizeMarkdown(markdown, clean) {
|
|
1212
|
+
const t = markdown.trim();
|
|
1213
|
+
if (!clean) return t;
|
|
1214
|
+
return t.replace(/\n{3,}/g, "\n\n");
|
|
1215
|
+
}
|
|
1216
|
+
function structuredFallbackWarnings2(format, reason, detail) {
|
|
1217
|
+
const tag = `[docmind-markdown:${format}] ${format}-structured-fallback:`;
|
|
1218
|
+
const tail = reason === "module-not-found" ? "@cognipeer/to-markdown could not be loaded." : reason === "error" ? "the specialized engine raised an error or rejected the input." : reason === "empty" ? "the specialized engine returned empty Markdown." : "the specialized path is unavailable in this runtime.";
|
|
1219
|
+
const extra = detail ? ` (${detail})` : "";
|
|
1220
|
+
return [`${tag} serializing StructuredDocumentResult to Markdown because ${tail}${extra}`];
|
|
1221
|
+
}
|
|
1222
|
+
function warnTag(format) {
|
|
1223
|
+
return `[docmind-markdown:${format}] ${format}-cognipeer-specialized:`;
|
|
1224
|
+
}
|
|
1225
|
+
async function convertCognipeerFileToMarkdown(format, defaultTempFile, input, options) {
|
|
1226
|
+
const clean = options?.cleanMarkdown !== false;
|
|
1227
|
+
const resolveStructured = options?.resolveStructured;
|
|
1228
|
+
const structuredMdOpts = options?.structuredMarkdown;
|
|
1229
|
+
const browserLabel = format === "html" ? "HTML" : format === "csv" ? "CSV" : "Spreadsheet";
|
|
1230
|
+
if (!isNodeRuntime()) {
|
|
1231
|
+
return {
|
|
1232
|
+
markdown: "",
|
|
1233
|
+
warnings: [BROWSER(browserLabel)],
|
|
1234
|
+
source: "unsupported-runtime",
|
|
1235
|
+
fallbackReason: "unsupported-runtime"
|
|
1236
|
+
};
|
|
1237
|
+
}
|
|
1238
|
+
const warnings = [];
|
|
1239
|
+
let cleanup;
|
|
1240
|
+
try {
|
|
1241
|
+
let inputPath;
|
|
1242
|
+
if (typeof input === "string") {
|
|
1243
|
+
inputPath = input;
|
|
1244
|
+
} else {
|
|
1245
|
+
const [{ mkdtemp, writeFile, rm }, { join }, { tmpdir }, buffer] = await Promise.all([
|
|
1246
|
+
importEsm("node:fs/promises"),
|
|
1247
|
+
importEsm("node:path"),
|
|
1248
|
+
importEsm("node:os"),
|
|
1249
|
+
toNodeBuffer2(input)
|
|
1250
|
+
]);
|
|
1251
|
+
const dir = await mkdtemp(join(tmpdir(), `docmind-markdown-${format}-`));
|
|
1252
|
+
inputPath = join(dir, defaultTempFile);
|
|
1253
|
+
await writeFile(inputPath, buffer);
|
|
1254
|
+
cleanup = async () => rm(dir, { recursive: true, force: true });
|
|
1255
|
+
}
|
|
1256
|
+
let convertToMarkdown;
|
|
1257
|
+
try {
|
|
1258
|
+
convertToMarkdown = await loadCognipeerConvertToMarkdown();
|
|
1259
|
+
} catch (e) {
|
|
1260
|
+
const hint = e instanceof Error && /Cannot find module|MODULE_NOT_FOUND/i.test(e.message) ? " Install `@cognipeer/to-markdown` in your project." : "";
|
|
1261
|
+
warnings.push(
|
|
1262
|
+
`${warnTag(format)} package could not be loaded (${e instanceof Error ? e.message : String(e)}).${hint}`
|
|
1263
|
+
);
|
|
1264
|
+
if (resolveStructured) {
|
|
1265
|
+
try {
|
|
1266
|
+
const structured = await resolveStructured();
|
|
1267
|
+
const md = normalizeMarkdown(
|
|
1268
|
+
convertStructuredToMarkdown(structured, structuredMdOpts),
|
|
1269
|
+
clean
|
|
1270
|
+
);
|
|
1271
|
+
return {
|
|
1272
|
+
markdown: md,
|
|
1273
|
+
warnings: [
|
|
1274
|
+
...structuredFallbackWarnings2(format, "module-not-found"),
|
|
1275
|
+
...warnings
|
|
1276
|
+
],
|
|
1277
|
+
source: "structured-fallback",
|
|
1278
|
+
fallbackReason: "module-not-found"
|
|
1279
|
+
};
|
|
1280
|
+
} catch (e2) {
|
|
1281
|
+
warnings.push(
|
|
1282
|
+
`Structured fallback failed: ${e2 instanceof Error ? e2.message : String(e2)}`
|
|
1283
|
+
);
|
|
1284
|
+
}
|
|
1285
|
+
}
|
|
1286
|
+
return {
|
|
1287
|
+
markdown: "",
|
|
1288
|
+
warnings,
|
|
1289
|
+
source: "cognipeer-unavailable",
|
|
1290
|
+
fallbackReason: "module-not-found"
|
|
1291
|
+
};
|
|
1292
|
+
}
|
|
1293
|
+
let rawMarkdown;
|
|
1294
|
+
try {
|
|
1295
|
+
rawMarkdown = await convertToMarkdown(inputPath, cognipeerOpts(options));
|
|
1296
|
+
} catch (e) {
|
|
1297
|
+
const msg = e instanceof Error ? e.message : String(e);
|
|
1298
|
+
warnings.push(`${warnTag(format)} ${msg}`);
|
|
1299
|
+
if (resolveStructured) {
|
|
1300
|
+
try {
|
|
1301
|
+
const structured = await resolveStructured();
|
|
1302
|
+
const md = normalizeMarkdown(
|
|
1303
|
+
convertStructuredToMarkdown(structured, structuredMdOpts),
|
|
1304
|
+
clean
|
|
1305
|
+
);
|
|
1306
|
+
return {
|
|
1307
|
+
markdown: md,
|
|
1308
|
+
warnings: [
|
|
1309
|
+
...structuredFallbackWarnings2(format, "error", msg.slice(0, 500)),
|
|
1310
|
+
...warnings
|
|
1311
|
+
],
|
|
1312
|
+
source: "structured-fallback",
|
|
1313
|
+
fallbackReason: "error"
|
|
1314
|
+
};
|
|
1315
|
+
} catch (e2) {
|
|
1316
|
+
warnings.push(
|
|
1317
|
+
`Structured fallback failed: ${e2 instanceof Error ? e2.message : String(e2)}`
|
|
1318
|
+
);
|
|
1319
|
+
}
|
|
1320
|
+
}
|
|
1321
|
+
return {
|
|
1322
|
+
markdown: "",
|
|
1323
|
+
warnings,
|
|
1324
|
+
source: "cognipeer-failed",
|
|
1325
|
+
fallbackReason: "error"
|
|
1326
|
+
};
|
|
1327
|
+
}
|
|
1328
|
+
let markdown = normalizeMarkdown(
|
|
1329
|
+
typeof rawMarkdown === "string" ? rawMarkdown : String(rawMarkdown ?? ""),
|
|
1330
|
+
clean
|
|
1331
|
+
);
|
|
1332
|
+
if (markdown.length === 0) {
|
|
1333
|
+
warnings.push(
|
|
1334
|
+
`${warnTag(format)} returned empty Markdown for this input (whitespace-only after normalize).`
|
|
1335
|
+
);
|
|
1336
|
+
if (resolveStructured) {
|
|
1337
|
+
try {
|
|
1338
|
+
const structured = await resolveStructured();
|
|
1339
|
+
markdown = normalizeMarkdown(
|
|
1340
|
+
convertStructuredToMarkdown(structured, structuredMdOpts),
|
|
1341
|
+
clean
|
|
1342
|
+
);
|
|
1343
|
+
return {
|
|
1344
|
+
markdown,
|
|
1345
|
+
warnings: [...structuredFallbackWarnings2(format, "empty"), ...warnings],
|
|
1346
|
+
source: "structured-fallback",
|
|
1347
|
+
fallbackReason: "empty"
|
|
1348
|
+
};
|
|
1349
|
+
} catch (e2) {
|
|
1350
|
+
warnings.push(
|
|
1351
|
+
`Structured fallback failed: ${e2 instanceof Error ? e2.message : String(e2)}`
|
|
1352
|
+
);
|
|
1353
|
+
}
|
|
1354
|
+
}
|
|
1355
|
+
return {
|
|
1356
|
+
markdown: "",
|
|
1357
|
+
warnings,
|
|
1358
|
+
source: "cognipeer-failed",
|
|
1359
|
+
fallbackReason: "empty"
|
|
1360
|
+
};
|
|
1361
|
+
}
|
|
1362
|
+
return { markdown, warnings, source: "cognipeer" };
|
|
1363
|
+
} finally {
|
|
1364
|
+
if (cleanup) {
|
|
1365
|
+
await cleanup().catch(() => {
|
|
1366
|
+
});
|
|
1367
|
+
}
|
|
1368
|
+
}
|
|
1369
|
+
}
|
|
1370
|
+
|
|
1371
|
+
// src/node-is-regular-file.ts
|
|
1372
|
+
async function isExistingRegularFile(path) {
|
|
1373
|
+
try {
|
|
1374
|
+
const { stat } = await importEsm("node:fs/promises");
|
|
1375
|
+
const s = await stat(path);
|
|
1376
|
+
return s.isFile();
|
|
1377
|
+
} catch {
|
|
1378
|
+
return false;
|
|
1379
|
+
}
|
|
1380
|
+
}
|
|
1381
|
+
|
|
1382
|
+
// src/html-markdown.ts
|
|
1383
|
+
function looksLikeHtmlString(s) {
|
|
1384
|
+
const t = s.trimStart();
|
|
1385
|
+
if (t.length === 0) return false;
|
|
1386
|
+
if (/^<!DOCTYPE\s+html/i.test(t)) return true;
|
|
1387
|
+
if (/^<html[\s>]/i.test(t)) return true;
|
|
1388
|
+
if (/^<head[\s>]/i.test(t)) return true;
|
|
1389
|
+
if (/^<!--/.test(t)) return true;
|
|
1390
|
+
const c0 = t[0];
|
|
1391
|
+
const c1 = t[1] ?? "";
|
|
1392
|
+
if (c0 === "<" && /[a-zA-Z!?]/.test(c1)) return true;
|
|
1393
|
+
return false;
|
|
1394
|
+
}
|
|
1395
|
+
async function resolveHtmlStringInput(s, mode) {
|
|
1396
|
+
const { Buffer: Buffer2 } = await importEsm("node:buffer");
|
|
1397
|
+
if (mode === "html") {
|
|
1398
|
+
return { kind: "buffer", buffer: Buffer2.from(s, "utf8") };
|
|
1399
|
+
}
|
|
1400
|
+
if (mode === "path") {
|
|
1401
|
+
return { kind: "path", path: s };
|
|
1402
|
+
}
|
|
1403
|
+
if (isNodeRuntime() && await isExistingRegularFile(s)) {
|
|
1404
|
+
return { kind: "path", path: s };
|
|
1405
|
+
}
|
|
1406
|
+
if (looksLikeHtmlString(s)) {
|
|
1407
|
+
return { kind: "buffer", buffer: Buffer2.from(s, "utf8") };
|
|
1408
|
+
}
|
|
1409
|
+
return { kind: "path", path: s };
|
|
1410
|
+
}
|
|
1411
|
+
async function convertHtmlToMarkdown(input, options) {
|
|
1412
|
+
const mode = options?.inputMode ?? "auto";
|
|
1413
|
+
const { inputMode: _omit, ...cognipeerOptions } = options ?? {};
|
|
1414
|
+
if (typeof input === "string") {
|
|
1415
|
+
if (!isNodeRuntime()) {
|
|
1416
|
+
return convertCognipeerFileToMarkdown("html", "document.html", input, cognipeerOptions);
|
|
1417
|
+
}
|
|
1418
|
+
const resolved = await resolveHtmlStringInput(input, mode);
|
|
1419
|
+
if (resolved.kind === "path") {
|
|
1420
|
+
return convertCognipeerFileToMarkdown("html", "document.html", resolved.path, cognipeerOptions);
|
|
1421
|
+
}
|
|
1422
|
+
return convertCognipeerFileToMarkdown("html", "document.html", resolved.buffer, {
|
|
1423
|
+
...cognipeerOptions,
|
|
1424
|
+
forceExtension: cognipeerOptions.forceExtension ?? ".html",
|
|
1425
|
+
fileName: cognipeerOptions.fileName ?? "document.html"
|
|
1426
|
+
});
|
|
1427
|
+
}
|
|
1428
|
+
return convertCognipeerFileToMarkdown("html", "document.html", input, cognipeerOptions);
|
|
1429
|
+
}
|
|
1430
|
+
|
|
1431
|
+
// src/tabular-markdown-postprocess.ts
|
|
1432
|
+
function compactMarkdownOutput(markdown) {
|
|
1433
|
+
return markdown.replace(/\n{3,}/g, "\n\n").split("\n").map((l) => l.trimEnd()).join("\n").trim();
|
|
1434
|
+
}
|
|
1435
|
+
function countCsvColumns(firstLine) {
|
|
1436
|
+
let n = 1;
|
|
1437
|
+
let inQuotes = false;
|
|
1438
|
+
for (let i = 0; i < firstLine.length; i++) {
|
|
1439
|
+
const c = firstLine[i];
|
|
1440
|
+
if (c === '"') {
|
|
1441
|
+
inQuotes = !inQuotes;
|
|
1442
|
+
} else if (c === "," && !inQuotes) {
|
|
1443
|
+
n++;
|
|
1444
|
+
}
|
|
1445
|
+
}
|
|
1446
|
+
return n;
|
|
1447
|
+
}
|
|
1448
|
+
function prepareCsvTextForCognipeer(text, options) {
|
|
1449
|
+
const warnings = [];
|
|
1450
|
+
const includeHeader = options?.includeHeader !== false;
|
|
1451
|
+
const maxRows = options?.maxRows;
|
|
1452
|
+
let lines = text.split(/\r?\n/).filter((l) => l.length > 0);
|
|
1453
|
+
if (lines.length === 0) {
|
|
1454
|
+
return { text, warnings };
|
|
1455
|
+
}
|
|
1456
|
+
if (!includeHeader) {
|
|
1457
|
+
const colCount = Math.max(1, countCsvColumns(lines[0]));
|
|
1458
|
+
const synth = Array.from({ length: colCount }, (_, i) => `Column ${i + 1}`).join(",");
|
|
1459
|
+
lines = [synth, ...lines];
|
|
1460
|
+
warnings.push(
|
|
1461
|
+
"[docmind-markdown:csv] includeHeader:false: prepended synthetic header row so the first CSV row appears as table data."
|
|
1462
|
+
);
|
|
1463
|
+
}
|
|
1464
|
+
if (maxRows != null && maxRows >= 0) {
|
|
1465
|
+
const header = lines[0];
|
|
1466
|
+
const rest = lines.slice(1);
|
|
1467
|
+
const data = rest.slice(0, maxRows);
|
|
1468
|
+
if (rest.length > maxRows) {
|
|
1469
|
+
warnings.push(
|
|
1470
|
+
`[docmind-markdown:csv] maxRows:${maxRows}: truncated data rows before conversion (line-based; quoted newlines inside fields may skew counts).`
|
|
1471
|
+
);
|
|
1472
|
+
}
|
|
1473
|
+
lines = [header, ...data];
|
|
1474
|
+
}
|
|
1475
|
+
return { text: lines.join("\n"), warnings };
|
|
1476
|
+
}
|
|
1477
|
+
function stripSpreadsheetSheetHeadings(markdown) {
|
|
1478
|
+
return markdown.replace(/^##[^\n]+\n+/gm, "");
|
|
1479
|
+
}
|
|
1480
|
+
function limitSpreadsheetMarkdownRowsPerSheet(markdown, maxRowsPerSheet) {
|
|
1481
|
+
const warnings = [];
|
|
1482
|
+
if (maxRowsPerSheet < 0) return { markdown, warnings };
|
|
1483
|
+
const lines = markdown.split("\n");
|
|
1484
|
+
const out = [];
|
|
1485
|
+
let i = 0;
|
|
1486
|
+
let truncatedAny = false;
|
|
1487
|
+
const emitLimitedTable = (tableLines) => {
|
|
1488
|
+
if (tableLines.length >= 3) {
|
|
1489
|
+
const header = tableLines[0];
|
|
1490
|
+
const sep = tableLines[1];
|
|
1491
|
+
const body = tableLines.slice(2, 2 + maxRowsPerSheet);
|
|
1492
|
+
if (tableLines.length - 2 > maxRowsPerSheet) truncatedAny = true;
|
|
1493
|
+
out.push(header, sep, ...body);
|
|
1494
|
+
} else {
|
|
1495
|
+
out.push(...tableLines);
|
|
1496
|
+
}
|
|
1497
|
+
};
|
|
1498
|
+
while (i < lines.length) {
|
|
1499
|
+
const line = lines[i];
|
|
1500
|
+
const isSheetTitle = /^##\s+.+$/.test(line);
|
|
1501
|
+
if (isSheetTitle) {
|
|
1502
|
+
out.push(line);
|
|
1503
|
+
i++;
|
|
1504
|
+
while (i < lines.length && lines[i].trim() === "") {
|
|
1505
|
+
out.push(lines[i]);
|
|
1506
|
+
i++;
|
|
1507
|
+
}
|
|
1508
|
+
const tableStart = i;
|
|
1509
|
+
while (i < lines.length && lines[i].trim().startsWith("|")) {
|
|
1510
|
+
i++;
|
|
1511
|
+
}
|
|
1512
|
+
emitLimitedTable(lines.slice(tableStart, i));
|
|
1513
|
+
continue;
|
|
1514
|
+
}
|
|
1515
|
+
if (line.trim().startsWith("|")) {
|
|
1516
|
+
const tableStart = i;
|
|
1517
|
+
while (i < lines.length && lines[i].trim().startsWith("|")) {
|
|
1518
|
+
i++;
|
|
1519
|
+
}
|
|
1520
|
+
emitLimitedTable(lines.slice(tableStart, i));
|
|
1521
|
+
continue;
|
|
1522
|
+
}
|
|
1523
|
+
out.push(line);
|
|
1524
|
+
i++;
|
|
1525
|
+
}
|
|
1526
|
+
if (truncatedAny) {
|
|
1527
|
+
warnings.push(
|
|
1528
|
+
`[docmind-markdown:spreadsheet] maxRowsPerSheet:${maxRowsPerSheet}: truncated data rows in one or more sheet tables.`
|
|
1529
|
+
);
|
|
1530
|
+
}
|
|
1531
|
+
return { markdown: out.join("\n"), warnings };
|
|
1532
|
+
}
|
|
1533
|
+
|
|
1534
|
+
// src/csv-markdown.ts
|
|
1535
|
+
function looksLikeCsvContent(s) {
|
|
1536
|
+
return s.includes(",") && /[\r\n]/.test(s);
|
|
1537
|
+
}
|
|
1538
|
+
function stripCsvOptionKeys(o) {
|
|
1539
|
+
if (!o) return {};
|
|
1540
|
+
const {
|
|
1541
|
+
inputMode: _im,
|
|
1542
|
+
includeHeader: _ih,
|
|
1543
|
+
compactMode: _cm,
|
|
1544
|
+
maxRows: _mr,
|
|
1545
|
+
...rest
|
|
1546
|
+
} = o;
|
|
1547
|
+
return rest;
|
|
1548
|
+
}
|
|
1549
|
+
function finishCsvResult(r, prependWarnings, options) {
|
|
1550
|
+
const markdown = options?.compactMode === true ? compactMarkdownOutput(r.markdown) : r.markdown;
|
|
1551
|
+
if (prependWarnings.length === 0 && markdown === r.markdown) return r;
|
|
1552
|
+
return {
|
|
1553
|
+
...r,
|
|
1554
|
+
markdown,
|
|
1555
|
+
warnings: [...prependWarnings, ...r.warnings]
|
|
1556
|
+
};
|
|
1557
|
+
}
|
|
1558
|
+
async function readUtf8File(path) {
|
|
1559
|
+
const { readFile } = await importEsm("node:fs/promises");
|
|
1560
|
+
return readFile(path, "utf8");
|
|
1561
|
+
}
|
|
1562
|
+
async function resolveCsvStringInput(s, mode) {
|
|
1563
|
+
if (mode === "content") return { kind: "text", text: s };
|
|
1564
|
+
if (mode === "path") return { kind: "path", path: s };
|
|
1565
|
+
if (isNodeRuntime() && await isExistingRegularFile(s)) return { kind: "path", path: s };
|
|
1566
|
+
if (looksLikeCsvContent(s)) return { kind: "text", text: s };
|
|
1567
|
+
return { kind: "path", path: s };
|
|
1568
|
+
}
|
|
1569
|
+
function csvNeedsPreprocess(options) {
|
|
1570
|
+
return options?.maxRows != null || options?.includeHeader === false;
|
|
1571
|
+
}
|
|
1572
|
+
async function convertCsvToMarkdown(input, options) {
|
|
1573
|
+
const cognipeerOptions = stripCsvOptionKeys(options);
|
|
1574
|
+
const prepArgs = { includeHeader: options?.includeHeader, maxRows: options?.maxRows };
|
|
1575
|
+
const needsPrep = csvNeedsPreprocess(options);
|
|
1576
|
+
const strMode = options?.inputMode ?? "auto";
|
|
1577
|
+
if (typeof input === "string") {
|
|
1578
|
+
if (!isNodeRuntime()) {
|
|
1579
|
+
const r3 = await convertCognipeerFileToMarkdown("csv", "document.csv", input, cognipeerOptions);
|
|
1580
|
+
return finishCsvResult(r3, [], options);
|
|
1581
|
+
}
|
|
1582
|
+
const resolved = await resolveCsvStringInput(input, strMode);
|
|
1583
|
+
if (resolved.kind === "path") {
|
|
1584
|
+
if (needsPrep) {
|
|
1585
|
+
const raw = await readUtf8File(resolved.path);
|
|
1586
|
+
const { text: text3, warnings: w3 } = prepareCsvTextForCognipeer(raw, prepArgs);
|
|
1587
|
+
const r4 = await convertCognipeerFileToMarkdown(
|
|
1588
|
+
"csv",
|
|
1589
|
+
"document.csv",
|
|
1590
|
+
Buffer.from(text3, "utf8"),
|
|
1591
|
+
cognipeerOptions
|
|
1592
|
+
);
|
|
1593
|
+
return finishCsvResult(r4, w3, options);
|
|
1594
|
+
}
|
|
1595
|
+
const r3 = await convertCognipeerFileToMarkdown(
|
|
1596
|
+
"csv",
|
|
1597
|
+
"document.csv",
|
|
1598
|
+
resolved.path,
|
|
1599
|
+
cognipeerOptions
|
|
1600
|
+
);
|
|
1601
|
+
return finishCsvResult(r3, [], options);
|
|
1602
|
+
}
|
|
1603
|
+
const { text: text2, warnings: w2 } = prepareCsvTextForCognipeer(resolved.text, prepArgs);
|
|
1604
|
+
const r2 = await convertCognipeerFileToMarkdown(
|
|
1605
|
+
"csv",
|
|
1606
|
+
"document.csv",
|
|
1607
|
+
Buffer.from(text2, "utf8"),
|
|
1608
|
+
cognipeerOptions
|
|
1609
|
+
);
|
|
1610
|
+
return finishCsvResult(r2, w2, options);
|
|
1611
|
+
}
|
|
1612
|
+
if (!needsPrep) {
|
|
1613
|
+
const r2 = await convertCognipeerFileToMarkdown("csv", "document.csv", input, cognipeerOptions);
|
|
1614
|
+
return finishCsvResult(r2, [], options);
|
|
1615
|
+
}
|
|
1616
|
+
const buf = await toNodeBuffer2(input);
|
|
1617
|
+
const { text, warnings: w } = prepareCsvTextForCognipeer(buf.toString("utf8"), prepArgs);
|
|
1618
|
+
const r = await convertCognipeerFileToMarkdown(
|
|
1619
|
+
"csv",
|
|
1620
|
+
"document.csv",
|
|
1621
|
+
Buffer.from(text, "utf8"),
|
|
1622
|
+
cognipeerOptions
|
|
1623
|
+
);
|
|
1624
|
+
return finishCsvResult(r, w, options);
|
|
1625
|
+
}
|
|
1626
|
+
|
|
1627
|
+
// src/spreadsheet-markdown.ts
|
|
1628
|
+
function stripSpreadsheetOptionKeys(o) {
|
|
1629
|
+
if (!o) return {};
|
|
1630
|
+
const { includeSheetNames: _isn, compactMode: _cm, maxRowsPerSheet: _mr, ...rest } = o;
|
|
1631
|
+
return rest;
|
|
1632
|
+
}
|
|
1633
|
+
function finishSpreadsheetResult(r, options) {
|
|
1634
|
+
if (!options) return r;
|
|
1635
|
+
let markdown = r.markdown;
|
|
1636
|
+
const warnings = [...r.warnings];
|
|
1637
|
+
if (r.source === "cognipeer") {
|
|
1638
|
+
if (options.maxRowsPerSheet != null) {
|
|
1639
|
+
const lim = limitSpreadsheetMarkdownRowsPerSheet(markdown, options.maxRowsPerSheet);
|
|
1640
|
+
markdown = lim.markdown;
|
|
1641
|
+
warnings.push(...lim.warnings);
|
|
1642
|
+
}
|
|
1643
|
+
if (options.includeSheetNames === false) {
|
|
1644
|
+
markdown = stripSpreadsheetSheetHeadings(markdown);
|
|
1645
|
+
warnings.push(
|
|
1646
|
+
"[docmind-markdown:spreadsheet] includeSheetNames:false: removed ## sheet title lines from specialized output."
|
|
1647
|
+
);
|
|
1648
|
+
}
|
|
1649
|
+
}
|
|
1650
|
+
if (options.compactMode === true) {
|
|
1651
|
+
markdown = compactMarkdownOutput(markdown);
|
|
1652
|
+
}
|
|
1653
|
+
if (markdown === r.markdown && warnings.length === r.warnings.length) return r;
|
|
1654
|
+
return { ...r, markdown, warnings };
|
|
1655
|
+
}
|
|
1656
|
+
async function convertSpreadsheetToMarkdown(input, options) {
|
|
1657
|
+
const cognipeerOptions = stripSpreadsheetOptionKeys(options);
|
|
1658
|
+
const r = await convertCognipeerFileToMarkdown(
|
|
1659
|
+
"spreadsheet",
|
|
1660
|
+
"document.xlsx",
|
|
1661
|
+
input,
|
|
1662
|
+
cognipeerOptions
|
|
1663
|
+
);
|
|
1664
|
+
return finishSpreadsheetResult(r, options);
|
|
1665
|
+
}
|
|
1197
1666
|
function isArrayBufferLike(data) {
|
|
1198
1667
|
if (data instanceof ArrayBuffer) return true;
|
|
1199
1668
|
if (typeof Uint8Array !== "undefined" && data instanceof Uint8Array) return true;
|
|
@@ -1210,7 +1679,15 @@ function isExtractMarkdownPathInput(value) {
|
|
|
1210
1679
|
}
|
|
1211
1680
|
function pickStructuredMarkdownOptions(options) {
|
|
1212
1681
|
if (!options) return {};
|
|
1213
|
-
const {
|
|
1682
|
+
const {
|
|
1683
|
+
structuredFallback: _a,
|
|
1684
|
+
docx: _b,
|
|
1685
|
+
pdf: _c,
|
|
1686
|
+
html: _h,
|
|
1687
|
+
csv: _csv,
|
|
1688
|
+
spreadsheet: _s,
|
|
1689
|
+
...rest
|
|
1690
|
+
} = options;
|
|
1214
1691
|
return rest;
|
|
1215
1692
|
}
|
|
1216
1693
|
function buildDocxOptions(extract) {
|
|
@@ -1233,25 +1710,109 @@ function buildPdfOptions(extract) {
|
|
|
1233
1710
|
structuredMarkdown: { ...sm, ...pdf?.structuredMarkdown }
|
|
1234
1711
|
};
|
|
1235
1712
|
}
|
|
1713
|
+
function buildHtmlOptions(extract) {
|
|
1714
|
+
const html = extract?.html;
|
|
1715
|
+
const fb = extract?.structuredFallback;
|
|
1716
|
+
const sm = pickStructuredMarkdownOptions(extract);
|
|
1717
|
+
return {
|
|
1718
|
+
...html,
|
|
1719
|
+
resolveStructured: html?.resolveStructured ?? (fb ? () => Promise.resolve(fb) : void 0),
|
|
1720
|
+
structuredMarkdown: { ...sm, ...html?.structuredMarkdown }
|
|
1721
|
+
};
|
|
1722
|
+
}
|
|
1723
|
+
function buildCsvOptions(extract) {
|
|
1724
|
+
const csv = extract?.csv;
|
|
1725
|
+
const fb = extract?.structuredFallback;
|
|
1726
|
+
const sm = pickStructuredMarkdownOptions(extract);
|
|
1727
|
+
return {
|
|
1728
|
+
...csv,
|
|
1729
|
+
resolveStructured: csv?.resolveStructured ?? (fb ? () => Promise.resolve(fb) : void 0),
|
|
1730
|
+
structuredMarkdown: { ...sm, ...csv?.structuredMarkdown }
|
|
1731
|
+
};
|
|
1732
|
+
}
|
|
1733
|
+
function buildSpreadsheetOptions(extract) {
|
|
1734
|
+
const spreadsheet = extract?.spreadsheet;
|
|
1735
|
+
const fb = extract?.structuredFallback;
|
|
1736
|
+
const sm = pickStructuredMarkdownOptions(extract);
|
|
1737
|
+
return {
|
|
1738
|
+
...spreadsheet,
|
|
1739
|
+
resolveStructured: spreadsheet?.resolveStructured ?? (fb ? () => Promise.resolve(fb) : void 0),
|
|
1740
|
+
structuredMarkdown: { ...sm, ...spreadsheet?.structuredMarkdown }
|
|
1741
|
+
};
|
|
1742
|
+
}
|
|
1236
1743
|
function toUint8View(data) {
|
|
1237
1744
|
if (data instanceof Uint8Array) return data;
|
|
1238
1745
|
if (data instanceof ArrayBuffer) return new Uint8Array(data);
|
|
1239
1746
|
return new Uint8Array(data);
|
|
1240
1747
|
}
|
|
1748
|
+
var XLS_OLE_MAGIC = new Uint8Array([208, 207, 17, 224, 161, 177, 26, 225]);
|
|
1749
|
+
function uint8ArraysEqual(a, b) {
|
|
1750
|
+
if (a.length !== b.length) return false;
|
|
1751
|
+
for (let i = 0; i < a.length; i++) if (a[i] !== b[i]) return false;
|
|
1752
|
+
return true;
|
|
1753
|
+
}
|
|
1754
|
+
function containsUtf8Substring(haystack, needle) {
|
|
1755
|
+
const bytes = new TextEncoder().encode(needle);
|
|
1756
|
+
if (bytes.length === 0 || haystack.length < bytes.length) return false;
|
|
1757
|
+
outer: for (let i = 0; i <= haystack.length - bytes.length; i++) {
|
|
1758
|
+
for (let j = 0; j < bytes.length; j++) {
|
|
1759
|
+
if (haystack[i + j] !== bytes[j]) continue outer;
|
|
1760
|
+
}
|
|
1761
|
+
return true;
|
|
1762
|
+
}
|
|
1763
|
+
return false;
|
|
1764
|
+
}
|
|
1765
|
+
function isZipLocalHeader(u) {
|
|
1766
|
+
return u.length >= 4 && u[0] === 80 && u[1] === 75 && (u[2] === 3 || u[2] === 5 || u[2] === 7);
|
|
1767
|
+
}
|
|
1768
|
+
function looksLikeUtf8HtmlPrefix(u) {
|
|
1769
|
+
if (u.length === 0) return false;
|
|
1770
|
+
let start = 0;
|
|
1771
|
+
if (u.length >= 3 && u[0] === 239 && u[1] === 187 && u[2] === 191) start = 3;
|
|
1772
|
+
let s = "";
|
|
1773
|
+
const n = Math.min(u.length, 256);
|
|
1774
|
+
for (let i = start; i < n; i++) {
|
|
1775
|
+
const c = u[i];
|
|
1776
|
+
if (c === 0 || c > 127) return false;
|
|
1777
|
+
s += String.fromCharCode(c);
|
|
1778
|
+
}
|
|
1779
|
+
const t = s.trimStart().slice(0, 96).toLowerCase();
|
|
1780
|
+
return t.startsWith("<!doctype html") || t.startsWith("<html") || t.startsWith("<head") || t.startsWith("<!--");
|
|
1781
|
+
}
|
|
1241
1782
|
function detectBinaryFormat(data, filename, mimeType) {
|
|
1242
1783
|
const u = toUint8View(data);
|
|
1243
1784
|
const lower = filename?.toLowerCase() ?? "";
|
|
1244
1785
|
const mime = mimeType?.toLowerCase() ?? "";
|
|
1245
1786
|
if (mime.includes("pdf") || lower.endsWith(".pdf")) return "pdf";
|
|
1787
|
+
if (mime.includes("text/html") || mime.includes("application/xhtml+xml") || lower.endsWith(".html") || lower.endsWith(".htm")) {
|
|
1788
|
+
return "html";
|
|
1789
|
+
}
|
|
1790
|
+
if (mime.includes("text/csv") || mime.includes("application/csv") || lower.endsWith(".csv")) {
|
|
1791
|
+
return "csv";
|
|
1792
|
+
}
|
|
1793
|
+
if (mime.includes("spreadsheetml") || mime.includes("officedocument.spreadsheetml") || mime.includes("application/vnd.ms-excel") || lower.endsWith(".xlsx") || lower.endsWith(".xls")) {
|
|
1794
|
+
return "spreadsheet";
|
|
1795
|
+
}
|
|
1246
1796
|
if (mime.includes("wordprocessingml") || mime.includes("officedocument.wordprocessingml.document") || lower.endsWith(".docx")) {
|
|
1247
1797
|
return "docx";
|
|
1248
1798
|
}
|
|
1249
1799
|
if (u.length >= 4 && u[0] === 37 && u[1] === 80 && u[2] === 68 && u[3] === 70) {
|
|
1250
1800
|
return "pdf";
|
|
1251
1801
|
}
|
|
1252
|
-
if (u.length >=
|
|
1253
|
-
return "
|
|
1802
|
+
if (u.length >= XLS_OLE_MAGIC.length && uint8ArraysEqual(u.subarray(0, XLS_OLE_MAGIC.length), XLS_OLE_MAGIC)) {
|
|
1803
|
+
return "spreadsheet";
|
|
1254
1804
|
}
|
|
1805
|
+
if (isZipLocalHeader(u)) {
|
|
1806
|
+
const hasWordDoc = containsUtf8Substring(u, "word/document");
|
|
1807
|
+
const hasXlWorkbook = containsUtf8Substring(u, "xl/workbook");
|
|
1808
|
+
if (hasWordDoc && !hasXlWorkbook) return "docx";
|
|
1809
|
+
if (hasXlWorkbook && !hasWordDoc) return "spreadsheet";
|
|
1810
|
+
if (hasWordDoc && hasXlWorkbook) return "docx";
|
|
1811
|
+
if (lower.endsWith(".docx")) return "docx";
|
|
1812
|
+
if (lower.endsWith(".xlsx")) return "spreadsheet";
|
|
1813
|
+
return "unknown";
|
|
1814
|
+
}
|
|
1815
|
+
if (looksLikeUtf8HtmlPrefix(u)) return "html";
|
|
1255
1816
|
return "unknown";
|
|
1256
1817
|
}
|
|
1257
1818
|
function docxStrategyFromSource(source) {
|
|
@@ -1272,6 +1833,21 @@ function pdfStrategyFromResult(r) {
|
|
|
1272
1833
|
return "pdf-cognipeer-specialized";
|
|
1273
1834
|
}
|
|
1274
1835
|
}
|
|
1836
|
+
function cognipeerFileStrategyFromResult(format, r) {
|
|
1837
|
+
switch (r.source) {
|
|
1838
|
+
case "structured-fallback":
|
|
1839
|
+
return format === "html" ? "html-structured-fallback" : format === "csv" ? "csv-structured-fallback" : "spreadsheet-structured-fallback";
|
|
1840
|
+
case "unsupported-runtime":
|
|
1841
|
+
return format === "html" ? "html-unsupported-runtime" : format === "csv" ? "csv-unsupported-runtime" : "spreadsheet-unsupported-runtime";
|
|
1842
|
+
case "cognipeer-unavailable":
|
|
1843
|
+
return format === "html" ? "html-cognipeer-unavailable" : format === "csv" ? "csv-cognipeer-unavailable" : "spreadsheet-cognipeer-unavailable";
|
|
1844
|
+
case "cognipeer-failed":
|
|
1845
|
+
return format === "html" ? "html-cognipeer-failed" : format === "csv" ? "csv-cognipeer-failed" : "spreadsheet-cognipeer-failed";
|
|
1846
|
+
case "cognipeer":
|
|
1847
|
+
default:
|
|
1848
|
+
return format === "html" ? "html-cognipeer-specialized" : format === "csv" ? "csv-cognipeer-specialized" : "spreadsheet-cognipeer-specialized";
|
|
1849
|
+
}
|
|
1850
|
+
}
|
|
1275
1851
|
function mergeWarnings(base, ...more) {
|
|
1276
1852
|
const out = [...base];
|
|
1277
1853
|
for (const m of more) {
|
|
@@ -1280,6 +1856,30 @@ function mergeWarnings(base, ...more) {
|
|
|
1280
1856
|
return out;
|
|
1281
1857
|
}
|
|
1282
1858
|
// Prefix tag placed at the start of the warning/trace strings built by the trace* helpers in this file.
var EXTRACT_WARN = "[docmind-markdown:extractMarkdown]";
|
|
1859
|
+
// Prefix tag placed at the start of the routingSummary string produced by buildRouting.
var ROUTING_TAG = "[docmind-markdown:extractMarkdown:routing]";
|
|
1860
|
+
/**
 * Derives a coarse media category from a MIME type and/or a filename.
 *
 * Checks run in this order, first match wins:
 *   1. "image" - MIME starts with "image/" or the filename has an image extension.
 *   2. "text"  - MIME starts with "text/", is "application/json", or the filename
 *                ends in .txt/.md/.json/.log.
 *   3. "audio" / "video" - decided by MIME prefix only.
 *
 * MIME parameters (everything from the first ";", e.g. "; charset=utf-8") are
 * dropped before matching, so "application/json; charset=utf-8" is classified
 * the same way as "application/json".
 *
 * @param {string | null | undefined} mimeType - MIME type, optionally with parameters.
 * @param {string | null | undefined} filename - File name or path; only its ending is inspected.
 * @returns {"image" | "text" | "audio" | "video" | undefined} The hint, or
 *   undefined when nothing matches.
 */
function inferMediaHint(mimeType, filename) {
  // Keep only the "type/subtype" part: parameters would otherwise defeat the
  // exact "application/json" comparison below.
  const m = mimeType?.split(";", 1)[0].trim().toLowerCase() ?? "";
  const f = filename?.toLowerCase() ?? "";
  if (m.startsWith("image/") || /\.(png|jpe?g|gif|webp|bmp|ico|svg|tiff?)$/i.test(f)) {
    return "image";
  }
  if (m.startsWith("text/") || m === "application/json" || /\.(txt|md|json|log)$/i.test(f)) {
    return "text";
  }
  if (m.startsWith("audio/")) return "audio";
  if (m.startsWith("video/")) return "video";
  return undefined;
}
|
|
1873
|
+
/**
 * Assembles the `routing` object attached to extractMarkdown-style results.
 *
 * Copies detectedFormat, specializedPipeline, usedStructuredFallback and
 * mediaHint from `p`, and adds a one-line `routingSummary` that starts with
 * ROUTING_TAG and lists strategy, format, pipeline and the fallback flag.
 * The ` mediaHint=...` suffix is appended only when `p.mediaHint` is truthy.
 *
 * @param {object} p - Routing facts; `p.strategy` is used only inside the summary.
 * @returns {object} `{ detectedFormat, specializedPipeline, usedStructuredFallback, mediaHint, routingSummary }`.
 */
function buildRouting(p) {
  const { detectedFormat, specializedPipeline, usedStructuredFallback, mediaHint, strategy } = p;
  let routingSummary = `${ROUTING_TAG} strategy=${strategy} format=${detectedFormat} pipeline=${specializedPipeline} structuredFallback=${usedStructuredFallback}`;
  if (mediaHint) {
    routingSummary += ` mediaHint=${mediaHint}`;
  }
  return {
    detectedFormat,
    specializedPipeline,
    usedStructuredFallback,
    mediaHint,
    routingSummary
  };
}
|
|
1283
1883
|
/**
 * Builds the warning line stating that the final Markdown came from
 * structuredFallback.
 *
 * @param {string} context - Label interpolated after the EXTRACT_WARN prefix.
 * @returns {string} The formatted warning line.
 */
function traceUsedStructuredFallback(context) {
  const explanation = "final Markdown from structuredFallback (specialized route unavailable, failed, or insufficient).";
  return `${EXTRACT_WARN} ${context}: ${explanation}`;
}
|
|
@@ -1295,6 +1895,75 @@ function tracePdfStructuredAfterCognipeer() {
|
|
|
1295
1895
|
/**
 * Builds the warning line for a PDF whose specialized route produced no
 * Markdown while no structuredFallback was provided.
 *
 * @returns {string} The formatted warning line.
 */
function tracePdfSpecializedDeadEnd() {
  const explanation = "Cognipeer specialized route did not produce Markdown and structuredFallback was not provided.";
  return `${EXTRACT_WARN} pdf: ${explanation}`;
}
|
|
1898
|
+
/**
 * Builds the warning line stating that structuredFallback supplied the
 * Markdown because @cognipeer/to-markdown cannot run in this environment.
 *
 * @param {string} label - Format label used in the `<label>-unsupported-runtime` tag.
 * @returns {string} The formatted warning line.
 */
function traceCognipeerFileStructuredAfterUnsupportedRuntime(label) {
  const explanation = "final Markdown from structuredFallback \u2014 @cognipeer/to-markdown cannot run in this environment.";
  return `${EXTRACT_WARN} ${label}-unsupported-runtime: ${explanation}`;
}
|
|
1901
|
+
/**
 * Builds the warning line stating that the Markdown came from the structured
 * envelope after the Cognipeer path for `label` did not yield the result.
 *
 * @param {string} label - Format label; appears in the tag and in the sentence.
 * @returns {string} The formatted warning line.
 */
function traceCognipeerFileStructuredAfterCognipeer(label) {
  const tagged = `${EXTRACT_WARN} ${label}-structured-fallback`;
  return `${tagged}: final Markdown from structured envelope after Cognipeer ${label} path did not yield the result.`;
}
|
|
1904
|
+
/**
 * Builds the warning line for a file whose specialized route produced no
 * Markdown while no structuredFallback was provided.
 *
 * @param {string} label - Format label interpolated after the EXTRACT_WARN prefix.
 * @returns {string} The formatted warning line.
 */
function traceCognipeerFileSpecializedDeadEnd(label) {
  const explanation = "Cognipeer specialized route did not produce Markdown and structuredFallback was not provided.";
  return `${EXTRACT_WARN} ${label}: ${explanation}`;
}
|
|
1907
|
+
/**
 * Builds the warning line stating that the specialized route returned empty
 * Markdown and convertStructuredToMarkdown is applied at the extractMarkdown
 * layer instead.
 *
 * @param {string} label - Format label used in the `<label>-extract-layer-fallback` tag.
 * @returns {string} The formatted warning line.
 */
function traceCognipeerFileExtractLayerFallback(label) {
  const explanation = "specialized route returned empty Markdown but structuredFallback is set \u2014 applying convertStructuredToMarkdown at extractMarkdown layer.";
  return `${EXTRACT_WARN} ${label}-extract-layer-fallback: ${explanation}`;
}
|
|
1910
|
+
/**
 * Builds the PDF warning line stating that the specialized route returned
 * empty Markdown and convertStructuredToMarkdown is applied at the
 * extractMarkdown layer instead.
 *
 * @returns {string} The formatted warning line.
 */
function tracePdfStructuredExtractLayerFallback() {
  const explanation = "specialized route returned empty Markdown but structuredFallback is set \u2014 applying convertStructuredToMarkdown at extractMarkdown layer.";
  return `${EXTRACT_WARN} pdf-extract-layer-fallback: ${explanation}`;
}
|
|
1913
|
+
/**
 * Runs the HTML, CSV or spreadsheet converter on `data` and wraps the outcome
 * as `{ markdown, warnings, strategy, routing }`.
 *
 * When the converter yields no Markdown and `fb` is provided, the Markdown is
 * rendered from `fb` with convertStructuredToMarkdown and the strategy becomes
 * `<family>-structured-fallback`. This applies to the unsupported-runtime,
 * cognipeer-failed and cognipeer-unavailable strategies.
 *
 * @param {string} format - "html" or "csv"; any other value takes the spreadsheet path.
 * @param {*} data - Input handed to the converter unchanged.
 * @param {object} [options] - Passed to the matching build*Options helper.
 * @param {string[]} baseWarnings - Warnings gathered before this branch; merged first.
 * @param {object} smOpts - Options forwarded to convertStructuredToMarkdown.
 * @param {object} [fb] - Optional structured fallback; its `warnings` are merged when used.
 * @returns {Promise<{markdown: string, warnings: string[], strategy: string, routing: object}>}
 */
async function extractCognipeerFileMarkdownBranch(format, data, options, baseWarnings, smOpts, fb) {
  let converted;
  if (format === "html") {
    converted = await convertHtmlToMarkdown(data, buildHtmlOptions(options));
  } else if (format === "csv") {
    converted = await convertCsvToMarkdown(data, buildCsvOptions(options));
  } else {
    converted = await convertSpreadsheetToMarkdown(data, buildSpreadsheetOptions(options));
  }
  const strategy = cognipeerFileStrategyFromResult(format, converted);
  let warnings = mergeWarnings(baseWarnings, converted.warnings);

  // Strategy labels share a family prefix; anything that is not html/csv is "spreadsheet".
  const family = format === "html" || format === "csv" ? format : "spreadsheet";
  const fallbackStrategy = `${family}-structured-fallback`;
  const runtimeUnsupported = strategy === `${family}-unsupported-runtime`;
  const failedOrUnavailable = strategy === `${family}-cognipeer-failed` || strategy === `${family}-cognipeer-unavailable`;

  // Shared result shape for both "render from fb" exits; only the trace line differs.
  const renderFromFallback = (makeTrace) => {
    const merged = mergeWarnings(warnings, fb.warnings, [makeTrace(format)]);
    return {
      markdown: convertStructuredToMarkdown(fb, smOpts),
      warnings: merged,
      strategy: fallbackStrategy,
      routing: buildRouting({
        detectedFormat: format,
        specializedPipeline: format,
        usedStructuredFallback: true,
        strategy: fallbackStrategy
      })
    };
  };

  // This check compares against the exact empty string, whereas the
  // failed/unavailable checks below compare the trimmed Markdown.
  if (runtimeUnsupported && converted.markdown === "" && fb) {
    return renderFromFallback(traceCognipeerFileStructuredAfterUnsupportedRuntime);
  }
  if (failedOrUnavailable && converted.markdown.trim() === "" && fb) {
    return renderFromFallback(traceCognipeerFileExtractLayerFallback);
  }

  if (strategy === fallbackStrategy) {
    // The converter itself already reported a structured-fallback result.
    warnings = mergeWarnings(warnings, [traceCognipeerFileStructuredAfterCognipeer(format)]);
  } else if (failedOrUnavailable && converted.markdown.trim() === "" && !fb) {
    // Nothing was produced and there is no fallback to render from.
    warnings = mergeWarnings(warnings, [traceCognipeerFileSpecializedDeadEnd(format)]);
  }

  return {
    markdown: converted.markdown,
    warnings,
    strategy,
    routing: buildRouting({
      detectedFormat: format,
      specializedPipeline: format,
      usedStructuredFallback: strategy === fallbackStrategy,
      strategy
    })
  };
}
|
|
1298
1967
|
async function extractMarkdown(input, options) {
|
|
1299
1968
|
const smOpts = pickStructuredMarkdownOptions(options);
|
|
1300
1969
|
const fb = options?.structuredFallback;
|
|
@@ -1323,22 +1992,24 @@ async function extractMarkdown(input, options) {
|
|
|
1323
1992
|
markdown: convertStructuredToMarkdown(fb, smOpts),
|
|
1324
1993
|
warnings: mergeWarnings(warnings, fb.warnings, [traceUsedStructuredFallback("path-requires-node")]),
|
|
1325
1994
|
strategy: "path-requires-node",
|
|
1326
|
-
routing: {
|
|
1995
|
+
routing: buildRouting({
|
|
1327
1996
|
detectedFormat: "unknown",
|
|
1328
1997
|
specializedPipeline: "none",
|
|
1329
|
-
usedStructuredFallback: true
|
|
1330
|
-
|
|
1998
|
+
usedStructuredFallback: true,
|
|
1999
|
+
strategy: "path-requires-node"
|
|
2000
|
+
})
|
|
1331
2001
|
};
|
|
1332
2002
|
}
|
|
1333
2003
|
return {
|
|
1334
2004
|
markdown: "",
|
|
1335
2005
|
warnings,
|
|
1336
2006
|
strategy: "path-requires-node",
|
|
1337
|
-
routing: {
|
|
2007
|
+
routing: buildRouting({
|
|
1338
2008
|
detectedFormat: "unknown",
|
|
1339
2009
|
specializedPipeline: "none",
|
|
1340
|
-
usedStructuredFallback: false
|
|
1341
|
-
|
|
2010
|
+
usedStructuredFallback: false,
|
|
2011
|
+
strategy: "path-requires-node"
|
|
2012
|
+
})
|
|
1342
2013
|
};
|
|
1343
2014
|
}
|
|
1344
2015
|
const { readFile } = await importEsm(
|
|
@@ -1360,10 +2031,26 @@ async function extractMarkdown(input, options) {
|
|
|
1360
2031
|
return {
|
|
1361
2032
|
markdown: convertStructuredToMarkdown(fb, smOpts),
|
|
1362
2033
|
warnings: mergeWarnings(warnings, fb.warnings, [traceUsedStructuredFallback("invalid-input-shape")]),
|
|
1363
|
-
strategy: "binary-unidentified-structured-fallback"
|
|
2034
|
+
strategy: "binary-unidentified-structured-fallback",
|
|
2035
|
+
routing: buildRouting({
|
|
2036
|
+
detectedFormat: "unknown",
|
|
2037
|
+
specializedPipeline: "none",
|
|
2038
|
+
usedStructuredFallback: true,
|
|
2039
|
+
strategy: "binary-unidentified-structured-fallback"
|
|
2040
|
+
})
|
|
1364
2041
|
};
|
|
1365
2042
|
}
|
|
1366
|
-
return {
|
|
2043
|
+
return {
|
|
2044
|
+
markdown: "",
|
|
2045
|
+
warnings,
|
|
2046
|
+
strategy: "binary-unidentified",
|
|
2047
|
+
routing: buildRouting({
|
|
2048
|
+
detectedFormat: "unknown",
|
|
2049
|
+
specializedPipeline: "none",
|
|
2050
|
+
usedStructuredFallback: false,
|
|
2051
|
+
strategy: "binary-unidentified"
|
|
2052
|
+
})
|
|
2053
|
+
};
|
|
1367
2054
|
}
|
|
1368
2055
|
const fmt = detectBinaryFormat(data, filename, mimeType);
|
|
1369
2056
|
if (fmt === "docx") {
|
|
@@ -1376,22 +2063,24 @@ async function extractMarkdown(input, options) {
|
|
|
1376
2063
|
markdown: convertStructuredToMarkdown(fb, smOpts),
|
|
1377
2064
|
warnings: mergeWarnings(warnings, fb.warnings, [traceUsedStructuredFallback("docx-requires-node")]),
|
|
1378
2065
|
strategy: "docx-requires-node",
|
|
1379
|
-
routing: {
|
|
2066
|
+
routing: buildRouting({
|
|
1380
2067
|
detectedFormat: "docx",
|
|
1381
2068
|
specializedPipeline: "none",
|
|
1382
|
-
usedStructuredFallback: true
|
|
1383
|
-
|
|
2069
|
+
usedStructuredFallback: true,
|
|
2070
|
+
strategy: "docx-requires-node"
|
|
2071
|
+
})
|
|
1384
2072
|
};
|
|
1385
2073
|
}
|
|
1386
2074
|
return {
|
|
1387
2075
|
markdown: "",
|
|
1388
2076
|
warnings,
|
|
1389
2077
|
strategy: "docx-requires-node",
|
|
1390
|
-
routing: {
|
|
2078
|
+
routing: buildRouting({
|
|
1391
2079
|
detectedFormat: "docx",
|
|
1392
2080
|
specializedPipeline: "none",
|
|
1393
|
-
usedStructuredFallback: false
|
|
1394
|
-
|
|
2081
|
+
usedStructuredFallback: false,
|
|
2082
|
+
strategy: "docx-requires-node"
|
|
2083
|
+
})
|
|
1395
2084
|
};
|
|
1396
2085
|
}
|
|
1397
2086
|
const r = await convertDocxToMarkdown(data, buildDocxOptions(options));
|
|
@@ -1405,11 +2094,12 @@ async function extractMarkdown(input, options) {
|
|
|
1405
2094
|
markdown: r.markdown,
|
|
1406
2095
|
warnings: w,
|
|
1407
2096
|
strategy,
|
|
1408
|
-
routing: {
|
|
2097
|
+
routing: buildRouting({
|
|
1409
2098
|
detectedFormat: "docx",
|
|
1410
2099
|
specializedPipeline: "docx",
|
|
1411
|
-
usedStructuredFallback: strategy === "docx-structured-fallback"
|
|
1412
|
-
|
|
2100
|
+
usedStructuredFallback: strategy === "docx-structured-fallback",
|
|
2101
|
+
strategy
|
|
2102
|
+
})
|
|
1413
2103
|
};
|
|
1414
2104
|
}
|
|
1415
2105
|
if (fmt === "pdf") {
|
|
@@ -1424,11 +2114,26 @@ async function extractMarkdown(input, options) {
|
|
|
1424
2114
|
markdown: convertStructuredToMarkdown(fb, smOpts),
|
|
1425
2115
|
warnings: w,
|
|
1426
2116
|
strategy: "pdf-structured-fallback",
|
|
1427
|
-
routing: {
|
|
2117
|
+
routing: buildRouting({
|
|
1428
2118
|
detectedFormat: "pdf",
|
|
1429
2119
|
specializedPipeline: "pdf",
|
|
1430
|
-
usedStructuredFallback: true
|
|
1431
|
-
|
|
2120
|
+
usedStructuredFallback: true,
|
|
2121
|
+
strategy: "pdf-structured-fallback"
|
|
2122
|
+
})
|
|
2123
|
+
};
|
|
2124
|
+
}
|
|
2125
|
+
if ((strategy === "pdf-cognipeer-failed" || strategy === "pdf-cognipeer-unavailable") && r.markdown.trim() === "" && fb) {
|
|
2126
|
+
w = mergeWarnings(w, fb.warnings, [tracePdfStructuredExtractLayerFallback()]);
|
|
2127
|
+
return {
|
|
2128
|
+
markdown: convertStructuredToMarkdown(fb, smOpts),
|
|
2129
|
+
warnings: w,
|
|
2130
|
+
strategy: "pdf-structured-fallback",
|
|
2131
|
+
routing: buildRouting({
|
|
2132
|
+
detectedFormat: "pdf",
|
|
2133
|
+
specializedPipeline: "pdf",
|
|
2134
|
+
usedStructuredFallback: true,
|
|
2135
|
+
strategy: "pdf-structured-fallback"
|
|
2136
|
+
})
|
|
1432
2137
|
};
|
|
1433
2138
|
}
|
|
1434
2139
|
if (strategy === "pdf-structured-fallback") {
|
|
@@ -1441,40 +2146,68 @@ async function extractMarkdown(input, options) {
|
|
|
1441
2146
|
markdown: r.markdown,
|
|
1442
2147
|
warnings: w,
|
|
1443
2148
|
strategy,
|
|
1444
|
-
routing: {
|
|
2149
|
+
routing: buildRouting({
|
|
1445
2150
|
detectedFormat: "pdf",
|
|
1446
2151
|
specializedPipeline: "pdf",
|
|
1447
|
-
usedStructuredFallback: strategy === "pdf-structured-fallback"
|
|
1448
|
-
|
|
2152
|
+
usedStructuredFallback: strategy === "pdf-structured-fallback",
|
|
2153
|
+
strategy
|
|
2154
|
+
})
|
|
1449
2155
|
};
|
|
1450
2156
|
}
|
|
2157
|
+
if (fmt === "html") {
|
|
2158
|
+
return extractCognipeerFileMarkdownBranch("html", data, options, warnings, smOpts, fb);
|
|
2159
|
+
}
|
|
2160
|
+
if (fmt === "csv") {
|
|
2161
|
+
return extractCognipeerFileMarkdownBranch("csv", data, options, warnings, smOpts, fb);
|
|
2162
|
+
}
|
|
2163
|
+
if (fmt === "spreadsheet") {
|
|
2164
|
+
return extractCognipeerFileMarkdownBranch("spreadsheet", data, options, warnings, smOpts, fb);
|
|
2165
|
+
}
|
|
2166
|
+
const mediaHint = inferMediaHint(mimeType, filename);
|
|
1451
2167
|
warnings.push(
|
|
1452
|
-
"@dragon708/docmind-markdown: Unidentified binary format (expected PDF
|
|
2168
|
+
"@dragon708/docmind-markdown: Unidentified binary format (expected PDF, OOXML Word/Excel, HTML, CSV, or related MIME/filename hints). Using structured fallback if provided."
|
|
1453
2169
|
);
|
|
2170
|
+
if (mediaHint === "image") {
|
|
2171
|
+
warnings.push(
|
|
2172
|
+
`${EXTRACT_WARN} image hint (MIME/filename): raw images are not converted by specialized file routes; pass a StructuredDocumentResult (e.g. after OCR) or structuredFallback.`
|
|
2173
|
+
);
|
|
2174
|
+
} else if (mediaHint === "text") {
|
|
2175
|
+
warnings.push(
|
|
2176
|
+
`${EXTRACT_WARN} text hint (MIME/filename): plain text / JSON bytes are not auto-routed to Markdown here; pass StructuredDocumentResult or structuredFallback for normalized text/OCR pipelines.`
|
|
2177
|
+
);
|
|
2178
|
+
} else if (mediaHint === "audio" || mediaHint === "video") {
|
|
2179
|
+
warnings.push(
|
|
2180
|
+
`${EXTRACT_WARN} ${mediaHint} hint (MIME): no specialized ${mediaHint}\u2192Markdown route in extractMarkdown; use StructuredDocumentResult or structuredFallback.`
|
|
2181
|
+
);
|
|
2182
|
+
}
|
|
1454
2183
|
if (fb) {
|
|
1455
2184
|
return {
|
|
1456
2185
|
markdown: convertStructuredToMarkdown(fb, smOpts),
|
|
1457
2186
|
warnings: mergeWarnings(warnings, fb.warnings, [traceUsedStructuredFallback("binary-unidentified")]),
|
|
1458
2187
|
strategy: "binary-unidentified-structured-fallback",
|
|
1459
|
-
routing: {
|
|
2188
|
+
routing: buildRouting({
|
|
1460
2189
|
detectedFormat: "unknown",
|
|
1461
2190
|
specializedPipeline: "none",
|
|
1462
|
-
usedStructuredFallback: true
|
|
1463
|
-
|
|
2191
|
+
usedStructuredFallback: true,
|
|
2192
|
+
strategy: "binary-unidentified-structured-fallback",
|
|
2193
|
+
mediaHint
|
|
2194
|
+
})
|
|
1464
2195
|
};
|
|
1465
2196
|
}
|
|
1466
2197
|
return {
|
|
1467
2198
|
markdown: "",
|
|
1468
2199
|
warnings,
|
|
1469
2200
|
strategy: "binary-unidentified",
|
|
1470
|
-
routing: {
|
|
2201
|
+
routing: buildRouting({
|
|
1471
2202
|
detectedFormat: "unknown",
|
|
1472
2203
|
specializedPipeline: "none",
|
|
1473
|
-
usedStructuredFallback: false
|
|
1474
|
-
|
|
2204
|
+
usedStructuredFallback: false,
|
|
2205
|
+
strategy: "binary-unidentified",
|
|
2206
|
+
mediaHint
|
|
2207
|
+
})
|
|
1475
2208
|
};
|
|
1476
2209
|
}
|
|
1477
2210
|
|
|
1478
|
-
export { convertDocxBufferToMarkdown, convertDocxToMarkdown, convertPdfBufferToMarkdown, convertPdfPathToMarkdown, convertPdfToMarkdown, convertStructuredToLlmText, convertStructuredToMarkdown, detectBinaryFormat, extractLlmContent, extractMarkdown, extractStructuredChunks, isExtractMarkdownFileInput, renderLlmText, renderMarkdown, renderMarkdownSections, splitStructuredIntoChunks, structuredDocumentToLlmText, structuredDocumentToMarkdown };
|
|
2211
|
+
export { convertCsvToMarkdown, convertDocxBufferToMarkdown, convertDocxToMarkdown, convertHtmlToMarkdown, convertPdfBufferToMarkdown, convertPdfPathToMarkdown, convertPdfToMarkdown, convertSpreadsheetToMarkdown, convertStructuredToLlmText, convertStructuredToMarkdown, detectBinaryFormat, extractLlmContent, extractMarkdown, extractStructuredChunks, isExtractMarkdownFileInput, looksLikeHtmlString, renderLlmText, renderMarkdown, renderMarkdownSections, splitStructuredIntoChunks, structuredDocumentToLlmText, structuredDocumentToMarkdown };
|
|
1479
2212
|
//# sourceMappingURL=index.js.map
|
|
1480
2213
|
//# sourceMappingURL=index.js.map
|