@wsqc2026/markitdown-typescript 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38)
  1. package/LICENSE +191 -0
  2. package/README.md +157 -0
  3. package/dist/cli.d.ts +2 -0
  4. package/dist/cli.js +3589 -0
  5. package/dist/converter-utils/docx/latex-dict.d.ts +34 -0
  6. package/dist/converter-utils/docx/omml.d.ts +25 -0
  7. package/dist/converter-utils/docx/pre-process.d.ts +9 -0
  8. package/dist/converter.d.ts +23 -0
  9. package/dist/converters/audio.d.ts +1 -0
  10. package/dist/converters/bing-serp.d.ts +1 -0
  11. package/dist/converters/csv.d.ts +1 -0
  12. package/dist/converters/docx.d.ts +1 -0
  13. package/dist/converters/epub.d.ts +1 -0
  14. package/dist/converters/exiftool.d.ts +1 -0
  15. package/dist/converters/html.d.ts +1 -0
  16. package/dist/converters/image.d.ts +1 -0
  17. package/dist/converters/index.d.ts +17 -0
  18. package/dist/converters/ipynb.d.ts +1 -0
  19. package/dist/converters/outlook-msg.d.ts +1 -0
  20. package/dist/converters/pdf.d.ts +1 -0
  21. package/dist/converters/plain-text.d.ts +1 -0
  22. package/dist/converters/pptx.d.ts +1 -0
  23. package/dist/converters/rss.d.ts +1 -0
  24. package/dist/converters/wikipedia.d.ts +1 -0
  25. package/dist/converters/xlsx.d.ts +2 -0
  26. package/dist/converters/youtube.d.ts +1 -0
  27. package/dist/converters/zip.d.ts +8 -0
  28. package/dist/exceptions.d.ts +18 -0
  29. package/dist/exit-codes.d.ts +8 -0
  30. package/dist/index.d.ts +27 -0
  31. package/dist/index.js +3184 -0
  32. package/dist/markitdown.d.ts +16 -0
  33. package/dist/stream-info.d.ts +14 -0
  34. package/dist/transforms/decode-text.d.ts +6 -0
  35. package/dist/transforms/html-to-markdown.d.ts +5 -0
  36. package/dist/types.d.ts +26 -0
  37. package/dist/uri-utils.d.ts +9 -0
  38. package/package.json +49 -0
package/dist/index.js ADDED
@@ -0,0 +1,3184 @@
1
+ import { createRequire } from "node:module";
2
+ var __require = /* @__PURE__ */ createRequire(import.meta.url);
3
+
4
+ // src/converter.ts
5
/**
 * Build a matcher that accepts a stream whose MIME type starts with one of
 * the given patterns. Matching is case-insensitive on both sides.
 *
 * A pattern ending in "/*" matches every subtype of that top-level type
 * (for example "audio/*" matches "audio/mpeg").
 *
 * @param mimes MIME prefixes or "type/*" wildcards.
 * @returns A predicate over a context exposing `info.mimetype`; it returns
 *          false when the stream has no MIME type.
 */
function byMime(...mimes) {
  // Normalise the patterns once: lower-case them (the stream's MIME type is
  // lower-cased below) and turn "type/*" into the plain "type/" prefix.
  const prefixes = mimes.map((m) => {
    const lower = m.toLowerCase();
    return lower.endsWith("/*") ? lower.slice(0, -1) : lower;
  });
  return (ctx) => {
    const mimetype = (ctx.info.mimetype ?? "").toLowerCase();
    if (!mimetype)
      return false;
    return prefixes.some((prefix) => mimetype.startsWith(prefix));
  };
}
18
/**
 * Build a matcher that accepts a stream whose file extension equals one of
 * the given extensions, compared case-insensitively.
 *
 * @param exts Extensions to accept, written the same way as `info.extension`.
 * @returns A predicate over a context exposing `info.extension`; it returns
 *          false when the stream has no extension.
 */
function byExt(...exts) {
  return (ctx) => {
    const current = (ctx.info.extension ?? "").toLowerCase();
    if (current === "") {
      return false;
    }
    for (const candidate of exts) {
      if (candidate.toLowerCase() === current) {
        return true;
      }
    }
    return false;
  };
}
26
/**
 * Build a matcher that accepts a stream whose source URL matches `pattern`.
 *
 * @param pattern Regular expression tested against `info.url`.
 * @returns A predicate over a context exposing `info.url`; it returns false
 *          when the stream has no URL.
 */
function byUrl(pattern) {
  return (ctx) => {
    const url = ctx.info.url ?? "";
    if (!url)
      return false;
    // A global or sticky regex remembers `lastIndex` between `test()` calls,
    // which would make this matcher alternate between true and false for the
    // same URL. Rewind it so every call is independent.
    pattern.lastIndex = 0;
    return pattern.test(url);
  };
}
34
/**
 * Combine matchers with logical OR: the result accepts a context as soon as
 * one of the given matchers accepts it. With no matchers it rejects.
 */
function anyOf(...matchers) {
  return (ctx) => {
    for (const matcher of matchers) {
      if (matcher(ctx)) {
        return true;
      }
    }
    return false;
  };
}
/**
 * Combine matchers with logical AND: the result accepts a context only when
 * every given matcher accepts it. With no matchers it accepts.
 */
function allOf(...matchers) {
  return (ctx) => {
    for (const matcher of matchers) {
      if (!matcher(ctx)) {
        return false;
      }
    }
    return true;
  };
}
40
/**
 * Build a matcher that accepts a stream whose `info.charset` is set, i.e. is
 * neither null nor undefined.
 */
function hasCharset() {
  return (ctx) => {
    const { charset } = ctx.info;
    return !(charset === null || charset === undefined);
  };
}
43
/**
 * Bundle a converter definition into a single record.
 *
 * @param name    Name of the converter.
 * @param match   Predicate deciding whether the converter applies to a context.
 * @param convert Function performing the conversion.
 * @returns An object with exactly the fields `name`, `match` and `convert`.
 */
function converter(name, match, convert) {
  const definition = {
    name: name,
    match: match,
    convert: convert
  };
  return definition;
}
46
+ // src/converters/exiftool.ts
47
+ import { execFile, spawn } from "node:child_process";
48
+ import { promisify } from "node:util";
49
+ var execFileAsync = promisify(execFile);
50
/**
 * Split a dotted version string into its numeric components.
 *
 * Each component goes through base-10 `parseInt`, so a component that does
 * not start with a digit becomes NaN.
 *
 * @param version Version text such as "12.24".
 * @returns One number per dot-separated component.
 */
function parseVersion(version) {
  const components = [];
  for (const piece of version.split(".")) {
    components.push(Number.parseInt(piece, 10));
  }
  return components;
}
53
/**
 * Compare two versions given as arrays of numeric components.
 *
 * Missing components count as 0, so [12] equals [12, 0]. Components that are
 * not finite numbers (for example the NaN that `parseInt` yields for garbage)
 * also count as 0, so the result is always a real number. Without that,
 * a NaN result would be neither "< 0" nor "> 0", and a minimum-version check
 * written as `compareVersions(v, min) < 0` would pass for an unparsable
 * version.
 *
 * @param a First version.
 * @param b Second version.
 * @returns Negative when a < b, positive when a > b, 0 when equal.
 */
function compareVersions(a, b) {
  const component = (parts, index) => {
    const value = parts[index] ?? 0;
    return Number.isFinite(value) ? value : 0;
  };
  const length = Math.max(a.length, b.length);
  for (let i = 0; i < length; i++) {
    const diff = component(a, i) - component(b, i);
    if (diff !== 0)
      return diff;
  }
  return 0;
}
62
/**
 * Read metadata for an in-memory file by piping it through ExifTool.
 *
 * The ExifTool version is checked first: releases older than 12.24 are
 * refused because of CVE-2021-22204.
 *
 * @param buffer       File contents, written to ExifTool's stdin.
 * @param exiftoolPath Path of the ExifTool executable; when falsy no process
 *                     is started and an empty object is returned.
 * @returns The first record of ExifTool's JSON output, or {} when the tool
 *          cannot be run or its output cannot be parsed.
 * @throws Error when the version is older than 12.24, or when the version
 *         cannot be determined.
 */
async function exiftoolMetadata(buffer, exiftoolPath) {
  if (!exiftoolPath)
    return {};
  let reported;
  try {
    const { stdout } = await execFileAsync(exiftoolPath, ["-ver"]);
    reported = stdout.trim();
  } catch {
    throw new Error("Failed to verify ExifTool version.");
  }
  const version = parseVersion(reported);
  // An empty or non-numeric version must fail verification rather than slip
  // past the minimum-version comparison below.
  if (version.length === 0 || version.some((part) => Number.isNaN(part))) {
    throw new Error("Failed to verify ExifTool version.");
  }
  if (compareVersions(version, [12, 24]) < 0) {
    throw new Error(`ExifTool version ${reported} is vulnerable to CVE-2021-22204. Please upgrade to version 12.24 or later.`);
  }
  try {
    const output = await new Promise((resolve, reject) => {
      const proc = spawn(exiftoolPath, ["-json", "-"], {
        // stderr is never read here; leaving it as an undrained pipe could
        // stall the child once the pipe buffer fills, so discard it.
        stdio: ["pipe", "pipe", "ignore"]
      });
      const chunks = [];
      proc.stdout.on("data", (chunk) => chunks.push(chunk));
      proc.on("close", () => {
        resolve(Buffer.concat(chunks).toString("utf-8"));
      });
      proc.on("error", reject);
      // If the child exits before consuming its input, the write fails with
      // an 'error' event on stdin; without a listener that would crash the
      // process. The 'close' handler still settles the promise.
      proc.stdin.on("error", () => {});
      proc.stdin.end(buffer);
    });
    const parsed = JSON.parse(output);
    return parsed[0] ?? {};
  } catch {
    // Best effort: missing metadata is not an error for callers.
    return {};
  }
}
96
+
97
+ // src/converters/audio.ts
98
// File extensions and MIME prefixes handled by the audio converter.
var ACCEPTED_EXTENSIONS = [".wav", ".mp3", ".m4a", ".mp4"];
var ACCEPTED_MIME_PREFIXES = ["audio/x-wav", "audio/mpeg", "video/mp4"];
// ExifTool fields copied into the output, in the order they are listed.
var AUDIO_METADATA_FIELDS = [
  "Title",
  "Artist",
  "Author",
  "Band",
  "Album",
  "Genre",
  "Track",
  "DateTimeOriginal",
  "CreateDate",
  "NumChannels",
  "SampleRate",
  "AvgBytesPerSec",
  "BitsPerSample"
];
/**
 * Converter for audio files: renders the selected ExifTool metadata fields as
 * a Markdown bullet list under an "Audio Metadata" heading. Produces an empty
 * document when no field is available.
 */
var audioConverter = converter("Audio", anyOf(byExt(...ACCEPTED_EXTENSIONS), byMime(...ACCEPTED_MIME_PREFIXES)), async (ctx) => {
  const metadata = await exiftoolMetadata(ctx.buffer, ctx.opts.exiftoolPath);
  const metaLines = [];
  for (const field of AUDIO_METADATA_FIELDS) {
    const value = metadata[field];
    // Skip only absent or empty values. A plain truthiness test would also
    // drop legitimate values such as a numeric 0.
    if (value != null && value !== "") {
      metaLines.push(`- **${field}:** ${value}`);
    }
  }
  const md = metaLines.length ? `# Audio Metadata\n\n${metaLines.join("\n")}\n` : "";
  return { markdown: md.trim() };
});
133
+ // src/converters/bing-serp.ts
134
+ import * as cheerio from "cheerio";
135
+ import TurndownService from "turndown";
136
+
137
+ // src/transforms/decode-text.ts
138
+ import iconv from "iconv-lite";
139
/**
 * Decode raw bytes into text.
 *
 * Strategy, in order:
 *   1. strict decoding with the platform TextDecoder for the requested charset;
 *   2. iconv-lite, when it knows the charset;
 *   3. lenient UTF-8 decoding as the last resort.
 *
 * @param buffer  Bytes to decode.
 * @param charset Charset label; UTF-8 is used when it is null or undefined.
 * @returns The decoded string.
 */
function decodeBuffer(buffer, charset) {
  const encoding = charset ?? "utf-8";
  try {
    return new TextDecoder(encoding, { fatal: true }).decode(buffer);
  } catch {
    // Unknown label or invalid bytes for this charset: try the fallbacks.
  }
  return iconv.encodingExists(encoding)
    ? iconv.decode(buffer, encoding)
    : new TextDecoder("utf-8", { fatal: false }).decode(buffer);
}
150
+
151
+ // src/converters/bing-serp.ts
152
// File extensions and MIME prefixes handled by the Bing results converter.
var ACCEPTED_EXTENSIONS2 = [".html", ".htm"];
var ACCEPTED_MIME_PREFIXES2 = ["text/html", "application/xhtml"];
/**
 * Resolve a Bing redirect link to its target URL.
 *
 * The target is read from the `u` query parameter: the first two characters
 * are dropped and the remainder is decoded as URL-safe base64.
 *
 * @param href Link taken from the results page.
 * @returns The decoded target, or `href` unchanged when it has no `u`
 *          parameter, cannot be parsed as a URL, or the payload does not
 *          decode to non-empty, valid UTF-8.
 */
function decodeRedirectUrl(href) {
  try {
    const u = new URL(href).searchParams.get("u");
    if (!u)
      return href;
    // Padding is appended unconditionally, relying on the decoder tolerating
    // surplus "=" characters.
    const encoded = `${u.slice(2).trim()}==`;
    const normalized = encoded.replace(/-/g, "+").replace(/_/g, "/");
    const bytes = Buffer.from(normalized, "base64");
    if (bytes.length === 0)
      return href;
    // Buffer#toString never fails; it substitutes U+FFFD for invalid bytes.
    // A fatal decoder throws instead, so a bogus payload keeps the original
    // link rather than replacing it with garbage.
    return new TextDecoder("utf-8", { fatal: true }).decode(bytes);
  } catch {
    return href;
  }
}
168
/**
 * Converter for Bing search result pages.
 *
 * Applies to HTML streams whose URL starts with
 * "https://www.bing.com/search?q=". Every ".b_algo" result block is converted
 * to Markdown, and the blocks are listed under a heading that quotes the
 * search query. The page <title>, if any, is returned as `title`.
 */
var bingSerpConverter = converter(
  "BingSerp",
  allOf(
    byUrl(/^https:\/\/www\.bing\.com\/search\?q=/),
    anyOf(byExt(...ACCEPTED_EXTENSIONS2), byMime(...ACCEPTED_MIME_PREFIXES2))
  ),
  async (ctx) => {
    const html = decodeBuffer(ctx.buffer, ctx.info.charset ?? "utf-8");
    const $ = cheerio.load(html);

    // The search terms come from the "q" parameter of the page URL.
    let query = "";
    try {
      query = new URL(ctx.info.url ?? "").searchParams.get("q") || "";
    } catch {}

    // Append a space to the text of ".tptt" elements, then drop the
    // ".algoSlug_icon" elements.
    $(".tptt").each((_, el) => {
      const $el = $(el);
      const text = $el.text();
      if (text)
        $el.text(`${text} `);
    });
    $(".algoSlug_icon").remove();

    const linkRule = {
      filter: "a",
      replacement(content, node) {
        if (!content.trim())
          return "";
        const rawHref = node.getAttribute("href") || "";
        const linkTitle = node.getAttribute("title") || "";
        // Bing wraps result links in a redirect; unwrap it when present.
        const href = rawHref.includes("bing.com") && rawHref.includes("u=")
          ? decodeRedirectUrl(rawHref)
          : rawHref;
        if (!href)
          return content;
        const titlePart = linkTitle ? ` "${linkTitle.replace(/"/g, '\\"')}"` : "";
        return `[${content}](${href}${titlePart})`;
      }
    };
    const imageRule = {
      filter: "img",
      replacement(_content, node) {
        const src = node.getAttribute("src") || "";
        // Images with a data: URI are omitted.
        if (src.startsWith("data:"))
          return "";
        const alt = (node.getAttribute("alt") || "").replace(/\n/g, " ");
        return `![${alt}](${src})`;
      }
    };
    const td = new TurndownService({
      headingStyle: "atx",
      codeBlockStyle: "fenced"
    });
    td.addRule("links", linkRule);
    td.addRule("images", imageRule);

    const results = [];
    for (const el of $(".b_algo").toArray()) {
      const fragment = $(el).html();
      if (!fragment)
        continue;
      // Collapse each result to its non-blank, trimmed lines.
      const compact = td
        .turndown(fragment)
        .trim()
        .split(/\n+/)
        .map((l) => l.trim())
        .filter((l) => l.length > 0)
        .join("\n");
      results.push(compact);
    }

    const title = $("title").first().text() || undefined;
    const markdown = `## A Bing search for '${query}' found the following results:\n\n${results.join("\n\n")}`;
    return { markdown, title };
  }
);
233
+ // src/converters/csv.ts
234
// File extensions and MIME prefixes handled by the CSV converter.
var ACCEPTED_EXTENSIONS3 = [".csv"];
var ACCEPTED_MIME_PREFIXES3 = ["text/csv", "application/csv"];
/**
 * Split one CSV record into its fields.
 *
 * A double quote toggles quoted mode; inside quoted mode two consecutive
 * quotes stand for one literal quote. Commas separate fields only outside
 * quoted mode. Quote characters themselves are not part of the field value.
 *
 * @param line Record text (may contain newlines inside quoted fields).
 * @returns The field values; an empty line yields a single empty field.
 */
function parseCsvLine(line) {
  const fields = [];
  let field = "";
  let quoted = false;
  let pos = 0;
  while (pos < line.length) {
    const ch = line[pos];
    if (ch === '"') {
      if (quoted && line[pos + 1] === '"') {
        // Escaped quote inside a quoted field: emit one quote, skip both.
        field += '"';
        pos += 2;
        continue;
      }
      quoted = !quoted;
    } else if (ch === "," && !quoted) {
      fields.push(field);
      field = "";
    } else {
      field += ch;
    }
    pos += 1;
  }
  fields.push(field);
  return fields;
}
267
/**
 * Parse CSV text into rows of fields.
 *
 * The text is split on LF / CRLF. A physical line that leaves a quoted field
 * open is joined with the following lines (using "\n") until the quote is
 * closed. Records that are empty or whitespace-only are skipped.
 *
 * @param text Full CSV document.
 * @returns One array of field values per record.
 */
function parseCsv(text) {
  const rows = [];
  let pending = "";
  let openQuote = false;

  // Emit the accumulated record unless it is blank.
  const flush = () => {
    if (pending.trim() !== "")
      rows.push(parseCsvLine(pending));
  };
  // Count quotes that are not part of a doubled ("") pair.
  const unpairedQuotes = (line) => {
    let count = 0;
    let i = 0;
    while (i < line.length) {
      if (line[i] === '"') {
        if (line[i + 1] === '"') {
          i += 2;
          continue;
        }
        count += 1;
      }
      i += 1;
    }
    return count;
  };

  for (const line of text.split(/\r?\n/)) {
    if (openQuote) {
      pending += `\n${line}`;
    } else {
      flush();
      pending = line;
    }
    // An odd number of unpaired quotes flips the quoted state.
    if (unpairedQuotes(line) % 2 !== 0)
      openQuote = !openQuote;
  }
  flush();
  return rows;
}
300
/**
 * Converter for CSV files: renders the parsed rows as a Markdown table.
 *
 * The first record is the header. Later records are padded with empty cells
 * or truncated so that every row has as many columns as the header. An input
 * without records produces an empty document.
 */
var csvConverter = converter("CSV", anyOf(byExt(...ACCEPTED_EXTENSIONS3), byMime(...ACCEPTED_MIME_PREFIXES3)), async (ctx) => {
  const text = decodeBuffer(ctx.buffer, ctx.info.charset);
  const rows = parseCsv(text);
  if (rows.length === 0) {
    return { markdown: "" };
  }
  // A literal "|" would be read as a column separator and a line break would
  // end the table row, so neutralise both inside cell text.
  const escapeCell = (cell) => cell.replace(/\|/g, "\\|").replace(/\r?\n/g, " ");
  const toLine = (cells) => `| ${cells.join(" | ")} |`;
  const header = rows[0];
  const numCols = header.length;
  const mdLines = [
    toLine(header.map(escapeCell)),
    toLine(header.map(() => "---"))
  ];
  for (let i = 1; i < rows.length; i++) {
    // Build a fresh, fixed-width row instead of mutating the parsed data.
    const cells = Array.from({ length: numCols }, (_, col) => escapeCell(rows[i][col] ?? ""));
    mdLines.push(toLine(cells));
  }
  return { markdown: mdLines.join("\n") };
});
321
+ // src/converters/docx.ts
322
+ import mammoth from "mammoth";
323
+
324
+ // src/converter-utils/docx/pre-process.ts
325
+ import JSZip from "jszip";
326
+
327
+ // src/converter-utils/docx/latex-dict.ts
328
// Characters that escapeLatex prefixes with a backslash.
var CHARS = new Set("{}_^#&$%~");
// Separator used when joining LaTeX fragments.
var BLANK = "";
var BACKSLASH = "\\";
// Column separator (used between matrix cells).
var ALN = "&";
// Row / line break: two backslashes.
var BRK = BACKSLASH + BACKSLASH;
// Placeholder for the argument inside function-name templates.
var FUNC_PLACE = "{fe}";
334
+ var CHR = {
335
+ "̀": "\\grave{{{0}}}",
336
+ "́": "\\acute{{{0}}}",
337
+ "̂": "\\hat{{{0}}}",
338
+ "̃": "\\tilde{{{0}}}",
339
+ "̄": "\\bar{{{0}}}",
340
+ "̅": "\\overbar{{{0}}}",
341
+ "̆": "\\breve{{{0}}}",
342
+ "̇": "\\dot{{{0}}}",
343
+ "̈": "\\ddot{{{0}}}",
344
+ "̉": "\\ovhook{{{0}}}",
345
+ "̊": "\\ocirc{{{0}}}",
346
+ "̌": "\\check{{{0}}}",
347
+ "̐": "\\candra{{{0}}}",
348
+ "̒": "\\oturnedcomma{{{0}}}",
349
+ "̕": "\\ocommatopright{{{0}}}",
350
+ "̚": "\\droang{{{0}}}",
351
+ "̸": "\\not{{{0}}}",
352
+ "⃐": "\\leftharpoonaccent{{{0}}}",
353
+ "⃑": "\\rightharpoonaccent{{{0}}}",
354
+ "⃒": "\\vertoverlay{{{0}}}",
355
+ "⃖": "\\overleftarrow{{{0}}}",
356
+ "⃗": "\\vec{{{0}}}",
357
+ "⃛": "\\dddot{{{0}}}",
358
+ "⃜": "\\ddddot{{{0}}}",
359
+ "⃡": "\\overleftrightarrow{{{0}}}",
360
+ "⃧": "\\annuity{{{0}}}",
361
+ "⃩": "\\widebridgeabove{{{0}}}",
362
+ "⃰": "\\asteraccent{{{0}}}",
363
+ "̰": "\\wideutilde{{{0}}}",
364
+ "̱": "\\underbar{{{0}}}",
365
+ "⃨": "\\threeunderdot{{{0}}}",
366
+ "⃬": "\\underrightharpoondown{{{0}}}",
367
+ "⃭": "\\underleftharpoondown{{{0}}}",
368
+ "⃮": "\\underledtarrow{{{0}}}",
369
+ "⃯": "\\underrightarrow{{{0}}}",
370
+ "⎴": "\\overbracket{{{0}}}",
371
+ "⏜": "\\overparen{{{0}}}",
372
+ "⏞": "\\overbrace{{{0}}}",
373
+ "⎵": "\\underbracket{{{0}}}",
374
+ "⏝": "\\underparen{{{0}}}",
375
+ "⏟": "\\underbrace{{{0}}}"
376
+ };
377
+ var CHR_BO = {
378
+ "⅀": "\\Bbbsum",
379
+ "∏": "\\prod",
380
+ "∐": "\\coprod",
381
+ "∑": "\\sum",
382
+ "∫": "\\int",
383
+ "⋀": "\\bigwedge",
384
+ "⋁": "\\bigvee",
385
+ "⋂": "\\bigcap",
386
+ "⋃": "\\bigcup",
387
+ "⨀": "\\bigodot",
388
+ "⨁": "\\bigoplus",
389
+ "⨂": "\\bigotimes"
390
+ };
391
+ var T = {
392
+ "→": "\\rightarrow ",
393
+ "\uD835\uDEFC": "\\alpha ",
394
+ "\uD835\uDEFD": "\\beta ",
395
+ "\uD835\uDEFE": "\\gamma ",
396
+ "\uD835\uDEFF": "\\theta ",
397
+ "\uD835\uDF00": "\\epsilon ",
398
+ "\uD835\uDF01": "\\zeta ",
399
+ "\uD835\uDF02": "\\eta ",
400
+ "\uD835\uDF03": "\\theta ",
401
+ "\uD835\uDF04": "\\iota ",
402
+ "\uD835\uDF05": "\\kappa ",
403
+ "\uD835\uDF06": "\\lambda ",
404
+ "\uD835\uDF07": "\\m ",
405
+ "\uD835\uDF08": "\\n ",
406
+ "\uD835\uDF09": "\\xi ",
407
+ "\uD835\uDF0A": "\\omicron ",
408
+ "\uD835\uDF0B": "\\pi ",
409
+ "\uD835\uDF0C": "\\rho ",
410
+ "\uD835\uDF0D": "\\varsigma ",
411
+ "\uD835\uDF0E": "\\sigma ",
412
+ "\uD835\uDF0F": "\\ta ",
413
+ "\uD835\uDF10": "\\upsilon ",
414
+ "\uD835\uDF11": "\\phi ",
415
+ "\uD835\uDF12": "\\chi ",
416
+ "\uD835\uDF13": "\\psi ",
417
+ "\uD835\uDF14": "\\omega ",
418
+ "\uD835\uDF15": "\\partial ",
419
+ "\uD835\uDF16": "\\varepsilon ",
420
+ "\uD835\uDF17": "\\vartheta ",
421
+ "\uD835\uDF18": "\\varkappa ",
422
+ "\uD835\uDF19": "\\varphi ",
423
+ "\uD835\uDF1A": "\\varrho ",
424
+ "\uD835\uDF1B": "\\varpi ",
425
+ "←": "\\leftarrow ",
426
+ "↑": "\\uparrow ",
427
+ "↓": "\\downright ",
428
+ "↔": "\\leftrightarrow ",
429
+ "↕": "\\updownarrow ",
430
+ "↖": "\\nwarrow ",
431
+ "↗": "\\nearrow ",
432
+ "↘": "\\searrow ",
433
+ "↙": "\\swarrow ",
434
+ "⋮": "\\vdots ",
435
+ "⋯": "\\cdots ",
436
+ "⋰": "\\adots ",
437
+ "⋱": "\\ddots ",
438
+ "≠": "\\ne ",
439
+ "≤": "\\leq ",
440
+ "≥": "\\geq ",
441
+ "≦": "\\leqq ",
442
+ "≧": "\\geqq ",
443
+ "≨": "\\lneqq ",
444
+ "≩": "\\gneqq ",
445
+ "≪": "\\ll ",
446
+ "≫": "\\gg ",
447
+ "∈": "\\in ",
448
+ "∉": "\\notin ",
449
+ "∋": "\\ni ",
450
+ "∌": "\\nni ",
451
+ "∞": "\\infty ",
452
+ "±": "\\pm ",
453
+ "∓": "\\mp ",
454
+ "\uD835\uDC34": "A",
455
+ "\uD835\uDC35": "B",
456
+ "\uD835\uDC36": "C",
457
+ "\uD835\uDC37": "D",
458
+ "\uD835\uDC38": "E",
459
+ "\uD835\uDC39": "F",
460
+ "\uD835\uDC3A": "G",
461
+ "\uD835\uDC3B": "H",
462
+ "\uD835\uDC3C": "I",
463
+ "\uD835\uDC3D": "J",
464
+ "\uD835\uDC3E": "K",
465
+ "\uD835\uDC3F": "L",
466
+ "\uD835\uDC40": "M",
467
+ "\uD835\uDC41": "N",
468
+ "\uD835\uDC42": "O",
469
+ "\uD835\uDC43": "P",
470
+ "\uD835\uDC44": "Q",
471
+ "\uD835\uDC45": "R",
472
+ "\uD835\uDC46": "S",
473
+ "\uD835\uDC47": "T",
474
+ "\uD835\uDC48": "U",
475
+ "\uD835\uDC49": "V",
476
+ "\uD835\uDC4A": "W",
477
+ "\uD835\uDC4B": "X",
478
+ "\uD835\uDC4C": "Y",
479
+ "\uD835\uDC4D": "Z",
480
+ "\uD835\uDC4E": "a",
481
+ "\uD835\uDC4F": "b",
482
+ "\uD835\uDC50": "c",
483
+ "\uD835\uDC51": "d",
484
+ "\uD835\uDC52": "e",
485
+ "\uD835\uDC53": "f",
486
+ "\uD835\uDC54": "g",
487
+ "\uD835\uDC56": "i",
488
+ "\uD835\uDC57": "j",
489
+ "\uD835\uDC58": "k",
490
+ "\uD835\uDC59": "l",
491
+ "\uD835\uDC5A": "m",
492
+ "\uD835\uDC5B": "n",
493
+ "\uD835\uDC5C": "o",
494
+ "\uD835\uDC5D": "p",
495
+ "\uD835\uDC5E": "q",
496
+ "\uD835\uDC5F": "r",
497
+ "\uD835\uDC60": "s",
498
+ "\uD835\uDC61": "t",
499
+ "\uD835\uDC62": "u",
500
+ "\uD835\uDC63": "v",
501
+ "\uD835\uDC64": "w",
502
+ "\uD835\uDC65": "x",
503
+ "\uD835\uDC66": "y",
504
+ "\uD835\uDC67": "z"
505
+ };
506
+ var FUNC = {
507
+ sin: "\\sin({fe})",
508
+ cos: "\\cos({fe})",
509
+ tan: "\\tan({fe})",
510
+ arcsin: "\\arcsin({fe})",
511
+ arccos: "\\arccos({fe})",
512
+ arctan: "\\arctan({fe})",
513
+ arccot: "\\arccot({fe})",
514
+ sinh: "\\sinh({fe})",
515
+ cosh: "\\cosh({fe})",
516
+ tanh: "\\tanh({fe})",
517
+ coth: "\\coth({fe})",
518
+ sec: "\\sec({fe})",
519
+ csc: "\\csc({fe})"
520
+ };
521
+ var CHR_DEFAULT = {
522
+ ACC_VAL: "\\hat{{{0}}}"
523
+ };
524
+ var POS = {
525
+ top: "\\overline{{{0}}}",
526
+ bot: "\\underline{{{0}}}"
527
+ };
528
+ var POS_DEFAULT = {
529
+ BAR_VAL: "\\overline{{{0}}}"
530
+ };
531
+ var SUB = "_{{{0}}}";
532
+ var SUP = "^{{{0}}}";
533
+ var F = {
534
+ bar: "\\frac{{{num}}}{{{den}}}",
535
+ skw: "^{{{num}}}/_{{{den}}}",
536
+ noBar: "\\genfrac{{}}{{}}{0pt}{{}}{{{num}}}{{{den}}}",
537
+ lin: "{{{num}}}/{{{den}}}"
538
+ };
539
+ var F_DEFAULT = "\\frac{{{num}}}{{{den}}}";
540
+ var D = "\\left{left}{text}\\right{right}";
541
+ var D_DEFAULT = {
542
+ left: "(",
543
+ right: ")",
544
+ null: "."
545
+ };
546
+ var RAD = "\\sqrt[{deg}]{{{text}}}";
547
+ var RAD_DEFAULT = "\\sqrt{{{text}}}";
548
+ var ARR = "\\begin{{array}}{{c}}{text}\\end{{array}}";
549
+ var LIM_FUNC = {
550
+ lim: "\\lim_{{{lim}}}",
551
+ max: "\\max_{{{lim}}}",
552
+ min: "\\min_{{{lim}}}"
553
+ };
554
+ var LIM_TO = ["\\rightarrow", "\\to"];
555
+ var LIM_UPP = "\\overset{{{lim}}}{{{text}}}";
556
+ var M = "\\begin{{matrix}}{text}\\end{{matrix}}";
557
+
558
+ // src/converter-utils/docx/omml.ts
559
// Namespace URI of Office Math Markup (OMML), and the same URI wrapped in
// braces as it appears in resolved "{namespace}local" tag names.
var OMML_NS = "http://schemas.openxmlformats.org/officeDocument/2006/math";
var OMML_NS_BRACE = `{${OMML_NS}}`;
/**
 * Fill a brace-style template.
 *
 * "{name}" is replaced by `vars.name`; "{{" and "}}" produce literal braces.
 * A placeholder with no matching entry in `vars` is left untouched.
 *
 * The template is processed in a single left-to-right pass and substituted
 * values are never rescanned. Collapsing doubled braces after substitution
 * would also collapse braces that belong to the values and corrupt nested
 * LaTeX: "\frac{x^{2}}{y}" would lose a closing brace. For the same reason
 * "$" sequences in a value are inserted literally.
 *
 * @param template Template text.
 * @param vars     Placeholder name -> replacement value.
 * @returns The filled-in text.
 */
function tpl(template, vars) {
  return template.replace(/\{\{|\}\}|\{(\w+)\}/g, (match, key) => {
    if (match === "{{")
      return "{";
    if (match === "}}")
      return "}";
    return Object.prototype.hasOwnProperty.call(vars, key) ? String(vars[key]) : match;
  });
}
569
/**
 * Fill a brace-style template that has the single positional placeholder
 * "{0}". "{{" and "}}" produce literal braces.
 *
 * The template is processed in one pass and the inserted value is never
 * rescanned, so braces and "$" sequences inside the value are preserved
 * exactly (a nested value such as "\frac{x^{2}}{y}" stays balanced).
 *
 * @param template Template text.
 * @param value    Replacement for every "{0}".
 * @returns The filled-in text.
 */
function tpl0(template, value) {
  const replacement = String(value);
  return template.replace(/\{\{|\}\}|\{0\}/g, (match) => {
    if (match === "{{")
      return "{";
    if (match === "}}")
      return "}";
    return replacement;
  });
}
574
/**
 * Backslash-escape the characters listed in CHARS.
 *
 * Doubled backslashes in the input are first reduced to single ones. A
 * character from CHARS is then prefixed with a backslash unless the
 * character right before it is already a backslash.
 *
 * @param strs Text to escape.
 * @returns The escaped text.
 */
function escapeLatex(strs) {
  const collapsed = strs.replace(/\\\\/g, "\\");
  let previous = null;
  const pieces = Array.from(collapsed, (c) => {
    const needsEscape = CHARS.has(c) && previous !== BACKSLASH;
    previous = c;
    return needsEscape ? BACKSLASH + c : c;
  });
  return pieces.join(BLANK);
}
588
/**
 * Resolve a property value through an optional lookup table.
 *
 * @param key        Raw value; null / undefined means "not specified".
 * @param defaultVal Returned when `key` is not specified ("" if omitted).
 * @param store      Optional table. When it has an own entry for `key`, that
 *                   entry is returned; otherwise `key` itself is returned.
 * @returns The mapped value, the key, or the default.
 */
function getVal(key, defaultVal, store) {
  if (key == null) {
    return defaultVal ?? "";
  }
  // Only own entries count. A plain `store[key]` would also find inherited
  // members such as "constructor" or "toString" and return a function.
  if (store && Object.prototype.hasOwnProperty.call(store, key)) {
    return store[key] ?? key;
  }
  return key;
}
594
/**
 * Parse an XML string into a small element tree.
 *
 * Each element becomes `{ tag, attrib, text, children }`. Tag and attribute
 * names are rewritten as "{namespace-uri}local" using the xmlns declarations
 * found anywhere in the input (double-quoted declarations only). `text` holds
 * the element's text content, or null when it has none.
 *
 * @param xmlStr XML document text.
 * @returns The root element.
 * @throws Error when the parser output contains no usable root element.
 */
function parseOmmlXml(xmlStr) {
  // Prefix -> namespace URI; the default namespace is stored under "".
  const namespaces = {};
  for (const decl of xmlStr.matchAll(/xmlns(?::(\w+))?="([^"]+)"/g)) {
    namespaces[decl[1] || ""] = decl[2];
  }

  const { XMLParser } = __require("fast-xml-parser");
  const parsed = new XMLParser({
    ignoreAttributes: false,
    attributeNamePrefix: "",
    preserveOrder: true,
    processEntities: false,
    trimValues: false,
    parseTagValue: false
  }).parse(xmlStr);

  // "prefix:local" -> "{uri}local"; names with unknown prefixes are unchanged.
  const qualify = (name) => {
    const pieces = name.split(":");
    if (pieces.length === 2) {
      const uri = namespaces[pieces[0]];
      return uri ? `{${uri}}${pieces[1]}` : name;
    }
    return namespaces[""] ? `{${namespaces[""]}}${name}` : name;
  };

  // In the parser's ordered output every node is an object with one tag key
  // plus an optional ":@" key that carries the attributes.
  const tagKeysOf = (node) => Object.keys(node).filter((k) => k !== ":@");

  const toElement = (node) => {
    if (!node || typeof node !== "object")
      return null;
    const tagKeys = tagKeysOf(node);
    if (tagKeys.length === 0)
      return null;
    const tagName = tagKeys[0];

    const attrib = {};
    if (node[":@"]) {
      for (const [k, v] of Object.entries(node[":@"])) {
        attrib[qualify(k)] = String(v);
      }
    }

    let text = null;
    const children = [];
    const content = node[tagName];
    if (typeof content === "string") {
      text = content;
    } else if (Array.isArray(content)) {
      for (const child of content) {
        if (!child || typeof child !== "object")
          continue;
        const childKeys = tagKeysOf(child);
        if (childKeys.length === 0)
          continue;
        if (childKeys[0] === "#text") {
          // A later text node replaces an earlier one.
          text = String(child["#text"]);
          continue;
        }
        const element = toElement(child);
        if (element)
          children.push(element);
      }
    }
    return { tag: qualify(tagName), attrib, text, children };
  };

  const root = Array.isArray(parsed) && parsed.length > 0 ? toElement(parsed[0]) : null;
  if (!root)
    throw new Error("Failed to parse OMML XML");
  return root;
}
667
/**
 * Collect the direct children of `node` whose tag equals `tag`.
 *
 * @returns A new array, empty when nothing matches.
 */
function findAll(node, tag) {
  return node.children.filter((child) => child.tag === tag);
}
/**
 * Return the first direct child of `node` whose tag equals `tag`, or null
 * when there is none.
 */
function find(node, tag) {
  const hit = node.children.find((child) => child.tag === tag);
  return hit === undefined ? null : hit;
}
682
/**
 * Remove the first occurrence of the "{OMML namespace}" marker from a tag
 * name. Tags that do not contain the marker are returned unchanged.
 */
function stripNs(tag) {
  const at = tag.indexOf(OMML_NS_BRACE);
  if (at === -1) {
    return tag;
  }
  return tag.slice(0, at) + tag.slice(at + OMML_NS_BRACE.length);
}
685
// Property children whose "val" attribute is copied onto the result.
var PR_VAL_TAGS = new Set(["chr", "pos", "begChr", "endChr", "type"]);
/**
 * Read a "...Pr" (properties) element.
 *
 * Only children in the OMML namespace are considered:
 *   - "brk" sets `brk` and contributes a line break to `text`;
 *   - tags listed in PR_VAL_TAGS store their "val" attribute under the tag
 *     name (null when the attribute is absent).
 *
 * @param elm Properties element.
 * @returns An object with `text` (the concatenated line breaks) plus the
 *          collected properties.
 */
function processPr(elm) {
  const props = { text: "" };
  const breaks = [];
  const valAttr = `${OMML_NS_BRACE}val`;
  for (const child of elm.children) {
    if (!child.tag.includes(OMML_NS))
      continue;
    const name = stripNs(child.tag);
    if (name === "brk") {
      props.brk = BRK;
      breaks.push(BRK);
      continue;
    }
    if (PR_VAL_TAGS.has(name)) {
      props[name] = child.attrib[valAttr] ?? null;
    }
  }
  props.text = breaks.join(BLANK);
  return props;
}
704
// Tags without a dedicated handler whose content is the concatenation of
// their converted children.
var DIRECT_TAGS = new Set(["box", "sSub", "sSup", "sSubSup", "num", "den", "deg", "e"]);
/**
 * Lazily convert the OMML children of an element.
 *
 * Children outside the OMML namespace are ignored, as are children whose
 * local name is not in `include` (when `include` is given). Each remaining
 * child is converted by its dedicated handler or, failing that, by the
 * generic fallback; children that neither can handle are skipped.
 *
 * @param elm     Parent element.
 * @param include Optional Set of local tag names to keep.
 * @yields [localName, convertedValue, childElement]
 */
function* processChildrenList(elm, include) {
  for (const child of elm.children) {
    const isOmml = child.tag.includes(OMML_NS);
    if (!isOmml)
      continue;
    const stag = stripNs(child.tag);
    if (include && !include.has(stag))
      continue;
    let converted = callMethod(child, stag);
    if (converted === null)
      converted = processUnknown(child, stag);
    if (converted !== null)
      yield [stag, converted, child];
  }
}
721
/**
 * Convert the children of an element and index the results by local tag
 * name. When a tag occurs more than once the last occurrence wins.
 *
 * @param elm     Parent element.
 * @param include Optional Set of local tag names to keep.
 */
function processChildrenDict(elm, include) {
  const byTag = {};
  for (const entry of processChildrenList(elm, include)) {
    byTag[entry[0]] = entry[1];
  }
  return byTag;
}
728
/**
 * Convert the children of an element and concatenate the results.
 *
 * String results are used as they are, objects with a `text` member
 * contribute that member, and anything else is stringified.
 *
 * @param elm     Parent element.
 * @param include Optional Set of local tag names to keep.
 */
function processChildren(elm, include) {
  const toText = (t) => {
    if (typeof t === "string")
      return t;
    if (t && typeof t === "object" && "text" in t)
      return t.text;
    return String(t);
  };
  return Array.from(processChildrenList(elm, include), ([, t]) => toText(t)).join(BLANK);
}
741
/**
 * Fallback conversion for tags without a dedicated handler.
 *
 * Tags in DIRECT_TAGS are rendered as their concatenated children, tags whose
 * name ends in "Pr" are read as property elements, and everything else
 * yields null.
 */
function processUnknown(elm, stag) {
  if (DIRECT_TAGS.has(stag))
    return processChildren(elm);
  return stag.endsWith("Pr") ? processPr(elm) : null;
}
749
/**
 * Run the handler registered in TAG2METH for an element.
 *
 * @param elm  Element to convert.
 * @param stag Local tag name; derived from `elm.tag` when omitted.
 * @returns The handler's result, or null when no handler is registered.
 */
function callMethod(elm, stag) {
  const handler = TAG2METH[stag || stripNs(elm.tag)];
  return handler ? handler(elm) : null;
}
757
/**
 * Convert an accent element ("acc").
 *
 * The accent character comes from `accPr`; without one the default accent
 * template (a hat) is used. A missing properties element or base expression
 * is tolerated instead of throwing or rendering the text "undefined".
 */
function doAcc(elm) {
  const cDict = processChildrenDict(elm);
  const pr = cDict.accPr ?? { text: "" };
  const latexS = getVal(pr.chr, CHR_DEFAULT.ACC_VAL, CHR);
  return tpl0(latexS, cDict.e ?? "");
}
763
/**
 * Convert a bar element ("bar") into an over- or underline.
 *
 * The position comes from `barPr`; without one the default (overline) is
 * used. A missing properties element or base expression is tolerated instead
 * of throwing or rendering the text "undefined".
 */
function doBar(elm) {
  const cDict = processChildrenDict(elm);
  const pr = cDict.barPr ?? { text: "" };
  const latexS = getVal(pr.pos, POS_DEFAULT.BAR_VAL, POS);
  return pr.text + tpl0(latexS, cDict.e ?? "");
}
769
/**
 * Convert a delimiter element ("d") into \left ... \right.
 *
 * Opening and closing characters come from `dPr` and default to "(" and ")".
 * An explicitly empty delimiter becomes ".". A missing properties element or
 * content is tolerated instead of throwing or rendering the text "undefined".
 */
function doD(elm) {
  const cDict = processChildrenDict(elm);
  const pr = cDict.dPr ?? { text: "" };
  const nullVal = D_DEFAULT.null;
  const sVal = getVal(pr.begChr, D_DEFAULT.left, T);
  const eVal = getVal(pr.endChr, D_DEFAULT.right, T);
  return pr.text + tpl(D, {
    left: !sVal ? nullVal : escapeLatex(sVal),
    text: cDict.e ?? "",
    right: !eVal ? nullVal : escapeLatex(eVal)
  });
}
781
/** Convert a subscript element: its converted content wrapped in "_{...}". */
function doSub(elm) {
  const content = processChildren(elm);
  return tpl0(SUB, content);
}
/** Convert a superscript element: its converted content wrapped in "^{...}". */
function doSup(elm) {
  const content = processChildren(elm);
  return tpl0(SUP, content);
}
787
/**
 * Convert a fraction element ("f").
 *
 * The fraction type from `fPr` selects the template; without one the default
 * \frac form is used. A missing properties element no longer throws, and a
 * missing numerator or denominator is rendered as empty.
 */
function doF(elm) {
  const cDict = processChildrenDict(elm);
  const pr = cDict.fPr ?? { text: "" };
  const latexS = getVal(pr.type, F_DEFAULT, F);
  return pr.text + tpl(latexS, { num: cDict.num ?? "", den: cDict.den ?? "" });
}
793
/**
 * Convert a function-application element ("func"): insert the converted
 * argument into the placeholder of the converted function name.
 */
function doFunc(elm) {
  const cDict = processChildrenDict(elm);
  const funcName = cDict.fName ?? "";
  const argument = cDict.e ?? "";
  // Use a replacer function so "$" sequences in the argument (for example
  // "$&" or "$'") are inserted literally rather than being interpreted as
  // replacement patterns.
  return funcName.replace(FUNC_PLACE, () => argument);
}
798
/**
 * Convert a function-name element ("fName").
 *
 * Run children ("r") must name a function listed in FUNC and are replaced by
 * its template; other children are kept as converted. The argument
 * placeholder is appended when the result does not contain it yet.
 *
 * @throws Error for a run whose text is not a known function name.
 */
function doFname(elm) {
  const pieces = Array.from(processChildrenList(elm), ([stag, t]) => {
    if (stag !== "r")
      return t;
    if (!FUNC[t])
      throw new Error(`Not supported func ${t}`);
    return FUNC[t];
  });
  const joined = pieces.join(BLANK);
  return joined.includes(FUNC_PLACE) ? joined : joined + FUNC_PLACE;
}
814
/**
 * Convert a group-character element ("groupChr"), e.g. an under- or
 * overbrace spanning an expression.
 *
 * The character from `groupChrPr` is looked up in CHR to obtain the LaTeX
 * template. Previously the raw character itself was used as the template,
 * which discarded the grouped expression.
 */
function doGroupchr(elm) {
  const cDict = processChildrenDict(elm);
  const pr = cDict.groupChrPr ?? { text: "" };
  const body = cDict.e ?? "";
  // NOTE(review): U+23DF (underbrace) is used when no character is given;
  // this is believed to be the OMML default - confirm against ECMA-376.
  const latexS = getVal(pr.chr, CHR["\u23DF"], CHR);
  if (!latexS.includes("{0}")) {
    // Unknown group character: keep the content, drop the decoration.
    return pr.text + body;
  }
  return pr.text + tpl0(latexS, body);
}
820
/**
 * Convert a radical element ("rad"): \sqrt[deg]{...} when a non-empty degree
 * is present, plain \sqrt{...} otherwise.
 */
function doRad(elm) {
  const { e, deg } = processChildrenDict(elm);
  const text = e ?? "";
  const degText = deg ?? "";
  return degText
    ? tpl(RAD, { deg: degText, text })
    : tpl(RAD_DEFAULT, { text });
}
829
/**
 * Convert an equation array ("eqArr"): its "e" children become the rows of
 * an array environment, separated by line breaks.
 */
function doEqarr(elm) {
  const lines = Array.from(processChildrenList(elm, new Set(["e"])), ([, t]) => t);
  return tpl(ARR, { text: lines.join(BRK) });
}
836
/**
 * Convert a lower-limit element ("limLow"). The base must be one of the
 * names in LIM_FUNC; the limit expression becomes its subscript.
 *
 * @throws Error when the base is not a supported limit function.
 */
function doLimlow(elm) {
  const { e: base, lim } = processChildrenDict(elm, new Set(["e", "lim"]));
  const template = LIM_FUNC[base];
  if (template) {
    return tpl(template, { lim: lim ?? "" });
  }
  throw new Error(`Not supported lim ${base}`);
}
844
/**
 * Convert an upper-limit element ("limUpp"): the limit expression is set
 * over the base with \overset.
 */
function doLimupp(elm) {
  const { e, lim } = processChildrenDict(elm, new Set(["e", "lim"]));
  return tpl(LIM_UPP, { lim: lim ?? "", text: e ?? "" });
}
848
/**
 * Convert a limit expression ("lim"): the concatenated children, with the
 * first "\rightarrow" shortened to "\to".
 */
function doLim(elm) {
  const [longArrow, shortArrow] = LIM_TO;
  const content = processChildren(elm);
  return content.replace(longArrow, shortArrow);
}
851
/**
 * Convert a matrix element ("m"): its "mr" children become the rows of a
 * matrix environment. Matrix properties and any other children are ignored.
 */
function doM(elm) {
  const rows = [];
  for (const entry of processChildrenList(elm)) {
    if (entry[0] === "mr") {
      rows.push(entry[1]);
    }
  }
  return tpl(M, { text: rows.join(BRK) });
}
860
/**
 * Convert a matrix row ("mr"): its "e" children are the cells, joined by the
 * column separator.
 */
function doMr(elm) {
  const cells = Array.from(processChildrenList(elm, new Set(["e"])), ([, t]) => t);
  return cells.join(ALN);
}
867
/**
 * Convert an n-ary operator element ("nary"), such as a sum or an integral:
 * the operator followed by its converted limits and operand.
 *
 * The operator character from `naryPr` is mapped through CHR_BO. When no
 * character is specified the integral is used; previously no operator at all
 * was produced in that case.
 * NOTE(review): integral as the default for an omitted `chr` follows the
 * OMML specification as understood here - confirm against ECMA-376.
 */
function doNary(elm) {
  const integral = CHR_BO["\u222B"];
  const res = [];
  let bo = integral;
  for (const [stag, t] of processChildrenList(elm)) {
    if (stag === "naryPr") {
      bo = getVal(t.chr, integral, CHR_BO);
    } else {
      res.push(t);
    }
  }
  return bo + res.join(BLANK);
}
880
/**
 * Convert a math run ("r"): each character of its "t" child is mapped
 * through T (characters without an entry are kept) and the result is
 * LaTeX-escaped. Returns "" when the run has no "t" child.
 */
function doR(elm) {
  const tTag = find(elm, `${OMML_NS_BRACE}t`);
  if (!tTag) {
    return "";
  }
  const mapped = Array.from(tTag.text ?? "", (ch) => T[ch] ?? ch);
  return escapeLatex(mapped.join(BLANK));
}
891
// Dispatch table: OMML local tag name -> handler converting that element.
var TAG2METH = {
  // decorations
  acc: doAcc,
  bar: doBar,
  groupChr: doGroupchr,
  // runs and scripts
  r: doR,
  sub: doSub,
  sup: doSup,
  // fractions, functions, delimiters, radicals
  f: doF,
  func: doFunc,
  fName: doFname,
  d: doD,
  rad: doRad,
  // limits
  limLow: doLimlow,
  limUpp: doLimupp,
  lim: doLim,
  // arrays, matrices, n-ary operators
  eqArr: doEqarr,
  m: doM,
  mr: doMr,
  nary: doNary
};
911
/**
 * Convert a parsed "oMath" element to LaTeX by converting and concatenating
 * its children.
 */
function oMathToLatex(element) {
  const latex = processChildren(element);
  return latex;
}
914
+
915
+ // src/converter-utils/docx/pre-process.ts
916
// The OMML namespace URI wrapped in braces, as it prefixes resolved tag names.
var OMML_NS_BRACE2 = "{" + OMML_NS + "}";
917
// Opening <w:document> tag declaring the namespace prefixes, so that an
// extracted math fragment can be parsed on its own.
var MATH_ROOT_TEMPLATE_PARTS = [
  "<w:document ",
  'xmlns:wpc="http://schemas.microsoft.com/office/word/2010/wordprocessingCanvas" ',
  'xmlns:mc="http://schemas.openxmlformats.org/markup-compatibility/2006" ',
  'xmlns:o="urn:schemas-microsoft-com:office:office" ',
  'xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships" ',
  'xmlns:m="http://schemas.openxmlformats.org/officeDocument/2006/math" ',
  'xmlns:v="urn:schemas-microsoft-com:vml" ',
  'xmlns:wp14="http://schemas.microsoft.com/office/word/2010/wordprocessingDrawing" ',
  'xmlns:wp="http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing" ',
  'xmlns:w10="urn:schemas-microsoft-com:office:word" ',
  'xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main" ',
  'xmlns:w14="http://schemas.microsoft.com/office/word/2010/wordml" ',
  'xmlns:wpg="http://schemas.microsoft.com/office/word/2010/wordprocessingGroup" ',
  'xmlns:wpi="http://schemas.microsoft.com/office/word/2010/wordprocessingInk" ',
  'xmlns:wne="http://schemas.microsoft.com/office/word/2006/wordml" ',
  'xmlns:wps="http://schemas.microsoft.com/office/word/2010/wordprocessingShape" ',
  'mc:Ignorable="w14 wp14">'
];
/**
 * Wrap an XML fragment in a <w:document> root that declares the namespace
 * prefixes listed above.
 *
 * @param content XML fragment to wrap.
 * @returns The complete document text.
 */
function mathRootTemplate(content) {
  const openingTag = MATH_ROOT_TEMPLATE_PARTS.join("");
  return openingTag + content + "</w:document>";
}
939
/**
 * Convert the XML of a single <m:oMath> element to LaTeX.
 *
 * The fragment is wrapped in a namespace-declaring root, parsed, and the
 * first "oMath" child of that root is converted.
 *
 * @param omathXml XML text of the <m:oMath> element.
 * @returns The LaTeX text, or "" when no "oMath" element is found.
 */
function convertOmathToLatex(omathXml) {
  const root = parseOmmlXml(mathRootTemplate(omathXml));
  const [oMathNode] = findAll(root, `${OMML_NS_BRACE2}oMath`);
  return oMathNode ? oMathToLatex(oMathNode) : "";
}
947
/**
 * Replace Office math in WordprocessingML text with LaTeX text runs.
 *
 * - Each <m:oMathPara> becomes a paragraph holding one "$$...$$" run per
 *   contained <m:oMath>.
 * - Every remaining <m:oMath> becomes an inline "$...$" run.
 *
 * Math that fails to convert is left in place unchanged.
 *
 * @param content XML text of a document part.
 * @returns The XML text with math replaced.
 */
function preProcessMath(content) {
  // The LaTeX is spliced back into XML, so it must be valid character data.
  // Entities are left undecoded when the math is parsed, so references that
  // are already present (e.g. "&amp;") are kept as they are; only bare
  // ampersands, such as the column separator of a matrix, and angle brackets
  // are escaped.
  const escapeXmlText = (latex) => latex
    .replace(/&(?!(?:[A-Za-z][A-Za-z0-9]*|#[0-9]+|#x[0-9A-Fa-f]+);)/g, "&amp;")
    .replace(/</g, "&lt;")
    .replace(/>/g, "&gt;");
  content = content.replace(/<m:oMathPara\b[^>]*>([\s\S]*?)<\/m:oMathPara>/g, (_match, inner) => {
    const oMathRegex = /<m:oMath\b[^>]*>[\s\S]*?<\/m:oMath>/g;
    const parts = [];
    let oMathMatch;
    while ((oMathMatch = oMathRegex.exec(inner)) !== null) {
      try {
        const latex = escapeXmlText(convertOmathToLatex(oMathMatch[0]));
        parts.push(`<w:r><w:t>$$${latex}$$</w:t></w:r>`);
      } catch {
        // One failing formula keeps the whole math paragraph as it was.
        return _match;
      }
    }
    if (parts.length === 0)
      return _match;
    return `<w:p>${parts.join("")}</w:p>`;
  });
  content = content.replace(/<m:oMath\b[^>]*>([\s\S]*?)<\/m:oMath>/g, (match) => {
    try {
      const latex = escapeXmlText(convertOmathToLatex(match));
      return `<w:r><w:t>$${latex}$</w:t></w:r>`;
    } catch {
      return match;
    }
  });
  return content;
}
974
// Archive members whose XML is scanned for math.
var PRE_PROCESS_FILES = ["word/document.xml", "word/footnotes.xml", "word/endnotes.xml"];
/**
 * Produce a copy of a .docx archive in which the math of the main document,
 * footnotes and endnotes has been replaced by LaTeX text.
 *
 * All other members are copied unchanged. A listed member that cannot be
 * processed is copied unchanged as well.
 *
 * @param inputBuffer Contents of the .docx file.
 * @returns A Node.js Buffer holding the rewritten archive.
 */
async function preProcessDocx(inputBuffer) {
  const source = await JSZip.loadAsync(inputBuffer);
  const rebuilt = new JSZip();
  const copyRaw = async (name, entry) => {
    rebuilt.file(name, await entry.async("uint8array"));
  };
  for (const name of Object.keys(source.files)) {
    const entry = source.files[name];
    if (entry.dir) {
      rebuilt.folder(name);
      continue;
    }
    if (!PRE_PROCESS_FILES.includes(name)) {
      await copyRaw(name, entry);
      continue;
    }
    try {
      const xml = await entry.async("string");
      rebuilt.file(name, preProcessMath(xml));
    } catch {
      // Fall back to the untouched bytes.
      await copyRaw(name, entry);
    }
  }
  return rebuilt.generateAsync({ type: "nodebuffer" });
}
1000
+
1001
+ // src/transforms/html-to-markdown.ts
1002
+ import * as cheerio2 from "cheerio";
1003
+ import TurndownService2 from "turndown";
1004
+ import { tables } from "turndown-plugin-gfm";
1005
/**
 * Builds the Turndown instance used for HTML -> Markdown conversion.
 *
 * The service uses ATX headings, fenced code blocks and "*" for both bullets
 * and emphasis, enables the `tables` plugin, and registers custom rules for
 * links, images, list items, definition lists and checkbox inputs.
 *
 * @param opts Optional settings; only `opts.keepDataUris` is read here (by the
 *             image rule, to decide whether `data:` URIs are kept in full).
 * @returns The configured Turndown service.
 */
+ function createTurndownService(opts) {
1006
+ const td = new TurndownService2({
1007
+ headingStyle: "atx",
1008
+ codeBlockStyle: "fenced",
1009
+ bulletListMarker: "*",
1010
+ emDelimiter: "*"
1011
+ });
1012
+ td.use(tables);
1013
+ td.addRule("links", { // anchors: drop empty ones, unwrap unusual schemes, else emit [text](href "title")
1014
+ filter: "a",
1015
+ replacement(content, node) {
1016
+ const el = node;
1017
+ const href = el.getAttribute("href") || "";
1018
+ const title = el.getAttribute("title") || "";
1019
+ if (!content.trim()) // an anchor with no visible text produces no output at all
1020
+ return "";
1021
+ if (href) {
1022
+ try {
1023
+ const url = new URL(href, "http://placeholder.invalid"); // base URL lets relative hrefs parse
1024
+ const scheme = url.protocol.replace(":", "").toLowerCase();
1025
+ if (scheme && !["http", "https", "file"].includes(scheme)) {
1026
+ if (href.includes(":") && !href.startsWith("/") && !href.startsWith("#")) {
1027
+ return content; // scheme other than http/https/file: keep only the link text
1028
+ }
1029
+ }
1030
+ } catch {} // an unparsable href falls through and is emitted as written
1031
+ }
1032
+ const titlePart = title ? ` "${title.replace(/"/g, "\\\"")}"` : "";
1033
+ return href ? `[${content}](${href}${titlePart})` : content;
1034
+ }
1035
+ });
1036
+ td.addRule("images", { // images: ![alt](src "title"), with data: URIs shortened by default
1037
+ filter: "img",
1038
+ replacement(_content, node) {
1039
+ const el = node;
1040
+ const alt = (el.getAttribute("alt") || "").replace(/\n/g, " ");
1041
+ let src = el.getAttribute("src") || el.getAttribute("data-src") || "";
1042
+ const title = el.getAttribute("title") || "";
1043
+ if (src.startsWith("data:") && !opts?.keepDataUris) {
1044
+ src = `${src.split(",")[0]}...`;
1045
+ }
1046
+ const titlePart = title ? ` "${title.replace(/"/g, "\\\"")}"` : "";
1047
+ return `![${alt}](${src}${titlePart})`;
1048
+ }
1049
+ });
1050
+ td.addRule("listItem", { // list items: "N. " for <ol> children, bullet marker otherwise
1051
+ filter: "li",
1052
+ replacement(content, node, options) {
1053
+ content = content.replace(/^\n+/, "").replace(/\n+$/, `
1054
+ `).replace(/\n/gm, `
1055
+ `);
1056
+ const parent = node.parentNode;
1057
+ const isOrdered = parent?.nodeName === "OL";
1058
+ let prefix;
1059
+ if (isOrdered) {
1060
+ const start = parent.getAttribute("start");
1061
+ const index = Array.prototype.indexOf.call(parent.children, node);
1062
+ const num = (start ? parseInt(start, 10) : 1) + index; // numbering honours the <ol start="..."> attribute
1063
+ prefix = `${num}. `;
1064
+ } else {
1065
+ prefix = `${options.bulletListMarker} `;
1066
+ }
1067
+ return prefix + content + (node.nextSibling ? `
1068
+ ` : "");
1069
+ }
1070
+ });
1071
+ td.addRule("definitionList", { // <dl>: content surrounded by line breaks
1072
+ filter: "dl",
1073
+ replacement(content) {
1074
+ return `
1075
+
1076
+ ${content}
1077
+
1078
+ `;
1079
+ }
1080
+ });
1081
+ td.addRule("definitionTerm", { // <dt>: term on its own line
1082
+ filter: "dt",
1083
+ replacement(content) {
1084
+ return `
1085
+ ${content}
1086
+ `;
1087
+ }
1088
+ });
1089
+ td.addRule("definitionDescription", { // <dd>: ": description" line
1090
+ filter: "dd",
1091
+ replacement(content) {
1092
+ return `: ${content.trim()}
1093
+ `;
1094
+ }
1095
+ });
1096
+ td.addRule("checkboxes", { // <input type="checkbox">: "[x] " when checked, "[ ] " otherwise
1097
+ filter(node) {
1098
+ return node.nodeName === "INPUT" && node.getAttribute("type") === "checkbox";
1099
+ },
1100
+ replacement(_content, node) {
1101
+ const el = node;
1102
+ return el.hasAttribute("checked") ? "[x] " : "[ ] ";
1103
+ }
1104
+ });
1105
+ return td;
1106
+ }
1107
/**
 * Converts an HTML document or fragment to Markdown.
 *
 * Before Turndown runs, the DOM is normalised:
 *  - `<script>` and `<style>` elements are removed;
 *  - `<p>` elements that are direct children of table cells are replaced by
 *    their inner HTML followed by a space, and cell HTML is trimmed;
 *  - a table containing no `<thead>` gets one built from its first row, with
 *    that row's `<td>` cells converted to `<th>`; rows sitting directly under
 *    the table are wrapped in a new `<tbody>` when the table has none.
 *
 * @param html HTML text.
 * @param opts Options forwarded to `createTurndownService`.
 * @returns `{ markdown, title }`: the trimmed Markdown and the text of the
 *          first `<title>` element (undefined when it is empty or missing).
 */
function htmlToMarkdown(html, opts) {
  const $ = cheerio2.load(html);
  $("script, style").remove();
  const title = $("title").first().text() || undefined;

  // Flatten paragraph wrappers inside table cells.
  $("td, th").each((_cellIndex, cellEl) => {
    const $cell = $(cellEl);
    $cell.find("> p").each((_paraIndex, paraEl) => {
      const $para = $(paraEl);
      $para.replaceWith(`${$para.html()} `);
    });
    $cell.html(($cell.html() || "").trim());
  });

  // Promote the first row of header-less tables to a <thead>.
  $("table").each((_tableIndex, tableEl) => {
    const $table = $(tableEl);
    if ($table.find("thead").length) {
      return;
    }
    const $headerRow = $table.find("> tr, > tbody > tr").first();
    if (!$headerRow.length) {
      return;
    }

    $headerRow.find("> td").each((_cellIndex, cellEl) => {
      const $cell = $(cellEl);
      const $headerCell = $("<th>").html($cell.html() || "");
      for (const [attrName, attrValue] of Object.entries(cellEl.attribs || {})) {
        $headerCell.attr(attrName, attrValue);
      }
      $cell.replaceWith($headerCell);
    });

    const $thead = $("<thead>").append($headerRow.clone());
    $headerRow.remove();

    if (!$table.find("> tbody").length) {
      const $looseRows = $table.find("> tr");
      if ($looseRows.length) {
        const $tbody = $("<tbody>");
        $looseRows.each((_rowIndex, rowEl) => {
          $tbody.append($(rowEl));
        });
        $table.append($tbody);
      }
    }
    $table.prepend($thead);
  });

  const $body = $("body");
  const processedHtml = $body.length > 0 ? $body.html() || "" : $.html() || "";
  const markdown = createTurndownService(opts).turndown(processedHtml).trim();
  return { markdown, title };
}
1156
+
1157
+ // src/converters/docx.ts
1158
var ACCEPTED_EXTENSIONS4 = [".docx"];
var ACCEPTED_MIME_PREFIXES4 = ["application/vnd.openxmlformats-officedocument.wordprocessingml.document"];
/**
 * DOCX converter: pre-processes math, converts the document to HTML with
 * mammoth, then converts that HTML to Markdown.
 *
 * If math pre-processing fails, the original buffer is converted instead.
 * After conversion, doubled backslashes inside `$…$` / `$$…$$` spans are
 * collapsed to single backslashes.
 */
var docxConverter = converter("DOCX", anyOf(byExt(...ACCEPTED_EXTENSIONS4), byMime(...ACCEPTED_MIME_PREFIXES4)), async (ctx) => {
  let buffer;
  try {
    buffer = await preProcessDocx(ctx.buffer);
  } catch {
    buffer = ctx.buffer;
  }

  const mammothOptions = { styleMap: ctx.opts.styleMap ? [ctx.opts.styleMap] : undefined };
  const { value: html } = await mammoth.convertToHtml({ buffer }, mammothOptions);
  const converted = htmlToMarkdown(html, ctx.opts);

  // Rebuilds one delimited math span with `\\` collapsed to `\` in its body.
  const collapseBackslashes = (_whole, delim, body) => delim + body.replace(/\\\\/g, "\\") + delim;
  const markdown = converted.markdown.replace(/(\${1,2})((?:(?!\1).)+)\1/g, collapseBackslashes);
  return { markdown, title: converted.title };
});
1172
+ // src/converters/epub.ts
1173
var ACCEPTED_EXTENSIONS5 = [".epub"];
var ACCEPTED_MIME_PREFIXES5 = ["application/epub", "application/epub+zip", "application/x-epub+zip"];
/**
 * EPUB converter.
 *
 * Reads `META-INF/container.xml` to locate the package (OPF) document, reads
 * Dublin Core metadata, the manifest and the spine from it, converts each
 * spine document to Markdown in spine order, and prepends a block of
 * `**Key:** value` metadata lines.
 *
 * Throws an Error when the container file, the OPF path, or the OPF file
 * itself is missing. Spine entries that cannot be found in the archive are
 * skipped.
 */
var epubConverter = converter("EPUB", anyOf(byExt(...ACCEPTED_EXTENSIONS5), byMime(...ACCEPTED_MIME_PREFIXES5)), async (ctx) => {
  const JSZip2 = (await import("jszip")).default;
  const { XMLParser } = await import("fast-xml-parser");
  const zip = await JSZip2.loadAsync(ctx.buffer);

  const containerXml = await zip.file("META-INF/container.xml")?.async("text");
  if (!containerXml)
    throw new Error("Invalid EPUB: missing META-INF/container.xml");
  const containerParser = new XMLParser({ ignoreAttributes: false });
  const container = containerParser.parse(containerXml);
  const rootfiles = container?.container?.rootfiles?.rootfile;
  const rootfile = Array.isArray(rootfiles) ? rootfiles[0] : rootfiles;
  const opfPath = rootfile?.["@_full-path"];
  if (!opfPath)
    throw new Error("Invalid EPUB: cannot find content.opf path");

  const opfXml = await zip.file(opfPath)?.async("text");
  if (!opfXml)
    throw new Error(`Invalid EPUB: missing ${opfPath}`);
  const opfParser = new XMLParser({ ignoreAttributes: false });
  const opf = opfParser.parse(opfXml);
  const pkg = opf?.package;
  const meta = pkg?.metadata ?? {};
  const metadata = {
    title: extractText(meta, "dc:title"),
    authors: extractAllTexts(meta, "dc:creator").join(", ") || null,
    language: extractText(meta, "dc:language"),
    publisher: extractText(meta, "dc:publisher"),
    date: extractText(meta, "dc:date"),
    description: extractText(meta, "dc:description"),
    identifier: extractText(meta, "dc:identifier")
  };

  // Manifest: item id -> href (relative to the OPF document).
  const manifest = new Map();
  for (const item of asArray(pkg?.manifest?.item)) {
    const id = item["@_id"];
    const href = item["@_href"];
    if (id && href)
      manifest.set(id, href);
  }

  const spineOrder = asArray(pkg?.spine?.itemref).map((item) => item["@_idref"]).filter(Boolean);
  const basePath = opfPath.includes("/") ? opfPath.split("/").slice(0, -1).join("/") : "";
  const spineFiles = spineOrder.map((id) => {
    const href = manifest.get(id);
    if (!href)
      return null;
    let decoded;
    try {
      decoded = decodeURIComponent(href);
    } catch {
      // Malformed percent-encoding: use the href as written.
      decoded = href;
    }
    const joined = basePath ? `${basePath}/${decoded}` : decoded;
    // Manifest hrefs are relative references and may contain "." or ".."
    // segments; archive entry names do not, so resolve them before lookup.
    return path.posix.normalize(joined);
  }).filter(Boolean);

  const markdownParts = [];
  for (const file of spineFiles) {
    const zipFile = zip.file(file);
    if (!zipFile)
      continue;
    const htmlBuffer = Buffer.from(await zipFile.async("nodebuffer"));
    const htmlText = decodeBuffer(htmlBuffer, "utf-8");
    const { markdown } = htmlToMarkdown(htmlText, ctx.opts);
    const trimmed = markdown.trim();
    if (trimmed)
      markdownParts.push(trimmed);
  }

  const metadataLines = [];
  for (const [key, value] of Object.entries(metadata)) {
    if (value) {
      metadataLines.push(`**${key.charAt(0).toUpperCase() + key.slice(1)}:** ${value}`);
    }
  }
  if (metadataLines.length > 0) {
    markdownParts.unshift(metadataLines.join("\n"));
  }

  return {
    markdown: markdownParts.join("\n\n"),
    title: metadata.title ?? undefined
  };
});
1257
/**
 * Reads the text of a single metadata element from a parsed XML object.
 *
 * The value under `tag` may be a primitive, an object carrying its text under
 * "#text" (an element that also has attributes), or an array of either, in
 * which case the first entry is used. Numbers and booleans are returned as
 * strings, because the XML parser may have coerced text such as "2020" into a
 * number.
 *
 * @param meta Parsed metadata object (may be null/undefined).
 * @param tag  Property name to read, e.g. "dc:title".
 * @returns The text, or null when the tag is absent, empty, or holds no text.
 */
function extractText(meta, tag) {
  // Text of one parsed node, or null when it carries none.
  const toText = (node) => {
    const raw = node !== null && typeof node === "object" && "#text" in node ? node["#text"] : node;
    if (typeof raw === "string")
      return raw;
    if (typeof raw === "number" || typeof raw === "boolean")
      return String(raw);
    return null;
  };

  const val = meta?.[tag];
  if (val === undefined || val === null || val === "")
    return null;
  return toText(Array.isArray(val) ? val[0] : val);
}
1274
/**
 * Reads the text of every occurrence of a metadata element from a parsed XML
 * object.
 *
 * Each occurrence may be a primitive or an object carrying its text under
 * "#text". Numbers and booleans are returned as strings, because the XML
 * parser may have coerced numeric-looking text into a number. Occurrences
 * with no text (or empty text) are omitted.
 *
 * @param meta Parsed metadata object (may be null/undefined).
 * @param tag  Property name to read, e.g. "dc:creator".
 * @returns The texts in document order; an empty array when there are none.
 */
function extractAllTexts(meta, tag) {
  // Text of one parsed node, or null when it carries none.
  const toText = (node) => {
    const raw = node !== null && typeof node === "object" && "#text" in node ? node["#text"] : node;
    if (typeof raw === "string")
      return raw;
    if (typeof raw === "number" || typeof raw === "boolean")
      return String(raw);
    return null;
  };

  const val = meta?.[tag];
  if (val === undefined || val === null)
    return [];
  const occurrences = Array.isArray(val) ? val : [val];
  return occurrences.map(toText).filter(Boolean);
}
1287
/**
 * Normalises a "one or many" value to an array.
 *
 * @param val A single value, an array, or a falsy value.
 * @returns `val` itself when it is an array, `[val]` when it is any other
 *          truthy value, and `[]` when it is falsy.
 */
function asArray(val) {
  if (Array.isArray(val)) {
    return val;
  }
  return val ? [val] : [];
}
1292
+ // src/converters/html.ts
1293
var ACCEPTED_EXTENSIONS6 = [".html", ".htm"];
var ACCEPTED_MIME_PREFIXES6 = ["text/html", "application/xhtml"];
/**
 * HTML converter: decodes the buffer using the detected charset (UTF-8 when
 * none is known) and converts the resulting HTML to Markdown.
 */
var htmlConverter = converter("HTML", anyOf(byExt(...ACCEPTED_EXTENSIONS6), byMime(...ACCEPTED_MIME_PREFIXES6)), async (ctx) => {
  const html = decodeBuffer(ctx.buffer, ctx.info.charset ?? "utf-8");
  const converted = htmlToMarkdown(html, ctx.opts);
  return { markdown: converted.markdown, title: converted.title };
});
1301
+ // src/converters/image.ts
1302
var ACCEPTED_EXTENSIONS7 = [".jpg", ".jpeg", ".png"];
var ACCEPTED_MIME_PREFIXES7 = ["image/jpeg", "image/png"];
// Metadata fields reported for images, in output order.
var IMAGE_METADATA_FIELDS = [
  "ImageSize",
  "Title",
  "Caption",
  "Description",
  "Keywords",
  "Artist",
  "Author",
  "DateTimeOriginal",
  "CreateDate",
  "GPSPosition"
];
/**
 * Image converter.
 *
 * Emits an "Image Metadata" section listing whichever IMAGE_METADATA_FIELDS
 * are present in the exiftool output. When both `llmClient` and `llmModel`
 * options are set, the image is additionally sent as a base64 data URI to the
 * client's chat-completions endpoint and the reply is appended as a
 * "Description" section; a failing request is ignored.
 */
var imageConverter = converter("Image", anyOf(byExt(...ACCEPTED_EXTENSIONS7), byMime(...ACCEPTED_MIME_PREFIXES7)), async (ctx) => {
  const metadata = await exiftoolMetadata(ctx.buffer, ctx.opts.exiftoolPath);
  const metaLines = IMAGE_METADATA_FIELDS
    .filter((field) => metadata[field])
    .map((field) => `- **${field}:** ${metadata[field]}`);

  let md = metaLines.length ? `# Image Metadata\n\n${metaLines.join("\n")}\n` : "";

  if (!(ctx.opts.llmClient && ctx.opts.llmModel)) {
    return { markdown: md };
  }

  const extMimeMap = {
    ".jpg": "image/jpeg",
    ".jpeg": "image/jpeg",
    ".png": "image/png"
  };
  const mimeFromExtension = ctx.info.extension ? extMimeMap[ctx.info.extension] : undefined;
  const contentType = ctx.info.mimetype || mimeFromExtension || "application/octet-stream";
  const dataUri = `data:${contentType};base64,${ctx.buffer.toString("base64")}`;
  const prompt = ctx.opts.llmPrompt?.trim() || "Write a detailed caption for this image.";

  try {
    const response = await ctx.opts.llmClient.chat.completions.create({
      model: ctx.opts.llmModel,
      messages: [
        {
          role: "user",
          content: [
            { type: "text", text: prompt },
            { type: "image_url", image_url: { url: dataUri } }
          ]
        }
      ]
    });
    const description = response.choices?.[0]?.message?.content;
    if (description) {
      md += `\n# Description:\n${description.trim()}\n`;
    }
  } catch {
    // Captioning is best-effort: the metadata section is still returned.
  }
  return { markdown: md };
});
1366
+ // src/exceptions.ts
1367
/** Base class for every error raised by this library. */
class MarkItDownError extends Error {
  name = "MarkItDownError";

  constructor(message) {
    super(message);
  }
}

/** Signals that something the conversion depends on is not available. */
class MissingDependencyError extends MarkItDownError {
  name = "MissingDependencyError";

  constructor(message) {
    super(message);
  }
}

/** Signals that the input format is not supported. */
class UnsupportedFormatError extends MarkItDownError {
  name = "UnsupportedFormatError";

  constructor(message) {
    super(message);
  }
}
1387
var SUPPORTED_FORMATS_LIST = "pdf, docx, pptx, xlsx, xls, html, csv, epub, zip, msg, json, ipynb, jpg, png, mp3, wav";
/**
 * Picks a hint to show alongside an error.
 *
 * Library error classes are checked first (unsupported format, failed
 * conversion, missing dependency, in that order); otherwise the `code`
 * property of an error object is matched against ENOENT / EACCES / EPERM.
 *
 * @param error Any thrown value.
 * @returns The hint text, or undefined when no hint applies.
 */
function getSuggestion(error) {
  // Built per call: the classes are only resolved once the function runs.
  const hintsByErrorType = [
    {
      type: UnsupportedFormatError,
      hint: `Supported formats: ${SUPPORTED_FORMATS_LIST}. Use --describe for full details.`
    },
    {
      type: FileConversionError,
      hint: "The file was recognized but conversion failed. Ensure the file is not corrupted."
    },
    {
      type: MissingDependencyError,
      hint: "A required dependency is missing. Check the installation."
    }
  ];
  const matched = hintsByErrorType.find(({ type }) => error instanceof type);
  if (matched) {
    return matched.hint;
  }

  const hasCode = Boolean(error) && typeof error === "object" && "code" in error;
  switch (hasCode ? error.code : undefined) {
    case "ENOENT":
      return "Check the file path and try again.";
    case "EACCES":
    case "EPERM":
      return "Check file permissions and try again.";
    default:
      return undefined;
  }
}
1407
+
1408
/**
 * Raised when a recognised file could not be converted.
 *
 * When no message is given but a list of attempts is, the message is built
 * from the attempts: a summary line followed by one line per attempt naming
 * the converter and, if available, the error it threw.
 */
class FileConversionError extends MarkItDownError {
  // The attempts passed to the constructor (undefined when none were given).
  attempts;

  constructor(message, attempts) {
    let text = message;
    if (!text && attempts) {
      const details = Array.from(attempts, (attempt) =>
        attempt.error
          ? ` - ${attempt.converterName} threw ${attempt.error.name}: ${attempt.error.message}\n`
          : ` - ${attempt.converterName} provided no error info.\n`
      );
      text = `File conversion failed after ${attempts.length} attempts:\n${details.join("")}`;
    }
    super(text ?? "File conversion failed.");
    this.name = "FileConversionError";
    this.attempts = attempts;
  }
}
1429
+
1430
+ // src/converters/ipynb.ts
1431
var ACCEPTED_EXTENSIONS8 = [".ipynb"];
/**
 * Cheap content sniff for Jupyter notebooks: the decoded text must contain
 * both "nbformat" and "nbformat_minor".
 *
 * @param buffer  Raw file contents.
 * @param charset Charset to decode with; UTF-8 when null/undefined.
 * @returns true when both markers are present; false otherwise, including
 *          when decoding throws.
 */
function looksLikeNotebook(buffer, charset) {
  try {
    const decoded = decodeBuffer(buffer, charset ?? "utf-8");
    return ["nbformat", "nbformat_minor"].every((marker) => decoded.includes(marker));
  } catch {
    return false;
  }
}
1440
/**
 * Jupyter notebook converter.
 *
 * Accepts `.ipynb` files, or `application/json` content that looks like a
 * notebook. Markdown cells are emitted as-is, code cells inside a
 * ```python fence, raw cells inside a plain fence; cells are separated by a
 * blank line. The title is `metadata.title` when present, otherwise the text
 * of the first line starting with "# " in a markdown cell.
 *
 * A cell's `source` may be either a list of strings or one plain string (the
 * notebook format allows both); both forms are handled.
 *
 * Throws FileConversionError when the notebook cannot be parsed or converted.
 */
var ipynbConverter = converter("Ipynb", anyOf(byExt(...ACCEPTED_EXTENSIONS8), (ctx) => {
  const mime = (ctx.info.mimetype ?? "").toLowerCase();
  if (!mime.startsWith("application/json"))
    return false;
  return looksLikeNotebook(ctx.buffer, ctx.info.charset);
}), async (ctx) => {
  try {
    const text = decodeBuffer(ctx.buffer, ctx.info.charset ?? "utf-8");
    const notebook = JSON.parse(text);
    const mdOutput = [];
    let title;
    for (const cell of notebook.cells ?? []) {
      const cellType = cell.cell_type ?? "";
      const rawSource = cell.source ?? [];
      // `source` is a multi-line string: either a list of lines or one string.
      const source = Array.isArray(rawSource) ? rawSource.join("") : String(rawSource);
      const sourceLines = Array.isArray(rawSource) ? rawSource : source.split("\n");
      if (cellType === "markdown") {
        mdOutput.push(source);
        if (title === undefined) {
          for (const line of sourceLines) {
            if (line.startsWith("# ")) {
              title = line.replace(/^#+\s*/, "").trim();
              break;
            }
          }
        }
      } else if (cellType === "code") {
        mdOutput.push(`\`\`\`python\n${source}\n\`\`\``);
      } else if (cellType === "raw") {
        mdOutput.push(`\`\`\`\n${source}\n\`\`\``);
      }
    }
    const mdText = mdOutput.join("\n\n");
    // An explicit notebook-level title wins over the first heading.
    const metadataTitle = notebook.metadata?.title;
    if (metadataTitle) {
      title = metadataTitle;
    }
    return { markdown: mdText, title };
  } catch (e) {
    throw new FileConversionError(`Error converting .ipynb file: ${e instanceof Error ? e.message : String(e)}`);
  }
});
1487
+ // src/converters/outlook-msg.ts
1488
var ACCEPTED_EXTENSIONS9 = [".msg"];
var ACCEPTED_MIME_PREFIXES8 = ["application/vnd.ms-outlook"];
// Names of the compound-file streams read for each message field.
var STREAM_FROM = "__substg1.0_0C1F001F";
var STREAM_TO = "__substg1.0_0E04001F";
var STREAM_SUBJECT = "__substg1.0_0037001F";
var STREAM_BODY = "__substg1.0_1000001F";
/**
 * Outlook .msg converter.
 *
 * Opens the file as a compound file, reads the sender, recipient, subject and
 * body streams, and renders them as an "Email Message" Markdown document. The
 * subject doubles as the title.
 */
var outlookMsgConverter = converter("OutlookMSG", anyOf(byExt(...ACCEPTED_EXTENSIONS9), byMime(...ACCEPTED_MIME_PREFIXES8)), async (ctx) => {
  const CFB = await import("cfb");
  const cfb = CFB.read(ctx.buffer, { type: "buffer" });

  // Returns the trimmed text of the first candidate path that has content,
  // or null when none does. UTF-16LE is tried first; an empty result or a
  // decoding failure falls back to UTF-8.
  const readStream = (streamPath) => {
    const candidates = [`/${streamPath}`, streamPath, `/Root Entry/${streamPath}`];
    for (const candidate of candidates) {
      const bytes = CFB.find(cfb, candidate)?.content;
      if (!bytes) {
        continue;
      }
      try {
        const utf16 = new TextDecoder("utf-16le").decode(bytes).trim();
        if (utf16) {
          return utf16;
        }
      } catch {}
      try {
        return new TextDecoder("utf-8").decode(bytes).trim();
      } catch {
        return new TextDecoder("utf-8", { fatal: false }).decode(bytes).trim();
      }
    }
    return null;
  };

  const from = readStream(STREAM_FROM);
  const to = readStream(STREAM_TO);
  const subject = readStream(STREAM_SUBJECT);
  const body = readStream(STREAM_BODY);

  let md = "# Email Message\n\n";
  const headers = [
    ["From", from],
    ["To", to],
    ["Subject", subject]
  ];
  for (const [label, value] of headers) {
    if (value) {
      md += `**${label}:** ${value}\n`;
    }
  }
  md += "\n## Content\n\n";
  if (body) {
    md += body;
  }

  return {
    markdown: md.trim(),
    title: subject ?? undefined
  };
});
1544
+ // src/converters/pdf.ts
1545
+ import fs from "node:fs";
1546
+ import path from "node:path";
1547
var ACCEPTED_EXTENSIONS10 = [".pdf"];
var ACCEPTED_MIME_PREFIXES9 = ["application/pdf", "application/x-pdf"];
// Matches a token consisting of a dot followed by digits only, e.g. ".1".
var PARTIAL_NUMBERING_PATTERN = /^\.\d+$/;
/**
 * Flattens an extracted word to `{ text, x0, top, x1, bottom }`.
 *
 * @param w A word whose coordinates live either in a nested `bbox` object or
 *          directly on the word itself.
 * @returns A new object with the text and the four coordinates.
 */
function normalizeWord(w) {
  const box = w.bbox ? w.bbox : w;
  return {
    text: w.text,
    x0: box.x0,
    top: box.top,
    x1: box.x1,
    bottom: box.bottom
  };
}
1562
/**
 * Joins a line that holds only a partial section number (e.g. ".1") with the
 * next non-blank line.
 *
 * The merged line is the trimmed number, a space, and the trimmed following
 * line; blank lines between the two are dropped. A partial number with no
 * non-blank line after it is kept unchanged.
 *
 * @param text Newline-separated text.
 * @returns The text with such lines merged.
 */
function mergePartialNumberingLines(text) {
  const lines = text.split("\n");
  const merged = [];
  for (let cursor = 0; cursor < lines.length; cursor += 1) {
    const marker = lines[cursor].trim();
    if (PARTIAL_NUMBERING_PATTERN.test(marker)) {
      // Look ahead for the line this number belongs to.
      let next = cursor + 1;
      while (next < lines.length && lines[next].trim() === "") {
        next += 1;
      }
      if (next < lines.length) {
        merged.push(`${marker} ${lines[next].trim()}`);
        cursor = next;
        continue;
      }
    }
    merged.push(lines[cursor]);
  }
  return merged.join("\n");
}
1588
/**
 * Heuristically reconstructs table-like page content from positioned words.
 *
 * Steps:
 *  1. Bucket words into rows by rounding `top` to a multiple of yTolerance.
 *  2. Per row, derive column start positions (x0 values more than 50 apart),
 *     a paragraph flag and a partial-numbering flag.
 *  3. Pool the column starts of rows with at least 3 columns that are not
 *     paragraphs, and cluster them into page-wide columns using a tolerance
 *     derived from the gaps between them.
 *  4. Give up (return null) when the resulting columns look implausible.
 *  5. Flag rows whose words line up with at least 2 columns as table rows and
 *     group consecutive table rows into regions.
 *  6. Render each region as a pipe table (first row as the header) and every
 *     other row as plain text.
 *
 * @param words     Words with `text`, `x0`, `x1` and `top` properties.
 * @param pageWidth Page width, in the same units as the word coordinates.
 * @returns The rendered lines joined with newlines, or null when the page
 *          does not look tabular.
 */
+ function extractFormContentFromWords(words, pageWidth) {
1589
+ if (!words.length)
1590
+ return null;
1591
+ const yTolerance = 5;
1592
+ const rowsByY = new Map;
1593
+ for (const word of words) {
1594
+ const yKey = Math.round(word.top / yTolerance) * yTolerance; // words whose `top` rounds to the same multiple share a row
1595
+ let arr = rowsByY.get(yKey);
1596
+ if (!arr) {
1597
+ arr = [];
1598
+ rowsByY.set(yKey, arr);
1599
+ }
1600
+ arr.push(word);
1601
+ }
1602
+ const sortedYKeys = [...rowsByY.keys()].sort((a, b) => a - b);
1603
+ const rowInfo = [];
1604
+ for (const yKey of sortedYKeys) {
1605
+ const rowWords = rowsByY.get(yKey).sort((a, b) => a.x0 - b.x0);
1606
+ if (!rowWords.length)
1607
+ continue;
1608
+ const firstX0 = rowWords[0].x0;
1609
+ const lastX1 = rowWords[rowWords.length - 1].x1;
1610
+ const lineWidth = lastX1 - firstX0;
1611
+ const combinedText = rowWords.map((w) => w.text).join(" ");
1612
+ const xPositions = rowWords.map((w) => w.x0);
1613
+ const xGroups = [];
1614
+ for (const x of [...xPositions].sort((a, b) => a - b)) {
1615
+ if (!xGroups.length || x - xGroups[xGroups.length - 1] > 50) { // a word starts a new column when it begins more than 50 past the previous column start
1616
+ xGroups.push(x);
1617
+ }
1618
+ }
1619
+ const isParagraph = lineWidth > pageWidth * 0.55 && combinedText.length > 60; // wide and long rows are treated as running text
1620
+ let hasPartialNumbering = false;
1621
+ if (rowWords.length) {
1622
+ const firstWord = rowWords[0].text.trim();
1623
+ if (PARTIAL_NUMBERING_PATTERN.test(firstWord))
1624
+ hasPartialNumbering = true;
1625
+ }
1626
+ rowInfo.push({
1627
+ yKey,
1628
+ words: rowWords,
1629
+ text: combinedText,
1630
+ xGroups,
1631
+ isParagraph,
1632
+ numColumns: xGroups.length,
1633
+ hasPartialNumbering
1634
+ });
1635
+ }
1636
+ const allTableXPositions = [];
1637
+ for (const info of rowInfo) {
1638
+ if (info.numColumns >= 3 && !info.isParagraph) { // only rows with 3+ columns contribute column positions
1639
+ allTableXPositions.push(...info.xGroups);
1640
+ }
1641
+ }
1642
+ if (!allTableXPositions.length)
1643
+ return null;
1644
+ allTableXPositions.sort((a, b) => a - b);
1645
+ const gaps = [];
1646
+ for (let i2 = 0;i2 < allTableXPositions.length - 1; i2++) {
1647
+ const gap = allTableXPositions[i2 + 1] - allTableXPositions[i2];
1648
+ if (gap > 5)
1649
+ gaps.push(gap);
1650
+ }
1651
+ let adaptiveTolerance;
1652
+ if (gaps.length >= 3) {
1653
+ const sortedGaps = [...gaps].sort((a, b) => a - b);
1654
+ const idx2 = Math.floor(sortedGaps.length * 0.7);
1655
+ adaptiveTolerance = Math.max(25, Math.min(50, sortedGaps[idx2])); // gap at the 70% position, clamped to [25, 50]
1656
+ } else {
1657
+ adaptiveTolerance = 35;
1658
+ }
1659
+ const globalColumns = [];
1660
+ for (const x of allTableXPositions) {
1661
+ if (!globalColumns.length || x - globalColumns[globalColumns.length - 1] > adaptiveTolerance) {
1662
+ globalColumns.push(x);
1663
+ }
1664
+ }
1665
+ if (globalColumns.length > 1) { // plausibility checks: any failure means "not a table page"
1666
+ const contentWidth = globalColumns[globalColumns.length - 1] - globalColumns[0];
1667
+ const avgColWidth = contentWidth / globalColumns.length;
1668
+ if (avgColWidth < 30)
1669
+ return null;
1670
+ const columnsPerInch = globalColumns.length / (contentWidth / 72);
1671
+ if (columnsPerInch > 10)
1672
+ return null;
1673
+ const adaptiveMaxColumns = Math.max(15, Math.floor(20 * (pageWidth / 612)));
1674
+ if (globalColumns.length > adaptiveMaxColumns)
1675
+ return null;
1676
+ } else {
1677
+ return null;
1678
+ }
1679
+ const numCols = globalColumns.length;
1680
+ for (const info of rowInfo) {
1681
+ if (info.isParagraph || info.hasPartialNumbering) {
1682
+ info.isTableRow = false;
1683
+ continue;
1684
+ }
1685
+ const alignedColumns = new Set;
1686
+ for (const word of info.words) {
1687
+ for (let colIdx = 0;colIdx < globalColumns.length; colIdx++) {
1688
+ if (Math.abs(word.x0 - globalColumns[colIdx]) < 40) {
1689
+ alignedColumns.add(colIdx);
1690
+ break;
1691
+ }
1692
+ }
1693
+ }
1694
+ info.isTableRow = alignedColumns.size >= 2; // a table row has words aligned with at least two columns
1695
+ }
1696
+ const tableRegions = [];
1697
+ let i = 0;
1698
+ while (i < rowInfo.length) {
1699
+ if (rowInfo[i].isTableRow) {
1700
+ const start = i;
1701
+ while (i < rowInfo.length && rowInfo[i].isTableRow)
1702
+ i++;
1703
+ tableRegions.push([start, i]); // half-open [start, end) run of consecutive table rows
1704
+ } else {
1705
+ i++;
1706
+ }
1707
+ }
1708
+ const totalTableRows = tableRegions.reduce((sum, [s, e]) => sum + (e - s), 0);
1709
+ if (rowInfo.length > 0 && totalTableRows / rowInfo.length < 0.2) // fewer than 20% table rows: treat the page as plain text
1710
+ return null;
1711
+ function extractCells(info) { // distributes a row's words over the page-wide columns
1712
+ const cells = Array(numCols).fill("");
1713
+ for (const word of info.words) {
1714
+ let assignedCol = numCols - 1;
1715
+ for (let colIdx = 0;colIdx < numCols - 1; colIdx++) {
1716
+ if (word.x0 < globalColumns[colIdx + 1] - 20) {
1717
+ assignedCol = colIdx;
1718
+ break;
1719
+ }
1720
+ }
1721
+ cells[assignedCol] = cells[assignedCol] ? `${cells[assignedCol]} ${word.text}` : word.text;
1722
+ }
1723
+ return cells;
1724
+ }
1725
+ const resultLines = [];
1726
+ let idx = 0;
1727
+ while (idx < rowInfo.length) {
1728
+ const region = tableRegions.find(([start]) => start === idx);
1729
+ if (region) {
1730
+ const [start, end] = region;
1731
+ const tableData = [];
1732
+ for (let ti = start;ti < end; ti++) {
1733
+ tableData.push(extractCells(rowInfo[ti]));
1734
+ }
1735
+ if (tableData.length) {
1736
+ const colWidths = Array.from({ length: numCols }, (_, col) => Math.max(3, ...tableData.map((row) => (row[col] || "").length)));
1737
+ const header = tableData[0];
1738
+ resultLines.push(`| ${header.map((cell, ci) => cell.padEnd(colWidths[ci])).join(" | ")} |`);
1739
+ resultLines.push(`| ${colWidths.map((w) => "-".repeat(w)).join(" | ")} |`);
1740
+ for (const row of tableData.slice(1)) {
1741
+ resultLines.push(`| ${row.map((cell, ci) => cell.padEnd(colWidths[ci])).join(" | ")} |`);
1742
+ }
1743
+ }
1744
+ idx = end; // skip past the rows just rendered as a table
1745
+ } else {
1746
+ const inTable = tableRegions.some(([s, e]) => s < idx && idx < e);
1747
+ if (!inTable)
1748
+ resultLines.push(rowInfo[idx].text);
1749
+ idx++;
1750
+ }
1751
+ }
1752
+ return resultLines.join(`
1753
+ `);
1754
+ }
1755
// Memoised result of the first initPdfplumber() call.
var _initPromise = null;
/**
 * Loads and instantiates the pdfplumber WebAssembly module once.
 *
 * The first call starts initialisation and stores its promise; every call
 * returns that same promise.
 *
 * @returns A promise for the module's `WasmPdf` export.
 */
async function initPdfplumber() {
  if (_initPromise) {
    return _initPromise;
  }

  const load = async () => {
    const bgModule = await import("pdfplumber-wasm/pdfplumber_wasm_bg.js");

    // Expose the glue module's __wbg_* / __wbindgen_* functions as wasm imports.
    const glue = {};
    for (const [exportName, exported] of Object.entries(bgModule)) {
      const isGlueName = exportName.startsWith("__wbg_") || exportName.startsWith("__wbindgen_");
      if (isGlueName && typeof exported === "function") {
        glue[exportName] = exported;
      }
    }
    const imports = { "./pdfplumber_wasm_bg.js": glue };

    const packageDir = path.dirname(__require.resolve("pdfplumber-wasm/package.json"));
    const wasmBytes = fs.readFileSync(path.join(packageDir, "pdfplumber_wasm_bg.wasm"));
    const wasmInstance = new WebAssembly.Instance(new WebAssembly.Module(wasmBytes), imports);

    bgModule.__wbg_set_wasm(wasmInstance.exports);
    if (typeof wasmInstance.exports.__wbindgen_start === "function") {
      wasmInstance.exports.__wbindgen_start();
    }
    return bgModule.WasmPdf;
  };

  _initPromise = load();
  return _initPromise;
}
1781
/**
 * PDF converter.
 *
 * Each page is first run through the table heuristic
 * (`extractFormContentFromWords`); pages it rejects contribute their plain
 * extracted text. When plain pages outnumber table pages, or nothing was
 * extracted, the whole document is re-read with `fallbackPdfParse` instead.
 * Any failure while using the wasm reader also ends in that fallback. The
 * final text always passes through `mergePartialNumberingLines`.
 */
var pdfConverter = converter("PDF", anyOf(byExt(...ACCEPTED_EXTENSIONS10), byMime(...ACCEPTED_MIME_PREFIXES9)), async (ctx) => {
  // Result produced from the fallback parser alone.
  const viaFallback = async () => ({
    markdown: mergePartialNumberingLines(await fallbackPdfParse(ctx.buffer))
  });

  try {
    const WasmPdf = await initPdfplumber();
    const pdf = WasmPdf.open(new Uint8Array(ctx.buffer));
    try {
      const chunks = [];
      let formPages = 0;
      let plainPages = 0;

      for (let pageIndex = 0; pageIndex < pdf.pageCount; pageIndex++) {
        const page = pdf.page(pageIndex);
        try {
          const words = page.extractWords(3, 3).map(normalizeWord);
          const pageContent = extractFormContentFromWords(words, page.width || 612);
          if (pageContent === null) {
            plainPages++;
            const text = page.extractText();
            if (text?.trim()) {
              chunks.push(text.trim());
            }
          } else {
            formPages++;
            if (pageContent.trim()) {
              chunks.push(pageContent);
            }
          }
        } finally {
          page.free();
        }
      }

      let markdown = plainPages > formPages && plainPages > 0
        ? await fallbackPdfParse(ctx.buffer)
        : chunks.join("\n\n").trim();
      if (!markdown.trim()) {
        markdown = await fallbackPdfParse(ctx.buffer);
      }
      return { markdown: mergePartialNumberingLines(markdown) };
    } catch {
      // Awaited here so the document is freed only after the fallback settles.
      return await viaFallback();
    } finally {
      pdf.free();
    }
  } catch {
    return await viaFallback();
  }
});
1833
/**
 * Extracts the text of a PDF with the `pdf-parse` package.
 *
 * @param buffer PDF file contents.
 * @returns The extracted text.
 * @throws Whatever the parser throws while loading or reading the document.
 */
async function fallbackPdfParse(buffer) {
  const { PDFParse } = await import("pdf-parse");
  const parser = new PDFParse(new Uint8Array(buffer));
  try {
    await parser.load();
    const result = await parser.getText();
    return result.text;
  } finally {
    // Release the parser's resources even when extraction fails. Guarded
    // because not every pdf-parse release may expose destroy().
    if (typeof parser.destroy === "function") {
      try {
        await parser.destroy();
      } catch {
        // Cleanup is best-effort and must not mask the extraction outcome.
      }
    }
  }
}
1840
+ // src/converters/plain-text.ts
1841
var ACCEPTED_EXTENSIONS11 = [".txt", ".text", ".md", ".markdown", ".json", ".jsonl"];
var ACCEPTED_MIME_PREFIXES10 = ["text/", "application/json", "application/markdown"];
/**
 * Plain-text converter: returns the decoded buffer unchanged as Markdown.
 * Accepts any input with a known charset, a listed extension, or a listed
 * MIME prefix.
 */
var plainTextConverter = converter(
  "PlainText",
  anyOf(hasCharset(), byExt(...ACCEPTED_EXTENSIONS11), byMime(...ACCEPTED_MIME_PREFIXES10)),
  async (ctx) => ({ markdown: decodeBuffer(ctx.buffer, ctx.info.charset) })
);
1847
+ // src/converters/pptx.ts
1848
+ import { XMLParser } from "fast-xml-parser";
1849
+ import JSZip2 from "jszip";
1850
var ACCEPTED_EXTENSIONS12 = [".pptx"];
var ACCEPTED_MIME_PREFIXES11 = ["application/vnd.openxmlformats-officedocument.presentationml"];
/**
 * Makes text safe to place on a single line inside square brackets.
 *
 * @param text Input text.
 * @returns The text with line breaks and square brackets replaced by spaces,
 *          whitespace runs collapsed to one space, and both ends trimmed.
 */
function escapeMarkdown(text) {
  const withoutBreaksOrBrackets = text.replace(/[\r\n[\]]/g, " ");
  const collapsed = withoutBreaksOrBrackets.replace(/\s+/g, " ");
  return collapsed.trim();
}
1855
/**
 * Recursively concatenates the text held by a parsed XML node.
 *
 * Strings are returned unchanged and numbers are stringified; any other
 * non-object yields "". For objects, the node's own "#text" comes first, then
 * the text of its "a:t" child, then the text of every other child that is an
 * object or array. Attribute keys (prefixed "@_") are skipped.
 *
 * @param node Parsed XML node.
 * @returns The concatenated text.
 */
function collectText(node) {
  if (typeof node === "string") {
    return node;
  }
  if (typeof node === "number") {
    return String(node);
  }
  if (!node || typeof node !== "object") {
    return "";
  }

  let text = "";
  if (node["#text"] !== undefined) {
    text += String(node["#text"]);
  }

  const run = node["a:t"];
  if (run !== undefined) {
    if (typeof run === "string" || typeof run === "number") {
      text += String(run);
    } else if (Array.isArray(run)) {
      text += run.map((item) => collectText(item)).join("");
    } else if (typeof run === "object" && run["#text"] !== undefined) {
      text += String(run["#text"]);
    }
  }

  for (const [key, child] of Object.entries(node)) {
    if (key === "#text" || key === "a:t" || key.startsWith("@_")) {
      continue;
    }
    if (Array.isArray(child)) {
      text += child.map((item) => collectText(item)).join("");
    } else if (child !== null && typeof child === "object") {
      text += collectText(child);
    }
  }
  return text;
}
1893
/**
 * Recursively concatenates text from a parsed XML node whose namespace
 * prefixes were stripped, so text runs appear under the key `t`.
 *
 * Text is taken from `#text` and `t`; other non-attribute children that are
 * objects or arrays are searched recursively. Attribute keys (`@_` prefix)
 * are ignored.
 *
 * Primitive booleans are stringified too, because the XML parser may coerce
 * "true"/"false" tag text into booleans; a `null` `t` value is treated as
 * empty instead of throwing.
 *
 * @param node Parsed XML value (primitive, array element, or object).
 * @returns The concatenated text, or "" when there is none.
 */
function collectTextNsStripped(node) {
  if (typeof node === "string")
    return node;
  if (typeof node === "number" || typeof node === "boolean")
    return String(node);
  if (!node || typeof node !== "object")
    return "";
  const results = [];
  if (node["#text"] !== undefined)
    results.push(String(node["#text"]));
  const run = node.t;
  if (run !== undefined && run !== null) {
    if (Array.isArray(run)) {
      for (const item of run)
        results.push(collectTextNsStripped(item));
    } else if (typeof run === "object") {
      // A run that carries attributes is parsed as { "#text": ..., "@_x": ... }.
      if (run["#text"] !== undefined)
        results.push(String(run["#text"]));
    } else {
      // string, number or boolean
      results.push(String(run));
    }
  }
  for (const [key, val] of Object.entries(node)) {
    if (key === "#text" || key === "t")
      continue;
    if (key.startsWith("@_"))
      continue;
    if (Array.isArray(val)) {
      for (const item of val)
        results.push(collectTextNsStripped(item));
    } else if (typeof val === "object" && val !== null) {
      results.push(collectTextNsStripped(val));
    }
  }
  return results.join("");
}
1928
/**
 * Returns the text of a shape, one line per non-empty paragraph.
 *
 * @param sp Parsed `p:sp` element.
 * @returns Paragraph texts joined with "\n", or "" when the shape has no
 *   text body or no paragraphs.
 */
function extractTextFromShape(sp) {
  const paragraphNodes = sp["p:txBody"]?.["a:p"];
  if (!paragraphNodes)
    return "";
  // A single paragraph is parsed as an object rather than a one-item array.
  const paragraphList = Array.isArray(paragraphNodes) ? paragraphNodes : [paragraphNodes];
  return paragraphList
    .map((paragraph) => collectText(paragraph))
    .filter((paragraphText) => Boolean(paragraphText))
    .join("\n");
}
1946
/**
 * Tells whether a shape is a title placeholder.
 *
 * @param sp Parsed `p:sp` element.
 * @returns true when `p:nvSpPr/p:nvPr/p:ph` exists and its `type` attribute
 *   is "title" or "ctrTitle"; false otherwise.
 */
function isTitle(sp) {
  const placeholder = sp["p:nvSpPr"]?.["p:nvPr"]?.["p:ph"];
  if (!placeholder)
    return false;
  switch (placeholder["@_type"]) {
    case "title":
    case "ctrTitle":
      return true;
    default:
      return false;
  }
}
1959
/**
 * Renders the table inside a graphic frame as markdown.
 *
 * The table is first rebuilt as an HTML table (the first row that has cells
 * becomes the header row, cell text is HTML-escaped) and then passed through
 * `htmlToMarkdown`.
 *
 * @param graphicFrame Parsed `p:graphicFrame` element.
 * @returns The trimmed markdown followed by a newline, or "" when the frame
 *   holds no table or the table has no rows.
 */
function extractTable(graphicFrame) {
  const table = graphicFrame["a:graphic"]?.["a:graphicData"]?.["a:tbl"];
  if (!table)
    return "";
  const rowNodes = table["a:tr"];
  if (!rowNodes)
    return "";
  const escapeHtml = (raw) => raw.replace(/&/g, "&amp;").replace(/</g, "&lt;").replace(/>/g, "&gt;");
  const asList = (value) => (Array.isArray(value) ? value : [value]);
  const renderedRows = asList(rowNodes)
    .map((row) => row["a:tc"])
    // Rows without cells are skipped and do not count as the header row.
    .filter((cells) => Boolean(cells))
    .map((cells, rowIndex) => {
      const tag = rowIndex === 0 ? "th" : "td";
      const cellMarkup = asList(cells).map((cell) => `<${tag}>${escapeHtml(collectText(cell))}</${tag}>`);
      return `<tr>${cellMarkup.join("")}</tr>`;
    });
  const html = `<html><body><table>${renderedRows.join("")}</table></body></html>`;
  return `${htmlToMarkdown(html).markdown.trim()}\n`;
}
1995
/**
 * Converts a chart part (chart XML) into a markdown heading plus a table of
 * categories and series values.
 *
 * Only the first of bar/line/pie/area/scatter chart found in the plot area is
 * read. Categories come from the first series; every series contributes one
 * column. Rows are emitted per category.
 *
 * Differences from the naive implementation:
 *  - cached point values are read with nullish checks, so a category or
 *    series name whose value is numeric 0 is kept instead of becoming "";
 *  - XML parsing happens inside the try block, so a parser failure yields the
 *    "[unsupported chart]" placeholder rather than aborting the conversion.
 *
 * @param chartXml Raw XML of the chart part.
 * @returns Markdown for the chart, or a placeholder when it cannot be read.
 */
function extractChartData(chartXml) {
  // Namespace prefixes are stripped by the parser; the "c:" lookup is kept as
  // a fallback for input where that did not happen.
  const child = (parent, name) => parent[name] || parent[`c:${name}`];
  const toList = (value) => (Array.isArray(value) ? value : [value]);
  const pointValue = (pt) => pt.v ?? pt["c:v"];
  const cachedPoints = (cache) => {
    const pts = child(cache, "pt");
    return pts ? toList(pts) : [];
  };
  const heading = (title) => `\n\n### Chart${title ? `: ${title}` : ""}\n`;
  try {
    const parser = new XMLParser({
      ignoreAttributes: false,
      attributeNamePrefix: "@_",
      removeNSPrefix: true
    });
    const chart = parser.parse(chartXml);
    const chartSpace = child(chart, "chartSpace");
    if (!chartSpace)
      return "\n\n[chart]\n\n";
    const chartEl = child(chartSpace, "chart");
    if (!chartEl)
      return "\n\n[chart]\n\n";
    const titleEl = child(chartEl, "title");
    const title = titleEl ? collectTextNsStripped(titleEl) : "";
    const plotArea = child(chartEl, "plotArea");
    if (!plotArea)
      return `${heading(title)}\n[chart data unavailable]\n`;
    const chartTypes = ["barChart", "lineChart", "pieChart", "areaChart", "scatterChart"];
    let plotData = null;
    for (const type of chartTypes) {
      plotData = child(plotArea, type);
      if (plotData)
        break;
    }
    if (!plotData)
      return `${heading(title)}\n[unsupported chart]\n`;
    const rawSeries = child(plotData, "ser");
    if (!rawSeries)
      return `${heading(title)}\n`;
    const seriesList = toList(rawSeries);

    // Categories are shared by all series; read them from the first one.
    const categories = [];
    const cat = child(seriesList[0], "cat");
    if (cat) {
      const ref = child(cat, "strRef") || child(cat, "numRef");
      if (ref) {
        const cache = child(ref, "strCache") || child(ref, "numCache");
        if (cache) {
          for (const pt of cachedPoints(cache)) {
            categories.push(String(pointValue(pt) ?? ""));
          }
        }
      }
    }

    const seriesNames = [];
    const seriesValues = [];
    for (const ser of seriesList) {
      const tx = child(ser, "tx");
      let name = "";
      if (tx) {
        const sr = child(tx, "strRef");
        const sc = sr ? child(sr, "strCache") : undefined;
        if (sc) {
          const pts = cachedPoints(sc);
          if (pts.length > 0) {
            name = String(pts[0]?.v ?? pts[0]?.["c:v"] ?? "");
          }
        }
        if (!name) {
          // No cached name: fall back to whatever literal text the element has.
          name = collectTextNsStripped(tx);
        }
      }
      seriesNames.push(name);

      const values = [];
      const val = child(ser, "val");
      if (val) {
        const nr = child(val, "numRef");
        const nc = nr ? child(nr, "numCache") : undefined;
        if (nc) {
          for (const pt of cachedPoints(nc)) {
            values.push(Number(pointValue(pt) ?? 0));
          }
        }
      }
      seriesValues.push(values);
    }

    let md = `${heading(title)}\n`;
    const header = ["Category", ...seriesNames];
    md += `| ${header.join(" | ")} |\n`;
    md += `|${header.map(() => "---").join("|")}|\n`;
    for (let i = 0; i < categories.length; i++) {
      const row = [categories[i], ...seriesValues.map((v) => String(v[i] ?? ""))];
      md += `| ${row.join(" | ")} |\n`;
    }
    return md;
  } catch {
    // Unexpected structure (or unparsable XML): degrade to a placeholder.
    return "\n\n[unsupported chart]\n\n";
  }
}
2141
/** `uri` value that marks a graphic frame as holding a chart. */
var PPTX_CHART_URI = "http://schemas.openxmlformats.org/drawingml/2006/chart";

/** Content types for embedded pictures, keyed by lower-case file extension. */
var PPTX_IMAGE_MIME_TYPES = new Map([
  ["png", "image/png"],
  ["gif", "image/gif"],
  ["svg", "image/svg+xml"],
  ["webp", "image/webp"],
  ["bmp", "image/bmp"],
  ["tif", "image/tiff"],
  ["tiff", "image/tiff"]
]);

/** Normalises a parsed XML child (absent, single object, or array) to an array. */
function pptxToArray(value) {
  if (!value)
    return [];
  return Array.isArray(value) ? value : [value];
}

/**
 * Collects shapes, graphic frames and pictures of a shape tree, then does the
 * same for the contents of every (nested) group shape.
 */
function pptxFlattenShapes(tree, out = []) {
  for (const key of ["p:sp", "p:graphicFrame", "p:pic"]) {
    out.push(...pptxToArray(tree[key]));
  }
  for (const group of pptxToArray(tree["p:grpSp"])) {
    if (group && typeof group === "object") {
      pptxFlattenShapes(group, out);
    }
  }
  return out;
}

/** Maps a relationship target of a slide to a path inside the archive. */
function pptxResolveTarget(target) {
  // Targets are normally relative to ppt/slides ("../media/image1.png");
  // a leading "/" means the target is already rooted at the archive.
  return target.startsWith("/") ? target.slice(1) : `ppt/${target.replace("../", "")}`;
}

/** Reads and parses the relationships part that belongs to a slide. */
async function pptxLoadRelationships(zip, parser, slideFile) {
  const relsFile = `${slideFile.replace("slides/", "slides/_rels/")}.rels`;
  const relsXml = await zip.file(relsFile)?.async("string");
  if (!relsXml)
    return [];
  return pptxToArray(parser.parse(relsXml).Relationships?.Relationship);
}

/**
 * Converter for PowerPoint (.pptx) presentations.
 *
 * Slides are processed in numeric file order. For every slide a
 * "<!-- Slide number: N -->" marker is written, followed by shape text
 * (title placeholders become "# " headings), tables, charts, pictures and,
 * when present, the speaker notes.
 *
 * Behavioural fixes compared with the straightforward implementation:
 *  - a chart frame emits only the chart its own `c:chart/@r:id` points to;
 *    previously every chart relationship of the slide was emitted for every
 *    chart frame, duplicating tables on slides with several charts. Frames
 *    without an id keep the old behaviour;
 *  - group shapes are descended into, so grouped text, pictures and tables are
 *    no longer dropped;
 *  - tag values are kept as strings (`parseTagValue: false`) so slide text
 *    such as "007" or "1.50" is not rewritten by numeric coercion;
 *  - the slide's relationship part is read at most once per slide.
 *
 * @throws Error when the archive has no `[Content_Types].xml`.
 */
var pptxConverter = converter("PPTX", anyOf(byExt(...ACCEPTED_EXTENSIONS12), byMime(...ACCEPTED_MIME_PREFIXES11)), async (ctx) => {
  const zip = await JSZip2.loadAsync(ctx.buffer);
  const parser = new XMLParser({
    ignoreAttributes: false,
    attributeNamePrefix: "@_",
    preserveOrder: false,
    trimValues: false,
    parseTagValue: false
  });
  const contentTypesXml = await zip.file("[Content_Types].xml")?.async("string");
  if (!contentTypesXml)
    throw new Error("Invalid PPTX: missing [Content_Types].xml");

  const slideIndex = (name) => parseInt(name.match(/slide(\d+)/)?.[1] ?? "0", 10);
  const slideFiles = Object.keys(zip.files)
    .filter((filename) => /^ppt\/slides\/slide\d+\.xml$/.test(filename))
    .sort((a, b) => slideIndex(a) - slideIndex(b));

  let mdContent = "";
  let slideNum = 0;
  for (const slideFile of slideFiles) {
    slideNum++;
    mdContent += `\n\n<!-- Slide number: ${slideNum} -->\n`;
    const slideXml = await zip.file(slideFile)?.async("string");
    if (!slideXml)
      continue;
    const spTree = parser.parse(slideXml)["p:sld"]?.["p:cSld"]?.["p:spTree"];
    if (!spTree)
      continue;

    // Relationships are needed only for charts and embedded pictures; load
    // them lazily and reuse the result for the whole slide.
    let relsPromise = null;
    const getRels = () => {
      if (!relsPromise) {
        relsPromise = pptxLoadRelationships(zip, parser, slideFile);
      }
      return relsPromise;
    };

    for (const shape of pptxFlattenShapes(spTree)) {
      if (shape["p:txBody"]) {
        const text = extractTextFromShape(shape);
        if (text.trim()) {
          mdContent += isTitle(shape) ? `# ${text.trim()}\n` : `${text}\n`;
        }
      }

      const graphic = shape["a:graphic"];
      const graphicData = graphic ? graphic["a:graphicData"] : undefined;
      if (graphicData) {
        if (graphicData["a:tbl"]) {
          mdContent += extractTable(shape);
        }
        if (graphicData["@_uri"] === PPTX_CHART_URI || graphicData["c:chart"]) {
          const chartRelId = graphicData["c:chart"]?.["@_r:id"];
          const chartRels = (await getRels()).filter((rel) => rel["@_Type"]?.includes("/chart") && rel["@_Target"]);
          const selected = chartRelId ? chartRels.filter((rel) => rel["@_Id"] === chartRelId) : chartRels;
          for (const rel of selected) {
            const chartXml = await zip.file(pptxResolveTarget(rel["@_Target"]))?.async("string");
            if (chartXml) {
              mdContent += extractChartData(chartXml);
            }
          }
        }
      }

      const blip = shape["p:blipFill"] ? shape["p:blipFill"]["a:blip"] : undefined;
      if (blip) {
        const cNvPr = shape["p:nvPicPr"]?.["p:cNvPr"];
        const described = cNvPr ? cNvPr["@_descr"] || cNvPr["@_name"] || "" : "";
        const altText = escapeMarkdown(described || "image");
        if (ctx.opts.keepDataUris) {
          const rId = blip["@_r:embed"];
          if (rId) {
            for (const rel of await getRels()) {
              if (rel["@_Id"] !== rId || !rel["@_Target"])
                continue;
              const imgPath = pptxResolveTarget(rel["@_Target"]);
              const imgData = await zip.file(imgPath)?.async("base64");
              if (imgData) {
                const ext = imgPath.split(".").pop()?.toLowerCase();
                // Unknown extensions keep the historical JPEG default.
                const contentType = PPTX_IMAGE_MIME_TYPES.get(ext) ?? "image/jpeg";
                mdContent += `\n![${altText}](data:${contentType};base64,${imgData})\n`;
              }
            }
          }
        } else {
          let filename = "image.jpg";
          if (cNvPr?.["@_name"]) {
            filename = `${cNvPr["@_name"].replace(/\W/g, "")}.jpg`;
          }
          mdContent += `\n![${altText}](${filename})\n`;
        }
      }
    }

    // NOTE(review): the notes part is located by file-name convention
    // (slideN -> notesSlideN), not through the slide's relationships.
    const notesFile = slideFile.replace("slides/slide", "notesSlides/notesSlide");
    const notesXml = await zip.file(notesFile)?.async("string");
    if (notesXml) {
      const notesTree = parser.parse(notesXml)["p:notes"]?.["p:cSld"]?.["p:spTree"];
      if (notesTree) {
        for (const sp of pptxToArray(notesTree["p:sp"])) {
          if (sp["p:nvSpPr"]?.["p:nvPr"]?.["p:ph"]?.["@_type"] === "body") {
            const text = extractTextFromShape(sp);
            if (text.trim()) {
              mdContent += `\n\n### Notes:\n${text.trim()}`;
            }
          }
        }
      }
    }
  }
  return { markdown: mdContent.trim() };
});
2338
+ // src/converters/rss.ts
2339
+ import { XMLParser as XMLParser2 } from "fast-xml-parser";
2340
+ var PRECISE_MIME_PREFIXES = ["application/rss", "application/rss+xml", "application/atom", "application/atom+xml"];
2341
+ var PRECISE_EXTENSIONS = [".rss", ".atom"];
2342
+ var CANDIDATE_MIME_PREFIXES = ["text/xml", "application/xml"];
2343
+ var CANDIDATE_EXTENSIONS = [".xml"];
2344
/**
 * Returns the text of the first `tagName` child of a parsed XML element.
 *
 * Handles the shapes the parser can produce for a child:
 *  - a primitive (string, number, boolean);
 *  - an object carrying `#text`;
 *  - an object carrying only `__cdata` (text wrapped in a CDATA section);
 *  - an array, when the tag occurs more than once — the first occurrence wins.
 *
 * @param parent Parsed element, may be null/undefined.
 * @param tagName Name of the child to read.
 * @returns The text, or null when the child is absent or carries no text.
 */
function getFirstTextChild(parent, tagName) {
  if (!parent)
    return null;
  let val = parent[tagName];
  if (Array.isArray(val)) {
    // Repeated tags are parsed into an array; use the first one.
    val = val[0];
  }
  if (val === undefined || val === null)
    return null;
  if (typeof val === "string")
    return val;
  if (typeof val === "number" || typeof val === "boolean")
    return String(val);
  if (typeof val === "object") {
    if (val["#text"] !== undefined)
      return String(val["#text"]);
    const cdata = val.__cdata;
    if (cdata !== undefined && cdata !== null) {
      // Several CDATA sections in one element arrive as an array.
      return Array.isArray(cdata) ? cdata.map(String).join("") : String(cdata);
    }
  }
  return null;
}
2358
/**
 * Converts an HTML fragment from a feed to markdown.
 *
 * @param content HTML (or plain text) taken from a feed element.
 * @returns The markdown produced by `htmlToMarkdown`, or `content` unchanged
 *   when the conversion throws.
 */
function parseContent(content) {
  let rendered = content;
  try {
    rendered = htmlToMarkdown(content).markdown;
  } catch {
    // Best effort: keep the raw content when it cannot be converted.
  }
  return rendered;
}
2366
/**
 * Classifies a parsed XML document as an RSS or Atom feed.
 *
 * @param parsed Root object returned by the XML parser.
 * @returns "rss" when a truthy `rss` root exists; "atom" when a `feed` root
 *   exists that has an `entry` (or is an array with an element that has one);
 *   null otherwise.
 */
function detectFeedType(parsed) {
  if (parsed.rss)
    return "rss";
  const feed = parsed.feed;
  if (!feed)
    return null;
  const hasEntry = Boolean(feed.entry) || (Array.isArray(feed) && feed.some((candidate) => candidate.entry));
  return hasEntry ? "atom" : null;
}
2377
/**
 * Sniffs a buffer to decide whether it holds an RSS or Atom feed.
 *
 * @param buffer Raw document bytes.
 * @param charset Charset to decode with; "utf-8" when not given.
 * @returns true when the decoded text parses into a recognised feed; false
 *   when it does not or when decoding/parsing throws.
 */
function looksLikeFeed(buffer, charset) {
  try {
    const xmlText = decodeBuffer(buffer, charset ?? "utf-8");
    const document = new XMLParser2({ ignoreAttributes: true }).parse(xmlText);
    const kind = detectFeedType(document);
    return kind !== null;
  } catch {
    return false;
  }
}
2387
/**
 * Converter for RSS and Atom feeds.
 *
 * Matches feed-specific extensions/MIME types directly. Generic XML
 * (candidate extension or MIME prefix) is accepted only when its content
 * actually looks like a feed.
 *
 * @throws Error "Unknown feed type" when the parsed document is neither RSS
 *   nor Atom.
 */
var rssConverter = converter("RSS", anyOf(byExt(...PRECISE_EXTENSIONS), byMime(...PRECISE_MIME_PREFIXES), (ctx) => {
  const ext = (ctx.info.extension ?? "").toLowerCase();
  const mime = (ctx.info.mimetype ?? "").toLowerCase();
  const extensionIsCandidate = CANDIDATE_EXTENSIONS.includes(ext);
  const mimeIsCandidate = CANDIDATE_MIME_PREFIXES.some((prefix) => mime.startsWith(prefix));
  // Only sniff the content of documents that could plausibly be XML.
  return (extensionIsCandidate || mimeIsCandidate) && looksLikeFeed(ctx.buffer, ctx.info.charset);
}), async (ctx) => {
  const xmlText = decodeBuffer(ctx.buffer, ctx.info.charset ?? "utf-8");
  const feedParser = new XMLParser2({
    ignoreAttributes: true,
    textNodeName: "#text",
    cdataPropName: "__cdata"
  });
  const document = feedParser.parse(xmlText);
  switch (detectFeedType(document)) {
    case "rss":
      return parseRss(document);
    case "atom":
      return parseAtom(document);
    default:
      throw new Error("Unknown feed type");
  }
});
2410
/**
 * Renders a parsed RSS document as markdown.
 *
 * Output: "# <channel title>", the channel description, then for every item
 * a "## <title>" heading, a "Published on:" line, and the item's description
 * and `content:encoded` converted from HTML.
 *
 * @param parsed Parser output whose root is `rss`.
 * @returns `{ markdown, title }` where title is the channel title, if any.
 * @throws Error when the feed has no channel.
 */
function parseRss(parsed) {
  const channel = parsed.rss.channel;
  if (!channel)
    throw new Error("No channel found in RSS feed");

  // Text of a child, falling back to its CDATA payload.
  const textOrCdata = (element, tag) => getFirstTextChild(element, tag) || (element[tag]?.__cdata ? String(element[tag].__cdata) : null);

  const channelTitle = getFirstTextChild(channel, "title");
  const channelDescription = getFirstTextChild(channel, "description");
  const pieces = [];
  if (channelTitle)
    pieces.push(`# ${channelTitle}\n`);
  if (channelDescription)
    pieces.push(`${channelDescription}\n`);

  const rawItems = channel.item;
  const items = rawItems && !Array.isArray(rawItems) ? [rawItems] : rawItems;
  if (items) {
    for (const item of items) {
      const title = getFirstTextChild(item, "title");
      const description = textOrCdata(item, "description");
      const pubDate = getFirstTextChild(item, "pubDate");
      const content = textOrCdata(item, "content:encoded");
      if (title)
        pieces.push(`\n## ${title}\n`);
      if (pubDate)
        pieces.push(`Published on: ${pubDate}\n`);
      if (description)
        pieces.push(parseContent(description));
      if (content)
        pieces.push(parseContent(content));
    }
  }
  return { markdown: pieces.join(""), title: channelTitle ?? undefined };
}
2448
/**
 * Renders a parsed Atom document as markdown.
 *
 * Output: "# <feed title>", the subtitle, then for every entry a
 * "## <title>" heading, an "Updated on:" line, and the entry's summary and
 * content converted from HTML.
 *
 * @param parsed Parser output whose root is `feed`.
 * @returns `{ markdown, title }` where title is the feed title, if any.
 */
function parseAtom(parsed) {
  const feed = parsed.feed;
  const title = getFirstTextChild(feed, "title");
  const subtitle = getFirstTextChild(feed, "subtitle");
  const pieces = [];
  if (title)
    pieces.push(`# ${title}\n`);
  if (subtitle)
    pieces.push(`${subtitle}\n`);

  const rawEntries = feed.entry;
  const entries = rawEntries && !Array.isArray(rawEntries) ? [rawEntries] : rawEntries;
  if (entries) {
    for (const entry of entries) {
      const entryTitle = getFirstTextChild(entry, "title");
      const entrySummary = getFirstTextChild(entry, "summary");
      const entryUpdated = getFirstTextChild(entry, "updated");
      const entryContent = getFirstTextChild(entry, "content");
      if (entryTitle)
        pieces.push(`\n## ${entryTitle}\n`);
      if (entryUpdated)
        pieces.push(`Updated on: ${entryUpdated}\n`);
      if (entrySummary)
        pieces.push(parseContent(entrySummary));
      if (entryContent)
        pieces.push(parseContent(entryContent));
    }
  }
  return { markdown: pieces.join(""), title: title ?? undefined };
}
2483
+ // src/converters/wikipedia.ts
2484
+ import * as cheerio3 from "cheerio";
2485
+ import TurndownService3 from "turndown";
2486
+ import { tables as tables2 } from "turndown-plugin-gfm";
2487
+ var ACCEPTED_EXTENSIONS13 = [".html", ".htm"];
2488
+ var ACCEPTED_MIME_PREFIXES12 = ["text/html", "application/xhtml"];
2489
/**
 * Converter for Wikipedia article pages.
 *
 * Applies to HTML fetched from a `<lang>.wikipedia.org` URL. Scripts and
 * styles are removed, the article body (`#mw-content-text`, falling back to
 * `<body>` or the whole document) is converted with Turndown plus GFM tables,
 * and the page title is prepended as a level-1 heading.
 */
var wikipediaConverter = converter("Wikipedia", allOf(byUrl(/^https?:\/\/[a-zA-Z]{2,3}\.wikipedia\.org\//), anyOf(byExt(...ACCEPTED_EXTENSIONS13), byMime(...ACCEPTED_MIME_PREFIXES12))), async (ctx) => {
  const html = decodeBuffer(ctx.buffer, ctx.info.charset ?? "utf-8");
  const $ = cheerio3.load(html);
  $("script, style").remove();

  // Prefer the article's own title element over the document <title>.
  const pageTitleElm = $("span.mw-page-title-main").first();
  const mainTitle = pageTitleElm.length ? pageTitleElm.text() : $("title").first().text() || undefined;

  const articleBody = $("#mw-content-text");
  const contentHtml = articleBody.length ? articleBody.html() || "" : $("body").html() || $.html() || "";

  // True for links such as "javascript:" or "mailto:" that should be
  // reduced to their text.
  const hasDisallowedScheme = (href) => {
    try {
      const scheme = new URL(href, "http://placeholder.invalid").protocol.replace(":", "").toLowerCase();
      if (!scheme || ["http", "https", "file"].includes(scheme))
        return false;
      return href.includes(":") && !href.startsWith("/") && !href.startsWith("#");
    } catch {
      return false;
    }
  };

  const turndown = new TurndownService3({
    headingStyle: "atx",
    codeBlockStyle: "fenced",
    bulletListMarker: "*",
    emDelimiter: "*"
  });
  turndown.use(tables2);
  turndown.addRule("links", {
    filter: "a",
    replacement(content, node) {
      const href = node.getAttribute("href") || "";
      const title = node.getAttribute("title") || "";
      if (!content.trim())
        return "";
      if (!href)
        return content;
      if (hasDisallowedScheme(href))
        return content;
      const titlePart = title ? ` "${title.replace(/"/g, '\\"')}"` : "";
      return `[${content}](${href}${titlePart})`;
    }
  });

  const body = turndown.turndown(contentHtml).trim();
  const markdown = mainTitle ? `# ${mainTitle}\n\n${body}` : body;
  return { markdown, title: mainTitle };
});
2544
+ // src/converters/xlsx.ts
2545
+ import * as XLSX from "xlsx";
2546
+ var ACCEPTED_XLSX_EXTENSIONS = [".xlsx"];
2547
+ var ACCEPTED_XLSX_MIME_PREFIXES = ["application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"];
2548
+ var ACCEPTED_XLS_EXTENSIONS = [".xls"];
2549
+ var ACCEPTED_XLS_MIME_PREFIXES = ["application/vnd.ms-excel", "application/excel"];
2550
/**
 * Renders every sheet of a workbook as a "## <sheet name>" section followed
 * by the sheet as a markdown table.
 *
 * @param workbook Workbook object produced by `XLSX.read`.
 * @returns The sections joined with newlines, trimmed.
 */
function sheetsToMarkdown(workbook) {
  const lines = workbook.SheetNames.flatMap((sheetName) => {
    const sheet = workbook.Sheets[sheetName];
    if (!sheet)
      return [];
    const tableHtml = XLSX.utils.sheet_to_html(sheet);
    // The trailing empty entry leaves a blank line between sheets.
    return [`## ${sheetName}`, htmlToMarkdown(tableHtml).markdown.trim(), ""];
  });
  return lines.join("\n").trim();
}
2565
/** Converter for Excel `.xlsx` workbooks: each sheet becomes a markdown table. */
var xlsxConverter = converter(
  "XLSX",
  anyOf(byExt(...ACCEPTED_XLSX_EXTENSIONS), byMime(...ACCEPTED_XLSX_MIME_PREFIXES)),
  async (ctx) => ({ markdown: sheetsToMarkdown(XLSX.read(ctx.buffer, { type: "buffer" })) })
);
2569
/** Converter for legacy Excel `.xls` workbooks: each sheet becomes a markdown table. */
var xlsConverter = converter(
  "XLS",
  anyOf(byExt(...ACCEPTED_XLS_EXTENSIONS), byMime(...ACCEPTED_XLS_MIME_PREFIXES)),
  async (ctx) => ({ markdown: sheetsToMarkdown(XLSX.read(ctx.buffer, { type: "buffer" })) })
);
2573
+ // src/converters/youtube.ts
2574
+ import * as cheerio4 from "cheerio";
2575
+ var ACCEPTED_EXTENSIONS14 = [".html", ".htm"];
2576
+ var ACCEPTED_MIME_PREFIXES13 = ["text/html", "application/xhtml"];
2577
/**
 * Depth-first search for the first property called `key` in nested
 * objects/arrays.
 *
 * A matching property is returned as soon as it is met, even when its value
 * is undefined; nested results that are undefined are skipped.
 *
 * @param obj Value to search (any JSON-like value).
 * @param key Property name to look for.
 * @returns The property value, or undefined when it is not found.
 */
function findKey(obj, key) {
  if (obj === null || typeof obj !== "object")
    return undefined;
  if (Array.isArray(obj)) {
    for (const element of obj) {
      const hit = findKey(element, key);
      if (hit !== undefined)
        return hit;
    }
    return undefined;
  }
  for (const [name, value] of Object.entries(obj)) {
    if (name === key)
      return value;
    const hit = findKey(value, key);
    if (hit !== undefined)
      return hit;
  }
  return undefined;
}
2595
/**
 * Reads the `v` query parameter of a YouTube watch URL.
 *
 * @param url Absolute URL string.
 * @returns The video id, or null when the URL is invalid or has no `v`.
 */
function extractVideoId(url) {
  let parsedUrl;
  try {
    parsedUrl = new URL(url);
  } catch {
    return null;
  }
  return parsedUrl.searchParams.get("v");
}
2603
/**
 * Downloads the transcript of a video through the optional
 * `youtube-transcript` package.
 *
 * The requested languages are tried in order, followed by one attempt without
 * a language preference. Each language gets up to three tries with a
 * two-second pause after a failure. An empty transcript moves on to the next
 * language.
 *
 * @param videoId YouTube video id.
 * @param languages Optional list of preferred language codes.
 * @returns The transcript text joined with spaces, or null when nothing could
 *   be fetched (including when the package is unavailable).
 */
async function fetchTranscript(videoId, languages) {
  try {
    const { YoutubeTranscript } = await import("youtube-transcript");
    const preferred = languages && languages.length > 0 ? languages : [];
    // `undefined` stands for "no language preference" and is always tried last.
    const candidates = [...preferred, undefined];
    for (const lang of candidates) {
      let attempt = 0;
      while (attempt < 3) {
        try {
          const segments = await YoutubeTranscript.fetchTranscript(videoId, lang ? { lang } : undefined);
          if (segments && segments.length > 0) {
            return segments.map((segment) => segment.text).join(" ");
          }
          break;
        } catch (e) {
          if (attempt >= 2) {
            // Out of retries: give up entirely on the final fallback, otherwise
            // move on to the next language.
            if (lang === undefined)
              throw e;
            break;
          }
          await new Promise((resolve) => setTimeout(resolve, 2000));
        }
        attempt++;
      }
    }
    return null;
  } catch {
    return null;
  }
}
2632
/**
 * Converter for YouTube watch pages.
 *
 * Collects the page title and `<meta>` values, takes the description from the
 * embedded `ytInitialData` JSON when available, and appends the transcript
 * when one can be fetched for the video id in the URL.
 */
var youtubeConverter = converter("YouTube", allOf(byUrl(/^https:\/\/www\.youtube\.com\/watch\?/), anyOf(byExt(...ACCEPTED_EXTENSIONS14), byMime(...ACCEPTED_MIME_PREFIXES13))), async (ctx) => {
  const $ = cheerio4.load(decodeBuffer(ctx.buffer, ctx.info.charset ?? "utf-8"));
  const metadata = {};

  const pageTitle = $("title").first().text();
  if (pageTitle)
    metadata.title = pageTitle;

  // The first of itemprop/property/name that is present names the entry.
  $("meta").each((_, el) => {
    const meta = $(el);
    const content = meta.attr("content");
    const key = ["itemprop", "property", "name"].map((attr) => meta.attr(attr)).find((value) => Boolean(value));
    if (key && content) {
      metadata[key] = content;
    }
  });

  $("script").each((_, el) => {
    const source = $(el).html();
    if (!source?.includes("ytInitialData"))
      return;
    const jsonMatch = source.match(/var ytInitialData = ({.*?});/);
    if (!jsonMatch)
      return;
    try {
      const attrDesc = findKey(JSON.parse(jsonMatch[1]), "attributedDescriptionBodyText");
      if (attrDesc && typeof attrDesc === "object" && attrDesc.content) {
        metadata.description = String(attrDesc.content);
      }
    } catch {
      // Ignore script blocks whose payload is not valid JSON.
    }
  });

  const sections = ["# YouTube\n"];
  const title = metadata.title || metadata["og:title"] || metadata.name || "";
  if (title)
    sections.push(`\n## ${title}\n`);

  const statLines = [];
  if (metadata.interactionCount)
    statLines.push(`- **Views:** ${metadata.interactionCount}\n`);
  if (metadata.keywords)
    statLines.push(`- **Keywords:** ${metadata.keywords}\n`);
  if (metadata.duration)
    statLines.push(`- **Runtime:** ${metadata.duration}\n`);
  const stats = statLines.join("");
  if (stats)
    sections.push(`\n### Video Metadata\n${stats}\n`);

  const description = metadata.description || metadata["og:description"];
  if (description)
    sections.push(`\n### Description\n${description}\n`);

  const videoId = ctx.info.url ? extractVideoId(ctx.info.url) : null;
  if (videoId) {
    const transcript = await fetchTranscript(videoId, ctx.opts.youtubeTranscriptLanguages);
    if (transcript)
      sections.push(`\n### Transcript\n${transcript}\n`);
  }
  return { markdown: sections.join(""), title: title || undefined };
});
2706
+ // src/converters/zip.ts
2707
+ import path2 from "node:path";
2708
+ var ACCEPTED_EXTENSIONS15 = [".zip"];
2709
+ var ACCEPTED_MIME_PREFIXES14 = ["application/zip"];
2710
+ var matcher = anyOf(byExt(...ACCEPTED_EXTENSIONS15), byMime(...ACCEPTED_MIME_PREFIXES14));
2711
+ var MAX_ZIP_DEPTH = 10;
2712
/**
 * Builds the ZIP converter.
 *
 * Every file in the archive is handed to `convertFn`; results are listed
 * under "## File: <entry name>" headings. Entries whose conversion fails with
 * UnsupportedFormatError or FileConversionError are skipped; other errors
 * propagate. Nested archives are followed up to MAX_ZIP_DEPTH levels.
 *
 * @param convertFn Function `(buffer, options) => Promise<{ markdown }>` used
 *   to convert each archive entry.
 * @returns A converter object with `name`, `match` and `convert`.
 */
function createZipConverter(convertFn) {
  const convert = async (ctx) => {
    const depth = ctx.opts._zipDepth ?? 0;
    if (depth >= MAX_ZIP_DEPTH) {
      return { markdown: "[Max ZIP nesting depth exceeded]" };
    }
    const { default: JSZip3 } = await import("jszip");
    const archive = await JSZip3.loadAsync(ctx.buffer);
    const archiveLabel = ctx.info.url || ctx.info.localPath || ctx.info.filename || "archive.zip";
    const sections = [`Content from the zip file \`${archiveLabel}\`:\n\n`];
    for (const [name, entry] of Object.entries(archive.files)) {
      if (entry.dir)
        continue;
      try {
        const data = await entry.async("nodebuffer");
        const result = await convertFn(Buffer.from(data), {
          streamInfo: {
            extension: path2.extname(name) || undefined,
            filename: path2.basename(name)
          },
          _zipDepth: depth + 1
        });
        sections.push(`## File: ${name}\n\n`, `${result.markdown}\n\n`);
      } catch (e) {
        const skippable = e instanceof UnsupportedFormatError || e instanceof FileConversionError;
        if (!skippable)
          throw e;
      }
    }
    return { markdown: sections.join("").trim() };
  };
  return {
    name: "ZIP",
    match: (ctx) => matcher(ctx),
    convert
  };
}
2759
+ // src/markitdown.ts
2760
+ import fs2 from "node:fs";
2761
+ import path4 from "node:path";
2762
+
2763
+ // src/stream-info.ts
2764
/**
 * Merges stream-info objects from left to right.
 *
 * Later overrides win, but a property whose value is `undefined` never
 * replaces an existing value. Falsy overrides (null/undefined) are skipped.
 * `base` is not modified.
 *
 * @param base Starting values.
 * @param overrides Objects applied in order on top of `base`.
 * @returns A new merged object.
 */
function mergeStreamInfo(base, ...overrides) {
  return overrides.reduce((merged, override) => {
    if (!override)
      return merged;
    for (const key of Object.keys(override)) {
      const value = override[key];
      if (value !== undefined) {
        merged[key] = value;
      }
    }
    return merged;
  }, { ...base });
}
2777
/**
 * Extension -> MIME type table used by `guessMimeFromExtension`.
 * A Map is used so that names inherited from Object.prototype (for example
 * "constructor") can never be returned as a lookup result.
 */
var STREAM_INFO_MIME_BY_EXTENSION = new Map([
  [".txt", "text/plain"],
  [".text", "text/plain"],
  [".md", "text/markdown"],
  [".markdown", "text/markdown"],
  [".html", "text/html"],
  [".htm", "text/html"],
  [".json", "application/json"],
  [".jsonl", "application/jsonl"],
  [".csv", "text/csv"],
  [".xml", "text/xml"],
  [".pdf", "application/pdf"],
  [".docx", "application/vnd.openxmlformats-officedocument.wordprocessingml.document"],
  [".xlsx", "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"],
  [".xls", "application/vnd.ms-excel"],
  [".pptx", "application/vnd.openxmlformats-officedocument.presentationml.presentation"],
  [".epub", "application/epub+zip"],
  [".zip", "application/zip"],
  [".ipynb", "application/x-ipynb+json"],
  [".jpg", "image/jpeg"],
  [".jpeg", "image/jpeg"],
  [".png", "image/png"],
  [".gif", "image/gif"],
  [".webp", "image/webp"],
  [".svg", "image/svg+xml"],
  [".mp3", "audio/mpeg"],
  [".wav", "audio/wav"],
  [".m4a", "audio/mp4"],
  [".msg", "application/vnd.ms-outlook"],
  [".rss", "application/rss+xml"],
  [".atom", "application/atom+xml"]
]);

/**
 * Guesses a MIME type from a file extension.
 *
 * @param ext Extension including the leading dot (case-insensitive).
 * @returns The MIME type, or undefined when the extension is unknown or not
 *   a string.
 */
function guessMimeFromExtension(ext) {
  if (typeof ext !== "string")
    return undefined;
  return STREAM_INFO_MIME_BY_EXTENSION.get(ext.toLowerCase());
}
2812
/**
 * MIME type -> extension table used by `guessExtensionFromMime`.
 * It covers every type the extension -> MIME table can produce, so a guessed
 * MIME type can be mapped back to an extension. A Map is used so that names
 * inherited from Object.prototype are never returned.
 */
var STREAM_INFO_EXTENSION_BY_MIME = new Map([
  ["text/plain", ".txt"],
  ["text/markdown", ".md"],
  ["text/html", ".html"],
  ["application/json", ".json"],
  ["application/jsonl", ".jsonl"],
  ["text/csv", ".csv"],
  ["text/xml", ".xml"],
  ["application/pdf", ".pdf"],
  ["application/vnd.openxmlformats-officedocument.wordprocessingml.document", ".docx"],
  ["application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", ".xlsx"],
  ["application/vnd.ms-excel", ".xls"],
  ["application/vnd.openxmlformats-officedocument.presentationml.presentation", ".pptx"],
  ["application/epub+zip", ".epub"],
  ["application/zip", ".zip"],
  ["application/x-ipynb+json", ".ipynb"],
  ["image/jpeg", ".jpg"],
  ["image/png", ".png"],
  ["image/gif", ".gif"],
  ["image/webp", ".webp"],
  ["image/svg+xml", ".svg"],
  ["audio/mpeg", ".mp3"],
  ["audio/wav", ".wav"],
  ["audio/mp4", ".m4a"],
  ["application/vnd.ms-outlook", ".msg"],
  ["application/rss+xml", ".rss"],
  ["application/atom+xml", ".atom"]
]);

/**
 * Guesses a file extension from a MIME type.
 *
 * Parameters after the media type (for example "; charset=utf-8") and
 * surrounding whitespace are ignored; matching is case-insensitive.
 *
 * @param mime MIME type, optionally with parameters.
 * @returns The extension including the leading dot, or undefined when the
 *   type is unknown or not a string.
 */
function guessExtensionFromMime(mime) {
  if (typeof mime !== "string")
    return undefined;
  const mediaType = mime.split(";", 1)[0].trim().toLowerCase();
  return STREAM_INFO_EXTENSION_BY_MIME.get(mediaType);
}
2834
+
2835
+ // src/uri-utils.ts
2836
+ import path3 from "node:path";
2837
+ import { fileURLToPath } from "node:url";
2838
/**
 * Splits a `file:` URI into its host part and an absolute local path.
 *
 * @param fileUri URI starting with "file:".
 * @returns `{ netloc, path }`: `netloc` is the text between "//" and the next
 *   "/" (null when empty or absent), `path` is the resolved local path.
 * @throws Error when the input does not start with "file:"; errors raised by
 *   `fileURLToPath` propagate.
 */
function fileUriToPath(fileUri) {
  const scheme = "file:";
  if (!fileUri.startsWith(scheme)) {
    throw new Error(`Not a file URL: ${fileUri}`);
  }
  const remainder = fileUri.slice(scheme.length);
  let netloc = null;
  if (remainder.startsWith("//")) {
    const authorityAndPath = remainder.slice(2);
    const firstSlash = authorityAndPath.indexOf("/");
    // A slash at index 0 means the authority is empty ("file:///path").
    if (firstSlash > 0) {
      netloc = authorityAndPath.slice(0, firstSlash);
    }
  }
  return { netloc, path: path3.resolve(fileURLToPath(fileUri)) };
}
2857
/**
 * Decodes percent-escapes ("%XX") straight to bytes.
 *
 * Unlike `decodeURIComponent`, the escaped bytes do not have to form valid
 * UTF-8, so arbitrary binary payloads survive. A "%" that is not followed by
 * two hex digits is kept literally.
 */
function dataUriPercentDecode(text) {
  const source = Buffer.from(text, "utf8");
  const out = Buffer.alloc(source.length);
  const hexPair = /^[0-9a-fA-F]{2}$/;
  let written = 0;
  for (let i = 0; i < source.length; i++) {
    const byte = source[i];
    if (byte === 0x25 && i + 2 < source.length) {
      const hex = source.toString("latin1", i + 1, i + 3);
      if (hexPair.test(hex)) {
        out[written++] = parseInt(hex, 16);
        i += 2;
        continue;
      }
    }
    out[written++] = byte;
  }
  return out.subarray(0, written);
}

/**
 * Parses a `data:` URI.
 *
 * Format: `data:[<mime type>][;<attribute>[=<value>]]*[;base64],<data>`.
 * The first non-empty metadata part is taken as the MIME type; no default is
 * assumed when it is missing. Non-base64 payloads are percent-decoded byte by
 * byte, so escaped binary data (for example "%FF") no longer raises URIError.
 *
 * @param uri The data URI.
 * @returns `{ mimeType, attributes, data }` with `mimeType` possibly null and
 *   `data` a Buffer.
 * @throws Error when the input is not a data URI or has no "," separator.
 */
function parseDataUri(uri) {
  if (!uri.startsWith("data:")) {
    throw new Error("Not a data URI");
  }
  const commaIndex = uri.indexOf(",");
  if (commaIndex === -1) {
    throw new Error("Malformed data URI, missing ',' separator");
  }
  const meta = uri.slice(5, commaIndex);
  const rawData = uri.slice(commaIndex + 1);
  const parts = meta.split(";");
  let isBase64 = false;
  if (parts[parts.length - 1] === "base64") {
    parts.pop();
    isBase64 = true;
  }
  let mimeType = null;
  if (parts.length > 0 && parts[0].length > 0) {
    mimeType = parts.shift();
  }
  const attributes = {};
  for (const part of parts) {
    if (part.includes("=")) {
      // Only the first "=" separates key and value.
      const [key, ...rest] = part.split("=");
      attributes[key] = rest.join("=");
    } else if (part.length > 0) {
      attributes[part] = "";
    }
  }
  const data = isBase64 ? Buffer.from(rawData, "base64") : dataUriPercentDecode(rawData);
  return { mimeType, attributes, data };
}
2889
+
2890
+ // src/markitdown.ts
2891
// Converter priorities. Registrations are sorted by ascending priority before
// being tried, so a lower number means the converter is consulted earlier.
var PRIORITY_SPECIFIC = 0,
  PRIORITY_GENERIC = 10;
2893
/**
 * Create a MarkItDown instance: a set of registered converters plus entry
 * points that turn a buffer, local path, or URI into Markdown.
 *
 * @param options Optional conversion options; the same object is handed to
 *        every converter as `ctx.opts`.
 * @returns An object exposing `convert`, `convertLocal`, `convertUri`,
 *          `convertUrl` and `registerConverter`.
 */
function createMarkItDown(options) {
  const opts = options ?? {};
  const registrations = [];

  /**
   * Add a converter. New entries go to the front of the list, so among
   * converters of equal priority the most recently registered is tried first
   * (the sort in getSorted is stable).
   */
  function register(conv, priority = PRIORITY_SPECIFIC) {
    registrations.unshift({ converter: conv, priority });
  }

  register(plainTextConverter, PRIORITY_GENERIC);
  register(htmlConverter, PRIORITY_GENERIC);
  register(csvConverter);
  register(ipynbConverter);
  register(docxConverter);
  register(xlsxConverter);
  register(xlsConverter);
  register(pdfConverter);
  register(pptxConverter);
  register(rssConverter);
  register(wikipediaConverter);
  register(youtubeConverter);
  register(bingSerpConverter);
  register(epubConverter);
  register(imageConverter);
  register(audioConverter);
  register(outlookMsgConverter);
  // The zip converter receives `convert` so it can call back into this instance.
  register(createZipConverter(convert));

  /** Return a copy of the registrations ordered by ascending priority. */
  function getSorted() {
    return [...registrations].sort((a, b) => a.priority - b.priority);
  }

  /**
   * Produce the list of stream-info guesses to try for a buffer.
   *
   * Missing mimetype/extension are first filled in from each other. Then
   * content sniffing via the optional "file-type" package is attempted: when
   * its result agrees with the caller-supplied info a single merged guess is
   * returned, otherwise the caller's info is tried first and the sniffed type
   * second. If sniffing yields nothing or throws, only the caller's
   * (enhanced) info is returned.
   */
  async function detectStreamInfo(buffer, base) {
    const guesses = [];
    let enhanced = { ...base };
    if (!enhanced.mimetype && enhanced.extension) {
      const guessedMime = guessMimeFromExtension(enhanced.extension);
      if (guessedMime) {
        enhanced = { ...enhanced, mimetype: guessedMime };
      }
    }
    if (enhanced.mimetype && !enhanced.extension) {
      const guessedExt = guessExtensionFromMime(enhanced.mimetype);
      if (guessedExt) {
        enhanced = { ...enhanced, extension: guessedExt };
      }
    }
    try {
      const { fileTypeFromBuffer } = await import("file-type");
      const detected = await fileTypeFromBuffer(buffer);
      if (detected) {
        const detectedExt = `.${detected.ext}`;
        // Compatibility is judged against what the caller supplied (base),
        // not against values this function guessed above.
        const isCompatible =
          (!base.mimetype || base.mimetype === detected.mime) &&
          (!base.extension || base.extension === detectedExt);
        if (isCompatible) {
          guesses.push({
            ...enhanced,
            mimetype: enhanced.mimetype ?? detected.mime,
            extension: enhanced.extension ?? detectedExt
          });
        } else {
          guesses.push(enhanced);
          guesses.push({
            ...base,
            mimetype: detected.mime,
            extension: detectedExt
          });
        }
      } else {
        guesses.push(enhanced);
      }
    } catch {
      // Best effort: a failed import or sniff falls back to the caller's info.
      guesses.push(enhanced);
    }
    return guesses;
  }

  /**
   * Tidy converter output: strip trailing whitespace on every line, use "\n"
   * line endings, and collapse runs of three or more newlines to two.
   */
  function normalizeResult(result) {
    let md = result.markdown;
    md = md
      .split(/\r?\n/)
      .map((line) => line.trimEnd())
      .join("\n");
    md = md.replace(/\n{3,}/g, "\n\n");
    return { ...result, markdown: md };
  }

  /**
   * Try converters against each guess in turn and return the first success.
   *
   * After the supplied guesses, one extra pass is made with an empty info
   * object. A converter whose `match` throws is treated as not matching.
   *
   * @throws FileConversionError when at least one converter accepted the
   *         input but every attempt failed.
   * @throws UnsupportedFormatError when no converter accepted the input.
   */
  async function runConversion(buffer, streamInfoGuesses) {
    const sorted = getSorted();
    const failedAttempts = [];
    const allGuesses = [...streamInfoGuesses, {}];
    for (const info of allGuesses) {
      for (const reg of sorted) {
        const ctx = { buffer, info, opts };
        let accepts = false;
        try {
          accepts = reg.converter.match(ctx);
        } catch {}
        if (accepts) {
          try {
            const result = await reg.converter.convert(ctx);
            return normalizeResult(result);
          } catch (e) {
            failedAttempts.push({
              converterName: reg.converter.name,
              error: e instanceof Error ? e : new Error(String(e))
            });
          }
        }
      }
    }
    if (failedAttempts.length > 0) {
      throw new FileConversionError(undefined, failedAttempts);
    }
    throw new UnsupportedFormatError("Could not convert to Markdown. No converter attempted a conversion, suggesting the format is not supported.");
  }

  /**
   * Convert a source to Markdown. Strings starting with "http:", "https:",
   * "file:" or "data:" are handled as URIs, other strings as local paths, and
   * anything else is passed to detection/conversion as a buffer.
   *
   * @param source URI string, local path string, or buffer.
   * @param input Optional `{ streamInfo, _zipDepth }`.
   */
  async function convert(source, input) {
    // NOTE(review): `opts` is shared by every call on this instance, so
    // overlapping convert() calls that pass _zipDepth could observe each
    // other's value while awaiting. Confirm callers do not run them
    // concurrently.
    const prevDepth = opts._zipDepth;
    if (input?._zipDepth !== undefined) {
      opts._zipDepth = input._zipDepth;
    }
    try {
      if (typeof source === "string") {
        if (source.startsWith("http:") || source.startsWith("https:") || source.startsWith("file:") || source.startsWith("data:")) {
          return await convertUri(source, input?.streamInfo);
        }
        return await convertLocal(source, input?.streamInfo);
      }
      const info = input?.streamInfo ?? {};
      const guesses = await detectStreamInfo(source, info);
      return await runConversion(source, guesses);
    } finally {
      opts._zipDepth = prevDepth;
    }
  }

  /**
   * Read a local file and convert it. The path, file name and extension seed
   * the stream info; caller-supplied streamInfo is merged on top via
   * mergeStreamInfo.
   */
  async function convertLocal(filePath, streamInfo) {
    const ext = path4.extname(filePath);
    const filename = path4.basename(filePath);
    const base = {
      localPath: filePath,
      extension: ext || undefined,
      filename
    };
    const merged = streamInfo ? mergeStreamInfo(base, streamInfo) : base;
    const buffer = await fs2.promises.readFile(filePath);
    const guesses = await detectStreamInfo(buffer, merged);
    return runConversion(buffer, guesses);
  }

  /**
   * Convert a "file:", "data:", "http:" or "https:" URI.
   *
   * @throws Error for a file URI whose host is neither empty nor "localhost",
   *         and for any other scheme.
   */
  async function convertUri(uri, streamInfo) {
    uri = uri.trim();
    if (uri.startsWith("file:")) {
      const { netloc, path: localPath } = fileUriToPath(uri);
      if (netloc && netloc !== "localhost") {
        throw new Error(`Unsupported file URI: ${uri}. Netloc must be empty or localhost.`);
      }
      return convertLocal(localPath, streamInfo);
    }
    if (uri.startsWith("data:")) {
      const { mimeType, attributes, data } = parseDataUri(uri);
      const base = {
        mimetype: mimeType ?? undefined,
        charset: attributes.charset
      };
      const merged = streamInfo ? mergeStreamInfo(base, streamInfo) : base;
      const guesses = await detectStreamInfo(data, merged);
      return runConversion(data, guesses);
    }
    if (uri.startsWith("http:") || uri.startsWith("https:")) {
      return convertUrl(uri, streamInfo);
    }
    throw new Error(`Unsupported URI scheme: ${uri.split(":")[0]}. Supported: file, data, http, https`);
  }

  /**
   * Fetch a URL and convert the response body.
   *
   * Stream info is derived from the Content-Type header (mimetype, charset),
   * the Content-Disposition header (filename, extension) and, when no
   * filename was found there, from the URL path.
   *
   * @throws Error when the response status is not OK.
   */
  async function convertUrl(url, streamInfo) {
    const response = await fetch(url, {
      headers: {
        Accept: "text/markdown, text/html;q=0.9, text/plain;q=0.8, */*;q=0.1"
      }
    });
    if (!response.ok) {
      throw new Error(`HTTP ${response.status}: ${response.statusText}`);
    }
    let mimetype;
    let charset;
    const contentType = response.headers.get("content-type");
    if (contentType) {
      const parts = contentType.split(";");
      mimetype = parts.shift()?.trim();
      for (const part of parts) {
        const trimmed = part.trim();
        // Parameter names are case-insensitive and the value may be a
        // quoted string, e.g. Charset="utf-8".
        if (trimmed.toLowerCase().startsWith("charset=")) {
          const val = trimmed
            .slice(8)
            .trim()
            .replace(/^["']|["']$/g, "")
            .trim();
          if (val) {
            charset = val;
          }
        }
      }
    }
    let filename;
    let extension;
    const disposition = response.headers.get("content-disposition");
    if (disposition) {
      // Case-insensitive: header parameter names may be sent as "Filename=".
      const match = disposition.match(/filename=([^;]+)/i);
      if (match) {
        // Trim so whitespace before a following ";" does not end up in the
        // file name or its extension.
        filename = match[1].replace(/["']/g, "").trim();
        const ext = path4.extname(filename);
        if (ext) {
          extension = ext;
        }
      }
    }
    if (!filename) {
      try {
        const parsed = new URL(url);
        const ext = path4.extname(parsed.pathname);
        if (ext) {
          filename = path4.basename(parsed.pathname);
          extension = ext;
        }
      } catch {}
    }
    const base = {
      mimetype,
      charset,
      filename,
      extension,
      url
    };
    const merged = streamInfo ? mergeStreamInfo(base, streamInfo) : base;
    const buffer = Buffer.from(await response.arrayBuffer());
    const guesses = await detectStreamInfo(buffer, merged);
    return runConversion(buffer, guesses);
  }

  return {
    convert,
    convertLocal,
    convertUri,
    convertUrl,
    registerConverter: (conv, priority = PRIORITY_SPECIFIC) => {
      register(conv, priority);
    }
  };
}
3123
+ // src/types.ts
3124
+ import { z } from "zod/v4";
3125
// Returns a new optional-string field schema on every call.
function optionalStringField() {
  return z.string().optional();
}

// Metadata describing an input stream; every field is an optional string.
var StreamInfoSchema = z.object({
  mimetype: optionalStringField(),
  extension: optionalStringField(),
  charset: optionalStringField(),
  filename: optionalStringField(),
  localPath: optionalStringField(),
  url: optionalStringField()
});

// Result of a conversion: required Markdown text and an optional title.
var ConvertResultSchema = z.object({
  markdown: z.string(),
  title: optionalStringField()
});

// Options accepted by converters; all fields are optional.
var ConvertOptionsSchema = z.object({
  llmClient: z.any().optional(),
  llmModel: optionalStringField(),
  llmPrompt: optionalStringField(),
  exiftoolPath: optionalStringField(),
  styleMap: optionalStringField(),
  keepDataUris: z.boolean().optional(),
  youtubeTranscriptLanguages: z.array(z.string()).optional(),
  _zipDepth: z.number().optional()
});
3147
+ export {
3148
+ youtubeConverter,
3149
+ xlsxConverter,
3150
+ xlsConverter,
3151
+ wikipediaConverter,
3152
+ rssConverter,
3153
+ pptxConverter,
3154
+ plainTextConverter,
3155
+ pdfConverter,
3156
+ outlookMsgConverter,
3157
+ mergeStreamInfo,
3158
+ ipynbConverter,
3159
+ imageConverter,
3160
+ htmlToMarkdown,
3161
+ htmlConverter,
3162
+ hasCharset,
3163
+ epubConverter,
3164
+ docxConverter,
3165
+ decodeBuffer,
3166
+ csvConverter,
3167
+ createZipConverter,
3168
+ createMarkItDown,
3169
+ converter,
3170
+ byUrl,
3171
+ byMime,
3172
+ byExt,
3173
+ bingSerpConverter,
3174
+ audioConverter,
3175
+ anyOf,
3176
+ allOf,
3177
+ UnsupportedFormatError,
3178
+ StreamInfoSchema,
3179
+ MissingDependencyError,
3180
+ MarkItDownError,
3181
+ FileConversionError,
3182
+ ConvertResultSchema,
3183
+ ConvertOptionsSchema
3184
+ };