castdown-cleaners 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (127) hide show
  1. package/LICENSE +180 -0
  2. package/README.md +198 -0
  3. package/dist/index.d.ts +47 -0
  4. package/dist/index.d.ts.map +1 -0
  5. package/dist/index.js +110 -0
  6. package/dist/index.js.map +1 -0
  7. package/dist/regex/annotate-figures-tables.d.ts +3 -0
  8. package/dist/regex/annotate-figures-tables.d.ts.map +1 -0
  9. package/dist/regex/annotate-figures-tables.js +11 -0
  10. package/dist/regex/annotate-figures-tables.js.map +1 -0
  11. package/dist/regex/collapse-blank-lines.d.ts +6 -0
  12. package/dist/regex/collapse-blank-lines.d.ts.map +1 -0
  13. package/dist/regex/collapse-blank-lines.js +8 -0
  14. package/dist/regex/collapse-blank-lines.js.map +1 -0
  15. package/dist/regex/collapse-redundant-emphasis.d.ts +2 -0
  16. package/dist/regex/collapse-redundant-emphasis.d.ts.map +1 -0
  17. package/dist/regex/collapse-redundant-emphasis.js +19 -0
  18. package/dist/regex/collapse-redundant-emphasis.js.map +1 -0
  19. package/dist/regex/decode-html-entities.d.ts +2 -0
  20. package/dist/regex/decode-html-entities.d.ts.map +1 -0
  21. package/dist/regex/decode-html-entities.js +73 -0
  22. package/dist/regex/decode-html-entities.js.map +1 -0
  23. package/dist/regex/dedupe-links.d.ts +9 -0
  24. package/dist/regex/dedupe-links.d.ts.map +1 -0
  25. package/dist/regex/dedupe-links.js +16 -0
  26. package/dist/regex/dedupe-links.js.map +1 -0
  27. package/dist/regex/detect-space-tables.d.ts +29 -0
  28. package/dist/regex/detect-space-tables.d.ts.map +1 -0
  29. package/dist/regex/detect-space-tables.js +125 -0
  30. package/dist/regex/detect-space-tables.js.map +1 -0
  31. package/dist/regex/detect-toc.d.ts +14 -0
  32. package/dist/regex/detect-toc.d.ts.map +1 -0
  33. package/dist/regex/detect-toc.js +35 -0
  34. package/dist/regex/detect-toc.js.map +1 -0
  35. package/dist/regex/extract-metadata-frontmatter.d.ts +3 -0
  36. package/dist/regex/extract-metadata-frontmatter.d.ts.map +1 -0
  37. package/dist/regex/extract-metadata-frontmatter.js +39 -0
  38. package/dist/regex/extract-metadata-frontmatter.js.map +1 -0
  39. package/dist/regex/fix-footnote-markers.d.ts +2 -0
  40. package/dist/regex/fix-footnote-markers.d.ts.map +1 -0
  41. package/dist/regex/fix-footnote-markers.js +23 -0
  42. package/dist/regex/fix-footnote-markers.js.map +1 -0
  43. package/dist/regex/fix-headings.d.ts +12 -0
  44. package/dist/regex/fix-headings.d.ts.map +1 -0
  45. package/dist/regex/fix-headings.js +40 -0
  46. package/dist/regex/fix-headings.js.map +1 -0
  47. package/dist/regex/fix-ligatures.d.ts +3 -0
  48. package/dist/regex/fix-ligatures.d.ts.map +1 -0
  49. package/dist/regex/fix-ligatures.js +16 -0
  50. package/dist/regex/fix-ligatures.js.map +1 -0
  51. package/dist/regex/fix-tables.d.ts +13 -0
  52. package/dist/regex/fix-tables.d.ts.map +1 -0
  53. package/dist/regex/fix-tables.js +63 -0
  54. package/dist/regex/fix-tables.js.map +1 -0
  55. package/dist/regex/html-tables-to-gfm.d.ts +21 -0
  56. package/dist/regex/html-tables-to-gfm.d.ts.map +1 -0
  57. package/dist/regex/html-tables-to-gfm.js +76 -0
  58. package/dist/regex/html-tables-to-gfm.js.map +1 -0
  59. package/dist/regex/join-broken-lines.d.ts +10 -0
  60. package/dist/regex/join-broken-lines.d.ts.map +1 -0
  61. package/dist/regex/join-broken-lines.js +40 -0
  62. package/dist/regex/join-broken-lines.js.map +1 -0
  63. package/dist/regex/join-soft-hyphens.d.ts +9 -0
  64. package/dist/regex/join-soft-hyphens.d.ts.map +1 -0
  65. package/dist/regex/join-soft-hyphens.js +11 -0
  66. package/dist/regex/join-soft-hyphens.js.map +1 -0
  67. package/dist/regex/normalize-horizontal-rules.d.ts +2 -0
  68. package/dist/regex/normalize-horizontal-rules.d.ts.map +1 -0
  69. package/dist/regex/normalize-horizontal-rules.js +20 -0
  70. package/dist/regex/normalize-horizontal-rules.js.map +1 -0
  71. package/dist/regex/normalize-list-markers.d.ts +2 -0
  72. package/dist/regex/normalize-list-markers.d.ts.map +1 -0
  73. package/dist/regex/normalize-list-markers.js +35 -0
  74. package/dist/regex/normalize-list-markers.js.map +1 -0
  75. package/dist/regex/normalize-numbered-lists.d.ts +2 -0
  76. package/dist/regex/normalize-numbered-lists.d.ts.map +1 -0
  77. package/dist/regex/normalize-numbered-lists.js +9 -0
  78. package/dist/regex/normalize-numbered-lists.js.map +1 -0
  79. package/dist/regex/normalize-unicode.d.ts +2 -0
  80. package/dist/regex/normalize-unicode.d.ts.map +1 -0
  81. package/dist/regex/normalize-unicode.js +49 -0
  82. package/dist/regex/normalize-unicode.js.map +1 -0
  83. package/dist/regex/normalize-whitespace-in-lines.d.ts +2 -0
  84. package/dist/regex/normalize-whitespace-in-lines.d.ts.map +1 -0
  85. package/dist/regex/normalize-whitespace-in-lines.js +24 -0
  86. package/dist/regex/normalize-whitespace-in-lines.js.map +1 -0
  87. package/dist/regex/strip-boilerplate.d.ts +3 -0
  88. package/dist/regex/strip-boilerplate.d.ts.map +1 -0
  89. package/dist/regex/strip-boilerplate.js +16 -0
  90. package/dist/regex/strip-boilerplate.js.map +1 -0
  91. package/dist/regex/strip-docx-artifacts.d.ts +19 -0
  92. package/dist/regex/strip-docx-artifacts.d.ts.map +1 -0
  93. package/dist/regex/strip-docx-artifacts.js +34 -0
  94. package/dist/regex/strip-docx-artifacts.js.map +1 -0
  95. package/dist/regex/strip-empty-headings.d.ts +2 -0
  96. package/dist/regex/strip-empty-headings.d.ts.map +1 -0
  97. package/dist/regex/strip-empty-headings.js +6 -0
  98. package/dist/regex/strip-empty-headings.js.map +1 -0
  99. package/dist/regex/strip-html-artifacts.d.ts +2 -0
  100. package/dist/regex/strip-html-artifacts.d.ts.map +1 -0
  101. package/dist/regex/strip-html-artifacts.js +24 -0
  102. package/dist/regex/strip-html-artifacts.js.map +1 -0
  103. package/dist/regex/strip-page-numbers.d.ts +2 -0
  104. package/dist/regex/strip-page-numbers.d.ts.map +1 -0
  105. package/dist/regex/strip-page-numbers.js +23 -0
  106. package/dist/regex/strip-page-numbers.js.map +1 -0
  107. package/dist/regex/strip-pptx-notes.d.ts +22 -0
  108. package/dist/regex/strip-pptx-notes.d.ts.map +1 -0
  109. package/dist/regex/strip-pptx-notes.js +32 -0
  110. package/dist/regex/strip-pptx-notes.js.map +1 -0
  111. package/dist/regex/strip-repeated-headers.d.ts +2 -0
  112. package/dist/regex/strip-repeated-headers.d.ts.map +1 -0
  113. package/dist/regex/strip-repeated-headers.js +37 -0
  114. package/dist/regex/strip-repeated-headers.js.map +1 -0
  115. package/dist/regex/strip-url-tracking-params.d.ts +2 -0
  116. package/dist/regex/strip-url-tracking-params.d.ts.map +1 -0
  117. package/dist/regex/strip-url-tracking-params.js +26 -0
  118. package/dist/regex/strip-url-tracking-params.js.map +1 -0
  119. package/dist/regex/wrap-long-cell-text.d.ts +28 -0
  120. package/dist/regex/wrap-long-cell-text.d.ts.map +1 -0
  121. package/dist/regex/wrap-long-cell-text.js +66 -0
  122. package/dist/regex/wrap-long-cell-text.js.map +1 -0
  123. package/dist/util/protect-code.d.ts +6 -0
  124. package/dist/util/protect-code.d.ts.map +1 -0
  125. package/dist/util/protect-code.js +20 -0
  126. package/dist/util/protect-code.js.map +1 -0
  127. package/package.json +63 -0
@@ -0,0 +1,21 @@
1
+ /**
2
+ * htmlTablesToGfm — convert HTML `<table>` blocks to GFM pipe tables.
3
+ *
4
+ * MarkItDown emits HTML tables for DOCX/XLSX/PPTX/PDF sources. GFM viewers
5
+ * and downstream Pandoc/Typst templates expect pipe-style tables, so HTML
6
+ * tables silently break the pipeline. Run BEFORE `fixTables` so the rebuilt
7
+ * pipe tables get normalized.
8
+ *
9
+ * Strategy (regex, no DOM):
10
+ * - Find each `<table …>…</table>` block (case-insensitive, multiline).
11
+ * - Extract `<tr>` rows; per row extract `<th>`/`<td>` cells.
12
+ * - First row carrying `<th>` (else first row) → header.
13
+ * - Inside cells: strip inline tags, collapse whitespace, escape pipes,
14
+ * map `<br>` to a space.
15
+ * - rowspan/colspan flattened (GFM has no equivalent). Cells padded to
16
+ * max column count.
17
+ *
18
+ * Idempotent on already-pipe MD (no `<table>` ⇒ no-op).
19
+ */
20
+ export declare function htmlTablesToGfm(md: string): string;
21
+ //# sourceMappingURL=html-tables-to-gfm.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"html-tables-to-gfm.d.ts","sourceRoot":"","sources":["../../src/regex/html-tables-to-gfm.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;GAkBG;AACH,wBAAgB,eAAe,CAAC,EAAE,EAAE,MAAM,GAAG,MAAM,CAwBlD"}
@@ -0,0 +1,76 @@
1
/**
 * htmlTablesToGfm — convert HTML `<table>` blocks to GFM pipe tables.
 *
 * MarkItDown emits HTML tables for DOCX/XLSX/PPTX/PDF sources. GFM viewers
 * and downstream Pandoc/Typst templates expect pipe-style tables, so HTML
 * tables silently break the pipeline. Run BEFORE `fixTables` so the rebuilt
 * pipe tables get normalized.
 *
 * Strategy (regex, no DOM):
 *   - Find each `<table …>…</table>` block (case-insensitive, multiline).
 *   - Extract `<tr>` rows; per row extract `<th>`/`<td>` cells.
 *   - First row carrying `<th>` (else first row) → header; every other row
 *     becomes a body row, in document order.
 *   - Inside cells: strip inline tags, collapse whitespace, escape pipes,
 *     map `<br>` to a space.
 *   - rowspan/colspan are not expanded (GFM has no equivalent). Short rows
 *     are padded with empty cells up to the widest row.
 *
 * Idempotent on already-pipe MD (no `<table>` ⇒ no-op).
 */

// Hoisted so the patterns are compiled once per module instead of once per
// call / per row. `replace` and `matchAll` both start scanning from index 0,
// so sharing these global regexes between calls is safe.
const TABLE_RE = /<table\b[^>]*>([\s\S]*?)<\/table\s*>/gi;
const TR_RE = /<tr\b[^>]*>([\s\S]*?)<\/tr\s*>/gi;
const CELL_RE = /<(th|td)\b[^>]*>([\s\S]*?)<\/\1\s*>/gi;

/**
 * Replace every HTML table in `md` with an equivalent GFM pipe table.
 *
 * @param md Markdown text that may contain HTML `<table>` blocks.
 * @returns The text with each parsable table rewritten as a pipe table,
 *   surrounded by a leading and trailing newline. Tables in which no
 *   `<tr>` yields a cell are left untouched.
 */
export function htmlTablesToGfm(md) {
    return md.replace(TABLE_RE, (full, inner) => {
        const rows = extractRows(inner);
        if (rows.length === 0)
            return full;
        const firstThRow = rows.findIndex((r) => r.isHeader);
        const headerRowIdx = firstThRow >= 0 ? firstThRow : 0;
        const header = rows[headerRowIdx].cells;
        const body = rows.filter((_, i) => i !== headerRowIdx).map((r) => r.cells);
        // Fold instead of `Math.max(...body.map(...))`: spreading one argument
        // per row can exceed the engine's argument limit on very large tables.
        const maxCols = body.reduce((widest, r) => Math.max(widest, r.length), Math.max(header.length, 1));
        const pad = (cells) => cells.length >= maxCols
            ? cells
            : cells.concat(Array(maxCols - cells.length).fill(""));
        const lines = [
            renderRow(pad(header)),
            renderRow(Array(maxCols).fill("---")),
        ];
        for (const r of body)
            lines.push(renderRow(pad(r)));
        return "\n" + lines.join("\n") + "\n";
    });
}

/**
 * Parse the inside of a `<table>` into rows of cleaned cell text.
 *
 * @param inner Markup between `<table …>` and `</table>`.
 * @returns One `{ cells, isHeader }` entry per `<tr>` that contains at least
 *   one `<th>`/`<td>`; `isHeader` is true when any cell in the row is a `<th>`.
 */
function extractRows(inner) {
    const rows = [];
    for (const tr of inner.matchAll(TR_RE)) {
        const cells = [];
        let isHeader = false;
        for (const cell of (tr[1] ?? "").matchAll(CELL_RE)) {
            if ((cell[1] ?? "").toLowerCase() === "th")
                isHeader = true;
            cells.push(cleanCell(cell[2] ?? ""));
        }
        if (cells.length > 0)
            rows.push({ cells, isHeader });
    }
    return rows;
}

/**
 * Reduce raw cell markup to single-line text that is safe inside a pipe table.
 *
 * @param raw Markup found between a cell's opening and closing tag.
 * @returns Text with `<br>` turned into a space, remaining tags removed,
 *   whitespace runs collapsed and trimmed, and every `|` escaped as `\|`.
 */
function cleanCell(raw) {
    return raw
        .replace(/<br\s*\/?>/gi, " ")
        .replace(/<\/?[^>]+>/g, "") // strip remaining inline tags
        .replace(/\s+/g, " ")
        .trim()
        .replace(/\|/g, "\\|");
}

/**
 * Render one pipe-table line from already-cleaned cells.
 *
 * @param cells Cell texts for the row.
 * @returns The row as `| a | b |`.
 */
function renderRow(cells) {
    return "| " + cells.join(" | ") + " |";
}
76
+ //# sourceMappingURL=html-tables-to-gfm.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"html-tables-to-gfm.js","sourceRoot":"","sources":["../../src/regex/html-tables-to-gfm.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;GAkBG;AACH,MAAM,UAAU,eAAe,CAAC,EAAU;IACxC,MAAM,QAAQ,GAAG,wCAAwC,CAAC;IAC1D,OAAO,EAAE,CAAC,OAAO,CAAC,QAAQ,EAAE,CAAC,KAAK,EAAE,KAAa,EAAE,EAAE;QACnD,MAAM,IAAI,GAAG,WAAW,CAAC,KAAK,CAAC,CAAC;QAChC,IAAI,IAAI,CAAC,MAAM,KAAK,CAAC;YAAE,OAAO,KAAe,CAAC;QAE9C,MAAM,SAAS,GAAG,IAAI,CAAC,SAAS,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,QAAQ,CAAC,CAAC;QACpD,MAAM,YAAY,GAAG,SAAS,IAAI,CAAC,CAAC,CAAC,CAAC,SAAS,CAAC,CAAC,CAAC,CAAC,CAAC;QACpD,MAAM,MAAM,GAAG,IAAI,CAAC,YAAY,CAAE,CAAC,KAAK,CAAC;QACzC,MAAM,IAAI,GAAG,IAAI,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,KAAK,YAAY,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC;QAE3E,MAAM,OAAO,GAAG,IAAI,CAAC,GAAG,CAAC,MAAM,CAAC,MAAM,EAAE,GAAG,IAAI,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,MAAM,CAAC,EAAE,CAAC,CAAC,CAAC;QACzE,MAAM,GAAG,GAAG,CAAC,CAAW,EAAE,EAAE;YAC1B,MAAM,CAAC,GAAG,CAAC,GAAG,CAAC,CAAC,CAAC;YACjB,OAAO,CAAC,CAAC,MAAM,GAAG,OAAO;gBAAE,CAAC,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;YACtC,OAAO,CAAC,CAAC;QACX,CAAC,CAAC;QAEF,MAAM,KAAK,GAAa,EAAE,CAAC;QAC3B,KAAK,CAAC,IAAI,CAAC,SAAS,CAAC,GAAG,CAAC,MAAM,CAAC,CAAC,CAAC,CAAC;QACnC,KAAK,CAAC,IAAI,CAAC,SAAS,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC;QAClD,KAAK,MAAM,CAAC,IAAI,IAAI;YAAE,KAAK,CAAC,IAAI,CAAC,SAAS,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC;QACpD,OAAO,IAAI,GAAG,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,GAAG,IAAI,CAAC;IACxC,CAAC,CAAC,CAAC;AACL,CAAC;AAOD,SAAS,WAAW,CAAC,KAAa;IAChC,MAAM,GAAG,GAAU,EAAE,CAAC;IACtB,MAAM,KAAK,GAAG,kCAAkC,CAAC;IACjD,IAAI,CAAyB,CAAC;IAC9B,OAAO,CAAC,CAAC,GAAG,KAAK,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC,KAAK,IAAI,EAAE,CAAC;QACxC,MAAM,IAAI,GAAG,CAAC,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC;QACxB,MAAM,KAAK,GAAa,EAAE,CAAC;QAC3B,IAAI,QAAQ,GAAG,KAAK,CAAC;QACrB,MAAM,OAAO,GAAG,uCAAuC,CAAC;QACxD,IAAI,CAAyB,CAAC;QAC9B,OAAO,CAAC,CAAC,GAAG,OAAO,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC,KAAK,IAAI,EAAE,CAAC;YACzC,IAAI,CAAC,CA
AC,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC,CAAC,WAAW,EAAE,KAAK,IAAI;gBAAE,QAAQ,GAAG,IAAI,CAAC;YACzD,KAAK,CAAC,IAAI,CAAC,SAAS,CAAC,CAAC,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC,CAAC,CAAC;QACpC,CAAC;QACD,IAAI,KAAK,CAAC,MAAM,GAAG,CAAC;YAAE,GAAG,CAAC,IAAI,CAAC,EAAE,KAAK,EAAE,QAAQ,EAAE,CAAC,CAAC;IACtD,CAAC;IACD,OAAO,GAAG,CAAC;AACb,CAAC;AAED,SAAS,SAAS,CAAC,GAAW;IAC5B,OAAO,GAAG;SACP,OAAO,CAAC,cAAc,EAAE,GAAG,CAAC;SAC5B,OAAO,CAAC,aAAa,EAAE,EAAE,CAAC,CAAI,8BAA8B;SAC5D,OAAO,CAAC,MAAM,EAAE,GAAG,CAAC;SACpB,IAAI,EAAE;SACN,OAAO,CAAC,KAAK,EAAE,KAAK,CAAC,CAAC;AAC3B,CAAC;AAED,SAAS,SAAS,CAAC,KAAe;IAChC,OAAO,IAAI,GAAG,KAAK,CAAC,IAAI,CAAC,KAAK,CAAC,GAAG,IAAI,CAAC;AACzC,CAAC"}
@@ -0,0 +1,10 @@
1
+ /**
2
+ * joinBrokenLines — join lines that were broken mid-sentence (PDF column wrap).
3
+ *
4
+ * Rule: if line ends with a lowercase letter or comma AND next line starts
5
+ * with a lowercase letter, treat as single sentence.
6
+ *
7
+ * Conservative: skips lines inside fenced code blocks, lists, tables, headings.
8
+ */
9
+ export declare function joinBrokenLines(md: string): string;
10
+ //# sourceMappingURL=join-broken-lines.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"join-broken-lines.d.ts","sourceRoot":"","sources":["../../src/regex/join-broken-lines.ts"],"names":[],"mappings":"AAAA;;;;;;;GAOG;AACH,wBAAgB,eAAe,CAAC,EAAE,EAAE,MAAM,GAAG,MAAM,CAuBlD"}
@@ -0,0 +1,40 @@
1
/**
 * joinBrokenLines — join lines that were broken mid-sentence (PDF column wrap).
 *
 * Rule: if the previous line ends with a lowercase letter or comma AND the
 * current line starts with a lowercase letter, the two are treated as one
 * sentence and merged with a single space.
 *
 * Conservative: never joins inside ``` fenced code blocks, and never joins
 * when either line is a list item, heading, blockquote or table row —
 * including indented ones.
 *
 * @param md Markdown text.
 * @returns The text with mid-sentence line breaks removed.
 */
export function joinBrokenLines(md) {
    const out = [];
    let inCode = false;
    for (const line of md.split("\n")) {
        // Toggle first, so neither the opening nor the closing fence line is
        // evaluated as part of the surrounding prose state by mistake.
        if (/^```/.test(line.trim()))
            inCode = !inCode;
        const last = out.length - 1;
        if (!inCode && last >= 0 && shouldJoin(out[last], line)) {
            out[last] = out[last].replace(/\s+$/, "") + " " + line.trim();
            continue;
        }
        out.push(line);
    }
    return out.join("\n");
}

// Start of a list item, ordered item, ATX heading, blockquote or table row.
// Applied to trimmed text so that indented markers are recognised as well.
const STRUCTURAL_START_RE = /^(?:[-*+]\s|\d+\.\s|#{1,6}\s|>\s|\|)/;

/**
 * Decide whether `current` is the continuation of the sentence in `prev`.
 *
 * @param prev Line already emitted (possibly the result of earlier joins).
 * @param current Candidate continuation line.
 * @returns true only when both lines are non-blank, neither is structural,
 *   `prev` does not end in sentence punctuation, `prev` ends with a lowercase
 *   letter or comma, and `current` starts with a lowercase letter.
 */
function shouldJoin(prev, current) {
    const prevText = prev.trim();
    const currentText = current.trim();
    if (!prevText || !currentText)
        return false;
    if (STRUCTURAL_START_RE.test(currentText) || STRUCTURAL_START_RE.test(prevText))
        return false; // list/heading/quote/table
    if (/[.!?:;]$/.test(prevText))
        return false;
    return /[a-záéíóúñü,]$/.test(prevText) && /^[a-záéíóúñü]/.test(currentText);
}
40
+ //# sourceMappingURL=join-broken-lines.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"join-broken-lines.js","sourceRoot":"","sources":["../../src/regex/join-broken-lines.ts"],"names":[],"mappings":"AAAA;;;;;;;GAOG;AACH,MAAM,UAAU,eAAe,CAAC,EAAU;IACxC,MAAM,KAAK,GAAG,EAAE,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC;IAC7B,MAAM,GAAG,GAAa,EAAE,CAAC;IACzB,IAAI,MAAM,GAAG,KAAK,CAAC;IAEnB,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,KAAK,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;QACtC,MAAM,IAAI,GAAG,KAAK,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC;QAC5B,MAAM,IAAI,GAAG,KAAK,CAAC,CAAC,GAAG,CAAC,CAAC,IAAI,EAAE,CAAC;QAEhC,IAAI,MAAM,CAAC,IAAI,CAAC,IAAI,CAAC,IAAI,EAAE,CAAC;YAAE,MAAM,GAAG,CAAC,MAAM,CAAC;QAE/C,IACE,CAAC,MAAM;YACP,GAAG,CAAC,MAAM,GAAG,CAAC;YACd,UAAU,CAAC,GAAG,CAAC,GAAG,CAAC,MAAM,GAAG,CAAC,CAAC,IAAI,EAAE,EAAE,IAAI,CAAC,EAC3C,CAAC;YACD,GAAG,CAAC,GAAG,CAAC,MAAM,GAAG,CAAC,CAAC,GAAG,CAAC,GAAG,CAAC,GAAG,CAAC,MAAM,GAAG,CAAC,CAAC,IAAI,EAAE,CAAC,CAAC,OAAO,CAAC,MAAM,EAAE,EAAE,CAAC,GAAG,GAAG,GAAG,IAAI,CAAC,IAAI,EAAE,CAAC;YAC1F,SAAS;QACX,CAAC;QACD,GAAG,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;QACf,KAAK,IAAI,CAAC,CAAC,2CAA2C;IACxD,CAAC;IACD,OAAO,GAAG,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;AACxB,CAAC;AAED,SAAS,UAAU,CAAC,IAAY,EAAE,OAAe;IAC/C,IAAI,CAAC,IAAI,CAAC,IAAI,EAAE,IAAI,CAAC,OAAO,CAAC,IAAI,EAAE;QAAE,OAAO,KAAK,CAAC;IAClD,IAAI,sCAAsC,CAAC,IAAI,CAAC,OAAO,CAAC;QAAE,OAAO,KAAK,CAAC,CAAC,2BAA2B;IACnG,IAAI,sCAAsC,CAAC,IAAI,CAAC,IAAI,CAAC;QAAE,OAAO,KAAK,CAAC;IACpE,IAAI,UAAU,CAAC,IAAI,CAAC,IAAI,CAAC,IAAI,EAAE,CAAC;QAAE,OAAO,KAAK,CAAC;IAC/C,OAAO,mBAAmB,CAAC,IAAI,CAAC,IAAI,CAAC,IAAI,eAAe,CAAC,IAAI,CAAC,OAAO,CAAC,IAAI,EAAE,CAAC,CAAC;AAChF,CAAC"}
@@ -0,0 +1,9 @@
1
+ /**
2
+ * joinSoftHyphens — rejoin words split by line-end hyphens (common in PDF).
3
+ * "exam-\nple" → "example"
4
+ * "Conway-\nMaxwell" → unchanged (uppercase after the break, so the hyphen and line break are both kept)
5
+ *
6
+ * Heuristic: only join when next char is lowercase letter.
7
+ */
8
+ export declare function joinSoftHyphens(md: string): string;
9
+ //# sourceMappingURL=join-soft-hyphens.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"join-soft-hyphens.d.ts","sourceRoot":"","sources":["../../src/regex/join-soft-hyphens.ts"],"names":[],"mappings":"AAAA;;;;;;GAMG;AACH,wBAAgB,eAAe,CAAC,EAAE,EAAE,MAAM,GAAG,MAAM,CAElD"}
@@ -0,0 +1,11 @@
1
/**
 * joinSoftHyphens — rejoin words split by line-end hyphens (common in PDF).
 *   "exam-\nple"       → "example"
 *   "compañ-\nía"      → "compañía"
 *   "Conway-\nMaxwell" → unchanged (uppercase after the break is assumed to
 *                        be a real compound hyphen; hyphen and line break
 *                        are both kept)
 *
 * Heuristic: only join when the character after the line break is a
 * lowercase letter. The character before the hyphen may be any Unicode
 * letter, digit or underscore, and the break may be LF or CRLF.
 *
 * @param md Markdown text.
 * @returns The text with soft line-end hyphenation removed.
 */
export function joinSoftHyphens(md) {
    // `\w` is ASCII-only; `\p{L}\p{N}_` keeps its meaning but also accepts
    // accented letters, matching what is already allowed after the break.
    return md.replace(/([\p{L}\p{N}_])-\r?\n([a-záéíóúñü])/gu, "$1$2");
}
11
+ //# sourceMappingURL=join-soft-hyphens.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"join-soft-hyphens.js","sourceRoot":"","sources":["../../src/regex/join-soft-hyphens.ts"],"names":[],"mappings":"AAAA;;;;;;GAMG;AACH,MAAM,UAAU,eAAe,CAAC,EAAU;IACxC,OAAO,EAAE,CAAC,OAAO,CAAC,wBAAwB,EAAE,MAAM,CAAC,CAAC;AACtD,CAAC"}
@@ -0,0 +1,2 @@
1
+ export declare function normalizeHorizontalRules(md: string): string;
2
+ //# sourceMappingURL=normalize-horizontal-rules.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"normalize-horizontal-rules.d.ts","sourceRoot":"","sources":["../../src/regex/normalize-horizontal-rules.ts"],"names":[],"mappings":"AAKA,wBAAgB,wBAAwB,CAAC,EAAE,EAAE,MAAM,GAAG,MAAM,CAc3D"}
@@ -0,0 +1,20 @@
1
+ import { withProtectedCode } from "../util/protect-code.js";
2
// Three or more repetitions of the same rule character, nothing else on the line.
const HR_SIMPLE_RE = /^[ \t]*([-=_*~—–])\1{2,}[ \t]*$/;
// The same rule character at least three times, separated by spaces/tabs.
const HR_SPACED_RE = /^[ \t]*([-*•—–])(?:[ \t]+\1){2,}[ \t]*$/;

/**
 * normalizeHorizontalRules — rewrite every horizontal-rule look-alike as `---`.
 *
 * A line made only of `-` or `=` that directly follows a non-blank line is
 * kept as-is, because it may be a setext heading underline.
 *
 * NOTE(review): `~~~` also matches HR_SIMPLE_RE — confirm that
 * `withProtectedCode` shields tilde-fenced code blocks.
 *
 * @param md Markdown text.
 * @returns The text with rule lines normalized, processed via `withProtectedCode`.
 */
export function normalizeHorizontalRules(md) {
    return withProtectedCode(md, (text) => {
        const source = text.split("\n");
        const result = [];
        for (let idx = 0; idx < source.length; idx++) {
            const current = source[idx];
            const above = idx > 0 ? source[idx - 1] : "";
            const underlinesText = /^[-=]+$/.test(current) && above.trim().length > 0;
            const looksLikeRule = HR_SIMPLE_RE.test(current) || HR_SPACED_RE.test(current);
            result.push(!underlinesText && looksLikeRule ? "---" : current);
        }
        return result.join("\n");
    });
}
20
+ //# sourceMappingURL=normalize-horizontal-rules.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"normalize-horizontal-rules.js","sourceRoot":"","sources":["../../src/regex/normalize-horizontal-rules.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,iBAAiB,EAAE,MAAM,yBAAyB,CAAC;AAE5D,MAAM,YAAY,GAAG,iCAAiC,CAAC;AACvD,MAAM,YAAY,GAAG,yCAAyC,CAAC;AAE/D,MAAM,UAAU,wBAAwB,CAAC,EAAU;IACjD,OAAO,iBAAiB,CAAC,EAAE,EAAE,CAAC,CAAC,EAAE,EAAE;QACjC,MAAM,KAAK,GAAG,CAAC,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC;QAC5B,OAAO,KAAK;aACT,GAAG,CAAC,CAAC,IAAI,EAAE,CAAC,EAAE,EAAE;YACf,MAAM,IAAI,GAAG,KAAK,CAAC,CAAC,GAAG,CAAC,CAAC,IAAI,EAAE,CAAC;YAChC,MAAM,iBAAiB,GACrB,SAAS,CAAC,IAAI,CAAC,IAAI,CAAC,IAAI,IAAI,CAAC,IAAI,EAAE,CAAC,MAAM,GAAG,CAAC,CAAC;YACjD,IAAI,iBAAiB;gBAAE,OAAO,IAAI,CAAC;YACnC,IAAI,YAAY,CAAC,IAAI,CAAC,IAAI,CAAC,IAAI,YAAY,CAAC,IAAI,CAAC,IAAI,CAAC;gBAAE,OAAO,KAAK,CAAC;YACrE,OAAO,IAAI,CAAC;QACd,CAAC,CAAC;aACD,IAAI,CAAC,IAAI,CAAC,CAAC;IAChB,CAAC,CAAC,CAAC;AACL,CAAC"}
@@ -0,0 +1,2 @@
1
+ export declare function normalizeListMarkers(md: string): string;
2
+ //# sourceMappingURL=normalize-list-markers.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"normalize-list-markers.d.ts","sourceRoot":"","sources":["../../src/regex/normalize-list-markers.ts"],"names":[],"mappings":"AAqBA,wBAAgB,oBAAoB,CAAC,EAAE,EAAE,MAAM,GAAG,MAAM,CAgBvD"}
@@ -0,0 +1,35 @@
1
+ import { withProtectedCode } from "../util/protect-code.js";
2
// Typographic bullet → nesting depth it conventionally denotes (0 = top level).
const BULLET_LEVEL = {
    "•": 0, "▪": 0, "▶": 0, "►": 0, "◆": 0, "❖": 0, "→": 0, "»": 0,
    "◦": 1, "▫": 1, "▷": 1, "▸": 1, "◇": 1, "○": 1,
    "‣": 2, "·": 2,
};
// Symbols rendered as a checked / unchecked task-list item.
const CHECK_TRUE_CHARS = new Set(["✓", "✔", "☑"]);
const CHECK_FALSE_CHARS = new Set(["✗", "✘", "☐"]);
const ALL_BULLET_CHARS = [
    ...Object.keys(BULLET_LEVEL),
    ...CHECK_TRUE_CHARS,
    ...CHECK_FALSE_CHARS,
].join("");
// indent, one bullet symbol, whitespace, then the item text
const BULLET_LINE_RE = new RegExp(`^([ \\t]*)([${ALL_BULLET_CHARS}])\\s+(.+)$`);
// A child of a `- ` item must be indented to the parent's content column
// (two characters); a single space would leave it a sibling.
const INDENT_PER_LEVEL = 2;

/**
 * normalizeListMarkers — turn typographic bullets into Markdown list items.
 *
 *   - check marks / ballot boxes become `- [x]` / `- [ ]` task items;
 *   - every other known bullet becomes `- `, indented by two extra spaces
 *     per nesting level implied by the symbol (see BULLET_LEVEL);
 *   - the line's existing indentation is preserved in front of that.
 *
 * Lines that do not start with a known bullet are returned unchanged.
 *
 * @param md Markdown text.
 * @returns The text with list markers normalized, processed via `withProtectedCode`.
 */
export function normalizeListMarkers(md) {
    return withProtectedCode(md, (s) => s
        .split("\n")
        .map((line) => {
            const m = line.match(BULLET_LINE_RE);
            if (!m)
                return line;
            const [, indent, sym, content] = m;
            if (CHECK_TRUE_CHARS.has(sym))
                return `${indent}- [x] ${content}`;
            if (CHECK_FALSE_CHARS.has(sym))
                return `${indent}- [ ] ${content}`;
            const level = BULLET_LEVEL[sym] ?? 0;
            const extra = " ".repeat(level * INDENT_PER_LEVEL);
            return `${indent}${extra}- ${content}`;
        })
        .join("\n"));
}
35
+ //# sourceMappingURL=normalize-list-markers.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"normalize-list-markers.js","sourceRoot":"","sources":["../../src/regex/normalize-list-markers.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,iBAAiB,EAAE,MAAM,yBAAyB,CAAC;AAE5D,MAAM,YAAY,GAA2B;IAC3C,GAAG,EAAE,CAAC,EAAE,GAAG,EAAE,CAAC,EAAE,GAAG,EAAE,CAAC,EAAE,GAAG,EAAE,CAAC,EAAE,GAAG,EAAE,CAAC,EAAE,GAAG,EAAE,CAAC,EAAE,GAAG,EAAE,CAAC,EAAE,GAAG,EAAE,CAAC;IAC9D,GAAG,EAAE,CAAC,EAAE,GAAG,EAAE,CAAC,EAAE,GAAG,EAAE,CAAC,EAAE,GAAG,EAAE,CAAC,EAAE,GAAG,EAAE,CAAC,EAAE,GAAG,EAAE,CAAC;IAC9C,GAAG,EAAE,CAAC,EAAE,GAAG,EAAE,CAAC;CACf,CAAC;AAEF,MAAM,gBAAgB,GAAG,IAAI,GAAG,CAAC,CAAC,GAAG,EAAE,GAAG,EAAE,GAAG,CAAC,CAAC,CAAC;AAClD,MAAM,iBAAiB,GAAG,IAAI,GAAG,CAAC,CAAC,GAAG,EAAE,GAAG,EAAE,GAAG,CAAC,CAAC,CAAC;AAEnD,MAAM,gBAAgB,GAAG;IACvB,GAAG,MAAM,CAAC,IAAI,CAAC,YAAY,CAAC;IAC5B,GAAG,gBAAgB;IACnB,GAAG,iBAAiB;CACrB,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;AAEX,MAAM,cAAc,GAAG,IAAI,MAAM,CAC/B,eAAe,gBAAgB,aAAa,CAC7C,CAAC;AAEF,MAAM,UAAU,oBAAoB,CAAC,EAAU;IAC7C,OAAO,iBAAiB,CAAC,EAAE,EAAE,CAAC,CAAC,EAAE,EAAE;QACjC,MAAM,KAAK,GAAG,CAAC,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC;QAC5B,OAAO,KAAK;aACT,GAAG,CAAC,CAAC,IAAI,EAAE,EAAE;YACZ,MAAM,CAAC,GAAG,IAAI,CAAC,KAAK,CAAC,cAAc,CAAC,CAAC;YACrC,IAAI,CAAC,CAAC;gBAAE,OAAO,IAAI,CAAC;YACpB,MAAM,CAAC,EAAE,MAAM,EAAE,GAAG,EAAE,OAAO,CAAC,GAAG,CAAqC,CAAC;YACvE,IAAI,gBAAgB,CAAC,GAAG,CAAC,GAAG,CAAC;gBAAE,OAAO,GAAG,MAAM,SAAS,OAAO,EAAE,CAAC;YAClE,IAAI,iBAAiB,CAAC,GAAG,CAAC,GAAG,CAAC;gBAAE,OAAO,GAAG,MAAM,SAAS,OAAO,EAAE,CAAC;YACnE,MAAM,KAAK,GAAG,YAAY,CAAC,GAAG,CAAC,IAAI,CAAC,CAAC;YACrC,MAAM,KAAK,GAAG,IAAI,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC;YACjC,OAAO,GAAG,MAAM,GAAG,KAAK,KAAK,OAAO,EAAE,CAAC;QACzC,CAAC,CAAC;aACD,IAAI,CAAC,IAAI,CAAC,CAAC;IAChB,CAAC,CAAC,CAAC;AACL,CAAC"}
@@ -0,0 +1,2 @@
1
+ export declare function normalizeNumberedLists(md: string): string;
2
+ //# sourceMappingURL=normalize-numbered-lists.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"normalize-numbered-lists.d.ts","sourceRoot":"","sources":["../../src/regex/normalize-numbered-lists.ts"],"names":[],"mappings":"AAEA,wBAAgB,sBAAsB,CAAC,EAAE,EAAE,MAAM,GAAG,MAAM,CAQzD"}
@@ -0,0 +1,9 @@
1
+ import { withProtectedCode } from "../util/protect-code.js";
2
/**
 * normalizeNumberedLists — rewrite `1)`, `(1)`, `a)` and `(a)` list markers
 * at the start of a line as `1.` / `a.`.
 *
 * Only markers followed by a space or tab are touched; leading indentation
 * is preserved. Letter markers are limited to a single lowercase a–z.
 *
 * @param md Markdown text.
 * @returns The text with markers normalized, processed via `withProtectedCode`.
 */
export function normalizeNumberedLists(md) {
    // indent, optional "(", digits or one lowercase letter, ")", then a
    // space/tab that is checked but not consumed
    const markerRe = /^([ \t]*)\(?(\d+|[a-z])\)(?=[ \t])/gm;
    return withProtectedCode(md, (text) => text.replace(markerRe, "$1$2."));
}
9
+ //# sourceMappingURL=normalize-numbered-lists.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"normalize-numbered-lists.js","sourceRoot":"","sources":["../../src/regex/normalize-numbered-lists.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,iBAAiB,EAAE,MAAM,yBAAyB,CAAC;AAE5D,MAAM,UAAU,sBAAsB,CAAC,EAAU;IAC/C,OAAO,iBAAiB,CAAC,EAAE,EAAE,CAAC,CAAC,EAAE,EAAE,CACjC,CAAC;SACE,OAAO,CAAC,6BAA6B,EAAE,OAAO,CAAC;SAC/C,OAAO,CAAC,+BAA+B,EAAE,OAAO,CAAC;SACjD,OAAO,CAAC,+BAA+B,EAAE,OAAO,CAAC;SACjD,OAAO,CAAC,iCAAiC,EAAE,OAAO,CAAC,CACvD,CAAC;AACJ,CAAC"}
@@ -0,0 +1,2 @@
1
+ export declare function normalizeUnicode(md: string): string;
2
+ //# sourceMappingURL=normalize-unicode.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"normalize-unicode.d.ts","sourceRoot":"","sources":["../../src/regex/normalize-unicode.ts"],"names":[],"mappings":"AAmCA,wBAAgB,gBAAgB,CAAC,EAAE,EAAE,MAAM,GAAG,MAAM,CAkBnD"}
@@ -0,0 +1,49 @@
1
+ /**
2
+ * normalizeUnicode — fix misplaced PDF accents + NFC + smart-quote/dash mapping.
3
+ * Idempotent. Run first.
4
+ *
5
+ * PDF text extraction (pdfminer/pdfplumber) emits the accent BEFORE the vowel
6
+ * it belongs to, as either a combining mark ("est"+U+0301+"a") or, more often,
7
+ * a standalone non-ASCII spacing diacritic ("duraci´on", "electr´onicos").
8
+ * Neither composes under plain NFC. We move any such accent forward onto the
9
+ * next letter — but only when that letter has a precomposed accented form, so
10
+ * an accent is never forced onto a consonant — then NFC composes it.
11
+ *
12
+ * Only NON-ASCII spacing diacritics are handled; ASCII look-alikes (^ ~ ` ' ")
13
+ * are Markdown/code syntax and are deliberately left untouched.
14
+ */
15
+ import { withProtectedCode } from "../util/protect-code.js";
16
// Non-ASCII spacing diacritic → its combining counterpart.
// Values are written as \u escapes: a raw combining mark attaches to the
// preceding quote character in most editors and is easy to corrupt.
const SPACING_TO_COMBINING = {
    "´": "\u0301", // ´ acute → á é í ó ú
    "ˊ": "\u0301", // ˊ modifier acute
    "¨": "\u0308", // ¨ diaeresis → ä ë ï ö ü
    "˜": "\u0303", // ˜ small tilde → ã ñ õ
    "ˆ": "\u0302", // ˆ modifier circumflex → â ê î ô û
    "¸": "\u0327", // ¸ cedilla → ç
    "ˋ": "\u0300", // ˋ modifier grave → à è ì ò ù
};
const SPACING_CHARS = Object.keys(SPACING_TO_COMBINING).join("");
// an accent = a combining mark (U+0300–U+036F) OR one of the spacing
// diacritics above, optionally followed by one space/nbsp, then a letter
const ACCENT_BEFORE_LETTER = new RegExp(`([\\u0300-\\u036f${SPACING_CHARS}])[ \\u00a0]?(\\p{L})`, "gu");

/**
 * Report whether `base` followed by combining `mark` has a precomposed form.
 *
 * @param base A single letter.
 * @param mark A combining mark.
 * @returns true when NFC composes the pair into one UTF-16 code unit.
 */
function composesToOne(base, mark) {
    return (base + mark).normalize("NFC").length === 1;
}

/**
 * normalizeUnicode — move misplaced accents onto the following letter,
 * NFC-normalize, then map typographic quotes/dashes and remove invisible
 * characters.
 *
 * An accent is only moved when the following letter has a precomposed
 * accented form; otherwise the matched text is left exactly as it was.
 *
 * NOTE(review): a combining mark that already follows its base letter
 * (decomposed/NFD input such as "e" + U+0301 followed by a letter) also
 * matches and is moved forward. This assumes input in which the accent
 * precedes its letter — confirm NFD text never reaches this cleaner.
 *
 * @param md Markdown text.
 * @returns The normalized text, processed via `withProtectedCode`.
 */
export function normalizeUnicode(md) {
    return withProtectedCode(md, (s) => {
        let out = s.replace(ACCENT_BEFORE_LETTER, (full, acc, letter) => {
            const mark = SPACING_TO_COMBINING[acc] ?? acc; // spacing → combining; combining stays
            return composesToOne(letter, mark) ? letter + mark : full;
        });
        out = out.normalize("NFC");
        out = out
            .replace(/[‘’‚‛]/g, "'")
            .replace(/[“”„‟]/g, '"')
            .replace(/[–—]/g, "—")
            .replace(/\u00ad/g, "") // soft hyphen
            .replace(/\u00a0/g, " ") // nbsp → space
            .replace(/\u200b/g, ""); // zero-width space
        return out;
    });
}
49
+ //# sourceMappingURL=normalize-unicode.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"normalize-unicode.js","sourceRoot":"","sources":["../../src/regex/normalize-unicode.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;GAaG;AACH,OAAO,EAAE,iBAAiB,EAAE,MAAM,yBAAyB,CAAC;AAE5D,2DAA2D;AAC3D,MAAM,oBAAoB,GAAqC;IAC7D,GAAG,EAAE,GAAG,EAAE,iCAAiC;IAC3C,GAAG,EAAE,GAAG,EAAE,mBAAmB;IAC7B,GAAG,EAAE,GAAG,EAAE,iCAAiC;IAC3C,GAAG,EAAE,GAAG,EAAE,6BAA6B;IACvC,GAAG,EAAE,GAAG,EAAE,oCAAoC;IAC9C,GAAG,EAAE,GAAG,EAAE,yBAAyB;IACnC,GAAG,EAAE,GAAG,EAAE,iCAAiC;CAC5C,CAAC;AAEF,MAAM,aAAa,GAAG,MAAM,CAAC,IAAI,CAAC,oBAAoB,CAAC,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;AACjE,sEAAsE;AACtE,MAAM,oBAAoB,GAAG,IAAI,MAAM,CAAC,QAAQ,aAAa,uBAAuB,EAAE,IAAI,CAAC,CAAC;AAE5F,SAAS,aAAa,CAAC,IAAY,EAAE,IAAY;IAC/C,OAAO,CAAC,IAAI,GAAG,IAAI,CAAC,CAAC,SAAS,CAAC,KAAK,CAAC,CAAC,MAAM,KAAK,CAAC,CAAC;AACrD,CAAC;AAED,MAAM,UAAU,gBAAgB,CAAC,EAAU;IACzC,OAAO,iBAAiB,CAAC,EAAE,EAAE,CAAC,CAAC,EAAE,EAAE;QACjC,IAAI,GAAG,GAAG,CAAC,CAAC,OAAO,CAAC,oBAAoB,EAAE,CAAC,IAAI,EAAE,GAAW,EAAE,MAAc,EAAE,EAAE;YAC9E,MAAM,IAAI,GAAG,oBAAoB,CAAC,GAAG,CAAC,IAAI,GAAG,CAAC,CAAC,uCAAuC;YACtF,OAAO,aAAa,CAAC,MAAM,EAAE,IAAI,CAAC,CAAC,CAAC,CAAC,MAAM,GAAG,IAAI,CAAC,CAAC,CAAC,IAAI,CAAC;QAC5D,CAAC,CAAC,CAAC;QAEH,GAAG,GAAG,GAAG,CAAC,SAAS,CAAC,KAAK,CAAC,CAAC;QAE3B,GAAG,GAAG,GAAG;aACN,OAAO,CAAC,SAAS,EAAE,GAAG,CAAC;aACvB,OAAO,CAAC,SAAS,EAAE,GAAG,CAAC;aACvB,OAAO,CAAC,OAAO,EAAE,GAAG,CAAC;aACrB,OAAO,CAAC,IAAI,EAAE,EAAE,CAAC,CAAC,cAAc;aAChC,OAAO,CAAC,IAAI,EAAE,GAAG,CAAC,CAAC,eAAe;aAClC,OAAO,CAAC,IAAI,EAAE,EAAE,CAAC,CAAC,CAAC,mBAAmB;QACzC,OAAO,GAAG,CAAC;IACb,CAAC,CAAC,CAAC;AACL,CAAC"}
@@ -0,0 +1,2 @@
1
+ export declare function normalizeWhitespaceInLines(md: string): string;
2
+ //# sourceMappingURL=normalize-whitespace-in-lines.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"normalize-whitespace-in-lines.d.ts","sourceRoot":"","sources":["../../src/regex/normalize-whitespace-in-lines.ts"],"names":[],"mappings":"AAEA,wBAAgB,0BAA0B,CAAC,EAAE,EAAE,MAAM,GAAG,MAAM,CAmB7D"}
@@ -0,0 +1,24 @@
1
+ import { withProtectedCode } from "../util/protect-code.js";
2
/**
 * normalizeWhitespaceInLines — tidy whitespace line by line.
 *
 *   - a whitespace-only line becomes an empty line;
 *   - a line ending in exactly two spaces (Markdown hard line break) is
 *     returned untouched;
 *   - otherwise trailing whitespace is stripped and, unless the line is
 *     code-indented (four spaces or a tab), every run of two or more
 *     spaces/tabs between non-space characters collapses to one space.
 *
 * @param md Markdown text.
 * @returns The cleaned text, processed via `withProtectedCode`.
 */
export function normalizeWhitespaceInLines(md) {
    return withProtectedCode(md, (s) => s
        .split("\n")
        .map((line) => {
            // Whitespace-only line → empty
            if (/^\s+$/.test(line))
                return "";
            // Preserve double-space hard line break at end (exactly 2 trailing
            // spaces). `{2}` keeps the count explicit.
            if (/[^ \t] {2}$/.test(line))
                return line;
            // Strip trailing whitespace
            const trimmed = line.trimEnd();
            // Code-indented lines keep their internal spacing
            if (/^(?: {4}|\t)/.test(trimmed))
                return trimmed;
            // Lookahead instead of a consumed right-hand character, so that
            // adjacent gaps ("a  b  c") are all collapsed, not every other one.
            return trimmed.replace(/(\S)[ \t]{2,}(?=\S)/g, "$1 ");
        })
        .join("\n"));
}
24
+ //# sourceMappingURL=normalize-whitespace-in-lines.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"normalize-whitespace-in-lines.js","sourceRoot":"","sources":["../../src/regex/normalize-whitespace-in-lines.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,iBAAiB,EAAE,MAAM,yBAAyB,CAAC;AAE5D,MAAM,UAAU,0BAA0B,CAAC,EAAU;IACnD,OAAO,iBAAiB,CAAC,EAAE,EAAE,CAAC,CAAC,EAAE,EAAE;QACjC,MAAM,KAAK,GAAG,CAAC,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC;QAC5B,OAAO,KAAK;aACT,GAAG,CAAC,CAAC,IAAI,EAAE,EAAE;YACZ,+BAA+B;YAC/B,IAAI,OAAO,CAAC,IAAI,CAAC,IAAI,CAAC;gBAAE,OAAO,EAAE,CAAC;YAClC,2EAA2E;YAC3E,IAAI,WAAW,CAAC,IAAI,CAAC,IAAI,CAAC;gBAAE,OAAO,IAAI,CAAC;YACxC,4BAA4B;YAC5B,IAAI,GAAG,IAAI,CAAC,OAAO,EAAE,CAAC;YACtB,+DAA+D;YAC/D,IAAI,CAAC,cAAc,CAAC,IAAI,CAAC,IAAI,CAAC,EAAE,CAAC;gBAC/B,IAAI,GAAG,IAAI,CAAC,OAAO,CAAC,0BAA0B,EAAE,OAAO,CAAC,CAAC;YAC3D,CAAC;YACD,OAAO,IAAI,CAAC;QACd,CAAC,CAAC;aACD,IAAI,CAAC,IAAI,CAAC,CAAC;IAChB,CAAC,CAAC,CAAC;AACL,CAAC"}
@@ -0,0 +1,3 @@
1
+ import type { CleanOptions } from "../index.js";
2
/**
 * Removes boilerplate lines from `md`.
 *
 * Per strip-boilerplate.js the removed lines are (case-insensitively): lines
 * mentioning `©`, `&copy;` or `Copyright` followed later by a four-digit
 * number, standalone "All rights reserved" lines, and standalone
 * CONFIDENTIAL / PROPRIETARY / FOR INTERNAL USE ONLY / DRAFT markers.
 *
 * @param md Markdown source text.
 * @param opts When `opts.keepBoilerplate` is truthy, `md` is returned unchanged.
 * @returns The text with matching lines blanked.
 */
+ export declare function stripBoilerplate(md: string, opts?: CleanOptions): string;
3
+ //# sourceMappingURL=strip-boilerplate.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"strip-boilerplate.d.ts","sourceRoot":"","sources":["../../src/regex/strip-boilerplate.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,YAAY,EAAE,MAAM,aAAa,CAAC;AAQhD,wBAAgB,gBAAgB,CAAC,EAAE,EAAE,MAAM,EAAE,IAAI,CAAC,EAAE,YAAY,GAAG,MAAM,CAQxE"}
@@ -0,0 +1,16 @@
1
// Line-level boilerplate matchers, applied in order (all global, multiline,
// case-insensitive):
//   1. any line mentioning ©, &copy; or "Copyright" followed later by a
//      four-digit number;
//   2. a standalone "All rights reserved" line;
//   3. a standalone confidentiality / draft marker.
const BOILERPLATE_PATTERNS = [
  /^[^\n]*(?:©|&copy;|\bCopyright\b)[^\n]*\d{4}[^\n]*$/gim,
  /^\s*All rights reserved\.?\s*$/gim,
  /^\s*(?:CONFIDENTIAL|PROPRIETARY|FOR INTERNAL USE ONLY|DRAFT)\s*\.?\s*$/gim,
];

/**
 * Blank out boilerplate lines (copyright notices, "All rights reserved",
 * confidentiality markers) in a markdown document.
 *
 * @param md Markdown source text.
 * @param opts Optional settings; a truthy `keepBoilerplate` disables the pass.
 * @returns `md` unchanged when boilerplate is kept, otherwise the text with
 *          every pattern match replaced by the empty string.
 */
export function stripBoilerplate(md, opts) {
  if (opts?.keepBoilerplate) return md;
  // String.prototype.replace restarts a global regex from index 0 itself, so
  // the shared pattern objects can be reused across calls without a reset.
  return BOILERPLATE_PATTERNS.reduce((text, pattern) => text.replace(pattern, ""), md);
}
16
+ //# sourceMappingURL=strip-boilerplate.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"strip-boilerplate.js","sourceRoot":"","sources":["../../src/regex/strip-boilerplate.ts"],"names":[],"mappings":"AAEA,MAAM,oBAAoB,GAAa;IACrC,wDAAwD;IACxD,mCAAmC;IACnC,2EAA2E;CAC5E,CAAC;AAEF,MAAM,UAAU,gBAAgB,CAAC,EAAU,EAAE,IAAmB;IAC9D,IAAI,IAAI,EAAE,eAAe;QAAE,OAAO,EAAE,CAAC;IACrC,IAAI,GAAG,GAAG,EAAE,CAAC;IACb,KAAK,MAAM,EAAE,IAAI,oBAAoB,EAAE,CAAC;QACtC,EAAE,CAAC,SAAS,GAAG,CAAC,CAAC;QACjB,GAAG,GAAG,GAAG,CAAC,OAAO,CAAC,EAAE,EAAE,EAAE,CAAC,CAAC;IAC5B,CAAC;IACD,OAAO,GAAG,CAAC;AACb,CAAC"}
@@ -0,0 +1,19 @@
1
+ /**
2
+ * stripDocxArtifacts — remove pandoc DOCX-conversion leftovers.
3
+ *
4
+ * When pandoc converts .docx → markdown it emits span attributes for
5
+ * formatting that has no GFM equivalent: `[text]{.underline}`,
6
+ * `[text]{.smallcaps}`, `[text]{.mark}`, `[text]{.highlight}`.
7
+ * These pass through remark unchanged and end up in final output as
8
+ * literal punctuation noise.
9
+ *
10
+ * Also handles:
11
+ * - `[text]{.strikethrough}` → `~~text~~` (GFM has this)
12
+ * - `&nbsp;` → regular space
13
+ * - `\\ ` (pandoc hard line-break) → single space at end-of-line
14
+ * - `<!-- {.XXX} -->` comment-style span leftovers
15
+ *
16
+ * Safe on non-docx input — zero matches on plain markdown.
17
+ */
18
+ export declare function stripDocxArtifacts(md: string): string;
19
+ //# sourceMappingURL=strip-docx-artifacts.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"strip-docx-artifacts.d.ts","sourceRoot":"","sources":["../../src/regex/strip-docx-artifacts.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;GAgBG;AACH,wBAAgB,kBAAkB,CAAC,EAAE,EAAE,MAAM,GAAG,MAAM,CAqBrD"}
@@ -0,0 +1,34 @@
1
/**
 * stripDocxArtifacts — remove pandoc DOCX-conversion leftovers.
 *
 * When pandoc converts .docx → markdown it emits span attributes for
 * formatting that has no GFM equivalent: `[text]{.underline}`,
 * `[text]{.smallcaps}`, `[text]{.mark}`, `[text]{.highlight}`.
 * These pass through remark unchanged and end up in final output as
 * literal punctuation noise.
 *
 * Also handles:
 *   - `[text]{.strikethrough}` → `~~text~~` (GFM has this)
 *   - `[text]{.a .b}` spans carrying only class attributes → bare text
 *     (spans with an `#id` or `key=val` attribute are left untouched)
 *   - `&nbsp;` → regular space
 *   - a trailing backslash + space at end of line (pandoc hard line-break
 *     leftover) → removed
 *   - `<!-- {.XXX} -->` comment-style span leftovers → removed
 *
 * Intended to be safe on non-docx input: none of these patterns occur in
 * ordinary markdown links or emphasis.
 *
 * @param md Markdown produced by (or possibly produced by) pandoc.
 * @returns The markdown with the artifacts above removed or converted.
 */
export function stripDocxArtifacts(md) {
  let out = md;
  // [text]{.strikethrough} → ~~text~~ (semantic preservation). Must run
  // before the generic class-only rule below, which would drop the meaning.
  out = out.replace(/\[([^\]]+)\]\{\.strikethrough\}/g, "~~$1~~");
  // [text]{.class} spans with no GFM equivalent → bare text
  out = out.replace(/\[([^\]]+)\]\{\.(?:underline|smallcaps|mark|highlight|subscript|superscript)\}/g, "$1");
  // Generic span with only class attribute(s): [text]{.anything .else}.
  // Conservative: the attribute block must consist solely of .class tokens,
  // so spans carrying an #id or key=val are kept as-is.
  out = out.replace(/\[([^\]]+)\]\{(?:\.[a-z][a-z0-9-]*\s*)+\}/g, "$1");
  // &nbsp; entities → regular space (documented behavior that was previously
  // missing from the implementation).
  out = out.replace(/&nbsp;/g, " ");
  // Pandoc hard line-break leftover: trailing "\ " before newline
  out = out.replace(/\\ $/gm, "");
  // <!-- {.someclass} --> inline comments from div-fence conversion
  out = out.replace(/<!--\s*\{[^}]+\}\s*-->/g, "");
  return out;
}
34
+ //# sourceMappingURL=strip-docx-artifacts.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"strip-docx-artifacts.js","sourceRoot":"","sources":["../../src/regex/strip-docx-artifacts.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;GAgBG;AACH,MAAM,UAAU,kBAAkB,CAAC,EAAU;IAC3C,IAAI,GAAG,GAAG,EAAE,CAAC;IAEb,4DAA4D;IAC5D,GAAG,GAAG,GAAG,CAAC,OAAO,CAAC,kCAAkC,EAAE,QAAQ,CAAC,CAAC;IAEhE,0DAA0D;IAC1D,GAAG,GAAG,GAAG,CAAC,OAAO,CAAC,iFAAiF,EAAE,IAAI,CAAC,CAAC;IAE3G,iEAAiE;IACjE,qEAAqE;IACrE,2EAA2E;IAC3E,GAAG,GAAG,GAAG,CAAC,OAAO,CAAC,4CAA4C,EAAE,IAAI,CAAC,CAAC;IAEtE,uDAAuD;IACvD,GAAG,GAAG,GAAG,CAAC,OAAO,CAAC,QAAQ,EAAE,EAAE,CAAC,CAAC;IAEhC,kEAAkE;IAClE,GAAG,GAAG,GAAG,CAAC,OAAO,CAAC,yBAAyB,EAAE,EAAE,CAAC,CAAC;IAEjD,OAAO,GAAG,CAAC;AACb,CAAC"}
@@ -0,0 +1,2 @@
1
/**
 * Blanks ATX heading lines (one to six `#`) that carry no real title.
 *
 * Per strip-empty-headings.js a heading is removed when it has no text at
 * all, or when its only text is a section number such as `1.2.`, a run of
 * roman-numeral letters followed by a dot, or a single dash, period, colon
 * or semicolon. Only the line's content is removed; its newline stays.
 *
 * @param md Markdown source text.
 * @returns The text with such heading lines emptied.
 */
+ export declare function stripEmptyHeadings(md: string): string;
2
+ //# sourceMappingURL=strip-empty-headings.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"strip-empty-headings.d.ts","sourceRoot":"","sources":["../../src/regex/strip-empty-headings.ts"],"names":[],"mappings":"AAIA,wBAAgB,kBAAkB,CAAC,EAAE,EAAE,MAAM,GAAG,MAAM,CAErD"}
@@ -0,0 +1,6 @@
1
// An ATX heading marker (1–6 hashes) followed by nothing but spaces/tabs.
const EMPTY_RE = /^#{1,6}[ \t]*$/gm;
// An ATX heading whose whole text is a section number ("1.", "2.3."), a run
// of roman-numeral letters plus a dot ("iv."), or one punctuation mark.
const PUNCT_ONLY_RE = /^(#{1,6})[ \t]+(\d+\.[\d.]*|[ivxlcdmIVXLCDM]+\.|[-–—.:;])[ \t]*$/gm;

/**
 * Blank out heading lines that carry no real title text.
 *
 * Only the content of a matching line is removed; its line terminator stays,
 * so the line count of the document does not change.
 *
 * @param md Markdown source text.
 * @returns The text with empty and number/punctuation-only headings emptied.
 */
export function stripEmptyHeadings(md) {
  const withoutBareMarkers = md.replace(EMPTY_RE, "");
  const withoutNumberOnly = withoutBareMarkers.replace(PUNCT_ONLY_RE, "");
  return withoutNumberOnly;
}
6
+ //# sourceMappingURL=strip-empty-headings.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"strip-empty-headings.js","sourceRoot":"","sources":["../../src/regex/strip-empty-headings.ts"],"names":[],"mappings":"AAAA,MAAM,QAAQ,GAAG,kBAAkB,CAAC;AACpC,MAAM,aAAa,GACjB,oEAAoE,CAAC;AAEvE,MAAM,UAAU,kBAAkB,CAAC,EAAU;IAC3C,OAAO,EAAE,CAAC,OAAO,CAAC,QAAQ,EAAE,EAAE,CAAC,CAAC,OAAO,CAAC,aAAa,EAAE,EAAE,CAAC,CAAC;AAC7D,CAAC"}
@@ -0,0 +1,2 @@
1
/**
 * Converts or strips inline HTML left in markdown text.
 *
 * Per strip-html-artifacts.js: `<strong>`/`<b>` become `**…**`, `<em>`/`<i>`
 * become `_…_`, `<s>`/`<del>` become `~~…~~`, `<code>` becomes backticks, an
 * `<hr>` alone on a line becomes `---`, `<br>` becomes a space (a line holding
 * only `<br>` is emptied), `<sup>`/`<sub>`/`<div>`/`<span>` pairs are
 * unwrapped, and a fixed allow-list of structural tags (`<p>`, `<section>`,
 * `<figure>`, …) is dropped while keeping the enclosed content. Any other tag
 * is left as-is.
 *
 * NOTE(review): the work is routed through the `withProtectedCode` helper,
 * presumably so code spans/blocks are not rewritten — confirm in
 * util/protect-code.
 *
 * @param md Markdown source text, possibly containing inline HTML.
 * @returns The text with the HTML above converted or removed.
 */
+ export declare function stripHtmlArtifacts(md: string): string;
2
+ //# sourceMappingURL=strip-html-artifacts.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"strip-html-artifacts.d.ts","sourceRoot":"","sources":["../../src/regex/strip-html-artifacts.ts"],"names":[],"mappings":"AAKA,wBAAgB,kBAAkB,CAAC,EAAE,EAAE,MAAM,GAAG,MAAM,CAuBrD"}
@@ -0,0 +1,24 @@
1
+ import { withProtectedCode } from "../util/protect-code.js";
2
+ const SAFE_TAGS = /^\/?(p|section|article|header|footer|main|figure|figcaption|caption|mark|abbr|cite|q|ins|u)$/i;
3
+ export function stripHtmlArtifacts(md) {
4
+ return withProtectedCode(md, (s) => {
5
+ let out = s;
6
+ out = out.replace(/^[ \t]*<br\s*\/?>[ \t]*$/gim, "");
7
+ out = out.replace(/<br\s*\/?>/gi, " ");
8
+ out = out.replace(/^[ \t]*<hr\s*\/?>[ \t]*$/gim, "\n---\n");
9
+ out = out.replace(/<strong\b[^>]*>([\s\S]*?)<\/strong>/gi, "**$1**");
10
+ out = out.replace(/<b\b[^>]*>([\s\S]*?)<\/b>/gi, "**$1**");
11
+ out = out.replace(/<em\b[^>]*>([\s\S]*?)<\/em>/gi, "_$1_");
12
+ out = out.replace(/<i\b[^>]*>([\s\S]*?)<\/i>/gi, "_$1_");
13
+ out = out.replace(/<s\b[^>]*>([\s\S]*?)<\/s>/gi, "~~$1~~");
14
+ out = out.replace(/<del\b[^>]*>([\s\S]*?)<\/del>/gi, "~~$1~~");
15
+ out = out.replace(/<code\b[^>]*>([\s\S]*?)<\/code>/gi, "`$1`");
16
+ out = out.replace(/<sup\b[^>]*>([\s\S]*?)<\/sup>/gi, "$1");
17
+ out = out.replace(/<sub\b[^>]*>([\s\S]*?)<\/sub>/gi, "$1");
18
+ out = out.replace(/<div\b[^>]*>([\s\S]*?)<\/div>/gi, "$1");
19
+ out = out.replace(/<span\b[^>]*>([\s\S]*?)<\/span>/gi, "$1");
20
+ out = out.replace(/<(\/?)([a-zA-Z][a-zA-Z0-9]*)\b[^>]*>/g, (full, slash, tag) => (SAFE_TAGS.test(`${slash}${tag}`) ? "" : full));
21
+ return out;
22
+ });
23
+ }
24
+ //# sourceMappingURL=strip-html-artifacts.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"strip-html-artifacts.js","sourceRoot":"","sources":["../../src/regex/strip-html-artifacts.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,iBAAiB,EAAE,MAAM,yBAAyB,CAAC;AAE5D,MAAM,SAAS,GACb,+FAA+F,CAAC;AAElG,MAAM,UAAU,kBAAkB,CAAC,EAAU;IAC3C,OAAO,iBAAiB,CAAC,EAAE,EAAE,CAAC,CAAC,EAAE,EAAE;QACjC,IAAI,GAAG,GAAG,CAAC,CAAC;QACZ,GAAG,GAAG,GAAG,CAAC,OAAO,CAAC,6BAA6B,EAAE,EAAE,CAAC,CAAC;QACrD,GAAG,GAAG,GAAG,CAAC,OAAO,CAAC,cAAc,EAAE,GAAG,CAAC,CAAC;QACvC,GAAG,GAAG,GAAG,CAAC,OAAO,CAAC,6BAA6B,EAAE,SAAS,CAAC,CAAC;QAC5D,GAAG,GAAG,GAAG,CAAC,OAAO,CAAC,uCAAuC,EAAE,QAAQ,CAAC,CAAC;QACrE,GAAG,GAAG,GAAG,CAAC,OAAO,CAAC,6BAA6B,EAAE,QAAQ,CAAC,CAAC;QAC3D,GAAG,GAAG,GAAG,CAAC,OAAO,CAAC,+BAA+B,EAAE,MAAM,CAAC,CAAC;QAC3D,GAAG,GAAG,GAAG,CAAC,OAAO,CAAC,6BAA6B,EAAE,MAAM,CAAC,CAAC;QACzD,GAAG,GAAG,GAAG,CAAC,OAAO,CAAC,6BAA6B,EAAE,QAAQ,CAAC,CAAC;QAC3D,GAAG,GAAG,GAAG,CAAC,OAAO,CAAC,iCAAiC,EAAE,QAAQ,CAAC,CAAC;QAC/D,GAAG,GAAG,GAAG,CAAC,OAAO,CAAC,mCAAmC,EAAE,MAAM,CAAC,CAAC;QAC/D,GAAG,GAAG,GAAG,CAAC,OAAO,CAAC,iCAAiC,EAAE,IAAI,CAAC,CAAC;QAC3D,GAAG,GAAG,GAAG,CAAC,OAAO,CAAC,iCAAiC,EAAE,IAAI,CAAC,CAAC;QAC3D,GAAG,GAAG,GAAG,CAAC,OAAO,CAAC,iCAAiC,EAAE,IAAI,CAAC,CAAC;QAC3D,GAAG,GAAG,GAAG,CAAC,OAAO,CAAC,mCAAmC,EAAE,IAAI,CAAC,CAAC;QAC7D,GAAG,GAAG,GAAG,CAAC,OAAO,CACf,uCAAuC,EACvC,CAAC,IAAI,EAAE,KAAK,EAAE,GAAG,EAAE,EAAE,CAAC,CAAC,SAAS,CAAC,IAAI,CAAC,GAAG,KAAK,GAAG,GAAG,EAAE,CAAC,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,IAAI,CAAC,CACrE,CAAC;QACF,OAAO,GAAG,CAAC;IACb,CAAC,CAAC,CAAC;AACL,CAAC"}
@@ -0,0 +1,2 @@
1
/**
 * Removes lines that consist solely of a page marker.
 *
 * Per strip-page-numbers.js the recognised markers are: a bare number of one
 * to four digits, `Page N` / `Page N of M` (case-insensitive), `- N -`, and
 * `[N]`. Lines with any other content are left untouched.
 *
 * @param md Markdown source text.
 * @returns The text with page-marker lines emptied.
 */
+ export declare function stripPageNumbers(md: string): string;
2
+ //# sourceMappingURL=strip-page-numbers.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"strip-page-numbers.d.ts","sourceRoot":"","sources":["../../src/regex/strip-page-numbers.ts"],"names":[],"mappings":"AAiBA,wBAAgB,gBAAgB,CAAC,EAAE,EAAE,MAAM,GAAG,MAAM,CAInD"}
@@ -0,0 +1,23 @@
1
/**
 * stripPageNumbers — remove lines that are *only* a page marker.
 *
 * Markers recognised (each must be the entire line, surrounding whitespace
 * allowed):
 *   "12"              bare number, one to four digits
 *   "Page 12"         case-insensitive
 *   "Page 12 of 340"  case-insensitive
 *   "- 12 -"
 *   "[12]"
 */
const PATTERNS = [
  /^\s*\d{1,4}\s*$/gm,
  /^\s*Page\s+\d+(\s+of\s+\d+)?\s*$/gim,
  /^\s*-\s*\d{1,4}\s*-\s*$/gm,
  /^\s*\[\s*\d{1,4}\s*\]\s*$/gm,
];

/**
 * Apply every page-marker pattern in order, replacing each match with the
 * empty string.
 *
 * @param md Markdown source text.
 * @returns The text with page-marker lines emptied.
 */
export function stripPageNumbers(md) {
  return PATTERNS.reduce((text, marker) => text.replace(marker, ""), md);
}
23
+ //# sourceMappingURL=strip-page-numbers.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"strip-page-numbers.js","sourceRoot":"","sources":["../../src/regex/strip-page-numbers.ts"],"names":[],"mappings":"AAAA;;;;;;;;;GASG;AACH,MAAM,QAAQ,GAAG;IACf,mBAAmB;IACnB,qCAAqC;IACrC,2BAA2B;IAC3B,6BAA6B;CAC9B,CAAC;AAEF,MAAM,UAAU,gBAAgB,CAAC,EAAU;IACzC,IAAI,GAAG,GAAG,EAAE,CAAC;IACb,KAAK,MAAM,CAAC,IAAI,QAAQ;QAAE,GAAG,GAAG,GAAG,CAAC,OAAO,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC;IACnD,OAAO,GAAG,CAAC;AACb,CAAC"}