castdown-cleaners 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (127) hide show
  1. package/LICENSE +180 -0
  2. package/README.md +198 -0
  3. package/dist/index.d.ts +47 -0
  4. package/dist/index.d.ts.map +1 -0
  5. package/dist/index.js +110 -0
  6. package/dist/index.js.map +1 -0
  7. package/dist/regex/annotate-figures-tables.d.ts +3 -0
  8. package/dist/regex/annotate-figures-tables.d.ts.map +1 -0
  9. package/dist/regex/annotate-figures-tables.js +11 -0
  10. package/dist/regex/annotate-figures-tables.js.map +1 -0
  11. package/dist/regex/collapse-blank-lines.d.ts +6 -0
  12. package/dist/regex/collapse-blank-lines.d.ts.map +1 -0
  13. package/dist/regex/collapse-blank-lines.js +8 -0
  14. package/dist/regex/collapse-blank-lines.js.map +1 -0
  15. package/dist/regex/collapse-redundant-emphasis.d.ts +2 -0
  16. package/dist/regex/collapse-redundant-emphasis.d.ts.map +1 -0
  17. package/dist/regex/collapse-redundant-emphasis.js +19 -0
  18. package/dist/regex/collapse-redundant-emphasis.js.map +1 -0
  19. package/dist/regex/decode-html-entities.d.ts +2 -0
  20. package/dist/regex/decode-html-entities.d.ts.map +1 -0
  21. package/dist/regex/decode-html-entities.js +73 -0
  22. package/dist/regex/decode-html-entities.js.map +1 -0
  23. package/dist/regex/dedupe-links.d.ts +9 -0
  24. package/dist/regex/dedupe-links.d.ts.map +1 -0
  25. package/dist/regex/dedupe-links.js +16 -0
  26. package/dist/regex/dedupe-links.js.map +1 -0
  27. package/dist/regex/detect-space-tables.d.ts +29 -0
  28. package/dist/regex/detect-space-tables.d.ts.map +1 -0
  29. package/dist/regex/detect-space-tables.js +125 -0
  30. package/dist/regex/detect-space-tables.js.map +1 -0
  31. package/dist/regex/detect-toc.d.ts +14 -0
  32. package/dist/regex/detect-toc.d.ts.map +1 -0
  33. package/dist/regex/detect-toc.js +35 -0
  34. package/dist/regex/detect-toc.js.map +1 -0
  35. package/dist/regex/extract-metadata-frontmatter.d.ts +3 -0
  36. package/dist/regex/extract-metadata-frontmatter.d.ts.map +1 -0
  37. package/dist/regex/extract-metadata-frontmatter.js +39 -0
  38. package/dist/regex/extract-metadata-frontmatter.js.map +1 -0
  39. package/dist/regex/fix-footnote-markers.d.ts +2 -0
  40. package/dist/regex/fix-footnote-markers.d.ts.map +1 -0
  41. package/dist/regex/fix-footnote-markers.js +23 -0
  42. package/dist/regex/fix-footnote-markers.js.map +1 -0
  43. package/dist/regex/fix-headings.d.ts +12 -0
  44. package/dist/regex/fix-headings.d.ts.map +1 -0
  45. package/dist/regex/fix-headings.js +40 -0
  46. package/dist/regex/fix-headings.js.map +1 -0
  47. package/dist/regex/fix-ligatures.d.ts +3 -0
  48. package/dist/regex/fix-ligatures.d.ts.map +1 -0
  49. package/dist/regex/fix-ligatures.js +16 -0
  50. package/dist/regex/fix-ligatures.js.map +1 -0
  51. package/dist/regex/fix-tables.d.ts +13 -0
  52. package/dist/regex/fix-tables.d.ts.map +1 -0
  53. package/dist/regex/fix-tables.js +63 -0
  54. package/dist/regex/fix-tables.js.map +1 -0
  55. package/dist/regex/html-tables-to-gfm.d.ts +21 -0
  56. package/dist/regex/html-tables-to-gfm.d.ts.map +1 -0
  57. package/dist/regex/html-tables-to-gfm.js +76 -0
  58. package/dist/regex/html-tables-to-gfm.js.map +1 -0
  59. package/dist/regex/join-broken-lines.d.ts +10 -0
  60. package/dist/regex/join-broken-lines.d.ts.map +1 -0
  61. package/dist/regex/join-broken-lines.js +40 -0
  62. package/dist/regex/join-broken-lines.js.map +1 -0
  63. package/dist/regex/join-soft-hyphens.d.ts +9 -0
  64. package/dist/regex/join-soft-hyphens.d.ts.map +1 -0
  65. package/dist/regex/join-soft-hyphens.js +11 -0
  66. package/dist/regex/join-soft-hyphens.js.map +1 -0
  67. package/dist/regex/normalize-horizontal-rules.d.ts +2 -0
  68. package/dist/regex/normalize-horizontal-rules.d.ts.map +1 -0
  69. package/dist/regex/normalize-horizontal-rules.js +20 -0
  70. package/dist/regex/normalize-horizontal-rules.js.map +1 -0
  71. package/dist/regex/normalize-list-markers.d.ts +2 -0
  72. package/dist/regex/normalize-list-markers.d.ts.map +1 -0
  73. package/dist/regex/normalize-list-markers.js +35 -0
  74. package/dist/regex/normalize-list-markers.js.map +1 -0
  75. package/dist/regex/normalize-numbered-lists.d.ts +2 -0
  76. package/dist/regex/normalize-numbered-lists.d.ts.map +1 -0
  77. package/dist/regex/normalize-numbered-lists.js +9 -0
  78. package/dist/regex/normalize-numbered-lists.js.map +1 -0
  79. package/dist/regex/normalize-unicode.d.ts +2 -0
  80. package/dist/regex/normalize-unicode.d.ts.map +1 -0
  81. package/dist/regex/normalize-unicode.js +49 -0
  82. package/dist/regex/normalize-unicode.js.map +1 -0
  83. package/dist/regex/normalize-whitespace-in-lines.d.ts +2 -0
  84. package/dist/regex/normalize-whitespace-in-lines.d.ts.map +1 -0
  85. package/dist/regex/normalize-whitespace-in-lines.js +24 -0
  86. package/dist/regex/normalize-whitespace-in-lines.js.map +1 -0
  87. package/dist/regex/strip-boilerplate.d.ts +3 -0
  88. package/dist/regex/strip-boilerplate.d.ts.map +1 -0
  89. package/dist/regex/strip-boilerplate.js +16 -0
  90. package/dist/regex/strip-boilerplate.js.map +1 -0
  91. package/dist/regex/strip-docx-artifacts.d.ts +19 -0
  92. package/dist/regex/strip-docx-artifacts.d.ts.map +1 -0
  93. package/dist/regex/strip-docx-artifacts.js +34 -0
  94. package/dist/regex/strip-docx-artifacts.js.map +1 -0
  95. package/dist/regex/strip-empty-headings.d.ts +2 -0
  96. package/dist/regex/strip-empty-headings.d.ts.map +1 -0
  97. package/dist/regex/strip-empty-headings.js +6 -0
  98. package/dist/regex/strip-empty-headings.js.map +1 -0
  99. package/dist/regex/strip-html-artifacts.d.ts +2 -0
  100. package/dist/regex/strip-html-artifacts.d.ts.map +1 -0
  101. package/dist/regex/strip-html-artifacts.js +24 -0
  102. package/dist/regex/strip-html-artifacts.js.map +1 -0
  103. package/dist/regex/strip-page-numbers.d.ts +2 -0
  104. package/dist/regex/strip-page-numbers.d.ts.map +1 -0
  105. package/dist/regex/strip-page-numbers.js +23 -0
  106. package/dist/regex/strip-page-numbers.js.map +1 -0
  107. package/dist/regex/strip-pptx-notes.d.ts +22 -0
  108. package/dist/regex/strip-pptx-notes.d.ts.map +1 -0
  109. package/dist/regex/strip-pptx-notes.js +32 -0
  110. package/dist/regex/strip-pptx-notes.js.map +1 -0
  111. package/dist/regex/strip-repeated-headers.d.ts +2 -0
  112. package/dist/regex/strip-repeated-headers.d.ts.map +1 -0
  113. package/dist/regex/strip-repeated-headers.js +37 -0
  114. package/dist/regex/strip-repeated-headers.js.map +1 -0
  115. package/dist/regex/strip-url-tracking-params.d.ts +2 -0
  116. package/dist/regex/strip-url-tracking-params.d.ts.map +1 -0
  117. package/dist/regex/strip-url-tracking-params.js +26 -0
  118. package/dist/regex/strip-url-tracking-params.js.map +1 -0
  119. package/dist/regex/wrap-long-cell-text.d.ts +28 -0
  120. package/dist/regex/wrap-long-cell-text.d.ts.map +1 -0
  121. package/dist/regex/wrap-long-cell-text.js +66 -0
  122. package/dist/regex/wrap-long-cell-text.js.map +1 -0
  123. package/dist/util/protect-code.d.ts +6 -0
  124. package/dist/util/protect-code.d.ts.map +1 -0
  125. package/dist/util/protect-code.js +20 -0
  126. package/dist/util/protect-code.js.map +1 -0
  127. package/package.json +63 -0
@@ -0,0 +1,73 @@
1
+ import { withProtectedCode } from "../util/protect-code.js";
2
+ const ENTITY_RE = /&(?:#(\d{1,6})|#x([0-9a-fA-F]{1,6})|([a-zA-Z][a-zA-Z0-9]{1,31}));/g;
3
+ const NAMED_ENTITIES = new Map([
4
+ ["amp", "&"], ["lt", "<"], ["gt", ">"], ["quot", '"'], ["apos", "'"],
5
+ ["nbsp", " "], ["ensp", " "], ["emsp", " "], ["thinsp", " "],
6
+ ["ndash", "–"], ["mdash", "—"], ["horbar", "―"],
7
+ ["lsquo", "‘"], ["rsquo", "’"], ["sbquo", "‚"],
8
+ ["ldquo", "“"], ["rdquo", "”"], ["bdquo", "„"],
9
+ ["laquo", "«"], ["raquo", "»"],
10
+ ["hellip", "…"], ["middot", "·"], ["bull", "•"],
11
+ ["copy", "©"], ["reg", "®"], ["trade", "™"],
12
+ ["euro", "€"], ["pound", "£"], ["yen", "¥"], ["cent", "¢"],
13
+ ["times", "×"], ["divide", "÷"], ["plusmn", "±"],
14
+ ["frac12", "½"], ["frac14", "¼"], ["frac34", "¾"],
15
+ ["sup1", "¹"], ["sup2", "²"], ["sup3", "³"],
16
+ ["deg", "°"], ["micro", "µ"], ["para", "¶"],
17
+ ["sect", "§"], ["dagger", "†"], ["Dagger", "‡"],
18
+ ["prime", "′"], ["Prime", "″"],
19
+ ["larr", "←"], ["rarr", "→"], ["uarr", "↑"], ["darr", "↓"],
20
+ ["harr", "↔"], ["lArr", "⇐"], ["rArr", "⇒"],
21
+ ["forall", "∀"], ["exist", "∃"], ["empty", "∅"],
22
+ ["isin", "∈"], ["notin", "∉"], ["ni", "∋"],
23
+ ["sum", "∑"], ["prod", "∏"], ["infin", "∞"],
24
+ ["and", "∧"], ["or", "∨"], ["cap", "∩"], ["cup", "∪"],
25
+ ["int", "∫"], ["there4", "∴"], ["sim", "∼"],
26
+ ["cong", "≅"], ["asymp", "≈"], ["ne", "≠"],
27
+ ["le", "≤"], ["ge", "≥"],
28
+ ["sub", "⊂"], ["sup", "⊃"], ["sube", "⊆"], ["supe", "⊇"],
29
+ ["oplus", "⊕"], ["otimes", "⊗"], ["perp", "⊥"],
30
+ ["sdot", "⋅"], ["lceil", "⌈"], ["rceil", "⌉"],
31
+ ["lfloor", "⌊"], ["rfloor", "⌋"],
32
+ ["lang", "〈"], ["rang", "〉"],
33
+ ["loz", "◊"], ["spades", "♠"], ["clubs", "♣"],
34
+ ["hearts", "♥"], ["diams", "♦"],
35
+ ["Alpha", "Α"], ["Beta", "Β"], ["Gamma", "Γ"], ["Delta", "Δ"],
36
+ ["alpha", "α"], ["beta", "β"], ["gamma", "γ"], ["delta", "δ"],
37
+ ["epsilon", "ε"], ["zeta", "ζ"], ["eta", "η"], ["theta", "θ"],
38
+ ["iota", "ι"], ["kappa", "κ"], ["lambda", "λ"], ["mu", "μ"],
39
+ ["nu", "ν"], ["xi", "ξ"], ["omicron", "ο"], ["pi", "π"],
40
+ ["rho", "ρ"], ["sigma", "σ"], ["tau", "τ"], ["upsilon", "υ"],
41
+ ["phi", "φ"], ["chi", "χ"], ["psi", "ψ"], ["omega", "ω"],
42
+ ["szlig", "ß"], ["Agrave", "À"], ["Aacute", "Á"],
43
+ ["Atilde", "Ã"], ["Auml", "Ä"], ["Aring", "Å"],
44
+ ["AElig", "Æ"], ["Ccedil", "Ç"], ["Egrave", "È"],
45
+ ["Eacute", "É"], ["Ecirc", "Ê"], ["Euml", "Ë"],
46
+ ["Igrave", "Ì"], ["Iacute", "Í"], ["Ntilde", "Ñ"],
47
+ ["Ograve", "Ò"], ["Oacute", "Ó"], ["Otilde", "Õ"],
48
+ ["Ouml", "Ö"], ["Oslash", "Ø"], ["Ugrave", "Ù"],
49
+ ["Uacute", "Ú"], ["Uuml", "Ü"], ["Yacute", "Ý"],
50
+ ["agrave", "à"], ["aacute", "á"], ["acirc", "â"],
51
+ ["atilde", "ã"], ["auml", "ä"], ["aring", "å"],
52
+ ["aelig", "æ"], ["ccedil", "ç"], ["egrave", "è"],
53
+ ["eacute", "é"], ["ecirc", "ê"], ["euml", "ë"],
54
+ ["igrave", "ì"], ["iacute", "í"], ["icirc", "î"], ["iuml", "ï"],
55
+ ["eth", "ð"], ["ntilde", "ñ"], ["ograve", "ò"], ["oacute", "ó"],
56
+ ["ocirc", "ô"], ["otilde", "õ"], ["ouml", "ö"], ["oslash", "ø"],
57
+ ["ugrave", "ù"], ["uacute", "ú"], ["ucirc", "û"], ["uuml", "ü"],
58
+ ["yacute", "ý"], ["thorn", "þ"], ["yuml", "ÿ"],
59
+ ]);
60
/**
 * Convert a numeric character reference to its character.
 *
 * Returns `fallback` (the original reference text) when the code point is
 * zero, NaN, above U+10FFFF, or inside the UTF-16 surrogate range
 * U+D800–U+DFFF. `String.fromCodePoint` accepts surrogates without
 * throwing, but the result is a lone surrogate, i.e. an ill-formed string.
 *
 * @param {number} cp - Parsed code point.
 * @param {string} fallback - Text to return when `cp` is not decodable.
 * @returns {string} The decoded character, or `fallback`.
 */
function decodeCodePoint(cp, fallback) {
    const inRange = cp > 0 && cp <= 0x10ffff;
    const isSurrogate = cp >= 0xd800 && cp <= 0xdfff;
    return inRange && !isSurrogate ? String.fromCodePoint(cp) : fallback;
}

/**
 * decodeHtmlEntities — replace decimal (`&#38;`), hexadecimal (`&#x26;`) and
 * named (`&amp;`) character references matched by ENTITY_RE with the
 * characters they stand for.
 *
 * References that cannot be decoded (unknown name, out-of-range or surrogate
 * code point) are left in the text exactly as written.
 *
 * The replacement runs through `withProtectedCode`.
 * NOTE(review): presumably that helper shields code spans/blocks from the
 * callback — confirm against ../util/protect-code.js.
 *
 * @param {string} md - Markdown text.
 * @returns {string} Text with decodable references replaced.
 */
export function decodeHtmlEntities(md) {
    return withProtectedCode(md, (s) => s.replace(ENTITY_RE, (full, dec, hex, name) => {
        if (dec !== undefined) {
            return decodeCodePoint(Number.parseInt(dec, 10), full);
        }
        if (hex !== undefined) {
            return decodeCodePoint(Number.parseInt(hex, 16), full);
        }
        return NAMED_ENTITIES.get(name ?? "") ?? full;
    }));
}
73
+ //# sourceMappingURL=decode-html-entities.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"decode-html-entities.js","sourceRoot":"","sources":["../../src/regex/decode-html-entities.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,iBAAiB,EAAE,MAAM,yBAAyB,CAAC;AAE5D,MAAM,SAAS,GAAG,oEAAoE,CAAC;AAEvF,MAAM,cAAc,GAAgC,IAAI,GAAG,CAAC;IAC1D,CAAC,KAAK,EAAE,GAAG,CAAC,EAAE,CAAC,IAAI,EAAE,GAAG,CAAC,EAAE,CAAC,IAAI,EAAE,GAAG,CAAC,EAAE,CAAC,MAAM,EAAE,GAAG,CAAC,EAAE,CAAC,MAAM,EAAE,GAAG,CAAC;IACpE,CAAC,MAAM,EAAE,GAAG,CAAC,EAAE,CAAC,MAAM,EAAE,GAAG,CAAC,EAAE,CAAC,MAAM,EAAE,GAAG,CAAC,EAAE,CAAC,QAAQ,EAAE,GAAG,CAAC;IAC5D,CAAC,OAAO,EAAE,GAAG,CAAC,EAAE,CAAC,OAAO,EAAE,GAAG,CAAC,EAAE,CAAC,QAAQ,EAAE,GAAG,CAAC;IAC/C,CAAC,OAAO,EAAE,GAAG,CAAC,EAAE,CAAC,OAAO,EAAE,GAAG,CAAC,EAAE,CAAC,OAAO,EAAE,GAAG,CAAC;IAC9C,CAAC,OAAO,EAAE,GAAG,CAAC,EAAE,CAAC,OAAO,EAAE,GAAG,CAAC,EAAE,CAAC,OAAO,EAAE,GAAG,CAAC;IAC9C,CAAC,OAAO,EAAE,GAAG,CAAC,EAAE,CAAC,OAAO,EAAE,GAAG,CAAC;IAC9B,CAAC,QAAQ,EAAE,GAAG,CAAC,EAAE,CAAC,QAAQ,EAAE,GAAG,CAAC,EAAE,CAAC,MAAM,EAAE,GAAG,CAAC;IAC/C,CAAC,MAAM,EAAE,GAAG,CAAC,EAAE,CAAC,KAAK,EAAE,GAAG,CAAC,EAAE,CAAC,OAAO,EAAE,GAAG,CAAC;IAC3C,CAAC,MAAM,EAAE,GAAG,CAAC,EAAE,CAAC,OAAO,EAAE,GAAG,CAAC,EAAE,CAAC,KAAK,EAAE,GAAG,CAAC,EAAE,CAAC,MAAM,EAAE,GAAG,CAAC;IAC1D,CAAC,OAAO,EAAE,GAAG,CAAC,EAAE,CAAC,QAAQ,EAAE,GAAG,CAAC,EAAE,CAAC,QAAQ,EAAE,GAAG,CAAC;IAChD,CAAC,QAAQ,EAAE,GAAG,CAAC,EAAE,CAAC,QAAQ,EAAE,GAAG,CAAC,EAAE,CAAC,QAAQ,EAAE,GAAG,CAAC;IACjD,CAAC,MAAM,EAAE,GAAG,CAAC,EAAE,CAAC,MAAM,EAAE,GAAG,CAAC,EAAE,CAAC,MAAM,EAAE,GAAG,CAAC;IAC3C,CAAC,KAAK,EAAE,GAAG,CAAC,EAAE,CAAC,OAAO,EAAE,GAAG,CAAC,EAAE,CAAC,MAAM,EAAE,GAAG,CAAC;IAC3C,CAAC,MAAM,EAAE,GAAG,CAAC,EAAE,CAAC,QAAQ,EAAE,GAAG,CAAC,EAAE,CAAC,QAAQ,EAAE,GAAG,CAAC;IAC/C,CAAC,OAAO,EAAE,GAAG,CAAC,EAAE,CAAC,OAAO,EAAE,GAAG,CAAC;IAC9B,CAAC,MAAM,EAAE,GAAG,CAAC,EAAE,CAAC,MAAM,EAAE,GAAG,CAAC,EAAE,CAAC,MAAM,EAAE,GAAG,CAAC,EAAE,CAAC,MAAM,EAAE,GAAG,CAAC;IAC1D,CAAC,MAAM,EAAE,GAAG,CAAC,EAAE,CAAC,MAAM,EAAE,GAAG,CAAC,EAAE,CAAC,MAAM,EAAE,GAAG,CAAC;IAC3C,CAAC,QAAQ,EAAE,GAAG,CAAC,EAAE,CAAC,OAAO,EAAE,GAAG,CAAC,EAAE,CAAC,OAAO,EAAE,GAAG,CAAC;IAC/C,CAAC,MAAM,E
AAE,GAAG,CAAC,EAAE,CAAC,OAAO,EAAE,GAAG,CAAC,EAAE,CAAC,IAAI,EAAE,GAAG,CAAC;IAC1C,CAAC,KAAK,EAAE,GAAG,CAAC,EAAE,CAAC,MAAM,EAAE,GAAG,CAAC,EAAE,CAAC,OAAO,EAAE,GAAG,CAAC;IAC3C,CAAC,KAAK,EAAE,GAAG,CAAC,EAAE,CAAC,IAAI,EAAE,GAAG,CAAC,EAAE,CAAC,KAAK,EAAE,GAAG,CAAC,EAAE,CAAC,KAAK,EAAE,GAAG,CAAC;IACrD,CAAC,KAAK,EAAE,GAAG,CAAC,EAAE,CAAC,QAAQ,EAAE,GAAG,CAAC,EAAE,CAAC,KAAK,EAAE,GAAG,CAAC;IAC3C,CAAC,MAAM,EAAE,GAAG,CAAC,EAAE,CAAC,OAAO,EAAE,GAAG,CAAC,EAAE,CAAC,IAAI,EAAE,GAAG,CAAC;IAC1C,CAAC,IAAI,EAAE,GAAG,CAAC,EAAE,CAAC,IAAI,EAAE,GAAG,CAAC;IACxB,CAAC,KAAK,EAAE,GAAG,CAAC,EAAE,CAAC,KAAK,EAAE,GAAG,CAAC,EAAE,CAAC,MAAM,EAAE,GAAG,CAAC,EAAE,CAAC,MAAM,EAAE,GAAG,CAAC;IACxD,CAAC,OAAO,EAAE,GAAG,CAAC,EAAE,CAAC,QAAQ,EAAE,GAAG,CAAC,EAAE,CAAC,MAAM,EAAE,GAAG,CAAC;IAC9C,CAAC,MAAM,EAAE,GAAG,CAAC,EAAE,CAAC,OAAO,EAAE,GAAG,CAAC,EAAE,CAAC,OAAO,EAAE,GAAG,CAAC;IAC7C,CAAC,QAAQ,EAAE,GAAG,CAAC,EAAE,CAAC,QAAQ,EAAE,GAAG,CAAC;IAChC,CAAC,MAAM,EAAE,GAAG,CAAC,EAAE,CAAC,MAAM,EAAE,GAAG,CAAC;IAC5B,CAAC,KAAK,EAAE,GAAG,CAAC,EAAE,CAAC,QAAQ,EAAE,GAAG,CAAC,EAAE,CAAC,OAAO,EAAE,GAAG,CAAC;IAC7C,CAAC,QAAQ,EAAE,GAAG,CAAC,EAAE,CAAC,OAAO,EAAE,GAAG,CAAC;IAC/B,CAAC,OAAO,EAAE,GAAG,CAAC,EAAE,CAAC,MAAM,EAAE,GAAG,CAAC,EAAE,CAAC,OAAO,EAAE,GAAG,CAAC,EAAE,CAAC,OAAO,EAAE,GAAG,CAAC;IAC7D,CAAC,OAAO,EAAE,GAAG,CAAC,EAAE,CAAC,MAAM,EAAE,GAAG,CAAC,EAAE,CAAC,OAAO,EAAE,GAAG,CAAC,EAAE,CAAC,OAAO,EAAE,GAAG,CAAC;IAC7D,CAAC,SAAS,EAAE,GAAG,CAAC,EAAE,CAAC,MAAM,EAAE,GAAG,CAAC,EAAE,CAAC,KAAK,EAAE,GAAG,CAAC,EAAE,CAAC,OAAO,EAAE,GAAG,CAAC;IAC7D,CAAC,MAAM,EAAE,GAAG,CAAC,EAAE,CAAC,OAAO,EAAE,GAAG,CAAC,EAAE,CAAC,QAAQ,EAAE,GAAG,CAAC,EAAE,CAAC,IAAI,EAAE,GAAG,CAAC;IAC3D,CAAC,IAAI,EAAE,GAAG,CAAC,EAAE,CAAC,IAAI,EAAE,GAAG,CAAC,EAAE,CAAC,SAAS,EAAE,GAAG,CAAC,EAAE,CAAC,IAAI,EAAE,GAAG,CAAC;IACvD,CAAC,KAAK,EAAE,GAAG,CAAC,EAAE,CAAC,OAAO,EAAE,GAAG,CAAC,EAAE,CAAC,KAAK,EAAE,GAAG,CAAC,EAAE,CAAC,SAAS,EAAE,GAAG,CAAC;IAC5D,CAAC,KAAK,EAAE,GAAG,CAAC,EAAE,CAAC,KAAK,EAAE,GAAG,CAAC,EAAE,CAAC,KAAK,EAAE,GAAG,CAAC,EAAE,CAAC,OAAO,EAAE,GAAG,CAAC;IACxD,CAAC,OAAO,EAAE,GAAG,CAAC,EAAE,CAAC,QAAQ,E
AAE,GAAG,CAAC,EAAE,CAAC,QAAQ,EAAE,GAAG,CAAC;IAChD,CAAC,QAAQ,EAAE,GAAG,CAAC,EAAE,CAAC,MAAM,EAAE,GAAG,CAAC,EAAE,CAAC,OAAO,EAAE,GAAG,CAAC;IAC9C,CAAC,OAAO,EAAE,GAAG,CAAC,EAAE,CAAC,QAAQ,EAAE,GAAG,CAAC,EAAE,CAAC,QAAQ,EAAE,GAAG,CAAC;IAChD,CAAC,QAAQ,EAAE,GAAG,CAAC,EAAE,CAAC,OAAO,EAAE,GAAG,CAAC,EAAE,CAAC,MAAM,EAAE,GAAG,CAAC;IAC9C,CAAC,QAAQ,EAAE,GAAG,CAAC,EAAE,CAAC,QAAQ,EAAE,GAAG,CAAC,EAAE,CAAC,QAAQ,EAAE,GAAG,CAAC;IACjD,CAAC,QAAQ,EAAE,GAAG,CAAC,EAAE,CAAC,QAAQ,EAAE,GAAG,CAAC,EAAE,CAAC,QAAQ,EAAE,GAAG,CAAC;IACjD,CAAC,MAAM,EAAE,GAAG,CAAC,EAAE,CAAC,QAAQ,EAAE,GAAG,CAAC,EAAE,CAAC,QAAQ,EAAE,GAAG,CAAC;IAC/C,CAAC,QAAQ,EAAE,GAAG,CAAC,EAAE,CAAC,MAAM,EAAE,GAAG,CAAC,EAAE,CAAC,QAAQ,EAAE,GAAG,CAAC;IAC/C,CAAC,QAAQ,EAAE,GAAG,CAAC,EAAE,CAAC,QAAQ,EAAE,GAAG,CAAC,EAAE,CAAC,OAAO,EAAE,GAAG,CAAC;IAChD,CAAC,QAAQ,EAAE,GAAG,CAAC,EAAE,CAAC,MAAM,EAAE,GAAG,CAAC,EAAE,CAAC,OAAO,EAAE,GAAG,CAAC;IAC9C,CAAC,OAAO,EAAE,GAAG,CAAC,EAAE,CAAC,QAAQ,EAAE,GAAG,CAAC,EAAE,CAAC,QAAQ,EAAE,GAAG,CAAC;IAChD,CAAC,QAAQ,EAAE,GAAG,CAAC,EAAE,CAAC,OAAO,EAAE,GAAG,CAAC,EAAE,CAAC,MAAM,EAAE,GAAG,CAAC;IAC9C,CAAC,QAAQ,EAAE,GAAG,CAAC,EAAE,CAAC,QAAQ,EAAE,GAAG,CAAC,EAAE,CAAC,OAAO,EAAE,GAAG,CAAC,EAAE,CAAC,MAAM,EAAE,GAAG,CAAC;IAC/D,CAAC,KAAK,EAAE,GAAG,CAAC,EAAE,CAAC,QAAQ,EAAE,GAAG,CAAC,EAAE,CAAC,QAAQ,EAAE,GAAG,CAAC,EAAE,CAAC,QAAQ,EAAE,GAAG,CAAC;IAC/D,CAAC,OAAO,EAAE,GAAG,CAAC,EAAE,CAAC,QAAQ,EAAE,GAAG,CAAC,EAAE,CAAC,MAAM,EAAE,GAAG,CAAC,EAAE,CAAC,QAAQ,EAAE,GAAG,CAAC;IAC/D,CAAC,QAAQ,EAAE,GAAG,CAAC,EAAE,CAAC,QAAQ,EAAE,GAAG,CAAC,EAAE,CAAC,OAAO,EAAE,GAAG,CAAC,EAAE,CAAC,MAAM,EAAE,GAAG,CAAC;IAC/D,CAAC,QAAQ,EAAE,GAAG,CAAC,EAAE,CAAC,OAAO,EAAE,GAAG,CAAC,EAAE,CAAC,MAAM,EAAE,GAAG,CAAC;CAC/C,CAAC,CAAC;AAEH,MAAM,UAAU,kBAAkB,CAAC,EAAU;IAC3C,OAAO,iBAAiB,CAAC,EAAE,EAAE,CAAC,CAAC,EAAE,EAAE,CACjC,CAAC,CAAC,OAAO,CAAC,SAAS,EAAE,CAAC,KAAK,EAAE,GAAG,EAAE,GAAG,EAAE,IAAI,EAAE,EAAE;QAC7C,IAAI,GAAG,KAAK,SAAS,EAAE,CAAC;YACtB,MAAM,EAAE,GAAG,QAAQ,CAAC,GAAG,EAAE,EAAE,CAAC,CAAC;YAC7B,OAAO,EAAE,GAAG,CAAC,IAAI,EAAE,IAAI,QAAQ,CAAC,CAAC,CAAC,MAAM,CAAC,aAAa,CAAC,EAAE,CAAC,CAAC,CAAC,
CAAC,KAAK,CAAC;QACrE,CAAC;QACD,IAAI,GAAG,KAAK,SAAS,EAAE,CAAC;YACtB,MAAM,EAAE,GAAG,QAAQ,CAAC,GAAG,EAAE,EAAE,CAAC,CAAC;YAC7B,OAAO,EAAE,GAAG,CAAC,IAAI,EAAE,IAAI,QAAQ,CAAC,CAAC,CAAC,MAAM,CAAC,aAAa,CAAC,EAAE,CAAC,CAAC,CAAC,CAAC,KAAK,CAAC;QACrE,CAAC;QACD,OAAO,cAAc,CAAC,GAAG,CAAC,IAAI,IAAI,EAAE,CAAC,IAAI,KAAK,CAAC;IACjD,CAAC,CAAC,CACH,CAAC;AACJ,CAAC"}
@@ -0,0 +1,9 @@
1
+ /**
2
+ * dedupeLinks — collapse `[text](url) [text](url)` where the same URL appears
3
+ * twice adjacent. Common when PDFs duplicate hyperlink annotations and the
4
+ * parser emits both.
5
+ *
6
+ * Also: empty link text `[](url)` → bare URL.
7
+ */
8
+ export declare function dedupeLinks(md: string): string;
9
+ //# sourceMappingURL=dedupe-links.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"dedupe-links.d.ts","sourceRoot":"","sources":["../../src/regex/dedupe-links.ts"],"names":[],"mappings":"AAAA;;;;;;GAMG;AACH,wBAAgB,WAAW,CAAC,EAAE,EAAE,MAAM,GAAG,MAAM,CAO9C"}
@@ -0,0 +1,16 @@
1
/**
 * dedupeLinks — collapse `[text](url) [text](url)` where the same link
 * appears two or more times in a row, separated only by whitespace. Common
 * when PDFs duplicate hyperlink annotations and the parser emits each copy.
 *
 * Also: empty link text `[](url)` → autolink `<url>`. Images with empty alt
 * text (`![](src)`) are left untouched.
 *
 * @param {string} md - Markdown text.
 * @returns {string} Text with adjacent duplicate links collapsed.
 */
export function dedupeLinks(md) {
    // The repeated group swallows the whole run in one match; matching a
    // single pair would leave `[a](u) [a](u)` behind for a triple.
    const collapsed = md.replace(/\[([^\]]+)\]\(([^)]+)\)(?:\s+\[\1\]\(\2\))+/g, "[$1]($2)");
    // The lookbehind skips `![](src)`: rewriting it would yield `!<src>` and
    // destroy the image.
    return collapsed.replace(/(?<!!)\[\s*\]\(([^)]+)\)/g, "<$1>");
}
16
+ //# sourceMappingURL=dedupe-links.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"dedupe-links.js","sourceRoot":"","sources":["../../src/regex/dedupe-links.ts"],"names":[],"mappings":"AAAA;;;;;;GAMG;AACH,MAAM,UAAU,WAAW,CAAC,EAAU;IACpC,IAAI,GAAG,GAAG,EAAE,CAAC;IACb,qBAAqB;IACrB,GAAG,GAAG,GAAG,CAAC,OAAO,CAAC,yCAAyC,EAAE,UAAU,CAAC,CAAC;IACzE,kBAAkB;IAClB,GAAG,GAAG,GAAG,CAAC,OAAO,CAAC,qBAAqB,EAAE,MAAM,CAAC,CAAC;IACjD,OAAO,GAAG,CAAC;AACb,CAAC"}
@@ -0,0 +1,29 @@
1
+ /**
2
+ * detectSpaceTables — recover tables that PDF/pdfminer outputs as
3
+ * whitespace-aligned text (no `|`, no `<table>`).
4
+ *
5
+ * pdfminer (and therefore MarkItDown on PDF) has no notion of table cells;
6
+ * it just emits text positioned by spaces. A block like:
7
+ *
8
+ * KPI Q2 Q3
9
+ * Conversions 84,210 142,580
10
+ * Keys 412 1,031
11
+ *
12
+ * survives the earlier cleaners untouched and renders as a wall of text.
13
+ *
14
+ * Heuristic (conservative — false positives on code/ASCII art are worse
15
+ * than a missed conversion):
16
+ * - Walk runs of ≥3 contiguous non-blank lines that are NOT inside a
17
+ * fenced code block, NOT already pipe-tables, NOT list/heading/quote.
18
+ * - Split each line on `/\s{2,}/`. Require ≥2 columns and identical
19
+ * column count across the whole run.
20
+ * - Require ≥1 column whose cells average ≥ 3 characters (a "label"
21
+ * column) to avoid converting space-separated number columns
22
+ * that are really paragraphs of digits.
23
+ * - First row → header.
24
+ *
25
+ * Idempotent: pipe tables already produced by earlier passes are skipped
26
+ * because their lines start with `|`.
27
+ */
28
+ export declare function detectSpaceTables(md: string): string;
29
+ //# sourceMappingURL=detect-space-tables.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"detect-space-tables.d.ts","sourceRoot":"","sources":["../../src/regex/detect-space-tables.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;;;;;GA0BG;AACH,wBAAgB,iBAAiB,CAAC,EAAE,EAAE,MAAM,GAAG,MAAM,CAkDpD"}
@@ -0,0 +1,125 @@
1
/**
 * detectSpaceTables — recover tables that PDF/pdfminer outputs as
 * whitespace-aligned text (no `|`, no `<table>`).
 *
 * pdfminer (and therefore MarkItDown on PDF) has no notion of table cells;
 * it just emits text positioned by spaces. A block like:
 *
 *     KPI          Q2       Q3
 *     Conversions  84,210   142,580
 *     Keys         412      1,031
 *
 * survives the earlier cleaners untouched and renders as a wall of text.
 *
 * Heuristic (conservative — false positives on code/ASCII art are worse
 * than a missed conversion):
 *  - Walk runs of ≥3 contiguous non-blank lines that are NOT inside a
 *    fenced code block (``` or ~~~), NOT already pipe-tables, NOT
 *    list/heading/quote/indented code.
 *  - Split each line on `/\s{2,}/`. Require ≥2 columns and identical
 *    column count across the whole run.
 *  - Require ≥1 column whose cells average ≥ 3 characters, to avoid
 *    converting space-separated number columns.
 *  - First row → header.
 *
 * Pipe tables are skipped because their lines start with `|`, so output of
 * this function is not converted again.
 *
 * @param {string} md - Markdown text.
 * @returns {string} Text with detected space-aligned runs rendered as GFM tables.
 */
export function detectSpaceTables(md) {
    const lines = md.split("\n");
    const out = [];
    // Descriptor of the currently open code fence, or null outside code.
    let openFence = null;
    let i = 0;
    while (i < lines.length) {
        const line = lines[i] ?? "";
        const fence = parseFence(line);
        if (openFence !== null) {
            // Inside a fenced block: copy verbatim, watching only for the closer.
            if (fence !== null && closesFence(openFence, fence)) {
                openFence = null;
            }
            out.push(line);
            i++;
            continue;
        }
        if (fence !== null) {
            openFence = fence;
            out.push(line);
            i++;
            continue;
        }
        if (!isCandidate(line)) {
            out.push(line);
            i++;
            continue;
        }
        // Probe contiguous candidate lines with identical column count.
        const firstCells = splitCols(line);
        if (firstCells.length < 2) {
            out.push(line);
            i++;
            continue;
        }
        const block = [firstCells];
        let j = i + 1;
        while (j < lines.length) {
            const next = lines[j] ?? "";
            if (!isCandidate(next)) {
                break;
            }
            const cells = splitCols(next);
            if (cells.length !== firstCells.length) {
                break;
            }
            block.push(cells);
            j++;
        }
        if (block.length >= 3 && looksTabular(block)) {
            out.push(...renderGfm(block));
            i = j;
            continue;
        }
        // Not a table: emit only the first line and re-probe from the next one.
        out.push(line);
        i++;
    }
    return out.join("\n");
}

/**
 * Parse a line as a code-fence marker.
 *
 * A fence is optional leading whitespace followed by ≥3 backticks or ≥3
 * tildes. A backtick fence whose remainder contains another backtick is
 * inline code (e.g. "```x```"), not a fence.
 *
 * @param {string} line - One line of text.
 * @returns {{char: string, length: number, rest: string} | null} Fence
 *   character, marker length and trailing text, or null if not a fence.
 */
function parseFence(line) {
    const m = /^\s*(`{3,}|~{3,})/.exec(line);
    if (m === null) {
        return null;
    }
    const marker = m[1];
    const rest = line.slice(m[0].length);
    if (marker[0] === "`" && rest.includes("`")) {
        return null;
    }
    return { char: marker[0], length: marker.length, rest };
}

/**
 * Whether `candidate` closes the fence opened by `open`: same fence
 * character, at least as long, and nothing but whitespace after the marker.
 */
function closesFence(open, candidate) {
    return candidate.char === open.char
        && candidate.length >= open.length
        && candidate.rest.trim() === "";
}

/**
 * Whether a line may be a row of a space-aligned table: non-blank, not a
 * recognised Markdown construct, and containing at least one gap of two or
 * more whitespace characters between non-space characters.
 */
function isCandidate(line) {
    if (!line.trim())
        return false;
    // skip lines that are clearly something else
    if (parseFence(line) !== null)
        return false; // code fence — must never be absorbed into a table
    if (/^\s*\|/.test(line))
        return false; // already a pipe table
    if (/^\s*[#>]/.test(line))
        return false; // heading / quote
    if (/^\s*[-*+]\s/.test(line))
        return false; // bullet list
    if (/^\s*\d+\.\s/.test(line))
        return false; // ordered list
    if (/^\s{4,}\S/.test(line))
        return false; // indented code
    return /\S\s{2,}\S/.test(line); // ≥1 multi-space gap
}

/** Split a line into trimmed, non-empty cells on runs of ≥2 whitespace characters. */
function splitCols(line) {
    return line.trim().split(/\s{2,}/).map((s) => s.trim()).filter((s) => s.length > 0);
}

/**
 * Whether at least one column averages ≥ 3 characters per cell across all
 * rows — filters out runs of pure short-number or single-char columns that
 * are usually not tables.
 */
function looksTabular(block) {
    const cols = block[0].length;
    for (let c = 0; c < cols; c++) {
        let sum = 0;
        for (const row of block)
            sum += (row[c] ?? "").length;
        if (sum / block.length >= 3)
            return true;
    }
    return false;
}

/**
 * Render rows as a GFM pipe table: first row is the header, followed by a
 * `---` separator row. A blank line is emitted before and after the table.
 */
function renderGfm(block) {
    const cols = block[0].length;
    const header = block[0];
    const body = block.slice(1);
    const sep = Array(cols).fill("---");
    return [
        "",
        renderRow(header),
        renderRow(sep),
        ...body.map(renderRow),
        "",
    ];
}

/** Render one row, escaping literal `|` so it cannot split a cell. */
function renderRow(cells) {
    return "| " + cells.map((c) => c.replace(/\|/g, "\\|")).join(" | ") + " |";
}
125
+ //# sourceMappingURL=detect-space-tables.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"detect-space-tables.js","sourceRoot":"","sources":["../../src/regex/detect-space-tables.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;;;;;GA0BG;AACH,MAAM,UAAU,iBAAiB,CAAC,EAAU;IAC1C,MAAM,KAAK,GAAG,EAAE,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC;IAC7B,MAAM,GAAG,GAAa,EAAE,CAAC;IACzB,IAAI,MAAM,GAAG,KAAK,CAAC;IACnB,IAAI,CAAC,GAAG,CAAC,CAAC;IAEV,OAAO,CAAC,GAAG,KAAK,CAAC,MAAM,EAAE,CAAC;QACxB,MAAM,IAAI,GAAG,KAAK,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC;QAC5B,IAAI,SAAS,CAAC,IAAI,CAAC,IAAI,CAAC,EAAE,CAAC;YACzB,MAAM,GAAG,CAAC,MAAM,CAAC;YACjB,GAAG,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;YACf,CAAC,EAAE,CAAC;YACJ,SAAS;QACX,CAAC;QACD,IAAI,MAAM,IAAI,CAAC,WAAW,CAAC,IAAI,CAAC,EAAE,CAAC;YACjC,GAAG,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;YACf,CAAC,EAAE,CAAC;YACJ,SAAS;QACX,CAAC;QAED,gEAAgE;QAChE,MAAM,MAAM,GAAG,SAAS,CAAC,IAAI,CAAC,CAAC;QAC/B,IAAI,MAAM,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;YACtB,GAAG,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;YACf,CAAC,EAAE,CAAC;YACJ,SAAS;QACX,CAAC;QACD,MAAM,IAAI,GAAG,MAAM,CAAC,MAAM,CAAC;QAC3B,MAAM,KAAK,GAAe,CAAC,MAAM,CAAC,CAAC;QACnC,IAAI,CAAC,GAAG,CAAC,GAAG,CAAC,CAAC;QACd,OAAO,CAAC,GAAG,KAAK,CAAC,MAAM,EAAE,CAAC;YACxB,MAAM,EAAE,GAAG,KAAK,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC;YAC1B,IAAI,CAAC,WAAW,CAAC,EAAE,CAAC;gBAAE,MAAM;YAC5B,MAAM,CAAC,GAAG,SAAS,CAAC,EAAE,CAAC,CAAC;YACxB,IAAI,CAAC,CAAC,MAAM,KAAK,IAAI;gBAAE,MAAM;YAC7B,KAAK,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;YACd,CAAC,EAAE,CAAC;QACN,CAAC;QAED,IAAI,KAAK,CAAC,MAAM,IAAI,CAAC,IAAI,YAAY,CAAC,KAAK,CAAC,EAAE,CAAC;YAC7C,GAAG,CAAC,IAAI,CAAC,GAAG,SAAS,CAAC,KAAK,CAAC,CAAC,CAAC;YAC9B,CAAC,GAAG,CAAC,CAAC;YACN,SAAS;QACX,CAAC;QAED,GAAG,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;QACf,CAAC,EAAE,CAAC;IACN,CAAC;IAED,OAAO,GAAG,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;AACxB,CAAC;AAED,SAAS,WAAW,CAAC,IAAY;IAC/B,IAAI,CAAC,IAAI,CAAC,IAAI,EAAE;QAAE,OAAO,KAAK,CAAC;IAC/B,6CAA6C;IAC7C,IAAI,QAAQ,CAAC,IAAI,CAAC,IAAI,CAAC;QAAE,OAAO,KAAK,CAAC,CAAY,uBAAuB;IACzE,IAAI,UAAU,CAAC,IAAI,CAAC,IAAI,CAAC;QAAE,OAAO,KAAK,CAAC,CAAU,kBAAkB;IACpE,IAAI,aAAa,CAAC,IAAI,CAAC,IAAI,CAAC;QAAE,OAAO,KAAK,CAAC,CAAO,cAAc;IAChE,IAAI,
aAAa,CAAC,IAAI,CAAC,IAAI,CAAC;QAAE,OAAO,KAAK,CAAC,CAAO,eAAe;IACjE,IAAI,WAAW,CAAC,IAAI,CAAC,IAAI,CAAC;QAAE,OAAO,KAAK,CAAC,CAAS,gBAAgB;IAClE,OAAO,YAAY,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC,CAAmB,qBAAqB;AACzE,CAAC;AAED,SAAS,SAAS,CAAC,IAAY;IAC7B,OAAO,IAAI,CAAC,IAAI,EAAE,CAAC,KAAK,CAAC,QAAQ,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC;AACtF,CAAC;AAED,SAAS,YAAY,CAAC,KAAiB;IACrC,MAAM,IAAI,GAAG,KAAK,CAAC,CAAC,CAAE,CAAC,MAAM,CAAC;IAC9B,mEAAmE;IACnE,mEAAmE;IACnE,cAAc;IACd,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,IAAI,EAAE,CAAC,EAAE,EAAE,CAAC;QAC9B,IAAI,GAAG,GAAG,CAAC,CAAC;QACZ,KAAK,MAAM,GAAG,IAAI,KAAK;YAAE,GAAG,IAAI,CAAC,GAAG,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC,CAAC,MAAM,CAAC;QACtD,IAAI,GAAG,GAAG,KAAK,CAAC,MAAM,IAAI,CAAC;YAAE,OAAO,IAAI,CAAC;IAC3C,CAAC;IACD,OAAO,KAAK,CAAC;AACf,CAAC;AAED,SAAS,SAAS,CAAC,KAAiB;IAClC,MAAM,IAAI,GAAG,KAAK,CAAC,CAAC,CAAE,CAAC,MAAM,CAAC;IAC9B,MAAM,MAAM,GAAG,KAAK,CAAC,CAAC,CAAE,CAAC;IACzB,MAAM,IAAI,GAAG,KAAK,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC;IAC5B,MAAM,GAAG,GAAG,KAAK,CAAC,IAAI,CAAC,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;IACpC,OAAO;QACL,EAAE;QACF,SAAS,CAAC,MAAM,CAAC;QACjB,SAAS,CAAC,GAAG,CAAC;QACd,GAAG,IAAI,CAAC,GAAG,CAAC,SAAS,CAAC;QACtB,EAAE;KACH,CAAC;AACJ,CAAC;AAED,SAAS,SAAS,CAAC,KAAe;IAChC,OAAO,IAAI,GAAG,KAAK,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,OAAO,CAAC,KAAK,EAAE,KAAK,CAAC,CAAC,CAAC,IAAI,CAAC,KAAK,CAAC,GAAG,IAAI,CAAC;AAC7E,CAAC"}
@@ -0,0 +1,14 @@
1
+ /**
2
+ * detectToc — recognize a Table of Contents block and wrap it between
3
+ * `<!-- toc:start -->` / `<!-- toc:end -->` marker lines, OR strip it entirely if opts.stripToc is true.
4
+ *
5
+ * Pattern signal: 3+ consecutive lines matching `<title> ... <page>` style,
6
+ * e.g. "Chapter 1 .................... 12".
7
+ *
8
+ * By default it's marked, not stripped (consumers can choose to strip).
9
+ */
10
+ export interface DetectTocOptions {
11
+ stripToc?: boolean;
12
+ }
13
+ export declare function detectToc(md: string, opts?: DetectTocOptions): string;
14
+ //# sourceMappingURL=detect-toc.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"detect-toc.d.ts","sourceRoot":"","sources":["../../src/regex/detect-toc.ts"],"names":[],"mappings":"AAAA;;;;;;;;GAQG;AACH,MAAM,WAAW,gBAAgB;IAC/B,QAAQ,CAAC,EAAE,OAAO,CAAC;CACpB;AAKD,wBAAgB,SAAS,CAAC,EAAE,EAAE,MAAM,EAAE,IAAI,GAAE,gBAAqB,GAAG,MAAM,CA4BzE"}
@@ -0,0 +1,35 @@
1
// "<title> ... <page>": 3–80 characters, a run of ≥2 dots, then a 1–4 digit page number.
const TOC_LINE = /^.{3,80}\s*\.{2,}\s*\d{1,4}\s*$/;
// Minimum number of consecutive TOC-looking lines that count as a TOC.
const MIN_RUN = 3;
const TOC_START = "<!-- toc:start -->";
const TOC_END = "<!-- toc:end -->";

/**
 * detectToc — find Table of Contents blocks (runs of MIN_RUN or more
 * consecutive lines matching TOC_LINE) and either wrap each run between
 * `<!-- toc:start -->` / `<!-- toc:end -->` marker lines or, when
 * `opts.stripToc` is true, remove the run's lines.
 *
 * A run that is already enclosed by the two marker lines is left as is, so
 * running the function twice does not stack a second pair of markers.
 *
 * @param {string} md - Markdown text.
 * @param {{stripToc?: boolean}} [opts] - `stripToc`: remove TOC lines instead of marking them.
 * @returns {string} The processed text; the input string itself when nothing changed.
 */
export function detectToc(md, opts = {}) {
    const lines = md.split("\n");
    const isTocLine = (idx) => TOC_LINE.test(lines[idx].trim());
    const out = [];
    let changed = false;
    let i = 0;
    while (i < lines.length) {
        if (!isTocLine(i)) {
            out.push(lines[i]);
            i++;
            continue;
        }
        // Extend the run to the first non-matching line (exclusive end).
        let end = i + 1;
        while (end < lines.length && isTocLine(end)) {
            end++;
        }
        const run = lines.slice(i, end);
        const alreadyMarked = i > 0
            && lines[i - 1].trim() === TOC_START
            && end < lines.length
            && lines[end].trim() === TOC_END;
        if (run.length < MIN_RUN) {
            out.push(...run);
        }
        else if (opts.stripToc) {
            changed = true; // drop the run's lines
        }
        else if (alreadyMarked) {
            out.push(...run);
        }
        else {
            out.push(TOC_START, ...run, TOC_END);
            changed = true;
        }
        i = end;
    }
    return changed ? out.join("\n") : md;
}
35
+ //# sourceMappingURL=detect-toc.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"detect-toc.js","sourceRoot":"","sources":["../../src/regex/detect-toc.ts"],"names":[],"mappings":"AAaA,MAAM,QAAQ,GAAG,iCAAiC,CAAC;AACnD,MAAM,OAAO,GAAG,CAAC,CAAC;AAElB,MAAM,UAAU,SAAS,CAAC,EAAU,EAAE,OAAyB,EAAE;IAC/D,MAAM,KAAK,GAAG,EAAE,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC;IAC7B,MAAM,MAAM,GAA4B,EAAE,CAAC;IAC3C,IAAI,QAAQ,GAAG,CAAC,CAAC,CAAC;IAElB,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,IAAI,KAAK,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;QACvC,MAAM,IAAI,GAAG,KAAK,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC;QAC5B,IAAI,QAAQ,CAAC,IAAI,CAAC,IAAI,CAAC,IAAI,EAAE,CAAC,EAAE,CAAC;YAC/B,IAAI,QAAQ,GAAG,CAAC;gBAAE,QAAQ,GAAG,CAAC,CAAC;QACjC,CAAC;aAAM,IAAI,QAAQ,IAAI,CAAC,EAAE,CAAC;YACzB,IAAI,CAAC,GAAG,QAAQ,IAAI,OAAO;gBAAE,MAAM,CAAC,IAAI,CAAC,CAAC,QAAQ,EAAE,CAAC,CAAC,CAAC,CAAC;YACxD,QAAQ,GAAG,CAAC,CAAC,CAAC;QAChB,CAAC;IACH,CAAC;IACD,IAAI,MAAM,CAAC,MAAM,KAAK,CAAC;QAAE,OAAO,EAAE,CAAC;IAEnC,gDAAgD;IAChD,MAAM,GAAG,GAAG,KAAK,CAAC,KAAK,EAAE,CAAC;IAC1B,KAAK,IAAI,CAAC,GAAG,MAAM,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC,IAAI,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC;QAC5C,MAAM,CAAC,CAAC,EAAE,CAAC,CAAC,GAAG,MAAM,CAAC,CAAC,CAAE,CAAC;QAC1B,IAAI,IAAI,CAAC,QAAQ,EAAE,CAAC;YAClB,GAAG,CAAC,MAAM,CAAC,CAAC,EAAE,CAAC,GAAG,CAAC,CAAC,CAAC;QACvB,CAAC;aAAM,CAAC;YACN,GAAG,CAAC,MAAM,CAAC,CAAC,EAAE,CAAC,EAAE,oBAAoB,CAAC,CAAC;YACvC,GAAG,CAAC,MAAM,CAAC,CAAC,GAAG,CAAC,EAAE,CAAC,EAAE,kBAAkB,CAAC,CAAC;QAC3C,CAAC;IACH,CAAC;IACD,OAAO,GAAG,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;AACxB,CAAC"}
@@ -0,0 +1,3 @@
1
+ import type { CleanOptions } from "../index.js";
2
+ export declare function extractMetadataFrontmatter(md: string, opts?: CleanOptions): string;
3
+ //# sourceMappingURL=extract-metadata-frontmatter.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"extract-metadata-frontmatter.d.ts","sourceRoot":"","sources":["../../src/regex/extract-metadata-frontmatter.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,YAAY,EAAE,MAAM,aAAa,CAAC;AAShD,wBAAgB,0BAA0B,CACxC,EAAE,EAAE,MAAM,EACV,IAAI,CAAC,EAAE,YAAY,GAClB,MAAM,CAgCR"}
@@ -0,0 +1,39 @@
1
/** First ATX level-1 heading (`# Title`) anywhere in the scanned text. */
const TITLE_RE = /^#[ \t]+(.+)$/m;

/**
 * Recognised `Label: value` prefixes (lower-cased) and the front-matter key
 * each one feeds. Several labels may feed the same key.
 */
const LABEL_KEYS = new Map([
    ["title", "title"],
    ["t\u00edtulo", "title"],
    ["author", "author"],
    ["authors", "author"],
    ["autor", "author"],
    ["by", "author"],
    ["date", "date"],
    ["fecha", "date"],
    ["published", "date"],
    ["updated", "date"],
    ["version", "version"],
    ["revision", "version"],
]);

/** One metadata line: group 1 is the label, group 2 the raw value. */
const META_LINE_RE = new RegExp(`^(${[...LABEL_KEYS.keys()].join("|")}):\\s*(.+)$`, "i");

/** An opening front-matter fence, with either LF or CRLF line endings. */
const FRONTMATTER_START_RE = /^---\r?\n/;

/**
 * Render a value as a YAML double-quoted scalar.
 * Backslashes are escaped before quotes so that an input backslash cannot
 * combine with the following character into an unintended escape sequence.
 */
function yamlQuote(value) {
    return `"${value.replace(/\\/g, "\\\\").replace(/"/g, '\\"')}"`;
}

/**
 * Build a YAML front-matter block from metadata found near the top of a
 * markdown document and prepend it to the document.
 *
 * The first `opts.frontmatterScanLines` lines (default 20) are scanned for:
 *  - the first `# Heading`, used as `title`;
 *  - `Label: value` lines (Title/Título, Author/Authors/Autor/By,
 *    Date/Fecha/Published/Updated, Version/Revision).
 * For each key the first non-empty value wins; an H1 takes precedence over a
 * `Title:` line.
 *
 * @param {string} md - Markdown source.
 * @param {object} [opts] - Options; nothing happens unless
 *   `opts.extractFrontmatter` is truthy.
 * @returns {string} `md` unchanged when the feature is disabled, when the
 *   document already starts with a front-matter fence, or when no metadata is
 *   found; otherwise the front-matter block followed by the original text.
 */
export function extractMetadataFrontmatter(md, opts) {
    if (!opts?.extractFrontmatter)
        return md;
    if (FRONTMATTER_START_RE.test(md))
        return md;

    const scanLines = opts.frontmatterScanLines ?? 20;
    const head = md.split("\n").slice(0, scanLines);
    // A Map keeps insertion order, which becomes the key order in the YAML.
    const meta = new Map();

    // Title from first H1
    const h1 = head.join("\n").match(TITLE_RE);
    const h1Title = h1 ? h1[1].trim() : "";
    if (h1Title !== "")
        meta.set("title", h1Title);

    // Key: value lines
    for (const line of head) {
        const m = META_LINE_RE.exec(line);
        if (!m)
            continue;
        const key = LABEL_KEYS.get(m[1].toLowerCase());
        const value = m[2].trim();
        if (key === undefined || value === "" || meta.has(key))
            continue;
        meta.set(key, value);
    }

    if (meta.size === 0)
        return md;

    const yaml = [...meta]
        .map(([k, v]) => `${k}: ${yamlQuote(v)}`)
        .join("\n");
    return `---\n${yaml}\n---\n\n${md}`;
}
39
+ //# sourceMappingURL=extract-metadata-frontmatter.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"extract-metadata-frontmatter.js","sourceRoot":"","sources":["../../src/regex/extract-metadata-frontmatter.ts"],"names":[],"mappings":"AAEA,MAAM,QAAQ,GAAG,gBAAgB,CAAC;AAClC,MAAM,YAAY,GAChB,oGAAoG,CAAC;AACvG,MAAM,WAAW,GAAG,qCAAqC,CAAC;AAC1D,MAAM,aAAa,GAAG,gCAAgC,CAAC;AACvD,MAAM,cAAc,GAAG,yBAAyB,CAAC;AAEjD,MAAM,UAAU,0BAA0B,CACxC,EAAU,EACV,IAAmB;IAEnB,IAAI,CAAC,IAAI,EAAE,kBAAkB;QAAE,OAAO,EAAE,CAAC;IACzC,IAAI,EAAE,CAAC,UAAU,CAAC,OAAO,CAAC;QAAE,OAAO,EAAE,CAAC;IAEtC,MAAM,SAAS,GAAG,IAAI,EAAE,oBAAoB,IAAI,EAAE,CAAC;IACnD,MAAM,KAAK,GAAG,EAAE,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC;IAC7B,MAAM,IAAI,GAAG,KAAK,CAAC,KAAK,CAAC,CAAC,EAAE,SAAS,CAAC,CAAC;IAEvC,MAAM,IAAI,GAA2B,EAAE,CAAC;IAExC,sBAAsB;IACtB,MAAM,EAAE,GAAG,IAAI,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC,KAAK,CAAC,QAAQ,CAAC,CAAC;IAC3C,IAAI,EAAE;QAAE,IAAI,CAAC,OAAO,CAAC,GAAG,EAAE,CAAC,CAAC,CAAE,CAAC,IAAI,EAAE,CAAC;IAEtC,mBAAmB;IACnB,KAAK,MAAM,IAAI,IAAI,IAAI,EAAE,CAAC;QACxB,MAAM,CAAC,GAAG,IAAI,CAAC,KAAK,CAAC,YAAY,CAAC,CAAC;QACnC,IAAI,CAAC,CAAC;YAAE,SAAS;QACjB,MAAM,GAAG,GAAG,CAAC,CAAC,CAAC,CAAE,CAAC,IAAI,EAAE,CAAC;QACzB,IAAI,WAAW,CAAC,IAAI,CAAC,IAAI,CAAC,IAAI,CAAC,IAAI,CAAC,MAAM,CAAC;YAAE,IAAI,CAAC,MAAM,CAAC,GAAG,GAAG,CAAC;aAC3D,IAAI,aAAa,CAAC,IAAI,CAAC,IAAI,CAAC,IAAI,CAAC,IAAI,CAAC,QAAQ,CAAC;YAAE,IAAI,CAAC,QAAQ,CAAC,GAAG,GAAG,CAAC;aACtE,IAAI,cAAc,CAAC,IAAI,CAAC,IAAI,CAAC,IAAI,CAAC,IAAI,CAAC,SAAS,CAAC;YACpD,IAAI,CAAC,SAAS,CAAC,GAAG,GAAG,CAAC;IAC1B,CAAC;IAED,IAAI,MAAM,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC,MAAM,KAAK,CAAC;QAAE,OAAO,EAAE,CAAC;IAE9C,MAAM,IAAI,GAAG,MAAM,CAAC,OAAO,CAAC,IAAI,CAAC;SAC9B,GAAG,CAAC,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,EAAE,EAAE,CAAC,GAAG,CAAC,MAAM,CAAC,CAAC,OAAO,CAAC,IAAI,EAAE,KAAK,CAAC,GAAG,CAAC;SACpD,IAAI,CAAC,IAAI,CAAC,CAAC;IAEd,OAAO,QAAQ,IAAI,YAAY,EAAE,EAAE,CAAC;AACtC,CAAC"}
@@ -0,0 +1,2 @@
1
+ export declare function fixFootnoteMarkers(md: string): string;
2
+ //# sourceMappingURL=fix-footnote-markers.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"fix-footnote-markers.d.ts","sourceRoot":"","sources":["../../src/regex/fix-footnote-markers.ts"],"names":[],"mappings":"AAiBA,wBAAgB,kBAAkB,CAAC,EAAE,EAAE,MAAM,GAAG,MAAM,CAgBrD"}
@@ -0,0 +1,23 @@
1
+ import { withProtectedCode } from "../util/protect-code.js";
2
/** Superscript digit characters and the ASCII digit each one stands for. */
const SUPER_MAP = {
    "¹": "1", "²": "2", "³": "3", "⁴": "4", "⁵": "5",
    "⁶": "6", "⁷": "7", "⁸": "8", "⁹": "9", "⁰": "0",
};
const SUPER_CHARS = Object.keys(SUPER_MAP).join("");

/**
 * A run of superscript digits directly after a word character.
 * The word character is any Unicode letter, combining mark, decimal digit or
 * underscore, so markers after accented letters ("café¹") are recognised.
 * `\p{Nd}` is used rather than `\p{N}` because the superscript digits
 * themselves are in `\p{No}` and must not count as the preceding word char.
 */
const INLINE_MARKER_RE = new RegExp(`([\\p{L}\\p{M}\\p{Nd}_])([${SUPER_CHARS}]+)`, "gu");

/** A line that starts with superscript digits, then blanks, then text. */
const DEFINITION_RE = new RegExp(`^([${SUPER_CHARS}]+)[ \\t]+(.+)$`, "gmu");

/** Convert a run of superscript digits to ASCII digits ("¹²" -> "12"). */
function toDigits(sup) {
    return [...sup]
        .map((c) => SUPER_MAP[c] ?? c)
        .join("");
}

/**
 * Rewrite superscript-digit footnote markers as markdown footnote syntax.
 *
 *  - Inline reference:  `word¹`   -> `word[^1]`
 *  - Definition line:   `¹ Text`  -> `[^1]: Text`
 *
 * The text is passed through `withProtectedCode`, and the replacements are
 * applied inside its callback.
 *
 * @param {string} md - Markdown source.
 * @returns {string} The rewritten markdown.
 */
export function fixFootnoteMarkers(md) {
    return withProtectedCode(md, (s) => s
        // Inline references first; a definition marker at line start has no
        // preceding word character, so it is left for the second pass.
        .replace(INLINE_MARKER_RE, (_, w, sup) => `${w}[^${toDigits(sup)}]`)
        .replace(DEFINITION_RE, (_, sup, text) => `[^${toDigits(sup)}]: ${text}`));
}
23
+ //# sourceMappingURL=fix-footnote-markers.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"fix-footnote-markers.js","sourceRoot":"","sources":["../../src/regex/fix-footnote-markers.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,iBAAiB,EAAE,MAAM,yBAAyB,CAAC;AAE5D,MAAM,SAAS,GAA2B;IACxC,GAAG,EAAE,GAAG,EAAE,GAAG,EAAE,GAAG,EAAE,GAAG,EAAE,GAAG,EAAE,GAAG,EAAE,GAAG,EAAE,GAAG,EAAE,GAAG;IAChD,GAAG,EAAE,GAAG,EAAE,GAAG,EAAE,GAAG,EAAE,GAAG,EAAE,GAAG,EAAE,GAAG,EAAE,GAAG,EAAE,GAAG,EAAE,GAAG;CACjD,CAAC;AAEF,MAAM,WAAW,GAAG,MAAM,CAAC,IAAI,CAAC,SAAS,CAAC,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;AACpD,MAAM,QAAQ,GAAG,IAAI,MAAM,CAAC,IAAI,WAAW,IAAI,EAAE,GAAG,CAAC,CAAC;AAEtD,SAAS,QAAQ,CAAC,GAAW;IAC3B,OAAO,GAAG;SACP,KAAK,CAAC,EAAE,CAAC;SACT,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,SAAS,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC;SAC7B,IAAI,CAAC,EAAE,CAAC,CAAC;AACd,CAAC;AAED,MAAM,UAAU,kBAAkB,CAAC,EAAU;IAC3C,OAAO,iBAAiB,CAAC,EAAE,EAAE,CAAC,CAAC,EAAE,EAAE;QACjC,iFAAiF;QACjF,IAAI,GAAG,GAAG,CAAC,CAAC,OAAO,CACjB,IAAI,MAAM,CAAC,UAAU,WAAW,KAAK,EAAE,GAAG,CAAC,EAC3C,CAAC,CAAC,EAAE,CAAC,EAAE,GAAG,EAAE,EAAE,CAAC,GAAG,CAAW,KAAK,QAAQ,CAAC,GAAa,CAAC,GAAG,CAC7D,CAAC;QAEF,qEAAqE;QACrE,GAAG,GAAG,GAAG,CAAC,OAAO,CACf,IAAI,MAAM,CAAC,MAAM,WAAW,iBAAiB,EAAE,IAAI,CAAC,EACpD,CAAC,CAAC,EAAE,GAAG,EAAE,IAAI,EAAE,EAAE,CAAC,KAAK,QAAQ,CAAC,GAAa,CAAC,MAAM,IAAc,EAAE,CACrE,CAAC;QAEF,OAAO,GAAG,CAAC;IACb,CAAC,CAAC,CAAC;AACL,CAAC"}
@@ -0,0 +1,12 @@
1
+ /**
2
+ * fixHeadings — repair heading levels coming out of PDF/DOCX parsers.
3
+ *
4
+ * Common issues:
5
+ * - Inconsistent shifting: doc starts at H2 because H1 was on a title page.
6
+ * If no H1 exists, promote the shallowest heading to H1.
7
+ * - Stray ALL-CAPS lines that should clearly be headings but weren't tagged.
8
+ * Conservative: only promote isolated 4-60 char ALL-CAPS lines surrounded
9
+ * by blank lines.
10
+ */
11
+ export declare function fixHeadings(md: string): string;
12
+ //# sourceMappingURL=fix-headings.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"fix-headings.d.ts","sourceRoot":"","sources":["../../src/regex/fix-headings.ts"],"names":[],"mappings":"AAAA;;;;;;;;;GASG;AACH,wBAAgB,WAAW,CAAC,EAAE,EAAE,MAAM,GAAG,MAAM,CA2B9C"}
@@ -0,0 +1,40 @@
1
/** An isolated ALL-CAPS candidate line: 5-60 chars, letter first, letter/digit last. */
const CAPS_LINE_RE = /^[A-ZÁÉÍÓÚÑÜ][A-ZÁÉÍÓÚÑÜ0-9 ,&\-]{3,58}[A-ZÁÉÍÓÚÑÜ0-9]$/;
/** Opening/closing code fence (three or more backticks or tildes). */
const HEADING_FENCE_RE = /^\s*(`{3,}|~{3,})/;
/** ATX heading marker: 1-6 hashes followed by whitespace or end of line. */
const ATX_HEADING_RE = /^(#{1,6})(?=\s|$)/;

/**
 * fixHeadings — repair heading levels coming out of PDF/DOCX parsers.
 *
 * Common issues:
 *  - Inconsistent shifting: doc starts at H2 because H1 was on a title page.
 *    If no H1 exists, promote the shallowest heading to H1 (all headings are
 *    shifted by the same amount).
 *  - Stray ALL-CAPS lines that should clearly be headings but weren't tagged.
 *    Conservative: only promote isolated ALL-CAPS lines surrounded by blank
 *    lines; they become title-cased H2 headings.
 *
 * Lines inside fenced code blocks are never inspected or rewritten, so a
 * `# comment` in a shell snippet neither counts as an H1 nor gets promoted.
 *
 * @param {string} md - Markdown source.
 * @returns {string} Markdown with repaired headings.
 */
export function fixHeadings(md) {
    const lines = md.split("\n");
    const fenced = flagFencedLines(lines);

    // 1. Promote shallowest level to H1 if no H1 exists
    let shallowest = Infinity;
    lines.forEach((line, i) => {
        if (fenced[i])
            return;
        const m = ATX_HEADING_RE.exec(line);
        if (m)
            shallowest = Math.min(shallowest, m[1].length);
    });
    const promoteBy = Number.isFinite(shallowest) ? shallowest - 1 : 0;

    const out = lines.map((line, i) => {
        if (fenced[i])
            return line;

        const heading = ATX_HEADING_RE.exec(line);
        if (heading) {
            const depth = heading[1].length;
            return promoteBy > 0
                ? "#".repeat(depth - promoteBy) + line.slice(depth)
                : line;
        }

        // 2. Stray ALL-CAPS isolated line → H2.
        // "Isolated" means an empty line before and an empty line after; the
        // `i + 2 < lines.length` guard ensures the line after is a real blank
        // line and not just the empty tail produced by a final newline.
        const isolated = i >= 1 &&
            i + 2 < lines.length &&
            lines[i - 1] === "" &&
            lines[i + 1] === "";
        if (isolated && CAPS_LINE_RE.test(line))
            return `## ${toTitleCase(line)}`;

        return line;
    });
    return out.join("\n");
}

/** Return, per line, whether it is a fence delimiter or lies inside a fence. */
function flagFencedLines(lines) {
    let open = null; // fence character of the currently open block, if any
    return lines.map((line) => {
        const m = HEADING_FENCE_RE.exec(line);
        if (m) {
            const ch = m[1][0];
            if (open === null) {
                open = ch;
                return true;
            }
            if (open === ch) {
                open = null;
                return true;
            }
        }
        return open !== null;
    });
}

/**
 * Lower-case the string, then upper-case the first letter of every word.
 * Word starts are found with Unicode-aware classes so that accented letters
 * inside a word ("sección") do not start a new word.
 */
function toTitleCase(s) {
    return s
        .toLowerCase()
        .replace(/(?<![\p{L}\p{M}\p{N}_])\p{L}/gu, (c) => c.toUpperCase());
}
40
+ //# sourceMappingURL=fix-headings.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"fix-headings.js","sourceRoot":"","sources":["../../src/regex/fix-headings.ts"],"names":[],"mappings":"AAAA;;;;;;;;;GASG;AACH,MAAM,UAAU,WAAW,CAAC,EAAU;IACpC,IAAI,GAAG,GAAG,EAAE,CAAC;IAEb,oDAAoD;IACpD,MAAM,KAAK,GAAG,OAAO,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC;IAChC,IAAI,CAAC,KAAK,EAAE,CAAC;QACX,MAAM,MAAM,GAAG,IAAI,GAAG,EAAU,CAAC;QACjC,KAAK,MAAM,CAAC,IAAI,GAAG,CAAC,QAAQ,CAAC,eAAe,CAAC;YAAE,MAAM,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,CAAE,CAAC,MAAM,CAAC,CAAC;QACxE,IAAI,MAAM,CAAC,IAAI,GAAG,CAAC,EAAE,CAAC;YACpB,MAAM,UAAU,GAAG,IAAI,CAAC,GAAG,CAAC,GAAG,MAAM,CAAC,CAAC;YACvC,MAAM,SAAS,GAAG,UAAU,GAAG,CAAC,CAAC;YACjC,GAAG,GAAG,GAAG,CAAC,OAAO,CAAC,iBAAiB,EAAE,CAAC,CAAC,EAAE,MAAc,EAAE,EAAU,EAAE,EAAE;gBACrE,OAAO,GAAG,CAAC,MAAM,CAAC,MAAM,CAAC,MAAM,GAAG,SAAS,CAAC,GAAG,EAAE,CAAC;YACpD,CAAC,CAAC,CAAC;QACL,CAAC;IACH,CAAC;IAED,kGAAkG;IAClG,GAAG,GAAG,GAAG,CAAC,OAAO,CACf,sEAAsE,EACtE,CAAC,KAAK,EAAE,MAAc,EAAE,IAAY,EAAE,EAAE;QACtC,IAAI,cAAc,CAAC,IAAI,CAAC,IAAI,CAAC;YAAE,OAAO,KAAe,CAAC;QACtD,OAAO,GAAG,MAAM,QAAQ,WAAW,CAAC,IAAI,CAAC,MAAM,CAAC;IAClD,CAAC,CACF,CAAC;IAEF,OAAO,GAAG,CAAC;AACb,CAAC;AAED,SAAS,WAAW,CAAC,CAAS;IAC5B,OAAO,CAAC;SACL,WAAW,EAAE;SACb,OAAO,CAAC,SAAS,EAAE,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,WAAW,EAAE,CAAC,CAAC;AAChD,CAAC"}
@@ -0,0 +1,3 @@
1
+ import type { CleanOptions } from "../index.js";
2
+ export declare function fixLigatures(md: string, opts?: CleanOptions): string;
3
+ //# sourceMappingURL=fix-ligatures.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"fix-ligatures.d.ts","sourceRoot":"","sources":["../../src/regex/fix-ligatures.ts"],"names":[],"mappings":"AACA,OAAO,KAAK,EAAE,YAAY,EAAE,MAAM,aAAa,CAAC;AAchD,wBAAgB,YAAY,CAAC,EAAE,EAAE,MAAM,EAAE,IAAI,CAAC,EAAE,YAAY,GAAG,MAAM,CAKpE"}
@@ -0,0 +1,16 @@
1
+ import { withProtectedCode } from "../util/protect-code.js";
2
/**
 * Default replacements for the Latin typographic ligatures U+FB00–U+FB06.
 * Keys are written as escapes so that editors or text pipelines that
 * normalise Unicode cannot silently turn them into their ASCII expansions
 * (which would make the table an identity map).
 */
const LIGATURE_MAP = {
    "\uFB00": "ff",
    "\uFB01": "fi",
    "\uFB02": "fl",
    "\uFB03": "ffi",
    "\uFB04": "ffl",
    "\uFB05": "st", // long-s + t ligature
    "\uFB06": "st",
};

/** Matches any single default ligature code point. */
const LIGATURE_RE = /[\uFB00-\uFB06]/g;

/** Escape a literal string for use inside a regular expression. */
function escapeRegExp(s) {
    return s.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
}

/**
 * Build a global regex matching any of the given literal keys.
 * Longer keys come first so a multi-character key wins over its own prefix.
 */
function buildLigatureRegExp(keys) {
    const alternatives = keys
        .filter((k) => k.length > 0)
        .sort((a, b) => b.length - a.length)
        .map(escapeRegExp);
    return new RegExp(alternatives.join("|"), "g");
}

/**
 * Replace typographic ligatures with their plain-letter expansions.
 *
 * The text is passed through `withProtectedCode`, and the replacement is
 * applied inside its callback.
 *
 * @param {string} md - Markdown source.
 * @param {object} [opts] - Options. `opts.ligatureMap` adds to or overrides
 *   the default table; every key of the merged table is searched for, not
 *   only the default ligature code points.
 * @returns {string} Markdown with ligatures expanded.
 */
export function fixLigatures(md, opts) {
    const custom = opts?.ligatureMap;
    if (custom == null || Object.keys(custom).length === 0) {
        return withProtectedCode(md, (s) => s.replace(LIGATURE_RE, (ch) => LIGATURE_MAP[ch] ?? ch));
    }
    const map = { ...LIGATURE_MAP, ...custom };
    const pattern = buildLigatureRegExp(Object.keys(map));
    return withProtectedCode(md, (s) => s.replace(pattern, (hit) => map[hit] ?? hit));
}
16
+ //# sourceMappingURL=fix-ligatures.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"fix-ligatures.js","sourceRoot":"","sources":["../../src/regex/fix-ligatures.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,iBAAiB,EAAE,MAAM,yBAAyB,CAAC;AAG5D,MAAM,YAAY,GAAqC;IACrD,GAAG,EAAE,IAAI;IACT,GAAG,EAAE,IAAI;IACT,GAAG,EAAE,IAAI;IACT,GAAG,EAAE,KAAK;IACV,GAAG,EAAE,KAAK;IACV,GAAG,EAAE,IAAI;IACT,GAAG,EAAE,IAAI;CACV,CAAC;AAEF,MAAM,WAAW,GAAG,IAAI,MAAM,CAAC,OAAO,EAAE,GAAG,CAAC,CAAC;AAE7C,MAAM,UAAU,YAAY,CAAC,EAAU,EAAE,IAAmB;IAC1D,MAAM,GAAG,GAA2B,EAAE,GAAG,YAAY,EAAE,GAAG,CAAC,IAAI,EAAE,WAAW,IAAI,EAAE,CAAC,EAAE,CAAC;IACtF,OAAO,iBAAiB,CAAC,EAAE,EAAE,CAAC,CAAC,EAAE,EAAE,CACjC,CAAC,CAAC,OAAO,CAAC,WAAW,EAAE,CAAC,EAAE,EAAE,EAAE,CAAC,GAAG,CAAC,EAAE,CAAC,IAAI,EAAE,CAAC,CAC9C,CAAC;AACJ,CAAC"}
@@ -0,0 +1,13 @@
1
+ /**
2
+ * fixTables — best-effort repair of broken GFM tables.
3
+ *
4
+ * Common parser issues:
5
+ * - Missing leading/trailing pipes on rows.
6
+ * - Separator row has too few/many dashes per column.
7
+ * - Inconsistent column count across rows (pad with empty cells).
8
+ *
9
+ * Strategy: detect runs of contiguous lines that look table-ish
10
+ * (contain `|`), parse into cells, rebuild a clean GFM table.
11
+ */
12
+ export declare function fixTables(md: string): string;
13
+ //# sourceMappingURL=fix-tables.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"fix-tables.d.ts","sourceRoot":"","sources":["../../src/regex/fix-tables.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;GAUG;AACH,wBAAgB,SAAS,CAAC,EAAE,EAAE,MAAM,GAAG,MAAM,CAqB5C"}
@@ -0,0 +1,63 @@
1
/** Code fence delimiter at the start of a (trimmed) line. */
const TABLE_FENCE_RE = /^(`{3,}|~{3,})/;
/** One cell of a GFM delimiter row, e.g. `---`, `:--`, `--:`, `:-:`. */
const SEPARATOR_CELL_RE = /^:?-+:?$/;
/** A pipe that is not escaped with a backslash. */
const CELL_DELIMITER_RE = /(?<!\\)\|/;

/**
 * fixTables — best-effort repair of broken GFM tables.
 *
 * Common parser issues:
 *  - Missing leading/trailing pipes on rows.
 *  - Separator row has too few/many dashes per column.
 *  - Inconsistent column count across rows (pad with empty cells).
 *
 * Strategy: detect runs of contiguous lines that look table-ish
 * (contain `|`), parse into cells, rebuild a clean GFM table.
 * Lines inside fenced code blocks are passed through untouched. A run of a
 * single line is left as it is.
 *
 * @param {string} md - Markdown source.
 * @returns {string} Markdown with table runs rebuilt.
 */
export function fixTables(md) {
    const lines = md.split("\n");
    const out = [];
    let fenceChar = null; // fence character of the open code block, if any
    let i = 0;
    while (i < lines.length) {
        const line = lines[i];
        const fence = TABLE_FENCE_RE.exec(line.trim());
        if (fence) {
            // A fence only closes a block opened with the same character.
            const ch = fence[1][0];
            if (fenceChar === null)
                fenceChar = ch;
            else if (fenceChar === ch)
                fenceChar = null;
            out.push(line);
            i++;
            continue;
        }
        if (fenceChar === null && looksLikeTable(line)) {
            const start = i;
            while (i < lines.length && looksLikeTable(lines[i]))
                i++;
            out.push(...rebuildTable(lines.slice(start, i)));
            continue;
        }
        out.push(line);
        i++;
    }
    return out.join("\n");
}

/** A line is table-ish when it has ≥1 pipe and is not a code fence / blockquote. */
function looksLikeTable(line) {
    const trimmed = line.trim();
    return trimmed.includes("|") &&
        !TABLE_FENCE_RE.test(trimmed) &&
        !trimmed.startsWith(">");
}

/** True when every cell of the row is a delimiter-row cell. */
function isSeparatorRow(cells) {
    return cells.every((c) => SEPARATOR_CELL_RE.test(c));
}

/** Normalise a delimiter cell to three dashes, keeping its alignment colons. */
function alignmentMarker(cell) {
    const left = cell.startsWith(":");
    const right = cell.endsWith(":");
    if (left && right)
        return ":---:";
    if (left)
        return ":---";
    if (right)
        return "---:";
    return "---";
}

/**
 * Rebuild one run of table-ish lines as header, delimiter row and body rows,
 * all padded to the widest row.
 */
function rebuildTable(block) {
    const rows = block.map(parseRow);
    if (rows.length < 2)
        return block; // not enough rows to bother

    // Drop the existing delimiter row, remembering its alignment. Only the
    // first two rows are considered: further down, a dash-only row is data
    // (e.g. "- | -" meaning "no value") and must not be deleted.
    const sepIdx = rows.slice(0, 2).findIndex(isSeparatorRow);
    const alignments = sepIdx >= 0
        ? rows.splice(sepIdx, 1)[0].map(alignmentMarker)
        : [];

    const maxCols = Math.max(...rows.map((r) => r.length));
    const padded = rows.map((r) => [
        ...r,
        ...Array(maxCols - r.length).fill(""),
    ]);
    const sep = Array.from({ length: maxCols }, (_, col) => alignments[col] ?? "---");
    const [header, ...body] = padded;
    return [renderRow(header), renderRow(sep), ...body.map(renderRow)];
}

/** Split a line into trimmed cells; escaped pipes (`\|`) stay inside their cell. */
function parseRow(line) {
    const trimmed = line.trim().replace(/^\|/, "").replace(/(?<!\\)\|$/, "");
    return trimmed.split(CELL_DELIMITER_RE).map((c) => c.trim());
}

/** Render cells as a GFM row with leading and trailing pipes. */
function renderRow(cells) {
    return `| ${cells.join(" | ")} |`;
}
63
+ //# sourceMappingURL=fix-tables.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"fix-tables.js","sourceRoot":"","sources":["../../src/regex/fix-tables.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;GAUG;AACH,MAAM,UAAU,SAAS,CAAC,EAAU;IAClC,MAAM,KAAK,GAAG,EAAE,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC;IAC7B,MAAM,GAAG,GAAa,EAAE,CAAC;IACzB,IAAI,CAAC,GAAG,CAAC,CAAC;IACV,IAAI,MAAM,GAAG,KAAK,CAAC;IAEnB,OAAO,CAAC,GAAG,KAAK,CAAC,MAAM,EAAE,CAAC;QACxB,MAAM,IAAI,GAAG,KAAK,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC;QAC5B,IAAI,MAAM,CAAC,IAAI,CAAC,IAAI,CAAC,IAAI,EAAE,CAAC;YAAE,MAAM,GAAG,CAAC,MAAM,CAAC;QAE/C,IAAI,CAAC,MAAM,IAAI,cAAc,CAAC,IAAI,CAAC,EAAE,CAAC;YACpC,MAAM,KAAK,GAAG,CAAC,CAAC;YAChB,OAAO,CAAC,GAAG,KAAK,CAAC,MAAM,IAAI,cAAc,CAAC,KAAK,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC;gBAAE,CAAC,EAAE,CAAC;YAC/D,MAAM,KAAK,GAAG,KAAK,CAAC,KAAK,CAAC,KAAK,EAAE,CAAC,CAAC,CAAC;YACpC,GAAG,CAAC,IAAI,CAAC,GAAG,YAAY,CAAC,KAAK,CAAC,CAAC,CAAC;YACjC,SAAS;QACX,CAAC;QACD,GAAG,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;QACf,CAAC,EAAE,CAAC;IACN,CAAC;IACD,OAAO,GAAG,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;AACxB,CAAC;AAED,SAAS,cAAc,CAAC,IAAY;IAClC,4CAA4C;IAC5C,OAAO,IAAI,CAAC,IAAI,CAAC,IAAI,CAAC,IAAI,CAAC,MAAM,CAAC,IAAI,CAAC,IAAI,CAAC,IAAI,EAAE,CAAC,IAAI,CAAC,IAAI,CAAC,IAAI,CAAC,IAAI,CAAC,IAAI,EAAE,CAAC,CAAC;AACjF,CAAC;AAED,SAAS,YAAY,CAAC,KAAe;IACnC,MAAM,IAAI,GAAG,KAAK,CAAC,GAAG,CAAC,QAAQ,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC;IAC7D,IAAI,IAAI,CAAC,MAAM,GAAG,CAAC;QAAE,OAAO,KAAK,CAAC,CAAC,4BAA4B;IAE/D,yCAAyC;IACzC,MAAM,MAAM,GAAG,IAAI,CAAC,SAAS,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,UAAU,CAAC,IAAI,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC,CAAC,CAAC,CAAC;IAChF,IAAI,MAAM,IAAI,CAAC;QAAE,IAAI,CAAC,MAAM,CAAC,MAAM,EAAE,CAAC,CAAC,CAAC;IAExC,MAAM,OAAO,GAAG,IAAI,CAAC,GAAG,CAAC,GAAG,IAAI,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC;IACvD,MAAM,MAAM,GAAG,IAAI,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE;QAC5B,MAAM,IAAI,GAAG,CAAC,GAAG,CAAC,CAAC,CAAC;QACpB,OAAO,IAAI,CAAC,MAAM,GAAG,OAAO;YAAE,IAAI,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;QAC5C,OAAO,IAAI,CAAC;IACd,CAAC,CAAC,CA
AC;IAEH,MAAM,GAAG,GAAG,KAAK,CAAC,OAAO,CAAC,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;IACvC,OAAO,CAAC,SAAS,CAAC,MAAM,CAAC,CAAC,CAAE,CAAC,EAAE,SAAS,CAAC,GAAG,CAAC,EAAE,GAAG,MAAM,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,SAAS,CAAC,CAAC,CAAC;AACpF,CAAC;AAED,SAAS,QAAQ,CAAC,IAAY;IAC5B,MAAM,OAAO,GAAG,IAAI,CAAC,IAAI,EAAE,CAAC,OAAO,CAAC,KAAK,EAAE,EAAE,CAAC,CAAC,OAAO,CAAC,KAAK,EAAE,EAAE,CAAC,CAAC;IAClE,OAAO,OAAO,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC,CAAC;AACjD,CAAC;AAED,SAAS,SAAS,CAAC,KAAe;IAChC,OAAO,IAAI,GAAG,KAAK,CAAC,IAAI,CAAC,KAAK,CAAC,GAAG,IAAI,CAAC;AACzC,CAAC"}