castdown-cleaners 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (127) hide show
  1. package/LICENSE +180 -0
  2. package/README.md +198 -0
  3. package/dist/index.d.ts +47 -0
  4. package/dist/index.d.ts.map +1 -0
  5. package/dist/index.js +110 -0
  6. package/dist/index.js.map +1 -0
  7. package/dist/regex/annotate-figures-tables.d.ts +3 -0
  8. package/dist/regex/annotate-figures-tables.d.ts.map +1 -0
  9. package/dist/regex/annotate-figures-tables.js +11 -0
  10. package/dist/regex/annotate-figures-tables.js.map +1 -0
  11. package/dist/regex/collapse-blank-lines.d.ts +6 -0
  12. package/dist/regex/collapse-blank-lines.d.ts.map +1 -0
  13. package/dist/regex/collapse-blank-lines.js +8 -0
  14. package/dist/regex/collapse-blank-lines.js.map +1 -0
  15. package/dist/regex/collapse-redundant-emphasis.d.ts +2 -0
  16. package/dist/regex/collapse-redundant-emphasis.d.ts.map +1 -0
  17. package/dist/regex/collapse-redundant-emphasis.js +19 -0
  18. package/dist/regex/collapse-redundant-emphasis.js.map +1 -0
  19. package/dist/regex/decode-html-entities.d.ts +2 -0
  20. package/dist/regex/decode-html-entities.d.ts.map +1 -0
  21. package/dist/regex/decode-html-entities.js +73 -0
  22. package/dist/regex/decode-html-entities.js.map +1 -0
  23. package/dist/regex/dedupe-links.d.ts +9 -0
  24. package/dist/regex/dedupe-links.d.ts.map +1 -0
  25. package/dist/regex/dedupe-links.js +16 -0
  26. package/dist/regex/dedupe-links.js.map +1 -0
  27. package/dist/regex/detect-space-tables.d.ts +29 -0
  28. package/dist/regex/detect-space-tables.d.ts.map +1 -0
  29. package/dist/regex/detect-space-tables.js +125 -0
  30. package/dist/regex/detect-space-tables.js.map +1 -0
  31. package/dist/regex/detect-toc.d.ts +14 -0
  32. package/dist/regex/detect-toc.d.ts.map +1 -0
  33. package/dist/regex/detect-toc.js +35 -0
  34. package/dist/regex/detect-toc.js.map +1 -0
  35. package/dist/regex/extract-metadata-frontmatter.d.ts +3 -0
  36. package/dist/regex/extract-metadata-frontmatter.d.ts.map +1 -0
  37. package/dist/regex/extract-metadata-frontmatter.js +39 -0
  38. package/dist/regex/extract-metadata-frontmatter.js.map +1 -0
  39. package/dist/regex/fix-footnote-markers.d.ts +2 -0
  40. package/dist/regex/fix-footnote-markers.d.ts.map +1 -0
  41. package/dist/regex/fix-footnote-markers.js +23 -0
  42. package/dist/regex/fix-footnote-markers.js.map +1 -0
  43. package/dist/regex/fix-headings.d.ts +12 -0
  44. package/dist/regex/fix-headings.d.ts.map +1 -0
  45. package/dist/regex/fix-headings.js +40 -0
  46. package/dist/regex/fix-headings.js.map +1 -0
  47. package/dist/regex/fix-ligatures.d.ts +3 -0
  48. package/dist/regex/fix-ligatures.d.ts.map +1 -0
  49. package/dist/regex/fix-ligatures.js +16 -0
  50. package/dist/regex/fix-ligatures.js.map +1 -0
  51. package/dist/regex/fix-tables.d.ts +13 -0
  52. package/dist/regex/fix-tables.d.ts.map +1 -0
  53. package/dist/regex/fix-tables.js +63 -0
  54. package/dist/regex/fix-tables.js.map +1 -0
  55. package/dist/regex/html-tables-to-gfm.d.ts +21 -0
  56. package/dist/regex/html-tables-to-gfm.d.ts.map +1 -0
  57. package/dist/regex/html-tables-to-gfm.js +76 -0
  58. package/dist/regex/html-tables-to-gfm.js.map +1 -0
  59. package/dist/regex/join-broken-lines.d.ts +10 -0
  60. package/dist/regex/join-broken-lines.d.ts.map +1 -0
  61. package/dist/regex/join-broken-lines.js +40 -0
  62. package/dist/regex/join-broken-lines.js.map +1 -0
  63. package/dist/regex/join-soft-hyphens.d.ts +9 -0
  64. package/dist/regex/join-soft-hyphens.d.ts.map +1 -0
  65. package/dist/regex/join-soft-hyphens.js +11 -0
  66. package/dist/regex/join-soft-hyphens.js.map +1 -0
  67. package/dist/regex/normalize-horizontal-rules.d.ts +2 -0
  68. package/dist/regex/normalize-horizontal-rules.d.ts.map +1 -0
  69. package/dist/regex/normalize-horizontal-rules.js +20 -0
  70. package/dist/regex/normalize-horizontal-rules.js.map +1 -0
  71. package/dist/regex/normalize-list-markers.d.ts +2 -0
  72. package/dist/regex/normalize-list-markers.d.ts.map +1 -0
  73. package/dist/regex/normalize-list-markers.js +35 -0
  74. package/dist/regex/normalize-list-markers.js.map +1 -0
  75. package/dist/regex/normalize-numbered-lists.d.ts +2 -0
  76. package/dist/regex/normalize-numbered-lists.d.ts.map +1 -0
  77. package/dist/regex/normalize-numbered-lists.js +9 -0
  78. package/dist/regex/normalize-numbered-lists.js.map +1 -0
  79. package/dist/regex/normalize-unicode.d.ts +2 -0
  80. package/dist/regex/normalize-unicode.d.ts.map +1 -0
  81. package/dist/regex/normalize-unicode.js +49 -0
  82. package/dist/regex/normalize-unicode.js.map +1 -0
  83. package/dist/regex/normalize-whitespace-in-lines.d.ts +2 -0
  84. package/dist/regex/normalize-whitespace-in-lines.d.ts.map +1 -0
  85. package/dist/regex/normalize-whitespace-in-lines.js +24 -0
  86. package/dist/regex/normalize-whitespace-in-lines.js.map +1 -0
  87. package/dist/regex/strip-boilerplate.d.ts +3 -0
  88. package/dist/regex/strip-boilerplate.d.ts.map +1 -0
  89. package/dist/regex/strip-boilerplate.js +16 -0
  90. package/dist/regex/strip-boilerplate.js.map +1 -0
  91. package/dist/regex/strip-docx-artifacts.d.ts +19 -0
  92. package/dist/regex/strip-docx-artifacts.d.ts.map +1 -0
  93. package/dist/regex/strip-docx-artifacts.js +34 -0
  94. package/dist/regex/strip-docx-artifacts.js.map +1 -0
  95. package/dist/regex/strip-empty-headings.d.ts +2 -0
  96. package/dist/regex/strip-empty-headings.d.ts.map +1 -0
  97. package/dist/regex/strip-empty-headings.js +6 -0
  98. package/dist/regex/strip-empty-headings.js.map +1 -0
  99. package/dist/regex/strip-html-artifacts.d.ts +2 -0
  100. package/dist/regex/strip-html-artifacts.d.ts.map +1 -0
  101. package/dist/regex/strip-html-artifacts.js +24 -0
  102. package/dist/regex/strip-html-artifacts.js.map +1 -0
  103. package/dist/regex/strip-page-numbers.d.ts +2 -0
  104. package/dist/regex/strip-page-numbers.d.ts.map +1 -0
  105. package/dist/regex/strip-page-numbers.js +23 -0
  106. package/dist/regex/strip-page-numbers.js.map +1 -0
  107. package/dist/regex/strip-pptx-notes.d.ts +22 -0
  108. package/dist/regex/strip-pptx-notes.d.ts.map +1 -0
  109. package/dist/regex/strip-pptx-notes.js +32 -0
  110. package/dist/regex/strip-pptx-notes.js.map +1 -0
  111. package/dist/regex/strip-repeated-headers.d.ts +2 -0
  112. package/dist/regex/strip-repeated-headers.d.ts.map +1 -0
  113. package/dist/regex/strip-repeated-headers.js +37 -0
  114. package/dist/regex/strip-repeated-headers.js.map +1 -0
  115. package/dist/regex/strip-url-tracking-params.d.ts +2 -0
  116. package/dist/regex/strip-url-tracking-params.d.ts.map +1 -0
  117. package/dist/regex/strip-url-tracking-params.js +26 -0
  118. package/dist/regex/strip-url-tracking-params.js.map +1 -0
  119. package/dist/regex/wrap-long-cell-text.d.ts +28 -0
  120. package/dist/regex/wrap-long-cell-text.d.ts.map +1 -0
  121. package/dist/regex/wrap-long-cell-text.js +66 -0
  122. package/dist/regex/wrap-long-cell-text.js.map +1 -0
  123. package/dist/util/protect-code.d.ts +6 -0
  124. package/dist/util/protect-code.d.ts.map +1 -0
  125. package/dist/util/protect-code.js +20 -0
  126. package/dist/util/protect-code.js.map +1 -0
  127. package/package.json +63 -0
package/LICENSE ADDED
@@ -0,0 +1,180 @@
1
+ Apache License
2
+ Version 2.0, January 2004
3
+ http://www.apache.org/licenses/
4
+
5
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6
+
7
+ 1. Definitions.
8
+
9
+ "License" shall mean the terms and conditions for use, reproduction,
10
+ and distribution as defined by Sections 1 through 9 of this document.
11
+
12
+ "Licensor" shall mean the copyright owner or entity authorized by
13
+ the copyright owner that is granting the License.
14
+
15
+ "Legal Entity" shall mean the union of the acting entity and all
16
+ other entities that control, are controlled by, or are under common
17
+ control with that entity. For the purposes of this definition,
18
+ "control" means (i) the power, direct or indirect, to cause the
19
+ direction or management of such entity, whether by contract or
20
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
21
+ outstanding shares, or (iii) beneficial ownership of such entity.
22
+
23
+ "You" (or "Your") shall mean an individual or Legal Entity
24
+ exercising permissions granted by this License.
25
+
26
+ "Source" form shall mean the preferred form for making modifications,
27
+ including but not limited to software source code, documentation
28
+ source, and configuration files.
29
+
30
+ "Object" form shall mean any form resulting from mechanical
31
+ transformation or translation of a Source form, including but
32
+ not limited to compiled object code, generated documentation,
33
+ and conversions to other media types.
34
+
35
+ "Work" shall mean the work of authorship made available under
36
+ the License, as indicated by a copyright notice that is included in
37
+ or attached to the work (an example is provided in the Appendix below).
38
+
39
+ "Derivative Works" shall mean any work, whether in Source or Object
40
+ form, that is based on (or derived from) the Work and for which the
41
+ editorial revisions, annotations, elaborations, or other
42
+ transformations represent, as a whole, an original work of authorship.
43
+ For the purposes of this License, Derivative Works shall not include
44
+ works that remain separable from, or merely link (or bind by name)
45
+ to the interfaces of, the Work and Derivative Works thereof.
46
+
47
+ "Contribution" shall mean, as submitted to the Licensor for inclusion
48
+ in the Work by the copyright owner or by an Individual or Legal Entity
49
+ authorized to submit on behalf of the copyright owner. For the purposes
50
+ of this definition, "submitted" means any form of electronic, verbal,
51
+ or written communication sent to the Licensor or its representatives,
52
+ including but not limited to communication on electronic mailing lists,
53
+ source code control systems, and issue tracking systems that are managed
54
+ by, or on behalf of, the Licensor for the purpose of submitting and
55
+ discussing improvements to the Work, but excluding communication that is
56
+ conspicuously marked or designated in writing by the copyright owner as
57
+ "Not a Contribution."
58
+
59
+ "Contributor" shall mean Licensor and any Legal Entity on behalf of
60
+ whom a Contribution has been received by the Licensor and included
61
+ within the Work.
62
+
63
+ 2. Grant of Copyright License. Subject to the terms and conditions of
64
+ this License, each Contributor hereby grants to You a perpetual,
65
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
66
+ copyright license to reproduce, prepare Derivative Works of,
67
+ publicly display, publicly perform, sublicense, and distribute the
68
+ Work and such Derivative Works in Source or Object form.
69
+
70
+ 3. Grant of Patent License. Subject to the terms and conditions of
71
+ this License, each Contributor hereby grants to You a perpetual,
72
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
73
+ (except as stated in this section) patent license to make, have made,
74
+ use, offer to sell, sell, import, and otherwise transfer the Work,
75
+ where such license applies only to those patent claims licensable
76
+ by such Contributor that are necessarily infringed by their
77
+ Contribution(s) alone or by the combination of their Contribution(s)
78
+ with the Work to which such Contribution(s) was submitted. If You
79
+ institute patent litigation against any entity (including a cross-claim
80
+ or counterclaim in a lawsuit) alleging that the Work or any
81
+ Contribution embodied within the Work constitutes direct or contributory
82
+ patent infringement, then any patent licenses granted to You under
83
+ this License for that Work shall terminate as of the date such
84
+ litigation is filed.
85
+
86
+ 4. Redistribution. You may reproduce and distribute copies of the
87
+ Work or Derivative Works thereof in any medium, with or without
88
+ modifications, and in Source or Object form, provided that You
89
+ meet the following conditions:
90
+
91
+ (a) You must give any other recipients of the Work or Derivative
92
+ Works a copy of this License; and
93
+
94
+ (b) You must cause any modified files to carry prominent notices
95
+ stating that You changed the files; and
96
+
97
+ (c) You must retain, in the Source form of any Derivative Works
98
+ that You distribute, all copyright, patent, trademark, and
99
+ attribution notices from the Source form of the Work,
100
+ excluding those notices that do not pertain to any part of
101
+ the Derivative Works; and
102
+
103
+ (d) If the Work includes a "NOTICE" text file as part of its
104
+ distribution, You must include a readable copy of the
105
+ attribution notices contained within such NOTICE file, in
106
+ at least one of the following places: within a NOTICE text
107
+ file distributed as part of the Derivative Works; within
108
+ the Source form or documentation, if provided along with the
109
+ Derivative Works; or, within a display generated by the
110
+ Derivative Works, if and wherever such third-party notices
111
+ normally appear. The contents of the NOTICE file are for
112
+ informational purposes only and do not modify the License.
113
+ You may add Your own attribution notices within Derivative
114
+ Works that You distribute, alongside or in addition to the
115
+ NOTICE text from the Work, provided that such additional
116
+ attribution notices cannot be construed as modifying the License.
117
+
118
+ You may add Your own license statement for Your modifications and
119
+ may provide additional grant of rights to use, copy, modify, merge,
120
+ publish, distribute, sublicense, and/or sell copies of the
121
+ Contribution, and to permit persons to whom the Contribution is
122
+ furnished to do so.
123
+
124
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
125
+ any Contribution intentionally submitted for inclusion in the Work
126
+ by You to the Licensor shall be under the terms and conditions of
127
+ this License, without any additional terms or conditions.
128
+ Notwithstanding the above, nothing herein shall supersede or modify
129
+ the terms of any separate license agreement you may have executed
130
+ with Licensor regarding such Contributions.
131
+
132
+ 6. Trademarks. This License does not grant permission to use the trade
133
+ names, trademarks, service marks, or product names of the Licensor,
134
+ except as required for reasonable and customary use in describing the
135
+ origin of the Work and reproducing the content of the NOTICE file.
136
+
137
+ 7. Disclaimer of Warranty. Unless required by applicable law or
138
+ agreed to in writing, Licensor provides the Work (and each
139
+ Contributor provides its Contributions) on an "AS IS" BASIS,
140
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
141
+ implied, including, without limitation, any conditions of TITLE,
142
+ NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR
143
+ PURPOSE. You are solely responsible for determining the
144
+ appropriateness of using or reproducing the Work and assume any
145
+ risks associated with Your exercise of permissions under this License.
146
+
147
+ 8. Limitation of Liability. In no event and under no legal theory,
148
+ whether in tort (including negligence), contract, or otherwise,
149
+ unless required by applicable law (such as deliberate and grossly
150
+ negligent acts) or agreed to in writing, shall any Contributor be
151
+ liable to You for damages, including any direct, indirect, special,
152
+ incidental, or exemplary damages of any character arising as a result
153
+ of this License or out of the use or inability to use the Work
154
+ (including but not limited to damages for loss of goodwill, work
155
+ stoppage, computer failure or malfunction, or all other commercial
156
+ damages or losses), even if such Contributor has been advised of
157
+ the possibility of such damages.
158
+
159
+ 9. Accepting Warranty or Additional Liability. While redistributing
160
+ the Work or Derivative Works thereof, You may choose to offer,
161
+ and charge a fee for, acceptance of support, warranty, indemnity,
162
+ or other liability obligations and/or rights consistent with this
163
+ License. However, in accepting such obligations, You may offer only
164
+ conditions consistent with this License.
165
+
166
+ END OF TERMS AND CONDITIONS
167
+
168
+ Copyright 2026 castdown contributors
169
+
170
+ Licensed under the Apache License, Version 2.0 (the "License");
171
+ you may not use this file except in compliance with the License.
172
+ You may obtain a copy of the License at
173
+
174
+ http://www.apache.org/licenses/LICENSE-2.0
175
+
176
+ Unless required by applicable law or agreed to in writing, software
177
+ distributed under the License is distributed on an "AS IS" BASIS,
178
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
179
+ See the License for the specific language governing permissions and
180
+ limitations under the License.
package/README.md ADDED
@@ -0,0 +1,198 @@
1
+ # castdown-cleaners
2
+
3
+ [![npm](https://img.shields.io/npm/v/castdown-cleaners)](https://www.npmjs.com/package/castdown-cleaners)
4
+ [![License](https://img.shields.io/badge/license-Apache%202.0-blue)](LICENSE)
5
+
6
+ Composable Markdown post-processing pipeline. Fixes the dirty output that PDF parsers, DOCX converters, and web crawlers produce before it reaches your LLM or RAG pipeline.
7
+
8
+ **Works independently with MarkItDown, Docling, Pandoc, LlamaParse, or any tool that outputs Markdown.**
9
+
10
+ ---
11
+
12
+ ## Why
13
+
14
+ PDF parsers produce ligatures (`ﬁgure` instead of `figure`), broken bullets (`•`), superscript footnotes (`¹`), and HTML entity noise (`&amp;`). DOCX converters leave span artifacts and `{.underline}` syntax. Web crawlers embed UTM tracking params. LLMs and vector databases see all of this as noise — tokens that aren't searchable, chunks that split poorly.
15
+
16
+ `castdown-cleaners` applies 29 targeted transformations in a validated pipeline to produce clean, normalized Markdown ready for downstream use.
17
+
18
+ ---
19
+
20
+ ## Install
21
+
22
+ ```bash
23
+ npm install castdown-cleaners
24
+ # or
25
+ pnpm add castdown-cleaners
26
+ ```
27
+
28
+ ---
29
+
30
+ ## Quick start
31
+
32
+ ```typescript
33
+ import { clean } from "castdown-cleaners";
34
+
35
+ const raw = `AT&amp;T Q4 Report\n\n• Revenue grew 15%\n◦ Digital: +22%\n\nﬁgure 1 shows ﬂow of ﬁnancial data.\n\n¹ Preliminary data only`;
36
+
37
+ const { markdown, applied } = await clean(raw, { source: "pdf" });
38
+
39
+ console.log(markdown);
40
+ // AT&T Q4 Report
41
+ //
42
+ // - Revenue grew 15%
43
+ // - Digital: +22%
44
+ //
45
+ // figure 1 shows flow of financial data.
46
+ //
47
+ // [^1]: Preliminary data only
48
+
49
+ console.log(applied);
50
+ // ["decodeHtmlEntities", "fixLigatures", "normalizeListMarkers",
51
+ // "fixFootnoteMarkers", "remark-normalize"]
52
+ ```
53
+
54
+ ---
55
+
56
+ ## Usage with MarkItDown
57
+
58
+ ```typescript
59
+ import { markitdown } from "markitdown"; // your MarkItDown wrapper
60
+ import { clean } from "castdown-cleaners";
61
+
62
+ const raw = await markitdown.convert("report.pdf");
63
+ const { markdown } = await clean(raw, { source: "pdf" });
64
+ ```
65
+
66
+ ## Usage with Docling
67
+
68
+ ```typescript
69
+ import { clean } from "castdown-cleaners";
70
+
71
+ // Docling output typically comes from HTML conversion path
72
+ const raw = await doclingClient.convert("document.pdf");
73
+ const { markdown } = await clean(raw.markdown, { source: "pdf" });
74
+ ```
75
+
76
+ ## Usage with Pandoc / LlamaParse output
77
+
78
+ ```typescript
79
+ import { clean } from "castdown-cleaners";
80
+
81
+ // DOCX via Pandoc
82
+ const { markdown } = await clean(pandocOutput, { source: "docx" });
83
+
84
+ // LlamaParse returns Markdown — treat as unknown source
85
+ const { markdown: cleaned } = await clean(llamaParseOutput);
86
+ ```
87
+
88
+ ---
89
+
90
+ ## API
91
+
92
+ ### `clean(input, opts?): Promise<CleanResult>`
93
+
94
+ ```typescript
95
+ interface CleanOptions {
96
+ source?: "pdf" | "docx" | "pptx" | "html" | "epub" | "unknown";
97
+ skip?: string[]; // cleaner names to skip
98
+ stripToc?: boolean; // remove table of contents (default: false)
99
+ keepNotes?: boolean; // keep PPTX speaker notes (default: false)
100
+ ligatureMap?: Record<string, string>; // extend/override ligature map
101
+ extractFrontmatter?: boolean; // extract YAML frontmatter (default: false)
102
+ frontmatterScanLines?: number; // lines to scan for metadata (default: 20)
103
+ keepBoilerplate?: boolean; // keep copyright lines (default: false)
104
+ keepUrlTracking?: boolean; // keep UTM params (default: false)
105
+ }
106
+
107
+ interface CleanResult {
108
+ markdown: string;
109
+ applied: string[]; // names of cleaners that made changes
110
+ }
111
+ ```
112
+
113
+ ### Individual cleaners
114
+
115
+ Every cleaner is exported and usable standalone:
116
+
117
+ ```typescript
118
+ import {
119
+ decodeHtmlEntities,
120
+ fixLigatures,
121
+ normalizeListMarkers,
122
+ stripUrlTrackingParams,
123
+ // ... all 29 cleaners
124
+ } from "castdown-cleaners";
125
+
126
+ const fixed = fixLigatures("The ﬁrst ﬁgure shows ﬂow.");
127
+ // "The first figure shows flow."
128
+ ```
129
+
130
+ ---
131
+
132
+ ## Pipeline (29 steps)
133
+
134
+ Steps applied in order. Each is idempotent and skippable via `opts.skip`.
135
+
136
+ | # | Name | What it fixes |
137
+ |---|------|--------------|
138
+ | 1 | `decodeHtmlEntities` | `&amp;` `&lt;` `&mdash;` `&#8212;` `&#x2014;` |
139
+ | 2 | `normalizeUnicode` | NFC normalization, smart quotes, dashes, ZWSP |
140
+ | 3 | `fixLigatures` | `ﬁ`→`fi`, `ﬂ`→`fl`, `ﬃ`→`ffi` (PDF-specific) |
141
+ | 4 | `htmlTablesToGfm` | `<table>` → GFM pipe tables |
142
+ | 5 | `stripHtmlArtifacts` | `<br>` `<span>` `<b>` `<hr>` `<div>` survivors |
143
+ | 6 | `stripDocxArtifacts` | `{.underline}` `{.smallcaps}` DOCX span syntax |
144
+ | 7 | `stripPptxNotes` | PPTX speaker note sections |
145
+ | 8 | `stripEmptyHeadings` | `## ` blank/punctuation-only headings |
146
+ | 9 | `normalizeHorizontalRules` | `======` `————` `* * *` → `---` |
147
+ | 10 | `normalizeListMarkers` | `•◦►▸✓✗` → `- / - [x] / - [ ]` |
148
+ | 11 | `normalizeNumberedLists` | `1)` `(1)` `a)` `(a)` → `1.` `a.` |
149
+ | 12 | `joinSoftHyphens` | Removes soft-hyphen line breaks |
150
+ | 13 | `stripPageNumbers` | `— 42 —` page number lines |
151
+ | 14 | `stripRepeatedHeaders` | Repeated header/footer text |
152
+ | 15 | `detectSpaceTables` | Space-aligned text → GFM tables (PDF) |
153
+ | 16 | `joinBrokenLines` | Rejoins hard-wrapped paragraph lines |
154
+ | 17 | `fixHeadings` | Promotes/normalizes heading levels |
155
+ | 18 | `stripUrlTrackingParams` | `utm_*` `fbclid` `gclid` from links |
156
+ | 19 | `dedupeLinks` | Removes duplicate link definitions |
157
+ | 20 | `collapseRedundantEmphasis` | `**a** **b**` → `**a b**` |
158
+ | 21 | `fixTables` | Repairs malformed GFM tables |
159
+ | 22 | `wrapLongCellText` | Wraps overlong table cells |
160
+ | 23 | `fixFootnoteMarkers` | `word¹` → `word[^1]`, `¹ text` → `[^1]: text` |
161
+ | 24 | `annotateFiguresTables` | Adds `<!-- figure:N -->` markers for RAG |
162
+ | 25 | `detectToc` | Marks/removes table of contents |
163
+ | 26 | `stripBoilerplate` | Copyright, CONFIDENTIAL, All rights reserved |
164
+ | 27 | `normalizeWhitespaceInLines` | Trailing whitespace, whitespace-only lines |
165
+ | 28 | `collapseBlankLines` | 3+ blank lines → 2 |
166
+ | 29 | `extractMetadataFrontmatter` | Extracts title/date/author as YAML (opt-in) |
167
+ | — | `remark-normalize` | Final AST-based normalization via remark+GFM |
168
+
169
+ ---
170
+
171
+ ## Skip specific cleaners
172
+
173
+ ```typescript
174
+ const { markdown } = await clean(input, {
175
+ source: "html",
176
+ skip: ["stripBoilerplate", "annotateFiguresTables"],
177
+ });
178
+ ```
179
+
180
+ ---
181
+
182
+ ## Opt-in: extract YAML frontmatter
183
+
184
+ ```typescript
185
+ const { markdown } = await clean(input, {
186
+ source: "pdf",
187
+ extractFrontmatter: true,
188
+ });
189
+ // Prepends --- title/date/author block if found in first 20 lines
190
+ ```
191
+
192
+ ---
193
+
194
+ ## License
195
+
196
+ Apache 2.0 — see [LICENSE](LICENSE).
197
+
198
+ Part of the [castdown](https://github.com/castdown/castdown) toolkit.
@@ -0,0 +1,47 @@
1
+ import { decodeHtmlEntities } from "./regex/decode-html-entities.js";
2
+ import { normalizeUnicode } from "./regex/normalize-unicode.js";
3
+ import { fixLigatures } from "./regex/fix-ligatures.js";
4
+ import { htmlTablesToGfm } from "./regex/html-tables-to-gfm.js";
5
+ import { stripHtmlArtifacts } from "./regex/strip-html-artifacts.js";
6
+ import { stripDocxArtifacts } from "./regex/strip-docx-artifacts.js";
7
+ import { stripPptxNotes } from "./regex/strip-pptx-notes.js";
8
+ import { stripEmptyHeadings } from "./regex/strip-empty-headings.js";
9
+ import { normalizeHorizontalRules } from "./regex/normalize-horizontal-rules.js";
10
+ import { normalizeListMarkers } from "./regex/normalize-list-markers.js";
11
+ import { normalizeNumberedLists } from "./regex/normalize-numbered-lists.js";
12
+ import { joinSoftHyphens } from "./regex/join-soft-hyphens.js";
13
+ import { stripPageNumbers } from "./regex/strip-page-numbers.js";
14
+ import { stripRepeatedHeaders } from "./regex/strip-repeated-headers.js";
15
+ import { detectSpaceTables } from "./regex/detect-space-tables.js";
16
+ import { joinBrokenLines } from "./regex/join-broken-lines.js";
17
+ import { fixHeadings } from "./regex/fix-headings.js";
18
+ import { stripUrlTrackingParams } from "./regex/strip-url-tracking-params.js";
19
+ import { dedupeLinks } from "./regex/dedupe-links.js";
20
+ import { collapseRedundantEmphasis } from "./regex/collapse-redundant-emphasis.js";
21
+ import { fixTables } from "./regex/fix-tables.js";
22
+ import { wrapLongCellText } from "./regex/wrap-long-cell-text.js";
23
+ import { fixFootnoteMarkers } from "./regex/fix-footnote-markers.js";
24
+ import { annotateFiguresTables } from "./regex/annotate-figures-tables.js";
25
+ import { detectToc } from "./regex/detect-toc.js";
26
+ import { stripBoilerplate } from "./regex/strip-boilerplate.js";
27
+ import { normalizeWhitespaceInLines } from "./regex/normalize-whitespace-in-lines.js";
28
+ import { collapseBlankLines } from "./regex/collapse-blank-lines.js";
29
+ import { extractMetadataFrontmatter } from "./regex/extract-metadata-frontmatter.js";
30
+ export interface CleanOptions {
31
+ source?: "pdf" | "docx" | "pptx" | "html" | "epub" | "unknown";
32
+ skip?: string[];
33
+ stripToc?: boolean;
34
+ keepNotes?: boolean;
35
+ ligatureMap?: Record<string, string>;
36
+ extractFrontmatter?: boolean;
37
+ frontmatterScanLines?: number;
38
+ keepBoilerplate?: boolean;
39
+ keepUrlTracking?: boolean;
40
+ }
41
+ export interface CleanResult {
42
+ markdown: string;
43
+ applied: string[];
44
+ }
45
+ export declare function clean(input: string, opts?: CleanOptions): Promise<CleanResult>;
46
+ export { decodeHtmlEntities, normalizeUnicode, fixLigatures, htmlTablesToGfm, stripHtmlArtifacts, stripDocxArtifacts, stripPptxNotes, stripEmptyHeadings, normalizeHorizontalRules, normalizeListMarkers, normalizeNumberedLists, joinSoftHyphens, stripPageNumbers, stripRepeatedHeaders, detectSpaceTables, joinBrokenLines, fixHeadings, stripUrlTrackingParams, dedupeLinks, collapseRedundantEmphasis, fixTables, wrapLongCellText, fixFootnoteMarkers, annotateFiguresTables, detectToc, stripBoilerplate, normalizeWhitespaceInLines, collapseBlankLines, extractMetadataFrontmatter, };
47
+ //# sourceMappingURL=index.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":"AAUA,OAAO,EAAE,kBAAkB,EAAE,MAAM,iCAAiC,CAAC;AACrE,OAAO,EAAE,gBAAgB,EAAE,MAAM,8BAA8B,CAAC;AAChE,OAAO,EAAE,YAAY,EAAE,MAAM,0BAA0B,CAAC;AACxD,OAAO,EAAE,eAAe,EAAE,MAAM,+BAA+B,CAAC;AAChE,OAAO,EAAE,kBAAkB,EAAE,MAAM,iCAAiC,CAAC;AACrE,OAAO,EAAE,kBAAkB,EAAE,MAAM,iCAAiC,CAAC;AACrE,OAAO,EAAE,cAAc,EAAE,MAAM,6BAA6B,CAAC;AAC7D,OAAO,EAAE,kBAAkB,EAAE,MAAM,iCAAiC,CAAC;AACrE,OAAO,EAAE,wBAAwB,EAAE,MAAM,uCAAuC,CAAC;AACjF,OAAO,EAAE,oBAAoB,EAAE,MAAM,mCAAmC,CAAC;AACzE,OAAO,EAAE,sBAAsB,EAAE,MAAM,qCAAqC,CAAC;AAC7E,OAAO,EAAE,eAAe,EAAE,MAAM,8BAA8B,CAAC;AAC/D,OAAO,EAAE,gBAAgB,EAAE,MAAM,+BAA+B,CAAC;AACjE,OAAO,EAAE,oBAAoB,EAAE,MAAM,mCAAmC,CAAC;AACzE,OAAO,EAAE,iBAAiB,EAAE,MAAM,gCAAgC,CAAC;AACnE,OAAO,EAAE,eAAe,EAAE,MAAM,8BAA8B,CAAC;AAC/D,OAAO,EAAE,WAAW,EAAE,MAAM,yBAAyB,CAAC;AACtD,OAAO,EAAE,sBAAsB,EAAE,MAAM,sCAAsC,CAAC;AAC9E,OAAO,EAAE,WAAW,EAAE,MAAM,yBAAyB,CAAC;AACtD,OAAO,EAAE,yBAAyB,EAAE,MAAM,wCAAwC,CAAC;AACnF,OAAO,EAAE,SAAS,EAAE,MAAM,uBAAuB,CAAC;AAClD,OAAO,EAAE,gBAAgB,EAAE,MAAM,gCAAgC,CAAC;AAClE,OAAO,EAAE,kBAAkB,EAAE,MAAM,iCAAiC,CAAC;AACrE,OAAO,EAAE,qBAAqB,EAAE,MAAM,oCAAoC,CAAC;AAC3E,OAAO,EAAE,SAAS,EAAE,MAAM,uBAAuB,CAAC;AAClD,OAAO,EAAE,gBAAgB,EAAE,MAAM,8BAA8B,CAAC;AAChE,OAAO,EAAE,0BAA0B,EAAE,MAAM,0CAA0C,CAAC;AACtF,OAAO,EAAE,kBAAkB,EAAE,MAAM,iCAAiC,CAAC;AACrE,OAAO,EAAE,0BAA0B,EAAE,MAAM,yCAAyC,CAAC;AAErF,MAAM,WAAW,YAAY;IAC3B,MAAM,CAAC,EAAE,KAAK,GAAG,MAAM,GAAG,MAAM,GAAG,MAAM,GAAG,MAAM,GAAG,SAAS,CAAC;IAC/D,IAAI,CAAC,EAAE,MAAM,EAAE,CAAC;IAChB,QAAQ,CAAC,EAAE,OAAO,CAAC;IACnB,SAAS,CAAC,EAAE,OAAO,CAAC;IACpB,WAAW,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC;IACrC,kBAAkB,CAAC,EAAE,OAAO,CAAC;IAC7B,oBAAoB,CAAC,EAAE,MAAM,CAAC;IAC9B,eAAe,CAAC,EAAE,OAAO,CAAC;IAC1B,eAAe,CAAC,EAAE,OAAO,CAAC;CAC3B;AAED,MAAM,WAAW,WAAW;IAC1B,QAAQ,EAAE,MAAM,CAAC;IACjB,OAAO,EAAE,MAAM,EAAE,CAAC;CACnB;AA0DD,wBAAsB,KAAK,CAAC,KAAK,EAAE,MAAM,EAAE,IAAI,GAAE,YAAiB,GAAG,OAAO,CAAC,WAAW,CAAC,CAwBxF;AAED,OAAO,EACL,kBAAkB,EAClB,gBAAgB,EAChB,YAAY,
EACZ,eAAe,EACf,kBAAkB,EAClB,kBAAkB,EAClB,cAAc,EACd,kBAAkB,EAClB,wBAAwB,EACxB,oBAAoB,EACpB,sBAAsB,EACtB,eAAe,EACf,gBAAgB,EAChB,oBAAoB,EACpB,iBAAiB,EACjB,eAAe,EACf,WAAW,EACX,sBAAsB,EACtB,WAAW,EACX,yBAAyB,EACzB,SAAS,EACT,gBAAgB,EAChB,kBAAkB,EAClB,qBAAqB,EACrB,SAAS,EACT,gBAAgB,EAChB,0BAA0B,EAC1B,kBAAkB,EAClB,0BAA0B,GAC3B,CAAC"}
package/dist/index.js ADDED
@@ -0,0 +1,110 @@
1
+ /**
2
+ * Cleaners pipeline — post-process raw MD from parsers (MarkItDown, Pandoc).
3
+ *
4
+ * Composition: regex passes → unified/remark passes → output.
5
+ * Each stage is small, named, testable. Reorder freely.
6
+ */
7
+ import { remark } from "remark";
8
+ import remarkGfm from "remark-gfm";
9
+ import remarkStringify from "remark-stringify";
10
+ import { decodeHtmlEntities } from "./regex/decode-html-entities.js";
11
+ import { normalizeUnicode } from "./regex/normalize-unicode.js";
12
+ import { fixLigatures } from "./regex/fix-ligatures.js";
13
+ import { htmlTablesToGfm } from "./regex/html-tables-to-gfm.js";
14
+ import { stripHtmlArtifacts } from "./regex/strip-html-artifacts.js";
15
+ import { stripDocxArtifacts } from "./regex/strip-docx-artifacts.js";
16
+ import { stripPptxNotes } from "./regex/strip-pptx-notes.js";
17
+ import { stripEmptyHeadings } from "./regex/strip-empty-headings.js";
18
+ import { normalizeHorizontalRules } from "./regex/normalize-horizontal-rules.js";
19
+ import { normalizeListMarkers } from "./regex/normalize-list-markers.js";
20
+ import { normalizeNumberedLists } from "./regex/normalize-numbered-lists.js";
21
+ import { joinSoftHyphens } from "./regex/join-soft-hyphens.js";
22
+ import { stripPageNumbers } from "./regex/strip-page-numbers.js";
23
+ import { stripRepeatedHeaders } from "./regex/strip-repeated-headers.js";
24
+ import { detectSpaceTables } from "./regex/detect-space-tables.js";
25
+ import { joinBrokenLines } from "./regex/join-broken-lines.js";
26
+ import { fixHeadings } from "./regex/fix-headings.js";
27
+ import { stripUrlTrackingParams } from "./regex/strip-url-tracking-params.js";
28
+ import { dedupeLinks } from "./regex/dedupe-links.js";
29
+ import { collapseRedundantEmphasis } from "./regex/collapse-redundant-emphasis.js";
30
+ import { fixTables } from "./regex/fix-tables.js";
31
+ import { wrapLongCellText } from "./regex/wrap-long-cell-text.js";
32
+ import { fixFootnoteMarkers } from "./regex/fix-footnote-markers.js";
33
+ import { annotateFiguresTables } from "./regex/annotate-figures-tables.js";
34
+ import { detectToc } from "./regex/detect-toc.js";
35
+ import { stripBoilerplate } from "./regex/strip-boilerplate.js";
36
+ import { normalizeWhitespaceInLines } from "./regex/normalize-whitespace-in-lines.js";
37
+ import { collapseBlankLines } from "./regex/collapse-blank-lines.js";
38
+ import { extractMetadataFrontmatter } from "./regex/extract-metadata-frontmatter.js";
39
/**
 * Ordered table of regex cleaning stages.
 *
 * Every entry has a `name` (the identifier callers list in `opts.skip` and the
 * value reported in the `applied` array) and an `fn` that is invoked as
 * `fn(md, opts)` and returns the transformed Markdown string. Stages run in
 * array order.
 */
const PIPELINE = [
    // --- character-level normalization ---
    { name: "decodeHtmlEntities", fn: decodeHtmlEntities },
    { name: "normalizeUnicode", fn: normalizeUnicode },
    { name: "fixLigatures", fn: fixLigatures },

    // --- HTML / source-format artifacts ---
    { name: "htmlTablesToGfm", fn: htmlTablesToGfm },
    { name: "stripHtmlArtifacts", fn: stripHtmlArtifacts },
    {
        name: "stripDocxArtifacts",
        // Only applied when the caller declares the input came from a docx.
        fn: (md, opts) => {
            if (opts?.source === "docx") {
                return stripDocxArtifacts(md);
            }
            return md;
        },
    },
    {
        name: "stripPptxNotes",
        // Only applied to pptx input, and only when notes are not being kept.
        fn: (md, opts) => {
            if (opts?.source === "pptx" && !opts?.keepNotes) {
                return stripPptxNotes(md);
            }
            return md;
        },
    },

    // --- block structure ---
    { name: "stripEmptyHeadings", fn: stripEmptyHeadings },
    { name: "normalizeHorizontalRules", fn: normalizeHorizontalRules },
    { name: "normalizeListMarkers", fn: normalizeListMarkers },
    { name: "normalizeNumberedLists", fn: normalizeNumberedLists },
    { name: "joinSoftHyphens", fn: joinSoftHyphens },
    { name: "stripPageNumbers", fn: stripPageNumbers },
    { name: "stripRepeatedHeaders", fn: stripRepeatedHeaders },
    {
        name: "detectSpaceTables",
        // Only applied when the caller declares the input came from a pdf.
        fn: (md, opts) => {
            if (opts?.source === "pdf") {
                return detectSpaceTables(md);
            }
            return md;
        },
    },
    { name: "joinBrokenLines", fn: joinBrokenLines },
    { name: "fixHeadings", fn: fixHeadings },

    // --- inline content ---
    {
        name: "stripUrlTrackingParams",
        // Opt-out: left untouched when the caller asks to keep tracking params.
        fn: (md, opts) => {
            if (opts?.keepUrlTracking) {
                return md;
            }
            return stripUrlTrackingParams(md);
        },
    },
    { name: "dedupeLinks", fn: dedupeLinks },
    { name: "collapseRedundantEmphasis", fn: collapseRedundantEmphasis },

    // --- tables, footnotes, captions, TOC ---
    { name: "fixTables", fn: fixTables },
    { name: "wrapLongCellText", fn: wrapLongCellText },
    { name: "fixFootnoteMarkers", fn: fixFootnoteMarkers },
    { name: "annotateFiguresTables", fn: annotateFiguresTables },
    {
        name: "detectToc",
        // Forwards only the `stripToc` option to the TOC detector.
        fn: (md, opts) => {
            const tocOptions = { stripToc: opts?.stripToc };
            return detectToc(md, tocOptions);
        },
    },

    // --- final tidy-up ---
    { name: "stripBoilerplate", fn: stripBoilerplate },
    { name: "normalizeWhitespaceInLines", fn: normalizeWhitespaceInLines },
    { name: "collapseBlankLines", fn: collapseBlankLines },
    { name: "extractMetadataFrontmatter", fn: extractMetadataFrontmatter },
];
85
/**
 * Run the cleaning pipeline over a Markdown string.
 *
 * First every `PIPELINE` stage whose name is not listed in `opts.skip` is
 * applied in order; a stage is recorded in `applied` only when it actually
 * changed the text. The result is then re-serialized through remark (with GFM
 * support) using fixed stringify options, and "remark-normalize" is always
 * appended to `applied`.
 *
 * @param input Raw Markdown to clean.
 * @param opts  Options object; it is passed unchanged to every stage.
 * @returns A promise for `{ markdown, applied }`: the cleaned text and the
 *          names of the stages that modified it.
 */
export async function clean(input, opts = {}) {
    const applied = [];
    let md = input;

    for (const stage of PIPELINE) {
        const isSkipped = opts.skip?.includes(stage.name);
        if (!isSkipped) {
            const result = stage.fn(md, opts);
            // Strict string comparison: only stages that changed something are reported.
            if (result !== md) {
                applied.push(stage.name);
            }
            md = result;
        }
    }

    const stringifyOptions = {
        bullet: "-",
        fences: true,
        listItemIndent: "one",
        rule: "-",
    };
    const processor = remark().use(remarkGfm).use(remarkStringify, stringifyOptions);
    const file = await processor.process(md);

    const markdown = String(file);
    applied.push("remark-normalize");
    return { markdown, applied };
}
109
+ export { decodeHtmlEntities, normalizeUnicode, fixLigatures, htmlTablesToGfm, stripHtmlArtifacts, stripDocxArtifacts, stripPptxNotes, stripEmptyHeadings, normalizeHorizontalRules, normalizeListMarkers, normalizeNumberedLists, joinSoftHyphens, stripPageNumbers, stripRepeatedHeaders, detectSpaceTables, joinBrokenLines, fixHeadings, stripUrlTrackingParams, dedupeLinks, collapseRedundantEmphasis, fixTables, wrapLongCellText, fixFootnoteMarkers, annotateFiguresTables, detectToc, stripBoilerplate, normalizeWhitespaceInLines, collapseBlankLines, extractMetadataFrontmatter, };
110
+ //# sourceMappingURL=index.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"index.js","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AACH,OAAO,EAAE,MAAM,EAAE,MAAM,QAAQ,CAAC;AAChC,OAAO,SAAS,MAAM,YAAY,CAAC;AACnC,OAAO,eAAe,MAAM,kBAAkB,CAAC;AAE/C,OAAO,EAAE,kBAAkB,EAAE,MAAM,iCAAiC,CAAC;AACrE,OAAO,EAAE,gBAAgB,EAAE,MAAM,8BAA8B,CAAC;AAChE,OAAO,EAAE,YAAY,EAAE,MAAM,0BAA0B,CAAC;AACxD,OAAO,EAAE,eAAe,EAAE,MAAM,+BAA+B,CAAC;AAChE,OAAO,EAAE,kBAAkB,EAAE,MAAM,iCAAiC,CAAC;AACrE,OAAO,EAAE,kBAAkB,EAAE,MAAM,iCAAiC,CAAC;AACrE,OAAO,EAAE,cAAc,EAAE,MAAM,6BAA6B,CAAC;AAC7D,OAAO,EAAE,kBAAkB,EAAE,MAAM,iCAAiC,CAAC;AACrE,OAAO,EAAE,wBAAwB,EAAE,MAAM,uCAAuC,CAAC;AACjF,OAAO,EAAE,oBAAoB,EAAE,MAAM,mCAAmC,CAAC;AACzE,OAAO,EAAE,sBAAsB,EAAE,MAAM,qCAAqC,CAAC;AAC7E,OAAO,EAAE,eAAe,EAAE,MAAM,8BAA8B,CAAC;AAC/D,OAAO,EAAE,gBAAgB,EAAE,MAAM,+BAA+B,CAAC;AACjE,OAAO,EAAE,oBAAoB,EAAE,MAAM,mCAAmC,CAAC;AACzE,OAAO,EAAE,iBAAiB,EAAE,MAAM,gCAAgC,CAAC;AACnE,OAAO,EAAE,eAAe,EAAE,MAAM,8BAA8B,CAAC;AAC/D,OAAO,EAAE,WAAW,EAAE,MAAM,yBAAyB,CAAC;AACtD,OAAO,EAAE,sBAAsB,EAAE,MAAM,sCAAsC,CAAC;AAC9E,OAAO,EAAE,WAAW,EAAE,MAAM,yBAAyB,CAAC;AACtD,OAAO,EAAE,yBAAyB,EAAE,MAAM,wCAAwC,CAAC;AACnF,OAAO,EAAE,SAAS,EAAE,MAAM,uBAAuB,CAAC;AAClD,OAAO,EAAE,gBAAgB,EAAE,MAAM,gCAAgC,CAAC;AAClE,OAAO,EAAE,kBAAkB,EAAE,MAAM,iCAAiC,CAAC;AACrE,OAAO,EAAE,qBAAqB,EAAE,MAAM,oCAAoC,CAAC;AAC3E,OAAO,EAAE,SAAS,EAAE,MAAM,uBAAuB,CAAC;AAClD,OAAO,EAAE,gBAAgB,EAAE,MAAM,8BAA8B,CAAC;AAChE,OAAO,EAAE,0BAA0B,EAAE,MAAM,0CAA0C,CAAC;AACtF,OAAO,EAAE,kBAAkB,EAAE,MAAM,iCAAiC,CAAC;AACrE,OAAO,EAAE,0BAA0B,EAAE,MAAM,yCAAyC,CAAC;AA0BrF,MAAM,QAAQ,GAAmB;IAC/B,EAAE,IAAI,EAAE,oBAAoB,EAAQ,EAAE,EAAE,kBAAkB,EAAE;IAC5D,EAAE,IAAI,EAAE,kBAAkB,EAAU,EAAE,EAAE,gBAAgB,EAAE;IAC1D,EAAE,IAAI,EAAE,cAAc,EAAc,EAAE,EAAE,YAAY,EAAE;IACtD,EAAE,IAAI,EAAE,iBAAiB,EAAW,EAAE,EAAE,eAAe,EAAE;IACzD,EAAE,IAAI,EAAE,oBAAoB,EAAQ,EAAE,EAAE,kBAAkB,EAAE;IAC5D;QACE,IAAI,EAAE,oBAAoB;QAC1B,EAAE,EAAE,CAAC,EAAE,EAAE,IAAI,EAAE,EAAE,CAAC,CAAC,IAAI,EAAE,MAAM,KAAK,MAAM,CAAC,CAAC,CAAC,kBAAkB,CAAC,EAAE,CAAC,CAAC,CAAC,CAAC,EAAE,CAAC;KAC1E;IACD;QACE,IAAI,EAAE,gBAA
gB;QACtB,EAAE,EAAE,CAAC,EAAE,EAAE,IAAI,EAAE,EAAE,CACf,IAAI,EAAE,MAAM,KAAK,MAAM,IAAI,CAAC,IAAI,EAAE,SAAS,CAAC,CAAC,CAAC,cAAc,CAAC,EAAE,CAAC,CAAC,CAAC,CAAC,EAAE;KACxE;IACD,EAAE,IAAI,EAAE,oBAAoB,EAAQ,EAAE,EAAE,kBAAkB,EAAE;IAC5D,EAAE,IAAI,EAAE,0BAA0B,EAAE,EAAE,EAAE,wBAAwB,EAAE;IAClE,EAAE,IAAI,EAAE,sBAAsB,EAAM,EAAE,EAAE,oBAAoB,EAAE;IAC9D,EAAE,IAAI,EAAE,wBAAwB,EAAI,EAAE,EAAE,sBAAsB,EAAE;IAChE,EAAE,IAAI,EAAE,iBAAiB,EAAW,EAAE,EAAE,eAAe,EAAE;IACzD,EAAE,IAAI,EAAE,kBAAkB,EAAU,EAAE,EAAE,gBAAgB,EAAE;IAC1D,EAAE,IAAI,EAAE,sBAAsB,EAAM,EAAE,EAAE,oBAAoB,EAAE;IAC9D;QACE,IAAI,EAAE,mBAAmB;QACzB,EAAE,EAAE,CAAC,EAAE,EAAE,IAAI,EAAE,EAAE,CAAC,CAAC,IAAI,EAAE,MAAM,KAAK,KAAK,CAAC,CAAC,CAAC,iBAAiB,CAAC,EAAE,CAAC,CAAC,CAAC,CAAC,EAAE,CAAC;KACxE;IACD,EAAE,IAAI,EAAE,iBAAiB,EAAW,EAAE,EAAE,eAAe,EAAE;IACzD,EAAE,IAAI,EAAE,aAAa,EAAe,EAAE,EAAE,WAAW,EAAE;IACrD;QACE,IAAI,EAAE,wBAAwB;QAC9B,EAAE,EAAE,CAAC,EAAE,EAAE,IAAI,EAAE,EAAE,CACf,IAAI,EAAE,eAAe,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,sBAAsB,CAAC,EAAE,CAAC;KAC1D;IACD,EAAE,IAAI,EAAE,aAAa,EAAe,EAAE,EAAE,WAAW,EAAE;IACrD,EAAE,IAAI,EAAE,2BAA2B,EAAE,EAAE,EAAE,yBAAyB,EAAE;IACpE,EAAE,IAAI,EAAE,WAAW,EAAiB,EAAE,EAAE,SAAS,EAAE;IACnD,EAAE,IAAI,EAAE,kBAAkB,EAAU,EAAE,EAAE,gBAAgB,EAAE;IAC1D,EAAE,IAAI,EAAE,oBAAoB,EAAQ,EAAE,EAAE,kBAAkB,EAAE;IAC5D,EAAE,IAAI,EAAE,uBAAuB,EAAK,EAAE,EAAE,qBAAqB,EAAE;IAC/D;QACE,IAAI,EAAE,WAAW;QACjB,EAAE,EAAE,CAAC,EAAE,EAAE,IAAI,EAAE,EAAE,CAAC,SAAS,CAAC,EAAE,EAAE,EAAE,QAAQ,EAAE,IAAI,EAAE,QAAQ,EAAE,CAAC;KAC9D;IACD,EAAE,IAAI,EAAE,kBAAkB,EAAU,EAAE,EAAE,gBAAgB,EAAE;IAC1D,EAAE,IAAI,EAAE,4BAA4B,EAAE,EAAE,EAAE,0BAA0B,EAAE;IACtE,EAAE,IAAI,EAAE,oBAAoB,EAAQ,EAAE,EAAE,kBAAkB,EAAE;IAC5D,EAAE,IAAI,EAAE,4BAA4B,EAAE,EAAE,EAAE,0BAA0B,EAAE;CACvE,CAAC;AAEF,MAAM,CAAC,KAAK,UAAU,KAAK,CAAC,KAAa,EAAE,OAAqB,EAAE;IAChE,MAAM,OAAO,GAAa,EAAE,CAAC;IAC7B,IAAI,EAAE,GAAG,KAAK,CAAC;IAEf,KAAK,MAAM,IAAI,IAAI,QAAQ,EAAE,CAAC;QAC5B,IAAI,IAAI,CAAC,IAAI,EAAE,QAAQ,CAAC,IAAI,CAAC,IAAI,CAAC;YAAE,SAAS;QAC7C,MAAM,MAAM,GAAG,EAAE,CAAC;QAClB,EAAE,GAAG,IAAI,CAAC,EAAE,CAAC,EAAE,EAAE,IAAI,CAAC,CAAC;
QACvB,IAAI,EAAE,KAAK,MAAM;YAAE,OAAO,CAAC,IAAI,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;IAC7C,CAAC;IAED,MAAM,IAAI,GAAG,MAAM,MAAM,EAAE;SACxB,GAAG,CAAC,SAAS,CAAC;SACd,GAAG,CAAC,eAAe,EAAE;QACpB,MAAM,EAAE,GAAG;QACX,MAAM,EAAE,IAAI;QACZ,cAAc,EAAE,KAAK;QACrB,IAAI,EAAE,GAAG;KACV,CAAC;SACD,OAAO,CAAC,EAAE,CAAC,CAAC;IACf,EAAE,GAAG,MAAM,CAAC,IAAI,CAAC,CAAC;IAClB,OAAO,CAAC,IAAI,CAAC,kBAAkB,CAAC,CAAC;IAEjC,OAAO,EAAE,QAAQ,EAAE,EAAE,EAAE,OAAO,EAAE,CAAC;AACnC,CAAC;AAED,OAAO,EACL,kBAAkB,EAClB,gBAAgB,EAChB,YAAY,EACZ,eAAe,EACf,kBAAkB,EAClB,kBAAkB,EAClB,cAAc,EACd,kBAAkB,EAClB,wBAAwB,EACxB,oBAAoB,EACpB,sBAAsB,EACtB,eAAe,EACf,gBAAgB,EAChB,oBAAoB,EACpB,iBAAiB,EACjB,eAAe,EACf,WAAW,EACX,sBAAsB,EACtB,WAAW,EACX,yBAAyB,EACzB,SAAS,EACT,gBAAgB,EAChB,kBAAkB,EAClB,qBAAqB,EACrB,SAAS,EACT,gBAAgB,EAChB,0BAA0B,EAC1B,kBAAkB,EAClB,0BAA0B,GAC3B,CAAC"}
@@ -0,0 +1,3 @@
1
+ import type { CleanOptions } from "../index.js";
2
+ export declare function annotateFiguresTables(md: string, opts?: CleanOptions): string;
3
+ //# sourceMappingURL=annotate-figures-tables.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"annotate-figures-tables.d.ts","sourceRoot":"","sources":["../../src/regex/annotate-figures-tables.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,YAAY,EAAE,MAAM,aAAa,CAAC;AAKhD,wBAAgB,qBAAqB,CAAC,EAAE,EAAE,MAAM,EAAE,IAAI,CAAC,EAAE,YAAY,GAAG,MAAM,CAO7E"}
@@ -0,0 +1,11 @@
1
// A caption line: a label keyword, a number with an optional single letter,
// optional "." and/or ":", then the caption text. Flags: global, case-insensitive,
// multiline (so ^ and $ anchor on every line).
const CAPTION_RE = /^(Figure|Fig\.|Table|Exhibit|Chart|Box|Diagram)[ \t]+(\d+[a-z]?)\.?:?[ \t]*(.+)$/gim;

/**
 * Insert an HTML comment marker (for example `<!-- figure:1 -->` or
 * `<!-- table:2a -->`) on its own line directly above every caption line.
 *
 * The marker is built from the lower-cased label (with the first "." removed,
 * so "Fig." becomes "fig") and the lower-cased number.
 *
 * The function is idempotent: a caption that is already preceded by exactly
 * the marker it would receive is left untouched. Whitespace and blank lines
 * between the marker and the caption are tolerated, so text that has been
 * re-serialized by a Markdown formatter in between is not annotated twice.
 *
 * @param md   Markdown text to annotate.
 * @param opts Optional options; when `opts.skip` includes
 *             "annotateFiguresTables" the input is returned unchanged.
 * @returns The Markdown with caption markers inserted.
 */
export function annotateFiguresTables(md, opts) {
    if (opts?.skip?.includes("annotateFiguresTables"))
        return md;
    return md.replace(CAPTION_RE, (full, type, num, _text, offset, whole) => {
        const marker = `<!-- ${type.replace(".", "").toLowerCase()}:${num.toLowerCase()} -->`;
        // Walk back over the line break(s) and any blank space before the caption
        // to find where the previous non-blank content ends.
        let end = offset;
        while (end > 0 && " \t\r\n".includes(whole[end - 1])) {
            end -= 1;
        }
        // Idempotent: don't add if already preceded by this exact marker.
        if (whole.endsWith(marker, end)) {
            return full;
        }
        return `${marker}\n${full}`;
    });
}
11
+ //# sourceMappingURL=annotate-figures-tables.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"annotate-figures-tables.js","sourceRoot":"","sources":["../../src/regex/annotate-figures-tables.ts"],"names":[],"mappings":"AAEA,MAAM,UAAU,GACd,qFAAqF,CAAC;AAExF,MAAM,UAAU,qBAAqB,CAAC,EAAU,EAAE,IAAmB;IACnE,IAAI,IAAI,EAAE,IAAI,EAAE,QAAQ,CAAC,uBAAuB,CAAC;QAAE,OAAO,EAAE,CAAC;IAC7D,OAAO,EAAE,CAAC,OAAO,CAAC,UAAU,EAAE,CAAC,IAAI,EAAE,IAAI,EAAE,GAAG,EAAE,EAAE;QAChD,MAAM,MAAM,GAAG,QAAS,IAAe,CAAC,OAAO,CAAC,GAAG,EAAE,EAAE,CAAC,CAAC,WAAW,EAAE,IAAK,GAAc,CAAC,WAAW,EAAE,MAAM,CAAC;QAC9G,iEAAiE;QACjE,OAAO,GAAG,MAAM,KAAK,IAAI,EAAE,CAAC;IAC9B,CAAC,CAAC,CAAC;AACL,CAAC"}
@@ -0,0 +1,6 @@
1
+ /**
2
+ * collapseBlankLines — runs of three or more newlines (i.e. two or more blank lines) → exactly one blank line.
3
+ * Run last (after other passes may have emptied lines).
4
+ */
5
+ export declare function collapseBlankLines(md: string): string;
6
+ //# sourceMappingURL=collapse-blank-lines.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"collapse-blank-lines.d.ts","sourceRoot":"","sources":["../../src/regex/collapse-blank-lines.ts"],"names":[],"mappings":"AAAA;;;GAGG;AACH,wBAAgB,kBAAkB,CAAC,EAAE,EAAE,MAAM,GAAG,MAAM,CAErD"}
@@ -0,0 +1,8 @@
1
/**
 * collapseBlankLines — squeeze vertical whitespace.
 *
 * Any run of three or more consecutive newline characters (that is, two or
 * more blank lines) is reduced to two newlines, leaving exactly one blank
 * line. Leading and trailing whitespace of the whole document is then trimmed
 * and a single trailing newline is appended, so an empty input yields "\n".
 *
 * Run last (after other passes may have emptied lines).
 */
export function collapseBlankLines(md) {
    const squeezed = md.replace(/\n{3,}/g, "\n\n");
    const trimmed = squeezed.trim();
    return `${trimmed}\n`;
}
8
+ //# sourceMappingURL=collapse-blank-lines.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"collapse-blank-lines.js","sourceRoot":"","sources":["../../src/regex/collapse-blank-lines.ts"],"names":[],"mappings":"AAAA;;;GAGG;AACH,MAAM,UAAU,kBAAkB,CAAC,EAAU;IAC3C,OAAO,EAAE,CAAC,OAAO,CAAC,SAAS,EAAE,MAAM,CAAC,CAAC,IAAI,EAAE,GAAG,IAAI,CAAC;AACrD,CAAC"}
@@ -0,0 +1,2 @@
1
+ export declare function collapseRedundantEmphasis(md: string): string;
2
+ //# sourceMappingURL=collapse-redundant-emphasis.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"collapse-redundant-emphasis.d.ts","sourceRoot":"","sources":["../../src/regex/collapse-redundant-emphasis.ts"],"names":[],"mappings":"AAEA,wBAAgB,yBAAyB,CAAC,EAAE,EAAE,MAAM,GAAG,MAAM,CAe5D"}
@@ -0,0 +1,19 @@
1
+ import { withProtectedCode } from "../util/protect-code.js";
2
/**
 * Merge adjacent emphasis spans of the same kind that are separated only by
 * spaces or tabs on one line, e.g. `**a** **b**` becomes `**a b**`.
 *
 * Handles `**`, `__`, single `*`, single `_` and `~~` delimiters, in that
 * order. The rule set is re-applied until the text stops changing, for at most
 * five passes, so chains of three or more spans collapse too. The work is done
 * inside `withProtectedCode`, which receives the text and the transform
 * callback.
 *
 * @param md Markdown text.
 * @returns The text with redundant emphasis boundaries collapsed.
 */
export function collapseRedundantEmphasis(md) {
    const MAX_PASSES = 5;
    // [pattern, replacement] pairs, applied in this order on every pass.
    const mergeRules = [
        [/\*\*([^*\n]+)\*\*[ \t]+\*\*([^*\n]+)\*\*/g, "**$1 $2**"],
        [/__([^_\n]+)__[ \t]+__([^_\n]+)__/g, "__$1 $2__"],
        [/(?<!\*)\*([^*\n]+)\*[ \t]+\*([^*\n]+)\*(?!\*)/g, "*$1 $2*"],
        [/(?<!_)_([^_\n]+)_[ \t]+_([^_\n]+)_(?!_)/g, "_$1 $2_"],
        [/~~([^~\n]+)~~[ \t]+~~([^~\n]+)~~/g, "~~$1 $2~~"],
    ];
    return withProtectedCode(md, (text) => {
        let current = text;
        let pass = 0;
        while (pass < MAX_PASSES) {
            let merged = current;
            for (const [pattern, replacement] of mergeRules) {
                merged = merged.replace(pattern, replacement);
            }
            // Fixed point reached: nothing left to merge.
            if (merged === current) {
                return current;
            }
            current = merged;
            pass += 1;
        }
        return current;
    });
}
19
+ //# sourceMappingURL=collapse-redundant-emphasis.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"collapse-redundant-emphasis.js","sourceRoot":"","sources":["../../src/regex/collapse-redundant-emphasis.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,iBAAiB,EAAE,MAAM,yBAAyB,CAAC;AAE5D,MAAM,UAAU,yBAAyB,CAAC,EAAU;IAClD,OAAO,iBAAiB,CAAC,EAAE,EAAE,CAAC,CAAC,EAAE,EAAE;QACjC,IAAI,GAAG,GAAG,CAAC,CAAC;QACZ,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC;YAC3B,MAAM,IAAI,GAAG,GAAG,CAAC;YACjB,GAAG,GAAG,GAAG;iBACN,OAAO,CAAC,2CAA2C,EAAE,WAAW,CAAC;iBACjE,OAAO,CAAC,mCAAmC,EAAE,WAAW,CAAC;iBACzD,OAAO,CAAC,gDAAgD,EAAE,SAAS,CAAC;iBACpE,OAAO,CAAC,0CAA0C,EAAE,SAAS,CAAC;iBAC9D,OAAO,CAAC,mCAAmC,EAAE,WAAW,CAAC,CAAC;YAC7D,IAAI,GAAG,KAAK,IAAI;gBAAE,MAAM;QAC1B,CAAC;QACD,OAAO,GAAG,CAAC;IACb,CAAC,CAAC,CAAC;AACL,CAAC"}
@@ -0,0 +1,2 @@
1
+ export declare function decodeHtmlEntities(md: string): string;
2
+ //# sourceMappingURL=decode-html-entities.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"decode-html-entities.d.ts","sourceRoot":"","sources":["../../src/regex/decode-html-entities.ts"],"names":[],"mappings":"AA8DA,wBAAgB,kBAAkB,CAAC,EAAE,EAAE,MAAM,GAAG,MAAM,CAcrD"}