castdown-cleaners 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +180 -0
- package/README.md +198 -0
- package/dist/index.d.ts +47 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +110 -0
- package/dist/index.js.map +1 -0
- package/dist/regex/annotate-figures-tables.d.ts +3 -0
- package/dist/regex/annotate-figures-tables.d.ts.map +1 -0
- package/dist/regex/annotate-figures-tables.js +11 -0
- package/dist/regex/annotate-figures-tables.js.map +1 -0
- package/dist/regex/collapse-blank-lines.d.ts +6 -0
- package/dist/regex/collapse-blank-lines.d.ts.map +1 -0
- package/dist/regex/collapse-blank-lines.js +8 -0
- package/dist/regex/collapse-blank-lines.js.map +1 -0
- package/dist/regex/collapse-redundant-emphasis.d.ts +2 -0
- package/dist/regex/collapse-redundant-emphasis.d.ts.map +1 -0
- package/dist/regex/collapse-redundant-emphasis.js +19 -0
- package/dist/regex/collapse-redundant-emphasis.js.map +1 -0
- package/dist/regex/decode-html-entities.d.ts +2 -0
- package/dist/regex/decode-html-entities.d.ts.map +1 -0
- package/dist/regex/decode-html-entities.js +73 -0
- package/dist/regex/decode-html-entities.js.map +1 -0
- package/dist/regex/dedupe-links.d.ts +9 -0
- package/dist/regex/dedupe-links.d.ts.map +1 -0
- package/dist/regex/dedupe-links.js +16 -0
- package/dist/regex/dedupe-links.js.map +1 -0
- package/dist/regex/detect-space-tables.d.ts +29 -0
- package/dist/regex/detect-space-tables.d.ts.map +1 -0
- package/dist/regex/detect-space-tables.js +125 -0
- package/dist/regex/detect-space-tables.js.map +1 -0
- package/dist/regex/detect-toc.d.ts +14 -0
- package/dist/regex/detect-toc.d.ts.map +1 -0
- package/dist/regex/detect-toc.js +35 -0
- package/dist/regex/detect-toc.js.map +1 -0
- package/dist/regex/extract-metadata-frontmatter.d.ts +3 -0
- package/dist/regex/extract-metadata-frontmatter.d.ts.map +1 -0
- package/dist/regex/extract-metadata-frontmatter.js +39 -0
- package/dist/regex/extract-metadata-frontmatter.js.map +1 -0
- package/dist/regex/fix-footnote-markers.d.ts +2 -0
- package/dist/regex/fix-footnote-markers.d.ts.map +1 -0
- package/dist/regex/fix-footnote-markers.js +23 -0
- package/dist/regex/fix-footnote-markers.js.map +1 -0
- package/dist/regex/fix-headings.d.ts +12 -0
- package/dist/regex/fix-headings.d.ts.map +1 -0
- package/dist/regex/fix-headings.js +40 -0
- package/dist/regex/fix-headings.js.map +1 -0
- package/dist/regex/fix-ligatures.d.ts +3 -0
- package/dist/regex/fix-ligatures.d.ts.map +1 -0
- package/dist/regex/fix-ligatures.js +16 -0
- package/dist/regex/fix-ligatures.js.map +1 -0
- package/dist/regex/fix-tables.d.ts +13 -0
- package/dist/regex/fix-tables.d.ts.map +1 -0
- package/dist/regex/fix-tables.js +63 -0
- package/dist/regex/fix-tables.js.map +1 -0
- package/dist/regex/html-tables-to-gfm.d.ts +21 -0
- package/dist/regex/html-tables-to-gfm.d.ts.map +1 -0
- package/dist/regex/html-tables-to-gfm.js +76 -0
- package/dist/regex/html-tables-to-gfm.js.map +1 -0
- package/dist/regex/join-broken-lines.d.ts +10 -0
- package/dist/regex/join-broken-lines.d.ts.map +1 -0
- package/dist/regex/join-broken-lines.js +40 -0
- package/dist/regex/join-broken-lines.js.map +1 -0
- package/dist/regex/join-soft-hyphens.d.ts +9 -0
- package/dist/regex/join-soft-hyphens.d.ts.map +1 -0
- package/dist/regex/join-soft-hyphens.js +11 -0
- package/dist/regex/join-soft-hyphens.js.map +1 -0
- package/dist/regex/normalize-horizontal-rules.d.ts +2 -0
- package/dist/regex/normalize-horizontal-rules.d.ts.map +1 -0
- package/dist/regex/normalize-horizontal-rules.js +20 -0
- package/dist/regex/normalize-horizontal-rules.js.map +1 -0
- package/dist/regex/normalize-list-markers.d.ts +2 -0
- package/dist/regex/normalize-list-markers.d.ts.map +1 -0
- package/dist/regex/normalize-list-markers.js +35 -0
- package/dist/regex/normalize-list-markers.js.map +1 -0
- package/dist/regex/normalize-numbered-lists.d.ts +2 -0
- package/dist/regex/normalize-numbered-lists.d.ts.map +1 -0
- package/dist/regex/normalize-numbered-lists.js +9 -0
- package/dist/regex/normalize-numbered-lists.js.map +1 -0
- package/dist/regex/normalize-unicode.d.ts +2 -0
- package/dist/regex/normalize-unicode.d.ts.map +1 -0
- package/dist/regex/normalize-unicode.js +49 -0
- package/dist/regex/normalize-unicode.js.map +1 -0
- package/dist/regex/normalize-whitespace-in-lines.d.ts +2 -0
- package/dist/regex/normalize-whitespace-in-lines.d.ts.map +1 -0
- package/dist/regex/normalize-whitespace-in-lines.js +24 -0
- package/dist/regex/normalize-whitespace-in-lines.js.map +1 -0
- package/dist/regex/strip-boilerplate.d.ts +3 -0
- package/dist/regex/strip-boilerplate.d.ts.map +1 -0
- package/dist/regex/strip-boilerplate.js +16 -0
- package/dist/regex/strip-boilerplate.js.map +1 -0
- package/dist/regex/strip-docx-artifacts.d.ts +19 -0
- package/dist/regex/strip-docx-artifacts.d.ts.map +1 -0
- package/dist/regex/strip-docx-artifacts.js +34 -0
- package/dist/regex/strip-docx-artifacts.js.map +1 -0
- package/dist/regex/strip-empty-headings.d.ts +2 -0
- package/dist/regex/strip-empty-headings.d.ts.map +1 -0
- package/dist/regex/strip-empty-headings.js +6 -0
- package/dist/regex/strip-empty-headings.js.map +1 -0
- package/dist/regex/strip-html-artifacts.d.ts +2 -0
- package/dist/regex/strip-html-artifacts.d.ts.map +1 -0
- package/dist/regex/strip-html-artifacts.js +24 -0
- package/dist/regex/strip-html-artifacts.js.map +1 -0
- package/dist/regex/strip-page-numbers.d.ts +2 -0
- package/dist/regex/strip-page-numbers.d.ts.map +1 -0
- package/dist/regex/strip-page-numbers.js +23 -0
- package/dist/regex/strip-page-numbers.js.map +1 -0
- package/dist/regex/strip-pptx-notes.d.ts +22 -0
- package/dist/regex/strip-pptx-notes.d.ts.map +1 -0
- package/dist/regex/strip-pptx-notes.js +32 -0
- package/dist/regex/strip-pptx-notes.js.map +1 -0
- package/dist/regex/strip-repeated-headers.d.ts +2 -0
- package/dist/regex/strip-repeated-headers.d.ts.map +1 -0
- package/dist/regex/strip-repeated-headers.js +37 -0
- package/dist/regex/strip-repeated-headers.js.map +1 -0
- package/dist/regex/strip-url-tracking-params.d.ts +2 -0
- package/dist/regex/strip-url-tracking-params.d.ts.map +1 -0
- package/dist/regex/strip-url-tracking-params.js +26 -0
- package/dist/regex/strip-url-tracking-params.js.map +1 -0
- package/dist/regex/wrap-long-cell-text.d.ts +28 -0
- package/dist/regex/wrap-long-cell-text.d.ts.map +1 -0
- package/dist/regex/wrap-long-cell-text.js +66 -0
- package/dist/regex/wrap-long-cell-text.js.map +1 -0
- package/dist/util/protect-code.d.ts +6 -0
- package/dist/util/protect-code.d.ts.map +1 -0
- package/dist/util/protect-code.js +20 -0
- package/dist/util/protect-code.js.map +1 -0
- package/package.json +63 -0
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* stripPptxNotes — remove speaker-notes sections from PPTX → MD output.
|
|
3
|
+
*
|
|
4
|
+
* MarkItDown (via python-pptx) appends speaker notes after each slide's
|
|
5
|
+
* content. The typical patterns:
|
|
6
|
+
*
|
|
7
|
+
* ## Notes: (heading variant)
|
|
8
|
+
* <notes text>
|
|
9
|
+
*
|
|
10
|
+
* **Notes:** (bold-paragraph variant)
|
|
11
|
+
* <notes text>
|
|
12
|
+
*
|
|
13
|
+
* <!-- notes --> (comment marker MarkItDown occasionally emits)
|
|
14
|
+
* <notes text>
|
|
15
|
+
*
|
|
16
|
+
* The section is terminated by the next heading or end-of-string.
|
|
17
|
+
* Stripping prevents notes from appearing in rendered output.
|
|
18
|
+
*
|
|
19
|
+
* Idempotent and safe on non-pptx markdown — no match ⇒ no-op.
|
|
20
|
+
*/
|
|
21
|
+
export declare function stripPptxNotes(md: string): string;
|
|
22
|
+
//# sourceMappingURL=strip-pptx-notes.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"strip-pptx-notes.d.ts","sourceRoot":"","sources":["../../src/regex/strip-pptx-notes.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;GAmBG;AACH,wBAAgB,cAAc,CAAC,EAAE,EAAE,MAAM,GAAG,MAAM,CAcjD"}
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
/**
 * stripPptxNotes — remove speaker-notes sections from PPTX → MD output.
 *
 * Section openers recognised (case-insensitive, must start a line):
 *
 *   ## Notes:        heading variant, levels 1–4, colon optional
 *   **Notes:**       bold-paragraph variant (also `**Notes**:`)
 *   <!-- notes -->   HTML-comment marker (also `<!-- note -->`)
 *
 * A section extends from its opener up to, but not including, the first of:
 *   - a Markdown heading line (`#`–`######` followed by whitespace),
 *   - a line that starts an HTML comment (`<!--`), e.g. the per-slide
 *     `<!-- Slide number: N -->` marker, so an untitled following slide is
 *     not swallowed together with the notes,
 *   - for the bold variant only, a line starting with `**`,
 *   - end-of-string.
 *
 * Input without any opener is returned unchanged.
 *
 * @param {string} md Markdown text.
 * @returns {string} Markdown with the notes sections removed.
 */
export function stripPptxNotes(md) {
    return md
        // Heading variant. The bare `|\n` alternative lets the match consume the
        // newline that precedes the terminating line without consuming that line.
        .replace(/^#{1,4}\s+Notes:?\s*$(?:\n(?!#{1,6}\s|<!--)[^\n]*|\n)*\n?/gim, "")
        // Bold-paragraph variant; additionally stops at the next `**` line.
        .replace(/^\*\*Notes:?\*\*:?\s*$(?:\n(?!\*\*|#{1,6}\s|<!--)[^\n]*|\n)*\n?/gim, "")
        // HTML-comment marker variant.
        .replace(/^<!--\s*notes?\s*-->$(?:\n(?!<!--|#{1,6}\s)[^\n]*|\n)*\n?/gim, "");
}
|
|
32
|
+
//# sourceMappingURL=strip-pptx-notes.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"strip-pptx-notes.js","sourceRoot":"","sources":["../../src/regex/strip-pptx-notes.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;GAmBG;AACH,MAAM,UAAU,cAAc,CAAC,EAAU;IACvC,IAAI,GAAG,GAAG,EAAE,CAAC;IAEb,wEAAwE;IACxE,wFAAwF;IACxF,GAAG,GAAG,GAAG,CAAC,OAAO,CAAC,yDAAyD,EAAE,EAAE,CAAC,CAAC;IAEjF,4EAA4E;IAC5E,GAAG,GAAG,GAAG,CAAC,OAAO,CAAC,+DAA+D,EAAE,EAAE,CAAC,CAAC;IAEvF,sDAAsD;IACtD,GAAG,GAAG,GAAG,CAAC,OAAO,CAAC,8DAA8D,EAAE,EAAE,CAAC,CAAC;IAEtF,OAAO,GAAG,CAAC;AACb,CAAC"}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"strip-repeated-headers.d.ts","sourceRoot":"","sources":["../../src/regex/strip-repeated-headers.ts"],"names":[],"mappings":"AAaA,wBAAgB,oBAAoB,CAAC,EAAE,EAAE,MAAM,GAAG,MAAM,CAmBvD"}
|
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
/**
 * stripRepeatedHeaders — remove lines repeated across many pages.
 *
 * Heuristic: a trimmed line of MIN_LEN..MAX_LEN characters that occurs at
 * least MIN_OCCURRENCES times in the document is treated as a running
 * header/footer (e.g. "Chapter 3 — Foundations") and every occurrence is
 * dropped.
 *
 * Structural Markdown lines are never candidates, however often they repeat:
 * ATX headings, bullet list items, block quotes, and pipe-table rows. The
 * last exclusion matters because delimiter rows such as `| --- | --- |` are
 * naturally identical across tables; removing them would break every table.
 */
const MIN_OCCURRENCES = 4;
const MIN_LEN = 8;
const MAX_LEN = 120;

/** True when a trimmed line may be counted as a potential running header. */
function isRunningHeaderCandidate(line) {
    if (line.length < MIN_LEN || line.length > MAX_LEN) {
        return false;
    }
    const isHeading = /^#{1,6}\s/.test(line);
    const isListItem = /^[-*+]\s/.test(line);
    const isQuote = /^>\s/.test(line);
    const isTableRow = /^\|/.test(line);
    return !(isHeading || isListItem || isQuote || isTableRow);
}

/**
 * @param {string} md Markdown text.
 * @returns {string} The text without the repeated lines; the original string
 *   is returned untouched when nothing qualifies.
 */
export function stripRepeatedHeaders(md) {
    const lines = md.split("\n");

    // Pass 1: count occurrences of each candidate line (whitespace-trimmed).
    const counts = new Map();
    for (const raw of lines) {
        const line = raw.trim();
        if (isRunningHeaderCandidate(line)) {
            counts.set(line, (counts.get(line) ?? 0) + 1);
        }
    }

    const toStrip = new Set();
    for (const [line, n] of counts) {
        if (n >= MIN_OCCURRENCES) {
            toStrip.add(line);
        }
    }
    if (toStrip.size === 0) {
        return md;
    }

    // Pass 2: drop every line whose trimmed form was flagged.
    return lines.filter((raw) => !toStrip.has(raw.trim())).join("\n");
}
|
|
37
|
+
//# sourceMappingURL=strip-repeated-headers.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"strip-repeated-headers.js","sourceRoot":"","sources":["../../src/regex/strip-repeated-headers.ts"],"names":[],"mappings":"AAAA;;;;;;;;GAQG;AACH,MAAM,eAAe,GAAG,CAAC,CAAC;AAC1B,MAAM,OAAO,GAAG,CAAC,CAAC;AAClB,MAAM,OAAO,GAAG,GAAG,CAAC;AAEpB,MAAM,UAAU,oBAAoB,CAAC,EAAU;IAC7C,MAAM,KAAK,GAAG,EAAE,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC;IAC7B,MAAM,MAAM,GAAG,IAAI,GAAG,EAAkB,CAAC;IACzC,KAAK,MAAM,GAAG,IAAI,KAAK,EAAE,CAAC;QACxB,MAAM,IAAI,GAAG,GAAG,CAAC,IAAI,EAAE,CAAC;QACxB,IAAI,IAAI,CAAC,MAAM,GAAG,OAAO,IAAI,IAAI,CAAC,MAAM,GAAG,OAAO;YAAE,SAAS;QAC7D,IAAI,WAAW,CAAC,IAAI,CAAC,IAAI,CAAC;YAAE,SAAS,CAAC,gBAAgB;QACtD,IAAI,UAAU,CAAC,IAAI,CAAC,IAAI,CAAC;YAAE,SAAS,CAAC,QAAQ;QAC7C,IAAI,MAAM,CAAC,IAAI,CAAC,IAAI,CAAC;YAAE,SAAS,CAAC,SAAS;QAC1C,MAAM,CAAC,GAAG,CAAC,IAAI,EAAE,CAAC,MAAM,CAAC,GAAG,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC;IAChD,CAAC;IAED,MAAM,OAAO,GAAG,IAAI,GAAG,EAAU,CAAC;IAClC,KAAK,MAAM,CAAC,IAAI,EAAE,CAAC,CAAC,IAAI,MAAM,EAAE,CAAC;QAC/B,IAAI,CAAC,IAAI,eAAe;YAAE,OAAO,CAAC,GAAG,CAAC,IAAI,CAAC,CAAC;IAC9C,CAAC;IACD,IAAI,OAAO,CAAC,IAAI,KAAK,CAAC;QAAE,OAAO,EAAE,CAAC;IAElC,OAAO,KAAK,CAAC,MAAM,CAAC,CAAC,GAAG,EAAE,EAAE,CAAC,CAAC,OAAO,CAAC,GAAG,CAAC,GAAG,CAAC,IAAI,EAAE,CAAC,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;AACpE,CAAC"}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"strip-url-tracking-params.d.ts","sourceRoot":"","sources":["../../src/regex/strip-url-tracking-params.ts"],"names":[],"mappings":"AAqBA,wBAAgB,sBAAsB,CAAC,EAAE,EAAE,MAAM,GAAG,MAAM,CAKzD"}
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
/** Query-parameter names that only carry campaign / click tracking data. */
const TRACKING_PARAMS = new Set([
    "utm_source", "utm_medium", "utm_campaign", "utm_term", "utm_content",
    "utm_id", "utm_source_platform", "utm_creative_format", "utm_marketing_tactic",
    "fbclid", "gclid", "msclkid", "twclid", "ttclid", "li_fat_id",
    "mc_cid", "mc_eid", "_ga", "_gl",
]);

/**
 * Remove tracking parameters from a single absolute URL.
 *
 * @param {string} url Candidate URL.
 * @returns {string} The re-serialised URL without tracking parameters, or the
 *   input string itself when it has none or cannot be parsed.
 */
function cleanUrl(url) {
    let parsed;
    try {
        parsed = new URL(url);
    }
    catch {
        // Deliberate best-effort: relative links, anchors, `<...>` destinations
        // and anything else `URL` rejects are left exactly as written.
        return url;
    }
    const tracked = [...parsed.searchParams.keys()].filter((key) => TRACKING_PARAMS.has(key));
    if (tracked.length === 0) {
        return url;
    }
    for (const key of tracked) {
        parsed.searchParams.delete(key);
    }
    return parsed.toString();
}

/**
 * Strip tracking query parameters from inline Markdown links `[text](url)`.
 *
 * Only the link destination is cleaned. An optional link title
 * (`[text](url "title")`) is kept verbatim instead of being parsed as part of
 * the URL. Links whose destination carries no tracking parameter are returned
 * byte-for-byte.
 *
 * @param {string} md Markdown text.
 * @returns {string} Markdown with cleaned link destinations.
 */
export function stripUrlTrackingParams(md) {
    return md.replace(/\[([^\]]*)\]\(([^)]+)\)/g, (full, text, target) => {
        // Split "<destination><whitespace + title>"; the title part is optional.
        const parts = /^\s*(\S+)(\s[\s\S]*)?$/.exec(target);
        if (parts === null) {
            return full;
        }
        const [, destination, tail = ""] = parts;
        const clean = cleanUrl(destination);
        if (clean === destination) {
            return full;
        }
        // Whitespace-only remainder is dropped; a real title is preserved as-is.
        const title = tail.trim() === "" ? "" : tail;
        return `[${text}](${clean}${title})`;
    });
}
|
|
26
|
+
//# sourceMappingURL=strip-url-tracking-params.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"strip-url-tracking-params.js","sourceRoot":"","sources":["../../src/regex/strip-url-tracking-params.ts"],"names":[],"mappings":"AAAA,MAAM,eAAe,GAAG,IAAI,GAAG,CAAC;IAC9B,YAAY,EAAE,YAAY,EAAE,cAAc,EAAE,UAAU,EAAE,aAAa;IACrE,QAAQ,EAAE,qBAAqB,EAAE,qBAAqB,EAAE,sBAAsB;IAC9E,QAAQ,EAAE,OAAO,EAAE,SAAS,EAAE,QAAQ,EAAE,QAAQ,EAAE,WAAW;IAC7D,QAAQ,EAAE,QAAQ,EAAE,KAAK,EAAE,KAAK;CACjC,CAAC,CAAC;AAEH,SAAS,QAAQ,CAAC,GAAW;IAC3B,IAAI,CAAC;QACH,MAAM,CAAC,GAAG,IAAI,GAAG,CAAC,GAAG,CAAC,CAAC;QACvB,MAAM,QAAQ,GAAG,CAAC,GAAG,CAAC,CAAC,YAAY,CAAC,IAAI,EAAE,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CACvD,eAAe,CAAC,GAAG,CAAC,CAAC,CAAC,CACvB,CAAC;QACF,IAAI,QAAQ,CAAC,MAAM,KAAK,CAAC;YAAE,OAAO,GAAG,CAAC;QACtC,QAAQ,CAAC,OAAO,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,YAAY,CAAC,MAAM,CAAC,CAAC,CAAC,CAAC,CAAC;QAClD,OAAO,CAAC,CAAC,QAAQ,EAAE,CAAC;IACtB,CAAC;IAAC,MAAM,CAAC;QACP,OAAO,GAAG,CAAC;IACb,CAAC;AACH,CAAC;AAED,MAAM,UAAU,sBAAsB,CAAC,EAAU;IAC/C,OAAO,EAAE,CAAC,OAAO,CAAC,0BAA0B,EAAE,CAAC,IAAI,EAAE,IAAI,EAAE,GAAG,EAAE,EAAE;QAChE,MAAM,KAAK,GAAG,QAAQ,CAAC,GAAa,CAAC,CAAC;QACtC,OAAO,KAAK,KAAK,GAAG,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,IAAI,IAAc,KAAK,KAAK,GAAG,CAAC;IAChE,CAAC,CAAC,CAAC;AACL,CAAC"}
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* wrapLongCellText — insert zero-width spaces (U+200B) at natural break
|
|
3
|
+
* points inside pipe-table cells whose content is a single very long token
|
|
4
|
+
* (no whitespace gaps, ≥ MIN_CHARS characters).
|
|
5
|
+
*
|
|
6
|
+
* Problem: URLs, file paths, and long identifiers in table cells cannot wrap
|
|
7
|
+
* because they contain no space characters. Typst, HTML, and Word renderers
|
|
8
|
+
* have limited support for `word-break: break-all` equivalents. Inserting
|
|
9
|
+
* ZWS at common delimiters gives the renderer explicit break opportunities
|
|
10
|
+
* without changing the visual content.
|
|
11
|
+
*
|
|
12
|
+
* Break points injected after: / . - _ = & ? # , : @ +
|
|
13
|
+
*
|
|
14
|
+
* Conservative by design:
|
|
15
|
+
* - Only triggers for tokens ≥ MIN_CHARS (default 40).
|
|
16
|
+
* - Skips cells that are already short or contain natural spaces.
|
|
17
|
+
* - Skips code-fenced lines.
|
|
18
|
+
* - Skips separator rows (`| --- |`).
|
|
19
|
+
* - Idempotent: ZWS are already invisible so re-running is safe.
|
|
20
|
+
*
|
|
21
|
+
* Downstream compatibility:
|
|
22
|
+
*   - Typst: ZWS is a Unicode line-break opportunity (UAX #14 class ZW, rule LB8).
|
|
23
|
+
* - HTML/EPUB: browsers honour ZWS as a word-break point.
|
|
24
|
+
* - DOCX: Word respects ZWS.
|
|
25
|
+
* - LaTeX: pandoc translates ZWS to `\hspace{0pt}` or ignores it — safe.
|
|
26
|
+
*/
|
|
27
|
+
export declare function wrapLongCellText(md: string): string;
|
|
28
|
+
//# sourceMappingURL=wrap-long-cell-text.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"wrap-long-cell-text.d.ts","sourceRoot":"","sources":["../../src/regex/wrap-long-cell-text.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;;;;GAyBG;AAWH,wBAAgB,gBAAgB,CAAC,EAAE,EAAE,MAAM,GAAG,MAAM,CAqBnD"}
|
|
@@ -0,0 +1,66 @@
|
|
|
1
|
+
/**
 * wrapLongCellText — insert zero-width spaces (U+200B) at natural break
 * points inside long, unbroken tokens of pipe-table cells.
 *
 * Problem: URLs, file paths, and long identifiers in table cells cannot wrap
 * because they contain no space characters. Inserting ZWS after common
 * delimiters gives a renderer explicit break opportunities without changing
 * the visible text (ZWS is line-break class ZW in UAX #14).
 *
 * Break points are injected after: / . - _ = & ? # , : @ +
 *
 * Rules implemented below:
 *   - Each whitespace-delimited token of a cell is judged on its own; only
 *     tokens of at least MIN_CHARS (40) characters are touched.
 *   - Lines between ``` fences are skipped.
 *   - Separator rows (`| --- |`) and non-table lines are skipped.
 *   - Tokens that already contain a ZWS are skipped, so re-running the
 *     cleaner yields the same output.
 *
 * @param {string} md Markdown text.
 * @returns {string} Markdown with ZWS inserted into long table-cell tokens.
 */
// Spelled as an escape on purpose: a raw U+200B inside the quotes is invisible
// and easily lost, which would turn this into "" and disable the cleaner
// (every string "includes" the empty string).
const ZWS = "\u200B";
const MIN_CHARS = 40;
// Characters that mark a good break point. ZWS is inserted AFTER each.
const BREAK_RE = /([/._\-=&?#,:@+])/g;
const isSeparatorRow = (line) => /^\s*\|[\s|:-]+\|\s*$/.test(line);
const isPipeRow = (line) => /^\s*\|/.test(line) && !isSeparatorRow(line);

export function wrapLongCellText(md) {
    let inCode = false;
    const processLine = (line) => {
        // A ``` line toggles fenced-code state and is itself never modified.
        if (/^\s*```/.test(line)) {
            inCode = !inCode;
            return line;
        }
        if (inCode || !isPipeRow(line)) {
            return line;
        }
        // Split on unescaped pipes, wrap long tokens per cell, re-join.
        return line
            .split(/(?<!\\)\|/)
            // Blank segments (row edges, empty cells) pass through unchanged.
            .map((cell) => (cell.trim() ? wrapCellTokens(cell) : cell))
            .join("|");
    };
    return md.split("\n").map(processLine).join("\n");
}

/** Insert ZWS after each break character of every long token in one cell. */
function wrapCellTokens(cell) {
    return cell.replace(/\S+/g, (token) => {
        // U+200B is not matched by \s, so an already-wrapped token arrives here
        // in one piece; leaving it alone keeps the function idempotent.
        if (token.includes(ZWS) || token.length < MIN_CHARS) {
            return token;
        }
        return token.replace(BREAK_RE, `$1${ZWS}`);
    });
}
|
|
66
|
+
//# sourceMappingURL=wrap-long-cell-text.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"wrap-long-cell-text.js","sourceRoot":"","sources":["../../src/regex/wrap-long-cell-text.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;;;;GAyBG;AAEH,MAAM,GAAG,GAAG,GAAG,CAAC;AAChB,MAAM,SAAS,GAAG,EAAE,CAAC;AAErB,qEAAqE;AACrE,MAAM,QAAQ,GAAG,oBAAoB,CAAC;AAEtC,MAAM,cAAc,GAAG,CAAC,IAAY,EAAE,EAAE,CAAC,sBAAsB,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;AAC3E,MAAM,SAAS,GAAO,CAAC,IAAY,EAAE,EAAE,CAAC,QAAQ,CAAC,IAAI,CAAC,IAAI,CAAC,IAAI,CAAC,cAAc,CAAC,IAAI,CAAC,CAAC;AAErF,MAAM,UAAU,gBAAgB,CAAC,EAAU;IACzC,MAAM,KAAK,GAAG,EAAE,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC;IAC7B,IAAI,MAAM,GAAG,KAAK,CAAC;IAEnB,OAAO,KAAK,CAAC,GAAG,CAAC,CAAC,IAAI,EAAE,EAAE;QACxB,IAAI,SAAS,CAAC,IAAI,CAAC,IAAI,CAAC,EAAE,CAAC;YACzB,MAAM,GAAG,CAAC,MAAM,CAAC;YACjB,OAAO,IAAI,CAAC;QACd,CAAC;QACD,IAAI,MAAM,IAAI,CAAC,SAAS,CAAC,IAAI,CAAC;YAAE,OAAO,IAAI,CAAC;QAE5C,uDAAuD;QACvD,OAAO,IAAI;aACR,KAAK,CAAC,WAAW,CAAC;aAClB,GAAG,CAAC,CAAC,IAAI,EAAE,GAAG,EAAE,EAAE;YACjB,qEAAqE;YACrE,IAAI,CAAC,IAAI,CAAC,IAAI,EAAE;gBAAE,OAAO,IAAI,CAAC;YAC9B,OAAO,cAAc,CAAC,IAAI,CAAC,CAAC;QAC9B,CAAC,CAAC;aACD,IAAI,CAAC,GAAG,CAAC,CAAC;IACf,CAAC,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;AAChB,CAAC;AAED,SAAS,cAAc,CAAC,IAAY;IAClC,OAAO,IAAI,CAAC,OAAO,CAAC,MAAM,EAAE,CAAC,KAAK,EAAE,EAAE;QACpC,oEAAoE;QACpE,gCAAgC;QAChC,IAAI,KAAK,CAAC,QAAQ,CAAC,GAAG,CAAC;YAAE,OAAO,KAAK,CAAC;QACtC,IAAI,KAAK,CAAC,MAAM,GAAG,SAAS;YAAE,OAAO,KAAK,CAAC;QAC3C,OAAO,KAAK,CAAC,OAAO,CAAC,QAAQ,EAAE,KAAK,GAAG,EAAE,CAAC,CAAC;IAC7C,CAAC,CAAC,CAAC;AACL,CAAC"}
|
|
@@ -0,0 +1,6 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Masks fenced and indented code blocks with placeholders before applying a
|
|
3
|
+
* transformation, then restores them. Prevents cleaners from corrupting code.
|
|
4
|
+
*/
|
|
5
|
+
export declare function withProtectedCode(md: string, fn: (masked: string) => string): string;
|
|
6
|
+
//# sourceMappingURL=protect-code.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"protect-code.d.ts","sourceRoot":"","sources":["../../src/util/protect-code.ts"],"names":[],"mappings":"AAAA;;;GAGG;AACH,wBAAgB,iBAAiB,CAAC,EAAE,EAAE,MAAM,EAAE,EAAE,EAAE,CAAC,MAAM,EAAE,MAAM,KAAK,MAAM,GAAG,MAAM,CAqBpF"}
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
/**
 * Masks fenced and indented code blocks with placeholders before applying a
 * transformation, then restores them. Prevents cleaners from corrupting code.
 *
 * Placeholders have the form `\x00CODE_<n>\x00`, where `<n>` indexes the
 * stashed block. `fn` receives the masked text and must leave placeholders
 * intact for the code to be restored; a placeholder whose index is unknown is
 * replaced by the empty string.
 *
 * @param {string} md Markdown text.
 * @param {(masked: string) => string} fn Transformation applied to the masked text.
 * @returns {string} The transformed text with the code blocks put back.
 */
export function withProtectedCode(md, fn) {
    const blocks = [];
    // Store a block and return the placeholder that stands in for it.
    const stash = (code) => `\x00CODE_${blocks.push(code) - 1}\x00`;

    // Fenced blocks: an opening run of 3+ backticks/tildes at line start, then
    // the closing line made of the same run. The content group is optional and
    // lazy so an EMPTY block (fences on adjacent lines) closes on its own fence
    // rather than pairing with the opening fence of a later block, which would
    // mask prose and leave the later block's code unprotected.
    const FENCED_RE = /^(`{3,}|~{3,})[^\n]*\n(?:[\s\S]*?\n)??\1[ \t]*$/gm;
    const withoutFences = md.replace(FENCED_RE, (match) => stash(match));

    // Indented blocks: runs of lines starting with 4 spaces or a tab.
    // Only protect indented blocks that contain at least one non-whitespace char
    const masked = withoutFences.replace(
        /(^|\n)((?:[ ]{4}|\t)[^\n]*\S[^\n]*(?:\n(?:[ ]{4}|\t)[^\n]*\S[^\n]*)*)/g,
        (_, sep, block) => `${sep}${stash(block)}`
    );

    const result = fn(masked);
    return result.replace(/\x00CODE_(\d+)\x00/g, (_, i) => blocks[+i] ?? "");
}
|
|
20
|
+
//# sourceMappingURL=protect-code.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"protect-code.js","sourceRoot":"","sources":["../../src/util/protect-code.ts"],"names":[],"mappings":"AAAA;;;GAGG;AACH,MAAM,UAAU,iBAAiB,CAAC,EAAU,EAAE,EAA8B;IAC1E,MAAM,MAAM,GAAa,EAAE,CAAC;IAE5B,MAAM,SAAS,GAAG,6CAA6C,CAAC;IAChE,IAAI,MAAM,GAAG,EAAE,CAAC,OAAO,CAAC,SAAS,EAAE,CAAC,KAAK,EAAE,EAAE;QAC3C,MAAM,CAAC,GAAG,MAAM,CAAC,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC;QACjC,OAAO,YAAY,CAAC,MAAM,CAAC;IAC7B,CAAC,CAAC,CAAC;IAEH,6EAA6E;IAC7E,MAAM,GAAG,MAAM,CAAC,OAAO,CACrB,wEAAwE,EACxE,CAAC,CAAC,EAAE,GAAW,EAAE,KAAa,EAAE,EAAE;QAChC,MAAM,CAAC,GAAG,MAAM,CAAC,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC;QACjC,OAAO,GAAG,GAAG,YAAY,CAAC,MAAM,CAAC;IACnC,CAAC,CACF,CAAC;IAEF,MAAM,MAAM,GAAG,EAAE,CAAC,MAAM,CAAC,CAAC;IAE1B,OAAO,MAAM,CAAC,OAAO,CAAC,qBAAqB,EAAE,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,MAAM,CAAC,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC,CAAC;AAC3E,CAAC"}
|
package/package.json
ADDED
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "castdown-cleaners",
|
|
3
|
+
"version": "0.1.0",
|
|
4
|
+
"description": "Composable Markdown post-processing pipeline for MarkItDown, Docling, Pandoc, and LlamaParse output.",
|
|
5
|
+
"type": "module",
|
|
6
|
+
"main": "./src/index.ts",
|
|
7
|
+
"types": "./src/index.ts",
|
|
8
|
+
"exports": {
|
|
9
|
+
".": "./src/index.ts"
|
|
10
|
+
},
|
|
11
|
+
"publishConfig": {
|
|
12
|
+
"main": "./dist/index.js",
|
|
13
|
+
"types": "./dist/index.d.ts",
|
|
14
|
+
"exports": {
|
|
15
|
+
".": {
|
|
16
|
+
"import": "./dist/index.js",
|
|
17
|
+
"types": "./dist/index.d.ts"
|
|
18
|
+
}
|
|
19
|
+
}
|
|
20
|
+
},
|
|
21
|
+
"files": [
|
|
22
|
+
"dist",
|
|
23
|
+
"README.md",
|
|
24
|
+
"LICENSE"
|
|
25
|
+
],
|
|
26
|
+
"scripts": {
|
|
27
|
+
"build": "tsc -p tsconfig.build.json",
|
|
28
|
+
"test": "vitest",
|
|
29
|
+
"typecheck": "tsc -p tsconfig.json --noEmit",
|
|
30
|
+
"prepublishOnly": "pnpm build && pnpm exec vitest run"
|
|
31
|
+
},
|
|
32
|
+
"keywords": [
|
|
33
|
+
"markdown",
|
|
34
|
+
"cleaner",
|
|
35
|
+
"pipeline",
|
|
36
|
+
"pdf",
|
|
37
|
+
"docx",
|
|
38
|
+
"rag",
|
|
39
|
+
"markitdown",
|
|
40
|
+
"docling",
|
|
41
|
+
"pandoc"
|
|
42
|
+
],
|
|
43
|
+
"license": "Apache-2.0",
|
|
44
|
+
"repository": {
|
|
45
|
+
"type": "git",
|
|
46
|
+
"url": "https://github.com/castdown/castdown",
|
|
47
|
+
"directory": "packages/cleaners"
|
|
48
|
+
},
|
|
49
|
+
"dependencies": {
|
|
50
|
+
"remark": "^15.0.1",
|
|
51
|
+
"remark-gfm": "^4.0.0",
|
|
52
|
+
"remark-stringify": "^11.0.0",
|
|
53
|
+
"unified": "^11.0.5"
|
|
54
|
+
},
|
|
55
|
+
"devDependencies": {
|
|
56
|
+
"@types/node": "^22.7.0",
|
|
57
|
+
"typescript": "^5.6.0",
|
|
58
|
+
"vitest": "^2.1.0"
|
|
59
|
+
},
|
|
60
|
+
"engines": {
|
|
61
|
+
"node": ">=20"
|
|
62
|
+
}
|
|
63
|
+
}
|