text-sanctifier 1.0.16 → 1.0.18

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -5,10 +5,10 @@
5
5
  [![downloads](https://img.shields.io/npm/dw/text-sanctifier)](https://www.npmjs.com/package/text-sanctifier)
6
6
  [![GitHub stars](https://img.shields.io/github/stars/iWhatty/text-sanctifier?style=social)](https://github.com/iWhatty/text-sanctifier)
7
7
 
8
- Brutal text normalizer and invisible trash scrubber for modern web projects.
8
+ Brutal text normalizer and invisible Unicode scrubber for modern web projects.
9
9
 
10
- * Minified: (3.09 KB)
11
- * Gzipped (GCC): (1.36 KB)
10
+ * Minified: (3.70 KB)
11
+ * Gzipped (GCC): (1.66 KB)
12
12
 
13
13
  ## Features
14
14
 
@@ -17,9 +17,20 @@ Brutal text normalizer and invisible trash scrubber for modern web projects.
17
17
  * Collapses unwanted spaces and paragraphs
18
18
  * Nukes control characters (if enabled)
19
19
  * Smart normalization of typographic junk (quotes, dashes, bullets, full-width punctuation)
20
- * Keyboard-only filtering (retain printable ASCII + emoji, or restrict)
20
+ * Keyboard-only filtering (retain printable ASCII + full emoji sequences)
21
+
22
+ * Preserves ZWJ emoji clusters (👨‍👩‍👧‍👦)
23
+ * Preserves VS16 emoji presentation variants (✌️, ‼️)
21
24
  * Configurable via fine-grained flags or ready-made presets
22
25
  * Includes strict, loose, and keyboard-only modes
26
+ * Deterministic RegExp usage (no global `lastIndex` state leaks)
27
+
28
+ ## Security notes
29
+
30
+ - **Not an HTML/XSS sanitizer.** This library normalizes and filters plain text.
31
+ - If you need to render **untrusted content**, render it as text (e.g. `textContent`), not HTML (`innerHTML`).
32
+ - If you need to sanitize **HTML**, use a dedicated HTML sanitizer (e.g. DOMPurify / sanitize-html).
33
+ - Like any text-processing library, extremely large untrusted inputs can be used for CPU/DoS pressure; consider input size limits in high-risk environments.
23
34
 
24
35
  ## Install
25
36
 
@@ -27,6 +38,15 @@ Brutal text normalizer and invisible trash scrubber for modern web projects.
27
38
  npm install text-sanctifier
28
39
  ```
29
40
 
41
+ ## Runtime Requirements
42
+
43
+ Requires modern JavaScript runtime with ES2020+ support.
44
+
45
+ * Node.js 14+
46
+ * Modern evergreen browsers
47
+
48
+ ---
49
+
30
50
  ## 📦 Package & Build Info
31
51
 
32
52
  * **Source (`src/`)**: ES2020+ ESM modules with JSDoc
@@ -109,7 +129,7 @@ const report = inspectText(input);
109
129
 
110
130
  Use `inspectText` to preflight text content before rendering, storing, or linting. It's a diagnostic tool to help inform sanitization needs.
111
131
 
112
- Pass the report to getRecommendedSanctifierOptions(report) to auto-generate config flags for summonSanctifier().
132
+ Pass the report to `getRecommendedSanctifierOptions(report)` to auto-generate config flags for `summonSanctifier()`.
113
133
 
114
134
  ---
115
135
 
@@ -133,7 +153,8 @@ Restricts to printable ASCII only (removes emojis).
133
153
 
134
154
  ### `summonSanctifier.keyboardOnlyEmoji`
135
155
 
136
- Restricts to keyboard-safe ASCII + emojis. Preserves fun, removes weird.
156
+ Restricts to printable ASCII + full emoji sequences.
157
+ Preserves ZWJ emoji clusters and emoji presentation variants.
137
158
 
138
159
  ### `inspectText(text: string): UnicodeTrashReport`
139
160
 
@@ -143,4 +164,4 @@ Returns a structural report of control codes, invisible chars, newline styles, a
143
164
 
144
165
  ## License
145
166
 
146
- \--{DR.WATT v3.0}--
167
+ --{DR.WATT v3.0}--
@@ -1,8 +1,9 @@
1
- function g(a={}){const b=!!a.purgeInvisibleChars,c=!!a.purgeEmojis,d=!!a.nukeControls,e=!!a.keyboardOnlyFilter,k=!!a.normalizeNewlines,f=!!a.trimSpacesAroundNewlines,l=!!a.collapseNewLines,m=!!a.preserveParagraphs,p=!!a.collapseSpaces,q=!!a.finalTrim;return w=>h(w,b,c,d,e,k,f,l,m,p,q)}g.strict=a=>h(a,!0,!0,!0,!1,!0,!0,!0,!1,!0,!0);g.loose=a=>h(a,!1,!1,!1,!1,!0,!0,!0,!0,!0,!0);g.keyboardOnlyEmoji=a=>h(a,!1,!1,!1,!0,!0,!0,!1,!1,!1,!0);g.keyboardOnly=a=>h(a,!0,!0,!0,!0,!0,!0,!0,!1,!0,!0);
2
- function h(a,b=!1,c=!1,d=!1,e=!1,k=!1,f=!1,l=!1,m=!1,p=!1,q=!1){if("string"!==typeof a)throw new TypeError("sanctifyText expects a string input.");b&&(a=a.replace(n,""));c&&(a=a.replace(r,""));d&&(a=a.replace(t,""));e&&(a=u(a,c));k&&(a=a.replace(v,"\n"));f&&(a=a.replace(x,"$1"));l&&(b=a,a=m?b.replace(y,"\n\n"):b.replace(z,"\n"));p&&(a=a.replace(A," "));return q?a.trim():a}var n=/[\u00A0\u2000-\u200D\u202F\u2060\u3000\uFEFF\u200E\u200F\u202A-\u202E]+/g,B=/[^\x20-\x7E\r\n]+/gu;
3
- function u(a,b=!1){a=C(a);return b?a.replace(B,""):a.replace(B,c=>c.match(r)?c:"")}var D=/[\u2018\u2019\u201A\u201B\u2032\u2035]/g,E=/[\u201C\u201D\u201E\u201F\u2033\u2036\u00AB\u00BB]/g,F=/[\u2012\u2013\u2014\u2015\u2212]/g,G=/\u2026/g,H=/[\u2022\u00B7]/g,I=/[\uFF01-\uFF5E]/g;function C(a){return a.replace(D,"'").replace(E,'"').replace(F,"-").replace(G,"...").replace(H,"*").replace(I,b=>String.fromCharCode(b.charCodeAt(0)-65248))}var r;
4
- try{r=RegExp("(?:\\p{Extended_Pictographic}(?:\\uFE0F|\\uFE0E)?(?:\\u200D(?:\\p{Extended_Pictographic}|\\w)+)*)","gu")}catch{r=/[\u{2700}-\u{27BF}\u{1F300}-\u{1FAFF}\u{1F3FB}-\u{1F3FF}\u200D\uFE0F\u{1F1E6}-\u{1F1FF}]/gu}var v=/\r\n|\r|\n/g,x=/[ \t]*(\n+)[ \t]*/g,z=/\n{2,}/g,y=/\n{3,}/g,A=/ {2,}/g,t=/[\u0000-\u0008\u000B\u000C\u000E-\u001F\u007F\u0080-\u009F\u200E\u200F\u202A-\u202E]+/g;
5
- function J(a){if("string"!==typeof a)throw new TypeError("inspectText expects a string input.");const b=[],c={hasControlChars:!1,hasInvisibleChars:!1,hasMixedNewlines:!1,newlineStyle:null,hasEmojis:!1,hasNonKeyboardChars:!1,summary:b},d=(f,l,m)=>{f&&(c[l]=!0,b.push(m))};d(t.test(a),"hasControlChars","Control characters detected.");d(n.test(a),"hasInvisibleChars","Invisible Unicode characters detected.");d(r.test(a),"hasEmojis","Emojis detected.");const {j:e,types:k}=K(a);c.hasMixedNewlines=e;c.newlineStyle=
6
- e?"Mixed":k[0]||null;c.newlineStyle&&b.push(e?"Mixed newline styles detected.":`Consistent newline style: ${c.newlineStyle}`);a=C(a).replace(B,f=>f.match(r)?"":"\u2612");d(/[\u2612]/.test(a),"hasNonKeyboardChars","Non-keyboard characters detected.");return c}
7
- function K(a){if("string"!==typeof a)throw new TypeError("getNewlineStats expects a string input.");var b=a.replace(/\r\n/g,"");a={h:(a.match(/\r\n/g)||[]).length,g:(b.match(/\r/g)||[]).length,i:(b.match(/\n/g)||[]).length};b=[];0<a.h&&b.push("CRLF");0<a.g&&b.push("CR");0<a.i&&b.push("LF");return{...a,types:b,j:1<b.length}}
8
- function L(a){return{purgeInvisibleChars:a.hasInvisibleChars,purgeEmojis:a.hasEmojis,nukeControls:a.hasControlChars,keyboardOnlyFilter:a.hasNonKeyboardChars,normalizeNewlines:a.hasMixedNewlines||"CRLF"===a.newlineStyle||"CR"===a.newlineStyle}}export { g as summonSanctifier, J as inspectText, L as getRecommendedSanctifierOptions };
1
+ function g(a={}){const c=!!a.purgeInvisibleChars,b=!!a.purgeEmojis,d=!!a.nukeControls,f=!!a.keyboardOnlyFilter,h=!!a.normalizeNewlines,m=!!a.trimSpacesAroundNewlines,e=!!a.collapseNewLines,r=!!a.preserveParagraphs,t=!!a.collapseSpaces,u=!!a.finalTrim,n=Number.isFinite(a.maxLength)&&0<=a.maxLength?a.maxLength:Infinity,k="truncate"===a.g||"noop"===a.g||"throw"===a.g?a.g:"throw";return z=>l(z,c,b,d,f,h,m,e,r,t,u,n,k)}g.strict=a=>l(a,!0,!0,!0,!1,!0,!0,!0,!1,!0,!0);
2
+ g.loose=a=>l(a,!1,!1,!1,!1,!0,!0,!0,!0,!0,!0);g.keyboardOnlyEmoji=a=>l(a,!1,!1,!1,!0,!0,!0,!1,!1,!1,!0);g.keyboardOnly=a=>l(a,!0,!0,!0,!0,!0,!0,!0,!1,!0,!0);
3
+ function l(a,c=!1,b=!1,d=!1,f=!1,h=!1,m=!1,e=!1,r=!1,t=!1,u=!1,n=Infinity,k="throw"){if("string"!==typeof a)throw new TypeError("sanctifyText expects a string input.");"truncate"!==k&&"noop"!==k&&"throw"!==k&&(k="throw");if(a.length>n)switch(k){case "truncate":a=a.slice(0,n);break;case "noop":return a;default:throw new RangeError(`sanctifyText input length ${a.length} exceeds maxLength ${n}.`);}c&&(a=a.replace(p,""));b&&(a=a.replace(q,""));d&&(a=a.replace(v,""));f&&(a=w(a,b));h&&(a=a.replace(x,"\n"));
4
+ m&&(a=a.replace(y,"$1"));e&&(c=a,a=r?c.replace(A,"\n\n"):c.replace(B,"\n"));t&&(a=a.replace(C," "));return u?a.trim():a}var p=/[\u00A0\u2000-\u200D\u202F\u2060\u3000\uFEFF\u200E\u200F\u202A-\u202E]+/g,D=/[^\x20-\x7E\r\n]+/gu;function w(a,c=!1){a=E(a);return c?a.replace(D,""):a.replace(D,b=>{q.lastIndex=0;if(q.test(b)){q.lastIndex=0;var d="";for(const f of b.matchAll(q))d+=f[0];b=d}else b="";return b})}
5
+ var F=/[\u2018\u2019\u201A\u201B\u2032\u2035]/g,G=/[\u201C\u201D\u201E\u201F\u2033\u2036\u00AB\u00BB]/g,H=/[\u2012\u2013\u2014\u2015\u2212]/g,I=/\u2026/g,J=/[\u2022\u00B7]/g,K=/[\uFF01-\uFF5E]/g;function E(a){return a.replace(F,"'").replace(G,'"').replace(H,"-").replace(I,"...").replace(J,"*").replace(K,c=>String.fromCharCode(c.charCodeAt(0)-65248))}var q;try{q=RegExp("(?:\\p{Extended_Pictographic}(?:\\uFE0F|\\uFE0E)?(?:\\u200D(?:\\p{Extended_Pictographic}|\\w)+)*)","gu")}catch{q=/[\u{2700}-\u{27BF}\u{1F300}-\u{1FAFF}\u{1F3FB}-\u{1F3FF}\u200D\uFE0F\u{1F1E6}-\u{1F1FF}]/gu}
6
+ var x=/\r\n|\r|\n/g,y=/[ \t]*(\n+)[ \t]*/g,B=/\n{2,}/g,A=/\n{3,}/g,C=/ {2,}/g,v=/[\u0000-\u0008\u000B\u000C\u000E-\u001F\u007F\u0080-\u009F\u200E\u200F\u202A-\u202E]+/g;function L(a,c){a.lastIndex=0;return a.test(c)}
7
+ function M(a){if("string"!==typeof a)throw new TypeError("inspectText expects a string input.");const c=[],b={hasControlChars:!1,hasInvisibleChars:!1,hasMixedNewlines:!1,newlineStyle:null,hasEmojis:!1,hasNonKeyboardChars:!1,summary:c};L(v,a)&&(b.hasControlChars=!0,c.push("Control characters detected."));L(p,a)&&(b.hasInvisibleChars=!0,c.push("Invisible Unicode characters detected."));L(q,a)&&(b.hasEmojis=!0,c.push("Emojis detected."));if("string"!==typeof a)throw new TypeError("getNewlineStats expects a string input.");
8
+ var d=a.replace(/\r\n/g,"");d={i:(a.match(/\r\n/g)||[]).length,h:(d.match(/\r/g)||[]).length,j:(d.match(/\n/g)||[]).length};const f=[];0<d.i&&f.push("CRLF");0<d.h&&f.push("CR");0<d.j&&f.push("LF");const {l:h,types:m}={...d,types:f,l:1<f.length};b.hasMixedNewlines=h;b.newlineStyle=h?"Mixed":m[0]||null;b.newlineStyle&&c.push(h?"Mixed newline styles detected.":`Consistent newline style: ${b.newlineStyle}`);a:{a=E(a);if(L(D,a))for(e of a)if(!("\n"===e||"\r"===e||" "<=e&&"~">=e||L(q,e))){var e=!0;break a}e=
9
+ !1}e&&(b.hasNonKeyboardChars=!0,c.push("Non-keyboard characters detected."));return b}function N(a){return{purgeInvisibleChars:a.hasInvisibleChars,purgeEmojis:a.hasEmojis,nukeControls:a.hasControlChars,keyboardOnlyFilter:a.hasNonKeyboardChars,normalizeNewlines:a.hasMixedNewlines||"CRLF"===a.newlineStyle||"CR"===a.newlineStyle}}export { g as summonSanctifier, M as inspectText, N as getRecommendedSanctifierOptions };
@@ -0,0 +1,4 @@
1
+ import { inspectText } from './inspectText.js';
2
+ import { getRecommendedSanctifierOptions } from './inspectText.js';
3
+ import { summonSanctifier } from './sanctifyText.js';
4
+ export { inspectText, getRecommendedSanctifierOptions, summonSanctifier };
@@ -0,0 +1,38 @@
1
+ /**
2
+ * Detects textual "trash" or anomalies in a given string.
3
+ * @param {string} text @returns {UnicodeTrashReport}
4
+ */
5
+ export function inspectText(text: string): UnicodeTrashReport;
6
+ /**
7
+ * Counts the number of different newline types in a string.
8
+ * @param {string} text
9
+ * @returns {{
10
+ * crlf: number,
11
+ * cr: number,
12
+ * lf: number,
13
+ * types: string[],
14
+ * mixed: boolean
15
+ * }}
16
+ */
17
+ export function getNewlineStats(text: string): {
18
+ crlf: number;
19
+ cr: number;
20
+ lf: number;
21
+ types: string[];
22
+ mixed: boolean;
23
+ };
24
+ /**
25
+ * Creates defaultOptions for summonSanctifier based on inspectText result
26
+ * @param {!UnicodeTrashReport} report
27
+ * @return {!SanctifyOptions}
28
+ */
29
+ export function getRecommendedSanctifierOptions(report: UnicodeTrashReport): SanctifyOptions;
30
+ export type UnicodeTrashReport = {
31
+ hasControlChars: boolean;
32
+ hasInvisibleChars: boolean;
33
+ hasMixedNewlines: boolean;
34
+ newlineStyle: "LF" | "CRLF" | "CR" | "Mixed" | null;
35
+ hasEmojis: boolean;
36
+ hasNonKeyboardChars: boolean;
37
+ summary: string[];
38
+ };
@@ -0,0 +1,167 @@
1
+ /**
2
+ * @typedef {Object} SanctifyOptions
3
+ * @property {boolean} [purgeInvisibleChars]
4
+ * @property {boolean} [purgeEmojis]
5
+ * @property {boolean} [nukeControls]
6
+ * @property {boolean} [keyboardOnlyFilter]
7
+ * @property {boolean} [normalizeNewlines]
8
+ * @property {boolean} [trimSpacesAroundNewlines]
9
+ * @property {boolean} [collapseNewLines]
10
+ * @property {boolean} [preserveParagraphs]
11
+ * @property {boolean} [collapseSpaces]
12
+ * @property {boolean} [finalTrim]
13
+ * @property {number} [maxLength]
14
+ * @property {'throw'|'truncate'|'noop'} [onMaxLength]
15
+ */
16
+ /**
17
+ * Summons a customized sanctifier function with pre-bound booleans.
18
+ *
19
+ * Accepts full flag names and returns a text-cleaning function.
20
+ *
21
+ * @param {Object} [defaultOptions={}]
22
+ * @param {boolean} [defaultOptions.purgeInvisibleChars]
23
+ * @param {boolean} [defaultOptions.purgeEmojis]
24
+ * @param {boolean} [defaultOptions.nukeControls]
25
+ * @param {boolean} [defaultOptions.keyboardOnlyFilter]
26
+ * @param {boolean} [defaultOptions.normalizeNewlines]
27
+ * @param {boolean} [defaultOptions.trimSpacesAroundNewlines]
28
+ * @param {boolean} [defaultOptions.collapseNewLines]
29
+ * @param {boolean} [defaultOptions.preserveParagraphs]
30
+ * @param {boolean} [defaultOptions.collapseSpaces]
31
+ * @param {boolean} [defaultOptions.finalTrim]
32
+ * @param {number} [defaultOptions.maxLength=Infinity] - Hard input length cap (UTF-16 code units).
33
+ * @param {'throw' | 'truncate' | 'noop'} [defaultOptions.onMaxLength='throw'] - Behavior when input exceeds maxLength.
34
+ * @returns {(text: string) => string}
35
+ */
36
+ export function summonSanctifier(defaultOptions?: {
37
+ purgeInvisibleChars?: boolean;
38
+ purgeEmojis?: boolean;
39
+ nukeControls?: boolean;
40
+ keyboardOnlyFilter?: boolean;
41
+ normalizeNewlines?: boolean;
42
+ trimSpacesAroundNewlines?: boolean;
43
+ collapseNewLines?: boolean;
44
+ preserveParagraphs?: boolean;
45
+ collapseSpaces?: boolean;
46
+ finalTrim?: boolean;
47
+ maxLength?: number;
48
+ onMaxLength?: "throw" | "truncate" | "noop";
49
+ }): (text: string) => string;
50
+ export namespace summonSanctifier {
51
+ /**
52
+ * Strict sanitizer:
53
+ * - Purge emojis
54
+ * - Collapse all newlines
55
+ * - Collapse spaces
56
+ * - Nuke control characters
57
+ */
58
+ /** @param {string} text @returns {string} */
59
+ function strict(text: string): string;
60
+ /**
61
+ * Loose sanitizer:
62
+ * - Collapse spaces
63
+ * - Preserve paragraphs
64
+ * - Normalize newlines
65
+ */
66
+ /** @param {string} text @returns {string} */
67
+ function loose(text: string): string;
68
+ /**
69
+ * Keyboard-only (with emojis):
70
+ * - Keeps emojis and printable ASCII
71
+ * - Strips non-standard characters
72
+ * - Normalizes typographic trash
73
+ */
74
+ /** @param {string} text @returns {string} */
75
+ function keyboardOnlyEmoji(text: string): string;
76
+ /**
77
+ * Keyboard-only (strict):
78
+ * - Removes emojis
79
+ * - Collapses all whitespace
80
+ * - Restricts to printable ASCII only
81
+ */
82
+ /** @param {string} text @returns {string} */
83
+ function keyboardOnly(text: string): string;
84
+ }
85
+ /**
86
+ * Text Sanctifier
87
+ *
88
+ * Brutal text normalizer and invisible trash scrubber,
89
+ * configurable to kill whatever ghosts you want dead.
90
+ *
91
+ * ⚠️ Note: This is plain-text normalization/filtering — not an HTML/XSS sanitizer.
92
+ *
93
+ * @param {string} text
94
+ * @param {boolean} [purgeInvisibleChars=false] - Remove ZWSP, NBSP, bidi, etc.
95
+ * @param {boolean} [purgeEmojis=false] - Remove emoji characters entirely.
96
+ * @param {boolean} [nukeControls=false] - Remove non-whitespace control characters.
97
+ * @param {boolean} [keyboardOnlyFilter=false] - Keep printable ASCII + full emoji sequences only (drops other Unicode).
98
+ * @param {boolean} [normalizeNewlines=false] - Convert all newlines to `\n`.
99
+ * @param {boolean} [trimSpacesAroundNewlines=false] - Remove spaces/tabs around newlines.
100
+ * @param {boolean} [collapseNewLines=false] - Collapse `\n` runs (optionally preserve paragraphs).
101
+ * @param {boolean} [preserveParagraphs=false] - Preserve paragraph breaks when collapsing newlines.
102
+ * @param {boolean} [collapseSpaces=false] - Collapse multiple spaces into one.
103
+ * @param {boolean} [finalTrim=false] - `.trim()` the final output (head/tail).
104
+ * @param {number} [maxLength=Infinity] - Hard input length cap (UTF-16 code units via `text.length`).
105
+ * @param {'throw'|'truncate'|'noop'} [onMaxLength='throw'] - Behavior when `text.length` exceeds `maxLength`:
106
+ * - `'throw'`: throw a `RangeError`
107
+ * - `'truncate'`: slice to `maxLength` before processing
108
+ * - `'noop'`: return the original input unchanged (skips processing)
109
+ * @returns {string}
110
+ * @throws {TypeError} If `text` is not a string.
111
+ * @throws {RangeError} If input exceeds `maxLength` and `onMaxLength` is `'throw'`.
112
+ */
113
+ export function sanctifyText(text: string, purgeInvisibleChars?: boolean, purgeEmojis?: boolean, nukeControls?: boolean, keyboardOnlyFilter?: boolean, normalizeNewlines?: boolean, trimSpacesAroundNewlines?: boolean, collapseNewLines?: boolean, preserveParagraphs?: boolean, collapseSpaces?: boolean, finalTrim?: boolean, maxLength?: number, onMaxLength?: "throw" | "truncate" | "noop"): string;
114
+ /**
115
+ * Normalizes typographic Unicode punctuation into ASCII equivalents.
116
+ * @param {string} text
117
+ * @returns {string}
118
+ */
119
+ export function normalizeTypographicJank(text: string): string;
120
+ /**
121
+ * Purges invisible Unicode "trash" characters and removes them.
122
+ *
123
+ * Targets:
124
+ * - Non-breaking spaces (\u00A0)
125
+ * - Zero-width spaces and miscellaneous Unicode spaces (\u2000–\u200D, \u202F, \u2060, \u3000, \uFEFF)
126
+ * - Left-to-right/right-to-left markers and overrides (\u200E, \u200F, \u202A–\u202E)
127
+ *
128
+ * @param {string} text
129
+ * @returns {string}
130
+ */
131
+ export const INVISIBLE_TRASH_REGEX: RegExp;
132
+ /**
133
+ * Matches any character that is NOT:
134
+ * - Printable ASCII (U+0020–U+007E)
135
+ * - Newline characters: \n (LF), \r (CR)
136
+ * This allows for \n, \r, and \r\n line endings.
137
+ */
138
+ export const ASCII_KEYBOARD_SAFE_REGEX: RegExp;
139
+ /** @type {RegExp} */
140
+ export let EMOJI_REGEX: RegExp;
141
+ /**
142
+ * Nukes hidden control characters that are invisible and often dangerous.
143
+ * (Excludes necessary whitespace like \n and \t.)
144
+ *
145
+ * Control characters nuked:
146
+ * - ASCII control range (0x00-0x1F, 0x7F)
147
+ * - Unicode control range (0x80-0x9F)
148
+ * - RTL/LTR markers (U+200E, U+200F, U+202A–U+202E)
149
+ *
150
+ * @param {string} text
151
+ * @returns {string}
152
+ */
153
+ export const CONTROL_CHARS_REGEX: RegExp;
154
+ export type SanctifyOptions = {
155
+ purgeInvisibleChars?: boolean;
156
+ purgeEmojis?: boolean;
157
+ nukeControls?: boolean;
158
+ keyboardOnlyFilter?: boolean;
159
+ normalizeNewlines?: boolean;
160
+ trimSpacesAroundNewlines?: boolean;
161
+ collapseNewLines?: boolean;
162
+ preserveParagraphs?: boolean;
163
+ collapseSpaces?: boolean;
164
+ finalTrim?: boolean;
165
+ maxLength?: number;
166
+ onMaxLength?: "throw" | "truncate" | "noop";
167
+ };
package/package.json CHANGED
@@ -1,27 +1,35 @@
1
1
  {
2
2
  "name": "text-sanctifier",
3
- "version": "1.0.16",
3
+ "version": "1.0.18",
4
4
  "type": "module",
5
5
  "description": "A brutal text normalizer and invisible trash scrubber for modern web projects.",
6
6
  "main": "./src/index.js",
7
- "module": "./src/index.js",
7
+ "module": "./src/index.js",
8
8
  "browser": "./dist/text-sanctifier.min.js",
9
9
  "files": [
10
10
  "src",
11
11
  "dist/text-sanctifier.min.js",
12
+ "dist/types",
12
13
  "LICENSE.md",
13
14
  "README.md"
14
15
  ],
15
16
  "exports": {
16
- ".": "./src/index.js",
17
- "./browser": "./dist/text-sanctifier.min.js"
18
- },
19
- "types": "./src/index.d.ts",
17
+ ".": {
18
+ "types": "./dist/types/index.d.ts",
19
+ "default": "./src/index.js"
20
+ },
21
+ "./browser": {
22
+ "default": "./dist/text-sanctifier.min.js"
23
+ }
24
+ },
25
+ "types": "./dist/types/index.d.ts",
20
26
  "sideEffects": false,
21
27
  "scripts": {
22
28
  "build": "node scripts/buildc.js",
29
+ "build:types": "tsc -p tsconfig.types.json",
23
30
  "test": "node tests/sanctifyText.test.js",
24
- "test-min": "node tests/sanctifyText.test.min.js"
31
+ "test-min": "node tests/sanctifyText.test.min.js",
32
+ "ci": "npm run test && npm run build && npm run build:types && npm run test-min"
25
33
  },
26
34
  "keywords": [
27
35
  "text",
@@ -34,7 +42,7 @@
34
42
  "sanctify"
35
43
  ],
36
44
  "author": "👾Dr.Watt👾 <WATT3D@protonmail.com>",
37
- "license": "👾Dr.Watt👾 License v3.0",
45
+ "license": "👾Dr.Watt👾 License v3.0",
38
46
  "repository": {
39
47
  "type": "git",
40
48
  "url": "git+https://github.com/iWhatty/text-sanctifier.git"
@@ -47,9 +55,9 @@
47
55
  "test": "tests"
48
56
  },
49
57
  "devDependencies": {
50
- "terser": "^5.39.0",
51
58
  "esbuild": "^0.25.3",
52
- "google-closure-compiler": "^20240317.0.0"
53
-
59
+ "google-closure-compiler": "^20240317.0.0",
60
+ "terser": "^5.39.0",
61
+ "typescript": "^5.9.3"
54
62
  }
55
- }
63
+ }
package/src/index.d.ts CHANGED
@@ -1,116 +1,4 @@
1
- // src/index.d.ts
2
-
3
- export interface SanctifyOptions {
4
- /** Remove ZWSP, NBSP, bidi, and other invisible Unicode trash */
5
- purgeInvisibleChars?: boolean;
6
-
7
- /** Remove emoji characters */
8
- purgeEmojis?: boolean;
9
-
10
- /** Nuke hidden control characters (excluding whitespace like \n and \t) */
11
- nukeControls?: boolean;
12
-
13
- /** Restrict to printable ASCII (+ emoji if `purgeEmojis` is false) */
14
- keyboardOnlyFilter?: boolean;
15
-
16
- /** Normalize all newline sequences to LF (`\n`) */
17
- normalizeNewlines?: boolean;
18
-
19
- /** Remove tabs and spaces before/after newline characters */
20
- trimSpacesAroundNewlines?: boolean;
21
-
22
- /** Collapse multiple consecutive newlines */
23
- collapseNewLines?: boolean;
24
-
25
- /** When collapsing newlines, preserve paragraph breaks as double `\n\n` */
26
- preserveParagraphs?: boolean;
27
-
28
- /** Collapse multiple spaces into a single space */
29
- collapseSpaces?: boolean;
30
-
31
- /** Trim leading and trailing whitespace from final result */
32
- finalTrim?: boolean;
33
- }
34
-
35
- /** Preconfigured sanitizer function */
36
- export type Sanctifier = (text: string) => string;
37
-
38
- /**
39
- * Summon a reusable text sanitizer.
40
- */
41
- export function summonSanctifier(
42
- defaultOptions?: SanctifyOptions,
43
- ): Sanctifier;
44
-
45
- /**
46
- * Strict sanitizer preset:
47
- * - Collapse spaces
48
- * - Collapse all newlines
49
- * - Nuke control characters
50
- * - Purge emojis
51
- */
52
- export namespace summonSanctifier {
53
- const strict: Sanctifier;
54
- const loose: Sanctifier;
55
-
56
- /**
57
- * Keeps printable ASCII and emoji.
58
- * Leaves spacing soft and preserves emoji.
59
- */
60
- const keyboardOnlyEmoji: Sanctifier;
61
-
62
- /**
63
- * Keeps printable ASCII only.
64
- * Collapses whitespace and purges emoji.
65
- */
66
- const keyboardOnly: Sanctifier;
67
- }
68
-
69
- /**
70
- * Brutally normalizes and cleans a string of text.
71
- */
72
- export function sanctifyText(
73
- text: string,
74
- purgeInvisibleChars?: boolean,
75
- purgeEmojis?: boolean,
76
- nukeControls?: boolean,
77
- keyboardOnlyFilter?: boolean,
78
- normalizeNewlines?: boolean,
79
- trimSpacesAroundNewlines?: boolean,
80
- collapseNewLines?: boolean,
81
- preserveParagraphs?: boolean,
82
- collapseSpaces?: boolean,
83
- finalTrim?: boolean,
84
- ): string;
85
-
86
- /** Style of newline characters detected in a string */
87
- export type NewlineStyle = 'LF' | 'CRLF' | 'CR' | 'Mixed' | null;
88
-
89
- /**
90
- * A structural report of anomalies found in text.
91
- */
92
- export interface UnicodeTrashReport {
93
- hasControlChars: boolean;
94
- hasInvisibleChars: boolean;
95
- hasMixedNewlines: boolean;
96
- newlineStyle: NewlineStyle;
97
- hasEmojis: boolean;
98
- hasNonKeyboardChars: boolean;
99
- summary: string[];
100
- }
101
-
102
- /**
103
- * Analyze a string and return a report of Unicode/control character issues,
104
- * invisible characters, newline styles, emojis, and more.
105
- */
106
- export function inspectText(text: string): UnicodeTrashReport;
107
-
108
-
109
- /**
110
- * Creates a recommended set of `summonSanctifier` options based on the findings
111
- * of `inspectText()`. This maps only what can be inferred automatically —
112
- * user-preference settings like whitespace collapsing are left unset.
113
- */
114
- export function getRecommendedSanctifierOptions(
115
- report: UnicodeTrashReport
116
- ): SanctifyOptions;
1
+ import { inspectText } from './inspectText.js';
2
+ import { getRecommendedSanctifierOptions } from './inspectText.js';
3
+ import { summonSanctifier } from './sanctifyText.js';
4
+ export { inspectText, getRecommendedSanctifierOptions, summonSanctifier };
@@ -1,30 +1,74 @@
1
-
2
1
  // ./src/inspectText.js
3
2
 
4
-
5
3
  import {
6
4
  CONTROL_CHARS_REGEX,
7
5
  INVISIBLE_TRASH_REGEX,
8
6
  EMOJI_REGEX,
9
7
  ASCII_KEYBOARD_SAFE_REGEX,
10
- normalizeTypographicJank
8
+ normalizeTypographicJank,
11
9
  } from './sanctifyText.js';
12
10
 
13
11
 
12
+ /**
13
+ * @typedef {Object} UnicodeTrashReport
14
+ * @property {boolean} hasControlChars
15
+ * @property {boolean} hasInvisibleChars
16
+ * @property {boolean} hasMixedNewlines
17
+ * @property {'LF'|'CRLF'|'CR'|'Mixed'|null} newlineStyle
18
+ * @property {boolean} hasEmojis
19
+ * @property {boolean} hasNonKeyboardChars
20
+ * @property {string[]} summary
21
+ */
22
+
14
23
 
15
24
  /**
16
- * Detects textual "trash" or anomalies in a given string.
25
+ * Safe `.test()` for global/sticky regexes.
26
+ * Global regexes mutate `lastIndex`, which makes `.test()` unreliable across calls.
27
+ * @param {RegExp} re
28
+ * @param {string} s
29
+ */
30
+ function stableTest(re, s) {
31
+ re.lastIndex = 0;
32
+ return re.test(s);
33
+ }
34
+
35
+ /**
36
+ * Returns true if text contains any non-keyboard characters (excluding emojis),
37
+ * after typographic normalization.
38
+ *
39
+ * "Keyboard" here means:
40
+ * - Printable ASCII (0x20–0x7E)
41
+ * - CR/LF
42
+ * - Emojis (per EMOJI_REGEX)
43
+ *
17
44
  * @param {string} text
18
- * @returns {{
19
- * hasControlChars: boolean,
20
- * hasInvisibleChars: boolean,
21
- * hasMixedNewlines: boolean,
22
- * newlineStyle: 'LF' | 'CRLF' | 'CR' | 'Mixed' | null,
23
- * hasEmojis: boolean,
24
- * hasNonKeyboardChars: boolean,
25
- * summary: string[]
26
- * }}
27
- */
45
+ * @returns {boolean}
46
+ */
47
+ function hasNonKeyboardCharsExcludingEmoji(text) {
48
+ const normalized = normalizeTypographicJank(text);
49
+
50
+ // Fast path: no non-ascii runs => no non-keyboard chars
51
+ if (!stableTest(ASCII_KEYBOARD_SAFE_REGEX, normalized)) return false;
52
+
53
+ // Walk code points so we can distinguish emoji from other non-ascii safely
54
+ for (const ch of normalized) {
55
+ // allowed: ASCII printable + newlines
56
+ if (ch === '\n' || ch === '\r' || (ch >= ' ' && ch <= '~')) continue;
57
+
58
+ // allowed: emoji
59
+ if (stableTest(EMOJI_REGEX, ch)) continue;
60
+
61
+ // anything else non-ascii is "non-keyboard"
62
+ return true;
63
+ }
64
+
65
+ return false;
66
+ }
67
+
68
+ /**
69
+ * Detects textual "trash" or anomalies in a given string.
70
+ * @param {string} text @returns {UnicodeTrashReport}
71
+ */
28
72
  export function inspectText(text) {
29
73
  if (typeof text !== 'string') {
30
74
  throw new TypeError('inspectText expects a string input.');
@@ -38,7 +82,7 @@ export function inspectText(text) {
38
82
  newlineStyle: null,
39
83
  hasEmojis: false,
40
84
  hasNonKeyboardChars: false,
41
- summary
85
+ summary,
42
86
  };
43
87
 
44
88
  const flag = (condition, key, message) => {
@@ -49,9 +93,9 @@ export function inspectText(text) {
49
93
  };
50
94
 
51
95
  // === Pattern Checks ===
52
- flag(CONTROL_CHARS_REGEX.test(text), 'hasControlChars', 'Control characters detected.');
53
- flag(INVISIBLE_TRASH_REGEX.test(text), 'hasInvisibleChars', 'Invisible Unicode characters detected.');
54
- flag(EMOJI_REGEX.test(text), 'hasEmojis', 'Emojis detected.');
96
+ flag(stableTest(CONTROL_CHARS_REGEX, text), 'hasControlChars', 'Control characters detected.');
97
+ flag(stableTest(INVISIBLE_TRASH_REGEX, text), 'hasInvisibleChars', 'Invisible Unicode characters detected.');
98
+ flag(stableTest(EMOJI_REGEX, text), 'hasEmojis', 'Emojis detected.');
55
99
 
56
100
  // === Newline Analysis ===
57
101
  const { mixed, types } = getNewlineStats(text);
@@ -60,33 +104,31 @@ export function inspectText(text) {
60
104
 
61
105
  if (report.newlineStyle) {
62
106
  summary.push(
63
- mixed
64
- ? 'Mixed newline styles detected.'
65
- : `Consistent newline style: ${report.newlineStyle}`
107
+ mixed ? 'Mixed newline styles detected.' : `Consistent newline style: ${report.newlineStyle}`
66
108
  );
67
109
  }
68
110
 
69
111
  // === Non-keyboard characters (excluding emojis) ===
70
- const filtered = normalizeTypographicJank(text).replace(ASCII_KEYBOARD_SAFE_REGEX, m =>
71
- m.match(EMOJI_REGEX) ? '' : '☒'
112
+ flag(
113
+ hasNonKeyboardCharsExcludingEmoji(text),
114
+ 'hasNonKeyboardChars',
115
+ 'Non-keyboard characters detected.'
72
116
  );
73
- flag(/[☒]/.test(filtered), 'hasNonKeyboardChars', 'Non-keyboard characters detected.');
74
117
 
75
118
  return report;
76
119
  }
77
120
 
78
-
79
121
  /**
80
122
  * Counts the number of different newline types in a string.
81
123
  * @param {string} text
82
124
  * @returns {{
83
- * crlf: number,
84
- * cr: number,
85
- * lf: number,
86
- * types: string[],
87
- * mixed: boolean
88
- * }}
89
- */
125
+ * crlf: number,
126
+ * cr: number,
127
+ * lf: number,
128
+ * types: string[],
129
+ * mixed: boolean
130
+ * }}
131
+ */
90
132
  export function getNewlineStats(text) {
91
133
  if (typeof text !== 'string') {
92
134
  throw new TypeError('getNewlineStats expects a string input.');
@@ -101,7 +143,7 @@ export function getNewlineStats(text) {
101
143
  const count = {
102
144
  crlf: crlfMatches.length,
103
145
  cr: crMatches.length,
104
- lf: lfMatches.length
146
+ lf: lfMatches.length,
105
147
  };
106
148
 
107
149
  const types = [];
@@ -112,12 +154,10 @@ export function getNewlineStats(text) {
112
154
  return {
113
155
  ...count,
114
156
  types,
115
- mixed: types.length > 1
157
+ mixed: types.length > 1,
116
158
  };
117
159
  }
118
160
 
119
-
120
-
121
161
  /**
122
162
  * Creates defaultOptions for summonSanctifier based on inspectText result
123
163
  * @param {!UnicodeTrashReport} report
@@ -129,11 +169,7 @@ export function getRecommendedSanctifierOptions(report) {
129
169
  purgeEmojis: report.hasEmojis,
130
170
  nukeControls: report.hasControlChars,
131
171
  keyboardOnlyFilter: report.hasNonKeyboardChars,
132
- normalizeNewlines: report.hasMixedNewlines || report.newlineStyle === 'CRLF' || report.newlineStyle === 'CR',
133
- // trimSpacesAroundNewlines: true,
134
- // collapseNewLines: false,
135
- // preserveParagraphs: true,
136
- // collapseSpaces: true,
137
- // finalTrim: true,
172
+ normalizeNewlines:
173
+ report.hasMixedNewlines || report.newlineStyle === 'CRLF' || report.newlineStyle === 'CR',
138
174
  };
139
175
  }
@@ -13,6 +13,8 @@
13
13
  * @property {boolean} [preserveParagraphs]
14
14
  * @property {boolean} [collapseSpaces]
15
15
  * @property {boolean} [finalTrim]
16
+ * @property {number} [maxLength]
17
+ * @property {'throw'|'truncate'|'noop'} [onMaxLength]
16
18
  */
17
19
 
18
20
 
@@ -32,6 +34,8 @@
32
34
  * @param {boolean} [defaultOptions.preserveParagraphs]
33
35
  * @param {boolean} [defaultOptions.collapseSpaces]
34
36
  * @param {boolean} [defaultOptions.finalTrim]
37
+ * @param {number} [defaultOptions.maxLength=Infinity] - Hard input length cap (UTF-16 code units).
38
+ * @param {'throw' | 'truncate' | 'noop'} [defaultOptions.onMaxLength='throw'] - Behavior when input exceeds maxLength.
35
39
  * @returns {(text: string) => string}
36
40
  */
37
41
  export function summonSanctifier(defaultOptions = {}) {
@@ -46,19 +50,34 @@ export function summonSanctifier(defaultOptions = {}) {
46
50
  const collapseSpaces = !!defaultOptions.collapseSpaces;
47
51
  const finalTrim = !!defaultOptions.finalTrim;
48
52
 
49
- return text => sanctifyText(
50
- text,
51
- purgeInvisibleChars,
52
- purgeEmojis,
53
- nukeControls,
54
- keyboardOnlyFilter,
55
- normalizeNewlines,
56
- trimSpacesAroundNewlines,
57
- collapseNewLines,
58
- preserveParagraphs,
59
- collapseSpaces,
60
- finalTrim
61
- );
53
+ const maxLength =
54
+ Number.isFinite(defaultOptions.maxLength) && defaultOptions.maxLength >= 0
55
+ ? defaultOptions.maxLength
56
+ : Infinity;
57
+
58
+ const onMaxLength =
59
+ defaultOptions.onMaxLength === 'truncate' ||
60
+ defaultOptions.onMaxLength === 'noop' ||
61
+ defaultOptions.onMaxLength === 'throw'
62
+ ? defaultOptions.onMaxLength
63
+ : 'throw';
64
+
65
+ return (text) =>
66
+ sanctifyText(
67
+ text,
68
+ purgeInvisibleChars,
69
+ purgeEmojis,
70
+ nukeControls,
71
+ keyboardOnlyFilter,
72
+ normalizeNewlines,
73
+ trimSpacesAroundNewlines,
74
+ collapseNewLines,
75
+ preserveParagraphs,
76
+ collapseSpaces,
77
+ finalTrim,
78
+ maxLength,
79
+ onMaxLength
80
+ );
62
81
  }
63
82
 
64
83
  // --- Added Presets ---
@@ -70,6 +89,7 @@ export function summonSanctifier(defaultOptions = {}) {
70
89
  * - Collapse spaces
71
90
  * - Nuke control characters
72
91
  */
92
+ /** @param {string} text @returns {string} */
73
93
  summonSanctifier.strict = text => sanctifyText(
74
94
  text,
75
95
  true, // purgeInvisibleChars
@@ -91,6 +111,7 @@ summonSanctifier.strict = text => sanctifyText(
91
111
  * - Preserve paragraphs
92
112
  * - Normalize newlines
93
113
  */
114
+ /** @param {string} text @returns {string} */
94
115
  summonSanctifier.loose = text => sanctifyText(
95
116
  text,
96
117
  false, // purgeInvisibleChars
@@ -112,6 +133,7 @@ summonSanctifier.loose = text => sanctifyText(
112
133
  * - Strips non-standard characters
113
134
  * - Normalizes typographic trash
114
135
  */
136
+ /** @param {string} text @returns {string} */
115
137
  summonSanctifier.keyboardOnlyEmoji = text => sanctifyText(
116
138
  text,
117
139
  false, // purgeInvisibleChars
@@ -133,9 +155,10 @@ summonSanctifier.keyboardOnlyEmoji = text => sanctifyText(
133
155
  * - Collapses all whitespace
134
156
  * - Restricts to printable ASCII only
135
157
  */
158
+ /** @param {string} text @returns {string} */
136
159
  summonSanctifier.keyboardOnly = text => sanctifyText(
137
160
  text,
138
- true, // purgeInvisibleChars
161
+ true, // purgeInvisibleChars
139
162
  true, // purgeEmojis
140
163
  true, // nukeControls
141
164
  true, // keyboardOnlyFilter
@@ -150,22 +173,31 @@ summonSanctifier.keyboardOnly = text => sanctifyText(
150
173
 
151
174
  /**
152
175
  * Text Sanctifier
153
- *
176
+ *
154
177
  * Brutal text normalizer and invisible trash scrubber,
155
178
  * configurable to kill whatever ghosts you want dead.
156
- *
157
- * @param {string | null | undefined} text
179
+ *
180
+ * ⚠️ Note: This is plain-text normalization/filtering — not an HTML/XSS sanitizer.
181
+ *
182
+ * @param {string} text
158
183
  * @param {boolean} [purgeInvisibleChars=false] - Remove ZWSP, NBSP, bidi, etc.
159
184
  * @param {boolean} [purgeEmojis=false] - Remove emoji characters entirely.
160
185
  * @param {boolean} [nukeControls=false] - Remove non-whitespace control characters.
161
- * @param {boolean} [keyboardOnlyFilter=false] - Keep printable ASCII and emojis only.
186
+ * @param {boolean} [keyboardOnlyFilter=false] - Keep printable ASCII + full emoji sequences only (drops other Unicode).
162
187
  * @param {boolean} [normalizeNewlines=false] - Convert all newlines to `\n`.
163
188
  * @param {boolean} [trimSpacesAroundNewlines=false] - Remove spaces/tabs around newlines.
164
189
  * @param {boolean} [collapseNewLines=false] - Collapse `\n` runs (optionally preserve paragraphs).
165
190
  * @param {boolean} [preserveParagraphs=false] - Preserve paragraph breaks when collapsing newlines.
166
191
  * @param {boolean} [collapseSpaces=false] - Collapse multiple spaces into one.
167
192
  * @param {boolean} [finalTrim=false] - `.trim()` the final output (head/tail).
193
+ * @param {number} [maxLength=Infinity] - Hard input length cap (UTF-16 code units via `text.length`).
194
+ * @param {'throw'|'truncate'|'noop'} [onMaxLength='throw'] - Behavior when `text.length` exceeds `maxLength`:
195
+ * - `'throw'`: throw a `RangeError`
196
+ * - `'truncate'`: slice to `maxLength` before processing
197
+ * - `'noop'`: return the original input unchanged (skips processing)
168
198
  * @returns {string}
199
+ * @throws {TypeError} If `text` is not a string.
200
+ * @throws {RangeError} If input exceeds `maxLength` and `onMaxLength` is `'throw'`.
169
201
  */
170
202
  export function sanctifyText(
171
203
  text,
@@ -179,11 +211,32 @@ export function sanctifyText(
179
211
  preserveParagraphs = false,
180
212
  collapseSpaces = false,
181
213
  finalTrim = false,
214
+ maxLength = Infinity,
215
+ onMaxLength = 'throw'
182
216
  ) {
183
217
  if (typeof text !== 'string') {
184
218
  throw new TypeError('sanctifyText expects a string input.');
185
219
  }
186
220
 
221
+ if (onMaxLength !== 'truncate' && onMaxLength !== 'noop' && onMaxLength !== 'throw') {
222
+ onMaxLength = 'throw';
223
+ }
224
+
225
+ if (text.length > maxLength) {
226
+ switch (onMaxLength) {
227
+ case 'truncate':
228
+ text = text.slice(0, maxLength);
229
+ break;
230
+ case 'noop':
231
+ return text;
232
+ case 'throw':
233
+ default:
234
+ throw new RangeError(
235
+ `sanctifyText input length ${text.length} exceeds maxLength ${maxLength}.`
236
+ );
237
+ }
238
+ }
239
+
187
240
  let cleaned = text;
188
241
 
189
242
  // Purge invisible Unicode trash (zero-width, non-breaking, bidi junk, etc.)
@@ -217,6 +270,7 @@ export function sanctifyText(
217
270
 
218
271
  // --- Micro helpers ---
219
272
 
273
+
220
274
  /**
221
275
  * Purges invisible Unicode "trash" characters and removes them.
222
276
  *
@@ -234,6 +288,35 @@ function purgeInvisibleTrash(text) {
234
288
  }
235
289
 
236
290
 
291
+ /**
292
+ * Extracts and preserves complete emoji sequences from a string.
293
+ *
294
+ * Uses the global `EMOJI_REGEX` to match full emoji grapheme sequences,
295
+ * including:
296
+ * - Extended pictographic characters
297
+ * - Zero-width joiner (ZWJ) sequences (e.g. 👨‍👩‍👧‍👦)
298
+ * - Variation Selector-16 (VS16) emoji presentation forms (e.g. ✌️, ‼️)
299
+ *
300
+ * Any non-emoji characters are ignored.
301
+ *
302
+ * Important:
303
+ * `EMOJI_REGEX` is global (`/g`), so `lastIndex` is reset before matching
304
+ * to ensure deterministic behavior across repeated calls.
305
+ *
306
+ * @param {string} text - Input string potentially containing emoji sequences.
307
+ * @returns {string} A string containing only the matched emoji sequences,
308
+ * concatenated in original order.
309
+ */
310
+ function extractEmojiSequences(text) {
311
+ EMOJI_REGEX.lastIndex = 0;
312
+ if (!EMOJI_REGEX.test(text)) return ''; // quick reject
313
+ EMOJI_REGEX.lastIndex = 0;
314
+
315
+ let out = '';
316
+ for (const match of text.matchAll(EMOJI_REGEX)) out += match[0];
317
+ return out;
318
+ }
319
+
237
320
  /**
238
321
  * Matches any character that is NOT:
239
322
  * - Printable ASCII (U+0020–U+007E)
@@ -250,14 +333,15 @@ export const ASCII_KEYBOARD_SAFE_REGEX = /[^\x20-\x7E\r\n]+/gu;
250
333
  function purgeNonKeyboardChars(text, purgeEmojis = false) {
251
334
  const normalized = normalizeTypographicJank(text);
252
335
 
336
+ // If emojis are being purged, keyboard-only becomes "ASCII + CR/LF only"
253
337
  if (purgeEmojis) {
254
338
  return normalized.replace(ASCII_KEYBOARD_SAFE_REGEX, '');
255
339
  }
256
340
 
257
- // Remove non-ASCII unless it's a valid emoji
258
- return normalized.replace(ASCII_KEYBOARD_SAFE_REGEX, m =>
259
- m.match(EMOJI_REGEX) ? m : ''
260
- );
341
+ // Replace each non-ASCII run with ONLY the emoji sequences inside it.
342
+ // This preserves ZWJ sequences and VS16 variants (e.g. 👨‍👩‍👧‍👦, ✌️, ‼️),
343
+ // while dropping non-emoji non-ASCII (e.g. 𝌆).
344
+ return normalized.replace(ASCII_KEYBOARD_SAFE_REGEX, (run) => extractEmojiSequences(run));
261
345
  }
262
346
 
263
347
 
@@ -279,6 +363,11 @@ const BULLETS_REGEX = /[\u2022\u00B7]/g;
279
363
  // Full-width ASCII punctuation: U+FF01 - U+FF5E
280
364
  const FULLWIDTH_PUNCTUATION_REGEX = /[\uFF01-\uFF5E]/g;
281
365
 
366
+ /**
367
+ * Normalizes typographic Unicode punctuation into ASCII equivalents.
368
+ * @param {string} text
369
+ * @returns {string}
370
+ */
282
371
  export function normalizeTypographicJank(text) {
283
372
  return text
284
373
  .replace(SMART_SINGLE_QUOTES_REGEX, "'")
@@ -292,7 +381,7 @@ export function normalizeTypographicJank(text) {
292
381
  }
293
382
 
294
383
 
295
-
384
+ /** @type {RegExp} */
296
385
  export let EMOJI_REGEX;
297
386
 
298
387
  /**