text-sanctifier 1.0.16 → 1.0.18
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +28 -7
- package/dist/text-sanctifier.min.js +9 -8
- package/dist/types/index.d.ts +4 -0
- package/dist/types/inspectText.d.ts +38 -0
- package/dist/types/sanctifyText.d.ts +167 -0
- package/package.json +20 -12
- package/src/index.d.ts +4 -116
- package/src/inspectText.js +78 -42
- package/src/sanctifyText.js +112 -23
package/README.md
CHANGED
|
@@ -5,10 +5,10 @@
|
|
|
5
5
|
[](https://www.npmjs.com/package/text-sanctifier)
|
|
6
6
|
[](https://github.com/iWhatty/text-sanctifier)
|
|
7
7
|
|
|
8
|
-
Brutal text normalizer and invisible
|
|
8
|
+
Brutal text normalizer and invisible Unicode scrubber for modern web projects.
|
|
9
9
|
|
|
10
|
-
* Minified: (3.
|
|
11
|
-
* Gzipped (GCC): (1.
|
|
10
|
+
* Minified: (3.70 KB)
|
|
11
|
+
* Gzipped (GCC): (1.66 KB)
|
|
12
12
|
|
|
13
13
|
## Features
|
|
14
14
|
|
|
@@ -17,9 +17,20 @@ Brutal text normalizer and invisible trash scrubber for modern web projects.
|
|
|
17
17
|
* Collapses unwanted spaces and paragraphs
|
|
18
18
|
* Nukes control characters (if enabled)
|
|
19
19
|
* Smart normalization of typographic junk (quotes, dashes, bullets, full-width punctuation)
|
|
20
|
-
* Keyboard-only filtering (retain printable ASCII + emoji
|
|
20
|
+
* Keyboard-only filtering (retain printable ASCII + full emoji sequences)
|
|
21
|
+
|
|
22
|
+
* Preserves ZWJ emoji clusters (👨‍👩‍👧‍👦)
|
|
23
|
+
* Preserves VS16 emoji presentation variants (โ๏ธ, โผ๏ธ)
|
|
21
24
|
* Configurable via fine-grained flags or ready-made presets
|
|
22
25
|
* Includes strict, loose, and keyboard-only modes
|
|
26
|
+
* Deterministic RegExp usage (no global `lastIndex` state leaks)
|
|
27
|
+
|
|
28
|
+
## Security notes
|
|
29
|
+
|
|
30
|
+
- **Not an HTML/XSS sanitizer.** This library normalizes and filters plain text.
|
|
31
|
+
- If you need to render **untrusted content**, render it as text (e.g. `textContent`), not HTML (`innerHTML`).
|
|
32
|
+
- If you need to sanitize **HTML**, use a dedicated HTML sanitizer (e.g. DOMPurify / sanitize-html).
|
|
33
|
+
- Like any text-processing library, extremely large untrusted inputs can be used for CPU/DoS pressure; consider input size limits in high-risk environments.
|
|
23
34
|
|
|
24
35
|
## Install
|
|
25
36
|
|
|
@@ -27,6 +38,15 @@ Brutal text normalizer and invisible trash scrubber for modern web projects.
|
|
|
27
38
|
npm install text-sanctifier
|
|
28
39
|
```
|
|
29
40
|
|
|
41
|
+
## Runtime Requirements
|
|
42
|
+
|
|
43
|
+
Requires a modern JavaScript runtime with ES2020+ support.
|
|
44
|
+
|
|
45
|
+
* Node.js 14+
|
|
46
|
+
* Modern evergreen browsers
|
|
47
|
+
|
|
48
|
+
---
|
|
49
|
+
|
|
30
50
|
## 📦 Package & Build Info
|
|
31
51
|
|
|
32
52
|
* **Source (`src/`)**: ES2020+ ESM modules with JSDoc
|
|
@@ -109,7 +129,7 @@ const report = inspectText(input);
|
|
|
109
129
|
|
|
110
130
|
Use `inspectText` to preflight text content before rendering, storing, or linting. It's a diagnostic tool to help inform sanitization needs.
|
|
111
131
|
|
|
112
|
-
Pass the report to getRecommendedSanctifierOptions(report) to auto-generate config flags for summonSanctifier()
|
|
132
|
+
Pass the report to `getRecommendedSanctifierOptions(report)` to auto-generate config flags for `summonSanctifier()`.
|
|
113
133
|
|
|
114
134
|
---
|
|
115
135
|
|
|
@@ -133,7 +153,8 @@ Restricts to printable ASCII only (removes emojis).
|
|
|
133
153
|
|
|
134
154
|
### `summonSanctifier.keyboardOnlyEmoji`
|
|
135
155
|
|
|
136
|
-
Restricts to
|
|
156
|
+
Restricts to printable ASCII + full emoji sequences.
|
|
157
|
+
Preserves ZWJ emoji clusters and emoji presentation variants.
|
|
137
158
|
|
|
138
159
|
### `inspectText(text: string): UnicodeTrashReport`
|
|
139
160
|
|
|
@@ -143,4 +164,4 @@ Returns a structural report of control codes, invisible chars, newline styles, a
|
|
|
143
164
|
|
|
144
165
|
## License
|
|
145
166
|
|
|
146
|
-
|
|
167
|
+
--{DR.WATT v3.0}--
|
|
@@ -1,8 +1,9 @@
|
|
|
1
|
-
function g(a={}){const
|
|
2
|
-
|
|
3
|
-
function
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
function
|
|
8
|
-
|
|
1
|
+
/**
 * Builds a reusable sanitizer (exported as `summonSanctifier`) with every option pre-bound.
 *
 * Boolean options are coerced with `!!`. `maxLength` is honoured only when it is a finite,
 * non-negative number; anything else means "no cap" (`Infinity`). `onMaxLength` must be one
 * of 'throw' | 'truncate' | 'noop'; any other value falls back to 'throw'.
 *
 * @param {Object} [a={}] Option bag using the full, documented flag names.
 * @returns {(text: string) => string} Function that forwards its input to `l` with the bound options.
 */
function g(a = {}) {
  const purgeInvisibleChars = !!a.purgeInvisibleChars;
  const purgeEmojis = !!a.purgeEmojis;
  const nukeControls = !!a.nukeControls;
  const keyboardOnlyFilter = !!a.keyboardOnlyFilter;
  const normalizeNewlines = !!a.normalizeNewlines;
  const trimSpacesAroundNewlines = !!a.trimSpacesAroundNewlines;
  const collapseNewLines = !!a.collapseNewLines;
  const preserveParagraphs = !!a.preserveParagraphs;
  const collapseSpaces = !!a.collapseSpaces;
  const finalTrim = !!a.finalTrim;

  const maxLength =
    Number.isFinite(a.maxLength) && a.maxLength >= 0 ? a.maxLength : Infinity;

  // Read the option under its public name. The previous bundle looked at a
  // compiler-shortened key (`a.g`), so a caller-supplied `onMaxLength` was never
  // seen and the policy silently stayed 'throw'.
  const requestedPolicy = a.onMaxLength;
  const onMaxLength =
    requestedPolicy === 'truncate' || requestedPolicy === 'noop' || requestedPolicy === 'throw'
      ? requestedPolicy
      : 'throw';

  return (text) =>
    l(
      text,
      purgeInvisibleChars,
      purgeEmojis,
      nukeControls,
      keyboardOnlyFilter,
      normalizeNewlines,
      trimSpacesAroundNewlines,
      collapseNewLines,
      preserveParagraphs,
      collapseSpaces,
      finalTrim,
      maxLength,
      onMaxLength
    );
}

// Strict preset. Positional flags follow the same order as above:
// purgeInvisibleChars, purgeEmojis, nukeControls, keyboardOnlyFilter, normalizeNewlines,
// trimSpacesAroundNewlines, collapseNewLines, preserveParagraphs, collapseSpaces, finalTrim.
g.strict = (a) => l(a, true, true, true, false, true, true, true, false, true, true);
|
|
2
|
+
// Presets attached to `g`. Each one forwards to `l` with a fixed flag set, in the order
// purgeInvisibleChars, purgeEmojis, nukeControls, keyboardOnlyFilter, normalizeNewlines,
// trimSpacesAroundNewlines, collapseNewLines, preserveParagraphs, collapseSpaces, finalTrim.

// Loose: newline normalization/trimming, paragraph-preserving collapse, space collapse, trim.
g.loose = (text) =>
  l(text, false, false, false, false, true, true, true, true, true, true);

// Keyboard-only with emoji: keyboard filter on, emoji kept, no newline/space collapsing.
g.keyboardOnlyEmoji = (text) =>
  l(text, false, false, false, true, true, true, false, false, false, true);

// Keyboard-only: every purge enabled plus the keyboard filter; newlines and spaces collapsed.
g.keyboardOnly = (text) =>
  l(text, true, true, true, true, true, true, true, false, true, true);
|
|
3
|
+
/**
 * Core sanitizer (declared as `sanctifyText` in the typings). Every step is opt-in.
 *
 * Steps run in this fixed order: length guard, invisible-char purge, emoji purge,
 * control-char purge, keyboard-only filter, newline normalization, trimming of
 * spaces/tabs around newlines, newline collapsing, space collapsing, final trim.
 *
 * @param {string} a Text to clean.
 * @param {boolean} [purgeInvisible=false] Remove characters matched by `p`.
 * @param {boolean} [purgeEmojis=false] Remove characters matched by `q`.
 * @param {boolean} [nukeControls=false] Remove characters matched by `v`.
 * @param {boolean} [keyboardOnly=false] Apply `w` (printable ASCII, plus emoji unless purged).
 * @param {boolean} [normalizeNewlines=false] Convert CRLF / CR / LF to "\n".
 * @param {boolean} [trimAroundNewlines=false] Drop spaces and tabs adjacent to newline runs.
 * @param {boolean} [collapseNewlines=false] Collapse runs of newlines.
 * @param {boolean} [preserveParagraphs=false] When collapsing, keep "\n\n" for runs of three or more.
 * @param {boolean} [collapseSpaces=false] Collapse runs of two or more spaces into one.
 * @param {boolean} [finalTrim=false] `.trim()` the result.
 * @param {number} [maxLength=Infinity] Input cap in UTF-16 code units (`a.length`).
 * @param {'throw'|'truncate'|'noop'} [onMaxLength='throw'] What to do when the cap is exceeded.
 * @returns {string}
 * @throws {TypeError} If `a` is not a string.
 * @throws {RangeError} If the cap is exceeded and the policy is 'throw'.
 */
function l(
  a,
  purgeInvisible = false,
  purgeEmojis = false,
  nukeControls = false,
  keyboardOnly = false,
  normalizeNewlines = false,
  trimAroundNewlines = false,
  collapseNewlines = false,
  preserveParagraphs = false,
  collapseSpaces = false,
  finalTrim = false,
  maxLength = Infinity,
  onMaxLength = 'throw'
) {
  if (typeof a !== 'string') {
    throw new TypeError('sanctifyText expects a string input.');
  }

  // Unknown policies degrade to the safest one.
  const policy = onMaxLength === 'truncate' || onMaxLength === 'noop' ? onMaxLength : 'throw';

  let out = a;
  if (out.length > maxLength) {
    if (policy === 'noop') return out;
    if (policy === 'throw') {
      throw new RangeError(`sanctifyText input length ${out.length} exceeds maxLength ${maxLength}.`);
    }
    let head = out.slice(0, maxLength);
    // Do not cut an astral character in half: if the slice ends on a high surrogate
    // whose low surrogate was cut off, drop the orphan (result stays <= maxLength).
    const tail = head.charCodeAt(head.length - 1);
    const next = out.charCodeAt(head.length);
    if (tail >= 0xd800 && tail <= 0xdbff && next >= 0xdc00 && next <= 0xdfff) {
      head = head.slice(0, -1);
    }
    out = head;
  }

  if (purgeInvisible) out = out.replace(p, '');
  if (purgeEmojis) out = out.replace(q, '');
  if (nukeControls) out = out.replace(v, '');
  if (keyboardOnly) out = w(out, purgeEmojis);
  if (normalizeNewlines) out = out.replace(x, '\n');
  if (trimAroundNewlines) out = out.replace(y, '$1');
  if (collapseNewlines) {
    out = preserveParagraphs ? out.replace(A, '\n\n') : out.replace(B, '\n');
  }
  if (collapseSpaces) out = out.replace(C, ' ');

  return finalTrim ? out.trim() : out;
}

// Invisible characters: NBSP, U+2000-U+200D, U+202F, U+2060, U+3000, U+FEFF, bidi marks/overrides.
const p = /[\u00A0\u2000-\u200D\u202F\u2060\u3000\uFEFF\u200E\u200F\u202A-\u202E]+/g;
// Runs of anything that is not printable ASCII, CR or LF.
const D = /[^\x20-\x7E\r\n]+/gu;

/**
 * Keyboard-only filter: normalizes typographic punctuation via `E`, then drops every
 * run of non-ASCII characters, optionally keeping the emoji matches inside each run.
 *
 * @param {string} a Text to filter.
 * @param {boolean} [c=false] When true, emoji are dropped along with other non-ASCII text.
 * @returns {string}
 */
function w(a, c = false) {
  const normalized = E(a);
  if (c) return normalized.replace(D, '');
  return normalized.replace(D, (run) => {
    // `matchAll` starts from the regex's current `lastIndex`, so reset it first.
    q.lastIndex = 0;
    let kept = '';
    for (const match of run.matchAll(q)) kept += match[0];
    return kept;
  });
}
|
|
5
|
+
// Typographic look-alikes that `E` folds down to plain ASCII.
const F = /[\u2018\u2019\u201A\u201B\u2032\u2035]/g; // curly single quotes / primes
const G = /[\u201C\u201D\u201E\u201F\u2033\u2036\u00AB\u00BB]/g; // curly double quotes / guillemets
const H = /[\u2012\u2013\u2014\u2015\u2212]/g; // dashes and the minus sign
const I = /\u2026/g; // horizontal ellipsis
const J = /[\u2022\u00B7]/g; // bullet and middle dot
const K = /[\uFF01-\uFF5E]/g; // full-width ASCII forms

/**
 * Replaces typographic Unicode punctuation with ASCII equivalents.
 *
 * @param {string} a
 * @returns {string}
 */
function E(a) {
  const substitutions = [
    [F, "'"],
    [G, '"'],
    [H, '-'],
    [I, '...'],
    [J, '*'],
  ];
  let result = a;
  for (const [pattern, ascii] of substitutions) {
    result = result.replace(pattern, ascii);
  }
  // Full-width forms sit exactly 0xFEE0 (65248) above their ASCII counterparts.
  return result.replace(K, (wide) => String.fromCharCode(wide.charCodeAt(0) - 65248));
}

// Emoji matcher. Prefer the Unicode property escape; fall back to explicit ranges
// when the runtime rejects the pattern.
let q;
try {
  q = new RegExp(
    "(?:\\p{Extended_Pictographic}(?:\\uFE0F|\\uFE0E)?(?:\\u200D(?:\\p{Extended_Pictographic}|\\w)+)*)",
    "gu"
  );
} catch {
  q = /[\u{2700}-\u{27BF}\u{1F300}-\u{1FAFF}\u{1F3FB}-\u{1F3FF}\u200D\uFE0F\u{1F1E6}-\u{1F1FF}]/gu;
}
|
|
6
|
+
// Any single newline sequence: CRLF, lone CR, or LF.
const x = /\r\n|\r|\n/g;
// A run of newlines together with the spaces/tabs hugging it (newlines captured as $1).
const y = /[ \t]*(\n+)[ \t]*/g;
// Two or more consecutive newlines.
const B = /\n{2,}/g;
// Three or more consecutive newlines.
const A = /\n{3,}/g;
// Two or more consecutive spaces.
const C = / {2,}/g;
// C0 controls except TAB/LF/CR, DEL, C1 controls, and bidi marks/overrides.
const v = /[\u0000-\u0008\u000B\u000C\u000E-\u001F\u007F\u0080-\u009F\u200E\u200F\u202A-\u202E]+/g;

/**
 * `.test()` that is safe for global/sticky regexes: rewinds `lastIndex` first so
 * a previous match cannot influence this one.
 *
 * @param {RegExp} a
 * @param {string} c
 * @returns {boolean}
 */
function L(a, c) {
  a.lastIndex = 0;
  const matched = a.test(c);
  return matched;
}
|
|
7
|
+
function M(a){if("string"!==typeof a)throw new TypeError("inspectText expects a string input.");const c=[],b={hasControlChars:!1,hasInvisibleChars:!1,hasMixedNewlines:!1,newlineStyle:null,hasEmojis:!1,hasNonKeyboardChars:!1,summary:c};L(v,a)&&(b.hasControlChars=!0,c.push("Control characters detected."));L(p,a)&&(b.hasInvisibleChars=!0,c.push("Invisible Unicode characters detected."));L(q,a)&&(b.hasEmojis=!0,c.push("Emojis detected."));if("string"!==typeof a)throw new TypeError("getNewlineStats expects a string input.");
|
|
8
|
+
var d=a.replace(/\r\n/g,"");d={i:(a.match(/\r\n/g)||[]).length,h:(d.match(/\r/g)||[]).length,j:(d.match(/\n/g)||[]).length};const f=[];0<d.i&&f.push("CRLF");0<d.h&&f.push("CR");0<d.j&&f.push("LF");const {l:h,types:m}={...d,types:f,l:1<f.length};b.hasMixedNewlines=h;b.newlineStyle=h?"Mixed":m[0]||null;b.newlineStyle&&c.push(h?"Mixed newline styles detected.":`Consistent newline style: ${b.newlineStyle}`);a:{a=E(a);if(L(D,a))for(e of a)if(!("\n"===e||"\r"===e||" "<=e&&"~">=e||L(q,e))){var e=!0;break a}e=
|
|
9
|
+
!1}e&&(b.hasNonKeyboardChars=!0,c.push("Non-keyboard characters detected."));return b}function N(a){return{purgeInvisibleChars:a.hasInvisibleChars,purgeEmojis:a.hasEmojis,nukeControls:a.hasControlChars,keyboardOnlyFilter:a.hasNonKeyboardChars,normalizeNewlines:a.hasMixedNewlines||"CRLF"===a.newlineStyle||"CR"===a.newlineStyle}}export { g as summonSanctifier, M as inspectText, N as getRecommendedSanctifierOptions };
|
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Detects textual "trash" or anomalies in a given string.
|
|
3
|
+
* @param {string} text @returns {UnicodeTrashReport}
|
|
4
|
+
*/
|
|
5
|
+
export function inspectText(text: string): UnicodeTrashReport;
|
|
6
|
+
/**
|
|
7
|
+
* Counts the number of different newline types in a string.
|
|
8
|
+
* @param {string} text
|
|
9
|
+
* @returns {{
|
|
10
|
+
* crlf: number,
|
|
11
|
+
* cr: number,
|
|
12
|
+
* lf: number,
|
|
13
|
+
* types: string[],
|
|
14
|
+
* mixed: boolean
|
|
15
|
+
* }}
|
|
16
|
+
*/
|
|
17
|
+
export function getNewlineStats(text: string): {
|
|
18
|
+
crlf: number;
|
|
19
|
+
cr: number;
|
|
20
|
+
lf: number;
|
|
21
|
+
types: string[];
|
|
22
|
+
mixed: boolean;
|
|
23
|
+
};
|
|
24
|
+
/**
|
|
25
|
+
* Creates defaultOptions for summonSanctifier based on inspectText result
|
|
26
|
+
* @param {!UnicodeTrashReport} report
|
|
27
|
+
* @return {!SanctifyOptions}
|
|
28
|
+
*/
|
|
29
|
+
export function getRecommendedSanctifierOptions(report: UnicodeTrashReport): SanctifyOptions;
|
|
30
|
+
export type UnicodeTrashReport = {
|
|
31
|
+
hasControlChars: boolean;
|
|
32
|
+
hasInvisibleChars: boolean;
|
|
33
|
+
hasMixedNewlines: boolean;
|
|
34
|
+
newlineStyle: "LF" | "CRLF" | "CR" | "Mixed" | null;
|
|
35
|
+
hasEmojis: boolean;
|
|
36
|
+
hasNonKeyboardChars: boolean;
|
|
37
|
+
summary: string[];
|
|
38
|
+
};
|
|
@@ -0,0 +1,167 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* @typedef {Object} SanctifyOptions
|
|
3
|
+
* @property {boolean} [purgeInvisibleChars]
|
|
4
|
+
* @property {boolean} [purgeEmojis]
|
|
5
|
+
* @property {boolean} [nukeControls]
|
|
6
|
+
* @property {boolean} [keyboardOnlyFilter]
|
|
7
|
+
* @property {boolean} [normalizeNewlines]
|
|
8
|
+
* @property {boolean} [trimSpacesAroundNewlines]
|
|
9
|
+
* @property {boolean} [collapseNewLines]
|
|
10
|
+
* @property {boolean} [preserveParagraphs]
|
|
11
|
+
* @property {boolean} [collapseSpaces]
|
|
12
|
+
* @property {boolean} [finalTrim]
|
|
13
|
+
* @property {number} [maxLength]
|
|
14
|
+
* @property {'throw'|'truncate'|'noop'} [onMaxLength]
|
|
15
|
+
*/
|
|
16
|
+
/**
|
|
17
|
+
* Summons a customized sanctifier function with pre-bound booleans.
|
|
18
|
+
*
|
|
19
|
+
* Accepts full flag names and returns a text-cleaning function.
|
|
20
|
+
*
|
|
21
|
+
* @param {Object} [defaultOptions={}]
|
|
22
|
+
* @param {boolean} [defaultOptions.purgeInvisibleChars]
|
|
23
|
+
* @param {boolean} [defaultOptions.purgeEmojis]
|
|
24
|
+
* @param {boolean} [defaultOptions.nukeControls]
|
|
25
|
+
* @param {boolean} [defaultOptions.keyboardOnlyFilter]
|
|
26
|
+
* @param {boolean} [defaultOptions.normalizeNewlines]
|
|
27
|
+
* @param {boolean} [defaultOptions.trimSpacesAroundNewlines]
|
|
28
|
+
* @param {boolean} [defaultOptions.collapseNewLines]
|
|
29
|
+
* @param {boolean} [defaultOptions.preserveParagraphs]
|
|
30
|
+
* @param {boolean} [defaultOptions.collapseSpaces]
|
|
31
|
+
* @param {boolean} [defaultOptions.finalTrim]
|
|
32
|
+
* @param {number} [defaultOptions.maxLength=Infinity] - Hard input length cap (UTF-16 code units).
|
|
33
|
+
* @param {'throw' | 'truncate' | 'noop'} [defaultOptions.onMaxLength='throw'] - Behavior when input exceeds maxLength.
|
|
34
|
+
* @returns {(text: string) => string}
|
|
35
|
+
*/
|
|
36
|
+
export function summonSanctifier(defaultOptions?: {
|
|
37
|
+
purgeInvisibleChars?: boolean;
|
|
38
|
+
purgeEmojis?: boolean;
|
|
39
|
+
nukeControls?: boolean;
|
|
40
|
+
keyboardOnlyFilter?: boolean;
|
|
41
|
+
normalizeNewlines?: boolean;
|
|
42
|
+
trimSpacesAroundNewlines?: boolean;
|
|
43
|
+
collapseNewLines?: boolean;
|
|
44
|
+
preserveParagraphs?: boolean;
|
|
45
|
+
collapseSpaces?: boolean;
|
|
46
|
+
finalTrim?: boolean;
|
|
47
|
+
maxLength?: number;
|
|
48
|
+
onMaxLength?: "throw" | "truncate" | "noop";
|
|
49
|
+
}): (text: string) => string;
|
|
50
|
+
export namespace summonSanctifier {
|
|
51
|
+
/**
|
|
52
|
+
* Strict sanitizer:
|
|
53
|
+
* - Purge emojis
|
|
54
|
+
* - Collapse all newlines
|
|
55
|
+
* - Collapse spaces
|
|
56
|
+
* - Nuke control characters
|
|
57
|
+
*/
|
|
58
|
+
/** @param {string} text @returns {string} */
|
|
59
|
+
function strict(text: string): string;
|
|
60
|
+
/**
|
|
61
|
+
* Loose sanitizer:
|
|
62
|
+
* - Collapse spaces
|
|
63
|
+
* - Preserve paragraphs
|
|
64
|
+
* - Normalize newlines
|
|
65
|
+
*/
|
|
66
|
+
/** @param {string} text @returns {string} */
|
|
67
|
+
function loose(text: string): string;
|
|
68
|
+
/**
|
|
69
|
+
* Keyboard-only (with emojis):
|
|
70
|
+
* - Keeps emojis and printable ASCII
|
|
71
|
+
* - Strips non-standard characters
|
|
72
|
+
* - Normalizes typographic trash
|
|
73
|
+
*/
|
|
74
|
+
/** @param {string} text @returns {string} */
|
|
75
|
+
function keyboardOnlyEmoji(text: string): string;
|
|
76
|
+
/**
|
|
77
|
+
* Keyboard-only (strict):
|
|
78
|
+
* - Removes emojis
|
|
79
|
+
* - Collapses all whitespace
|
|
80
|
+
* - Restricts to printable ASCII only
|
|
81
|
+
*/
|
|
82
|
+
/** @param {string} text @returns {string} */
|
|
83
|
+
function keyboardOnly(text: string): string;
|
|
84
|
+
}
|
|
85
|
+
/**
|
|
86
|
+
* Text Sanctifier
|
|
87
|
+
*
|
|
88
|
+
* Brutal text normalizer and invisible trash scrubber,
|
|
89
|
+
* configurable to kill whatever ghosts you want dead.
|
|
90
|
+
*
|
|
91
|
+
* ⚠️ Note: This is plain-text normalization/filtering — not an HTML/XSS sanitizer.
|
|
92
|
+
*
|
|
93
|
+
* @param {string} text
|
|
94
|
+
* @param {boolean} [purgeInvisibleChars=false] - Remove ZWSP, NBSP, bidi, etc.
|
|
95
|
+
* @param {boolean} [purgeEmojis=false] - Remove emoji characters entirely.
|
|
96
|
+
* @param {boolean} [nukeControls=false] - Remove non-whitespace control characters.
|
|
97
|
+
* @param {boolean} [keyboardOnlyFilter=false] - Keep printable ASCII + full emoji sequences only (drops other Unicode).
|
|
98
|
+
* @param {boolean} [normalizeNewlines=false] - Convert all newlines to `\n`.
|
|
99
|
+
* @param {boolean} [trimSpacesAroundNewlines=false] - Remove spaces/tabs around newlines.
|
|
100
|
+
* @param {boolean} [collapseNewLines=false] - Collapse `\n` runs (optionally preserve paragraphs).
|
|
101
|
+
* @param {boolean} [preserveParagraphs=false] - Preserve paragraph breaks when collapsing newlines.
|
|
102
|
+
* @param {boolean} [collapseSpaces=false] - Collapse multiple spaces into one.
|
|
103
|
+
* @param {boolean} [finalTrim=false] - `.trim()` the final output (head/tail).
|
|
104
|
+
* @param {number} [maxLength=Infinity] - Hard input length cap (UTF-16 code units via `text.length`).
|
|
105
|
+
* @param {'throw'|'truncate'|'noop'} [onMaxLength='throw'] - Behavior when `text.length` exceeds `maxLength`:
|
|
106
|
+
* - `'throw'`: throw a `RangeError`
|
|
107
|
+
* - `'truncate'`: slice to `maxLength` before processing
|
|
108
|
+
* - `'noop'`: return the original input unchanged (skips processing)
|
|
109
|
+
* @returns {string}
|
|
110
|
+
* @throws {TypeError} If `text` is not a string.
|
|
111
|
+
* @throws {RangeError} If input exceeds `maxLength` and `onMaxLength` is `'throw'`.
|
|
112
|
+
*/
|
|
113
|
+
export function sanctifyText(text: string, purgeInvisibleChars?: boolean, purgeEmojis?: boolean, nukeControls?: boolean, keyboardOnlyFilter?: boolean, normalizeNewlines?: boolean, trimSpacesAroundNewlines?: boolean, collapseNewLines?: boolean, preserveParagraphs?: boolean, collapseSpaces?: boolean, finalTrim?: boolean, maxLength?: number, onMaxLength?: "throw" | "truncate" | "noop"): string;
|
|
114
|
+
/**
|
|
115
|
+
* Normalizes typographic Unicode punctuation into ASCII equivalents.
|
|
116
|
+
* @param {string} text
|
|
117
|
+
* @returns {string}
|
|
118
|
+
*/
|
|
119
|
+
export function normalizeTypographicJank(text: string): string;
|
|
120
|
+
/**
|
|
121
|
+
* Purges invisible Unicode "trash" characters and removes them.
|
|
122
|
+
*
|
|
123
|
+
* Targets:
|
|
124
|
+
* - Non-breaking spaces (\u00A0)
|
|
125
|
+
* - Zero-width spaces and miscellaneous Unicode spaces (\u2000–\u200D, \u202F, \u2060, \u3000, \uFEFF)
|
|
126
|
+
* - Left-to-right/right-to-left markers and overrides (\u200E, \u200F, \u202A–\u202E)
|
|
127
|
+
*
|
|
128
|
+
* @param {string} text
|
|
129
|
+
* @returns {string}
|
|
130
|
+
*/
|
|
131
|
+
export const INVISIBLE_TRASH_REGEX: RegExp;
|
|
132
|
+
/**
|
|
133
|
+
* Matches any character that is NOT:
|
|
134
|
+
* - Printable ASCII (U+0020–U+007E)
|
|
135
|
+
* - Newline characters: \n (LF), \r (CR)
|
|
136
|
+
* This allows for \n, \r, and \r\n line endings.
|
|
137
|
+
*/
|
|
138
|
+
export const ASCII_KEYBOARD_SAFE_REGEX: RegExp;
|
|
139
|
+
/** @type {RegExp} */
|
|
140
|
+
export let EMOJI_REGEX: RegExp;
|
|
141
|
+
/**
|
|
142
|
+
* Nukes hidden control characters that are invisible and often dangerous.
|
|
143
|
+
* (Excludes necessary whitespace like \n and \t.)
|
|
144
|
+
*
|
|
145
|
+
* Control characters nuked:
|
|
146
|
+
* - ASCII control range (0x00-0x1F, 0x7F)
|
|
147
|
+
* - Unicode control range (0x80-0x9F)
|
|
148
|
+
* - RTL/LTR markers (U+200E, U+200F, U+202A–U+202E)
|
|
149
|
+
*
|
|
150
|
+
* @param {string} text
|
|
151
|
+
* @returns {string}
|
|
152
|
+
*/
|
|
153
|
+
export const CONTROL_CHARS_REGEX: RegExp;
|
|
154
|
+
export type SanctifyOptions = {
|
|
155
|
+
purgeInvisibleChars?: boolean;
|
|
156
|
+
purgeEmojis?: boolean;
|
|
157
|
+
nukeControls?: boolean;
|
|
158
|
+
keyboardOnlyFilter?: boolean;
|
|
159
|
+
normalizeNewlines?: boolean;
|
|
160
|
+
trimSpacesAroundNewlines?: boolean;
|
|
161
|
+
collapseNewLines?: boolean;
|
|
162
|
+
preserveParagraphs?: boolean;
|
|
163
|
+
collapseSpaces?: boolean;
|
|
164
|
+
finalTrim?: boolean;
|
|
165
|
+
maxLength?: number;
|
|
166
|
+
onMaxLength?: "throw" | "truncate" | "noop";
|
|
167
|
+
};
|
package/package.json
CHANGED
|
@@ -1,27 +1,35 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "text-sanctifier",
|
|
3
|
-
"version": "1.0.
|
|
3
|
+
"version": "1.0.18",
|
|
4
4
|
"type": "module",
|
|
5
5
|
"description": "A brutal text normalizer and invisible trash scrubber for modern web projects.",
|
|
6
6
|
"main": "./src/index.js",
|
|
7
|
-
"module":
|
|
7
|
+
"module": "./src/index.js",
|
|
8
8
|
"browser": "./dist/text-sanctifier.min.js",
|
|
9
9
|
"files": [
|
|
10
10
|
"src",
|
|
11
11
|
"dist/text-sanctifier.min.js",
|
|
12
|
+
"dist/types",
|
|
12
13
|
"LICENSE.md",
|
|
13
14
|
"README.md"
|
|
14
15
|
],
|
|
15
16
|
"exports": {
|
|
16
|
-
".":
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
17
|
+
".": {
|
|
18
|
+
"types": "./dist/types/index.d.ts",
|
|
19
|
+
"default": "./src/index.js"
|
|
20
|
+
},
|
|
21
|
+
"./browser": {
|
|
22
|
+
"default": "./dist/text-sanctifier.min.js"
|
|
23
|
+
}
|
|
24
|
+
},
|
|
25
|
+
"types": "./dist/types/index.d.ts",
|
|
20
26
|
"sideEffects": false,
|
|
21
27
|
"scripts": {
|
|
22
28
|
"build": "node scripts/buildc.js",
|
|
29
|
+
"build:types": "tsc -p tsconfig.types.json",
|
|
23
30
|
"test": "node tests/sanctifyText.test.js",
|
|
24
|
-
"test-min": "node tests/sanctifyText.test.min.js"
|
|
31
|
+
"test-min": "node tests/sanctifyText.test.min.js",
|
|
32
|
+
"ci": "npm run test && npm run build && npm run build:types && npm run test-min"
|
|
25
33
|
},
|
|
26
34
|
"keywords": [
|
|
27
35
|
"text",
|
|
@@ -34,7 +42,7 @@
|
|
|
34
42
|
"sanctify"
|
|
35
43
|
],
|
|
36
44
|
"author": "๐พDr.Watt๐พ <WATT3D@protonmail.com>",
|
|
37
|
-
"license":
|
|
45
|
+
"license": "๐พDr.Watt๐พ License v3.0",
|
|
38
46
|
"repository": {
|
|
39
47
|
"type": "git",
|
|
40
48
|
"url": "git+https://github.com/iWhatty/text-sanctifier.git"
|
|
@@ -47,9 +55,9 @@
|
|
|
47
55
|
"test": "tests"
|
|
48
56
|
},
|
|
49
57
|
"devDependencies": {
|
|
50
|
-
"terser": "^5.39.0",
|
|
51
58
|
"esbuild": "^0.25.3",
|
|
52
|
-
"google-closure-compiler": "^20240317.0.0"
|
|
53
|
-
|
|
59
|
+
"google-closure-compiler": "^20240317.0.0",
|
|
60
|
+
"terser": "^5.39.0",
|
|
61
|
+
"typescript": "^5.9.3"
|
|
54
62
|
}
|
|
55
|
-
}
|
|
63
|
+
}
|
package/src/index.d.ts
CHANGED
|
@@ -1,116 +1,4 @@
|
|
|
1
|
-
|
|
2
|
-
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
purgeInvisibleChars?: boolean;
|
|
6
|
-
|
|
7
|
-
/** Remove emoji characters */
|
|
8
|
-
purgeEmojis?: boolean;
|
|
9
|
-
|
|
10
|
-
/** Nuke hidden control characters (excluding whitespace like \n and \t) */
|
|
11
|
-
nukeControls?: boolean;
|
|
12
|
-
|
|
13
|
-
/** Restrict to printable ASCII (+ emoji if `purgeEmojis` is false) */
|
|
14
|
-
keyboardOnlyFilter?: boolean;
|
|
15
|
-
|
|
16
|
-
/** Normalize all newline sequences to LF (`\n`) */
|
|
17
|
-
normalizeNewlines?: boolean;
|
|
18
|
-
|
|
19
|
-
/** Remove tabs and spaces before/after newline characters */
|
|
20
|
-
trimSpacesAroundNewlines?: boolean;
|
|
21
|
-
|
|
22
|
-
/** Collapse multiple consecutive newlines */
|
|
23
|
-
collapseNewLines?: boolean;
|
|
24
|
-
|
|
25
|
-
/** When collapsing newlines, preserve paragraph breaks as double `\n\n` */
|
|
26
|
-
preserveParagraphs?: boolean;
|
|
27
|
-
|
|
28
|
-
/** Collapse multiple spaces into a single space */
|
|
29
|
-
collapseSpaces?: boolean;
|
|
30
|
-
|
|
31
|
-
/** Trim leading and trailing whitespace from final result */
|
|
32
|
-
finalTrim?: boolean;
|
|
33
|
-
}
|
|
34
|
-
|
|
35
|
-
/** Preconfigured sanitizer function */
|
|
36
|
-
export type Sanctifier = (text: string) => string;
|
|
37
|
-
|
|
38
|
-
/**
|
|
39
|
-
* Summon a reusable text sanitizer.
|
|
40
|
-
*/
|
|
41
|
-
export function summonSanctifier(
|
|
42
|
-
defaultOptions?: SanctifyOptions,
|
|
43
|
-
): Sanctifier;
|
|
44
|
-
|
|
45
|
-
/**
|
|
46
|
-
* Strict sanitizer preset:
|
|
47
|
-
* - Collapse spaces
|
|
48
|
-
* - Collapse all newlines
|
|
49
|
-
* - Nuke control characters
|
|
50
|
-
* - Purge emojis
|
|
51
|
-
*/
|
|
52
|
-
export namespace summonSanctifier {
|
|
53
|
-
const strict: Sanctifier;
|
|
54
|
-
const loose: Sanctifier;
|
|
55
|
-
|
|
56
|
-
/**
|
|
57
|
-
* Keeps printable ASCII and emoji.
|
|
58
|
-
* Leaves spacing soft and preserves emoji.
|
|
59
|
-
*/
|
|
60
|
-
const keyboardOnlyEmoji: Sanctifier;
|
|
61
|
-
|
|
62
|
-
/**
|
|
63
|
-
* Keeps printable ASCII only.
|
|
64
|
-
* Collapses whitespace and purges emoji.
|
|
65
|
-
*/
|
|
66
|
-
const keyboardOnly: Sanctifier;
|
|
67
|
-
}
|
|
68
|
-
|
|
69
|
-
/**
|
|
70
|
-
* Brutally normalizes and cleans a string of text.
|
|
71
|
-
*/
|
|
72
|
-
export function sanctifyText(
|
|
73
|
-
text: string,
|
|
74
|
-
purgeInvisibleChars?: boolean,
|
|
75
|
-
purgeEmojis?: boolean,
|
|
76
|
-
nukeControls?: boolean,
|
|
77
|
-
keyboardOnlyFilter?: boolean,
|
|
78
|
-
normalizeNewlines?: boolean,
|
|
79
|
-
trimSpacesAroundNewlines?: boolean,
|
|
80
|
-
collapseNewLines?: boolean,
|
|
81
|
-
preserveParagraphs?: boolean,
|
|
82
|
-
collapseSpaces?: boolean,
|
|
83
|
-
finalTrim?: boolean,
|
|
84
|
-
): string;
|
|
85
|
-
|
|
86
|
-
/** Style of newline characters detected in a string */
|
|
87
|
-
export type NewlineStyle = 'LF' | 'CRLF' | 'CR' | 'Mixed' | null;
|
|
88
|
-
|
|
89
|
-
/**
|
|
90
|
-
* A structural report of anomalies found in text.
|
|
91
|
-
*/
|
|
92
|
-
export interface UnicodeTrashReport {
|
|
93
|
-
hasControlChars: boolean;
|
|
94
|
-
hasInvisibleChars: boolean;
|
|
95
|
-
hasMixedNewlines: boolean;
|
|
96
|
-
newlineStyle: NewlineStyle;
|
|
97
|
-
hasEmojis: boolean;
|
|
98
|
-
hasNonKeyboardChars: boolean;
|
|
99
|
-
summary: string[];
|
|
100
|
-
}
|
|
101
|
-
|
|
102
|
-
/**
|
|
103
|
-
* Analyze a string and return a report of Unicode/control character issues,
|
|
104
|
-
* invisible characters, newline styles, emojis, and more.
|
|
105
|
-
*/
|
|
106
|
-
export function inspectText(text: string): UnicodeTrashReport;
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
/**
|
|
110
|
-
* Creates a recommended set of `summonSanctifier` options based on the findings
|
|
111
|
-
* of `inspectText()`. This maps only what can be inferred automatically —
|
|
112
|
-
* user-preference settings like whitespace collapsing are left unset.
|
|
113
|
-
*/
|
|
114
|
-
export function getRecommendedSanctifierOptions(
|
|
115
|
-
report: UnicodeTrashReport
|
|
116
|
-
): SanctifyOptions;
|
|
1
|
+
import { inspectText } from './inspectText.js';
|
|
2
|
+
import { getRecommendedSanctifierOptions } from './inspectText.js';
|
|
3
|
+
import { summonSanctifier } from './sanctifyText.js';
|
|
4
|
+
export { inspectText, getRecommendedSanctifierOptions, summonSanctifier };
|
package/src/inspectText.js
CHANGED
|
@@ -1,30 +1,74 @@
|
|
|
1
|
-
|
|
2
1
|
// ./src/inspectText.js
|
|
3
2
|
|
|
4
|
-
|
|
5
3
|
import {
|
|
6
4
|
CONTROL_CHARS_REGEX,
|
|
7
5
|
INVISIBLE_TRASH_REGEX,
|
|
8
6
|
EMOJI_REGEX,
|
|
9
7
|
ASCII_KEYBOARD_SAFE_REGEX,
|
|
10
|
-
normalizeTypographicJank
|
|
8
|
+
normalizeTypographicJank,
|
|
11
9
|
} from './sanctifyText.js';
|
|
12
10
|
|
|
13
11
|
|
|
12
|
+
/**
|
|
13
|
+
* @typedef {Object} UnicodeTrashReport
|
|
14
|
+
* @property {boolean} hasControlChars
|
|
15
|
+
* @property {boolean} hasInvisibleChars
|
|
16
|
+
* @property {boolean} hasMixedNewlines
|
|
17
|
+
* @property {'LF'|'CRLF'|'CR'|'Mixed'|null} newlineStyle
|
|
18
|
+
* @property {boolean} hasEmojis
|
|
19
|
+
* @property {boolean} hasNonKeyboardChars
|
|
20
|
+
* @property {string[]} summary
|
|
21
|
+
*/
|
|
22
|
+
|
|
14
23
|
|
|
15
24
|
/**
 * Safe `.test()` for global/sticky regexes.
 * Global regexes mutate `lastIndex`, which makes `.test()` unreliable across calls,
 * so the index is rewound before every test.
 * @param {RegExp} re
 * @param {string} s
 * @returns {boolean}
 */
function stableTest(re, s) {
  re.lastIndex = 0;
  return re.test(s);
}

/**
 * Returns true if text contains any non-keyboard characters (excluding emojis),
 * after typographic normalization.
 *
 * "Keyboard" here means:
 *  - Printable ASCII (0x20-0x7E)
 *  - CR/LF
 *  - Emojis (per EMOJI_REGEX), including multi-code-point sequences
 *
 * @param {string} text
 * @returns {boolean}
 */
function hasNonKeyboardCharsExcludingEmoji(text) {
  const normalized = normalizeTypographicJank(text);

  // Fast path: no non-ascii runs => no non-keyboard chars
  if (!stableTest(ASCII_KEYBOARD_SAFE_REGEX, normalized)) return false;

  // Strip whole emoji sequences before looking for leftovers. Testing one code point
  // at a time rejected the U+200D / U+FE0F parts of a sequence, which flagged
  // emoji-only text as non-keyboard. (`.replace` rewinds `lastIndex` on its own for
  // a global regex; EMOJI_REGEX is used as a global regex throughout this module.)
  const withoutEmoji = normalized.replace(EMOJI_REGEX, '');

  // anything non-ascii that survives is "non-keyboard"
  return stableTest(ASCII_KEYBOARD_SAFE_REGEX, withoutEmoji);
}
|
|
67
|
+
|
|
68
|
+
/**
|
|
69
|
+
* Detects textual "trash" or anomalies in a given string.
|
|
70
|
+
* @param {string} text @returns {UnicodeTrashReport}
|
|
71
|
+
*/
|
|
28
72
|
export function inspectText(text) {
|
|
29
73
|
if (typeof text !== 'string') {
|
|
30
74
|
throw new TypeError('inspectText expects a string input.');
|
|
@@ -38,7 +82,7 @@ export function inspectText(text) {
|
|
|
38
82
|
newlineStyle: null,
|
|
39
83
|
hasEmojis: false,
|
|
40
84
|
hasNonKeyboardChars: false,
|
|
41
|
-
summary
|
|
85
|
+
summary,
|
|
42
86
|
};
|
|
43
87
|
|
|
44
88
|
const flag = (condition, key, message) => {
|
|
@@ -49,9 +93,9 @@ export function inspectText(text) {
|
|
|
49
93
|
};
|
|
50
94
|
|
|
51
95
|
// === Pattern Checks ===
|
|
52
|
-
flag(CONTROL_CHARS_REGEX
|
|
53
|
-
flag(INVISIBLE_TRASH_REGEX
|
|
54
|
-
flag(EMOJI_REGEX
|
|
96
|
+
flag(stableTest(CONTROL_CHARS_REGEX, text), 'hasControlChars', 'Control characters detected.');
|
|
97
|
+
flag(stableTest(INVISIBLE_TRASH_REGEX, text), 'hasInvisibleChars', 'Invisible Unicode characters detected.');
|
|
98
|
+
flag(stableTest(EMOJI_REGEX, text), 'hasEmojis', 'Emojis detected.');
|
|
55
99
|
|
|
56
100
|
// === Newline Analysis ===
|
|
57
101
|
const { mixed, types } = getNewlineStats(text);
|
|
@@ -60,33 +104,31 @@ export function inspectText(text) {
|
|
|
60
104
|
|
|
61
105
|
if (report.newlineStyle) {
|
|
62
106
|
summary.push(
|
|
63
|
-
mixed
|
|
64
|
-
? 'Mixed newline styles detected.'
|
|
65
|
-
: `Consistent newline style: ${report.newlineStyle}`
|
|
107
|
+
mixed ? 'Mixed newline styles detected.' : `Consistent newline style: ${report.newlineStyle}`
|
|
66
108
|
);
|
|
67
109
|
}
|
|
68
110
|
|
|
69
111
|
// === Non-keyboard characters (excluding emojis) ===
|
|
70
|
-
|
|
71
|
-
|
|
112
|
+
flag(
|
|
113
|
+
hasNonKeyboardCharsExcludingEmoji(text),
|
|
114
|
+
'hasNonKeyboardChars',
|
|
115
|
+
'Non-keyboard characters detected.'
|
|
72
116
|
);
|
|
73
|
-
flag(/[โ]/.test(filtered), 'hasNonKeyboardChars', 'Non-keyboard characters detected.');
|
|
74
117
|
|
|
75
118
|
return report;
|
|
76
119
|
}
|
|
77
120
|
|
|
78
|
-
|
|
79
121
|
/**
|
|
80
122
|
* Counts the number of different newline types in a string.
|
|
81
123
|
* @param {string} text
|
|
82
124
|
* @returns {{
|
|
83
|
-
* crlf: number,
|
|
84
|
-
* cr: number,
|
|
85
|
-
* lf: number,
|
|
86
|
-
* types: string[],
|
|
87
|
-
* mixed: boolean
|
|
88
|
-
* }}
|
|
89
|
-
*/
|
|
125
|
+
* crlf: number,
|
|
126
|
+
* cr: number,
|
|
127
|
+
* lf: number,
|
|
128
|
+
* types: string[],
|
|
129
|
+
* mixed: boolean
|
|
130
|
+
* }}
|
|
131
|
+
*/
|
|
90
132
|
export function getNewlineStats(text) {
|
|
91
133
|
if (typeof text !== 'string') {
|
|
92
134
|
throw new TypeError('getNewlineStats expects a string input.');
|
|
@@ -101,7 +143,7 @@ export function getNewlineStats(text) {
|
|
|
101
143
|
const count = {
|
|
102
144
|
crlf: crlfMatches.length,
|
|
103
145
|
cr: crMatches.length,
|
|
104
|
-
lf: lfMatches.length
|
|
146
|
+
lf: lfMatches.length,
|
|
105
147
|
};
|
|
106
148
|
|
|
107
149
|
const types = [];
|
|
@@ -112,12 +154,10 @@ export function getNewlineStats(text) {
|
|
|
112
154
|
return {
|
|
113
155
|
...count,
|
|
114
156
|
types,
|
|
115
|
-
mixed: types.length > 1
|
|
157
|
+
mixed: types.length > 1,
|
|
116
158
|
};
|
|
117
159
|
}
|
|
118
160
|
|
|
119
|
-
|
|
120
|
-
|
|
121
161
|
/**
|
|
122
162
|
* Creates defaultOptions for summonSanctifier based on inspectText result
|
|
123
163
|
* @param {!UnicodeTrashReport} report
|
|
@@ -129,11 +169,7 @@ export function getRecommendedSanctifierOptions(report) {
|
|
|
129
169
|
purgeEmojis: report.hasEmojis,
|
|
130
170
|
nukeControls: report.hasControlChars,
|
|
131
171
|
keyboardOnlyFilter: report.hasNonKeyboardChars,
|
|
132
|
-
normalizeNewlines:
|
|
133
|
-
|
|
134
|
-
// collapseNewLines: false,
|
|
135
|
-
// preserveParagraphs: true,
|
|
136
|
-
// collapseSpaces: true,
|
|
137
|
-
// finalTrim: true,
|
|
172
|
+
normalizeNewlines:
|
|
173
|
+
report.hasMixedNewlines || report.newlineStyle === 'CRLF' || report.newlineStyle === 'CR',
|
|
138
174
|
};
|
|
139
175
|
}
|
package/src/sanctifyText.js
CHANGED
|
@@ -13,6 +13,8 @@
|
|
|
13
13
|
* @property {boolean} [preserveParagraphs]
|
|
14
14
|
* @property {boolean} [collapseSpaces]
|
|
15
15
|
* @property {boolean} [finalTrim]
|
|
16
|
+
* @property {number} [maxLength]
|
|
17
|
+
* @property {'throw'|'truncate'|'noop'} [onMaxLength]
|
|
16
18
|
*/
|
|
17
19
|
|
|
18
20
|
|
|
@@ -32,6 +34,8 @@
|
|
|
32
34
|
* @param {boolean} [defaultOptions.preserveParagraphs]
|
|
33
35
|
* @param {boolean} [defaultOptions.collapseSpaces]
|
|
34
36
|
* @param {boolean} [defaultOptions.finalTrim]
|
|
37
|
+
* @param {number} [defaultOptions.maxLength=Infinity] - Hard input length cap (UTF-16 code units).
|
|
38
|
+
* @param {'throw' | 'truncate' | 'noop'} [defaultOptions.onMaxLength='throw'] - Behavior when input exceeds maxLength.
|
|
35
39
|
* @returns {(text: string) => string}
|
|
36
40
|
*/
|
|
37
41
|
export function summonSanctifier(defaultOptions = {}) {
|
|
@@ -46,19 +50,34 @@ export function summonSanctifier(defaultOptions = {}) {
|
|
|
46
50
|
const collapseSpaces = !!defaultOptions.collapseSpaces;
|
|
47
51
|
const finalTrim = !!defaultOptions.finalTrim;
|
|
48
52
|
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
)
|
|
53
|
+
const maxLength =
|
|
54
|
+
Number.isFinite(defaultOptions.maxLength) && defaultOptions.maxLength >= 0
|
|
55
|
+
? defaultOptions.maxLength
|
|
56
|
+
: Infinity;
|
|
57
|
+
|
|
58
|
+
const onMaxLength =
|
|
59
|
+
defaultOptions.onMaxLength === 'truncate' ||
|
|
60
|
+
defaultOptions.onMaxLength === 'noop' ||
|
|
61
|
+
defaultOptions.onMaxLength === 'throw'
|
|
62
|
+
? defaultOptions.onMaxLength
|
|
63
|
+
: 'throw';
|
|
64
|
+
|
|
65
|
+
return (text) =>
|
|
66
|
+
sanctifyText(
|
|
67
|
+
text,
|
|
68
|
+
purgeInvisibleChars,
|
|
69
|
+
purgeEmojis,
|
|
70
|
+
nukeControls,
|
|
71
|
+
keyboardOnlyFilter,
|
|
72
|
+
normalizeNewlines,
|
|
73
|
+
trimSpacesAroundNewlines,
|
|
74
|
+
collapseNewLines,
|
|
75
|
+
preserveParagraphs,
|
|
76
|
+
collapseSpaces,
|
|
77
|
+
finalTrim,
|
|
78
|
+
maxLength,
|
|
79
|
+
onMaxLength
|
|
80
|
+
);
|
|
62
81
|
}
|
|
63
82
|
|
|
64
83
|
// --- Added Presets ---
|
|
@@ -70,6 +89,7 @@ export function summonSanctifier(defaultOptions = {}) {
|
|
|
70
89
|
* - Collapse spaces
|
|
71
90
|
* - Nuke control characters
|
|
72
91
|
*/
|
|
92
|
+
/** @param {string} text @returns {string} */
|
|
73
93
|
summonSanctifier.strict = text => sanctifyText(
|
|
74
94
|
text,
|
|
75
95
|
true, // purgeInvisibleChars
|
|
@@ -91,6 +111,7 @@ summonSanctifier.strict = text => sanctifyText(
|
|
|
91
111
|
* - Preserve paragraphs
|
|
92
112
|
* - Normalize newlines
|
|
93
113
|
*/
|
|
114
|
+
/** @param {string} text @returns {string} */
|
|
94
115
|
summonSanctifier.loose = text => sanctifyText(
|
|
95
116
|
text,
|
|
96
117
|
false, // purgeInvisibleChars
|
|
@@ -112,6 +133,7 @@ summonSanctifier.loose = text => sanctifyText(
|
|
|
112
133
|
* - Strips non-standard characters
|
|
113
134
|
* - Normalizes typographic trash
|
|
114
135
|
*/
|
|
136
|
+
/** @param {string} text @returns {string} */
|
|
115
137
|
summonSanctifier.keyboardOnlyEmoji = text => sanctifyText(
|
|
116
138
|
text,
|
|
117
139
|
false, // purgeInvisibleChars
|
|
@@ -133,9 +155,10 @@ summonSanctifier.keyboardOnlyEmoji = text => sanctifyText(
|
|
|
133
155
|
* - Collapses all whitespace
|
|
134
156
|
* - Restricts to printable ASCII only
|
|
135
157
|
*/
|
|
158
|
+
/** @param {string} text @returns {string} */
|
|
136
159
|
summonSanctifier.keyboardOnly = text => sanctifyText(
|
|
137
160
|
text,
|
|
138
|
-
true,
|
|
161
|
+
true, // purgeInvisibleChars
|
|
139
162
|
true, // purgeEmojis
|
|
140
163
|
true, // nukeControls
|
|
141
164
|
true, // keyboardOnlyFilter
|
|
@@ -150,22 +173,31 @@ summonSanctifier.keyboardOnly = text => sanctifyText(
|
|
|
150
173
|
|
|
151
174
|
/**
|
|
152
175
|
* Text Sanctifier
|
|
153
|
-
*
|
|
176
|
+
*
|
|
154
177
|
* Brutal text normalizer and invisible trash scrubber,
|
|
155
178
|
* configurable to kill whatever ghosts you want dead.
|
|
156
|
-
*
|
|
157
|
-
*
|
|
179
|
+
*
|
|
180
|
+
* ⚠️ Note: This is plain-text normalization/filtering — not an HTML/XSS sanitizer.
|
|
181
|
+
*
|
|
182
|
+
* @param {string} text
|
|
158
183
|
* @param {boolean} [purgeInvisibleChars=false] - Remove ZWSP, NBSP, bidi, etc.
|
|
159
184
|
* @param {boolean} [purgeEmojis=false] - Remove emoji characters entirely.
|
|
160
185
|
* @param {boolean} [nukeControls=false] - Remove non-whitespace control characters.
|
|
161
|
-
* @param {boolean} [keyboardOnlyFilter=false] - Keep printable ASCII
|
|
186
|
+
* @param {boolean} [keyboardOnlyFilter=false] - Keep printable ASCII + full emoji sequences only (drops other Unicode).
|
|
162
187
|
* @param {boolean} [normalizeNewlines=false] - Convert all newlines to `\n`.
|
|
163
188
|
* @param {boolean} [trimSpacesAroundNewlines=false] - Remove spaces/tabs around newlines.
|
|
164
189
|
* @param {boolean} [collapseNewLines=false] - Collapse `\n` runs (optionally preserve paragraphs).
|
|
165
190
|
* @param {boolean} [preserveParagraphs=false] - Preserve paragraph breaks when collapsing newlines.
|
|
166
191
|
* @param {boolean} [collapseSpaces=false] - Collapse multiple spaces into one.
|
|
167
192
|
* @param {boolean} [finalTrim=false] - `.trim()` the final output (head/tail).
|
|
193
|
+
* @param {number} [maxLength=Infinity] - Hard input length cap (UTF-16 code units via `text.length`).
|
|
194
|
+
* @param {'throw'|'truncate'|'noop'} [onMaxLength='throw'] - Behavior when `text.length` exceeds `maxLength`:
|
|
195
|
+
* - `'throw'`: throw a `RangeError`
|
|
196
|
+
* - `'truncate'`: slice to `maxLength` before processing
|
|
197
|
+
* - `'noop'`: return the original input unchanged (skips processing)
|
|
168
198
|
* @returns {string}
|
|
199
|
+
* @throws {TypeError} If `text` is not a string.
|
|
200
|
+
* @throws {RangeError} If input exceeds `maxLength` and `onMaxLength` is `'throw'`.
|
|
169
201
|
*/
|
|
170
202
|
export function sanctifyText(
|
|
171
203
|
text,
|
|
@@ -179,11 +211,32 @@ export function sanctifyText(
|
|
|
179
211
|
preserveParagraphs = false,
|
|
180
212
|
collapseSpaces = false,
|
|
181
213
|
finalTrim = false,
|
|
214
|
+
maxLength = Infinity,
|
|
215
|
+
onMaxLength = 'throw'
|
|
182
216
|
) {
|
|
183
217
|
if (typeof text !== 'string') {
|
|
184
218
|
throw new TypeError('sanctifyText expects a string input.');
|
|
185
219
|
}
|
|
186
220
|
|
|
221
|
+
if (onMaxLength !== 'truncate' && onMaxLength !== 'noop' && onMaxLength !== 'throw') {
|
|
222
|
+
onMaxLength = 'throw';
|
|
223
|
+
}
|
|
224
|
+
|
|
225
|
+
if (text.length > maxLength) {
|
|
226
|
+
switch (onMaxLength) {
|
|
227
|
+
case 'truncate':
|
|
228
|
+
text = text.slice(0, maxLength);
|
|
229
|
+
break;
|
|
230
|
+
case 'noop':
|
|
231
|
+
return text;
|
|
232
|
+
case 'throw':
|
|
233
|
+
default:
|
|
234
|
+
throw new RangeError(
|
|
235
|
+
`sanctifyText input length ${text.length} exceeds maxLength ${maxLength}.`
|
|
236
|
+
);
|
|
237
|
+
}
|
|
238
|
+
}
|
|
239
|
+
|
|
187
240
|
let cleaned = text;
|
|
188
241
|
|
|
189
242
|
// Purge invisible Unicode trash (zero-width, non-breaking, bidi junk, etc.)
|
|
@@ -217,6 +270,7 @@ export function sanctifyText(
|
|
|
217
270
|
|
|
218
271
|
// --- Micro helpers ---
|
|
219
272
|
|
|
273
|
+
|
|
220
274
|
/**
|
|
221
275
|
* Purges invisible Unicode "trash" characters and removes them.
|
|
222
276
|
*
|
|
@@ -234,6 +288,35 @@ function purgeInvisibleTrash(text) {
|
|
|
234
288
|
}
|
|
235
289
|
|
|
236
290
|
|
|
291
|
+
/**
|
|
292
|
+
* Extracts and preserves complete emoji sequences from a string.
|
|
293
|
+
*
|
|
294
|
+
* Uses the global `EMOJI_REGEX` to match full emoji grapheme sequences,
|
|
295
|
+
* including:
|
|
296
|
+
* - Extended pictographic characters
|
|
297
|
+
* - Zero-width joiner (ZWJ) sequences (e.g. 👨‍👩‍👧‍👦)
|
|
298
|
+
* - Variation Selector-16 (VS16) emoji presentation forms (e.g. ✔️, ‼️)
|
|
299
|
+
*
|
|
300
|
+
* Any non-emoji characters are ignored.
|
|
301
|
+
*
|
|
302
|
+
* Important:
|
|
303
|
+
* `EMOJI_REGEX` is global (`/g`), so `lastIndex` is reset before matching
|
|
304
|
+
* to ensure deterministic behavior across repeated calls.
|
|
305
|
+
*
|
|
306
|
+
* @param {string} text - Input string potentially containing emoji sequences.
|
|
307
|
+
* @returns {string} A string containing only the matched emoji sequences,
|
|
308
|
+
* concatenated in original order.
|
|
309
|
+
*/
|
|
310
|
+
function extractEmojiSequences(text) {
|
|
311
|
+
EMOJI_REGEX.lastIndex = 0;
|
|
312
|
+
if (!EMOJI_REGEX.test(text)) return ''; // quick reject
|
|
313
|
+
EMOJI_REGEX.lastIndex = 0;
|
|
314
|
+
|
|
315
|
+
let out = '';
|
|
316
|
+
for (const match of text.matchAll(EMOJI_REGEX)) out += match[0];
|
|
317
|
+
return out;
|
|
318
|
+
}
|
|
319
|
+
|
|
237
320
|
/**
|
|
238
321
|
* Matches any character that is NOT:
|
|
239
322
|
* - Printable ASCII (U+0020–U+007E)
|
|
@@ -250,14 +333,15 @@ export const ASCII_KEYBOARD_SAFE_REGEX = /[^\x20-\x7E\r\n]+/gu;
|
|
|
250
333
|
function purgeNonKeyboardChars(text, purgeEmojis = false) {
|
|
251
334
|
const normalized = normalizeTypographicJank(text);
|
|
252
335
|
|
|
336
|
+
// If emojis are being purged, keyboard-only becomes "ASCII + CR/LF only"
|
|
253
337
|
if (purgeEmojis) {
|
|
254
338
|
return normalized.replace(ASCII_KEYBOARD_SAFE_REGEX, '');
|
|
255
339
|
}
|
|
256
340
|
|
|
257
|
-
//
|
|
258
|
-
|
|
259
|
-
|
|
260
|
-
);
|
|
341
|
+
// Replace each non-ASCII run with ONLY the emoji sequences inside it.
|
|
342
|
+
// This preserves ZWJ sequences and VS16 variants (e.g. 👨‍👩‍👧‍👦, ✔️, ‼️),
|
|
343
|
+
// while dropping non-emoji non-ASCII (e.g. ๐).
|
|
344
|
+
return normalized.replace(ASCII_KEYBOARD_SAFE_REGEX, (run) => extractEmojiSequences(run));
|
|
261
345
|
}
|
|
262
346
|
|
|
263
347
|
|
|
@@ -279,6 +363,11 @@ const BULLETS_REGEX = /[\u2022\u00B7]/g;
|
|
|
279
363
|
// Full-width ASCII punctuation: U+FF01 - U+FF5E
|
|
280
364
|
const FULLWIDTH_PUNCTUATION_REGEX = /[\uFF01-\uFF5E]/g;
|
|
281
365
|
|
|
366
|
+
/**
|
|
367
|
+
* Normalizes typographic Unicode punctuation into ASCII equivalents.
|
|
368
|
+
* @param {string} text
|
|
369
|
+
* @returns {string}
|
|
370
|
+
*/
|
|
282
371
|
export function normalizeTypographicJank(text) {
|
|
283
372
|
return text
|
|
284
373
|
.replace(SMART_SINGLE_QUOTES_REGEX, "'")
|
|
@@ -292,7 +381,7 @@ export function normalizeTypographicJank(text) {
|
|
|
292
381
|
}
|
|
293
382
|
|
|
294
383
|
|
|
295
|
-
|
|
384
|
+
/** @type {RegExp} */
|
|
296
385
|
export let EMOJI_REGEX;
|
|
297
386
|
|
|
298
387
|
/**
|