text-sanctifier 1.0.14 → 1.0.16
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +4 -2
- package/dist/text-sanctifier.min.js +5 -4
- package/package.json +1 -1
- package/src/index.d.ts +10 -0
- package/src/index.js +2 -2
- package/src/inspectText.js +99 -75
- package/src/sanctifyText.js +11 -6
package/README.md
CHANGED
|
@@ -7,8 +7,8 @@
|
|
|
7
7
|
|
|
8
8
|
Brutal text normalizer and invisible trash scrubber for modern web projects.
|
|
9
9
|
|
|
10
|
-
* Minified: (
|
|
11
|
-
* Gzipped (GCC): (1.
|
|
10
|
+
* Minified: (3.09 KB)
|
|
11
|
+
* Gzipped (GCC): (1.36 KB)
|
|
12
12
|
|
|
13
13
|
## Features
|
|
14
14
|
|
|
@@ -109,6 +109,8 @@ const report = inspectText(input);
|
|
|
109
109
|
|
|
110
110
|
Use `inspectText` to preflight text content before rendering, storing, or linting. It's a diagnostic tool to help inform sanitization needs.
|
|
111
111
|
|
|
112
|
+
Pass the report to `getRecommendedSanctifierOptions(report)` to auto-generate config flags for `summonSanctifier()`.
|
|
113
|
+
|
|
112
114
|
---
|
|
113
115
|
|
|
114
116
|
## API
|
|
@@ -1,7 +1,8 @@
|
|
|
1
1
|
function g(a={}){const b=!!a.purgeInvisibleChars,c=!!a.purgeEmojis,d=!!a.nukeControls,e=!!a.keyboardOnlyFilter,k=!!a.normalizeNewlines,f=!!a.trimSpacesAroundNewlines,l=!!a.collapseNewLines,m=!!a.preserveParagraphs,p=!!a.collapseSpaces,q=!!a.finalTrim;return w=>h(w,b,c,d,e,k,f,l,m,p,q)}g.strict=a=>h(a,!0,!0,!0,!1,!0,!0,!0,!1,!0,!0);g.loose=a=>h(a,!1,!1,!1,!1,!0,!0,!0,!0,!0,!0);g.keyboardOnlyEmoji=a=>h(a,!1,!1,!1,!0,!0,!0,!1,!1,!1,!0);g.keyboardOnly=a=>h(a,!0,!0,!0,!0,!0,!0,!0,!1,!0,!0);
|
|
2
2
|
function h(a,b=!1,c=!1,d=!1,e=!1,k=!1,f=!1,l=!1,m=!1,p=!1,q=!1){if("string"!==typeof a)throw new TypeError("sanctifyText expects a string input.");b&&(a=a.replace(n,""));c&&(a=a.replace(r,""));d&&(a=a.replace(t,""));e&&(a=u(a,c));k&&(a=a.replace(v,"\n"));f&&(a=a.replace(x,"$1"));l&&(b=a,a=m?b.replace(y,"\n\n"):b.replace(z,"\n"));p&&(a=a.replace(A," "));return q?a.trim():a}var n=/[\u00A0\u2000-\u200D\u202F\u2060\u3000\uFEFF\u200E\u200F\u202A-\u202E]+/g,B=/[^\x20-\x7E\r\n]+/gu;
|
|
3
3
|
function u(a,b=!1){a=C(a);return b?a.replace(B,""):a.replace(B,c=>c.match(r)?c:"")}var D=/[\u2018\u2019\u201A\u201B\u2032\u2035]/g,E=/[\u201C\u201D\u201E\u201F\u2033\u2036\u00AB\u00BB]/g,F=/[\u2012\u2013\u2014\u2015\u2212]/g,G=/\u2026/g,H=/[\u2022\u00B7]/g,I=/[\uFF01-\uFF5E]/g;function C(a){return a.replace(D,"'").replace(E,'"').replace(F,"-").replace(G,"...").replace(H,"*").replace(I,b=>String.fromCharCode(b.charCodeAt(0)-65248))}var r;
|
|
4
|
-
try{r=RegExp("(?:\\p{Extended_Pictographic}(?:\\uFE0F|\\uFE0E)?(?:\\u200D(?:\\p{Extended_Pictographic}|\\w)+)*)","gu")}catch{r=/[\u{1F300}-\u{1FAFF}]/gu}var v=/\r\n|\r|\n/g,x=/[ \t]*(\n+)[ \t]*/g,z=/\n{2,}/g,y=/\n{3,}/g,A=/ {2,}/g,t=/[\u0000-\u0008\u000B\u000C\u000E-\u001F\u007F\u0080-\u009F\u200E\u200F\u202A-\u202E]+/g;
|
|
5
|
-
function J(a){if("string"!==typeof a)throw new TypeError("inspectText expects a string input.");const b=[],c={
|
|
6
|
-
a=C(a).replace(B,f=>f.match(r)?"":"\u2612");d(/[\u2612]/.test(a),"hasNonKeyboardChars","Non-keyboard characters detected.");return c}
|
|
7
|
-
|
|
4
|
+
try{r=RegExp("(?:\\p{Extended_Pictographic}(?:\\uFE0F|\\uFE0E)?(?:\\u200D(?:\\p{Extended_Pictographic}|\\w)+)*)","gu")}catch{r=/[\u{2700}-\u{27BF}\u{1F300}-\u{1FAFF}\u{1F3FB}-\u{1F3FF}\u200D\uFE0F\u{1F1E6}-\u{1F1FF}]/gu}var v=/\r\n|\r|\n/g,x=/[ \t]*(\n+)[ \t]*/g,z=/\n{2,}/g,y=/\n{3,}/g,A=/ {2,}/g,t=/[\u0000-\u0008\u000B\u000C\u000E-\u001F\u007F\u0080-\u009F\u200E\u200F\u202A-\u202E]+/g;
|
|
5
|
+
function J(a){if("string"!==typeof a)throw new TypeError("inspectText expects a string input.");const b=[],c={hasControlChars:!1,hasInvisibleChars:!1,hasMixedNewlines:!1,newlineStyle:null,hasEmojis:!1,hasNonKeyboardChars:!1,summary:b},d=(f,l,m)=>{f&&(c[l]=!0,b.push(m))};d(t.test(a),"hasControlChars","Control characters detected.");d(n.test(a),"hasInvisibleChars","Invisible Unicode characters detected.");d(r.test(a),"hasEmojis","Emojis detected.");const {j:e,types:k}=K(a);c.hasMixedNewlines=e;c.newlineStyle=
|
|
6
|
+
e?"Mixed":k[0]||null;c.newlineStyle&&b.push(e?"Mixed newline styles detected.":`Consistent newline style: ${c.newlineStyle}`);a=C(a).replace(B,f=>f.match(r)?"":"\u2612");d(/[\u2612]/.test(a),"hasNonKeyboardChars","Non-keyboard characters detected.");return c}
|
|
7
|
+
function K(a){if("string"!==typeof a)throw new TypeError("getNewlineStats expects a string input.");var b=a.replace(/\r\n/g,"");a={h:(a.match(/\r\n/g)||[]).length,g:(b.match(/\r/g)||[]).length,i:(b.match(/\n/g)||[]).length};b=[];0<a.h&&b.push("CRLF");0<a.g&&b.push("CR");0<a.i&&b.push("LF");return{...a,types:b,j:1<b.length}}
|
|
8
|
+
function L(a){return{purgeInvisibleChars:a.hasInvisibleChars,purgeEmojis:a.hasEmojis,nukeControls:a.hasControlChars,keyboardOnlyFilter:a.hasNonKeyboardChars,normalizeNewlines:a.hasMixedNewlines||"CRLF"===a.newlineStyle||"CR"===a.newlineStyle}}export { g as summonSanctifier, J as inspectText, L as getRecommendedSanctifierOptions };
|
package/package.json
CHANGED
package/src/index.d.ts
CHANGED
|
@@ -104,3 +104,13 @@ export interface UnicodeTrashReport {
|
|
|
104
104
|
* invisible characters, newline styles, emojis, and more.
|
|
105
105
|
*/
|
|
106
106
|
export function inspectText(text: string): UnicodeTrashReport;
|
|
107
|
+
|
|
108
|
+
|
|
109
|
+
/**
|
|
110
|
+
* Creates a recommended set of `summonSanctifier` options based on the findings
|
|
111
|
+
* of `inspectText()`. This maps only what can be inferred automatically —
|
|
112
|
+
* user-preference settings like whitespace collapsing are left unset.
|
|
113
|
+
*/
|
|
114
|
+
export function getRecommendedSanctifierOptions(
|
|
115
|
+
report: UnicodeTrashReport
|
|
116
|
+
): SanctifyOptions;
|
package/src/index.js
CHANGED
|
@@ -1,8 +1,8 @@
|
|
|
1
1
|
// src/index.js
|
|
2
2
|
|
|
3
3
|
|
|
4
|
-
import { inspectText } from './inspectText.js';
|
|
5
|
-
export { inspectText };
|
|
4
|
+
import { inspectText, getRecommendedSanctifierOptions } from './inspectText.js';
|
|
5
|
+
export { inspectText, getRecommendedSanctifierOptions };
|
|
6
6
|
|
|
7
7
|
import { summonSanctifier } from './sanctifyText.js';
|
|
8
8
|
export { summonSanctifier };
|
package/src/inspectText.js
CHANGED
|
@@ -1,4 +1,5 @@
|
|
|
1
1
|
|
|
2
|
+
// ./src/inspectText.js
|
|
2
3
|
|
|
3
4
|
|
|
4
5
|
import {
|
|
@@ -7,9 +8,10 @@ import {
|
|
|
7
8
|
EMOJI_REGEX,
|
|
8
9
|
ASCII_KEYBOARD_SAFE_REGEX,
|
|
9
10
|
normalizeTypographicJank
|
|
10
|
-
|
|
11
|
+
} from './sanctifyText.js';
|
|
12
|
+
|
|
13
|
+
|
|
11
14
|
|
|
12
|
-
|
|
13
15
|
/**
|
|
14
16
|
* Detects textual "trash" or anomalies in a given string.
|
|
15
17
|
* @param {string} text
|
|
@@ -24,53 +26,53 @@ import {
|
|
|
24
26
|
* }}
|
|
25
27
|
*/
|
|
26
28
|
export function inspectText(text) {
    // Builds a diagnostic report for `text`: boolean findings for control
    // characters, invisible characters, emojis and non-keyboard characters,
    // the detected newline style, and a human-readable `summary` list.
    // Throws TypeError when `text` is not a string.
    if (typeof text !== 'string') {
        throw new TypeError('inspectText expects a string input.');
    }

    const summary = [];
    const report = {
        hasControlChars: false,
        hasInvisibleChars: false,
        hasMixedNewlines: false,
        newlineStyle: null,
        hasEmojis: false,
        hasNonKeyboardChars: false,
        summary
    };

    // Sets the finding and records its message only when the condition holds.
    const flag = (condition, key, message) => {
        if (condition) {
            report[key] = true;
            summary.push(message);
        }
    };

    // The detection patterns are shared, imported RegExp objects. If they carry
    // the `g` flag, RegExp.prototype.test() resumes from the regex's stored
    // lastIndex, so a previous inspectText() call could make this one miss a
    // match near the start of `text`. String.prototype.search() always scans
    // from index 0 and leaves lastIndex untouched, so every call is independent.
    const contains = pattern => text.search(pattern) !== -1;

    // === Pattern Checks ===
    flag(contains(CONTROL_CHARS_REGEX), 'hasControlChars', 'Control characters detected.');
    flag(contains(INVISIBLE_TRASH_REGEX), 'hasInvisibleChars', 'Invisible Unicode characters detected.');
    flag(contains(EMOJI_REGEX), 'hasEmojis', 'Emojis detected.');

    // === Newline Analysis ===
    const { mixed, types } = getNewlineStats(text);
    report.hasMixedNewlines = mixed;
    report.newlineStyle = mixed ? 'Mixed' : types[0] || null;

    // Text without any newline gets no newline entry in the summary.
    if (report.newlineStyle) {
        summary.push(
            mixed
                ? 'Mixed newline styles detected.'
                : `Consistent newline style: ${report.newlineStyle}`
        );
    }

    // === Non-keyboard characters (excluding emojis) ===
    // After typographic normalization, each match of the keyboard-safe pattern
    // is either dropped (it contains an emoji) or replaced by the ☒ marker;
    // any surviving marker means a non-keyboard, non-emoji character exists.
    // NOTE(review): if the pattern matches multi-character runs, a run mixing
    // emoji with other characters is treated as emoji-only — confirm intent.
    const filtered = normalizeTypographicJank(text).replace(ASCII_KEYBOARD_SAFE_REGEX, m =>
        m.match(EMOJI_REGEX) ? '' : '☒'
    );
    flag(/[☒]/.test(filtered), 'hasNonKeyboardChars', 'Non-keyboard characters detected.');

    return report;
}
|
|
75
77
|
|
|
76
78
|
|
|
@@ -86,30 +88,52 @@ export function inspectText(text) {
|
|
|
86
88
|
* }}
|
|
87
89
|
*/
|
|
88
90
|
export function getNewlineStats(text) {
    // Counts CRLF, lone CR and lone LF line breaks in `text` and reports which
    // styles occur. Throws TypeError when `text` is not a string.
    if (typeof text !== 'string') {
        throw new TypeError('getNewlineStats expects a string input.');
    }

    // Tokenize line breaks in one pass. Listing "\r\n" first in the
    // alternation means a CR that is followed by LF is always counted as a
    // single CRLF, never as a separate CR and LF.
    const tally = { crlf: 0, cr: 0, lf: 0 };
    for (const lineBreak of text.match(/\r\n|\r|\n/g) || []) {
        if (lineBreak === '\r\n') {
            tally.crlf += 1;
        } else if (lineBreak === '\r') {
            tally.cr += 1;
        } else {
            tally.lf += 1;
        }
    }

    // Labels are emitted in the fixed order CRLF, CR, LF.
    const types = [
        { label: 'CRLF', seen: tally.crlf },
        { label: 'CR', seen: tally.cr },
        { label: 'LF', seen: tally.lf }
    ]
        .filter(entry => entry.seen > 0)
        .map(entry => entry.label);

    const mixed = types.length > 1;
    return { ...tally, types, mixed };
}
|
|
118
|
+
|
|
119
|
+
|
|
120
|
+
|
|
121
|
+
/**
 * Translates an `inspectText()` report into option flags for `summonSanctifier()`.
 *
 * Each detection flag in the report switches on the matching cleanup step.
 * Newline normalization is requested when the report shows mixed newlines or
 * a consistent CRLF / CR style. Preference-driven options
 * (trimSpacesAroundNewlines, collapseNewLines, preserveParagraphs,
 * collapseSpaces, finalTrim) are not set by this function.
 *
 * @param {!UnicodeTrashReport} report
 * @return {!SanctifyOptions}
 */
export function getRecommendedSanctifierOptions(report) {
    const {
        hasInvisibleChars,
        hasEmojis,
        hasControlChars,
        hasNonKeyboardChars,
        hasMixedNewlines,
        newlineStyle
    } = report;

    // A consistent style still needs normalizing when it is not plain LF.
    const usesCarriageReturns = newlineStyle === 'CRLF' || newlineStyle === 'CR';

    return {
        purgeInvisibleChars: hasInvisibleChars,
        purgeEmojis: hasEmojis,
        nukeControls: hasControlChars,
        keyboardOnlyFilter: hasNonKeyboardChars,
        normalizeNewlines: hasMixedNewlines || usesCarriageReturns
    };
}
|
package/src/sanctifyText.js
CHANGED
|
@@ -190,7 +190,7 @@ export function sanctifyText(
|
|
|
190
190
|
if (purgeInvisibleChars) cleaned = purgeInvisibleTrash(cleaned);
|
|
191
191
|
|
|
192
192
|
// Remove emojis
|
|
193
|
-
if (purgeEmojis) cleaned =
|
|
193
|
+
if (purgeEmojis) cleaned = purgeEmojiCharacters(cleaned);
|
|
194
194
|
|
|
195
195
|
// Nuke control characters (excluding whitespace)
|
|
196
196
|
if (nukeControls) cleaned = purgeControlCharacters(cleaned);
|
|
@@ -297,7 +297,7 @@ export let EMOJI_REGEX;
|
|
|
297
297
|
|
|
298
298
|
/**
|
|
299
299
|
* Try Unicode property escape regex (preferred).
|
|
300
|
-
* Fallback to basic emoji
|
|
300
|
+
* Fallback to basic emoji ranges if unsupported.
|
|
301
301
|
*/
|
|
302
302
|
try {
|
|
303
303
|
EMOJI_REGEX = new RegExp(
|
|
@@ -305,11 +305,16 @@ try {
|
|
|
305
305
|
'gu'
|
|
306
306
|
);
|
|
307
307
|
} catch {
|
|
308
|
-
// Fallback:
|
|
309
|
-
|
|
308
|
+
// Fallback: wide-range emoji component match (flags, tones, symbols)
|
|
309
|
+
// * Covers:
|
|
310
|
+
// * - Emoji base chars (1F300–1FAFF)
|
|
311
|
+
// * - Dingbats (2700–27BF) like ❌, ✅, ✔️
|
|
312
|
+
// * - Skin tones (1F3FB–1F3FF)
|
|
313
|
+
// * - ZWJ (200D), variation selector-16 (FE0F)
|
|
314
|
+
// * - Regional indicators (1F1E6–1F1FF)
|
|
315
|
+
EMOJI_REGEX = /[\u{2700}-\u{27BF}\u{1F300}-\u{1FAFF}\u{1F3FB}-\u{1F3FF}\u200D\uFE0F\u{1F1E6}-\u{1F1FF}]/gu;
|
|
310
316
|
}
|
|
311
317
|
|
|
312
|
-
|
|
313
318
|
/**
|
|
314
319
|
* Removes all emoji characters using Unicode property escapes.
|
|
315
320
|
* Supports modern environments (Unicode v13+) with fallback.
|
|
@@ -317,7 +322,7 @@ try {
|
|
|
317
322
|
* @param {string} text
|
|
318
323
|
* @returns {string}
|
|
319
324
|
*/
|
|
320
|
-
function
|
|
325
|
+
function purgeEmojiCharacters(text) {
    // Delete every substring matched by the module-level EMOJI_REGEX and
    // hand back what remains.
    const withoutEmojis = text.replace(EMOJI_REGEX, '');
    return withoutEmojis;
}
|
|
323
328
|
|