text-sanctifier 1.0.7 → 1.0.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +38 -5
- package/dist/text-sanctifier.min.js +6 -3
- package/package.json +1 -1
- package/src/index.js +3 -0
- package/src/inspectText.js +108 -0
- package/src/sanctifyText.js +13 -14
package/README.md
CHANGED
|
@@ -1,13 +1,15 @@
|
|
|
1
1
|
# text-sanctifier
|
|
2
2
|
|
|
3
|
-

|
|
4
|
-

|
|
5
|
-

|
|
3
|
+
[text-sanctifier on npm](https://www.npmjs.com/package/text-sanctifier)
|
|
4
|
+
[text-sanctifier on Bundlephobia](https://bundlephobia.com/package/text-sanctifier)
|
|
5
|
+
[text-sanctifier npm package page](https://www.npmjs.com/package/text-sanctifier)
|
|
6
|
+
[text-sanctifier on GitHub](https://github.com/iWhatty/text-sanctifier)
|
|
7
|
+
|
|
6
8
|
|
|
7
9
|
Brutal text normalizer and invisible trash scrubber for modern web projects.
|
|
8
10
|
|
|
9
|
-
* Minified:
|
|
10
|
-
* Gzipped (GCC) :
|
|
11
|
+
* Minified: (2.47 KB)
|
|
12
|
+
* Gzipped (GCC) : (1.18 KB)
|
|
11
13
|
|
|
12
14
|
## Features
|
|
13
15
|
|
|
@@ -111,6 +113,37 @@ Removes everything except printable ASCII. Emojis are removed. Spaces are collap
|
|
|
111
113
|
|
|
112
114
|
Keeps printable ASCII and emoji characters. Typographic normalization included.
|
|
113
115
|
|
|
116
|
+
---
|
|
117
|
+
|
|
118
|
+
|
|
119
|
+
### Unicode Trash Detection
|
|
120
|
+
|
|
121
|
+
```javascript
|
|
122
|
+
import { inspectText } from 'text-sanctifier';
|
|
123
|
+
|
|
124
|
+
const report = inspectText(rawInput);
|
|
125
|
+
|
|
126
|
+
/*
|
|
127
|
+
{
|
|
128
|
+
hasControlChars: true,
|
|
129
|
+
hasInvisibleChars: true,
|
|
130
|
+
hasMixedNewlines: false,
|
|
131
|
+
newlineStyle: 'LF',
|
|
132
|
+
hasEmojis: true,
|
|
133
|
+
hasNonKeyboardChars: false,
|
|
134
|
+
summary: [
|
|
135
|
+
'Control characters detected.',
|
|
136
|
+
'Invisible Unicode characters detected.',
|
|
137
|
+
'Emojis detected.',
|
|
138
|
+
'Consistent newline style: LF'
|
|
139
|
+
]
|
|
140
|
+
}
|
|
141
|
+
*/
|
|
142
|
+
```
|
|
143
|
+
|
|
144
|
+
Use this to preflight inputs and flag unwanted characters (like control codes, zero-width spaces, or mixed newline styles) before sanitization or storage.
|
|
145
|
+
|
|
146
|
+
|
|
114
147
|
---
|
|
115
148
|
|
|
116
149
|
## License
|
|
@@ -1,4 +1,7 @@
|
|
|
1
|
-
function
|
|
2
|
-
function
|
|
1
|
+
/**
 * Factory exported as `summonSanctifier`: reads five boolean-ish options off
 * `a` once and returns a function that forwards its argument to `g` with
 * those flags.
 *
 * @param {object} [a={}] Options bag; each option is coerced to a boolean.
 * @returns {function(string): string} Sanitizer bound to the chosen flags.
 */
function f(a = {}) {
  const keepParagraphs = Boolean(a.preserveParagraphs);
  const squeezeSpaces = Boolean(a.collapseSpaces);
  const stripControls = Boolean(a.nukeControls);
  const stripEmojis = Boolean(a.purgeEmojis);
  const asciiOnly = Boolean(a.keyboardOnlyFilter);

  return (text) =>
    g(text, keepParagraphs, squeezeSpaces, stripControls, stripEmojis, asciiOnly);
}

// Presets: each forwards to `g` with a fixed flag combination.
f.strict = (text) => g(text, false, true, true, true);
f.loose = (text) => g(text, true, true);
f.keyboardOnlyEmoji = (text) => g(text, false, false, true, false, true);
f.keyboardOnly = (text) => g(text, false, true, true, true, true);
|
|
2
|
+
/**
 * Core sanitizer behind every preset.
 *
 * Steps, in order: strip matches of `l`; optionally strip matches of `m`
 * (flag `e`) and `n` (flag `d`); optionally run the keyboard filter `p`
 * (flag `h`); rewrite matches of `q` as "\n"; apply `r` with "$1"; collapse
 * newline runs using `t` -> "\n\n" when `b` is set, otherwise `u` -> "\n";
 * optionally collapse matches of `v` to one space (flag `c`); trim.
 *
 * @param {string} a Input text.
 * @param {boolean} [b=false] Keep paragraph breaks ("\n\n").
 * @param {boolean} [c=false] Collapse matches of `v` into a single space.
 * @param {boolean} [d=false] Remove matches of `n`.
 * @param {boolean} [e=false] Remove matches of `m`.
 * @param {boolean} [h=false] Apply the keyboard-only filter `p`.
 * @returns {string}
 * @throws {TypeError} If `a` is not a string.
 */
function g(a, b = false, c = false, d = false, e = false, h = false) {
  if (typeof a !== "string") {
    throw new TypeError("sanctifyText expects a string input.");
  }

  let out = a.replace(l, "");
  if (e) {
    out = out.replace(m, "");
  }
  if (d) {
    out = out.replace(n, "");
  }
  if (h) {
    out = p(out, e);
  }

  out = out.replace(q, "\n").replace(r, "$1");
  out = b ? out.replace(t, "\n\n") : out.replace(u, "\n");

  if (c) {
    out = out.replace(v, " ");
  }
  return out.trim();
}

// Characters always stripped by `g` (first step).
var l = /[\u00A0\u2000-\u200D\u202F\u2060\u3000\uFEFF\u200E\u200F\u202A-\u202E]+/g;
// Anything outside printable ASCII (\x20-\x7E); used by `p`.
var w = /[^\x20-\x7E]/gu;
|
|
3
3
|
/**
 * Keyboard-only filter: normalizes typography via `x`, then drops characters
 * outside printable ASCII. With `b` set every such character is removed
 * (regex `w`); otherwise a run of such characters is kept whole when
 * `run.match(m)` is truthy and removed when it is not.
 *
 * @param {string} a Input text.
 * @param {boolean} [b=false] Remove every non-ASCII character unconditionally.
 * @returns {string}
 */
function p(a, b = false) {
  const normalized = x(a);
  if (b) {
    return normalized.replace(w, "");
  }
  return normalized.replace(/[^\x20-\x7E]+/gu, (run) => (run.match(m) ? run : ""));
}

var y = /[\u2018\u2019\u201A\u201B\u2032\u2035]/g;
var z = /[\u201C\u201D\u201E\u201F\u2033\u2036\u00AB\u00BB]/g;
var A = /[\u2012\u2013\u2014\u2015\u2212]/g;
var B = /\u2026/g;
var C = /[\u2022\u00B7]/g;
var D = /[\uFF01-\uFF5E]/g;

/**
 * Replaces typographic characters with ASCII: matches of `y` -> "'",
 * `z` -> '"', `A` -> "-", `B` -> "...", `C` -> "*", and each match of `D`
 * (U+FF01-U+FF5E) is shifted down by 65248 code units.
 *
 * @param {string} a Input text.
 * @returns {string}
 */
function x(a) {
  const shiftDown = (ch) => String.fromCharCode(ch.charCodeAt(0) - 65248);
  const substitutions = [
    [y, "'"],
    [z, '"'],
    [A, "-"],
    [B, "..."],
    [C, "*"],
    [D, shiftDown],
  ];
  return substitutions.reduce(
    (current, [pattern, replacement]) => current.replace(pattern, replacement),
    a
  );
}

// Emoji matcher; assigned later in the module.
var m;
|
|
4
|
-
try{m=RegExp("(?:\\p{Extended_Pictographic}(?:\\uFE0F|\\uFE0E)?(?:\\u200D(?:\\p{Extended_Pictographic}|\\w)+)*)","gu")}catch{m=/[\u{1F300}-\u{1FAFF}]/gu}var q=/\r\n
|
|
4
|
+
// Emoji matcher: build the Unicode-property-escape pattern; if constructing
// it throws, fall back to a fixed code point range.
try {
  m = new RegExp(
    "(?:\\p{Extended_Pictographic}(?:\\uFE0F|\\uFE0E)?(?:\\u200D(?:\\p{Extended_Pictographic}|\\w)+)*)",
    "gu"
  );
} catch {
  m = /[\u{1F300}-\u{1FAFF}]/gu;
}

// Any newline sequence: CRLF, lone CR, or LF.
var q = /\r\n|\r|\n/g;
// Newline run with surrounding spaces/tabs; the run itself is captured.
var r = /[ \t]*(\n+)[ \t]*/g;
// Two or more consecutive newlines.
var u = /\n{2,}/g;
// Three or more consecutive newlines.
var t = /\n{3,}/g;
// Two or more consecutive spaces.
var v = / {2,}/g;
// Control characters plus bidi marks/overrides.
var n = /[\u0000-\u0008\u000B\u000C\u000E-\u001F\u007F\u0080-\u009F\u200E\u200F\u202A-\u202E]+/g;
|
|
5
|
+
/**
 * Detects textual "trash" in a string (exported as `inspectText`).
 *
 * The report always carries the documented keys. Previously the object
 * literal used renamed keys while flags were written through the string keys
 * "hasControlChars" etc., so `hasMixedNewlines` / `newlineStyle` were missing
 * from the result and flags only existed when true.
 *
 * @param {string} a Text to inspect.
 * @returns {{
 *   hasControlChars: boolean,
 *   hasInvisibleChars: boolean,
 *   hasMixedNewlines: boolean,
 *   newlineStyle: ('LF'|'CRLF'|'CR'|'Mixed'|null),
 *   hasEmojis: boolean,
 *   hasNonKeyboardChars: boolean,
 *   summary: string[]
 * }}
 * @throws {TypeError} If `a` is not a string.
 */
function E(a) {
  if (typeof a !== "string") {
    throw new TypeError("inspectText expects a string input.");
  }

  const summary = [];
  const report = {
    hasControlChars: false,
    hasInvisibleChars: false,
    hasMixedNewlines: false,
    newlineStyle: null,
    hasEmojis: false,
    hasNonKeyboardChars: false,
    summary,
  };
  const flag = (condition, key, message) => {
    if (condition) {
      report[key] = true;
      summary.push(message);
    }
  };

  // `n`, `l`, `m` are shared /g regexes; matchesAnywhere ignores lastIndex.
  flag(matchesAnywhere(n, a), "hasControlChars", "Control characters detected.");
  flag(matchesAnywhere(l, a), "hasInvisibleChars", "Invisible Unicode characters detected.");
  flag(matchesAnywhere(m, a), "hasEmojis", "Emojis detected.");

  const { m: mixed, types } = H(a);
  report.hasMixedNewlines = mixed;
  report.newlineStyle = mixed ? "Mixed" : types[0] || null;
  if (report.newlineStyle) {
    summary.push(
      mixed
        ? "Mixed newline styles detected."
        : `Consistent newline style: ${report.newlineStyle}`
    );
  }

  // A run of non-printable-ASCII characters is ignored when it contains an
  // emoji match; any other run is replaced by the U+2612 marker.
  const marked = x(a).replace(/[^\x20-\x7E]+/gu, (run) =>
    matchesAnywhere(m, run) ? "" : "\u2612"
  );
  flag(/[\u2612]/.test(marked), "hasNonKeyboardChars", "Non-keyboard characters detected.");

  return report;
}

/**
 * Counts newline sequences. Lone CR and LF are counted after every CRLF pair
 * has been removed, so a CRLF is never counted as CR or LF.
 *
 * @param {string} a Text to scan.
 * @returns {{i: number, h: number, l: number, types: string[], m: boolean}}
 *   `i` = CRLF count, `h` = CR count, `l` = LF count, `types` = styles
 *   present, `m` = more than one style present.
 * @throws {TypeError} If `a` is not a string.
 */
function H(a) {
  if (typeof a !== "string") {
    throw new TypeError("getNewlineStats expects a string input.");
  }

  const withoutCrlf = a.replace(/\r\n/g, "");
  const counts = {
    i: (a.match(/\r\n/g) || []).length,
    h: (withoutCrlf.match(/\r/g) || []).length,
    l: (withoutCrlf.match(/\n/g) || []).length,
  };

  const types = [];
  if (counts.i > 0) types.push("CRLF");
  if (counts.h > 0) types.push("CR");
  if (counts.l > 0) types.push("LF");

  return { ...counts, types, m: types.length > 1 };
}

/**
 * Stateless "does `pattern` match anywhere in `text`" check.
 *
 * RegExp#test on a /g regex resumes from `lastIndex`, so repeated calls on a
 * shared regex can miss matches. String#search always scans from index 0 and
 * leaves `lastIndex` as it found it.
 *
 * @param {RegExp} pattern
 * @param {string} text
 * @returns {boolean}
 */
function matchesAnywhere(pattern, text) {
  return text.search(pattern) !== -1;
}
|
|
7
|
+
export { f as summonSanctifier, E as inspectText };
|
package/package.json
CHANGED
package/src/index.js
CHANGED
|
@@ -0,0 +1,108 @@
|
|
|
1
|
+
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
import { CONTROL_CHARS_REGEX, INVISIBLE_TRASH_REGEX, EMOJI_REGEX, normalizeTypographicJank } from './sanctifyText.js'
|
|
5
|
+
|
|
6
|
+
/**
 * Detects textual "trash" or anomalies in a given string without modifying it.
 *
 * Checks, in the order their messages are appended to `summary`:
 * 1. control characters (CONTROL_CHARS_REGEX)
 * 2. invisible Unicode characters (INVISIBLE_TRASH_REGEX)
 * 3. emojis (EMOJI_REGEX)
 * 4. newline style (getNewlineStats); reported only when a newline exists
 * 5. characters outside printable ASCII (\x20-\x7E) that remain after
 *    normalizeTypographicJank. A contiguous run of such characters is
 *    ignored when it contains at least one emoji match.
 *
 * @param {string} text
 * @returns {{
 *   hasControlChars: boolean,
 *   hasInvisibleChars: boolean,
 *   hasMixedNewlines: boolean,
 *   newlineStyle: 'LF' | 'CRLF' | 'CR' | 'Mixed' | null,
 *   hasEmojis: boolean,
 *   hasNonKeyboardChars: boolean,
 *   summary: string[]
 * }}
 * @throws {TypeError} If `text` is not a string.
 */
export function inspectText(text) {
  if (typeof text !== 'string') {
    throw new TypeError('inspectText expects a string input.');
  }

  const summary = [];
  // Keys are quoted on purpose: `flag` writes them by string name, and the
  // published minified bundle renames unquoted property names, which left the
  // report with mangled keys. Quoted keys keep both spellings in sync.
  const report = {
    'hasControlChars': false,
    'hasInvisibleChars': false,
    'hasMixedNewlines': false,
    'newlineStyle': null,
    'hasEmojis': false,
    'hasNonKeyboardChars': false,
    'summary': summary,
  };

  const flag = (condition, key, message) => {
    if (condition) {
      report[key] = true;
      summary.push(message);
    }
  };

  // === Pattern Checks ===
  // The imported regexes carry the /g flag, so RegExp#test would resume from
  // a stale lastIndex on later calls; hasMatch is stateless.
  flag(hasMatch(CONTROL_CHARS_REGEX, text), 'hasControlChars', 'Control characters detected.');
  flag(hasMatch(INVISIBLE_TRASH_REGEX, text), 'hasInvisibleChars', 'Invisible Unicode characters detected.');
  flag(hasMatch(EMOJI_REGEX, text), 'hasEmojis', 'Emojis detected.');

  // === Newline Analysis ===
  const { mixed, types } = getNewlineStats(text);
  report['hasMixedNewlines'] = mixed;
  report['newlineStyle'] = mixed ? 'Mixed' : types[0] || null;

  if (report['newlineStyle']) {
    summary.push(
      mixed
        ? 'Mixed newline styles detected.'
        : `Consistent newline style: ${report['newlineStyle']}`
    );
  }

  // === Non-keyboard characters (excluding emojis) ===
  // Runs containing an emoji are dropped; every other run becomes a marker.
  const filtered = normalizeTypographicJank(text).replace(/[^\x20-\x7E]+/gu, (run) =>
    hasMatch(EMOJI_REGEX, run) ? '' : '☒'
  );
  flag(filtered.includes('☒'), 'hasNonKeyboardChars', 'Non-keyboard characters detected.');

  return report;
}

/**
 * Stateless "does `pattern` match anywhere in `text`" check.
 *
 * String#search always scans from index 0 and leaves the regex's `lastIndex`
 * as it found it, so it is safe on shared /g regexes where RegExp#test is not.
 *
 * @param {RegExp} pattern
 * @param {string} text
 * @returns {boolean}
 */
function hasMatch(pattern, text) {
  return text.search(pattern) !== -1;
}
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
/**
 * Tallies the newline conventions used in a string.
 *
 * CRLF pairs are counted first and then removed, so the CR and LF tallies
 * cover only stand-alone "\r" and "\n" characters.
 *
 * @param {string} text
 * @returns {{
 *   crlf: number,
 *   cr: number,
 *   lf: number,
 *   types: string[],
 *   mixed: boolean
 * }} Per-style counts, the styles present (ordered CRLF, CR, LF), and
 *   whether more than one style occurs.
 * @throws {TypeError} If `text` is not a string.
 */
export function getNewlineStats(text) {
  if (typeof text !== 'string') {
    throw new TypeError('getNewlineStats expects a string input.');
  }

  const occurrences = (haystack, pattern) => {
    const found = haystack.match(pattern);
    return found === null ? 0 : found.length;
  };

  const crlf = occurrences(text, /\r\n/g);
  const remainder = text.replace(/\r\n/g, '');
  const cr = occurrences(remainder, /\r/g);
  const lf = occurrences(remainder, /\n/g);

  const types = [
    ['CRLF', crlf],
    ['CR', cr],
    ['LF', lf],
  ]
    .filter(([, total]) => total > 0)
    .map(([label]) => label);

  return { crlf, cr, lf, types, mixed: types.length > 1 };
}
|
package/src/sanctifyText.js
CHANGED
|
@@ -162,7 +162,7 @@ export function sanctifyText(
|
|
|
162
162
|
* @param {string} text
|
|
163
163
|
* @returns {string}
|
|
164
164
|
*/
|
|
165
|
-
const INVISIBLE_TRASH_REGEX = /[\u00A0\u2000-\u200D\u202F\u2060\u3000\uFEFF\u200E\u200F\u202A-\u202E]+/g;
|
|
165
|
+
// Runs of invisible/spacing code points removed by purgeInvisibleTrash.
// Exported with the /g flag, so it carries lastIndex state between
// RegExp#test / RegExp#exec calls.
export const INVISIBLE_TRASH_REGEX = /[\u00A0\u2000-\u200D\u202F\u2060\u3000\uFEFF\u200E\u200F\u202A-\u202E]+/g;

function purgeInvisibleTrash(text) {
  // Every matching run is deleted outright (replaced by the empty string).
  const withoutTrash = text.replace(INVISIBLE_TRASH_REGEX, '');
  return withoutTrash;
}
|
|
@@ -207,7 +207,7 @@ const BULLETS_REGEX = /[\u2022\u00B7]/g;
|
|
|
207
207
|
// Full-width ASCII punctuation: U+FF01 - U+FF5E
|
|
208
208
|
const FULLWIDTH_PUNCTUATION_REGEX = /[\uFF01-\uFF5E]/g;
|
|
209
209
|
|
|
210
|
-
function normalizeTypographicJank(text) {
|
|
210
|
+
export function normalizeTypographicJank(text) {
|
|
211
211
|
return text
|
|
212
212
|
.replace(SMART_SINGLE_QUOTES_REGEX, "'")
|
|
213
213
|
.replace(SMART_DOUBLE_QUOTES_REGEX, '"')
|
|
@@ -221,7 +221,7 @@ function normalizeTypographicJank(text) {
|
|
|
221
221
|
|
|
222
222
|
|
|
223
223
|
|
|
224
|
-
let EMOJI_REGEX;
|
|
224
|
+
export let EMOJI_REGEX;
|
|
225
225
|
|
|
226
226
|
/**
|
|
227
227
|
* Try Unicode property escape regex (preferred).
|
|
@@ -237,6 +237,7 @@ try {
|
|
|
237
237
|
EMOJI_REGEX = /[\u{1F300}-\u{1FAFF}]/gu;
|
|
238
238
|
}
|
|
239
239
|
|
|
240
|
+
|
|
240
241
|
/**
|
|
241
242
|
* Removes all emoji characters using Unicode property escapes.
|
|
242
243
|
* Supports modern environments (Unicode v13+) with fallback.
|
|
@@ -250,21 +251,19 @@ function purgeEmojisCharacters(text) {
|
|
|
250
251
|
|
|
251
252
|
|
|
252
253
|
/**
|
|
253
|
-
* Normalizes all line endings to
|
|
254
|
+
* Normalizes all line endings to a consistent format.
|
|
254
255
|
*
|
|
255
256
|
* Converts:
|
|
256
|
-
* - Windows
|
|
257
|
-
*
|
|
258
|
-
*
|
|
259
|
-
* Example:
|
|
260
|
-
* "Line1\r\nLine2\rLine3" → "Line1\nLine2\nLine3"
|
|
257
|
+
* - Windows ("\r\n"), Old Mac ("\r"), Unix ("\n")
|
|
258
|
+
* Into the specified newline format (default: Unix "\n").
|
|
261
259
|
*
|
|
262
|
-
* @param {string} text
|
|
260
|
+
* @param {string} text - Input string to normalize.
|
|
261
|
+
* @param {string} [normalized='\n'] - Target newline style (e.g. '\n', '\r\n').
|
|
263
262
|
* @returns {string}
|
|
264
263
|
*/
|
|
265
|
-
const NORMALIZE_NEWLINES_REGEX = /\r\n
|
|
266
|
-
function normalizeNewlines(text) {
|
|
267
|
-
return text.replace(NORMALIZE_NEWLINES_REGEX,
|
|
264
|
+
// Any single line break: CRLF is tried first so it is consumed as one unit.
const NORMALIZE_NEWLINES_REGEX = /\r\n|\r|\n/g;

/**
 * Rewrites every line break in `text` (CRLF, lone CR, or LF) as `normalized`.
 *
 * @param {string} text - Input string to normalize.
 * @param {string} [normalized='\n'] - Line break to emit, inserted verbatim.
 * @returns {string}
 */
function normalizeNewlines(text, normalized = '\n') {
  // A replacer function inserts `normalized` literally; a replacement string
  // would have `$`-sequences such as `$&` or `$$` expanded by String#replace.
  return text.replace(NORMALIZE_NEWLINES_REGEX, () => normalized);
}
|
|
269
268
|
|
|
270
269
|
|
|
@@ -336,7 +335,7 @@ function collapseExtraSpaces(text) {
|
|
|
336
335
|
* @param {string} text
|
|
337
336
|
* @returns {string}
|
|
338
337
|
*/
|
|
339
|
-
const CONTROL_CHARS_REGEX = /[\u0000-\u0008\u000B\u000C\u000E-\u001F\u007F\u0080-\u009F\u200E\u200F\u202A-\u202E]+/g;
|
|
338
|
+
// Runs of control characters removed by purgeControlCharacters. The class
// skips \u0009 (tab), \u000A (LF) and \u000D (CR), and also covers the
// \u200E/\u200F marks and the \u202A-\u202E range.
// Exported with the /g flag, so it carries lastIndex state between
// RegExp#test / RegExp#exec calls.
export const CONTROL_CHARS_REGEX = /[\u0000-\u0008\u000B\u000C\u000E-\u001F\u007F\u0080-\u009F\u200E\u200F\u202A-\u202E]+/g;

function purgeControlCharacters(text) {
  // Every matching run is deleted outright (replaced by the empty string).
  const withoutControls = text.replace(CONTROL_CHARS_REGEX, '');
  return withoutControls;
}
|