text-sanctifier 1.0.7 → 1.0.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -1,13 +1,15 @@
1
1
  # text-sanctifier
2
2
 
3
- ![npm](https://img.shields.io/npm/v/text-sanctifier)
4
- ![gzip size](https://img.shields.io/bundlephobia/minzip/text-sanctifier)
5
- ![downloads](https://img.shields.io/npm/dw/text-sanctifier)
3
+ [![npm](https://img.shields.io/npm/v/text-sanctifier)](https://www.npmjs.com/package/text-sanctifier)
4
+ [![gzip size](https://img.shields.io/bundlephobia/minzip/text-sanctifier)](https://bundlephobia.com/package/text-sanctifier)
5
+ [![downloads](https://img.shields.io/npm/dw/text-sanctifier)](https://www.npmjs.com/package/text-sanctifier)
6
+ [![GitHub stars](https://img.shields.io/github/stars/iWhatty/text-sanctifier?style=social)](https://github.com/iWhatty/text-sanctifier)
7
+
6
8
 
7
9
  Brutal text normalizer and invisible trash scrubber for modern web projects.
8
10
 
9
- * Minified: 1425 bytes (1.39 KB)
10
- * Gzipped (GCC) : 784 bytes (0.77 KB)
11
+ * Minified: 2.47 KB
12
+ * Gzipped (GCC): 1.18 KB
11
13
 
12
14
  ## Features
13
15
 
@@ -111,6 +113,37 @@ Removes everything except printable ASCII. Emojis are removed. Spaces are collap
111
113
 
112
114
  Keeps printable ASCII and emoji characters. Typographic normalization included.
113
115
 
116
+ ---
117
+
118
+
119
+ ### Unicode Trash Detection
120
+
121
+ ```javascript
122
+ import { inspectText } from 'text-sanctifier';
123
+
124
+ const report = inspectText(rawInput);
125
+
126
+ /*
127
+ {
128
+ hasControlChars: true,
129
+ hasInvisibleChars: true,
130
+ hasMixedNewlines: false,
131
+ newlineStyle: 'LF',
132
+ hasEmojis: true,
133
+ hasNonKeyboardChars: false,
134
+ summary: [
135
+ 'Control characters detected.',
136
+ 'Invisible Unicode characters detected.',
137
+ 'Emojis detected.',
138
+ 'Consistent newline style: LF'
139
+ ]
140
+ }
141
+ */
142
+ ```
143
+
144
+ Use this to preflight inputs and flag unwanted characters (like control codes, zero-width spaces, or mixed newline styles) before sanitization or storage.
145
+
146
+
114
147
  ---
115
148
 
116
149
  ## License
@@ -1,4 +1,7 @@
1
- function e(a={}){const b=!!a.preserveParagraphs,c=!!a.collapseSpaces,d=!!a.nukeControls,g=!!a.purgeEmojis,k=!!a.keyboardOnlyFilter;return l=>f(l,b,c,d,g,k)}e.strict=a=>f(a,!1,!0,!0,!0);e.loose=a=>f(a,!0,!0);e.keyboardOnlyEmoji=a=>f(a,!1,!1,!0,!1,!0);e.keyboardOnly=a=>f(a,!1,!0,!0,!0,!0);
2
- function f(a,b=!1,c=!1,d=!1,g=!1,k=!1){if("string"!==typeof a)throw new TypeError("sanctifyText expects a string input.");a=a.replace(h,"");g&&(a=a.replace(m,""));d&&(a=a.replace(n,""));k&&(a=p(a,g));a=a.replace(q,"\n");d=a=a.replace(r,"$1");a=b?d.replace(t,"\n\n"):d.replace(u,"\n");c&&(a=a.replace(v," "));return a.trim()}var h=/[\u00A0\u2000-\u200D\u202F\u2060\u3000\uFEFF\u200E\u200F\u202A-\u202E]+/g,w=/[^\x20-\x7E]/gu;
1
/**
 * Builds a reusable sanctifier from an options bag.
 *
 * Each recognised option is coerced to a boolean once, when the factory runs,
 * and the resulting flags are forwarded positionally to `g` on every call.
 *
 * @param {object} [options={}] - May carry `preserveParagraphs`,
 *   `collapseSpaces`, `nukeControls`, `purgeEmojis` and `keyboardOnlyFilter`.
 * @returns {function(string): string} Function that hands its argument and
 *   the captured flags to `g`.
 */
function f(options = {}) {
  const preserveParagraphs = Boolean(options.preserveParagraphs);
  const collapseSpaces = Boolean(options.collapseSpaces);
  const nukeControls = Boolean(options.nukeControls);
  const purgeEmojis = Boolean(options.purgeEmojis);
  const keyboardOnlyFilter = Boolean(options.keyboardOnlyFilter);

  return (text) =>
    g(text, preserveParagraphs, collapseSpaces, nukeControls, purgeEmojis, keyboardOnlyFilter);
}

// Presets: fixed flag combinations passed straight to `g`
// (argument order: preserveParagraphs, collapseSpaces, nukeControls,
// purgeEmojis, keyboardOnlyFilter).
f.strict = (text) => g(text, false, true, true, true);
f.loose = (text) => g(text, true, true);
f.keyboardOnlyEmoji = (text) => g(text, false, false, true, false, true);
f.keyboardOnly = (text) => g(text, false, true, true, true, true);
2
/**
 * Sanitising pipeline shared by the factory and every preset.
 *
 * Steps, in order: strip the characters matched by `l`; optionally strip
 * matches of `m` (emoji) and `n` (control characters); optionally run the
 * keyboard-only filter `p`; rewrite line breaks to "\n"; drop spaces/tabs
 * around newline runs; squeeze newline runs (to two when paragraphs are
 * preserved, otherwise to one); optionally collapse repeated spaces; trim.
 *
 * @param {string} text - Input to clean.
 * @param {boolean} [preserveParagraphs=false]
 * @param {boolean} [collapseSpaces=false]
 * @param {boolean} [nukeControls=false]
 * @param {boolean} [purgeEmojis=false]
 * @param {boolean} [keyboardOnlyFilter=false]
 * @returns {string} The cleaned, trimmed text.
 * @throws {TypeError} When `text` is not a string.
 */
function g(
  text,
  preserveParagraphs = false,
  collapseSpaces = false,
  nukeControls = false,
  purgeEmojis = false,
  keyboardOnlyFilter = false
) {
  if (typeof text !== "string") {
    throw new TypeError("sanctifyText expects a string input.");
  }

  let cleaned = text.replace(l, "");

  if (purgeEmojis) {
    cleaned = cleaned.replace(m, "");
  }
  if (nukeControls) {
    cleaned = cleaned.replace(n, "");
  }
  if (keyboardOnlyFilter) {
    cleaned = p(cleaned, purgeEmojis);
  }

  // Unify line breaks, then remove horizontal whitespace hugging them.
  cleaned = cleaned.replace(q, "\n").replace(r, "$1");

  cleaned = preserveParagraphs
    ? cleaned.replace(t, "\n\n")
    : cleaned.replace(u, "\n");

  if (collapseSpaces) {
    cleaned = cleaned.replace(v, " ");
  }

  return cleaned.trim();
}

// Invisible / zero-width / bidi characters that are always removed.
const l = /[\u00A0\u2000-\u200D\u202F\u2060\u3000\uFEFF\u200E\u200F\u202A-\u202E]+/g;
// Any single character outside printable ASCII.
const w = /[^\x20-\x7E]/gu;
3
3
// Typographic look-alikes, each mapped to an ASCII stand-in by `x`.
const y = /[\u2018\u2019\u201A\u201B\u2032\u2035]/g; // single quotes / primes
const z = /[\u201C\u201D\u201E\u201F\u2033\u2036\u00AB\u00BB]/g; // double quotes / guillemets
const A = /[\u2012\u2013\u2014\u2015\u2212]/g; // dashes and minus sign
const B = /\u2026/g; // ellipsis
const C = /[\u2022\u00B7]/g; // bullets
const D = /[\uFF01-\uFF5E]/g; // full-width ASCII forms

/**
 * Keyboard-only filter: normalises typography with `x`, then removes
 * whatever is still outside printable ASCII.
 *
 * @param {string} a - Text to filter.
 * @param {boolean} [b=false] - When true every non-ASCII character is
 *   dropped; when false a non-ASCII run is kept if it matches the emoji
 *   pattern `m` and dropped otherwise.
 * @returns {string}
 */
function p(a, b = false) {
  const ascii = x(a);
  if (b) {
    return ascii.replace(w, "");
  }
  return ascii.replace(/[^\x20-\x7E]+/gu, (run) => (run.match(m) ? run : ""));
}

/**
 * Replaces smart quotes, dashes, ellipses, bullets and full-width ASCII
 * forms with plain ASCII equivalents.
 *
 * @param {string} a
 * @returns {string}
 */
function x(a) {
  const substitutions = [
    [y, "'"],
    [z, '"'],
    [A, "-"],
    [B, "..."],
    [C, "*"],
    // Full-width forms sit exactly 0xFEE0 (65248) above their ASCII twins.
    [D, (ch) => String.fromCharCode(ch.charCodeAt(0) - 0xfee0)],
  ];
  return substitutions.reduce(
    (acc, [pattern, replacement]) => acc.replace(pattern, replacement),
    a
  );
}

// Emoji pattern; assigned by the try/catch that follows.
var m;
4
- try{m=RegExp("(?:\\p{Extended_Pictographic}(?:\\uFE0F|\\uFE0E)?(?:\\u200D(?:\\p{Extended_Pictographic}|\\w)+)*)","gu")}catch{m=/[\u{1F300}-\u{1FAFF}]/gu}var q=/\r\n?/g,r=/[ \t]*(\n+)[ \t]*/g,u=/\n{2,}/g,t=/\n{3,}/g,v=/ {2,}/g,n=/[\u0000-\u0008\u000B\u000C\u000E-\u001F\u007F\u0080-\u009F\u200E\u200F\u202A-\u202E]+/g;export { e as summonSanctifier };
4
// Emoji pattern: try Unicode property escapes first; if constructing that
// pattern throws, fall back to a fixed code-point range.
try {
  m = RegExp(
    "(?:\\p{Extended_Pictographic}(?:\\uFE0F|\\uFE0E)?(?:\\u200D(?:\\p{Extended_Pictographic}|\\w)+)*)",
    "gu"
  );
} catch {
  m = /[\u{1F300}-\u{1FAFF}]/gu;
}

// Any line break: CRLF, lone CR or lone LF.
const q = /\r\n|\r|\n/g;
// A newline run together with the spaces/tabs on either side of it.
const r = /[ \t]*(\n+)[ \t]*/g;
// Two or more consecutive newlines.
const u = /\n{2,}/g;
// Three or more consecutive newlines.
const t = /\n{3,}/g;
// Two or more consecutive spaces.
const v = / {2,}/g;
// C0/C1 control characters (tab, LF and CR excluded) plus bidi controls.
const n = /[\u0000-\u0008\u000B\u000C\u000E-\u001F\u007F\u0080-\u009F\u200E\u200F\u202A-\u202E]+/g;
5
/**
 * Inspects a string and reports which kinds of "trash" it contains.
 *
 * The report keys are written out literally (`hasControlChars`,
 * `hasInvisibleChars`, `hasMixedNewlines`, `newlineStyle`, `hasEmojis`,
 * `hasNonKeyboardChars`, `summary`) so that callers of the bundle see them
 * under those names on every path, including the fields that are assigned
 * directly rather than through the string-keyed `flag` helper.
 *
 * @param {string} a - Text to inspect.
 * @returns {{
 *   hasControlChars: boolean,
 *   hasInvisibleChars: boolean,
 *   hasMixedNewlines: boolean,
 *   newlineStyle: ('LF'|'CRLF'|'CR'|'Mixed'|null),
 *   hasEmojis: boolean,
 *   hasNonKeyboardChars: boolean,
 *   summary: string[]
 * }}
 * @throws {TypeError} When `a` is not a string.
 */
function E(a) {
  if ("string" !== typeof a) {
    throw new TypeError("inspectText expects a string input.");
  }

  const summary = [];
  const report = {
    hasControlChars: false,
    hasInvisibleChars: false,
    hasMixedNewlines: false,
    newlineStyle: null,
    hasEmojis: false,
    hasNonKeyboardChars: false,
    summary,
  };

  const flag = (hit, key, message) => {
    if (hit) {
      report[key] = true;
      summary.push(message);
    }
  };

  // n, l and m are shared /g regexes. RegExp#test on a global regex resumes
  // from its lastIndex, so a previous hit could hide a match on the next
  // call. String#search always scans from index 0 and leaves lastIndex alone.
  const contains = (pattern) => a.search(pattern) !== -1;

  flag(contains(n), "hasControlChars", "Control characters detected.");
  flag(contains(l), "hasInvisibleChars", "Invisible Unicode characters detected.");
  flag(contains(m), "hasEmojis", "Emojis detected.");

  const { mixed, types } = H(a);
  report.hasMixedNewlines = mixed;
  report.newlineStyle = mixed ? "Mixed" : types[0] || null;
  if (report.newlineStyle) {
    summary.push(
      mixed
        ? "Mixed newline styles detected."
        : `Consistent newline style: ${report.newlineStyle}`
    );
  }

  // After typographic normalisation, mark every non-ASCII run that holds no
  // emoji with U+2612; runs that do contain an emoji are removed unmarked.
  const marked = x(a).replace(/[^\x20-\x7E]+/gu, (run) =>
    run.match(m) ? "" : "\u2612"
  );
  flag(/[\u2612]/.test(marked), "hasNonKeyboardChars", "Non-keyboard characters detected.");

  return report;
}

/**
 * Counts CRLF, lone CR and lone LF line breaks in a string.
 *
 * @param {string} a
 * @returns {{crlf: number, cr: number, lf: number, types: string[], mixed: boolean}}
 *   `types` lists the styles present (order: CRLF, CR, LF); `mixed` is true
 *   when more than one style occurs.
 * @throws {TypeError} When `a` is not a string.
 */
function H(a) {
  if ("string" !== typeof a) {
    throw new TypeError("getNewlineStats expects a string input.");
  }

  // Remove CRLF pairs before counting lone CR / LF so halves are not recounted.
  const withoutPairs = a.replace(/\r\n/g, "");
  const count = {
    crlf: (a.match(/\r\n/g) || []).length,
    cr: (withoutPairs.match(/\r/g) || []).length,
    lf: (withoutPairs.match(/\n/g) || []).length,
  };

  const types = [];
  if (count.crlf > 0) types.push("CRLF");
  if (count.cr > 0) types.push("CR");
  if (count.lf > 0) types.push("LF");

  return { ...count, types, mixed: types.length > 1 };
}
7
+ export { f as summonSanctifier, E as inspectText };
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "text-sanctifier",
3
- "version": "1.0.7",
3
+ "version": "1.0.9",
4
4
  "type": "module",
5
5
  "description": "A brutal text normalizer and invisible trash scrubber for modern web projects.",
6
6
  "main": "./src/index.js",
package/src/index.js CHANGED
@@ -1,6 +1,9 @@
1
1
  // src/index.js
2
2
 
3
3
 
4
+ import { inspectText } from './inspectText.js';
5
+ export { inspectText };
6
+
4
7
  import { summonSanctifier } from './sanctifyText.js';
5
8
  export { summonSanctifier };
6
9
 
@@ -0,0 +1,108 @@
1
+
2
+
3
+
4
+ import { CONTROL_CHARS_REGEX, INVISIBLE_TRASH_REGEX, EMOJI_REGEX, normalizeTypographicJank } from './sanctifyText.js'
5
+
6
/**
 * Detects textual "trash" or anomalies in a given string.
 *
 * Checks performed, in the order their messages are appended to `summary`:
 * control characters, invisible Unicode characters, emojis, newline style,
 * and non-keyboard characters. For the last check the text is first passed
 * through `normalizeTypographicJank`; any remaining run of characters outside
 * printable ASCII (0x20-0x7E) is counted unless that run contains an emoji.
 *
 * @param {string} text
 * @returns {{
 *   hasControlChars: boolean,
 *   hasInvisibleChars: boolean,
 *   hasMixedNewlines: boolean,
 *   newlineStyle: 'LF' | 'CRLF' | 'CR' | 'Mixed' | null,
 *   hasEmojis: boolean,
 *   hasNonKeyboardChars: boolean,
 *   summary: string[]
 * }}
 * @throws {TypeError} If `text` is not a string.
 */
export function inspectText(text) {
  if (typeof text !== 'string') {
    throw new TypeError('inspectText expects a string input.');
  }

  const summary = [];
  const report = {
    hasControlChars: false,
    hasInvisibleChars: false,
    hasMixedNewlines: false,
    newlineStyle: null,
    hasEmojis: false,
    hasNonKeyboardChars: false,
    summary
  };

  const flag = (condition, key, message) => {
    if (condition) {
      report[key] = true;
      summary.push(message);
    }
  };

  // The shared patterns carry the `g` flag. RegExp#test on a global regex
  // resumes from its `lastIndex`, so a hit in one call could cause a miss in
  // the next. String#search always scans from index 0 and leaves `lastIndex`
  // untouched, which keeps every inspection independent.
  const contains = (pattern) => text.search(pattern) !== -1;

  // === Pattern Checks ===
  flag(contains(CONTROL_CHARS_REGEX), 'hasControlChars', 'Control characters detected.');
  flag(contains(INVISIBLE_TRASH_REGEX), 'hasInvisibleChars', 'Invisible Unicode characters detected.');
  flag(contains(EMOJI_REGEX), 'hasEmojis', 'Emojis detected.');

  // === Newline Analysis ===
  const { mixed, types } = getNewlineStats(text);
  report.hasMixedNewlines = mixed;
  report.newlineStyle = mixed ? 'Mixed' : types[0] || null;

  if (report.newlineStyle) {
    summary.push(
      mixed
        ? 'Mixed newline styles detected.'
        : `Consistent newline style: ${report.newlineStyle}`
    );
  }

  // === Non-keyboard characters (excluding emojis) ===
  // Runs containing an emoji are removed; every other non-ASCII run is
  // replaced by a marker that is then searched for.
  const filtered = normalizeTypographicJank(text).replace(/[^\x20-\x7E]+/gu, m =>
    m.match(EMOJI_REGEX) ? '' : '☒'
  );
  flag(/[☒]/.test(filtered), 'hasNonKeyboardChars', 'Non-keyboard characters detected.');

  return report;
}
68
+
69
+
70
/**
 * Tallies the line-break styles found in a string.
 *
 * A "\r\n" pair counts once as CRLF; only "\r" and "\n" characters that are
 * left after removing those pairs count as CR and LF.
 *
 * @param {string} text
 * @returns {{
 *   crlf: number,
 *   cr: number,
 *   lf: number,
 *   types: string[],
 *   mixed: boolean
 * }} Per-style counts, the styles present (order: CRLF, CR, LF) and whether
 *   more than one style occurs.
 * @throws {TypeError} If `text` is not a string.
 */
export function getNewlineStats(text) {
  if (typeof text !== 'string') {
    throw new TypeError('getNewlineStats expects a string input.');
  }

  const occurrences = (haystack, pattern) => (haystack.match(pattern) || []).length;

  // Strip CRLF pairs first so their halves are not recounted as lone CR / LF.
  const withoutPairs = text.replace(/\r\n/g, '');

  const crlf = occurrences(text, /\r\n/g);
  const cr = occurrences(withoutPairs, /\r/g);
  const lf = occurrences(withoutPairs, /\n/g);

  const types = [
    ['CRLF', crlf],
    ['CR', cr],
    ['LF', lf]
  ]
    .filter(([, total]) => total > 0)
    .map(([label]) => label);

  return { crlf, cr, lf, types, mixed: types.length > 1 };
}
@@ -162,7 +162,7 @@ export function sanctifyText(
162
162
  * @param {string} text
163
163
  * @returns {string}
164
164
  */
165
export const INVISIBLE_TRASH_REGEX = /[\u00A0\u2000-\u200D\u202F\u2060\u3000\uFEFF\u200E\u200F\u202A-\u202E]+/g;
function purgeInvisibleTrash(text) {
  // Delete every run matched by the pattern; nothing is inserted in its place.
  const withoutInvisibles = text.replace(INVISIBLE_TRASH_REGEX, '');
  return withoutInvisibles;
}
@@ -207,7 +207,7 @@ const BULLETS_REGEX = /[\u2022\u00B7]/g;
207
207
  // Full-width ASCII punctuation: U+FF01 - U+FF5E
208
208
  const FULLWIDTH_PUNCTUATION_REGEX = /[\uFF01-\uFF5E]/g;
209
209
 
210
- function normalizeTypographicJank(text) {
210
+ export function normalizeTypographicJank(text) {
211
211
  return text
212
212
  .replace(SMART_SINGLE_QUOTES_REGEX, "'")
213
213
  .replace(SMART_DOUBLE_QUOTES_REGEX, '"')
@@ -221,7 +221,7 @@ function normalizeTypographicJank(text) {
221
221
 
222
222
 
223
223
 
224
- let EMOJI_REGEX;
224
+ export let EMOJI_REGEX;
225
225
 
226
226
  /**
227
227
  * Try Unicode property escape regex (preferred).
@@ -237,6 +237,7 @@ try {
237
237
  EMOJI_REGEX = /[\u{1F300}-\u{1FAFF}]/gu;
238
238
  }
239
239
 
240
+
240
241
  /**
241
242
  * Removes all emoji characters using Unicode property escapes.
242
243
  * Supports modern environments (Unicode v13+) with fallback.
@@ -250,21 +251,19 @@ function purgeEmojisCharacters(text) {
250
251
 
251
252
 
252
253
// Matches any single line break: Windows "\r\n", old Mac "\r" or Unix "\n".
const NORMALIZE_NEWLINES_REGEX = /\r\n|\r|\n/g;

/**
 * Rewrites every line break in `text` to one consistent sequence.
 *
 * Windows ("\r\n"), old Mac ("\r") and Unix ("\n") breaks are all replaced
 * by `normalized`, which defaults to Unix "\n".
 *
 * @param {string} text - Input string to normalize.
 * @param {string} [normalized='\n'] - Target newline style (e.g. '\n', '\r\n').
 * @returns {string}
 */
function normalizeNewlines(text, normalized = '\n') {
  const unified = text.replace(NORMALIZE_NEWLINES_REGEX, normalized);
  return unified;
}
269
268
 
270
269
 
@@ -336,7 +335,7 @@ function collapseExtraSpaces(text) {
336
335
  * @param {string} text
337
336
  * @returns {string}
338
337
  */
339
export const CONTROL_CHARS_REGEX = /[\u0000-\u0008\u000B\u000C\u000E-\u001F\u007F\u0080-\u009F\u200E\u200F\u202A-\u202E]+/g;
function purgeControlCharacters(text) {
  // Tab (U+0009), LF (U+000A) and CR (U+000D) are outside the pattern and survive.
  const withoutControls = text.replace(CONTROL_CHARS_REGEX, '');
  return withoutControls;
}
  }