agentshield-sdk 11.0.0 → 12.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +88 -79
- package/package.json +2 -2
- package/src/agent-intent.js +359 -672
- package/src/cross-turn.js +215 -563
- package/src/detector-core.js +106 -0
- package/src/ensemble.js +300 -409
- package/src/incident-response.js +265 -0
- package/src/main.js +70 -33
- package/src/mcp-guard.js +4 -0
- package/src/micro-model.js +12 -1
- package/src/ml-detector.js +110 -266
- package/src/normalizer.js +296 -604
- package/src/persistent-learning.js +104 -620
- package/src/semantic-isolation.js +1 -0
- package/src/smart-config.js +557 -705
- package/src/sota-benchmark.js +268 -10
- package/types/index.d.ts +251 -580
package/src/normalizer.js
CHANGED
|
@@ -1,12 +1,12 @@
|
|
|
1
1
|
'use strict';
|
|
2
2
|
|
|
3
3
|
/**
|
|
4
|
-
* Agent Shield — Text
|
|
4
|
+
* Agent Shield — Advanced Text Normalizer (v12.0)
|
|
5
5
|
*
|
|
6
|
-
*
|
|
7
|
-
*
|
|
8
|
-
*
|
|
9
|
-
*
|
|
6
|
+
* Consolidates all text normalization logic into a standalone module.
|
|
7
|
+
* Handles zero-width character stripping, leetspeak reversal, spaced-out
|
|
8
|
+
* character collapsing, context wrapper removal, Unicode escape decoding,
|
|
9
|
+
* and HTML entity decoding.
|
|
10
10
|
*
|
|
11
11
|
* All processing runs locally — no data ever leaves your environment.
|
|
12
12
|
*
|
|
@@ -14,701 +14,396 @@
|
|
|
14
14
|
*/
|
|
15
15
|
|
|
16
16
|
// =========================================================================
|
|
17
|
-
//
|
|
17
|
+
// CONSTANTS
|
|
18
18
|
// =========================================================================
|
|
19
19
|
|
|
20
20
|
/**
|
|
21
|
-
*
|
|
22
|
-
* These are commonly inserted between letters to break pattern matching.
|
|
21
|
+
* Zero-width and invisible Unicode characters to strip.
|
|
23
22
|
* @type {RegExp}
|
|
24
23
|
*/
|
|
25
|
-
const ZERO_WIDTH_RE = /[\u200B\u200C\u200D\uFEFF\u00AD\
|
|
24
|
+
// Matches every occurrence (global flag) of one of these invisible code points:
//   U+200B, U+200C, U+200D  zero-width space, non-joiner, joiner
//   U+200E, U+200F          left-to-right / right-to-left marks
//   U+FEFF                  byte-order mark (zero-width no-break space)
//   U+00AD                  soft hyphen
//   U+2060                  word joiner
//   U+2061–U+2064           invisible math operators
//   U+180E                  Mongolian vowel separator
//   U+034F                  combining grapheme joiner
const ZERO_WIDTH_RE = /[\u200B\u200C\u200D\u200E\u200F\uFEFF\u00AD\u2060\u2061\u2062\u2063\u2064\u180E\u034F]/g;
|
|
26
25
|
|
|
27
26
|
/**
|
|
28
|
-
*
|
|
29
|
-
* (U+E0100–U+E01EF) live in the SMP and require surrogate pair matching.
|
|
30
|
-
* Used in evasion attacks to insert invisible data between visible chars.
|
|
31
|
-
* @type {RegExp}
|
|
32
|
-
*/
|
|
33
|
-
const TAG_CHARS_RE = /\uDB40[\uDC01-\uDC7F\uDD00-\uDDEF]/g;
|
|
34
|
-
|
|
35
|
-
/**
|
|
36
|
-
* Combining diacritical marks used for obfuscation (U+0300–U+036F).
|
|
37
|
-
* @type {RegExp}
|
|
38
|
-
*/
|
|
39
|
-
const COMBINING_MARKS_RE = /[\u0300-\u036F]/g;
|
|
40
|
-
|
|
41
|
-
// =========================================================================
|
|
42
|
-
// HOMOGLYPH MAP (200+ mappings)
|
|
43
|
-
// =========================================================================
|
|
44
|
-
|
|
45
|
-
/**
|
|
46
|
-
* Comprehensive mapping of Unicode lookalikes to ASCII equivalents.
|
|
47
|
-
* Covers Cyrillic, Greek, Cherokee, Georgian, Mathematical, Fullwidth,
|
|
48
|
-
* Enclosed/Circled, Small Caps, IPA, Armenian, superscript/subscript,
|
|
49
|
-
* and common Latin Extended characters.
|
|
50
|
-
*
|
|
27
|
+
* Leetspeak substitution map (character → ASCII letter).
|
|
51
28
|
* @type {Object<string, string>}
|
|
52
29
|
*/
|
|
53
|
-
const
|
|
54
|
-
|
|
55
|
-
'
|
|
56
|
-
'
|
|
57
|
-
'
|
|
58
|
-
'
|
|
59
|
-
'
|
|
60
|
-
|
|
61
|
-
'\
|
|
62
|
-
'\
|
|
63
|
-
'\
|
|
64
|
-
'\
|
|
65
|
-
'\u04BB': 'h', '\u0501': 'd', '\u051B': 'q', '\u051D': 'w',
|
|
66
|
-
|
|
67
|
-
// --- Greek look-alikes ---
|
|
68
|
-
'\u0391': 'A', '\u0392': 'B', '\u0395': 'E', '\u0396': 'Z',
|
|
69
|
-
'\u0397': 'H', '\u0399': 'I', '\u039A': 'K', '\u039C': 'M',
|
|
70
|
-
'\u039D': 'N', '\u039F': 'O', '\u03A1': 'P', '\u03A4': 'T',
|
|
71
|
-
'\u03A5': 'Y', '\u03A7': 'X', '\u03BF': 'o', '\u03B1': 'a',
|
|
72
|
-
'\u03B5': 'e', '\u03B9': 'i', '\u03BA': 'k', '\u03BD': 'v',
|
|
73
|
-
'\u03C1': 'p', '\u03C4': 't', '\u03C5': 'u', '\u03C7': 'x',
|
|
74
|
-
'\u03C9': 'w', '\u03B7': 'n',
|
|
75
|
-
|
|
76
|
-
// --- Armenian look-alikes ---
|
|
77
|
-
'\u0555': 'O', '\u0585': 'o', '\u0578': 'n', '\u057C': 'n',
|
|
78
|
-
'\u0570': 'h', '\u0561': 'a', '\u0575': 'u', '\u0572': 'q',
|
|
79
|
-
'\u0565': 'e', '\u056B': 'i', '\u0574': 'm', '\u057D': 's',
|
|
80
|
-
|
|
81
|
-
// --- Cherokee look-alikes ---
|
|
82
|
-
'\u13A0': 'D', '\u13A1': 'R', '\u13A2': 'T', '\u13A9': 'Y',
|
|
83
|
-
'\u13AA': 'A', '\u13AB': 'J', '\u13AC': 'S', '\u13B3': 'W',
|
|
84
|
-
'\u13B7': 'M', '\u13BB': 'H', '\u13C0': 'G', '\u13C2': 'h',
|
|
85
|
-
'\u13C3': 'Z', '\u13CF': 'b', '\u13D2': 'R', '\u13DA': 'V',
|
|
86
|
-
'\u13DE': 'L', '\u13DF': 'C', '\u13E2': 'P', '\u13E6': 'K',
|
|
87
|
-
|
|
88
|
-
// --- Georgian look-alikes ---
|
|
89
|
-
'\u10D0': 'a', '\u10D5': 'b', '\u10D3': 'd', '\u10DA': 'l',
|
|
90
|
-
'\u10DD': 'o', '\u10DE': 'p', '\u10E1': 's', '\u10E2': 't',
|
|
91
|
-
'\u10E3': 'u', '\u10EF': 'j',
|
|
92
|
-
|
|
93
|
-
// --- Latin Extended (accented → base) ---
|
|
94
|
-
// A variants
|
|
95
|
-
'\u00C0': 'A', '\u00C1': 'A', '\u00C2': 'A', '\u00C3': 'A',
|
|
96
|
-
'\u00C4': 'A', '\u00C5': 'A', '\u00E0': 'a', '\u00E1': 'a',
|
|
97
|
-
'\u00E2': 'a', '\u00E3': 'a', '\u00E4': 'a', '\u00E5': 'a',
|
|
98
|
-
'\u0100': 'A', '\u0101': 'a', '\u0102': 'A', '\u0103': 'a',
|
|
99
|
-
'\u0104': 'A', '\u0105': 'a',
|
|
100
|
-
// E variants
|
|
101
|
-
'\u00C8': 'E', '\u00C9': 'E', '\u00CA': 'E', '\u00CB': 'E',
|
|
102
|
-
'\u00E8': 'e', '\u00E9': 'e', '\u00EA': 'e', '\u00EB': 'e',
|
|
103
|
-
'\u0112': 'E', '\u0113': 'e', '\u0114': 'E', '\u0115': 'e',
|
|
104
|
-
'\u0116': 'E', '\u0117': 'e', '\u0118': 'E', '\u0119': 'e',
|
|
105
|
-
// I variants
|
|
106
|
-
'\u00CC': 'I', '\u00CD': 'I', '\u00CE': 'I', '\u00CF': 'I',
|
|
107
|
-
'\u00EC': 'i', '\u00ED': 'i', '\u00EE': 'i', '\u00EF': 'i',
|
|
108
|
-
'\u012A': 'I', '\u012B': 'i', '\u012C': 'I', '\u012D': 'i',
|
|
109
|
-
'\u012E': 'I', '\u012F': 'i', '\u0130': 'I', '\u0131': 'i',
|
|
110
|
-
// O variants
|
|
111
|
-
'\u00D2': 'O', '\u00D3': 'O', '\u00D4': 'O', '\u00D5': 'O',
|
|
112
|
-
'\u00D6': 'O', '\u00D8': 'O', '\u00F2': 'o', '\u00F3': 'o',
|
|
113
|
-
'\u00F4': 'o', '\u00F5': 'o', '\u00F6': 'o', '\u00F8': 'o',
|
|
114
|
-
'\u014C': 'O', '\u014D': 'o', '\u014E': 'O', '\u014F': 'o',
|
|
115
|
-
'\u0150': 'O', '\u0151': 'o',
|
|
116
|
-
// U variants
|
|
117
|
-
'\u00D9': 'U', '\u00DA': 'U', '\u00DB': 'U', '\u00DC': 'U',
|
|
118
|
-
'\u00F9': 'u', '\u00FA': 'u', '\u00FB': 'u', '\u00FC': 'u',
|
|
119
|
-
'\u016A': 'U', '\u016B': 'u', '\u016C': 'U', '\u016D': 'u',
|
|
120
|
-
'\u016E': 'U', '\u016F': 'u', '\u0170': 'U', '\u0171': 'u',
|
|
121
|
-
// Other Latin Extended
|
|
122
|
-
'\u00C7': 'C', '\u00E7': 'c', '\u00D1': 'N', '\u00F1': 'n',
|
|
123
|
-
'\u00DD': 'Y', '\u00FD': 'y', '\u00FF': 'y',
|
|
124
|
-
'\u0144': 'n', '\u0146': 'n', '\u0148': 'n',
|
|
125
|
-
'\u015A': 'S', '\u015B': 's', '\u015C': 'S', '\u015D': 's',
|
|
126
|
-
'\u015E': 'S', '\u015F': 's', '\u0160': 'S', '\u0161': 's',
|
|
127
|
-
'\u010C': 'C', '\u010D': 'c', '\u010E': 'D', '\u010F': 'd',
|
|
128
|
-
'\u0158': 'R', '\u0159': 'r', '\u0164': 'T', '\u0165': 't',
|
|
129
|
-
'\u017D': 'Z', '\u017E': 'z', '\u017B': 'Z', '\u017C': 'z',
|
|
130
|
-
'\u017A': 'z', '\u0179': 'Z',
|
|
131
|
-
'\u0141': 'L', '\u0142': 'l', '\u0110': 'D', '\u0111': 'd',
|
|
132
|
-
|
|
133
|
-
// --- Fullwidth (lowercase) ---
|
|
134
|
-
'\uFF41': 'a', '\uFF42': 'b', '\uFF43': 'c', '\uFF44': 'd',
|
|
135
|
-
'\uFF45': 'e', '\uFF46': 'f', '\uFF47': 'g', '\uFF48': 'h',
|
|
136
|
-
'\uFF49': 'i', '\uFF4A': 'j', '\uFF4B': 'k', '\uFF4C': 'l',
|
|
137
|
-
'\uFF4D': 'm', '\uFF4E': 'n', '\uFF4F': 'o', '\uFF50': 'p',
|
|
138
|
-
'\uFF51': 'q', '\uFF52': 'r', '\uFF53': 's', '\uFF54': 't',
|
|
139
|
-
'\uFF55': 'u', '\uFF56': 'v', '\uFF57': 'w', '\uFF58': 'x',
|
|
140
|
-
'\uFF59': 'y', '\uFF5A': 'z',
|
|
141
|
-
// --- Fullwidth (uppercase) ---
|
|
142
|
-
'\uFF21': 'A', '\uFF22': 'B', '\uFF23': 'C', '\uFF24': 'D',
|
|
143
|
-
'\uFF25': 'E', '\uFF26': 'F', '\uFF27': 'G', '\uFF28': 'H',
|
|
144
|
-
'\uFF29': 'I', '\uFF2A': 'J', '\uFF2B': 'K', '\uFF2C': 'L',
|
|
145
|
-
'\uFF2D': 'M', '\uFF2E': 'N', '\uFF2F': 'O', '\uFF30': 'P',
|
|
146
|
-
'\uFF31': 'Q', '\uFF32': 'R', '\uFF33': 'S', '\uFF34': 'T',
|
|
147
|
-
'\uFF35': 'U', '\uFF36': 'V', '\uFF37': 'W', '\uFF38': 'X',
|
|
148
|
-
'\uFF39': 'Y', '\uFF3A': 'Z',
|
|
149
|
-
// --- Fullwidth digits ---
|
|
150
|
-
'\uFF10': '0', '\uFF11': '1', '\uFF12': '2', '\uFF13': '3',
|
|
151
|
-
'\uFF14': '4', '\uFF15': '5', '\uFF16': '6', '\uFF17': '7',
|
|
152
|
-
'\uFF18': '8', '\uFF19': '9',
|
|
153
|
-
|
|
154
|
-
// --- Enclosed/Circled letters ---
|
|
155
|
-
'\u24B6': 'A', '\u24B7': 'B', '\u24B8': 'C', '\u24B9': 'D',
|
|
156
|
-
'\u24BA': 'E', '\u24BB': 'F', '\u24BC': 'G', '\u24BD': 'H',
|
|
157
|
-
'\u24BE': 'I', '\u24BF': 'J', '\u24C0': 'K', '\u24C1': 'L',
|
|
158
|
-
'\u24C2': 'M', '\u24C3': 'N', '\u24C4': 'O', '\u24C5': 'P',
|
|
159
|
-
'\u24C6': 'Q', '\u24C7': 'R', '\u24C8': 'S', '\u24C9': 'T',
|
|
160
|
-
'\u24CA': 'U', '\u24CB': 'V', '\u24CC': 'W', '\u24CD': 'X',
|
|
161
|
-
'\u24CE': 'Y', '\u24CF': 'Z',
|
|
162
|
-
'\u24D0': 'a', '\u24D1': 'b', '\u24D2': 'c', '\u24D3': 'd',
|
|
163
|
-
'\u24D4': 'e', '\u24D5': 'f', '\u24D6': 'g', '\u24D7': 'h',
|
|
164
|
-
'\u24D8': 'i', '\u24D9': 'j', '\u24DA': 'k', '\u24DB': 'l',
|
|
165
|
-
'\u24DC': 'm', '\u24DD': 'n', '\u24DE': 'o', '\u24DF': 'p',
|
|
166
|
-
'\u24E0': 'q', '\u24E1': 'r', '\u24E2': 's', '\u24E3': 't',
|
|
167
|
-
'\u24E4': 'u', '\u24E5': 'v', '\u24E6': 'w', '\u24E7': 'x',
|
|
168
|
-
'\u24E8': 'y', '\u24E9': 'z',
|
|
169
|
-
|
|
170
|
-
// --- Small Caps (Unicode phonetic) ---
|
|
171
|
-
'\u1D00': 'A', '\u0299': 'B', '\u1D04': 'C', '\u1D05': 'D',
|
|
172
|
-
'\u1D07': 'E', '\u0262': 'G', '\u029C': 'H', '\u026A': 'I',
|
|
173
|
-
'\u1D0A': 'J', '\u1D0B': 'K', '\u029F': 'L', '\u1D0D': 'M',
|
|
174
|
-
'\u0274': 'N', '\u1D0F': 'O', '\u1D18': 'P', '\u0280': 'R',
|
|
175
|
-
'\u1D1B': 'T', '\u1D1C': 'U', '\u1D20': 'V', '\u1D21': 'W',
|
|
176
|
-
|
|
177
|
-
// --- IPA / Phonetic extensions ---
|
|
178
|
-
'\u0250': 'a', '\u0253': 'b', '\u0254': 'c', '\u0256': 'd',
|
|
179
|
-
'\u025B': 'e', '\u025F': 'f', '\u0260': 'g', '\u0266': 'h',
|
|
180
|
-
'\u0268': 'i', '\u026D': 'l', '\u0271': 'm', '\u0272': 'n',
|
|
181
|
-
'\u0275': 'o', '\u0278': 'p', '\u027E': 'r', '\u0282': 's',
|
|
182
|
-
'\u0288': 't', '\u028A': 'u', '\u028B': 'v', '\u0290': 'z',
|
|
183
|
-
'\u0237': 'j', '\u0261': 'g',
|
|
184
|
-
|
|
185
|
-
// --- Mathematical Alphanumeric Symbols (bold italic) ---
|
|
186
|
-
'\uD835\uDC1A': 'a', '\uD835\uDC1B': 'b', '\uD835\uDC1C': 'c',
|
|
187
|
-
'\uD835\uDC1D': 'd', '\uD835\uDC1E': 'e', '\uD835\uDC1F': 'f',
|
|
188
|
-
'\uD835\uDC20': 'g', '\uD835\uDC21': 'h', '\uD835\uDC22': 'i',
|
|
189
|
-
'\uD835\uDC23': 'j', '\uD835\uDC24': 'k', '\uD835\uDC25': 'l',
|
|
190
|
-
'\uD835\uDC26': 'm', '\uD835\uDC27': 'n', '\uD835\uDC28': 'o',
|
|
191
|
-
'\uD835\uDC29': 'p', '\uD835\uDC2A': 'q', '\uD835\uDC2B': 'r',
|
|
192
|
-
'\uD835\uDC2C': 's', '\uD835\uDC2D': 't', '\uD835\uDC2E': 'u',
|
|
193
|
-
'\uD835\uDC2F': 'v', '\uD835\uDC30': 'w', '\uD835\uDC31': 'x',
|
|
194
|
-
'\uD835\uDC32': 'y', '\uD835\uDC33': 'z',
|
|
195
|
-
|
|
196
|
-
// --- Superscript / subscript ---
|
|
197
|
-
'\u00B2': '2', '\u00B3': '3', '\u00B9': '1', '\u2070': '0',
|
|
198
|
-
'\u2071': 'i', '\u2074': '4', '\u2075': '5', '\u2076': '6',
|
|
199
|
-
'\u2077': '7', '\u2078': '8', '\u2079': '9', '\u207A': '+',
|
|
200
|
-
'\u207B': '-', '\u207F': 'n',
|
|
201
|
-
'\u2080': '0', '\u2081': '1', '\u2082': '2', '\u2083': '3',
|
|
202
|
-
'\u2084': '4', '\u2090': 'a', '\u2091': 'e', '\u2092': 'o',
|
|
203
|
-
'\u2093': 'x',
|
|
204
|
-
|
|
205
|
-
// --- Modifier letters (superscript-like) ---
|
|
206
|
-
'\u02B0': 'h', '\u02B1': 'h', '\u02B2': 'j', '\u02B3': 'r',
|
|
207
|
-
'\u02B4': 'r', '\u02B7': 'w', '\u02B8': 'y', '\u02E0': 'g',
|
|
208
|
-
'\u02E1': 'l', '\u02E2': 's', '\u02E3': 'x', '\u1D43': 'a',
|
|
209
|
-
'\u1D47': 'b', '\u1D48': 'd', '\u1D49': 'e', '\u1D4D': 'g',
|
|
210
|
-
'\u1D4F': 'k', '\u1D50': 'm', '\u1D52': 'o', '\u1D56': 'p',
|
|
211
|
-
'\u1D57': 't', '\u1D58': 'u', '\u1D5B': 'v',
|
|
30
|
+
const LEET_MAP = {
  // Single digits and symbols that stand in for one letter.
  '0': 'o', '1': 'i', '3': 'e', '4': 'a', '5': 's',
  '7': 't', '8': 'b', '9': 'g', '@': 'a', '!': 'i',
  '$': 's', '+': 't', '(': 'c', '|': 'l',

  // Multi-character sequences. reverseLeetspeak looks this map up one
  // character at a time, so these keys are never hit there; the same pairs
  // are listed in MULTI_LEET, which is what actually matches them. They are
  // kept so the map's contents stay unchanged. ('}{' used to be declared
  // twice in this literal; the redundant duplicate key has been removed.)
  '}{': 'h', '/\\': 'a', '\\/': 'v',
  '|3': 'b', '|)': 'd', '|<': 'k', '|_': 'l',
  '|-|': 'h', '|\\|': 'n', '|2': 'r',

  // Cyrillic letters that look like Latin ones (lowercase, then uppercase).
  '\u0430': 'a', '\u0435': 'e', '\u043E': 'o', '\u0440': 'p',
  '\u0441': 'c', '\u0443': 'y', '\u0445': 'x',
  '\u0410': 'A', '\u0415': 'E', '\u041E': 'O', '\u0420': 'P',
  '\u0421': 'C', '\u0423': 'Y', '\u0425': 'X'
};
|
|
213
43
|
|
|
214
|
-
// =========================================================================
|
|
215
|
-
// UNICODE WHITESPACE SET
|
|
216
|
-
// =========================================================================
|
|
217
|
-
|
|
218
44
|
/**
|
|
219
|
-
*
|
|
220
|
-
* @type {
|
|
45
|
+
* Multi-character leet sequences sorted by length (longest first for greedy matching).
|
|
46
|
+
* @type {Array<[string, string]>}
|
|
221
47
|
*/
|
|
222
|
-
const
|
|
223
|
-
|
|
224
|
-
|
|
225
|
-
|
|
226
|
-
|
|
227
|
-
|
|
228
|
-
/**
|
|
229
|
-
* Common leet speak substitutions (number/symbol → letter).
|
|
230
|
-
* @type {Object<string, string>}
|
|
231
|
-
*/
|
|
232
|
-
const LEET_MAP = {
|
|
233
|
-
'0': 'o', '1': 'i', '2': 'z', '3': 'e', '4': 'a',
|
|
234
|
-
'5': 's', '6': 'g', '7': 't', '8': 'b', '9': 'g',
|
|
235
|
-
'@': 'a', '$': 's', '!': 'i', '|': 'l', '+': 't',
|
|
236
|
-
'(': 'c', '{': 'c', '<': 'c', '#': 'h', '^': 'a',
|
|
237
|
-
};
|
|
48
|
+
const MULTI_LEET = [
  // Three-character sequences come first so they win over their
  // two-character prefixes (e.g. '|-|' before anything starting with '|').
  ['|\\|', 'n'],
  ['|-|', 'h'],
  // Two-character sequences.
  ['/\\', 'a'],
  ['\\/', 'v'],
  ['}{', 'h'],
  ['|3', 'b'],
  ['|)', 'd'],
  ['|<', 'k'],
  ['|_', 'l'],
  ['|2', 'r']
];
|
|
238
53
|
|
|
239
54
|
/**
|
|
240
|
-
*
|
|
241
|
-
* @type {
|
|
55
|
+
* Context wrapper phrases that attackers prepend to bypass filters.
|
|
56
|
+
* @type {RegExp[]}
|
|
242
57
|
*/
|
|
243
|
-
const
|
|
244
|
-
[
|
|
245
|
-
[
|
|
246
|
-
|
|
247
|
-
[
|
|
248
|
-
[
|
|
249
|
-
[
|
|
250
|
-
|
|
251
|
-
[
|
|
252
|
-
[/ph/gi, 'f'],
|
|
58
|
+
const CONTEXT_WRAPPERS = [
  // "for research purposes:", "for testing -", "for safety," ...
  /^for\s+(?:research|educational|testing|academic|safety)\s*(?:purposes?\s*)?[:\-,]\s*/im,

  // "hypothetically,", "in theory:" ...
  /^(?:hypothetically|theoretically|in\s+theory)\s*[,:\-]\s*/im,

  // "imagine that you are ", "pretend you were ", "suppose " ...
  /^(?:imagine|pretend|suppose|assume)\s+(?:that\s+)?(?:you\s+(?:are|were)\s+)?/im,

  // "as a thought experiment," / "as a experiment:"
  /^(?:as\s+a\s+(?:thought\s+)?experiment)\s*[,:\-]\s*/im,

  // "just out of curiosity,", "interest:" ...
  /^(?:just\s+)?(?:out\s+of\s+)?(?:curiosity|interest)\s*[,:\-]\s*/im,

  // "in a fictional scenario,", "in a hypothetical world:" ...
  /^(?:in\s+a\s+(?:fictional|hypothetical)\s+(?:scenario|world|context))\s*[,:\-]\s*/im,

  // "please help me understand how to ", "explain how " ...
  /^(?:please\s+)?(?:help\s+me\s+)?(?:understand|explain)\s+(?:how\s+(?:to\s+)?)?/im,

  // "I'm a security researcher:", "im researcher," ...
  /^(?:i'?m\s+(?:a\s+)?(?:security\s+)?researcher)\s*[,:\-]\s*/im
];
|
|
254
68
|
|
|
255
|
-
// =========================================================================
|
|
256
|
-
// MARKDOWN PATTERN
|
|
257
|
-
// =========================================================================
|
|
258
|
-
|
|
259
69
|
/**
|
|
260
|
-
*
|
|
261
|
-
*
|
|
262
|
-
* @type {RegExp}
|
|
70
|
+
* Named HTML entities map (common subset).
|
|
71
|
+
* @type {Object<string, string>}
|
|
263
72
|
*/
|
|
264
|
-
const
|
|
73
|
+
const HTML_ENTITIES = {
  // Markup-significant characters and quotes.
  amp: '&', lt: '<', gt: '>', quot: '"', apos: "'",

  // Whitespace.
  nbsp: ' ', tab: '\t', newline: '\n',

  // Brackets and slashes.
  lpar: '(', rpar: ')', lsqb: '[', rsqb: ']',
  lcub: '{', rcub: '}', sol: '/', bsol: '\\',

  // Punctuation.
  comma: ',', period: '.', colon: ':', semi: ';',
  excl: '!', quest: '?', num: '#', ast: '*',

  // Operators and other symbols.
  plus: '+', equals: '=', hyphen: '-', lowbar: '_',
  percnt: '%', dollar: '$', commat: '@', circ: '^',
  tilde: '~', grave: '`', vert: '|'
};
|
|
265
84
|
|
|
266
85
|
// =========================================================================
|
|
267
|
-
//
|
|
86
|
+
// NORMALIZER FUNCTIONS
|
|
268
87
|
// =========================================================================
|
|
269
88
|
|
|
270
89
|
/**
 * Strip zero-width and other invisible Unicode characters from a string.
 *
 * @param {string} text - Input to clean.
 * @returns {string} `text` with every character matched by ZERO_WIDTH_RE
 *   removed. Falsy input yields ''; a truthy non-string value is handed
 *   back unchanged.
 */
function stripZeroWidth(text) {
  const isNonEmptyString = typeof text === 'string' && text.length > 0;
  if (!isNonEmptyString) {
    return text || '';
  }
  return text.replace(ZERO_WIDTH_RE, '');
}
|
|
280
98
|
|
|
281
99
|
/**
 * Map leetspeak substitutions back to plain letters.
 *
 * Runs in two passes: every MULTI_LEET sequence is replaced first (in list
 * order), then each remaining UTF-16 code unit is looked up in LEET_MAP and
 * swapped when an entry exists.
 *
 * @param {string} text - Input that may contain leetspeak.
 * @returns {string} The de-leeted text. Falsy input yields ''; a truthy
 *   non-string value is handed back unchanged.
 */
function reverseLeetspeak(text) {
  if (typeof text !== 'string' || text === '') {
    return text || '';
  }

  // Make a literal sequence safe to embed in a RegExp source.
  const escapeForRegExp = (literal) => literal.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');

  // Pass 1: multi-character sequences, so '|-|' becomes 'h' before the
  // single-character pass could rewrite its individual '|' characters.
  let expanded = text;
  for (const [sequence, letter] of MULTI_LEET) {
    expanded = expanded.replace(new RegExp(escapeForRegExp(sequence), 'g'), letter);
  }

  // Pass 2: one-to-one substitutions, code unit by code unit.
  return expanded
    .split('')
    .map((unit) => (LEET_MAP[unit] === undefined ? unit : LEET_MAP[unit]))
    .join('');
}
|
|
313
126
|
|
|
314
127
|
/**
 * Collapse spaced-out letter obfuscation (e.g. "i g n o r e" → "ignore").
 *
 * The text is handled line by line. A line is collapsed only when, once
 * trimmed, it consists solely of at least four single ASCII letters, each
 * separated from the next by one to three whitespace characters. Its leading
 * indentation is kept; inner and trailing whitespace is dropped. Every other
 * line is passed through untouched.
 *
 * @param {string} text - Input, possibly multi-line.
 * @returns {string} Text with qualifying lines collapsed. Falsy input yields
 *   ''; a truthy non-string value is handed back unchanged.
 */
function collapseCharSpacing(text) {
  if (typeof text !== 'string' || text.length === 0) {
    return text || '';
  }

  // One letter, then three or more "1–3 whitespace + letter" groups.
  const spacedLetters = /^(\s*)([a-zA-Z])((\s{1,3})[a-zA-Z]){3,}(\s*)$/;

  return text
    .split('\n')
    .map((line) => {
      const core = line.trim();
      if (!spacedLetters.test(core)) {
        return line;
      }
      const indent = line.match(/^\s*/)[0];
      return indent + core.replace(/\s+/g, '');
    })
    .join('\n');
}
|
|
441
156
|
|
|
442
157
|
/**
 * Remove common "context wrapper" lead-ins used to disguise a prompt.
 *
 * Each CONTEXT_WRAPPERS pattern is applied once, in list order, to the
 * output of the previous one; the first match of each pattern is deleted.
 *
 * @param {string} text - Input to clean.
 * @returns {string} Text without the matched wrapper phrases. Falsy input
 *   yields ''; a truthy non-string value is handed back unchanged.
 */
function stripContextWrappers(text) {
  if (typeof text !== 'string' || text === '') {
    return text || '';
  }
  return CONTEXT_WRAPPERS.reduce(
    (remaining, wrapper) => remaining.replace(wrapper, ''),
    text
  );
}
|
|
466
172
|
|
|
467
173
|
/**
 * Decode the textual escape forms attackers use to hide characters.
 *
 * Applied in this order, each step working on the previous step's output:
 *   1. Unicode escapes       \uXXXX
 *   2. hex escapes           \xXX
 *   3. percent-encoding      %XX (as UTF-8 where valid)
 *   4. numeric HTML entities &#xHH; then &#DDD;
 *   5. named HTML entities   &name; (case-insensitive, via HTML_ENTITIES)
 *
 * Anything that cannot be decoded (code point out of range, unknown entity
 * name) is left in the text exactly as written.
 *
 * @param {string} text - Input that may contain escape sequences.
 * @returns {string} The decoded text. Falsy input yields ''; a truthy
 *   non-string value is handed back unchanged.
 */
function decodeUnicodeEscapes(text) {
  if (!text || typeof text !== 'string') return text || '';

  // String.fromCharCode never throws, so no guard is needed around it.
  const fromHexUnit = (_match, hex) => String.fromCharCode(parseInt(hex, 16));

  // String.fromCodePoint throws a RangeError above U+10FFFF; in that case
  // the entity is kept verbatim.
  const fromEntity = (match, digits, radix) => {
    try {
      return String.fromCodePoint(parseInt(digits, radix));
    } catch (_e) {
      return match;
    }
  };

  let result = text
    .replace(/\\u([0-9a-fA-F]{4})/g, fromHexUnit)
    .replace(/\\x([0-9a-fA-F]{2})/g, fromHexUnit);

  // Decode each run of %XX bytes on its own. Decoding the whole string at
  // once fails on a single stray '%' (e.g. "100%"), and the byte-wise
  // fallback would then corrupt every valid multi-byte UTF-8 sequence in
  // the text. Working per run confines the fallback to the run that is
  // actually malformed.
  result = result.replace(/(?:%[0-9a-fA-F]{2})+/g, (run) => {
    try {
      return decodeURIComponent(run);
    } catch (_e) {
      // Not valid UTF-8: treat every byte as a single code unit.
      return run.replace(/%([0-9a-fA-F]{2})/g, fromHexUnit);
    }
  });

  // Numeric entities: hexadecimal first, then decimal.
  result = result.replace(/&#x([0-9a-fA-F]+);/gi, (match, hex) => fromEntity(match, hex, 16));
  result = result.replace(/&#(\d+);/g, (match, dec) => fromEntity(match, dec, 10));

  // Named entities. Only own keys of the table count: a bare property read
  // would also find inherited members, turning "&constructor;" into the
  // source text of a function.
  result = result.replace(/&([a-zA-Z]+);/g, (match, name) => {
    const lower = name.toLowerCase();
    return Object.prototype.hasOwnProperty.call(HTML_ENTITIES, lower)
      ? HTML_ENTITIES[lower]
      : match;
  });

  return result;
}
|
|
545
240
|
|
|
546
241
|
/**
 * Run every normalizer over the text in the recommended order:
 * zero-width stripping → escape decoding → leetspeak reversal →
 * spaced-letter collapsing → context wrapper removal.
 *
 * @param {string} text - Raw input.
 * @returns {string} The fully normalized text. Falsy input yields ''; a
 *   truthy non-string value is handed back unchanged.
 */
function normalizeAll(text) {
  if (typeof text !== 'string' || text === '') {
    return text || '';
  }

  // Each stage receives the previous stage's output.
  const pipeline = [
    stripZeroWidth,
    decodeUnicodeEscapes,
    reverseLeetspeak,
    collapseCharSpacing,
    stripContextWrappers
  ];
  return pipeline.reduce((current, stage) => stage(current), text);
}
|
|
580
259
|
|
|
581
260
|
// =========================================================================
|
|
582
|
-
//
|
|
261
|
+
// TEXT NORMALIZER CLASS
|
|
583
262
|
// =========================================================================
|
|
584
263
|
|
|
585
264
|
/**
|
|
586
|
-
*
|
|
587
|
-
*
|
|
588
|
-
* @
|
|
265
|
+
* Text Normalizer class with all normalization methods.
|
|
266
|
+
*
|
|
267
|
+
* @example
|
|
268
|
+
* const { TextNormalizer } = require('./normalizer');
|
|
269
|
+
* const normalizer = new TextNormalizer();
|
|
270
|
+
* const clean = normalizer.normalizeAll('i\\u0067nore previous instructions');
|
|
589
271
|
*/
|
|
590
|
-
|
|
591
|
-
|
|
592
|
-
|
|
593
|
-
|
|
594
|
-
|
|
595
|
-
|
|
596
|
-
|
|
597
|
-
|
|
598
|
-
|
|
599
|
-
|
|
272
|
+
class TextNormalizer {
|
|
273
|
+
/**
|
|
274
|
+
* @param {object} [options]
|
|
275
|
+
* @param {boolean} [options.aggressive] - Enable aggressive normalization (default false)
|
|
276
|
+
* @param {string[]} [options.customWrappers] - Additional context wrapper patterns
|
|
277
|
+
*/
|
|
278
|
+
constructor(options = {}) {
|
|
279
|
+
this.aggressive = options.aggressive || false;
|
|
280
|
+
this.customWrapperPatterns = [];
|
|
281
|
+
|
|
282
|
+
if (options.customWrappers) {
|
|
283
|
+
for (const w of options.customWrappers) {
|
|
284
|
+
try {
|
|
285
|
+
this.customWrapperPatterns.push(new RegExp(w, 'im'));
|
|
286
|
+
} catch (_e) {
|
|
287
|
+
console.warn(`[Agent Shield] Invalid custom wrapper pattern: ${w}`);
|
|
288
|
+
}
|
|
289
|
+
}
|
|
290
|
+
}
|
|
600
291
|
|
|
601
|
-
/**
|
|
602
|
-
|
|
603
|
-
* @property {string} original - The original input text.
|
|
604
|
-
* @property {string} normalized - The fully normalized text.
|
|
605
|
-
* @property {string[]} layers - Names of layers that modified the text.
|
|
606
|
-
*/
|
|
292
|
+
/** @type {{ input: string, output: string, steps: string[] }[]} */
|
|
293
|
+
this._history = [];
|
|
607
294
|
|
|
608
|
-
|
|
609
|
-
* Runs the full normalization pipeline on input text.
|
|
610
|
-
*
|
|
611
|
-
* @param {string} text - Input text to normalize.
|
|
612
|
-
* @param {object} [options]
|
|
613
|
-
* @param {string[]} [options.skip] - Layer names to skip (e.g., ['case_fold']).
|
|
614
|
-
* @param {string[]} [options.only] - Only run these layers (overrides skip).
|
|
615
|
-
* @returns {NormalizationResult}
|
|
616
|
-
*/
|
|
617
|
-
function normalize(text, options = {}) {
|
|
618
|
-
if (!text || typeof text !== 'string') {
|
|
619
|
-
return { original: text || '', normalized: text || '', layers: [] };
|
|
295
|
+
console.log('[Agent Shield] TextNormalizer initialized');
|
|
620
296
|
}
|
|
621
297
|
|
|
622
|
-
|
|
623
|
-
|
|
624
|
-
|
|
625
|
-
|
|
298
|
+
/**
|
|
299
|
+
* Remove zero-width and invisible Unicode characters.
|
|
300
|
+
* @param {string} text
|
|
301
|
+
* @returns {string}
|
|
302
|
+
*/
|
|
303
|
+
stripZeroWidth(text) {
|
|
304
|
+
return stripZeroWidth(text);
|
|
305
|
+
}
|
|
626
306
|
|
|
627
|
-
|
|
628
|
-
|
|
629
|
-
|
|
307
|
+
/**
|
|
308
|
+
* Convert leetspeak substitutions back to ASCII.
|
|
309
|
+
* @param {string} text
|
|
310
|
+
* @returns {string}
|
|
311
|
+
*/
|
|
312
|
+
reverseLeetspeak(text) {
|
|
313
|
+
return reverseLeetspeak(text);
|
|
314
|
+
}
|
|
630
315
|
|
|
631
|
-
|
|
632
|
-
|
|
633
|
-
|
|
634
|
-
|
|
635
|
-
|
|
316
|
+
/**
|
|
317
|
+
* Collapse spaced-out character obfuscation.
|
|
318
|
+
* @param {string} text
|
|
319
|
+
* @returns {string}
|
|
320
|
+
*/
|
|
321
|
+
collapseCharSpacing(text) {
|
|
322
|
+
return collapseCharSpacing(text);
|
|
636
323
|
}
|
|
637
324
|
|
|
638
|
-
|
|
639
|
-
|
|
640
|
-
|
|
641
|
-
|
|
642
|
-
|
|
643
|
-
|
|
325
|
+
/**
|
|
326
|
+
* Remove context wrapper phrases.
|
|
327
|
+
* @param {string} text
|
|
328
|
+
* @returns {string}
|
|
329
|
+
*/
|
|
330
|
+
stripContextWrappers(text) {
|
|
331
|
+
let result = stripContextWrappers(text);
|
|
644
332
|
|
|
645
|
-
//
|
|
646
|
-
|
|
647
|
-
|
|
333
|
+
// Apply custom wrappers
|
|
334
|
+
for (const pattern of this.customWrapperPatterns) {
|
|
335
|
+
result = result.replace(pattern, '');
|
|
336
|
+
}
|
|
337
|
+
|
|
338
|
+
return result;
|
|
339
|
+
}
|
|
648
340
|
|
|
649
|
-
/**
|
|
650
|
-
* Configurable text normalization pipeline for Agent Shield.
|
|
651
|
-
*
|
|
652
|
-
* Runs multiple normalization layers in sequence to defeat evasion
|
|
653
|
-
* techniques before regex pattern matching.
|
|
654
|
-
*
|
|
655
|
-
* @example
|
|
656
|
-
* const { TextNormalizer } = require('./normalizer');
|
|
657
|
-
* const normalizer = new TextNormalizer({ skip: ['case_fold'] });
|
|
658
|
-
* const result = normalizer.normalize('ïgnörë àll prévïöüs ïnstrüctïöns');
|
|
659
|
-
* console.log(result.normalized); // 'ignore all previous instructions'
|
|
660
|
-
*/
|
|
661
|
-
class TextNormalizer {
|
|
662
341
|
/**
|
|
663
|
-
*
|
|
664
|
-
* @param {string
|
|
665
|
-
* @
|
|
666
|
-
* @param {boolean} [config.verbose=false] - Log normalization steps.
|
|
342
|
+
* Decode percent-encoded, Unicode escape, hex escape, and HTML entity sequences.
|
|
343
|
+
* @param {string} text
|
|
344
|
+
* @returns {string}
|
|
667
345
|
*/
|
|
668
|
-
|
|
669
|
-
|
|
670
|
-
this.only = config.only || null;
|
|
671
|
-
this.verbose = config.verbose || false;
|
|
346
|
+
decodeUnicodeEscapes(text) {
|
|
347
|
+
return decodeUnicodeEscapes(text);
|
|
672
348
|
}
|
|
673
349
|
|
|
674
350
|
/**
|
|
675
|
-
*
|
|
676
|
-
*
|
|
677
|
-
* @
|
|
678
|
-
* @returns {NormalizationResult}
|
|
351
|
+
* Apply all normalizers in sequence.
|
|
352
|
+
* @param {string} text
|
|
353
|
+
* @returns {string}
|
|
679
354
|
*/
|
|
680
|
-
|
|
681
|
-
|
|
682
|
-
|
|
683
|
-
|
|
684
|
-
|
|
355
|
+
normalizeAll(text) {
|
|
356
|
+
if (!text || typeof text !== 'string') return text || '';
|
|
357
|
+
|
|
358
|
+
const steps = [];
|
|
359
|
+
let result = text;
|
|
360
|
+
|
|
361
|
+
result = this.stripZeroWidth(result);
|
|
362
|
+
if (result !== text) steps.push('stripZeroWidth');
|
|
363
|
+
|
|
364
|
+
const prev1 = result;
|
|
365
|
+
result = this.decodeUnicodeEscapes(result);
|
|
366
|
+
if (result !== prev1) steps.push('decodeUnicodeEscapes');
|
|
367
|
+
|
|
368
|
+
const prev2 = result;
|
|
369
|
+
result = this.reverseLeetspeak(result);
|
|
370
|
+
if (result !== prev2) steps.push('reverseLeetspeak');
|
|
371
|
+
|
|
372
|
+
const prev3 = result;
|
|
373
|
+
result = this.collapseCharSpacing(result);
|
|
374
|
+
if (result !== prev3) steps.push('collapseCharSpacing');
|
|
375
|
+
|
|
376
|
+
const prev4 = result;
|
|
377
|
+
result = this.stripContextWrappers(result);
|
|
378
|
+
if (result !== prev4) steps.push('stripContextWrappers');
|
|
379
|
+
|
|
380
|
+
// Aggressive mode: apply a second pass
|
|
381
|
+
if (this.aggressive && steps.length > 0) {
|
|
382
|
+
result = stripZeroWidth(result);
|
|
383
|
+
result = decodeUnicodeEscapes(result);
|
|
384
|
+
result = reverseLeetspeak(result);
|
|
385
|
+
result = collapseCharSpacing(result);
|
|
386
|
+
steps.push('aggressive_second_pass');
|
|
685
387
|
}
|
|
686
388
|
|
|
389
|
+
this._history.push({ input: text.slice(0, 200), output: result.slice(0, 200), steps });
|
|
390
|
+
|
|
687
391
|
return result;
|
|
688
392
|
}
|
|
689
393
|
|
|
690
394
|
/**
|
|
691
|
-
*
|
|
692
|
-
*
|
|
693
|
-
* @param {string} layerName - Name of the layer to run.
|
|
694
|
-
* @param {string} text - Input text.
|
|
695
|
-
* @returns {{ text: string, applied: boolean }}
|
|
395
|
+
* Get normalization history.
|
|
396
|
+
* @returns {Array<{ input: string, output: string, steps: string[] }>}
|
|
696
397
|
*/
|
|
697
|
-
|
|
698
|
-
|
|
699
|
-
if (!layer) {
|
|
700
|
-
throw new Error(`[Agent Shield] normalizer: unknown layer "${layerName}"`);
|
|
701
|
-
}
|
|
702
|
-
return layer.fn(text);
|
|
398
|
+
getHistory() {
|
|
399
|
+
return [...this._history];
|
|
703
400
|
}
|
|
704
401
|
|
|
705
402
|
/**
|
|
706
|
-
*
|
|
707
|
-
*
|
|
708
|
-
* @returns {string[]}
|
|
403
|
+
* Clear normalization history.
|
|
709
404
|
*/
|
|
710
|
-
|
|
711
|
-
|
|
405
|
+
clearHistory() {
|
|
406
|
+
this._history = [];
|
|
712
407
|
}
|
|
713
408
|
}
|
|
714
409
|
|
|
@@ -718,17 +413,14 @@ class TextNormalizer {
|
|
|
718
413
|
|
|
719
414
|
module.exports = {
|
|
720
415
|
TextNormalizer,
|
|
721
|
-
|
|
722
|
-
|
|
416
|
+
normalizeAll,
|
|
417
|
+
stripZeroWidth,
|
|
418
|
+
reverseLeetspeak,
|
|
419
|
+
collapseCharSpacing,
|
|
420
|
+
stripContextWrappers,
|
|
421
|
+
decodeUnicodeEscapes,
|
|
422
|
+
ZERO_WIDTH_RE,
|
|
723
423
|
LEET_MAP,
|
|
724
|
-
|
|
725
|
-
|
|
726
|
-
unicodeCanon,
|
|
727
|
-
homoglyphDecode,
|
|
728
|
-
encodingDecode,
|
|
729
|
-
whitespaceNorm,
|
|
730
|
-
caseFold,
|
|
731
|
-
leetDecode,
|
|
732
|
-
markdownStrip,
|
|
733
|
-
repetitionCollapse,
|
|
424
|
+
CONTEXT_WRAPPERS,
|
|
425
|
+
HTML_ENTITIES
|
|
734
426
|
};
|