agentshield-sdk 11.0.0 → 12.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/src/normalizer.js CHANGED
@@ -1,12 +1,12 @@
1
1
  'use strict';
2
2
 
3
3
  /**
4
- * Agent Shield — Text Normalization Pipeline
4
+ * Agent Shield — Advanced Text Normalizer (v12.0)
5
5
  *
6
- * Pre-processing pipeline that runs BEFORE regex pattern matching to defeat
7
- * evasion techniques. Each layer strips a class of obfuscation, and the
8
- * pipeline returns both the original and normalized text so patterns can be
9
- * matched against both.
6
+ * Consolidates all text normalization logic into a standalone module.
7
+ * Handles zero-width character stripping, leetspeak reversal, spaced-out
8
+ * character collapsing, context wrapper removal, Unicode escape decoding,
9
+ * and HTML entity decoding.
10
10
  *
11
11
  * All processing runs locally — no data ever leaves your environment.
12
12
  *
@@ -14,701 +14,396 @@
14
14
  */
15
15
 
16
16
  // =========================================================================
17
- // ZERO-WIDTH / INVISIBLE CHARACTER SET
17
+ // CONSTANTS
18
18
  // =========================================================================
19
19
 
20
20
  /**
21
- * Characters that are invisible or have zero display width.
22
- * These are commonly inserted between letters to break pattern matching.
21
+ * Zero-width and invisible Unicode characters to strip.
23
22
  * @type {RegExp}
24
23
  */
25
- const ZERO_WIDTH_RE = /[\u200B\u200C\u200D\uFEFF\u00AD\u034F\u061C\u115F\u1160\u17B4\u17B5\u180E\u2060\u2061\u2062\u2063\u2064\u200E\u200F\u202A-\u202E\u2066-\u2069\uFFF9-\uFFFB\uFE00-\uFE0F]/g;
24
+ const ZERO_WIDTH_RE = /[\u200B\u200C\u200D\u200E\u200F\uFEFF\u00AD\u2060\u2061\u2062\u2063\u2064\u180E\u034F]/g;
26
25
 
27
26
  /**
28
- * Tag characters (U+E0001–U+E007F) and variation selectors supplement
29
- * (U+E0100–U+E01EF) live in the SMP and require surrogate pair matching.
30
- * Used in evasion attacks to insert invisible data between visible chars.
31
- * @type {RegExp}
32
- */
33
- const TAG_CHARS_RE = /\uDB40[\uDC01-\uDC7F\uDD00-\uDDEF]/g;
34
-
35
- /**
36
- * Combining diacritical marks used for obfuscation (U+0300–U+036F).
37
- * @type {RegExp}
38
- */
39
- const COMBINING_MARKS_RE = /[\u0300-\u036F]/g;
40
-
41
- // =========================================================================
42
- // HOMOGLYPH MAP (200+ mappings)
43
- // =========================================================================
44
-
45
- /**
46
- * Comprehensive mapping of Unicode lookalikes to ASCII equivalents.
47
- * Covers Cyrillic, Greek, Cherokee, Georgian, Mathematical, Fullwidth,
48
- * Enclosed/Circled, Small Caps, IPA, Armenian, superscript/subscript,
49
- * and common Latin Extended characters.
50
- *
27
+ * Leetspeak substitution map (character → ASCII letter).
51
28
  * @type {Object<string, string>}
52
29
  */
53
- const HOMOGLYPH_MAP = {
54
- // --- Cyrillic look-alikes ---
55
- '\u0410': 'A', '\u0430': 'a', '\u0412': 'B', '\u0432': 'v',
56
- '\u0435': 'e', '\u0415': 'E', '\u041A': 'K', '\u043A': 'k',
57
- '\u041C': 'M', '\u043C': 'm', '\u041D': 'H', '\u043E': 'o',
58
- '\u041E': 'O', '\u0440': 'p', '\u0420': 'P', '\u0441': 'c',
59
- '\u0421': 'C', '\u0422': 'T', '\u0442': 't', '\u0443': 'y',
60
- '\u0423': 'Y', '\u0445': 'x', '\u0425': 'X', '\u0456': 'i',
61
- '\u0406': 'I', '\u0458': 'j', '\u0455': 's', '\u0405': 'S',
62
- '\u0459': 'lj', '\u0452': 'd', '\u0460': 'O', '\u0461': 'o',
63
- '\u0472': 'F', '\u0473': 'f',
64
- '\u0433': 'r', '\u0457': 'i', '\u0491': 'r',
65
- '\u04BB': 'h', '\u0501': 'd', '\u051B': 'q', '\u051D': 'w',
66
-
67
- // --- Greek look-alikes ---
68
- '\u0391': 'A', '\u0392': 'B', '\u0395': 'E', '\u0396': 'Z',
69
- '\u0397': 'H', '\u0399': 'I', '\u039A': 'K', '\u039C': 'M',
70
- '\u039D': 'N', '\u039F': 'O', '\u03A1': 'P', '\u03A4': 'T',
71
- '\u03A5': 'Y', '\u03A7': 'X', '\u03BF': 'o', '\u03B1': 'a',
72
- '\u03B5': 'e', '\u03B9': 'i', '\u03BA': 'k', '\u03BD': 'v',
73
- '\u03C1': 'p', '\u03C4': 't', '\u03C5': 'u', '\u03C7': 'x',
74
- '\u03C9': 'w', '\u03B7': 'n',
75
-
76
- // --- Armenian look-alikes ---
77
- '\u0555': 'O', '\u0585': 'o', '\u0578': 'n', '\u057C': 'n',
78
- '\u0570': 'h', '\u0561': 'a', '\u0575': 'u', '\u0572': 'q',
79
- '\u0565': 'e', '\u056B': 'i', '\u0574': 'm', '\u057D': 's',
80
-
81
- // --- Cherokee look-alikes ---
82
- '\u13A0': 'D', '\u13A1': 'R', '\u13A2': 'T', '\u13A9': 'Y',
83
- '\u13AA': 'A', '\u13AB': 'J', '\u13AC': 'S', '\u13B3': 'W',
84
- '\u13B7': 'M', '\u13BB': 'H', '\u13C0': 'G', '\u13C2': 'h',
85
- '\u13C3': 'Z', '\u13CF': 'b', '\u13D2': 'R', '\u13DA': 'V',
86
- '\u13DE': 'L', '\u13DF': 'C', '\u13E2': 'P', '\u13E6': 'K',
87
-
88
- // --- Georgian look-alikes ---
89
- '\u10D0': 'a', '\u10D5': 'b', '\u10D3': 'd', '\u10DA': 'l',
90
- '\u10DD': 'o', '\u10DE': 'p', '\u10E1': 's', '\u10E2': 't',
91
- '\u10E3': 'u', '\u10EF': 'j',
92
-
93
- // --- Latin Extended (accented → base) ---
94
- // A variants
95
- '\u00C0': 'A', '\u00C1': 'A', '\u00C2': 'A', '\u00C3': 'A',
96
- '\u00C4': 'A', '\u00C5': 'A', '\u00E0': 'a', '\u00E1': 'a',
97
- '\u00E2': 'a', '\u00E3': 'a', '\u00E4': 'a', '\u00E5': 'a',
98
- '\u0100': 'A', '\u0101': 'a', '\u0102': 'A', '\u0103': 'a',
99
- '\u0104': 'A', '\u0105': 'a',
100
- // E variants
101
- '\u00C8': 'E', '\u00C9': 'E', '\u00CA': 'E', '\u00CB': 'E',
102
- '\u00E8': 'e', '\u00E9': 'e', '\u00EA': 'e', '\u00EB': 'e',
103
- '\u0112': 'E', '\u0113': 'e', '\u0114': 'E', '\u0115': 'e',
104
- '\u0116': 'E', '\u0117': 'e', '\u0118': 'E', '\u0119': 'e',
105
- // I variants
106
- '\u00CC': 'I', '\u00CD': 'I', '\u00CE': 'I', '\u00CF': 'I',
107
- '\u00EC': 'i', '\u00ED': 'i', '\u00EE': 'i', '\u00EF': 'i',
108
- '\u012A': 'I', '\u012B': 'i', '\u012C': 'I', '\u012D': 'i',
109
- '\u012E': 'I', '\u012F': 'i', '\u0130': 'I', '\u0131': 'i',
110
- // O variants
111
- '\u00D2': 'O', '\u00D3': 'O', '\u00D4': 'O', '\u00D5': 'O',
112
- '\u00D6': 'O', '\u00D8': 'O', '\u00F2': 'o', '\u00F3': 'o',
113
- '\u00F4': 'o', '\u00F5': 'o', '\u00F6': 'o', '\u00F8': 'o',
114
- '\u014C': 'O', '\u014D': 'o', '\u014E': 'O', '\u014F': 'o',
115
- '\u0150': 'O', '\u0151': 'o',
116
- // U variants
117
- '\u00D9': 'U', '\u00DA': 'U', '\u00DB': 'U', '\u00DC': 'U',
118
- '\u00F9': 'u', '\u00FA': 'u', '\u00FB': 'u', '\u00FC': 'u',
119
- '\u016A': 'U', '\u016B': 'u', '\u016C': 'U', '\u016D': 'u',
120
- '\u016E': 'U', '\u016F': 'u', '\u0170': 'U', '\u0171': 'u',
121
- // Other Latin Extended
122
- '\u00C7': 'C', '\u00E7': 'c', '\u00D1': 'N', '\u00F1': 'n',
123
- '\u00DD': 'Y', '\u00FD': 'y', '\u00FF': 'y',
124
- '\u0144': 'n', '\u0146': 'n', '\u0148': 'n',
125
- '\u015A': 'S', '\u015B': 's', '\u015C': 'S', '\u015D': 's',
126
- '\u015E': 'S', '\u015F': 's', '\u0160': 'S', '\u0161': 's',
127
- '\u010C': 'C', '\u010D': 'c', '\u010E': 'D', '\u010F': 'd',
128
- '\u0158': 'R', '\u0159': 'r', '\u0164': 'T', '\u0165': 't',
129
- '\u017D': 'Z', '\u017E': 'z', '\u017B': 'Z', '\u017C': 'z',
130
- '\u017A': 'z', '\u0179': 'Z',
131
- '\u0141': 'L', '\u0142': 'l', '\u0110': 'D', '\u0111': 'd',
132
-
133
- // --- Fullwidth (lowercase) ---
134
- '\uFF41': 'a', '\uFF42': 'b', '\uFF43': 'c', '\uFF44': 'd',
135
- '\uFF45': 'e', '\uFF46': 'f', '\uFF47': 'g', '\uFF48': 'h',
136
- '\uFF49': 'i', '\uFF4A': 'j', '\uFF4B': 'k', '\uFF4C': 'l',
137
- '\uFF4D': 'm', '\uFF4E': 'n', '\uFF4F': 'o', '\uFF50': 'p',
138
- '\uFF51': 'q', '\uFF52': 'r', '\uFF53': 's', '\uFF54': 't',
139
- '\uFF55': 'u', '\uFF56': 'v', '\uFF57': 'w', '\uFF58': 'x',
140
- '\uFF59': 'y', '\uFF5A': 'z',
141
- // --- Fullwidth (uppercase) ---
142
- '\uFF21': 'A', '\uFF22': 'B', '\uFF23': 'C', '\uFF24': 'D',
143
- '\uFF25': 'E', '\uFF26': 'F', '\uFF27': 'G', '\uFF28': 'H',
144
- '\uFF29': 'I', '\uFF2A': 'J', '\uFF2B': 'K', '\uFF2C': 'L',
145
- '\uFF2D': 'M', '\uFF2E': 'N', '\uFF2F': 'O', '\uFF30': 'P',
146
- '\uFF31': 'Q', '\uFF32': 'R', '\uFF33': 'S', '\uFF34': 'T',
147
- '\uFF35': 'U', '\uFF36': 'V', '\uFF37': 'W', '\uFF38': 'X',
148
- '\uFF39': 'Y', '\uFF3A': 'Z',
149
- // --- Fullwidth digits ---
150
- '\uFF10': '0', '\uFF11': '1', '\uFF12': '2', '\uFF13': '3',
151
- '\uFF14': '4', '\uFF15': '5', '\uFF16': '6', '\uFF17': '7',
152
- '\uFF18': '8', '\uFF19': '9',
153
-
154
- // --- Enclosed/Circled letters ---
155
- '\u24B6': 'A', '\u24B7': 'B', '\u24B8': 'C', '\u24B9': 'D',
156
- '\u24BA': 'E', '\u24BB': 'F', '\u24BC': 'G', '\u24BD': 'H',
157
- '\u24BE': 'I', '\u24BF': 'J', '\u24C0': 'K', '\u24C1': 'L',
158
- '\u24C2': 'M', '\u24C3': 'N', '\u24C4': 'O', '\u24C5': 'P',
159
- '\u24C6': 'Q', '\u24C7': 'R', '\u24C8': 'S', '\u24C9': 'T',
160
- '\u24CA': 'U', '\u24CB': 'V', '\u24CC': 'W', '\u24CD': 'X',
161
- '\u24CE': 'Y', '\u24CF': 'Z',
162
- '\u24D0': 'a', '\u24D1': 'b', '\u24D2': 'c', '\u24D3': 'd',
163
- '\u24D4': 'e', '\u24D5': 'f', '\u24D6': 'g', '\u24D7': 'h',
164
- '\u24D8': 'i', '\u24D9': 'j', '\u24DA': 'k', '\u24DB': 'l',
165
- '\u24DC': 'm', '\u24DD': 'n', '\u24DE': 'o', '\u24DF': 'p',
166
- '\u24E0': 'q', '\u24E1': 'r', '\u24E2': 's', '\u24E3': 't',
167
- '\u24E4': 'u', '\u24E5': 'v', '\u24E6': 'w', '\u24E7': 'x',
168
- '\u24E8': 'y', '\u24E9': 'z',
169
-
170
- // --- Small Caps (Unicode phonetic) ---
171
- '\u1D00': 'A', '\u0299': 'B', '\u1D04': 'C', '\u1D05': 'D',
172
- '\u1D07': 'E', '\u0262': 'G', '\u029C': 'H', '\u026A': 'I',
173
- '\u1D0A': 'J', '\u1D0B': 'K', '\u029F': 'L', '\u1D0D': 'M',
174
- '\u0274': 'N', '\u1D0F': 'O', '\u1D18': 'P', '\u0280': 'R',
175
- '\u1D1B': 'T', '\u1D1C': 'U', '\u1D20': 'V', '\u1D21': 'W',
176
-
177
- // --- IPA / Phonetic extensions ---
178
- '\u0250': 'a', '\u0253': 'b', '\u0254': 'c', '\u0256': 'd',
179
- '\u025B': 'e', '\u025F': 'f', '\u0260': 'g', '\u0266': 'h',
180
- '\u0268': 'i', '\u026D': 'l', '\u0271': 'm', '\u0272': 'n',
181
- '\u0275': 'o', '\u0278': 'p', '\u027E': 'r', '\u0282': 's',
182
- '\u0288': 't', '\u028A': 'u', '\u028B': 'v', '\u0290': 'z',
183
- '\u0237': 'j', '\u0261': 'g',
184
-
185
- // --- Mathematical Alphanumeric Symbols (bold italic) ---
186
- '\uD835\uDC1A': 'a', '\uD835\uDC1B': 'b', '\uD835\uDC1C': 'c',
187
- '\uD835\uDC1D': 'd', '\uD835\uDC1E': 'e', '\uD835\uDC1F': 'f',
188
- '\uD835\uDC20': 'g', '\uD835\uDC21': 'h', '\uD835\uDC22': 'i',
189
- '\uD835\uDC23': 'j', '\uD835\uDC24': 'k', '\uD835\uDC25': 'l',
190
- '\uD835\uDC26': 'm', '\uD835\uDC27': 'n', '\uD835\uDC28': 'o',
191
- '\uD835\uDC29': 'p', '\uD835\uDC2A': 'q', '\uD835\uDC2B': 'r',
192
- '\uD835\uDC2C': 's', '\uD835\uDC2D': 't', '\uD835\uDC2E': 'u',
193
- '\uD835\uDC2F': 'v', '\uD835\uDC30': 'w', '\uD835\uDC31': 'x',
194
- '\uD835\uDC32': 'y', '\uD835\uDC33': 'z',
195
-
196
- // --- Superscript / subscript ---
197
- '\u00B2': '2', '\u00B3': '3', '\u00B9': '1', '\u2070': '0',
198
- '\u2071': 'i', '\u2074': '4', '\u2075': '5', '\u2076': '6',
199
- '\u2077': '7', '\u2078': '8', '\u2079': '9', '\u207A': '+',
200
- '\u207B': '-', '\u207F': 'n',
201
- '\u2080': '0', '\u2081': '1', '\u2082': '2', '\u2083': '3',
202
- '\u2084': '4', '\u2090': 'a', '\u2091': 'e', '\u2092': 'o',
203
- '\u2093': 'x',
204
-
205
- // --- Modifier letters (superscript-like) ---
206
- '\u02B0': 'h', '\u02B1': 'h', '\u02B2': 'j', '\u02B3': 'r',
207
- '\u02B4': 'r', '\u02B7': 'w', '\u02B8': 'y', '\u02E0': 'g',
208
- '\u02E1': 'l', '\u02E2': 's', '\u02E3': 'x', '\u1D43': 'a',
209
- '\u1D47': 'b', '\u1D48': 'd', '\u1D49': 'e', '\u1D4D': 'g',
210
- '\u1D4F': 'k', '\u1D50': 'm', '\u1D52': 'o', '\u1D56': 'p',
211
- '\u1D57': 't', '\u1D58': 'u', '\u1D5B': 'v',
30
+ const LEET_MAP = {
31
+ '0': 'o', '1': 'i', '3': 'e', '4': 'a', '5': 's',
32
+ '7': 't', '8': 'b', '9': 'g', '@': 'a', '!': 'i',
33
+ '$': 's', '+': 't', '(': 'c', '|': 'l',
34
+ '}{': 'h', '}{': 'h', '/\\': 'a', '\\/': 'v',
35
+ '|3': 'b', '|)': 'd', '|<': 'k', '|_': 'l',
36
+ '|-|': 'h', '|\\|': 'n', '|2': 'r',
37
+ // Common Unicode lookalikes
38
+ '\u0430': 'a', '\u0435': 'e', '\u043E': 'o', '\u0440': 'p',
39
+ '\u0441': 'c', '\u0443': 'y', '\u0445': 'x',
40
+ '\u0410': 'A', '\u0415': 'E', '\u041E': 'O', '\u0420': 'P',
41
+ '\u0421': 'C', '\u0423': 'Y', '\u0425': 'X'
212
42
  };
213
43
 
214
- // =========================================================================
215
- // UNICODE WHITESPACE SET
216
- // =========================================================================
217
-
218
44
  /**
219
- * Unicode whitespace characters beyond standard space/tab/newline.
220
- * @type {RegExp}
45
+ * Multi-character leet sequences sorted by length (longest first for greedy matching).
46
+ * @type {Array<[string, string]>}
221
47
  */
222
- const UNICODE_WHITESPACE_RE = /[\u00A0\u1680\u2000-\u200A\u2028\u2029\u202F\u205F\u3000]/g;
223
-
224
- // =========================================================================
225
- // LEET SPEAK MAP
226
- // =========================================================================
227
-
228
- /**
229
- * Common leet speak substitutions (number/symbol → letter).
230
- * @type {Object<string, string>}
231
- */
232
- const LEET_MAP = {
233
- '0': 'o', '1': 'i', '2': 'z', '3': 'e', '4': 'a',
234
- '5': 's', '6': 'g', '7': 't', '8': 'b', '9': 'g',
235
- '@': 'a', '$': 's', '!': 'i', '|': 'l', '+': 't',
236
- '(': 'c', '{': 'c', '<': 'c', '#': 'h', '^': 'a',
237
- };
48
+ const MULTI_LEET = [
49
+ ['|\\|', 'n'], ['|-|', 'h'], ['/\\', 'a'], ['\\/', 'v'],
50
+ ['}{', 'h'], ['|3', 'b'], ['|)', 'd'], ['|<', 'k'],
51
+ ['|_', 'l'], ['|2', 'r']
52
+ ];
238
53
 
239
54
  /**
240
- * Extended leet map: multi-char patterns decoded first.
241
- * @type {Array<[RegExp, string]>}
55
+ * Context wrapper phrases that attackers prepend to bypass filters.
56
+ * @type {RegExp[]}
242
57
  */
243
- const LEET_MULTI = [
244
- [/\/\\\/\\/g, 'm'],
245
- [/\|-\|/g, 'h'],
246
- [/\|\)/g, 'd'],
247
- [/\|3/g, 'b'],
248
- [/\|_\|/g, 'u'],
249
- [/\|_/g, 'l'],
250
- [/\/\\/g, 'v'],
251
- [/\|\//g, 'v'],
252
- [/ph/gi, 'f'],
58
+ const CONTEXT_WRAPPERS = [
59
+ /^for\s+(?:research|educational|testing|academic|safety)\s*(?:purposes?\s*)?[:\-,]\s*/im,
60
+ /^(?:hypothetically|theoretically|in\s+theory)\s*[,:\-]\s*/im,
61
+ /^(?:imagine|pretend|suppose|assume)\s+(?:that\s+)?(?:you\s+(?:are|were)\s+)?/im,
62
+ /^(?:as\s+a\s+(?:thought\s+)?experiment)\s*[,:\-]\s*/im,
63
+ /^(?:just\s+)?(?:out\s+of\s+)?(?:curiosity|interest)\s*[,:\-]\s*/im,
64
+ /^(?:in\s+a\s+(?:fictional|hypothetical)\s+(?:scenario|world|context))\s*[,:\-]\s*/im,
65
+ /^(?:please\s+)?(?:help\s+me\s+)?(?:understand|explain)\s+(?:how\s+(?:to\s+)?)?/im,
66
+ /^(?:i'?m\s+(?:a\s+)?(?:security\s+)?researcher)\s*[,:\-]\s*/im
253
67
  ];
254
68
 
255
- // =========================================================================
256
- // MARKDOWN PATTERN
257
- // =========================================================================
258
-
259
69
  /**
260
- * Regex to strip markdown formatting markers.
261
- * Removes bold, italic, strikethrough, code, and heading markers.
262
- * @type {RegExp}
70
+ * Named HTML entities map (common subset).
71
+ * @type {Object<string, string>}
263
72
  */
264
- const MARKDOWN_RE = /(\*{1,3}|_{1,3}|~{2}|`{1,3}|#{1,6}\s)/g;
73
+ const HTML_ENTITIES = {
74
+ 'amp': '&', 'lt': '<', 'gt': '>', 'quot': '"', 'apos': "'",
75
+ 'nbsp': ' ', 'tab': '\t', 'newline': '\n',
76
+ 'lpar': '(', 'rpar': ')', 'lsqb': '[', 'rsqb': ']',
77
+ 'lcub': '{', 'rcub': '}', 'sol': '/', 'bsol': '\\',
78
+ 'comma': ',', 'period': '.', 'colon': ':', 'semi': ';',
79
+ 'excl': '!', 'quest': '?', 'num': '#', 'ast': '*',
80
+ 'plus': '+', 'equals': '=', 'hyphen': '-', 'lowbar': '_',
81
+ 'percnt': '%', 'dollar': '$', 'commat': '@', 'circ': '^',
82
+ 'tilde': '~', 'grave': '`', 'vert': '|'
83
+ };
265
84
 
266
85
  // =========================================================================
267
- // BASE64 SEGMENT DETECTION
86
+ // NORMALIZER FUNCTIONS
268
87
  // =========================================================================
269
88
 
270
89
  /**
271
- * Matches potential base64-encoded segments embedded in text.
272
- * Requires at least 20 chars to reduce false positives.
273
- * @type {RegExp}
90
+ * Remove zero-width and invisible Unicode characters.
91
+ * @param {string} text
92
+ * @returns {string}
274
93
  */
275
- const BASE64_SEGMENT_RE = /(?:^|\s)([A-Za-z0-9+/]{20,}={0,2})(?:\s|$)/g;
276
-
277
- // =========================================================================
278
- // NORMALIZATION LAYERS
279
- // =========================================================================
94
+ function stripZeroWidth(text) {
95
+ if (!text || typeof text !== 'string') return text || '';
96
+ return text.replace(ZERO_WIDTH_RE, '');
97
+ }
280
98
 
281
99
  /**
282
- * Layer 1: Unicode Canonicalization
283
- * Applies NFKC normalization, strips zero-width chars and combining marks.
284
- *
100
+ * Convert leetspeak substitutions back to standard ASCII letters.
101
+ * Handles multi-character sequences first, then single-character replacements.
285
102
  * @param {string} text
286
- * @returns {{ text: string, applied: boolean }}
103
+ * @returns {string}
287
104
  */
288
- function unicodeCanon(text) {
105
+ function reverseLeetspeak(text) {
106
+ if (!text || typeof text !== 'string') return text || '';
107
+
289
108
  let result = text;
290
109
 
291
- // First, apply NFKD to decompose everything (including precomposed chars)
292
- // so that combining marks become separate characters we can strip.
293
- if (typeof result.normalize === 'function') {
294
- result = result.normalize('NFKD');
110
+ // Multi-character sequences first (longest match wins)
111
+ for (const [leet, replacement] of MULTI_LEET) {
112
+ // Escape special regex characters in the leet string
113
+ const escaped = leet.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
114
+ result = result.replace(new RegExp(escaped, 'g'), replacement);
295
115
  }
296
116
 
297
- // Strip zero-width / invisible characters (BMP)
298
- result = result.replace(ZERO_WIDTH_RE, '');
299
-
300
- // Strip tag characters and variation selectors supplement (SMP, surrogate pairs)
301
- result = result.replace(TAG_CHARS_RE, '');
302
-
303
- // Strip combining diacritical marks (now separated by NFKD decomposition)
304
- result = result.replace(COMBINING_MARKS_RE, '');
305
-
306
- // Re-compose with NFC to get clean canonical form
307
- if (typeof result.normalize === 'function') {
308
- result = result.normalize('NFC');
117
+ // Single-character replacements
118
+ let out = '';
119
+ for (let i = 0; i < result.length; i++) {
120
+ const ch = result[i];
121
+ out += LEET_MAP[ch] !== undefined ? LEET_MAP[ch] : ch;
309
122
  }
310
123
 
311
- return { text: result, applied: result !== text };
124
+ return out;
312
125
  }
313
126
 
314
127
  /**
315
- * Layer 2: Homoglyph Mapping
316
- * Replaces Unicode lookalikes with ASCII equivalents.
317
- *
128
+ * Collapse spaced-out character obfuscation (e.g. "i g n o r e" → "ignore").
129
+ * Only collapses a line that consists entirely of single letters (at least four) separated by 1–3 whitespace characters.
318
130
  * @param {string} text
319
- * @returns {{ text: string, applied: boolean }}
131
+ * @returns {string}
320
132
  */
321
- function homoglyphDecode(text) {
322
- // Fast path: skip if text is pure ASCII (no homoglyphs possible)
323
- if (!/[^\x00-\x7F]/.test(text)) {
324
- return { text, applied: false };
325
- }
326
- let changed = false;
327
- let result = '';
328
- for (let i = 0; i < text.length; i++) {
329
- const ch = text[i];
330
- // Check for surrogate pairs (mathematical symbols, etc.)
331
- if (i + 1 < text.length && ch >= '\uD800' && ch <= '\uDBFF') {
332
- const pair = ch + text[i + 1];
333
- if (HOMOGLYPH_MAP[pair] !== undefined) {
334
- result += HOMOGLYPH_MAP[pair];
335
- changed = true;
336
- i++; // skip low surrogate
337
- continue;
338
- }
339
- }
340
- if (HOMOGLYPH_MAP[ch] !== undefined) {
341
- result += HOMOGLYPH_MAP[ch];
342
- changed = true;
133
+ function collapseCharSpacing(text) {
134
+ if (!text || typeof text !== 'string') return text || '';
135
+
136
+ // Process line by line to preserve structure
137
+ const lines = text.split('\n');
138
+ const result = [];
139
+
140
+ for (const line of lines) {
141
+ // Match pattern: single chars separated by uniform whitespace
142
+ // e.g., "i g n o r e" or "i  g  n  o  r  e"
143
+ const spacedPattern = /^(\s*)([a-zA-Z])((\s{1,3})[a-zA-Z]){3,}(\s*)$/;
144
+ if (spacedPattern.test(line.trim())) {
145
+ // Extract only the letter characters
146
+ const collapsed = line.trim().replace(/\s+/g, '');
147
+ const leadingSpace = line.match(/^(\s*)/)[1];
148
+ result.push(leadingSpace + collapsed);
343
149
  } else {
344
- result += ch;
150
+ result.push(line);
345
151
  }
346
152
  }
347
- return { text: result, applied: changed };
348
- }
349
153
 
350
- /**
351
- * Layer 3: Encoding Decode
352
- * Detects and decodes base64, hex escapes, URL encoding, HTML entities,
353
- * and Unicode escapes within the text.
354
- *
355
- * @param {string} text
356
- * @returns {{ text: string, applied: boolean }}
357
- */
358
- function encodingDecode(text) {
359
- let result = text;
360
- let changed = false;
361
-
362
- // Decode Unicode escapes: \u0041 → A
363
- if (/\\u[0-9a-fA-F]{4}/.test(result)) {
364
- const decoded = result.replace(/\\u([0-9a-fA-F]{4})/g, (_, hex) => {
365
- return String.fromCharCode(parseInt(hex, 16));
366
- });
367
- if (decoded !== result) { result = decoded; changed = true; }
368
- }
369
-
370
- // Decode hex escapes: \x41 → A
371
- if (/\\x[0-9a-fA-F]{2}/.test(result)) {
372
- const decoded = result.replace(/\\x([0-9a-fA-F]{2})/g, (_, hex) => {
373
- return String.fromCharCode(parseInt(hex, 16));
374
- });
375
- if (decoded !== result) { result = decoded; changed = true; }
376
- }
377
-
378
- // Decode URL encoding: %41 → A
379
- if (/%[0-9a-fA-F]{2}/.test(result)) {
380
- try {
381
- const decoded = decodeURIComponent(result);
382
- if (decoded !== result) { result = decoded; changed = true; }
383
- } catch (e) {
384
- // Partial URL encoding — decode individual sequences
385
- const decoded = result.replace(/%([0-9a-fA-F]{2})/g, (_, hex) => {
386
- return String.fromCharCode(parseInt(hex, 16));
387
- });
388
- if (decoded !== result) { result = decoded; changed = true; }
389
- }
390
- }
391
-
392
- // Decode HTML entities: &#65; &#x41; &amp; etc.
393
- if (/&(?:#\d+|#x[0-9a-fA-F]+|[a-zA-Z]+);/.test(result)) {
394
- const decoded = result
395
- .replace(/&#(\d+);/g, (_, code) => String.fromCharCode(parseInt(code, 10)))
396
- .replace(/&#x([0-9a-fA-F]+);/g, (_, code) => String.fromCharCode(parseInt(code, 16)))
397
- .replace(/&amp;/g, '&')
398
- .replace(/&lt;/g, '<')
399
- .replace(/&gt;/g, '>')
400
- .replace(/&quot;/g, '"')
401
- .replace(/&apos;/g, "'")
402
- .replace(/&nbsp;/g, ' ');
403
- if (decoded !== result) { result = decoded; changed = true; }
404
- }
405
-
406
- // Decode base64 segments embedded in text
407
- // Only decode if the decoded content looks like printable text
408
- const b64Matches = [];
409
- let m;
410
- const b64Re = /(?:^|\s)([A-Za-z0-9+/]{20,}={0,2})(?:\s|$)/g;
411
- while ((m = b64Re.exec(result)) !== null) {
412
- b64Matches.push({ match: m[1], index: m.index + (m[0].length - m[1].length - (m[0].endsWith(' ') ? 1 : 0)) });
413
- }
414
- for (let i = b64Matches.length - 1; i >= 0; i--) {
415
- const seg = b64Matches[i];
416
- try {
417
- let decoded;
418
- if (typeof Buffer !== 'undefined') {
419
- decoded = Buffer.from(seg.match, 'base64').toString('utf-8');
420
- } else if (typeof atob !== 'undefined') {
421
- decoded = atob(seg.match);
422
- }
423
- if (decoded) {
424
- const printable = decoded.split('').filter(c => {
425
- const code = c.charCodeAt(0);
426
- return code >= 32 && code <= 126;
427
- }).length;
428
- if (printable / decoded.length > 0.8 && decoded.length >= 4) {
429
- // Replace the base64 segment with decoded text
430
- result = result.substring(0, seg.index) + decoded + result.substring(seg.index + seg.match.length);
431
- changed = true;
432
- }
433
- }
434
- } catch (e) {
435
- // Not valid base64
436
- }
437
- }
438
-
439
- return { text: result, applied: changed };
154
+ return result.join('\n');
440
155
  }
441
156
 
442
157
  /**
443
- * Layer 4: Whitespace Normalization
444
- * Collapses multiple spaces/tabs/newlines, strips Unicode whitespace variants.
445
- *
158
+ * Remove common context wrapper phrases used to disguise malicious prompts.
446
159
  * @param {string} text
447
- * @returns {{ text: string, applied: boolean }}
160
+ * @returns {string}
448
161
  */
449
- function whitespaceNorm(text) {
450
- let result = text;
451
-
452
- // Replace Unicode whitespace with standard space
453
- result = result.replace(UNICODE_WHITESPACE_RE, ' ');
454
-
455
- // Collapse multiple whitespace characters to single space
456
- result = result.replace(/[ \t]+/g, ' ');
162
+ function stripContextWrappers(text) {
163
+ if (!text || typeof text !== 'string') return text || '';
457
164
 
458
- // Collapse multiple newlines to single newline
459
- result = result.replace(/\n{3,}/g, '\n\n');
460
-
461
- // Trim leading/trailing whitespace on each line
462
- result = result.replace(/^[ \t]+|[ \t]+$/gm, '');
165
+ let result = text;
166
+ for (const pattern of CONTEXT_WRAPPERS) {
167
+ result = result.replace(pattern, '');
168
+ }
463
169
 
464
- return { text: result, applied: result !== text };
170
+ return result;
465
171
  }
466
172
 
467
173
  /**
468
- * Layer 5: Case Folding
469
- * Converts text to lowercase for comparison.
470
- *
174
+ * Decode percent-encoded (%XX), Unicode escape (\uXXXX), hex escape (\xXX),
175
+ * numeric HTML entities (&#DDD; / &#xHH;), and named HTML entities (&name;).
471
176
  * @param {string} text
472
- * @returns {{ text: string, applied: boolean }}
177
+ * @returns {string}
473
178
  */
474
- function caseFold(text) {
475
- const result = text.toLowerCase();
476
- return { text: result, applied: result !== text };
477
- }
179
+ function decodeUnicodeEscapes(text) {
180
+ if (!text || typeof text !== 'string') return text || '';
478
181
 
479
- /**
480
- * Layer 6: Leet Speak Decode
481
- * Maps common number/symbol substitutions back to letters.
482
- *
483
- * @param {string} text
484
- * @returns {{ text: string, applied: boolean }}
485
- */
486
- function leetDecode(text) {
487
182
  let result = text;
488
183
 
489
- // Apply multi-character patterns first
490
- for (const [pattern, replacement] of LEET_MULTI) {
491
- result = result.replace(pattern, replacement);
492
- }
493
-
494
- // Apply single-character mappings. A leet char is decoded only if it is
495
- // part of a run that touches at least one actual letter (not just a cluster
496
- // of numbers like "2024"). We use flood-fill: mark leet positions, then
497
- // propagate "reachable from a letter" through adjacent leet positions.
498
- const chars = result.split('');
499
- const isLeet = new Array(chars.length).fill(false);
500
- const isLetter = new Array(chars.length).fill(false);
501
-
502
- for (let i = 0; i < chars.length; i++) {
503
- if (LEET_MAP[chars[i]] !== undefined) isLeet[i] = true;
504
- if (/[a-zA-Z]/.test(chars[i])) isLetter[i] = true;
505
- }
506
-
507
- // Mark which leet positions can reach a letter through adjacent leet/letter chain
508
- const reachable = new Array(chars.length).fill(false);
509
- for (let i = 0; i < chars.length; i++) {
510
- if (isLeet[i]) {
511
- // Check left neighbor
512
- if (i > 0 && isLetter[i - 1]) { reachable[i] = true; continue; }
513
- // Check right neighbor
514
- if (i < chars.length - 1 && isLetter[i + 1]) { reachable[i] = true; continue; }
184
+ // Decode \\uXXXX sequences
185
+ result = result.replace(/\\u([0-9a-fA-F]{4})/g, (_, hex) => {
186
+ try {
187
+ return String.fromCharCode(parseInt(hex, 16));
188
+ } catch (_e) {
189
+ return _;
515
190
  }
516
- }
191
+ });
517
192
 
518
- // Propagate: if a leet char is reachable and its neighbor is leet, that neighbor is reachable too
519
- let changed = true;
520
- while (changed) {
521
- changed = false;
522
- for (let i = 0; i < chars.length; i++) {
523
- if (isLeet[i] && !reachable[i]) {
524
- if ((i > 0 && reachable[i - 1] && (isLeet[i - 1] || isLetter[i - 1])) ||
525
- (i < chars.length - 1 && reachable[i + 1] && (isLeet[i + 1] || isLetter[i + 1]))) {
526
- reachable[i] = true;
527
- changed = true;
528
- }
529
- }
193
+ // Decode \\xXX sequences
194
+ result = result.replace(/\\x([0-9a-fA-F]{2})/g, (_, hex) => {
195
+ try {
196
+ return String.fromCharCode(parseInt(hex, 16));
197
+ } catch (_e) {
198
+ return _;
530
199
  }
200
+ });
201
+
202
+ // Decode percent-encoded %XX sequences
203
+ try {
204
+ result = decodeURIComponent(result);
205
+ } catch (_e) {
206
+ // If decodeURIComponent fails (malformed), do manual single-byte decode
207
+ result = result.replace(/%([0-9a-fA-F]{2})/g, (_, hex) => {
208
+ try {
209
+ return String.fromCharCode(parseInt(hex, 16));
210
+ } catch (_e2) {
211
+ return _;
212
+ }
213
+ });
531
214
  }
532
215
 
533
- let decoded = '';
534
- for (let i = 0; i < chars.length; i++) {
535
- if (isLeet[i] && reachable[i]) {
536
- decoded += LEET_MAP[chars[i]];
537
- } else {
538
- decoded += chars[i];
216
+ // Decode numeric HTML entities &#DDD; and &#xHH;
217
+ result = result.replace(/&#x([0-9a-fA-F]+);/gi, (_, hex) => {
218
+ try {
219
+ return String.fromCodePoint(parseInt(hex, 16));
220
+ } catch (_e) {
221
+ return _;
539
222
  }
540
- }
541
- result = decoded;
223
+ });
224
+ result = result.replace(/&#(\d+);/g, (_, dec) => {
225
+ try {
226
+ return String.fromCodePoint(parseInt(dec, 10));
227
+ } catch (_e) {
228
+ return _;
229
+ }
230
+ });
231
+
232
+ // Decode named HTML entities &name;
233
+ result = result.replace(/&([a-zA-Z]+);/g, (match, name) => {
234
+ const lower = name.toLowerCase();
235
+ return HTML_ENTITIES[lower] !== undefined ? HTML_ENTITIES[lower] : match;
236
+ });
542
237
 
543
- return { text: result, applied: result !== text };
238
+ return result;
544
239
  }
545
240
 
546
241
  /**
547
- * Layer 7: Markdown/Format Stripping
548
- * Removes markdown bold, italic, code, and heading markers.
549
- *
242
+ * Apply all normalizers in the recommended sequence.
243
+ * Order: zero-width → unicode escapes → leetspeak → char spacing → context wrappers.
550
244
  * @param {string} text
551
- * @returns {{ text: string, applied: boolean }}
245
+ * @returns {string}
552
246
  */
553
- function markdownStrip(text) {
554
- let result = text;
247
+ function normalizeAll(text) {
248
+ if (!text || typeof text !== 'string') return text || '';
555
249
 
556
- // Remove markdown formatting markers
557
- result = result.replace(MARKDOWN_RE, '');
558
-
559
- // Remove bracket insertions: i]g[n]o[r]e → ignore
560
- result = result.replace(/[\[\]{}()]/g, '');
561
-
562
- return { text: result, applied: result !== text };
563
- }
250
+ let result = text;
251
+ result = stripZeroWidth(result);
252
+ result = decodeUnicodeEscapes(result);
253
+ result = reverseLeetspeak(result);
254
+ result = collapseCharSpacing(result);
255
+ result = stripContextWrappers(result);
564
256
 
565
- /**
566
- * Layer 8: Repetition Collapsing
567
- * Collapses 3+ repeated characters to a single character.
568
- * "ignoooooore" → "ignore", "hellllp" → "help"
569
- *
570
- * @param {string} text
571
- * @returns {{ text: string, applied: boolean }}
572
- */
573
- function repetitionCollapse(text) {
574
- // Collapse 3+ consecutive identical chars to 2.
575
- // Using 2 (not 1) preserves legitimate double letters (e.g., "ll" in "all",
576
- // "ss" in "bypass") while still defeating padding attacks like "ignoooore".
577
- const result = text.replace(/(.)\1{2,}/g, '$1$1');
578
- return { text: result, applied: result !== text };
257
+ return result;
579
258
  }
580
259
 
581
260
  // =========================================================================
582
- // PIPELINE
261
+ // TEXT NORMALIZER CLASS
583
262
  // =========================================================================
584
263
 
585
264
  /**
586
- * Ordered list of normalization layers.
587
- * Each layer runs in sequence; order matters.
588
- * @type {Array<{ name: string, fn: Function }>}
265
+ * Text Normalizer class with all normalization methods.
266
+ *
267
+ * @example
268
+ * const { TextNormalizer } = require('./normalizer');
269
+ * const normalizer = new TextNormalizer();
270
+ * const clean = normalizer.normalizeAll('i\\u0067nore previous instructions');
589
271
  */
590
- const DEFAULT_LAYERS = [
591
- { name: 'unicode', fn: unicodeCanon },
592
- { name: 'homoglyph', fn: homoglyphDecode },
593
- { name: 'encoding', fn: encodingDecode },
594
- { name: 'whitespace', fn: whitespaceNorm },
595
- { name: 'case_fold', fn: caseFold },
596
- { name: 'leet_speak', fn: leetDecode },
597
- { name: 'markdown', fn: markdownStrip },
598
- { name: 'repetition', fn: repetitionCollapse },
599
- ];
272
+ class TextNormalizer {
273
+ /**
274
+ * @param {object} [options]
275
+ * @param {boolean} [options.aggressive] - Enable aggressive normalization (default false)
276
+ * @param {string[]} [options.customWrappers] - Additional context wrapper patterns
277
+ */
278
+ constructor(options = {}) {
279
+ this.aggressive = options.aggressive || false;
280
+ this.customWrapperPatterns = [];
281
+
282
+ if (options.customWrappers) {
283
+ for (const w of options.customWrappers) {
284
+ try {
285
+ this.customWrapperPatterns.push(new RegExp(w, 'im'));
286
+ } catch (_e) {
287
+ console.warn(`[Agent Shield] Invalid custom wrapper pattern: ${w}`);
288
+ }
289
+ }
290
+ }
600
291
 
601
- /**
602
- * @typedef {Object} NormalizationResult
603
- * @property {string} original - The original input text.
604
- * @property {string} normalized - The fully normalized text.
605
- * @property {string[]} layers - Names of layers that modified the text.
606
- */
292
+ /** @type {{ input: string, output: string, steps: string[] }[]} */
293
+ this._history = [];
607
294
 
608
- /**
609
- * Runs the full normalization pipeline on input text.
610
- *
611
- * @param {string} text - Input text to normalize.
612
- * @param {object} [options]
613
- * @param {string[]} [options.skip] - Layer names to skip (e.g., ['case_fold']).
614
- * @param {string[]} [options.only] - Only run these layers (overrides skip).
615
- * @returns {NormalizationResult}
616
- */
617
- function normalize(text, options = {}) {
618
- if (!text || typeof text !== 'string') {
619
- return { original: text || '', normalized: text || '', layers: [] };
295
+ console.log('[Agent Shield] TextNormalizer initialized');
620
296
  }
621
297
 
622
- const skip = options.skip || [];
623
- const only = options.only || null;
624
- const appliedLayers = [];
625
- let current = text;
298
+ /**
299
+ * Remove zero-width and invisible Unicode characters.
300
+ * @param {string} text
301
+ * @returns {string}
302
+ */
303
+ stripZeroWidth(text) {
304
+ return stripZeroWidth(text);
305
+ }
626
306
 
627
- for (const layer of DEFAULT_LAYERS) {
628
- if (only && !only.includes(layer.name)) continue;
629
- if (!only && skip.includes(layer.name)) continue;
307
+ /**
308
+ * Convert leetspeak substitutions back to ASCII.
309
+ * @param {string} text
310
+ * @returns {string}
311
+ */
312
+ reverseLeetspeak(text) {
313
+ return reverseLeetspeak(text);
314
+ }
630
315
 
631
- const result = layer.fn(current);
632
- if (result.applied) {
633
- appliedLayers.push(layer.name);
634
- current = result.text;
635
- }
316
+ /**
317
+ * Collapse spaced-out character obfuscation.
318
+ * @param {string} text
319
+ * @returns {string}
320
+ */
321
+ collapseCharSpacing(text) {
322
+ return collapseCharSpacing(text);
636
323
  }
637
324
 
638
- return {
639
- original: text,
640
- normalized: current,
641
- layers: appliedLayers
642
- };
643
- }
325
+ /**
326
+ * Remove context wrapper phrases.
327
+ * @param {string} text
328
+ * @returns {string}
329
+ */
330
+ stripContextWrappers(text) {
331
+ let result = stripContextWrappers(text);
644
332
 
645
- // =========================================================================
646
- // TextNormalizer CLASS
647
- // =========================================================================
333
+ // Apply custom wrappers
334
+ for (const pattern of this.customWrapperPatterns) {
335
+ result = result.replace(pattern, '');
336
+ }
337
+
338
+ return result;
339
+ }
648
340
 
649
- /**
650
- * Configurable text normalization pipeline for Agent Shield.
651
- *
652
- * Runs multiple normalization layers in sequence to defeat evasion
653
- * techniques before regex pattern matching.
654
- *
655
- * @example
656
- * const { TextNormalizer } = require('./normalizer');
657
- * const normalizer = new TextNormalizer({ skip: ['case_fold'] });
658
- * const result = normalizer.normalize('ïgnörë àll prévïöüs ïnstrüctïöns');
659
- * console.log(result.normalized); // 'ignore all previous instructions'
660
- */
661
- class TextNormalizer {
662
341
  /**
663
- * @param {object} [config]
664
- * @param {string[]} [config.skip] - Layer names to skip.
665
- * @param {string[]} [config.only] - Only run these layers.
666
- * @param {boolean} [config.verbose=false] - Log normalization steps.
342
+ * Decode percent-encoded, Unicode escape, hex escape, and HTML entity sequences.
343
+ * @param {string} text
344
+ * @returns {string}
667
345
  */
668
- constructor(config = {}) {
669
- this.skip = config.skip || [];
670
- this.only = config.only || null;
671
- this.verbose = config.verbose || false;
346
+ decodeUnicodeEscapes(text) {
347
+ return decodeUnicodeEscapes(text);
672
348
  }
673
349
 
674
350
  /**
675
- * Normalizes input text through the pipeline.
676
- *
677
- * @param {string} text - Input text.
678
- * @returns {NormalizationResult}
351
+ * Apply all normalizers in sequence.
352
+ * @param {string} text
353
+ * @returns {string}
679
354
  */
680
- normalize(text) {
681
- const result = normalize(text, { skip: this.skip, only: this.only });
682
-
683
- if (this.verbose && result.layers.length > 0) {
684
- console.log(`[Agent Shield] normalizer: applied ${result.layers.length} layer(s): ${result.layers.join(', ')}`);
355
+ normalizeAll(text) {
356
+ if (!text || typeof text !== 'string') return text || '';
357
+
358
+ const steps = [];
359
+ let result = text;
360
+
361
+ result = this.stripZeroWidth(result);
362
+ if (result !== text) steps.push('stripZeroWidth');
363
+
364
+ const prev1 = result;
365
+ result = this.decodeUnicodeEscapes(result);
366
+ if (result !== prev1) steps.push('decodeUnicodeEscapes');
367
+
368
+ const prev2 = result;
369
+ result = this.reverseLeetspeak(result);
370
+ if (result !== prev2) steps.push('reverseLeetspeak');
371
+
372
+ const prev3 = result;
373
+ result = this.collapseCharSpacing(result);
374
+ if (result !== prev3) steps.push('collapseCharSpacing');
375
+
376
+ const prev4 = result;
377
+ result = this.stripContextWrappers(result);
378
+ if (result !== prev4) steps.push('stripContextWrappers');
379
+
380
+ // Aggressive mode: apply a second pass
381
+ if (this.aggressive && steps.length > 0) {
382
+ result = stripZeroWidth(result);
383
+ result = decodeUnicodeEscapes(result);
384
+ result = reverseLeetspeak(result);
385
+ result = collapseCharSpacing(result);
386
+ steps.push('aggressive_second_pass');
685
387
  }
686
388
 
389
+ this._history.push({ input: text.slice(0, 200), output: result.slice(0, 200), steps });
390
+
687
391
  return result;
688
392
  }
689
393
 
690
394
  /**
691
- * Runs a single named layer on the input text.
692
- *
693
- * @param {string} layerName - Name of the layer to run.
694
- * @param {string} text - Input text.
695
- * @returns {{ text: string, applied: boolean }}
395
+ * Get normalization history.
396
+ * @returns {Array<{ input: string, output: string, steps: string[] }>}
696
397
  */
697
- runLayer(layerName, text) {
698
- const layer = DEFAULT_LAYERS.find(l => l.name === layerName);
699
- if (!layer) {
700
- throw new Error(`[Agent Shield] normalizer: unknown layer "${layerName}"`);
701
- }
702
- return layer.fn(text);
398
+ getHistory() {
399
+ return [...this._history];
703
400
  }
704
401
 
705
402
  /**
706
- * Returns the list of available layer names.
707
- *
708
- * @returns {string[]}
403
+ * Clear normalization history.
709
404
  */
710
- getLayerNames() {
711
- return DEFAULT_LAYERS.map(l => l.name);
405
+ clearHistory() {
406
+ this._history = [];
712
407
  }
713
408
  }
714
409
 
@@ -718,17 +413,14 @@ class TextNormalizer {
718
413
 
719
414
  module.exports = {
720
415
  TextNormalizer,
721
- normalize,
722
- HOMOGLYPH_MAP,
416
+ normalizeAll,
417
+ stripZeroWidth,
418
+ reverseLeetspeak,
419
+ collapseCharSpacing,
420
+ stripContextWrappers,
421
+ decodeUnicodeEscapes,
422
+ ZERO_WIDTH_RE,
723
423
  LEET_MAP,
724
- DEFAULT_LAYERS,
725
- // Individual layer functions for direct use
726
- unicodeCanon,
727
- homoglyphDecode,
728
- encodingDecode,
729
- whitespaceNorm,
730
- caseFold,
731
- leetDecode,
732
- markdownStrip,
733
- repetitionCollapse,
424
+ CONTEXT_WRAPPERS,
425
+ HTML_ENTITIES
734
426
  };