re2js 2.1.1 → 2.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +157 -62
- package/build/index.cjs.cjs +676 -206
- package/build/index.cjs.cjs.map +1 -1
- package/build/index.esm.d.ts +14 -2
- package/build/index.esm.d.ts.map +1 -1
- package/build/index.esm.js +676 -206
- package/build/index.esm.js.map +1 -1
- package/build/index.umd.js +676 -206
- package/build/index.umd.js.map +1 -1
- package/package.json +7 -7
package/README.md
CHANGED
|
@@ -90,6 +90,10 @@ RE2JS.DISABLE_UNICODE_GROUPS
|
|
|
90
90
|
* Flag: matches longest possible string (changes the match semantics to leftmost-longest).
|
|
91
91
|
*/
|
|
92
92
|
RE2JS.LONGEST_MATCH
|
|
93
|
+
/**
|
|
94
|
+
* Flag: enable linear-time captureless lookbehinds.
|
|
95
|
+
*/
|
|
96
|
+
RE2JS.LOOKBEHINDS
|
|
93
97
|
```
|
|
94
98
|
|
|
95
99
|
### Checking for Matches
|
|
@@ -114,49 +118,6 @@ RE2JS.compile(
|
|
|
114
118
|
).matches('AB\nc') // true
|
|
115
119
|
```
|
|
116
120
|
|
|
117
|
-
### Finding Matches
|
|
118
|
-
|
|
119
|
-
To find a match for a given regex pattern in a string, you can use the `find()` function
|
|
120
|
-
|
|
121
|
-
```js
|
|
122
|
-
import { RE2JS } from 're2js'
|
|
123
|
-
|
|
124
|
-
RE2JS.compile('ab+c').matcher('xxabbbc').find() // true
|
|
125
|
-
RE2JS.compile('ab+c').matcher('cbbba').find() // false
|
|
126
|
-
// with flags
|
|
127
|
-
RE2JS.compile('ab+c', RE2JS.CASE_INSENSITIVE).matcher('abBBc').find() // true
|
|
128
|
-
```
|
|
129
|
-
|
|
130
|
-
Example to collect all matches in string
|
|
131
|
-
|
|
132
|
-
```js
|
|
133
|
-
import { RE2JS } from 're2js'
|
|
134
|
-
|
|
135
|
-
const p = RE2JS.compile('abc+')
|
|
136
|
-
const matchString = p.matcher('abc abcccc abcc')
|
|
137
|
-
const results = []
|
|
138
|
-
while (matchString.find()) {
|
|
139
|
-
results.push(matchString.group())
|
|
140
|
-
}
|
|
141
|
-
results // ['abc', 'abcccc', 'abcc']
|
|
142
|
-
```
|
|
143
|
-
|
|
144
|
-
The `find()` method searches for a pattern match in a string starting from a specific index
|
|
145
|
-
|
|
146
|
-
```js
|
|
147
|
-
import { RE2JS } from 're2js'
|
|
148
|
-
|
|
149
|
-
const p = RE2JS.compile('.*[aeiou]')
|
|
150
|
-
const matchString = p.matcher('abcdefgh')
|
|
151
|
-
matchString.find(0) // true
|
|
152
|
-
matchString.group() // 'abcde'
|
|
153
|
-
matchString.find(1) // true
|
|
154
|
-
matchString.group() // 'bcde'
|
|
155
|
-
matchString.find(4) // true
|
|
156
|
-
matchString.group() // 'e'
|
|
157
|
-
matchString.find(7) // false
|
|
158
|
-
```
|
|
159
|
-
|
|
160
121
|
### High-Performance Boolean Testing
|
|
161
122
|
|
|
162
123
|
If you only need to know **whether** a string matches a pattern (without extracting capture groups), you should use the `test()`, `testExact()`, or `matches()` methods. Unlike `.matcher()`, these methods do not instantiate stateful `Matcher` objects and request exactly `0` capture groups. This ensures that, whenever possible, execution is routed to the high-speed DFA (Deterministic Finite Automaton) engine, which runs in linear `O(n)` time.
|
|
@@ -206,28 +167,47 @@ RE2JS.compile('abc').matcher('ab').lookingAt() // false
|
|
|
206
167
|
|
|
207
168
|
Note that the `lookingAt` method only checks the start of the string. It does not search the entire string for a match
|
|
208
169
|
|
|
209
|
-
###
|
|
170
|
+
### Finding Matches
|
|
210
171
|
|
|
211
|
-
|
|
172
|
+
To find a match for a given regex pattern in a string, you can use the `find()` function
|
|
212
173
|
|
|
213
174
|
```js
|
|
214
175
|
import { RE2JS } from 're2js'
|
|
215
176
|
|
|
216
|
-
RE2JS.compile('
|
|
217
|
-
RE2JS.compile('
|
|
218
|
-
|
|
177
|
+
RE2JS.compile('ab+c').matcher('xxabbbc').find() // true
|
|
178
|
+
RE2JS.compile('ab+c').matcher('cbbba').find() // false
|
|
179
|
+
// with flags
|
|
180
|
+
RE2JS.compile('ab+c', RE2JS.CASE_INSENSITIVE).matcher('abBBc').find() // true
|
|
219
181
|
```
|
|
220
182
|
|
|
221
|
-
|
|
183
|
+
Example to collect all matches in string
|
|
222
184
|
|
|
223
185
|
```js
|
|
224
186
|
import { RE2JS } from 're2js'
|
|
225
187
|
|
|
226
|
-
RE2JS.compile('
|
|
227
|
-
|
|
228
|
-
|
|
229
|
-
|
|
230
|
-
|
|
188
|
+
const p = RE2JS.compile('abc+')
|
|
189
|
+
const matchString = p.matcher('abc abcccc abcc')
|
|
190
|
+
const results = []
|
|
191
|
+
while (matchString.find()) {
|
|
192
|
+
results.push(matchString.group())
|
|
193
|
+
}
|
|
194
|
+
results // ['abc', 'abcccc', 'abcc']
|
|
195
|
+
```
|
|
196
|
+
|
|
197
|
+
The `find()` method searches for a pattern match in a string starting from a specific index
|
|
198
|
+
|
|
199
|
+
```js
|
|
200
|
+
import { RE2JS } from 're2js'
|
|
201
|
+
|
|
202
|
+
const p = RE2JS.compile('.*[aeiou]')
|
|
203
|
+
const matchString = p.matcher('abcdefgh')
|
|
204
|
+
matchString.find(0) // true
|
|
205
|
+
matchString.group() // 'abcde'
|
|
206
|
+
matchString.find(1) // true
|
|
207
|
+
matchString.group() // 'bcde'
|
|
208
|
+
matchString.find(4) // true
|
|
209
|
+
matchString.group() // 'e'
|
|
210
|
+
matchString.find(7) // false
|
|
231
211
|
```
|
|
232
212
|
|
|
233
213
|
### Multi-Pattern Matching (RE2Set)
|
|
@@ -237,10 +217,10 @@ RE2JS includes a highly optimized `RE2Set` API that allows you to match multiple
|
|
|
237
217
|
This is incredibly powerful for profanity filters, routing engines, or log parsers.
|
|
238
218
|
|
|
239
219
|
```js
|
|
240
|
-
import { RE2Set
|
|
220
|
+
import { RE2Set } from 're2js'
|
|
241
221
|
|
|
242
|
-
// Create a new set. You can optionally pass
|
|
243
|
-
// Default:
|
|
222
|
+
// Create a new set. You can optionally pass an anchor and public RE2JS flags.
|
|
223
|
+
// Default anchor: RE2Set.UNANCHORED
|
|
244
224
|
const set = new RE2Set()
|
|
245
225
|
|
|
246
226
|
// Add patterns to the set.
|
|
@@ -263,24 +243,110 @@ console.log(set.match('All systems operational.'))
|
|
|
263
243
|
|
|
264
244
|
#### Anchoring a Set
|
|
265
245
|
|
|
266
|
-
You can strictly anchor the entire set by passing an anchor
|
|
246
|
+
You can strictly anchor the entire set by passing an anchor constant to the constructor (`RE2Set.UNANCHORED`, `RE2Set.ANCHOR_START`, or `RE2Set.ANCHOR_BOTH`).
|
|
247
|
+
|
|
248
|
+
Additionally, you can pass standard public `RE2JS` flags (like `CASE_INSENSITIVE` or `LOOKBEHINDS`) as the second argument to apply them to all patterns in the set.
|
|
267
249
|
|
|
268
250
|
```js
|
|
269
|
-
import { RE2Set,
|
|
251
|
+
import { RE2Set, RE2JS } from 're2js'
|
|
252
|
+
|
|
253
|
+
// Anchor the set to match the entire string, and make it case-insensitive
|
|
254
|
+
const set = new RE2Set(RE2Set.ANCHOR_BOTH, RE2JS.CASE_INSENSITIVE)
|
|
270
255
|
|
|
271
|
-
const set = new RE2Set(RE2Flags.ANCHOR_BOTH)
|
|
272
256
|
set.add('foo') // ID: 0
|
|
273
257
|
set.add('bar') // ID: 1
|
|
274
258
|
set.add('.*') // ID: 2
|
|
275
259
|
|
|
276
260
|
set.compile()
|
|
277
261
|
|
|
278
|
-
console.log(set.match('
|
|
279
|
-
console.log(set.match('foobar')) // [2] (Only '.*' matches the entire string)
|
|
262
|
+
console.log(set.match('FOO')) // [0, 2] (Matches 'foo' and '.*' because of CASE_INSENSITIVE)
|
|
263
|
+
console.log(set.match('foobar')) // [2] (Only '.*' matches the entire string because of ANCHOR_BOTH)
|
|
280
264
|
```
|
|
281
265
|
|
|
282
266
|
***Performance Note:** `RE2Set` heavily utilizes the high-speed DFA engine to process multi-pattern matches simultaneously. However, if your patterns contain boundaries (e.g., `\b`) or trigger a massive state explosion, it will seamlessly and safely fall back to the bounded NFA engine.*
|
|
283
267
|
|
|
268
|
+
#### Example: Fast JS Routing with RE2Set
|
|
269
|
+
|
|
270
|
+
```js
|
|
271
|
+
import { RE2Set, RE2JS } from 're2js'
|
|
272
|
+
|
|
273
|
+
class Router {
|
|
274
|
+
constructor() {
|
|
275
|
+
this.set = new RE2Set()
|
|
276
|
+
this.routes = []
|
|
277
|
+
}
|
|
278
|
+
|
|
279
|
+
addRoute(pattern, handler) {
|
|
280
|
+
// compile the individual regex (for extracting groups later)
|
|
281
|
+
const re = RE2JS.compile(pattern)
|
|
282
|
+
|
|
283
|
+
// add the raw string to the blazing-fast Set
|
|
284
|
+
const id = this.set.add(pattern)
|
|
285
|
+
|
|
286
|
+
// store them together
|
|
287
|
+
this.routes[id] = { re, handler }
|
|
288
|
+
}
|
|
289
|
+
|
|
290
|
+
compile() {
|
|
291
|
+
this.set.compile()
|
|
292
|
+
}
|
|
293
|
+
|
|
294
|
+
execute(path) {
|
|
295
|
+
// find WHICH routes matched in O(N) time
|
|
296
|
+
const matchedIDs = this.set.match(path)
|
|
297
|
+
|
|
298
|
+
if (matchedIDs.length === 0) {
|
|
299
|
+
return '404 Not Found'
|
|
300
|
+
}
|
|
301
|
+
|
|
302
|
+
// extract groups ONLY for the routes that won
|
|
303
|
+
for (const id of matchedIDs) {
|
|
304
|
+
const route = this.routes[id]
|
|
305
|
+
const matcher = route.re.matcher(path)
|
|
306
|
+
|
|
307
|
+
if (matcher.matches()) {
|
|
308
|
+
const params = matcher.getNamedGroups()
|
|
309
|
+
return route.handler(params)
|
|
310
|
+
}
|
|
311
|
+
}
|
|
312
|
+
}
|
|
313
|
+
}
|
|
314
|
+
|
|
315
|
+
// --- Usage ---
|
|
316
|
+
const router = new Router()
|
|
317
|
+
|
|
318
|
+
router.addRoute('^/users/(?P<id>\\d+)$', (params) => `User ID: ${params.id}`)
|
|
319
|
+
router.addRoute('^/posts/(?P<slug>[a-z-]+)$', (params) => `Post: ${params.slug}`)
|
|
320
|
+
|
|
321
|
+
router.compile()
|
|
322
|
+
|
|
323
|
+
console.log(router.execute('/users/42')) // Outputs: "User ID: 42"
|
|
324
|
+
```
|
|
325
|
+
|
|
326
|
+
### Splitting Strings
|
|
327
|
+
|
|
328
|
+
You can split a string based on a regex pattern using the `split()` function
|
|
329
|
+
|
|
330
|
+
```js
|
|
331
|
+
import { RE2JS } from 're2js'
|
|
332
|
+
|
|
333
|
+
RE2JS.compile('/').split('abcde') // ['abcde']
|
|
334
|
+
RE2JS.compile('/').split('a/b/cc//d/e//') // ['a', 'b', 'cc', '', 'd', 'e']
|
|
335
|
+
RE2JS.compile(':').split(':a::b') // ['', 'a', '', 'b']
|
|
336
|
+
```
|
|
337
|
+
|
|
338
|
+
The `split()` function also supports a limit parameter
|
|
339
|
+
|
|
340
|
+
```js
|
|
341
|
+
import { RE2JS } from 're2js'
|
|
342
|
+
|
|
343
|
+
RE2JS.compile('/').split('a/b/cc//d/e//', 3) // ['a', 'b', 'cc//d/e//']
|
|
344
|
+
RE2JS.compile('/').split('a/b/cc//d/e//', 4) // ['a', 'b', 'cc', '/d/e//']
|
|
345
|
+
RE2JS.compile('/').split('a/b/cc//d/e//', 9) // ['a', 'b', 'cc', '', 'd', 'e', '', '']
|
|
346
|
+
RE2JS.compile(':').split('boo:and:foo', 2) // ['boo', 'and:foo']
|
|
347
|
+
RE2JS.compile(':').split('boo:and:foo', 5) // ['boo', 'and', 'foo']
|
|
348
|
+
```
|
|
349
|
+
|
|
284
350
|
### Working with Groups
|
|
285
351
|
|
|
286
352
|
RE2JS supports capturing groups in regex patterns
|
|
@@ -587,6 +653,35 @@ In the second example, a ReDoS scenario is depicted. The regular expression `([a
|
|
|
587
653
|
|
|
588
654
|
RE2JS processed this poison-pill string **30,000 times in just ~454 milliseconds**, while the native RegExp completely locked up the main thread for **over 1 minute and 45 seconds trying to evaluate it just once**. This demonstrates why RE2JS is absolutely essential for securely handling untrusted regular expressions and protecting Node.js and browser applications against ReDoS attacks.
|
|
589
655
|
|
|
656
|
+
## Lookbehinds (Linear-Time Execution)
|
|
657
|
+
|
|
658
|
+
Historically, the RE2 specification has strictly forbidden lookaround assertions (like lookbehinds) because traditional regex engines use backtracking to evaluate them, leading to catastrophic exponential execution times and ReDoS vulnerabilities.
|
|
659
|
+
|
|
660
|
+
However, `re2js` implements a breakthrough algorithmic approach ([developed by researchers at EPFL](https://arxiv.org/pdf/2311.17620); see also the [guide on adding it to RE2](https://systemf.epfl.ch/blog/re2-lookbehinds/)) that evaluates **captureless lookbehinds in strict linear $O(n)$ time** without backtracking. Because this diverges from the standard RE2 specification and carries a slight performance trade-off, it is disabled by default.
|
|
661
|
+
|
|
662
|
+
You can enable it by passing the `RE2JS.LOOKBEHINDS` flag during compilation:
|
|
663
|
+
|
|
664
|
+
```js
|
|
665
|
+
import { RE2JS } from 're2js';
|
|
666
|
+
|
|
667
|
+
// Positive Lookbehind: Match 'bar' only if preceded by 'foo'
|
|
668
|
+
const positive = RE2JS.compile('(?<=foo)bar', RE2JS.LOOKBEHINDS);
|
|
669
|
+
positive.test('foobar'); // true
|
|
670
|
+
positive.test('bazbar'); // false
|
|
671
|
+
|
|
672
|
+
// Negative Lookbehind: Match 'bar' only if NOT preceded by 'foo'
|
|
673
|
+
const negative = RE2JS.compile('(?<!foo)bar', RE2JS.LOOKBEHINDS);
|
|
674
|
+
negative.test('bazbar'); // true
|
|
675
|
+
negative.test('foobar'); // false
|
|
676
|
+
```
|
|
677
|
+
|
|
678
|
+
### Important Limitations and Warnings
|
|
679
|
+
|
|
680
|
+
1. **Performance Overhead:** If a regex contains a lookbehind, the engine is forced to safely bypass the ultra-fast Lazy DFA and OnePass engines. It evaluates the lookbehinds using parallel automata running on the NFA (Pike VM). While execution remains mathematically safe and linear $O(n)$, the NFA engine is generally slower than the DFA fast-paths. Use lookbehinds only when necessary.
|
|
681
|
+
2. **Prefix Acceleration is Disabled:** To ensure the parallel tracking automata initialize correctly, high-speed string prefix skipping (e.g., using `indexOf` to jump to a starting literal) is disabled when lookbehinds are present.
|
|
682
|
+
3. **Captureless Guarantee:** To prevent state-explosion vulnerabilities, lookbehinds are strictly evaluated as *captureless*. If you include a capturing group inside a lookbehind (e.g., `(?<=(foo))bar`), the engine will match successfully, but `group(1)` will safely return `null`.
|
|
683
|
+
|
|
684
|
+
|
|
590
685
|
## Development
|
|
591
686
|
|
|
592
687
|
Some files like `CharGroup.js` and `UnicodeTables.js` are generated and should be edited in their respective generator files:
|