re2js 2.1.0 → 2.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +103 -8
- package/build/index.cjs.cjs +272 -71
- package/build/index.cjs.cjs.map +1 -1
- package/build/index.esm.d.ts +11 -1
- package/build/index.esm.d.ts.map +1 -1
- package/build/index.esm.js +272 -71
- package/build/index.esm.js.map +1 -1
- package/build/index.umd.js +272 -71
- package/build/index.umd.js.map +1 -1
- package/package.json +2 -2
package/README.md
CHANGED
|
@@ -90,6 +90,10 @@ RE2JS.DISABLE_UNICODE_GROUPS
|
|
|
90
90
|
* Flag: matches longest possible string (changes the match semantics to leftmost-longest).
|
|
91
91
|
*/
|
|
92
92
|
RE2JS.LONGEST_MATCH
|
|
93
|
+
/**
|
|
94
|
+
* Flag: enable linear-time captureless lookbehinds.
|
|
95
|
+
*/
|
|
96
|
+
RE2JS.LOOKBEHINDS
|
|
93
97
|
```
|
|
94
98
|
|
|
95
99
|
### Checking for Matches
|
|
@@ -237,10 +241,10 @@ RE2JS includes a highly optimized `RE2Set` API that allows you to match multiple
|
|
|
237
241
|
This is incredibly powerful for profanity filters, routing engines, or log parsers.
|
|
238
242
|
|
|
239
243
|
```js
|
|
240
|
-
import { RE2Set
|
|
244
|
+
import { RE2Set } from 're2js'
|
|
241
245
|
|
|
242
|
-
// Create a new set. You can optionally pass
|
|
243
|
-
// Default:
|
|
246
|
+
// Create a new set. You can optionally pass an anchor and public RE2JS flags.
|
|
247
|
+
// Default anchor: RE2Set.UNANCHORED
|
|
244
248
|
const set = new RE2Set()
|
|
245
249
|
|
|
246
250
|
// Add patterns to the set.
|
|
@@ -263,24 +267,86 @@ console.log(set.match('All systems operational.'))
|
|
|
263
267
|
|
|
264
268
|
#### Anchoring a Set
|
|
265
269
|
|
|
266
|
-
You can strictly anchor the entire set by passing an anchor
|
|
270
|
+
You can strictly anchor the entire set by passing an anchor constant to the constructor (`RE2Set.UNANCHORED`, `RE2Set.ANCHOR_START`, or `RE2Set.ANCHOR_BOTH`).
|
|
271
|
+
|
|
272
|
+
Additionally, you can pass standard public `RE2JS` flags (like `CASE_INSENSITIVE` or `LOOKBEHINDS`) as the second argument to apply them to all patterns in the set.
|
|
267
273
|
|
|
268
274
|
```js
|
|
269
|
-
import { RE2Set,
|
|
275
|
+
import { RE2Set, RE2JS } from 're2js'
|
|
276
|
+
|
|
277
|
+
// Anchor the set to match the entire string, and make it case-insensitive
|
|
278
|
+
const set = new RE2Set(RE2Set.ANCHOR_BOTH, RE2JS.CASE_INSENSITIVE)
|
|
270
279
|
|
|
271
|
-
const set = new RE2Set(RE2Flags.ANCHOR_BOTH)
|
|
272
280
|
set.add('foo') // ID: 0
|
|
273
281
|
set.add('bar') // ID: 1
|
|
274
282
|
set.add('.*') // ID: 2
|
|
275
283
|
|
|
276
284
|
set.compile()
|
|
277
285
|
|
|
278
|
-
console.log(set.match('
|
|
279
|
-
console.log(set.match('foobar')) // [2] (Only '.*' matches the entire string)
|
|
286
|
+
console.log(set.match('FOO')) // [0, 2] (Matches 'foo' and '.*' because of CASE_INSENSITIVE)
|
|
287
|
+
console.log(set.match('foobar')) // [2] (Only '.*' matches the entire string because of ANCHOR_BOTH)
|
|
280
288
|
```
|
|
281
289
|
|
|
282
290
|
***Performance Note:** `RE2Set` heavily utilizes the high-speed DFA engine to process multi-pattern matches simultaneously. However, if your patterns contain boundaries (e.g., `\b`) or trigger a massive state explosion, it will seamlessly and safely fall back to the bounded NFA engine.*
|
|
283
291
|
|
|
292
|
+
#### Example: Fast JS Routing with RE2Set
|
|
293
|
+
|
|
294
|
+
```js
|
|
295
|
+
import { RE2Set, RE2JS } from 're2js'
|
|
296
|
+
|
|
297
|
+
class Router {
|
|
298
|
+
constructor() {
|
|
299
|
+
this.set = new RE2Set()
|
|
300
|
+
this.routes = []
|
|
301
|
+
}
|
|
302
|
+
|
|
303
|
+
addRoute(pattern, handler) {
|
|
304
|
+
// compile the individual regex (for extracting groups later)
|
|
305
|
+
const re = RE2JS.compile(pattern)
|
|
306
|
+
|
|
307
|
+
// add the raw string to the blazing-fast Set
|
|
308
|
+
const id = this.set.add(pattern)
|
|
309
|
+
|
|
310
|
+
// store them together
|
|
311
|
+
this.routes[id] = { re, handler }
|
|
312
|
+
}
|
|
313
|
+
|
|
314
|
+
compile() {
|
|
315
|
+
this.set.compile()
|
|
316
|
+
}
|
|
317
|
+
|
|
318
|
+
execute(path) {
|
|
319
|
+
// find WHICH routes matched in O(N) time
|
|
320
|
+
const matchedIDs = this.set.match(path)
|
|
321
|
+
|
|
322
|
+
if (matchedIDs.length === 0) {
|
|
323
|
+
return '404 Not Found'
|
|
324
|
+
}
|
|
325
|
+
|
|
326
|
+
// extract groups ONLY for the routes that won
|
|
327
|
+
for (const id of matchedIDs) {
|
|
328
|
+
const route = this.routes[id]
|
|
329
|
+
const matcher = route.re.matcher(path)
|
|
330
|
+
|
|
331
|
+
if (matcher.matches()) {
|
|
332
|
+
const params = matcher.getNamedGroups()
|
|
333
|
+
return route.handler(params)
|
|
334
|
+
}
|
|
335
|
+
}
|
|
336
|
+
}
|
|
337
|
+
}
|
|
338
|
+
|
|
339
|
+
// --- Usage ---
|
|
340
|
+
const router = new Router()
|
|
341
|
+
|
|
342
|
+
router.addRoute('^/users/(?P<id>\\d+)$', (params) => `User ID: ${params.id}`)
|
|
343
|
+
router.addRoute('^/posts/(?P<slug>[a-z-]+)$', (params) => `Post: ${params.slug}`)
|
|
344
|
+
|
|
345
|
+
router.compile()
|
|
346
|
+
|
|
347
|
+
console.log(router.execute('/users/42')) // Outputs: "User ID: 42"
|
|
348
|
+
```
|
|
349
|
+
|
|
284
350
|
### Working with Groups
|
|
285
351
|
|
|
286
352
|
RE2JS supports capturing groups in regex patterns
|
|
@@ -587,6 +653,35 @@ In the second example, a ReDoS scenario is depicted. The regular expression `([a
|
|
|
587
653
|
|
|
588
654
|
RE2JS processed this poison-pill string **30,000 times in just ~454 milliseconds**, while the native RegExp completely locked up the main thread for **over 1 minute and 45 seconds trying to evaluate it just once**. This demonstrates why RE2JS is absolutely essential for securely handling untrusted regular expressions and protecting Node.js and browser applications against ReDoS attacks.
|
|
589
655
|
|
|
656
|
+
## Lookbehinds (Linear-Time Execution)
|
|
657
|
+
|
|
658
|
+
Historically, the RE2 specification has strictly forbidden lookaround assertions (like lookbehinds) because traditional regex engines use backtracking to evaluate them, leading to catastrophic exponential execution times and ReDoS vulnerabilities.
|
|
659
|
+
|
|
660
|
+
However, `re2js` implements a breakthrough algorithmic approach ([developed by researchers at EPFL](https://arxiv.org/pdf/2311.17620); see also [their guide to adding it to RE2](https://systemf.epfl.ch/blog/re2-lookbehinds/)) that evaluates **captureless lookbehinds in strict linear $O(n)$ time** without backtracking. Because this diverges from the standard RE2 specification and carries a slight performance trade-off, it is disabled by default.
|
|
661
|
+
|
|
662
|
+
You can enable it by passing the `RE2JS.LOOKBEHINDS` flag during compilation:
|
|
663
|
+
|
|
664
|
+
```js
|
|
665
|
+
import { RE2JS } from 're2js';
|
|
666
|
+
|
|
667
|
+
// Positive Lookbehind: Match 'bar' only if preceded by 'foo'
|
|
668
|
+
const positive = RE2JS.compile('(?<=foo)bar', RE2JS.LOOKBEHINDS);
|
|
669
|
+
positive.test('foobar'); // true
|
|
670
|
+
positive.test('bazbar'); // false
|
|
671
|
+
|
|
672
|
+
// Negative Lookbehind: Match 'bar' only if NOT preceded by 'foo'
|
|
673
|
+
const negative = RE2JS.compile('(?<!foo)bar', RE2JS.LOOKBEHINDS);
|
|
674
|
+
negative.test('bazbar'); // true
|
|
675
|
+
negative.test('foobar'); // false
|
|
676
|
+
```
|
|
677
|
+
|
|
678
|
+
### Important Limitations and Warnings
|
|
679
|
+
|
|
680
|
+
1. **Performance Overhead:** If a regex contains a lookbehind, the engine is forced to safely bypass the ultra-fast Lazy DFA and OnePass engines. It evaluates the lookbehinds using parallel automata running on the NFA (Pike VM). While execution remains mathematically safe and linear $O(n)$, the NFA engine is generally slower than the DFA fast-paths. Use lookbehinds only when necessary.
|
|
681
|
+
2. **Prefix Acceleration is Disabled:** To ensure the parallel tracking automata initialize correctly, high-speed string prefix skipping (e.g., using `indexOf` to jump to a starting literal) is disabled when lookbehinds are present.
|
|
682
|
+
3. **Captureless Guarantee:** To prevent state-explosion vulnerabilities, lookbehinds are strictly evaluated as *captureless*. If you include a capturing group inside a lookbehind (e.g., `(?<=(foo))bar`), the engine will match successfully, but `group(1)` will safely return `null`.
|
|
683
|
+
|
|
684
|
+
|
|
590
685
|
## Development
|
|
591
686
|
|
|
592
687
|
Some files like `CharGroup.js` and `UnicodeTables.js` are generated and should be edited in their respective generator files:
|