re2 1.16.0 → 1.17.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.github/actions/{linux-alpine-node-15 → linux-alpine-node-17}/Dockerfile +1 -1
- package/.github/actions/linux-alpine-node-17/action.yml +7 -0
- package/.github/actions/{linux-alpine-node-15 → linux-alpine-node-17}/entrypoint.sh +0 -0
- package/.github/dependabot.yml +1 -1
- package/.github/workflows/build.yml +8 -8
- package/.github/workflows/tests.yml +1 -1
- package/README.md +1 -0
- package/package.json +3 -3
- package/tests/test_general.js +6 -0
- package/vendor/README +3 -1
- package/vendor/re2/bitstate.cc +3 -3
- package/vendor/re2/compile.cc +50 -34
- package/vendor/re2/dfa.cc +24 -21
- package/vendor/re2/fuzzing/re2_fuzzer.cc +96 -20
- package/vendor/re2/make_perl_groups.pl +1 -1
- package/vendor/re2/nfa.cc +5 -5
- package/vendor/re2/onepass.cc +2 -2
- package/vendor/re2/parse.cc +41 -22
- package/vendor/re2/perl_groups.cc +34 -34
- package/vendor/re2/prog.cc +188 -4
- package/vendor/re2/prog.h +45 -13
- package/vendor/re2/re2.cc +7 -12
- package/vendor/re2/re2.h +7 -3
- package/vendor/re2/regexp.cc +11 -5
- package/vendor/re2/regexp.h +7 -2
- package/vendor/re2/set.cc +3 -0
- package/vendor/re2/testing/backtrack.cc +3 -3
- package/vendor/re2/testing/compile_test.cc +45 -21
- package/vendor/re2/testing/dfa_test.cc +4 -4
- package/vendor/re2/testing/exhaustive_tester.cc +2 -2
- package/vendor/re2/testing/parse_test.cc +1 -0
- package/vendor/re2/testing/re2_test.cc +31 -16
- package/vendor/re2/testing/regexp_benchmark.cc +108 -121
- package/vendor/re2/testing/required_prefix_test.cc +78 -24
- package/vendor/re2/testing/search_test.cc +2 -0
- package/vendor/re2/testing/tester.cc +9 -9
- package/vendor/re2/tostring.cc +1 -1
- package/vendor/re2/unicode.py +1 -1
- package/vendor/re2/unicode_casefold.cc +25 -11
- package/vendor/re2/unicode_groups.cc +319 -151
- package/vendor/re2/walker-inl.h +3 -2
- package/vendor/util/mutex.h +2 -2
- package/.github/actions/linux-alpine-node-15/action.yml +0 -7
|
File without changes
|
package/.github/dependabot.yml
CHANGED
|
@@ -31,7 +31,7 @@ jobs:
|
|
|
31
31
|
strategy:
|
|
32
32
|
matrix:
|
|
33
33
|
os: [windows-latest, macOS-latest]
|
|
34
|
-
node-version: [12, 14,
|
|
34
|
+
node-version: [12, 14, 16, 17]
|
|
35
35
|
|
|
36
36
|
steps:
|
|
37
37
|
- uses: actions/checkout@v2
|
|
@@ -73,7 +73,7 @@ jobs:
|
|
|
73
73
|
|
|
74
74
|
strategy:
|
|
75
75
|
matrix:
|
|
76
|
-
node-version: [12, 14,
|
|
76
|
+
node-version: [12, 14, 16, 17]
|
|
77
77
|
|
|
78
78
|
steps:
|
|
79
79
|
- uses: actions/checkout@v2
|
|
@@ -152,8 +152,8 @@ jobs:
|
|
|
152
152
|
env:
|
|
153
153
|
GITHUB_TOKEN: ${{secrets.GITHUB_TOKEN}}
|
|
154
154
|
|
|
155
|
-
build-linux-alpine-node-
|
|
156
|
-
name: Node.js
|
|
155
|
+
build-linux-alpine-node-16:
|
|
156
|
+
name: Node.js 16 on Alpine Linux
|
|
157
157
|
needs: create-release
|
|
158
158
|
runs-on: ubuntu-latest
|
|
159
159
|
continue-on-error: true
|
|
@@ -175,12 +175,12 @@ jobs:
|
|
|
175
175
|
Linux-Alpine-node-
|
|
176
176
|
Linux-Alpine-
|
|
177
177
|
- name: Install, test, and create artifact
|
|
178
|
-
uses: ./.github/actions/linux-alpine-node-
|
|
178
|
+
uses: ./.github/actions/linux-alpine-node-16/
|
|
179
179
|
env:
|
|
180
180
|
GITHUB_TOKEN: ${{secrets.GITHUB_TOKEN}}
|
|
181
181
|
|
|
182
|
-
build-linux-alpine-node-
|
|
183
|
-
name: Node.js
|
|
182
|
+
build-linux-alpine-node-17:
|
|
183
|
+
name: Node.js 17 on Alpine Linux
|
|
184
184
|
needs: create-release
|
|
185
185
|
runs-on: ubuntu-latest
|
|
186
186
|
continue-on-error: true
|
|
@@ -202,6 +202,6 @@ jobs:
|
|
|
202
202
|
Linux-Alpine-node-
|
|
203
203
|
Linux-Alpine-
|
|
204
204
|
- name: Install, test, and create artifact
|
|
205
|
-
uses: ./.github/actions/linux-alpine-node-
|
|
205
|
+
uses: ./.github/actions/linux-alpine-node-17/
|
|
206
206
|
env:
|
|
207
207
|
GITHUB_TOKEN: ${{secrets.GITHUB_TOKEN}}
|
package/README.md
CHANGED
|
@@ -343,6 +343,7 @@ console.log('re2_res : ' + re2_res); // prints: re2_res : abc,a,b,c
|
|
|
343
343
|
|
|
344
344
|
## Release history
|
|
345
345
|
|
|
346
|
+
- 1.17.0 *Updated GYP, added support for Node 17, updated deps.*
|
|
346
347
|
- 1.16.0 *Updated the compiler (thx, [Sergei Dyshel](https://github.com/sergei-dyshel)), updated GYP, removed support for Node 10, added support for Node 16, updated TS bindings (thx, [BannerBomb](https://github.com/BannerBomb)).*
|
|
347
348
|
- 1.15.9 *Updated deps.*
|
|
348
349
|
- 1.15.8 *Updated deps.*
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "re2",
|
|
3
|
-
"version": "1.
|
|
3
|
+
"version": "1.17.0",
|
|
4
4
|
"description": "Bindings for RE2: fast, safe alternative to backtracking regular expression engines.",
|
|
5
5
|
"homepage": "https://github.com/uhop/node-re2",
|
|
6
6
|
"bugs": "https://github.com/uhop/node-re2/issues",
|
|
@@ -11,8 +11,8 @@
|
|
|
11
11
|
},
|
|
12
12
|
"dependencies": {
|
|
13
13
|
"install-artifact-from-github": "^1.2.0",
|
|
14
|
-
"nan": "^2.
|
|
15
|
-
"node-gyp": "^8.
|
|
14
|
+
"nan": "^2.15.0",
|
|
15
|
+
"node-gyp": "^8.4.1"
|
|
16
16
|
},
|
|
17
17
|
"devDependencies": {
|
|
18
18
|
"heya-unit": "^0.3.0"
|
package/tests/test_general.js
CHANGED
|
@@ -209,6 +209,12 @@ unit.add(module, [
|
|
|
209
209
|
eval(t.TEST("s3.length === 1"));
|
|
210
210
|
eval(t.TEST("RE2.getUtf8Length(s3) === 3"));
|
|
211
211
|
|
|
212
|
+
var s4 = "🤡";
|
|
213
|
+
|
|
214
|
+
eval(t.TEST("s4.length === 2"));
|
|
215
|
+
eval(t.TEST("RE2.getUtf8Length(s4) === 4"));
|
|
216
|
+
eval(t.TEST("RE2.getUtf16Length(Buffer.from(s4, 'utf8')) === s4.length"));
|
|
217
|
+
|
|
212
218
|
var b3 = new Buffer([0xF0]);
|
|
213
219
|
|
|
214
220
|
eval(t.TEST("b3.length === 1"));
|
package/vendor/README
CHANGED
|
@@ -31,10 +31,12 @@ The Python wrapper is at https://github.com/google/re2/tree/abseil/python
|
|
|
31
31
|
and on PyPI (https://pypi.org/project/google-re2/).
|
|
32
32
|
|
|
33
33
|
A C wrapper is at https://github.com/marcomaggi/cre2/.
|
|
34
|
+
A D wrapper is at https://github.com/ShigekiKarita/re2d/ and on DUB (code.dlang.org).
|
|
34
35
|
An Erlang wrapper is at https://github.com/dukesoferl/re2/ and on Hex (hex.pm).
|
|
35
36
|
An Inferno wrapper is at https://github.com/powerman/inferno-re2/.
|
|
36
37
|
A Node.js wrapper is at https://github.com/uhop/node-re2/ and on NPM (npmjs.com).
|
|
37
38
|
An OCaml wrapper is at https://github.com/janestreet/re2/ and on OPAM (opam.ocaml.org).
|
|
38
39
|
A Perl wrapper is at https://github.com/dgl/re-engine-RE2/ and on CPAN (cpan.org).
|
|
39
|
-
An R wrapper is at https://github.com/
|
|
40
|
+
An R wrapper is at https://github.com/girishji/re2/ and on CRAN (cran.r-project.org).
|
|
40
41
|
A Ruby wrapper is at https://github.com/mudge/re2/ and on RubyGems (rubygems.org).
|
|
42
|
+
A WebAssembly wrapper is at https://github.com/google/re2-wasm/ and on NPM (npmjs.com).
|
package/vendor/re2/bitstate.cc
CHANGED
|
@@ -293,9 +293,9 @@ bool BitState::Search(const StringPiece& text, const StringPiece& context,
|
|
|
293
293
|
context_ = context;
|
|
294
294
|
if (context_.data() == NULL)
|
|
295
295
|
context_ = text;
|
|
296
|
-
if (prog_->anchor_start() && context_
|
|
296
|
+
if (prog_->anchor_start() && BeginPtr(context_) != BeginPtr(text))
|
|
297
297
|
return false;
|
|
298
|
-
if (prog_->anchor_end() && context_
|
|
298
|
+
if (prog_->anchor_end() && EndPtr(context_) != EndPtr(text))
|
|
299
299
|
return false;
|
|
300
300
|
anchored_ = anchored || prog_->anchor_start();
|
|
301
301
|
longest_ = longest || prog_->anchor_end();
|
|
@@ -377,7 +377,7 @@ bool Prog::SearchBitState(const StringPiece& text,
|
|
|
377
377
|
bool longest = kind != kFirstMatch;
|
|
378
378
|
if (!b.Search(text, context, anchored, longest, match, nmatch))
|
|
379
379
|
return false;
|
|
380
|
-
if (kind == kFullMatch && match[0]
|
|
380
|
+
if (kind == kFullMatch && EndPtr(match[0]) != EndPtr(text))
|
|
381
381
|
return false;
|
|
382
382
|
return true;
|
|
383
383
|
}
|
package/vendor/re2/compile.cc
CHANGED
|
@@ -79,9 +79,11 @@ static const PatchList kNullPatchList = {0, 0};
|
|
|
79
79
|
struct Frag {
|
|
80
80
|
uint32_t begin;
|
|
81
81
|
PatchList end;
|
|
82
|
+
bool nullable;
|
|
82
83
|
|
|
83
|
-
Frag() : begin(0)
|
|
84
|
-
Frag(uint32_t begin, PatchList end
|
|
84
|
+
Frag() : begin(0), end(kNullPatchList), nullable(false) {}
|
|
85
|
+
Frag(uint32_t begin, PatchList end, bool nullable)
|
|
86
|
+
: begin(begin), end(end), nullable(nullable) {}
|
|
85
87
|
};
|
|
86
88
|
|
|
87
89
|
// Input encodings.
|
|
@@ -264,7 +266,7 @@ int Compiler::AllocInst(int n) {
|
|
|
264
266
|
|
|
265
267
|
// Returns an unmatchable fragment.
|
|
266
268
|
Frag Compiler::NoMatch() {
|
|
267
|
-
return Frag(
|
|
269
|
+
return Frag();
|
|
268
270
|
}
|
|
269
271
|
|
|
270
272
|
// Is a an unmatchable fragment?
|
|
@@ -290,11 +292,11 @@ Frag Compiler::Cat(Frag a, Frag b) {
|
|
|
290
292
|
// To run backward over string, reverse all concatenations.
|
|
291
293
|
if (reversed_) {
|
|
292
294
|
PatchList::Patch(inst_.data(), b.end, a.begin);
|
|
293
|
-
return Frag(b.begin, a.end);
|
|
295
|
+
return Frag(b.begin, a.end, b.nullable && a.nullable);
|
|
294
296
|
}
|
|
295
297
|
|
|
296
298
|
PatchList::Patch(inst_.data(), a.end, b.begin);
|
|
297
|
-
return Frag(a.begin, b.end);
|
|
299
|
+
return Frag(a.begin, b.end, a.nullable && b.nullable);
|
|
298
300
|
}
|
|
299
301
|
|
|
300
302
|
// Given fragments for a and b, returns fragment for a|b.
|
|
@@ -310,7 +312,8 @@ Frag Compiler::Alt(Frag a, Frag b) {
|
|
|
310
312
|
return NoMatch();
|
|
311
313
|
|
|
312
314
|
inst_[id].InitAlt(a.begin, b.begin);
|
|
313
|
-
return Frag(id, PatchList::Append(inst_.data(), a.end, b.end)
|
|
315
|
+
return Frag(id, PatchList::Append(inst_.data(), a.end, b.end),
|
|
316
|
+
a.nullable || b.nullable);
|
|
314
317
|
}
|
|
315
318
|
|
|
316
319
|
// When capturing submatches in like-Perl mode, a kOpAlt Inst
|
|
@@ -320,27 +323,44 @@ Frag Compiler::Alt(Frag a, Frag b) {
|
|
|
320
323
|
// then the operator is greedy. If out1_ is the repetition
|
|
321
324
|
// (and out_ moves forward), then the operator is non-greedy.
|
|
322
325
|
|
|
323
|
-
// Given a fragment a, returns a fragment for a
|
|
324
|
-
Frag Compiler::
|
|
326
|
+
// Given a fragment for a, returns a fragment for a+ or a+? (if nongreedy)
|
|
327
|
+
Frag Compiler::Plus(Frag a, bool nongreedy) {
|
|
325
328
|
int id = AllocInst(1);
|
|
326
329
|
if (id < 0)
|
|
327
330
|
return NoMatch();
|
|
328
|
-
|
|
329
|
-
PatchList::Patch(inst_.data(), a.end, id);
|
|
331
|
+
PatchList pl;
|
|
330
332
|
if (nongreedy) {
|
|
331
|
-
inst_[id].
|
|
332
|
-
|
|
333
|
+
inst_[id].InitAlt(0, a.begin);
|
|
334
|
+
pl = PatchList::Mk(id << 1);
|
|
333
335
|
} else {
|
|
334
|
-
inst_[id].
|
|
335
|
-
|
|
336
|
+
inst_[id].InitAlt(a.begin, 0);
|
|
337
|
+
pl = PatchList::Mk((id << 1) | 1);
|
|
336
338
|
}
|
|
339
|
+
PatchList::Patch(inst_.data(), a.end, id);
|
|
340
|
+
return Frag(a.begin, pl, a.nullable);
|
|
337
341
|
}
|
|
338
342
|
|
|
339
|
-
// Given a fragment for a, returns a fragment for a
|
|
340
|
-
Frag Compiler::
|
|
341
|
-
//
|
|
342
|
-
|
|
343
|
-
|
|
343
|
+
// Given a fragment for a, returns a fragment for a* or a*? (if nongreedy)
|
|
344
|
+
Frag Compiler::Star(Frag a, bool nongreedy) {
|
|
345
|
+
// When the subexpression is nullable, one Alt isn't enough to guarantee
|
|
346
|
+
// correct priority ordering within the transitive closure. The simplest
|
|
347
|
+
// solution is to handle it as (a+)? instead, which adds the second Alt.
|
|
348
|
+
if (a.nullable)
|
|
349
|
+
return Quest(Plus(a, nongreedy), nongreedy);
|
|
350
|
+
|
|
351
|
+
int id = AllocInst(1);
|
|
352
|
+
if (id < 0)
|
|
353
|
+
return NoMatch();
|
|
354
|
+
PatchList pl;
|
|
355
|
+
if (nongreedy) {
|
|
356
|
+
inst_[id].InitAlt(0, a.begin);
|
|
357
|
+
pl = PatchList::Mk(id << 1);
|
|
358
|
+
} else {
|
|
359
|
+
inst_[id].InitAlt(a.begin, 0);
|
|
360
|
+
pl = PatchList::Mk((id << 1) | 1);
|
|
361
|
+
}
|
|
362
|
+
PatchList::Patch(inst_.data(), a.end, id);
|
|
363
|
+
return Frag(id, pl, true);
|
|
344
364
|
}
|
|
345
365
|
|
|
346
366
|
// Given a fragment for a, returns a fragment for a? or a?? (if nongreedy)
|
|
@@ -358,7 +378,7 @@ Frag Compiler::Quest(Frag a, bool nongreedy) {
|
|
|
358
378
|
inst_[id].InitAlt(a.begin, 0);
|
|
359
379
|
pl = PatchList::Mk((id << 1) | 1);
|
|
360
380
|
}
|
|
361
|
-
return Frag(id, PatchList::Append(inst_.data(), pl, a.end));
|
|
381
|
+
return Frag(id, PatchList::Append(inst_.data(), pl, a.end), true);
|
|
362
382
|
}
|
|
363
383
|
|
|
364
384
|
// Returns a fragment for the byte range lo-hi.
|
|
@@ -367,7 +387,7 @@ Frag Compiler::ByteRange(int lo, int hi, bool foldcase) {
|
|
|
367
387
|
if (id < 0)
|
|
368
388
|
return NoMatch();
|
|
369
389
|
inst_[id].InitByteRange(lo, hi, foldcase, 0);
|
|
370
|
-
return Frag(id, PatchList::Mk(id << 1));
|
|
390
|
+
return Frag(id, PatchList::Mk(id << 1), false);
|
|
371
391
|
}
|
|
372
392
|
|
|
373
393
|
// Returns a no-op fragment. Sometimes unavoidable.
|
|
@@ -376,7 +396,7 @@ Frag Compiler::Nop() {
|
|
|
376
396
|
if (id < 0)
|
|
377
397
|
return NoMatch();
|
|
378
398
|
inst_[id].InitNop(0);
|
|
379
|
-
return Frag(id, PatchList::Mk(id << 1));
|
|
399
|
+
return Frag(id, PatchList::Mk(id << 1), true);
|
|
380
400
|
}
|
|
381
401
|
|
|
382
402
|
// Returns a fragment that signals a match.
|
|
@@ -385,7 +405,7 @@ Frag Compiler::Match(int32_t match_id) {
|
|
|
385
405
|
if (id < 0)
|
|
386
406
|
return NoMatch();
|
|
387
407
|
inst_[id].InitMatch(match_id);
|
|
388
|
-
return Frag(id, kNullPatchList);
|
|
408
|
+
return Frag(id, kNullPatchList, false);
|
|
389
409
|
}
|
|
390
410
|
|
|
391
411
|
// Returns a fragment matching a particular empty-width op (like ^ or $)
|
|
@@ -394,7 +414,7 @@ Frag Compiler::EmptyWidth(EmptyOp empty) {
|
|
|
394
414
|
if (id < 0)
|
|
395
415
|
return NoMatch();
|
|
396
416
|
inst_[id].InitEmptyWidth(empty, 0);
|
|
397
|
-
return Frag(id, PatchList::Mk(id << 1));
|
|
417
|
+
return Frag(id, PatchList::Mk(id << 1), true);
|
|
398
418
|
}
|
|
399
419
|
|
|
400
420
|
// Given a fragment a, returns a fragment with capturing parens around a.
|
|
@@ -408,7 +428,7 @@ Frag Compiler::Capture(Frag a, int n) {
|
|
|
408
428
|
inst_[id+1].InitCapture(2*n+1, 0);
|
|
409
429
|
PatchList::Patch(inst_.data(), a.end, id+1);
|
|
410
430
|
|
|
411
|
-
return Frag(id, PatchList::Mk((id+1) << 1));
|
|
431
|
+
return Frag(id, PatchList::Mk((id+1) << 1), a.nullable);
|
|
412
432
|
}
|
|
413
433
|
|
|
414
434
|
// A Rune is a name for a Unicode code point.
|
|
@@ -567,7 +587,7 @@ bool Compiler::ByteRangeEqual(int id1, int id2) {
|
|
|
567
587
|
Frag Compiler::FindByteRange(int root, int id) {
|
|
568
588
|
if (inst_[root].opcode() == kInstByteRange) {
|
|
569
589
|
if (ByteRangeEqual(root, id))
|
|
570
|
-
return Frag(root, kNullPatchList);
|
|
590
|
+
return Frag(root, kNullPatchList, false);
|
|
571
591
|
else
|
|
572
592
|
return NoMatch();
|
|
573
593
|
}
|
|
@@ -575,7 +595,7 @@ Frag Compiler::FindByteRange(int root, int id) {
|
|
|
575
595
|
while (inst_[root].opcode() == kInstAlt) {
|
|
576
596
|
int out1 = inst_[root].out1();
|
|
577
597
|
if (ByteRangeEqual(out1, id))
|
|
578
|
-
return Frag(root, PatchList::Mk((root << 1) | 1));
|
|
598
|
+
return Frag(root, PatchList::Mk((root << 1) | 1), false);
|
|
579
599
|
|
|
580
600
|
// CharClass is a sorted list of ranges, so if out1 of the root Alt wasn't
|
|
581
601
|
// what we're looking for, then we can stop immediately. Unfortunately, we
|
|
@@ -587,7 +607,7 @@ Frag Compiler::FindByteRange(int root, int id) {
|
|
|
587
607
|
if (inst_[out].opcode() == kInstAlt)
|
|
588
608
|
root = out;
|
|
589
609
|
else if (ByteRangeEqual(out, id))
|
|
590
|
-
return Frag(root, PatchList::Mk(root << 1));
|
|
610
|
+
return Frag(root, PatchList::Mk(root << 1), false);
|
|
591
611
|
else
|
|
592
612
|
return NoMatch();
|
|
593
613
|
}
|
|
@@ -1156,12 +1176,8 @@ Prog* Compiler::Finish(Regexp* re) {
|
|
|
1156
1176
|
if (!prog_->reversed()) {
|
|
1157
1177
|
std::string prefix;
|
|
1158
1178
|
bool prefix_foldcase;
|
|
1159
|
-
if (re->RequiredPrefixForAccel(&prefix, &prefix_foldcase)
|
|
1160
|
-
|
|
1161
|
-
prog_->prefix_size_ = prefix.size();
|
|
1162
|
-
prog_->prefix_front_ = prefix.front();
|
|
1163
|
-
prog_->prefix_back_ = prefix.back();
|
|
1164
|
-
}
|
|
1179
|
+
if (re->RequiredPrefixForAccel(&prefix, &prefix_foldcase))
|
|
1180
|
+
prog_->ConfigurePrefixAccel(prefix, prefix_foldcase);
|
|
1165
1181
|
}
|
|
1166
1182
|
|
|
1167
1183
|
// Record remaining memory for DFA.
|
package/vendor/re2/dfa.cc
CHANGED
|
@@ -56,6 +56,10 @@ namespace re2 {
|
|
|
56
56
|
// Controls whether the DFA should bail out early if the NFA would be faster.
|
|
57
57
|
static bool dfa_should_bail_when_slow = true;
|
|
58
58
|
|
|
59
|
+
void Prog::TESTING_ONLY_set_dfa_should_bail_when_slow(bool b) {
|
|
60
|
+
dfa_should_bail_when_slow = b;
|
|
61
|
+
}
|
|
62
|
+
|
|
59
63
|
// Changing this to true compiles in prints that trace execution of the DFA.
|
|
60
64
|
// Generates a lot of output -- only useful for debugging.
|
|
61
65
|
static const bool ExtraDebug = false;
|
|
@@ -167,6 +171,9 @@ class DFA {
|
|
|
167
171
|
typedef std::unordered_set<State*, StateHash, StateEqual> StateSet;
|
|
168
172
|
|
|
169
173
|
private:
|
|
174
|
+
// Make it easier to swap in a scalable reader-writer mutex.
|
|
175
|
+
using CacheMutex = Mutex;
|
|
176
|
+
|
|
170
177
|
enum {
|
|
171
178
|
// Indices into start_ for unanchored searches.
|
|
172
179
|
// Add kStartAnchored for anchored searches.
|
|
@@ -331,7 +338,7 @@ class DFA {
|
|
|
331
338
|
// while holding cache_mutex_ for writing, to avoid interrupting other
|
|
332
339
|
// readers. Any State* pointers are only valid while cache_mutex_
|
|
333
340
|
// is held.
|
|
334
|
-
|
|
341
|
+
CacheMutex cache_mutex_;
|
|
335
342
|
int64_t mem_budget_; // Total memory budget for all States.
|
|
336
343
|
int64_t state_budget_; // Amount of memory remaining for new States.
|
|
337
344
|
StateSet state_cache_; // All States computed so far.
|
|
@@ -1106,7 +1113,7 @@ DFA::State* DFA::RunStateOnByte(State* state, int c) {
|
|
|
1106
1113
|
|
|
1107
1114
|
class DFA::RWLocker {
|
|
1108
1115
|
public:
|
|
1109
|
-
explicit RWLocker(
|
|
1116
|
+
explicit RWLocker(CacheMutex* mu);
|
|
1110
1117
|
~RWLocker();
|
|
1111
1118
|
|
|
1112
1119
|
// If the lock is only held for reading right now,
|
|
@@ -1116,14 +1123,14 @@ class DFA::RWLocker {
|
|
|
1116
1123
|
void LockForWriting();
|
|
1117
1124
|
|
|
1118
1125
|
private:
|
|
1119
|
-
|
|
1126
|
+
CacheMutex* mu_;
|
|
1120
1127
|
bool writing_;
|
|
1121
1128
|
|
|
1122
1129
|
RWLocker(const RWLocker&) = delete;
|
|
1123
1130
|
RWLocker& operator=(const RWLocker&) = delete;
|
|
1124
1131
|
};
|
|
1125
1132
|
|
|
1126
|
-
DFA::RWLocker::RWLocker(
|
|
1133
|
+
DFA::RWLocker::RWLocker(CacheMutex* mu) : mu_(mu), writing_(false) {
|
|
1127
1134
|
mu_->ReaderLock();
|
|
1128
1135
|
}
|
|
1129
1136
|
|
|
@@ -1481,15 +1488,15 @@ inline bool DFA::InlinedSearchLoop(SearchParams* params) {
|
|
|
1481
1488
|
|
|
1482
1489
|
int lastbyte;
|
|
1483
1490
|
if (run_forward) {
|
|
1484
|
-
if (params->text
|
|
1491
|
+
if (EndPtr(params->text) == EndPtr(params->context))
|
|
1485
1492
|
lastbyte = kByteEndText;
|
|
1486
1493
|
else
|
|
1487
|
-
lastbyte = params->text
|
|
1494
|
+
lastbyte = EndPtr(params->text)[0] & 0xFF;
|
|
1488
1495
|
} else {
|
|
1489
|
-
if (params->text
|
|
1496
|
+
if (BeginPtr(params->text) == BeginPtr(params->context))
|
|
1490
1497
|
lastbyte = kByteEndText;
|
|
1491
1498
|
else
|
|
1492
|
-
lastbyte = params->text
|
|
1499
|
+
lastbyte = BeginPtr(params->text)[-1] & 0xFF;
|
|
1493
1500
|
}
|
|
1494
1501
|
|
|
1495
1502
|
State* ns = s->next_[ByteMap(lastbyte)].load(std::memory_order_acquire);
|
|
@@ -1620,7 +1627,7 @@ bool DFA::AnalyzeSearch(SearchParams* params) {
|
|
|
1620
1627
|
const StringPiece& context = params->context;
|
|
1621
1628
|
|
|
1622
1629
|
// Sanity check: make sure that text lies within context.
|
|
1623
|
-
if (text
|
|
1630
|
+
if (BeginPtr(text) < BeginPtr(context) || EndPtr(text) > EndPtr(context)) {
|
|
1624
1631
|
LOG(DFATAL) << "context does not contain text";
|
|
1625
1632
|
params->start = DeadState;
|
|
1626
1633
|
return true;
|
|
@@ -1630,13 +1637,13 @@ bool DFA::AnalyzeSearch(SearchParams* params) {
|
|
|
1630
1637
|
int start;
|
|
1631
1638
|
uint32_t flags;
|
|
1632
1639
|
if (params->run_forward) {
|
|
1633
|
-
if (text
|
|
1640
|
+
if (BeginPtr(text) == BeginPtr(context)) {
|
|
1634
1641
|
start = kStartBeginText;
|
|
1635
1642
|
flags = kEmptyBeginText|kEmptyBeginLine;
|
|
1636
|
-
} else if (text
|
|
1643
|
+
} else if (BeginPtr(text)[-1] == '\n') {
|
|
1637
1644
|
start = kStartBeginLine;
|
|
1638
1645
|
flags = kEmptyBeginLine;
|
|
1639
|
-
} else if (Prog::IsWordChar(text
|
|
1646
|
+
} else if (Prog::IsWordChar(BeginPtr(text)[-1] & 0xFF)) {
|
|
1640
1647
|
start = kStartAfterWordChar;
|
|
1641
1648
|
flags = kFlagLastWord;
|
|
1642
1649
|
} else {
|
|
@@ -1644,13 +1651,13 @@ bool DFA::AnalyzeSearch(SearchParams* params) {
|
|
|
1644
1651
|
flags = 0;
|
|
1645
1652
|
}
|
|
1646
1653
|
} else {
|
|
1647
|
-
if (text
|
|
1654
|
+
if (EndPtr(text) == EndPtr(context)) {
|
|
1648
1655
|
start = kStartBeginText;
|
|
1649
1656
|
flags = kEmptyBeginText|kEmptyBeginLine;
|
|
1650
|
-
} else if (text
|
|
1657
|
+
} else if (EndPtr(text)[0] == '\n') {
|
|
1651
1658
|
start = kStartBeginLine;
|
|
1652
1659
|
flags = kEmptyBeginLine;
|
|
1653
|
-
} else if (Prog::IsWordChar(text
|
|
1660
|
+
} else if (Prog::IsWordChar(EndPtr(text)[0] & 0xFF)) {
|
|
1654
1661
|
start = kStartAfterWordChar;
|
|
1655
1662
|
flags = kFlagLastWord;
|
|
1656
1663
|
} else {
|
|
@@ -1830,9 +1837,9 @@ bool Prog::SearchDFA(const StringPiece& text, const StringPiece& const_context,
|
|
|
1830
1837
|
using std::swap;
|
|
1831
1838
|
swap(caret, dollar);
|
|
1832
1839
|
}
|
|
1833
|
-
if (caret && context
|
|
1840
|
+
if (caret && BeginPtr(context) != BeginPtr(text))
|
|
1834
1841
|
return false;
|
|
1835
|
-
if (dollar && context
|
|
1842
|
+
if (dollar && EndPtr(context) != EndPtr(text))
|
|
1836
1843
|
return false;
|
|
1837
1844
|
|
|
1838
1845
|
// Handle full match by running an anchored longest match
|
|
@@ -1963,10 +1970,6 @@ int Prog::BuildEntireDFA(MatchKind kind, const DFAStateCallback& cb) {
|
|
|
1963
1970
|
return GetDFA(kind)->BuildAllStates(cb);
|
|
1964
1971
|
}
|
|
1965
1972
|
|
|
1966
|
-
void Prog::TEST_dfa_should_bail_when_slow(bool b) {
|
|
1967
|
-
dfa_should_bail_when_slow = b;
|
|
1968
|
-
}
|
|
1969
|
-
|
|
1970
1973
|
// Computes min and max for matching string.
|
|
1971
1974
|
// Won't return strings bigger than maxlen.
|
|
1972
1975
|
bool DFA::PossibleMatchRange(std::string* min, std::string* max, int maxlen) {
|
|
@@ -5,19 +5,96 @@
|
|
|
5
5
|
#include <fuzzer/FuzzedDataProvider.h>
|
|
6
6
|
#include <stddef.h>
|
|
7
7
|
#include <stdint.h>
|
|
8
|
-
#include <
|
|
9
|
-
#include <queue>
|
|
8
|
+
#include <algorithm>
|
|
10
9
|
#include <string>
|
|
11
10
|
#include <vector>
|
|
12
11
|
|
|
13
|
-
#include "re2/prefilter.h"
|
|
14
12
|
#include "re2/re2.h"
|
|
13
|
+
#include "re2/regexp.h"
|
|
14
|
+
#include "re2/walker-inl.h"
|
|
15
15
|
|
|
16
16
|
using re2::StringPiece;
|
|
17
17
|
|
|
18
18
|
// NOT static, NOT signed.
|
|
19
19
|
uint8_t dummy = 0;
|
|
20
20
|
|
|
21
|
+
// Walks kRegexpConcat and kRegexpAlternate subexpressions
|
|
22
|
+
// to determine their maximum length.
|
|
23
|
+
class SubexpressionWalker : public re2::Regexp::Walker<int> {
|
|
24
|
+
public:
|
|
25
|
+
SubexpressionWalker() = default;
|
|
26
|
+
~SubexpressionWalker() override = default;
|
|
27
|
+
|
|
28
|
+
int PostVisit(re2::Regexp* re, int parent_arg, int pre_arg,
|
|
29
|
+
int* child_args, int nchild_args) override {
|
|
30
|
+
switch (re->op()) {
|
|
31
|
+
case re2::kRegexpConcat:
|
|
32
|
+
case re2::kRegexpAlternate: {
|
|
33
|
+
int max = nchild_args;
|
|
34
|
+
for (int i = 0; i < nchild_args; i++)
|
|
35
|
+
max = std::max(max, child_args[i]);
|
|
36
|
+
return max;
|
|
37
|
+
}
|
|
38
|
+
|
|
39
|
+
default:
|
|
40
|
+
break;
|
|
41
|
+
}
|
|
42
|
+
return -1;
|
|
43
|
+
}
|
|
44
|
+
|
|
45
|
+
// Should never be called: we use Walk(), not WalkExponential().
|
|
46
|
+
int ShortVisit(re2::Regexp* re, int parent_arg) override {
|
|
47
|
+
return parent_arg;
|
|
48
|
+
}
|
|
49
|
+
|
|
50
|
+
private:
|
|
51
|
+
SubexpressionWalker(const SubexpressionWalker&) = delete;
|
|
52
|
+
SubexpressionWalker& operator=(const SubexpressionWalker&) = delete;
|
|
53
|
+
};
|
|
54
|
+
|
|
55
|
+
// Walks substrings (i.e. kRegexpLiteralString subexpressions)
|
|
56
|
+
// to determine their maximum length... in runes, but avoiding
|
|
57
|
+
// overheads due to UTF-8 encoding is worthwhile when fuzzing.
|
|
58
|
+
class SubstringWalker : public re2::Regexp::Walker<int> {
|
|
59
|
+
public:
|
|
60
|
+
SubstringWalker() = default;
|
|
61
|
+
~SubstringWalker() override = default;
|
|
62
|
+
|
|
63
|
+
int PostVisit(re2::Regexp* re, int parent_arg, int pre_arg,
|
|
64
|
+
int* child_args, int nchild_args) override {
|
|
65
|
+
switch (re->op()) {
|
|
66
|
+
case re2::kRegexpConcat:
|
|
67
|
+
case re2::kRegexpAlternate:
|
|
68
|
+
case re2::kRegexpStar:
|
|
69
|
+
case re2::kRegexpPlus:
|
|
70
|
+
case re2::kRegexpQuest:
|
|
71
|
+
case re2::kRegexpRepeat:
|
|
72
|
+
case re2::kRegexpCapture: {
|
|
73
|
+
int max = -1;
|
|
74
|
+
for (int i = 0; i < nchild_args; i++)
|
|
75
|
+
max = std::max(max, child_args[i]);
|
|
76
|
+
return max;
|
|
77
|
+
}
|
|
78
|
+
|
|
79
|
+
case re2::kRegexpLiteralString:
|
|
80
|
+
return re->nrunes();
|
|
81
|
+
|
|
82
|
+
default:
|
|
83
|
+
break;
|
|
84
|
+
}
|
|
85
|
+
return -1;
|
|
86
|
+
}
|
|
87
|
+
|
|
88
|
+
// Should never be called: we use Walk(), not WalkExponential().
|
|
89
|
+
int ShortVisit(re2::Regexp* re, int parent_arg) override {
|
|
90
|
+
return parent_arg;
|
|
91
|
+
}
|
|
92
|
+
|
|
93
|
+
private:
|
|
94
|
+
SubstringWalker(const SubstringWalker&) = delete;
|
|
95
|
+
SubstringWalker& operator=(const SubstringWalker&) = delete;
|
|
96
|
+
};
|
|
97
|
+
|
|
21
98
|
void TestOneInput(StringPiece pattern, const RE2::Options& options,
|
|
22
99
|
StringPiece text) {
|
|
23
100
|
// Crudely limit the use of ., \p, \P, \d, \D, \s, \S, \w and \W.
|
|
@@ -26,11 +103,15 @@ void TestOneInput(StringPiece pattern, const RE2::Options& options,
|
|
|
26
103
|
// generating such patterns that fall within the other limits, but result
|
|
27
104
|
// in timeouts nonetheless. The marginal cost is high - even more so when
|
|
28
105
|
// counted repetition is involved - whereas the marginal benefit is zero.
|
|
106
|
+
// Crudely limit the use of 'k', 'K', 's' and 'S' too because they become
|
|
107
|
+
// three-element character classes when case-insensitive and using UTF-8.
|
|
29
108
|
// TODO(junyer): Handle [:isalnum:] et al. when they start to cause pain.
|
|
30
109
|
int char_class = 0;
|
|
31
110
|
int backslash_p = 0; // very expensive, so handle specially
|
|
32
111
|
for (size_t i = 0; i < pattern.size(); i++) {
|
|
33
|
-
if (pattern[i] == '.'
|
|
112
|
+
if (pattern[i] == '.' ||
|
|
113
|
+
pattern[i] == 'k' || pattern[i] == 'K' ||
|
|
114
|
+
pattern[i] == 's' || pattern[i] == 'S')
|
|
34
115
|
char_class++;
|
|
35
116
|
if (pattern[i] != '\\')
|
|
36
117
|
continue;
|
|
@@ -50,31 +131,26 @@ void TestOneInput(StringPiece pattern, const RE2::Options& options,
|
|
|
50
131
|
if (backslash_p > 1)
|
|
51
132
|
return;
|
|
52
133
|
|
|
134
|
+
// The default is 1000. Even 100 turned out to be too generous
|
|
135
|
+
// for fuzzing, empirically speaking, so let's try 10 instead.
|
|
136
|
+
re2::Regexp::FUZZING_ONLY_set_maximum_repeat_count(10);
|
|
137
|
+
|
|
53
138
|
RE2 re(pattern, options);
|
|
54
139
|
if (!re.ok())
|
|
55
140
|
return;
|
|
56
141
|
|
|
142
|
+
// Don't waste time fuzzing programs with large subexpressions.
|
|
143
|
+
// They can cause bug reports due to fuzzer timeouts. And they
|
|
144
|
+
// aren't interesting for fuzzing purposes.
|
|
145
|
+
if (SubexpressionWalker().Walk(re.Regexp(), -1) > 9)
|
|
146
|
+
return;
|
|
147
|
+
|
|
57
148
|
// Don't waste time fuzzing programs with large substrings.
|
|
58
149
|
// They can cause bug reports due to fuzzer timeouts when they
|
|
59
150
|
// are repetitions (e.g. hundreds of NUL bytes) and matching is
|
|
60
151
|
// unanchored. And they aren't interesting for fuzzing purposes.
|
|
61
|
-
|
|
62
|
-
if (prefilter == nullptr)
|
|
152
|
+
if (SubstringWalker().Walk(re.Regexp(), -1) > 9)
|
|
63
153
|
return;
|
|
64
|
-
std::queue<re2::Prefilter*> nodes;
|
|
65
|
-
nodes.push(prefilter.get());
|
|
66
|
-
while (!nodes.empty()) {
|
|
67
|
-
re2::Prefilter* node = nodes.front();
|
|
68
|
-
nodes.pop();
|
|
69
|
-
if (node->op() == re2::Prefilter::ATOM) {
|
|
70
|
-
if (node->atom().size() > 9)
|
|
71
|
-
return;
|
|
72
|
-
} else if (node->op() == re2::Prefilter::AND ||
|
|
73
|
-
node->op() == re2::Prefilter::OR) {
|
|
74
|
-
for (re2::Prefilter* sub : *node->subs())
|
|
75
|
-
nodes.push(sub);
|
|
76
|
-
}
|
|
77
|
-
}
|
|
78
154
|
|
|
79
155
|
// Don't waste time fuzzing high-size programs.
|
|
80
156
|
// They can cause bug reports due to fuzzer timeouts.
|
|
@@ -76,7 +76,7 @@ sub PrintClass($$@) {
|
|
|
76
76
|
} else {
|
|
77
77
|
$negname =~ y/a-z/A-Z/;
|
|
78
78
|
}
|
|
79
|
-
return "{ \"$escname\", +1, code$cnum, $n }", "{ \"$negname\", -1, code$cnum, $n }";
|
|
79
|
+
return "{ \"$escname\", +1, code$cnum, $n, 0, 0 }", "{ \"$negname\", -1, code$cnum, $n, 0, 0 }";
|
|
80
80
|
}
|
|
81
81
|
|
|
82
82
|
my $cnum = 0;
|