re2 1.16.0 → 1.17.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43) hide show
  1. package/.github/actions/{linux-alpine-node-15 → linux-alpine-node-17}/Dockerfile +1 -1
  2. package/.github/actions/linux-alpine-node-17/action.yml +7 -0
  3. package/.github/actions/{linux-alpine-node-15 → linux-alpine-node-17}/entrypoint.sh +0 -0
  4. package/.github/dependabot.yml +1 -1
  5. package/.github/workflows/build.yml +8 -8
  6. package/.github/workflows/tests.yml +1 -1
  7. package/README.md +1 -0
  8. package/package.json +3 -3
  9. package/tests/test_general.js +6 -0
  10. package/vendor/README +3 -1
  11. package/vendor/re2/bitstate.cc +3 -3
  12. package/vendor/re2/compile.cc +50 -34
  13. package/vendor/re2/dfa.cc +24 -21
  14. package/vendor/re2/fuzzing/re2_fuzzer.cc +96 -20
  15. package/vendor/re2/make_perl_groups.pl +1 -1
  16. package/vendor/re2/nfa.cc +5 -5
  17. package/vendor/re2/onepass.cc +2 -2
  18. package/vendor/re2/parse.cc +41 -22
  19. package/vendor/re2/perl_groups.cc +34 -34
  20. package/vendor/re2/prog.cc +188 -4
  21. package/vendor/re2/prog.h +45 -13
  22. package/vendor/re2/re2.cc +7 -12
  23. package/vendor/re2/re2.h +7 -3
  24. package/vendor/re2/regexp.cc +11 -5
  25. package/vendor/re2/regexp.h +7 -2
  26. package/vendor/re2/set.cc +3 -0
  27. package/vendor/re2/testing/backtrack.cc +3 -3
  28. package/vendor/re2/testing/compile_test.cc +45 -21
  29. package/vendor/re2/testing/dfa_test.cc +4 -4
  30. package/vendor/re2/testing/exhaustive_tester.cc +2 -2
  31. package/vendor/re2/testing/parse_test.cc +1 -0
  32. package/vendor/re2/testing/re2_test.cc +31 -16
  33. package/vendor/re2/testing/regexp_benchmark.cc +108 -121
  34. package/vendor/re2/testing/required_prefix_test.cc +78 -24
  35. package/vendor/re2/testing/search_test.cc +2 -0
  36. package/vendor/re2/testing/tester.cc +9 -9
  37. package/vendor/re2/tostring.cc +1 -1
  38. package/vendor/re2/unicode.py +1 -1
  39. package/vendor/re2/unicode_casefold.cc +25 -11
  40. package/vendor/re2/unicode_groups.cc +319 -151
  41. package/vendor/re2/walker-inl.h +3 -2
  42. package/vendor/util/mutex.h +2 -2
  43. package/.github/actions/linux-alpine-node-15/action.yml +0 -7
@@ -1,4 +1,4 @@
1
- FROM node:15-alpine
1
+ FROM node:17-alpine
2
2
 
3
3
  RUN apk add --no-cache python3 make gcc g++
4
4
 
@@ -0,0 +1,7 @@
1
+ name: 'Create a binary artifact for Node 17 on Alpine Linux'
2
+ description: 'Create a binary artifact for Node 17 on Alpine Linux using musl'
3
+ runs:
4
+ using: 'docker'
5
+ image: 'Dockerfile'
6
+ args:
7
+ - ${{inputs.node-version}}
@@ -5,7 +5,7 @@
5
5
 
6
6
  version: 2
7
7
  updates:
8
- - package-ecosystem: "nvm" # See documentation for possible values
8
+ - package-ecosystem: "npm" # See documentation for possible values
9
9
  directory: "/" # Location of package manifests
10
10
  schedule:
11
11
  interval: "weekly"
@@ -31,7 +31,7 @@ jobs:
31
31
  strategy:
32
32
  matrix:
33
33
  os: [windows-latest, macOS-latest]
34
- node-version: [12, 14, 15, 16]
34
+ node-version: [12, 14, 16, 17]
35
35
 
36
36
  steps:
37
37
  - uses: actions/checkout@v2
@@ -73,7 +73,7 @@ jobs:
73
73
 
74
74
  strategy:
75
75
  matrix:
76
- node-version: [12, 14, 15, 16]
76
+ node-version: [12, 14, 16, 17]
77
77
 
78
78
  steps:
79
79
  - uses: actions/checkout@v2
@@ -152,8 +152,8 @@ jobs:
152
152
  env:
153
153
  GITHUB_TOKEN: ${{secrets.GITHUB_TOKEN}}
154
154
 
155
- build-linux-alpine-node-15:
156
- name: Node.js 15 on Alpine Linux
155
+ build-linux-alpine-node-16:
156
+ name: Node.js 16 on Alpine Linux
157
157
  needs: create-release
158
158
  runs-on: ubuntu-latest
159
159
  continue-on-error: true
@@ -175,12 +175,12 @@ jobs:
175
175
  Linux-Alpine-node-
176
176
  Linux-Alpine-
177
177
  - name: Install, test, and create artifact
178
- uses: ./.github/actions/linux-alpine-node-15/
178
+ uses: ./.github/actions/linux-alpine-node-16/
179
179
  env:
180
180
  GITHUB_TOKEN: ${{secrets.GITHUB_TOKEN}}
181
181
 
182
- build-linux-alpine-node-16:
183
- name: Node.js 16 on Alpine Linux
182
+ build-linux-alpine-node-17:
183
+ name: Node.js 17 on Alpine Linux
184
184
  needs: create-release
185
185
  runs-on: ubuntu-latest
186
186
  continue-on-error: true
@@ -202,6 +202,6 @@ jobs:
202
202
  Linux-Alpine-node-
203
203
  Linux-Alpine-
204
204
  - name: Install, test, and create artifact
205
- uses: ./.github/actions/linux-alpine-node-16/
205
+ uses: ./.github/actions/linux-alpine-node-17/
206
206
  env:
207
207
  GITHUB_TOKEN: ${{secrets.GITHUB_TOKEN}}
@@ -14,7 +14,7 @@ jobs:
14
14
  strategy:
15
15
  matrix:
16
16
  os: [ubuntu-latest, windows-latest, macOS-latest]
17
- node-version: [12, 14, 15, 16]
17
+ node-version: [12, 14, 16, 17]
18
18
 
19
19
  steps:
20
20
  - uses: actions/checkout@v2
package/README.md CHANGED
@@ -343,6 +343,7 @@ console.log('re2_res : ' + re2_res); // prints: re2_res : abc,a,b,c
343
343
 
344
344
  ## Release history
345
345
 
346
+ - 1.17.0 *Updated GYP, added support for Node 17, updated deps.*
346
347
  - 1.16.0 *Updated the compiler (thx, [Sergei Dyshel](https://github.com/sergei-dyshel)), updated GYP, removed support for Node 10, added support for Node 16, updated TS bindings (thx, [BannerBomb](https://github.com/BannerBomb)).*
347
348
  - 1.15.9 *Updated deps.*
348
349
  - 1.15.8 *Updated deps.*
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "re2",
3
- "version": "1.16.0",
3
+ "version": "1.17.0",
4
4
  "description": "Bindings for RE2: fast, safe alternative to backtracking regular expression engines.",
5
5
  "homepage": "https://github.com/uhop/node-re2",
6
6
  "bugs": "https://github.com/uhop/node-re2/issues",
@@ -11,8 +11,8 @@
11
11
  },
12
12
  "dependencies": {
13
13
  "install-artifact-from-github": "^1.2.0",
14
- "nan": "^2.14.2",
15
- "node-gyp": "^8.0.0"
14
+ "nan": "^2.15.0",
15
+ "node-gyp": "^8.4.1"
16
16
  },
17
17
  "devDependencies": {
18
18
  "heya-unit": "^0.3.0"
@@ -209,6 +209,12 @@ unit.add(module, [
209
209
  eval(t.TEST("s3.length === 1"));
210
210
  eval(t.TEST("RE2.getUtf8Length(s3) === 3"));
211
211
 
212
+ var s4 = "🤡";
213
+
214
+ eval(t.TEST("s4.length === 2"));
215
+ eval(t.TEST("RE2.getUtf8Length(s4) === 4"));
216
+ eval(t.TEST("RE2.getUtf16Length(Buffer.from(s4, 'utf8')) === s4.length"));
217
+
212
218
  var b3 = new Buffer([0xF0]);
213
219
 
214
220
  eval(t.TEST("b3.length === 1"));
package/vendor/README CHANGED
@@ -31,10 +31,12 @@ The Python wrapper is at https://github.com/google/re2/tree/abseil/python
31
31
  and on PyPI (https://pypi.org/project/google-re2/).
32
32
 
33
33
  A C wrapper is at https://github.com/marcomaggi/cre2/.
34
+ A D wrapper is at https://github.com/ShigekiKarita/re2d/ and on DUB (code.dlang.org).
34
35
  An Erlang wrapper is at https://github.com/dukesoferl/re2/ and on Hex (hex.pm).
35
36
  An Inferno wrapper is at https://github.com/powerman/inferno-re2/.
36
37
  A Node.js wrapper is at https://github.com/uhop/node-re2/ and on NPM (npmjs.com).
37
38
  An OCaml wrapper is at https://github.com/janestreet/re2/ and on OPAM (opam.ocaml.org).
38
39
  A Perl wrapper is at https://github.com/dgl/re-engine-RE2/ and on CPAN (cpan.org).
39
- An R wrapper is at https://github.com/qinwf/re2r/ and on CRAN (cran.r-project.org).
40
+ An R wrapper is at https://github.com/girishji/re2/ and on CRAN (cran.r-project.org).
40
41
  A Ruby wrapper is at https://github.com/mudge/re2/ and on RubyGems (rubygems.org).
42
+ A WebAssembly wrapper is at https://github.com/google/re2-wasm/ and on NPM (npmjs.com).
@@ -293,9 +293,9 @@ bool BitState::Search(const StringPiece& text, const StringPiece& context,
293
293
  context_ = context;
294
294
  if (context_.data() == NULL)
295
295
  context_ = text;
296
- if (prog_->anchor_start() && context_.begin() != text.begin())
296
+ if (prog_->anchor_start() && BeginPtr(context_) != BeginPtr(text))
297
297
  return false;
298
- if (prog_->anchor_end() && context_.end() != text.end())
298
+ if (prog_->anchor_end() && EndPtr(context_) != EndPtr(text))
299
299
  return false;
300
300
  anchored_ = anchored || prog_->anchor_start();
301
301
  longest_ = longest || prog_->anchor_end();
@@ -377,7 +377,7 @@ bool Prog::SearchBitState(const StringPiece& text,
377
377
  bool longest = kind != kFirstMatch;
378
378
  if (!b.Search(text, context, anchored, longest, match, nmatch))
379
379
  return false;
380
- if (kind == kFullMatch && match[0].end() != text.end())
380
+ if (kind == kFullMatch && EndPtr(match[0]) != EndPtr(text))
381
381
  return false;
382
382
  return true;
383
383
  }
@@ -79,9 +79,11 @@ static const PatchList kNullPatchList = {0, 0};
79
79
  struct Frag {
80
80
  uint32_t begin;
81
81
  PatchList end;
82
+ bool nullable;
82
83
 
83
- Frag() : begin(0) { end.head = 0; } // needed so Frag can go in vector
84
- Frag(uint32_t begin, PatchList end) : begin(begin), end(end) {}
84
+ Frag() : begin(0), end(kNullPatchList), nullable(false) {}
85
+ Frag(uint32_t begin, PatchList end, bool nullable)
86
+ : begin(begin), end(end), nullable(nullable) {}
85
87
  };
86
88
 
87
89
  // Input encodings.
@@ -264,7 +266,7 @@ int Compiler::AllocInst(int n) {
264
266
 
265
267
  // Returns an unmatchable fragment.
266
268
  Frag Compiler::NoMatch() {
267
- return Frag(0, kNullPatchList);
269
+ return Frag();
268
270
  }
269
271
 
270
272
  // Is a an unmatchable fragment?
@@ -290,11 +292,11 @@ Frag Compiler::Cat(Frag a, Frag b) {
290
292
  // To run backward over string, reverse all concatenations.
291
293
  if (reversed_) {
292
294
  PatchList::Patch(inst_.data(), b.end, a.begin);
293
- return Frag(b.begin, a.end);
295
+ return Frag(b.begin, a.end, b.nullable && a.nullable);
294
296
  }
295
297
 
296
298
  PatchList::Patch(inst_.data(), a.end, b.begin);
297
- return Frag(a.begin, b.end);
299
+ return Frag(a.begin, b.end, a.nullable && b.nullable);
298
300
  }
299
301
 
300
302
  // Given fragments for a and b, returns fragment for a|b.
@@ -310,7 +312,8 @@ Frag Compiler::Alt(Frag a, Frag b) {
310
312
  return NoMatch();
311
313
 
312
314
  inst_[id].InitAlt(a.begin, b.begin);
313
- return Frag(id, PatchList::Append(inst_.data(), a.end, b.end));
315
+ return Frag(id, PatchList::Append(inst_.data(), a.end, b.end),
316
+ a.nullable || b.nullable);
314
317
  }
315
318
 
316
319
  // When capturing submatches in like-Perl mode, a kOpAlt Inst
@@ -320,27 +323,44 @@ Frag Compiler::Alt(Frag a, Frag b) {
320
323
  // then the operator is greedy. If out1_ is the repetition
321
324
  // (and out_ moves forward), then the operator is non-greedy.
322
325
 
323
- // Given a fragment a, returns a fragment for a* or a*? (if nongreedy)
324
- Frag Compiler::Star(Frag a, bool nongreedy) {
326
+ // Given a fragment for a, returns a fragment for a+ or a+? (if nongreedy)
327
+ Frag Compiler::Plus(Frag a, bool nongreedy) {
325
328
  int id = AllocInst(1);
326
329
  if (id < 0)
327
330
  return NoMatch();
328
- inst_[id].InitAlt(0, 0);
329
- PatchList::Patch(inst_.data(), a.end, id);
331
+ PatchList pl;
330
332
  if (nongreedy) {
331
- inst_[id].out1_ = a.begin;
332
- return Frag(id, PatchList::Mk(id << 1));
333
+ inst_[id].InitAlt(0, a.begin);
334
+ pl = PatchList::Mk(id << 1);
333
335
  } else {
334
- inst_[id].set_out(a.begin);
335
- return Frag(id, PatchList::Mk((id << 1) | 1));
336
+ inst_[id].InitAlt(a.begin, 0);
337
+ pl = PatchList::Mk((id << 1) | 1);
336
338
  }
339
+ PatchList::Patch(inst_.data(), a.end, id);
340
+ return Frag(a.begin, pl, a.nullable);
337
341
  }
338
342
 
339
- // Given a fragment for a, returns a fragment for a+ or a+? (if nongreedy)
340
- Frag Compiler::Plus(Frag a, bool nongreedy) {
341
- // a+ is just a* with a different entry point.
342
- Frag f = Star(a, nongreedy);
343
- return Frag(a.begin, f.end);
343
+ // Given a fragment for a, returns a fragment for a* or a*? (if nongreedy)
344
+ Frag Compiler::Star(Frag a, bool nongreedy) {
345
+ // When the subexpression is nullable, one Alt isn't enough to guarantee
346
+ // correct priority ordering within the transitive closure. The simplest
347
+ // solution is to handle it as (a+)? instead, which adds the second Alt.
348
+ if (a.nullable)
349
+ return Quest(Plus(a, nongreedy), nongreedy);
350
+
351
+ int id = AllocInst(1);
352
+ if (id < 0)
353
+ return NoMatch();
354
+ PatchList pl;
355
+ if (nongreedy) {
356
+ inst_[id].InitAlt(0, a.begin);
357
+ pl = PatchList::Mk(id << 1);
358
+ } else {
359
+ inst_[id].InitAlt(a.begin, 0);
360
+ pl = PatchList::Mk((id << 1) | 1);
361
+ }
362
+ PatchList::Patch(inst_.data(), a.end, id);
363
+ return Frag(id, pl, true);
344
364
  }
345
365
 
346
366
  // Given a fragment for a, returns a fragment for a? or a?? (if nongreedy)
@@ -358,7 +378,7 @@ Frag Compiler::Quest(Frag a, bool nongreedy) {
358
378
  inst_[id].InitAlt(a.begin, 0);
359
379
  pl = PatchList::Mk((id << 1) | 1);
360
380
  }
361
- return Frag(id, PatchList::Append(inst_.data(), pl, a.end));
381
+ return Frag(id, PatchList::Append(inst_.data(), pl, a.end), true);
362
382
  }
363
383
 
364
384
  // Returns a fragment for the byte range lo-hi.
@@ -367,7 +387,7 @@ Frag Compiler::ByteRange(int lo, int hi, bool foldcase) {
367
387
  if (id < 0)
368
388
  return NoMatch();
369
389
  inst_[id].InitByteRange(lo, hi, foldcase, 0);
370
- return Frag(id, PatchList::Mk(id << 1));
390
+ return Frag(id, PatchList::Mk(id << 1), false);
371
391
  }
372
392
 
373
393
  // Returns a no-op fragment. Sometimes unavoidable.
@@ -376,7 +396,7 @@ Frag Compiler::Nop() {
376
396
  if (id < 0)
377
397
  return NoMatch();
378
398
  inst_[id].InitNop(0);
379
- return Frag(id, PatchList::Mk(id << 1));
399
+ return Frag(id, PatchList::Mk(id << 1), true);
380
400
  }
381
401
 
382
402
  // Returns a fragment that signals a match.
@@ -385,7 +405,7 @@ Frag Compiler::Match(int32_t match_id) {
385
405
  if (id < 0)
386
406
  return NoMatch();
387
407
  inst_[id].InitMatch(match_id);
388
- return Frag(id, kNullPatchList);
408
+ return Frag(id, kNullPatchList, false);
389
409
  }
390
410
 
391
411
  // Returns a fragment matching a particular empty-width op (like ^ or $)
@@ -394,7 +414,7 @@ Frag Compiler::EmptyWidth(EmptyOp empty) {
394
414
  if (id < 0)
395
415
  return NoMatch();
396
416
  inst_[id].InitEmptyWidth(empty, 0);
397
- return Frag(id, PatchList::Mk(id << 1));
417
+ return Frag(id, PatchList::Mk(id << 1), true);
398
418
  }
399
419
 
400
420
  // Given a fragment a, returns a fragment with capturing parens around a.
@@ -408,7 +428,7 @@ Frag Compiler::Capture(Frag a, int n) {
408
428
  inst_[id+1].InitCapture(2*n+1, 0);
409
429
  PatchList::Patch(inst_.data(), a.end, id+1);
410
430
 
411
- return Frag(id, PatchList::Mk((id+1) << 1));
431
+ return Frag(id, PatchList::Mk((id+1) << 1), a.nullable);
412
432
  }
413
433
 
414
434
  // A Rune is a name for a Unicode code point.
@@ -567,7 +587,7 @@ bool Compiler::ByteRangeEqual(int id1, int id2) {
567
587
  Frag Compiler::FindByteRange(int root, int id) {
568
588
  if (inst_[root].opcode() == kInstByteRange) {
569
589
  if (ByteRangeEqual(root, id))
570
- return Frag(root, kNullPatchList);
590
+ return Frag(root, kNullPatchList, false);
571
591
  else
572
592
  return NoMatch();
573
593
  }
@@ -575,7 +595,7 @@ Frag Compiler::FindByteRange(int root, int id) {
575
595
  while (inst_[root].opcode() == kInstAlt) {
576
596
  int out1 = inst_[root].out1();
577
597
  if (ByteRangeEqual(out1, id))
578
- return Frag(root, PatchList::Mk((root << 1) | 1));
598
+ return Frag(root, PatchList::Mk((root << 1) | 1), false);
579
599
 
580
600
  // CharClass is a sorted list of ranges, so if out1 of the root Alt wasn't
581
601
  // what we're looking for, then we can stop immediately. Unfortunately, we
@@ -587,7 +607,7 @@ Frag Compiler::FindByteRange(int root, int id) {
587
607
  if (inst_[out].opcode() == kInstAlt)
588
608
  root = out;
589
609
  else if (ByteRangeEqual(out, id))
590
- return Frag(root, PatchList::Mk(root << 1));
610
+ return Frag(root, PatchList::Mk(root << 1), false);
591
611
  else
592
612
  return NoMatch();
593
613
  }
@@ -1156,12 +1176,8 @@ Prog* Compiler::Finish(Regexp* re) {
1156
1176
  if (!prog_->reversed()) {
1157
1177
  std::string prefix;
1158
1178
  bool prefix_foldcase;
1159
- if (re->RequiredPrefixForAccel(&prefix, &prefix_foldcase) &&
1160
- !prefix_foldcase) {
1161
- prog_->prefix_size_ = prefix.size();
1162
- prog_->prefix_front_ = prefix.front();
1163
- prog_->prefix_back_ = prefix.back();
1164
- }
1179
+ if (re->RequiredPrefixForAccel(&prefix, &prefix_foldcase))
1180
+ prog_->ConfigurePrefixAccel(prefix, prefix_foldcase);
1165
1181
  }
1166
1182
 
1167
1183
  // Record remaining memory for DFA.
package/vendor/re2/dfa.cc CHANGED
@@ -56,6 +56,10 @@ namespace re2 {
56
56
  // Controls whether the DFA should bail out early if the NFA would be faster.
57
57
  static bool dfa_should_bail_when_slow = true;
58
58
 
59
+ void Prog::TESTING_ONLY_set_dfa_should_bail_when_slow(bool b) {
60
+ dfa_should_bail_when_slow = b;
61
+ }
62
+
59
63
  // Changing this to true compiles in prints that trace execution of the DFA.
60
64
  // Generates a lot of output -- only useful for debugging.
61
65
  static const bool ExtraDebug = false;
@@ -167,6 +171,9 @@ class DFA {
167
171
  typedef std::unordered_set<State*, StateHash, StateEqual> StateSet;
168
172
 
169
173
  private:
174
+ // Make it easier to swap in a scalable reader-writer mutex.
175
+ using CacheMutex = Mutex;
176
+
170
177
  enum {
171
178
  // Indices into start_ for unanchored searches.
172
179
  // Add kStartAnchored for anchored searches.
@@ -331,7 +338,7 @@ class DFA {
331
338
  // while holding cache_mutex_ for writing, to avoid interrupting other
332
339
  // readers. Any State* pointers are only valid while cache_mutex_
333
340
  // is held.
334
- Mutex cache_mutex_;
341
+ CacheMutex cache_mutex_;
335
342
  int64_t mem_budget_; // Total memory budget for all States.
336
343
  int64_t state_budget_; // Amount of memory remaining for new States.
337
344
  StateSet state_cache_; // All States computed so far.
@@ -1106,7 +1113,7 @@ DFA::State* DFA::RunStateOnByte(State* state, int c) {
1106
1113
 
1107
1114
  class DFA::RWLocker {
1108
1115
  public:
1109
- explicit RWLocker(Mutex* mu);
1116
+ explicit RWLocker(CacheMutex* mu);
1110
1117
  ~RWLocker();
1111
1118
 
1112
1119
  // If the lock is only held for reading right now,
@@ -1116,14 +1123,14 @@ class DFA::RWLocker {
1116
1123
  void LockForWriting();
1117
1124
 
1118
1125
  private:
1119
- Mutex* mu_;
1126
+ CacheMutex* mu_;
1120
1127
  bool writing_;
1121
1128
 
1122
1129
  RWLocker(const RWLocker&) = delete;
1123
1130
  RWLocker& operator=(const RWLocker&) = delete;
1124
1131
  };
1125
1132
 
1126
- DFA::RWLocker::RWLocker(Mutex* mu) : mu_(mu), writing_(false) {
1133
+ DFA::RWLocker::RWLocker(CacheMutex* mu) : mu_(mu), writing_(false) {
1127
1134
  mu_->ReaderLock();
1128
1135
  }
1129
1136
 
@@ -1481,15 +1488,15 @@ inline bool DFA::InlinedSearchLoop(SearchParams* params) {
1481
1488
 
1482
1489
  int lastbyte;
1483
1490
  if (run_forward) {
1484
- if (params->text.end() == params->context.end())
1491
+ if (EndPtr(params->text) == EndPtr(params->context))
1485
1492
  lastbyte = kByteEndText;
1486
1493
  else
1487
- lastbyte = params->text.end()[0] & 0xFF;
1494
+ lastbyte = EndPtr(params->text)[0] & 0xFF;
1488
1495
  } else {
1489
- if (params->text.begin() == params->context.begin())
1496
+ if (BeginPtr(params->text) == BeginPtr(params->context))
1490
1497
  lastbyte = kByteEndText;
1491
1498
  else
1492
- lastbyte = params->text.begin()[-1] & 0xFF;
1499
+ lastbyte = BeginPtr(params->text)[-1] & 0xFF;
1493
1500
  }
1494
1501
 
1495
1502
  State* ns = s->next_[ByteMap(lastbyte)].load(std::memory_order_acquire);
@@ -1620,7 +1627,7 @@ bool DFA::AnalyzeSearch(SearchParams* params) {
1620
1627
  const StringPiece& context = params->context;
1621
1628
 
1622
1629
  // Sanity check: make sure that text lies within context.
1623
- if (text.begin() < context.begin() || text.end() > context.end()) {
1630
+ if (BeginPtr(text) < BeginPtr(context) || EndPtr(text) > EndPtr(context)) {
1624
1631
  LOG(DFATAL) << "context does not contain text";
1625
1632
  params->start = DeadState;
1626
1633
  return true;
@@ -1630,13 +1637,13 @@ bool DFA::AnalyzeSearch(SearchParams* params) {
1630
1637
  int start;
1631
1638
  uint32_t flags;
1632
1639
  if (params->run_forward) {
1633
- if (text.begin() == context.begin()) {
1640
+ if (BeginPtr(text) == BeginPtr(context)) {
1634
1641
  start = kStartBeginText;
1635
1642
  flags = kEmptyBeginText|kEmptyBeginLine;
1636
- } else if (text.begin()[-1] == '\n') {
1643
+ } else if (BeginPtr(text)[-1] == '\n') {
1637
1644
  start = kStartBeginLine;
1638
1645
  flags = kEmptyBeginLine;
1639
- } else if (Prog::IsWordChar(text.begin()[-1] & 0xFF)) {
1646
+ } else if (Prog::IsWordChar(BeginPtr(text)[-1] & 0xFF)) {
1640
1647
  start = kStartAfterWordChar;
1641
1648
  flags = kFlagLastWord;
1642
1649
  } else {
@@ -1644,13 +1651,13 @@ bool DFA::AnalyzeSearch(SearchParams* params) {
1644
1651
  flags = 0;
1645
1652
  }
1646
1653
  } else {
1647
- if (text.end() == context.end()) {
1654
+ if (EndPtr(text) == EndPtr(context)) {
1648
1655
  start = kStartBeginText;
1649
1656
  flags = kEmptyBeginText|kEmptyBeginLine;
1650
- } else if (text.end()[0] == '\n') {
1657
+ } else if (EndPtr(text)[0] == '\n') {
1651
1658
  start = kStartBeginLine;
1652
1659
  flags = kEmptyBeginLine;
1653
- } else if (Prog::IsWordChar(text.end()[0] & 0xFF)) {
1660
+ } else if (Prog::IsWordChar(EndPtr(text)[0] & 0xFF)) {
1654
1661
  start = kStartAfterWordChar;
1655
1662
  flags = kFlagLastWord;
1656
1663
  } else {
@@ -1830,9 +1837,9 @@ bool Prog::SearchDFA(const StringPiece& text, const StringPiece& const_context,
1830
1837
  using std::swap;
1831
1838
  swap(caret, dollar);
1832
1839
  }
1833
- if (caret && context.begin() != text.begin())
1840
+ if (caret && BeginPtr(context) != BeginPtr(text))
1834
1841
  return false;
1835
- if (dollar && context.end() != text.end())
1842
+ if (dollar && EndPtr(context) != EndPtr(text))
1836
1843
  return false;
1837
1844
 
1838
1845
  // Handle full match by running an anchored longest match
@@ -1963,10 +1970,6 @@ int Prog::BuildEntireDFA(MatchKind kind, const DFAStateCallback& cb) {
1963
1970
  return GetDFA(kind)->BuildAllStates(cb);
1964
1971
  }
1965
1972
 
1966
- void Prog::TEST_dfa_should_bail_when_slow(bool b) {
1967
- dfa_should_bail_when_slow = b;
1968
- }
1969
-
1970
1973
  // Computes min and max for matching string.
1971
1974
  // Won't return strings bigger than maxlen.
1972
1975
  bool DFA::PossibleMatchRange(std::string* min, std::string* max, int maxlen) {
@@ -5,19 +5,96 @@
5
5
  #include <fuzzer/FuzzedDataProvider.h>
6
6
  #include <stddef.h>
7
7
  #include <stdint.h>
8
- #include <memory>
9
- #include <queue>
8
+ #include <algorithm>
10
9
  #include <string>
11
10
  #include <vector>
12
11
 
13
- #include "re2/prefilter.h"
14
12
  #include "re2/re2.h"
13
+ #include "re2/regexp.h"
14
+ #include "re2/walker-inl.h"
15
15
 
16
16
  using re2::StringPiece;
17
17
 
18
18
  // NOT static, NOT signed.
19
19
  uint8_t dummy = 0;
20
20
 
21
+ // Walks kRegexpConcat and kRegexpAlternate subexpressions
22
+ // to determine their maximum length.
23
+ class SubexpressionWalker : public re2::Regexp::Walker<int> {
24
+ public:
25
+ SubexpressionWalker() = default;
26
+ ~SubexpressionWalker() override = default;
27
+
28
+ int PostVisit(re2::Regexp* re, int parent_arg, int pre_arg,
29
+ int* child_args, int nchild_args) override {
30
+ switch (re->op()) {
31
+ case re2::kRegexpConcat:
32
+ case re2::kRegexpAlternate: {
33
+ int max = nchild_args;
34
+ for (int i = 0; i < nchild_args; i++)
35
+ max = std::max(max, child_args[i]);
36
+ return max;
37
+ }
38
+
39
+ default:
40
+ break;
41
+ }
42
+ return -1;
43
+ }
44
+
45
+ // Should never be called: we use Walk(), not WalkExponential().
46
+ int ShortVisit(re2::Regexp* re, int parent_arg) override {
47
+ return parent_arg;
48
+ }
49
+
50
+ private:
51
+ SubexpressionWalker(const SubexpressionWalker&) = delete;
52
+ SubexpressionWalker& operator=(const SubexpressionWalker&) = delete;
53
+ };
54
+
55
+ // Walks substrings (i.e. kRegexpLiteralString subexpressions)
56
+ // to determine their maximum length... in runes, but avoiding
57
+ // overheads due to UTF-8 encoding is worthwhile when fuzzing.
58
+ class SubstringWalker : public re2::Regexp::Walker<int> {
59
+ public:
60
+ SubstringWalker() = default;
61
+ ~SubstringWalker() override = default;
62
+
63
+ int PostVisit(re2::Regexp* re, int parent_arg, int pre_arg,
64
+ int* child_args, int nchild_args) override {
65
+ switch (re->op()) {
66
+ case re2::kRegexpConcat:
67
+ case re2::kRegexpAlternate:
68
+ case re2::kRegexpStar:
69
+ case re2::kRegexpPlus:
70
+ case re2::kRegexpQuest:
71
+ case re2::kRegexpRepeat:
72
+ case re2::kRegexpCapture: {
73
+ int max = -1;
74
+ for (int i = 0; i < nchild_args; i++)
75
+ max = std::max(max, child_args[i]);
76
+ return max;
77
+ }
78
+
79
+ case re2::kRegexpLiteralString:
80
+ return re->nrunes();
81
+
82
+ default:
83
+ break;
84
+ }
85
+ return -1;
86
+ }
87
+
88
+ // Should never be called: we use Walk(), not WalkExponential().
89
+ int ShortVisit(re2::Regexp* re, int parent_arg) override {
90
+ return parent_arg;
91
+ }
92
+
93
+ private:
94
+ SubstringWalker(const SubstringWalker&) = delete;
95
+ SubstringWalker& operator=(const SubstringWalker&) = delete;
96
+ };
97
+
21
98
  void TestOneInput(StringPiece pattern, const RE2::Options& options,
22
99
  StringPiece text) {
23
100
  // Crudely limit the use of ., \p, \P, \d, \D, \s, \S, \w and \W.
@@ -26,11 +103,15 @@ void TestOneInput(StringPiece pattern, const RE2::Options& options,
26
103
  // generating such patterns that fall within the other limits, but result
27
104
  // in timeouts nonetheless. The marginal cost is high - even more so when
28
105
  // counted repetition is involved - whereas the marginal benefit is zero.
106
+ // Crudely limit the use of 'k', 'K', 's' and 'S' too because they become
107
+ // three-element character classes when case-insensitive and using UTF-8.
29
108
  // TODO(junyer): Handle [:isalnum:] et al. when they start to cause pain.
30
109
  int char_class = 0;
31
110
  int backslash_p = 0; // very expensive, so handle specially
32
111
  for (size_t i = 0; i < pattern.size(); i++) {
33
- if (pattern[i] == '.')
112
+ if (pattern[i] == '.' ||
113
+ pattern[i] == 'k' || pattern[i] == 'K' ||
114
+ pattern[i] == 's' || pattern[i] == 'S')
34
115
  char_class++;
35
116
  if (pattern[i] != '\\')
36
117
  continue;
@@ -50,31 +131,26 @@ void TestOneInput(StringPiece pattern, const RE2::Options& options,
50
131
  if (backslash_p > 1)
51
132
  return;
52
133
 
134
+ // The default is 1000. Even 100 turned out to be too generous
135
+ // for fuzzing, empirically speaking, so let's try 10 instead.
136
+ re2::Regexp::FUZZING_ONLY_set_maximum_repeat_count(10);
137
+
53
138
  RE2 re(pattern, options);
54
139
  if (!re.ok())
55
140
  return;
56
141
 
142
+ // Don't waste time fuzzing programs with large subexpressions.
143
+ // They can cause bug reports due to fuzzer timeouts. And they
144
+ // aren't interesting for fuzzing purposes.
145
+ if (SubexpressionWalker().Walk(re.Regexp(), -1) > 9)
146
+ return;
147
+
57
148
  // Don't waste time fuzzing programs with large substrings.
58
149
  // They can cause bug reports due to fuzzer timeouts when they
59
150
  // are repetitions (e.g. hundreds of NUL bytes) and matching is
60
151
  // unanchored. And they aren't interesting for fuzzing purposes.
61
- std::unique_ptr<re2::Prefilter> prefilter(re2::Prefilter::FromRE2(&re));
62
- if (prefilter == nullptr)
152
+ if (SubstringWalker().Walk(re.Regexp(), -1) > 9)
63
153
  return;
64
- std::queue<re2::Prefilter*> nodes;
65
- nodes.push(prefilter.get());
66
- while (!nodes.empty()) {
67
- re2::Prefilter* node = nodes.front();
68
- nodes.pop();
69
- if (node->op() == re2::Prefilter::ATOM) {
70
- if (node->atom().size() > 9)
71
- return;
72
- } else if (node->op() == re2::Prefilter::AND ||
73
- node->op() == re2::Prefilter::OR) {
74
- for (re2::Prefilter* sub : *node->subs())
75
- nodes.push(sub);
76
- }
77
- }
78
154
 
79
155
  // Don't waste time fuzzing high-size programs.
80
156
  // They can cause bug reports due to fuzzer timeouts.
@@ -76,7 +76,7 @@ sub PrintClass($$@) {
76
76
  } else {
77
77
  $negname =~ y/a-z/A-Z/;
78
78
  }
79
- return "{ \"$escname\", +1, code$cnum, $n }", "{ \"$negname\", -1, code$cnum, $n }";
79
+ return "{ \"$escname\", +1, code$cnum, $n, 0, 0 }", "{ \"$negname\", -1, code$cnum, $n, 0, 0 }";
80
80
  }
81
81
 
82
82
  my $cnum = 0;