crawler-user-agents 1.0.151 → 1.0.153

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -4972,5 +4972,13 @@
4972
4972
  "meta-externalfetcher/1.1"
4973
4973
  ],
4974
4974
  "url": "https://developers.facebook.com/docs/sharing/webmasters/web-crawlers"
4975
+ },
4976
+ {
4977
+ "pattern": "KeybaseBot",
4978
+ "addition_date": "2024/10/21",
4979
+ "url": "https://book.keybase.io/docs/chat/link-previews",
4980
+ "instances": [
4981
+ "Mozilla/5.0 (compatible; KeybaseBot; +https://keybase.io)"
4982
+ ]
4975
4983
  }
4976
4984
  ]
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "crawler-user-agents",
3
- "version": "1.0.151",
3
+ "version": "1.0.153",
4
4
  "main": "crawler-user-agents.json",
5
5
  "typings": "./index.d.ts",
6
6
  "author": "Martin Monperrus <martin.monperrus@gnieh.org>",
package/validate.go CHANGED
@@ -2,10 +2,16 @@ package agents
2
2
 
3
3
  import (
4
4
  _ "embed"
5
+ "encoding/hex"
5
6
  "encoding/json"
6
7
  "fmt"
8
+ "hash/maphash"
7
9
  "regexp"
10
+ "regexp/syntax"
11
+ "strconv"
12
+ "strings"
8
13
  "time"
14
+ "unicode"
9
15
  )
10
16
 
11
17
  //go:embed crawler-user-agents.json
@@ -26,7 +32,7 @@ type Crawler struct {
26
32
  Instances []string `json:"instances"`
27
33
  }
28
34
 
29
- // Private time needed to convert addition_date from/to the format used in JSON.
35
+ // Private type needed to convert addition_date from/to the format used in JSON.
30
36
  type jsonCrawler struct {
31
37
  Pattern string `json:"pattern"`
32
38
  AdditionDate string `json:"addition_date"`
@@ -80,31 +86,454 @@ var Crawlers = func() []Crawler {
80
86
  return crawlers
81
87
  }()
82
88
 
83
- var regexps = func() []*regexp.Regexp {
84
- regexps := make([]*regexp.Regexp, len(Crawlers))
89
+ // analyzePattern expands a regular expression to the list of matching texts
90
+ // for plain search. The list is complete, i.e. iff a text matches the input
91
+ // pattern, then it contains at least one of the returned texts. If such a list
92
+ // can't be built, then the resulting list contains one element (main literal),
93
+ // and the compiled regexp object to run is also returned in this case. The main literal is
94
+ // a text that is contained in any matching text and is used to optimize search
95
+ // (pre-filter with this main literal before running a regexp). In the case such
96
+ // a main literal can't be found or the regexp is invalid, an error is returned.
97
+ func analyzePattern(pattern string) ([]string, *regexp.Regexp, error) {
98
+ re, err := syntax.Parse(pattern, syntax.Perl)
99
+ if err != nil {
100
+ return nil, nil, fmt.Errorf("re %q does not compile: %w", pattern, err)
101
+ }
102
+ re = re.Simplify()
103
+
104
+ // Try to convert it to the list of literals.
105
+ const maxLiterals = 100
106
+ literals, ok := literalizeRegexp(re, maxLiterals)
107
+ if ok {
108
+ return literals, nil, nil
109
+ }
110
+
111
+ // Fallback to using a regexp, but we need some string serving as
112
+ // an indicator of its possible presence.
113
+ mainLiteral := findLongestCommonLiteral(re)
114
+ const minLiteralLen = 3
115
+ if len(mainLiteral) < minLiteralLen {
116
+ return nil, nil, fmt.Errorf("re %q does not contain sufficiently long literal to serve an indicator. The longest literal is %q", pattern, mainLiteral)
117
+ }
118
+
119
+ return []string{mainLiteral}, regexp.MustCompile(pattern), nil
120
+ }
121
+
122
+ // literalizeRegexp expands a regexp to the list of matching sub-strings.
123
+ // Iff a text matches the regexp, it contains at least one of the returned
124
+ // texts. Argument maxLiterals regulates the maximum number of patterns to
125
+ // return. In case of an overflow or if it is impossible to build such a list
126
+ // from the regexp, false is returned.
127
+ func literalizeRegexp(re *syntax.Regexp, maxLiterals int) (literals []string, ok bool) {
128
+ switch re.Op {
129
+ case syntax.OpNoMatch:
130
+ return nil, true
131
+
132
+ case syntax.OpEmptyMatch:
133
+ return []string{""}, true
134
+
135
+ case syntax.OpLiteral:
136
+ return unwrapCase(re, []string{string(re.Rune)}, maxLiterals)
137
+
138
+ case syntax.OpCharClass:
139
+ count := 0
140
+ for i := 0; i < len(re.Rune); i += 2 {
141
+ first := re.Rune[i]
142
+ last := re.Rune[i+1]
143
+ count += int(last - first + 1)
144
+ }
145
+
146
+ if count > maxLiterals {
147
+ return nil, false
148
+ }
149
+
150
+ patterns := make([]string, 0, count)
151
+ for i := 0; i < len(re.Rune); i += 2 {
152
+ first := re.Rune[i]
153
+ last := re.Rune[i+1]
154
+ for r := first; r <= last; r++ {
155
+ patterns = append(patterns, string([]rune{r}))
156
+ }
157
+ }
158
+
159
+ return unwrapCase(re, patterns, maxLiterals)
160
+
161
+ case syntax.OpAnyCharNotNL, syntax.OpAnyChar:
162
+ // Not supported.
163
+ return nil, false
164
+
165
+ case syntax.OpBeginLine, syntax.OpBeginText:
166
+ return []string{"^"}, true
167
+
168
+ case syntax.OpEndLine, syntax.OpEndText:
169
+ return []string{"$"}, true
170
+
171
+ case syntax.OpWordBoundary, syntax.OpNoWordBoundary:
172
+ // Not supported.
173
+ return nil, false
174
+
175
+ case syntax.OpCapture:
176
+ subList, ok := literalizeRegexp(re.Sub[0], maxLiterals)
177
+ if !ok {
178
+ return nil, false
179
+ }
180
+
181
+ return unwrapCase(re, subList, maxLiterals)
182
+
183
+ case syntax.OpStar, syntax.OpPlus:
184
+ // Not supported.
185
+ return nil, false
186
+
187
+ case syntax.OpQuest:
188
+ if re.Flags&syntax.FoldCase != 0 {
189
+ return nil, false
190
+ }
191
+
192
+ subList, ok := literalizeRegexp(re.Sub[0], maxLiterals)
193
+ if !ok {
194
+ return nil, false
195
+ }
196
+ subList = append(subList, "")
197
+
198
+ return subList, true
199
+
200
+ case syntax.OpRepeat:
201
+ // Not supported.
202
+ return nil, false
203
+
204
+ case syntax.OpConcat:
205
+ if re.Flags&syntax.FoldCase != 0 {
206
+ return nil, false
207
+ }
208
+
209
+ matrix := make([][]string, len(re.Sub))
210
+ for i, sub := range re.Sub {
211
+ subList, ok := literalizeRegexp(sub, maxLiterals)
212
+ if !ok {
213
+ return nil, false
214
+ }
215
+ matrix[i] = subList
216
+ }
217
+
218
+ return combinations(matrix, maxLiterals)
219
+
220
+ case syntax.OpAlternate:
221
+ results := []string{}
222
+ for _, sub := range re.Sub {
223
+ subList, ok := literalizeRegexp(sub, maxLiterals)
224
+ if !ok {
225
+ return nil, false
226
+ }
227
+ results = append(results, subList...)
228
+ }
229
+
230
+ if len(results) > maxLiterals {
231
+ return nil, false
232
+ }
233
+
234
+ return unwrapCase(re, results, maxLiterals)
235
+
236
+ default:
237
+ // Not supported.
238
+ return nil, false
239
+ }
240
+ }
241
+
242
+ // combinations produces all combinations of the elements of matrix.
243
+ // Each sub-slice of matrix contributes one part of a resulting string.
244
+ // If the number of combinations is larger than maxLiterals, the function
245
+ // returns false.
246
+ func combinations(matrix [][]string, maxLiterals int) ([]string, bool) {
247
+ if len(matrix) == 1 {
248
+ if len(matrix[0]) > maxLiterals {
249
+ return nil, false
250
+ }
251
+
252
+ return matrix[0], true
253
+ }
254
+
255
+ prefixes := matrix[0]
256
+ suffixes, ok := combinations(matrix[1:], maxLiterals)
257
+ if !ok {
258
+ return nil, false
259
+ }
260
+
261
+ size := len(prefixes) * len(suffixes)
262
+ if size > maxLiterals {
263
+ return nil, false
264
+ }
265
+
266
+ results := make([]string, 0, size)
267
+ for _, prefix := range prefixes {
268
+ for _, suffix := range suffixes {
269
+ results = append(results, prefix+suffix)
270
+ }
271
+ }
272
+
273
+ return results, true
274
+ }
275
+
276
+ // unwrapCase takes the regexp and the list of patterns expanded from it and
277
+ // further expands it for a case-insensitive regexp, if needed. Argument
278
+ // maxLiterals regulates the maximum number of patterns to return. In case of an
279
+ // overflow, false is returned.
280
+ func unwrapCase(re *syntax.Regexp, patterns []string, maxLiterals int) ([]string, bool) {
281
+ if re.Flags&syntax.FoldCase == 0 {
282
+ return patterns, true
283
+ }
284
+
285
+ results := []string{}
286
+ for _, pattern := range patterns {
287
+ matrix := make([][]string, len(pattern))
288
+ for i, r := range pattern {
289
+ upper := unicode.ToUpper(r)
290
+ lower := unicode.ToLower(r)
291
+ matrix[i] = []string{
292
+ string([]rune{upper}),
293
+ string([]rune{lower}),
294
+ }
295
+ }
296
+
297
+ patterns, ok := combinations(matrix, maxLiterals)
298
+ if !ok {
299
+ return nil, false
300
+ }
301
+
302
+ results = append(results, patterns...)
303
+ if len(results) > maxLiterals {
304
+ return nil, false
305
+ }
306
+ }
307
+
308
+ return results, true
309
+ }
310
+
311
+ // findLongestCommonLiteral finds the longest common literal in the regexp. It's
312
+ // a string that is contained in every text matching the regexp. If such a
313
+ // literal can't be found, it returns an empty string.
314
+ func findLongestCommonLiteral(re *syntax.Regexp) string {
315
+ if re.Flags&syntax.FoldCase != 0 {
316
+ return ""
317
+ }
318
+
319
+ switch re.Op {
320
+ case syntax.OpNoMatch, syntax.OpEmptyMatch:
321
+ return ""
322
+
323
+ case syntax.OpLiteral:
324
+ return string(re.Rune)
325
+
326
+ case syntax.OpCharClass, syntax.OpAnyCharNotNL, syntax.OpAnyChar:
327
+ return ""
328
+
329
+ case syntax.OpBeginLine, syntax.OpBeginText:
330
+ return "^"
331
+
332
+ case syntax.OpEndLine, syntax.OpEndText:
333
+ return "$"
334
+
335
+ case syntax.OpWordBoundary, syntax.OpNoWordBoundary:
336
+ return ""
337
+
338
+ case syntax.OpCapture:
339
+ return findLongestCommonLiteral(re.Sub[0])
340
+
341
+ case syntax.OpStar:
342
+ return ""
343
+
344
+ case syntax.OpPlus:
345
+ return findLongestCommonLiteral(re.Sub[0])
346
+
347
+ case syntax.OpQuest:
348
+ return ""
349
+
350
+ case syntax.OpRepeat:
351
+ if re.Min >= 1 {
352
+ return findLongestCommonLiteral(re.Sub[0])
353
+ }
354
+
355
+ return ""
356
+
357
+ case syntax.OpConcat:
358
+ longest := ""
359
+ for _, sub := range re.Sub {
360
+ str := findLongestCommonLiteral(sub)
361
+ if len(str) > len(longest) {
362
+ longest = str
363
+ }
364
+ }
365
+
366
+ return longest
367
+
368
+ case syntax.OpAlternate:
369
+ return ""
370
+
371
+ default:
372
+ return ""
373
+ }
374
+ }
375
+
376
+ type regexpPattern struct {
377
+ re *regexp.Regexp
378
+ index int
379
+ }
380
+
381
+ type matcher struct {
382
+ replacer *strings.Replacer
383
+ regexps []regexpPattern
384
+ }
385
+
386
+ var uniqueToken = hex.EncodeToString((&maphash.Hash{}).Sum(nil))
387
+
388
+ const (
389
+ uniqueTokenLen = 2 * 8
390
+ numLen = 5
391
+ literalLabel = '-'
392
+ regexpLabel = '*'
393
+ )
394
+
395
+ var m = func() matcher {
396
+ if len(uniqueToken) != uniqueTokenLen {
397
+ panic("len(uniqueToken) != uniqueTokenLen")
398
+ }
399
+
400
+ regexps := []regexpPattern{}
401
+ oldnew := make([]string, 0, len(Crawlers)*2)
402
+
403
+ // Put re-based patterns to the end to prevent AdsBot-Google from
404
+ // shadowing AdsBot-Google-Mobile.
405
+ var oldnew2 []string
406
+
85
407
  for i, crawler := range Crawlers {
86
- regexps[i] = regexp.MustCompile(crawler.Pattern)
408
+ literals, re, err := analyzePattern(crawler.Pattern)
409
+ if err != nil {
410
+ panic(err)
411
+ }
412
+
413
+ label := literalLabel
414
+ num := i
415
+ if re != nil {
416
+ label = regexpLabel
417
+ num = len(regexps)
418
+ regexps = append(regexps, regexpPattern{
419
+ re: re,
420
+ index: i,
421
+ })
422
+ }
423
+
424
+ replaceWith := fmt.Sprintf(" %s%c%0*d ", uniqueToken, label, numLen, num)
425
+
426
+ for _, literal := range literals {
427
+ if re != nil {
428
+ oldnew2 = append(oldnew2, literal, replaceWith)
429
+ } else {
430
+ oldnew = append(oldnew, literal, replaceWith)
431
+ }
432
+ }
433
+ }
434
+ oldnew = append(oldnew, oldnew2...)
435
+
436
+ // Allocate another array with regexps of exact size to save memory.
437
+ regexps2 := make([]regexpPattern, len(regexps))
438
+ copy(regexps2, regexps)
439
+
440
+ r := strings.NewReplacer(oldnew...)
441
+ r.Replace("") // To cause internal build process.
442
+
443
+ return matcher{
444
+ replacer: r,
445
+ regexps: regexps2,
87
446
  }
88
- return regexps
89
447
  }()
90
448
 
91
449
  // Returns whether the User Agent string matches any of the crawler patterns.
92
450
  func IsCrawler(userAgent string) bool {
93
- for _, re := range regexps {
94
- if re.MatchString(userAgent) {
451
+ // This code is mostly copy-paste of MatchingCrawlers,
452
+ // but with early exit logic, so it works a bit faster.
453
+
454
+ text := "^" + userAgent + "$"
455
+ replaced := m.replacer.Replace(text)
456
+ if replaced == text {
457
+ return false
458
+ }
459
+
460
+ for {
461
+ uniquePos := strings.Index(replaced, uniqueToken)
462
+ if uniquePos == -1 {
463
+ break
464
+ }
465
+
466
+ start := uniquePos + uniqueTokenLen + 1
467
+ if start+numLen >= len(replaced) {
468
+ panic("corrupt replaced: " + replaced)
469
+ }
470
+
471
+ label := replaced[start-1]
472
+ switch label {
473
+ case literalLabel:
95
474
  return true
475
+ case regexpLabel:
476
+ // Rare case. Run regexp to confirm the match.
477
+ indexStr := replaced[start : start+numLen]
478
+ index, err := strconv.Atoi(indexStr)
479
+ if err != nil {
480
+ panic("corrupt replaced: " + replaced)
481
+ }
482
+ rp := m.regexps[index]
483
+ if rp.re.MatchString(userAgent) {
484
+ return true
485
+ }
486
+ default:
487
+ panic("corrupt replaced: " + replaced)
96
488
  }
489
+
490
+ replaced = replaced[start+numLen:]
97
491
  }
492
+
98
493
  return false
99
494
  }
100
495
 
101
496
  // Finds all crawlers matching the User Agent and returns the list of their indices in Crawlers.
102
497
  func MatchingCrawlers(userAgent string) []int {
498
+ text := "^" + userAgent + "$"
499
+ replaced := m.replacer.Replace(text)
500
+ if replaced == text {
501
+ return []int{}
502
+ }
503
+
103
504
  indices := []int{}
104
- for i, re := range regexps {
105
- if re.MatchString(userAgent) {
106
- indices = append(indices, i)
505
+ for {
506
+ uniquePos := strings.Index(replaced, uniqueToken)
507
+ if uniquePos == -1 {
508
+ break
107
509
  }
510
+
511
+ start := uniquePos + uniqueTokenLen + 1
512
+ if start+numLen >= len(replaced) {
513
+ panic("corrupt replaced: " + replaced)
514
+ }
515
+ indexStr := replaced[start : start+numLen]
516
+ index, err := strconv.Atoi(indexStr)
517
+ if err != nil {
518
+ panic("corrupt replaced: " + replaced)
519
+ }
520
+
521
+ label := replaced[start-1]
522
+ switch label {
523
+ case literalLabel:
524
+ indices = append(indices, index)
525
+ case regexpLabel:
526
+ // Rare case. Run regexp to confirm the match.
527
+ rp := m.regexps[index]
528
+ if rp.re.MatchString(userAgent) {
529
+ indices = append(indices, rp.index)
530
+ }
531
+ default:
532
+ panic("corrupt replaced: " + replaced)
533
+ }
534
+
535
+ replaced = replaced[start+numLen:]
108
536
  }
537
+
109
538
  return indices
110
539
  }
package/validate_test.go CHANGED
@@ -4,9 +4,467 @@ import (
4
4
  "encoding/json"
5
5
  "fmt"
6
6
  "net/http"
7
+ "reflect"
8
+ "regexp/syntax"
9
+ "sort"
10
+ "strings"
7
11
  "testing"
8
12
  )
9
13
 
14
+ // TestAnalyzePattern tests analyzePattern function on many cases, including
15
+ // edge cases.
16
+ func TestAnalyzePattern(t *testing.T) {
17
+ cases := []struct {
18
+ input string
19
+ wantError string
20
+ wantPatterns []string
21
+ wantRe bool
22
+ shouldMatchRe []string
23
+ shouldNotMatchRe []string
24
+ }{
25
+ {
26
+ input: "simple phrase",
27
+ wantPatterns: []string{"simple phrase"},
28
+ },
29
+ {
30
+ input: "^begin anchor",
31
+ wantPatterns: []string{"^begin anchor"},
32
+ },
33
+ {
34
+ input: "end anchor$",
35
+ wantPatterns: []string{"end anchor$"},
36
+ },
37
+ {
38
+ input: "^both anchors$",
39
+ wantPatterns: []string{"^both anchors$"},
40
+ },
41
+ {
42
+ input: "(alter|nation)",
43
+ wantPatterns: []string{"alter", "nation"},
44
+ },
45
+ {
46
+ input: "too many [aA][lL][tT][eE][rR][nN][aA][tT][iI][oO][nN][sS]",
47
+ wantPatterns: []string{"too many "},
48
+ wantRe: true,
49
+ shouldMatchRe: []string{"too many ALTERNATIONs"},
50
+ shouldNotMatchRe: []string{"too many combinations "},
51
+ },
52
+ {
53
+ input: "(alter|nation) concatenation (alter|nation)",
54
+ wantPatterns: []string{
55
+ "alter concatenation alter",
56
+ "alter concatenation nation",
57
+ "nation concatenation alter",
58
+ "nation concatenation nation",
59
+ },
60
+ },
61
+ {
62
+ input: "clas[sS] of [c]haract[eiu]rs",
63
+ wantPatterns: []string{
64
+ "clasS of characters",
65
+ "clasS of charactirs",
66
+ "clasS of characturs",
67
+ "class of characters",
68
+ "class of charactirs",
69
+ "class of characturs",
70
+ },
71
+ },
72
+ {
73
+ input: "ranges [0-3]x[a-c]",
74
+ wantPatterns: []string{
75
+ "ranges 0xa", "ranges 0xb", "ranges 0xc",
76
+ "ranges 1xa", "ranges 1xb", "ranges 1xc",
77
+ "ranges 2xa", "ranges 2xb", "ranges 2xc",
78
+ "ranges 3xa", "ranges 3xb", "ranges 3xc",
79
+ },
80
+ },
81
+ {
82
+ input: "Quest?",
83
+ wantPatterns: []string{"Ques", "Quest"},
84
+ },
85
+ {
86
+ input: "Q?ue(st)?",
87
+ wantPatterns: []string{"Que", "Quest", "ue", "uest"},
88
+ },
89
+ {
90
+ input: "too many combinations [0-9][a-z]",
91
+ wantPatterns: []string{"too many combinations "},
92
+ wantRe: true,
93
+ shouldMatchRe: []string{"too many combinations 0a"},
94
+ shouldNotMatchRe: []string{"too many combinations "},
95
+ },
96
+ {
97
+ input: "negation in char class [^x]",
98
+ wantPatterns: []string{"negation in char class "},
99
+ wantRe: true,
100
+ shouldMatchRe: []string{"negation in char class y"},
101
+ shouldNotMatchRe: []string{"negation in char class x"},
102
+ },
103
+ {
104
+ input: "any char .",
105
+ wantPatterns: []string{"any char "},
106
+ wantRe: true,
107
+ shouldMatchRe: []string{"any char x"},
108
+ shouldNotMatchRe: []string{"any char_x"},
109
+ },
110
+ {
111
+ input: `word \boundary`,
112
+ wantPatterns: []string{"oundary"},
113
+ wantRe: true,
114
+ shouldMatchRe: []string{"word oundary"},
115
+ shouldNotMatchRe: []string{"word boundary"},
116
+ },
117
+ {
118
+ input: "asterisk*",
119
+ wantPatterns: []string{"asteris"},
120
+ wantRe: true,
121
+ shouldMatchRe: []string{"asteris", "asterisk", "asteriskk"},
122
+ shouldNotMatchRe: []string{"asterik"},
123
+ },
124
+ {
125
+ input: "plus+",
126
+ wantPatterns: []string{"plu"},
127
+ wantRe: true,
128
+ shouldMatchRe: []string{"plus", "pluss"},
129
+ shouldNotMatchRe: []string{"plu"},
130
+ },
131
+ {
132
+ input: "repeat{3,5}$",
133
+ wantPatterns: []string{"repeattt$", "repeatttt$", "repeattttt$"},
134
+ },
135
+ {
136
+ input: "repeat{1,120}$",
137
+ wantPatterns: []string{"repea"},
138
+ wantRe: true,
139
+ shouldMatchRe: []string{"repeattt", "repeatttt", "repeattttt"},
140
+ shouldNotMatchRe: []string{"repea5"},
141
+ },
142
+ {
143
+ input: "broken re[",
144
+ wantError: "does not compile",
145
+ },
146
+ {
147
+ input: "n?o? ?l?o?n?g? ?l?i?t?e?r?a?l?",
148
+ wantError: "does not contain sufficiently long literal",
149
+ },
150
+ }
151
+
152
+ for _, tc := range cases {
153
+ tc := tc
154
+
155
+ t.Run(tc.input, func(t *testing.T) {
156
+ gotPatterns, re, err := analyzePattern(tc.input)
157
+ if tc.wantError != "" {
158
+ if err == nil {
159
+ t.Fatalf("expected to get an error, got success")
160
+ }
161
+ if !strings.Contains(err.Error(), tc.wantError) {
162
+ t.Fatalf("the error returned must contain text %q, got %q", tc.wantError, err.Error())
163
+ }
164
+
165
+ return
166
+ }
167
+
168
+ if err != nil {
169
+ t.Fatalf("unexpected error: %v", err)
170
+ }
171
+
172
+ sort.Strings(tc.wantPatterns)
173
+ sort.Strings(gotPatterns)
174
+ if !reflect.DeepEqual(tc.wantPatterns, gotPatterns) {
175
+ t.Fatalf("returned list of patterns (%#v) does not match the expected value (%#v)", gotPatterns, tc.wantPatterns)
176
+ }
177
+
178
+ if !tc.wantRe {
179
+ if re != nil {
180
+ t.Fatalf("unexpectedly got a re")
181
+ }
182
+
183
+ return
184
+ }
185
+
186
+ if re == nil {
187
+ t.Fatalf("expected to get a re, got nil")
188
+ }
189
+ for _, text := range tc.shouldMatchRe {
190
+ if !re.MatchString(text) {
191
+ t.Fatalf("test %q must match against the re, but it doesn't", text)
192
+ }
193
+ }
194
+ for _, text := range tc.shouldNotMatchRe {
195
+ if re.MatchString(text) {
196
+ t.Fatalf("test %q must not match against the re, but it does", text)
197
+ }
198
+ }
199
+ })
200
+ }
201
+ }
202
+
203
+ // TestLiteralizeRegexp tests expansion of a regexp to a list of literals.
204
+ func TestLiteralizeRegexp(t *testing.T) {
205
+ cases := []struct {
206
+ input string
207
+ maxLiterals int
208
+ wantOutput []string
209
+ wantOverflow bool
210
+ }{
211
+ {
212
+ input: "simple phrase",
213
+ maxLiterals: 100,
214
+ wantOutput: []string{"simple phrase"},
215
+ },
216
+ {
217
+ input: "cases [1-2x-z]",
218
+ maxLiterals: 100,
219
+ wantOutput: []string{"cases 1", "cases 2", "cases x", "cases y", "cases z"},
220
+ },
221
+ {
222
+ input: "[Ii]gnore case",
223
+ maxLiterals: 100,
224
+ wantOutput: []string{"Ignore case", "ignore case"},
225
+ },
226
+ {
227
+ input: "overflow [1-2x-z]",
228
+ maxLiterals: 2,
229
+ wantOverflow: true,
230
+ },
231
+ }
232
+
233
+ for _, tc := range cases {
234
+ tc := tc
235
+
236
+ t.Run(tc.input, func(t *testing.T) {
237
+ re, err := syntax.Parse(tc.input, syntax.Perl)
238
+ if err != nil {
239
+ t.Fatalf("failed to parse regexp %q: %v", tc.input, err)
240
+ }
241
+
242
+ gotPatterns, ok := literalizeRegexp(re, tc.maxLiterals)
243
+ if tc.wantOverflow {
244
+ if ok {
245
+ t.Fatalf("expected to get an overflow, got success")
246
+ }
247
+
248
+ return
249
+ }
250
+
251
+ if !ok {
252
+ t.Fatalf("unexpected overflow")
253
+ }
254
+
255
+ sort.Strings(tc.wantOutput)
256
+ sort.Strings(gotPatterns)
257
+ if !reflect.DeepEqual(tc.wantOutput, gotPatterns) {
258
+ t.Fatalf("returned list of patterns (%#v) does not match the expected value (%#v)", gotPatterns, tc.wantOutput)
259
+ }
260
+ })
261
+ }
262
+ }
263
+
264
+ // TestCombinations tests combinations() function.
265
+ func TestCombinations(t *testing.T) {
266
+ cases := []struct {
267
+ name string
268
+ input [][]string
269
+ maxLiterals int
270
+ wantOutput []string
271
+ wantOverflow bool
272
+ }{
273
+ {
274
+ name: "1x1",
275
+ input: [][]string{{"A"}, {"B"}},
276
+ maxLiterals: 100,
277
+ wantOutput: []string{"AB"},
278
+ },
279
+ {
280
+ name: "0x1",
281
+ input: [][]string{{}, {"B"}},
282
+ maxLiterals: 100,
283
+ wantOutput: []string{},
284
+ },
285
+ {
286
+ name: "1x2",
287
+ input: [][]string{{"A"}, {"1", "2"}},
288
+ maxLiterals: 100,
289
+ wantOutput: []string{"A1", "A2"},
290
+ },
291
+ {
292
+ name: "2x2",
293
+ input: [][]string{{"A", "B"}, {"1", "2"}},
294
+ maxLiterals: 100,
295
+ wantOutput: []string{"A1", "A2", "B1", "B2"},
296
+ },
297
+ {
298
+ name: "empty string as an option",
299
+ input: [][]string{{"A", ""}, {"1", "2"}},
300
+ maxLiterals: 100,
301
+ wantOutput: []string{"A1", "A2", "1", "2"},
302
+ },
303
+ {
304
+ name: "overflow",
305
+ input: [][]string{{"A", "B"}, {"1", "2"}},
306
+ maxLiterals: 3,
307
+ wantOverflow: true,
308
+ },
309
+ }
310
+
311
+ for _, tc := range cases {
312
+ tc := tc
313
+
314
+ t.Run(tc.name, func(t *testing.T) {
315
+ gotPatterns, ok := combinations(tc.input, tc.maxLiterals)
316
+ if tc.wantOverflow {
317
+ if ok {
318
+ t.Fatalf("expected to get an overflow, got success")
319
+ }
320
+
321
+ return
322
+ }
323
+
324
+ if !ok {
325
+ t.Fatalf("unexpected overflow")
326
+ }
327
+
328
+ sort.Strings(tc.wantOutput)
329
+ sort.Strings(gotPatterns)
330
+ if !reflect.DeepEqual(tc.wantOutput, gotPatterns) {
331
+ t.Fatalf("returned list of patterns (%#v) does not match the expected value (%#v)", gotPatterns, tc.wantOutput)
332
+ }
333
+ })
334
+ }
335
+ }
336
+
337
+ // TestUnwrapCase tests unwrapping literals of case-insensitive regexps.
338
+ func TestUnwrapCase(t *testing.T) {
339
+ cases := []struct {
340
+ name string
341
+ ignoreCase bool
342
+ inputPatterns []string
343
+ maxLiterals int
344
+ wantOutput []string
345
+ wantOverflow bool
346
+ }{
347
+ {
348
+ name: "simple phrase",
349
+ inputPatterns: []string{"simple phrase"},
350
+ maxLiterals: 100,
351
+ wantOutput: []string{"simple phrase"},
352
+ },
353
+ {
354
+ name: "ignore case",
355
+ ignoreCase: true,
356
+ inputPatterns: []string{"i"},
357
+ maxLiterals: 100,
358
+ wantOutput: []string{"i", "I"},
359
+ },
360
+ {
361
+ name: "ignore case two letters",
362
+ ignoreCase: true,
363
+ inputPatterns: []string{"ic"},
364
+ maxLiterals: 100,
365
+ wantOutput: []string{"IC", "Ic", "iC", "ic"},
366
+ },
367
+ {
368
+ name: "ignore case two words",
369
+ ignoreCase: true,
370
+ inputPatterns: []string{"i", "c"},
371
+ maxLiterals: 100,
372
+ wantOutput: []string{"C", "I", "c", "i"},
373
+ },
374
+ {
375
+ name: "ignore case overflow",
376
+ ignoreCase: true,
377
+ inputPatterns: []string{"long text"},
378
+ maxLiterals: 100,
379
+ wantOverflow: true,
380
+ },
381
+ }
382
+
383
+ for _, tc := range cases {
384
+ tc := tc
385
+
386
+ t.Run(tc.name, func(t *testing.T) {
387
+ re := &syntax.Regexp{}
388
+ if tc.ignoreCase {
389
+ re.Flags = syntax.FoldCase
390
+ }
391
+
392
+ gotPatterns, ok := unwrapCase(re, tc.inputPatterns, tc.maxLiterals)
393
+ if tc.wantOverflow {
394
+ if ok {
395
+ t.Fatalf("expected to get an overflow, got success")
396
+ }
397
+
398
+ return
399
+ }
400
+
401
+ if !ok {
402
+ t.Fatalf("unexpected overflow")
403
+ }
404
+
405
+ sort.Strings(tc.wantOutput)
406
+ sort.Strings(gotPatterns)
407
+ if !reflect.DeepEqual(tc.wantOutput, gotPatterns) {
408
+ t.Fatalf("returned list of patterns (%#v) does not match the expected value (%#v)", gotPatterns, tc.wantOutput)
409
+ }
410
+ })
411
+ }
412
+ }
413
+
414
+ // TestFindLongestCommonLiteral tests finding longest literal in a regexp.
415
+ func TestFindLongestCommonLiteral(t *testing.T) {
416
+ cases := []struct {
417
+ input string
418
+ wantOutput string
419
+ }{
420
+ {
421
+ input: "simple phrase",
422
+ wantOutput: "simple phrase",
423
+ },
424
+ {
425
+ input: "simple (phrase)?",
426
+ wantOutput: "simple ",
427
+ },
428
+ {
429
+ input: "[iI]",
430
+ wantOutput: "",
431
+ },
432
+ {
433
+ input: "[i]b",
434
+ wantOutput: "ib",
435
+ },
436
+ {
437
+ input: "simple (phrase)+",
438
+ wantOutput: "simple ",
439
+ },
440
+ {
441
+ input: "a*",
442
+ wantOutput: "",
443
+ },
444
+ {
445
+ input: "(abc)|(ab)",
446
+ wantOutput: "",
447
+ },
448
+ }
449
+
450
+ for _, tc := range cases {
451
+ tc := tc
452
+
453
+ t.Run(tc.input, func(t *testing.T) {
454
+ re, err := syntax.Parse(tc.input, syntax.Perl)
455
+ if err != nil {
456
+ t.Fatalf("failed to parse regexp %q: %v", tc.input, err)
457
+ }
458
+
459
+ gotOutput := findLongestCommonLiteral(re)
460
+
461
+ if gotOutput != tc.wantOutput {
462
+ t.Fatalf("returned value (%q) does not match the expected value (%q)", gotOutput, tc.wantOutput)
463
+ }
464
+ })
465
+ }
466
+ }
467
+
10
468
  func contains(list []int, value int) bool {
11
469
  for _, elem := range list {
12
470
  if elem == value {