crawler-user-agents 1.0.151 → 1.0.152
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/validate.go +439 -10
- package/validate_test.go +458 -0
package/package.json
CHANGED
package/validate.go
CHANGED
|
@@ -2,10 +2,16 @@ package agents
|
|
|
2
2
|
|
|
3
3
|
import (
|
|
4
4
|
_ "embed"
|
|
5
|
+
"encoding/hex"
|
|
5
6
|
"encoding/json"
|
|
6
7
|
"fmt"
|
|
8
|
+
"hash/maphash"
|
|
7
9
|
"regexp"
|
|
10
|
+
"regexp/syntax"
|
|
11
|
+
"strconv"
|
|
12
|
+
"strings"
|
|
8
13
|
"time"
|
|
14
|
+
"unicode"
|
|
9
15
|
)
|
|
10
16
|
|
|
11
17
|
//go:embed crawler-user-agents.json
|
|
@@ -26,7 +32,7 @@ type Crawler struct {
|
|
|
26
32
|
Instances []string `json:"instances"`
|
|
27
33
|
}
|
|
28
34
|
|
|
29
|
-
// Private
|
|
35
|
+
// Private type needed to convert addition_date from/to the format used in JSON.
|
|
30
36
|
type jsonCrawler struct {
|
|
31
37
|
Pattern string `json:"pattern"`
|
|
32
38
|
AdditionDate string `json:"addition_date"`
|
|
@@ -80,31 +86,454 @@ var Crawlers = func() []Crawler {
|
|
|
80
86
|
return crawlers
|
|
81
87
|
}()
|
|
82
88
|
|
|
83
|
-
|
|
84
|
-
|
|
89
|
+
// analyzePattern expands a regular expression to the list of matching texts
|
|
90
|
+
// for plain search. The list is complete, i.e. iff a text matches the input
|
|
91
|
+
// pattern, then it contains at least one of the returned texts. If such a list
|
|
92
|
+
// can't be built, then the resulting list contains one element (main literal),
|
|
93
|
+
// it also returns built regexp object to run in this case. The main literal is
|
|
94
|
+
// a text that is contained in any matching text and is used to optimize search
|
|
95
|
+
// (pre-filter with this main literal before running a regexp). In the case such
|
|
96
|
+
// a main literal can't be found or the regexp is invalid, an error is returned.
|
|
97
|
+
func analyzePattern(pattern string) ([]string, *regexp.Regexp, error) {
|
|
98
|
+
re, err := syntax.Parse(pattern, syntax.Perl)
|
|
99
|
+
if err != nil {
|
|
100
|
+
return nil, nil, fmt.Errorf("re %q does not compile: %w", pattern, err)
|
|
101
|
+
}
|
|
102
|
+
re = re.Simplify()
|
|
103
|
+
|
|
104
|
+
// Try to convert it to the list of literals.
|
|
105
|
+
const maxLiterals = 100
|
|
106
|
+
literals, ok := literalizeRegexp(re, maxLiterals)
|
|
107
|
+
if ok {
|
|
108
|
+
return literals, nil, nil
|
|
109
|
+
}
|
|
110
|
+
|
|
111
|
+
// Fallback to using a regexp, but we need some string serving as
|
|
112
|
+
// an indicator of its possible presence.
|
|
113
|
+
mainLiteral := findLongestCommonLiteral(re)
|
|
114
|
+
const minLiteralLen = 3
|
|
115
|
+
if len(mainLiteral) < minLiteralLen {
|
|
116
|
+
return nil, nil, fmt.Errorf("re %q does not contain sufficiently long literal to serve an indicator. The longest literal is %q", pattern, mainLiteral)
|
|
117
|
+
}
|
|
118
|
+
|
|
119
|
+
return []string{mainLiteral}, regexp.MustCompile(pattern), nil
|
|
120
|
+
}
|
|
121
|
+
|
|
122
|
+
// literalizeRegexp expands a regexp to the list of matching sub-strings.
|
|
123
|
+
// Iff a text matches the regexp, it contains at least one of the returned
|
|
124
|
+
// texts. Argument maxLiterals regulates the maximum number of patterns to
|
|
125
|
+
// return. In case of an overflow or if it is impossible to build such a list
|
|
126
|
+
// from the regexp, false is returned.
|
|
127
|
+
func literalizeRegexp(re *syntax.Regexp, maxLiterals int) (literals []string, ok bool) {
|
|
128
|
+
switch re.Op {
|
|
129
|
+
case syntax.OpNoMatch:
|
|
130
|
+
return nil, true
|
|
131
|
+
|
|
132
|
+
case syntax.OpEmptyMatch:
|
|
133
|
+
return []string{""}, true
|
|
134
|
+
|
|
135
|
+
case syntax.OpLiteral:
|
|
136
|
+
return unwrapCase(re, []string{string(re.Rune)}, maxLiterals)
|
|
137
|
+
|
|
138
|
+
case syntax.OpCharClass:
|
|
139
|
+
count := 0
|
|
140
|
+
for i := 0; i < len(re.Rune); i += 2 {
|
|
141
|
+
first := re.Rune[i]
|
|
142
|
+
last := re.Rune[i+1]
|
|
143
|
+
count += int(last - first + 1)
|
|
144
|
+
}
|
|
145
|
+
|
|
146
|
+
if count > maxLiterals {
|
|
147
|
+
return nil, false
|
|
148
|
+
}
|
|
149
|
+
|
|
150
|
+
patterns := make([]string, 0, count)
|
|
151
|
+
for i := 0; i < len(re.Rune); i += 2 {
|
|
152
|
+
first := re.Rune[i]
|
|
153
|
+
last := re.Rune[i+1]
|
|
154
|
+
for r := first; r <= last; r++ {
|
|
155
|
+
patterns = append(patterns, string([]rune{r}))
|
|
156
|
+
}
|
|
157
|
+
}
|
|
158
|
+
|
|
159
|
+
return unwrapCase(re, patterns, maxLiterals)
|
|
160
|
+
|
|
161
|
+
case syntax.OpAnyCharNotNL, syntax.OpAnyChar:
|
|
162
|
+
// Not supported.
|
|
163
|
+
return nil, false
|
|
164
|
+
|
|
165
|
+
case syntax.OpBeginLine, syntax.OpBeginText:
|
|
166
|
+
return []string{"^"}, true
|
|
167
|
+
|
|
168
|
+
case syntax.OpEndLine, syntax.OpEndText:
|
|
169
|
+
return []string{"$"}, true
|
|
170
|
+
|
|
171
|
+
case syntax.OpWordBoundary, syntax.OpNoWordBoundary:
|
|
172
|
+
// Not supported.
|
|
173
|
+
return nil, false
|
|
174
|
+
|
|
175
|
+
case syntax.OpCapture:
|
|
176
|
+
subList, ok := literalizeRegexp(re.Sub[0], maxLiterals)
|
|
177
|
+
if !ok {
|
|
178
|
+
return nil, false
|
|
179
|
+
}
|
|
180
|
+
|
|
181
|
+
return unwrapCase(re, subList, maxLiterals)
|
|
182
|
+
|
|
183
|
+
case syntax.OpStar, syntax.OpPlus:
|
|
184
|
+
// Not supported.
|
|
185
|
+
return nil, false
|
|
186
|
+
|
|
187
|
+
case syntax.OpQuest:
|
|
188
|
+
if re.Flags&syntax.FoldCase != 0 {
|
|
189
|
+
return nil, false
|
|
190
|
+
}
|
|
191
|
+
|
|
192
|
+
subList, ok := literalizeRegexp(re.Sub[0], maxLiterals)
|
|
193
|
+
if !ok {
|
|
194
|
+
return nil, false
|
|
195
|
+
}
|
|
196
|
+
subList = append(subList, "")
|
|
197
|
+
|
|
198
|
+
return subList, true
|
|
199
|
+
|
|
200
|
+
case syntax.OpRepeat:
|
|
201
|
+
// Not supported.
|
|
202
|
+
return nil, false
|
|
203
|
+
|
|
204
|
+
case syntax.OpConcat:
|
|
205
|
+
if re.Flags&syntax.FoldCase != 0 {
|
|
206
|
+
return nil, false
|
|
207
|
+
}
|
|
208
|
+
|
|
209
|
+
matrix := make([][]string, len(re.Sub))
|
|
210
|
+
for i, sub := range re.Sub {
|
|
211
|
+
subList, ok := literalizeRegexp(sub, maxLiterals)
|
|
212
|
+
if !ok {
|
|
213
|
+
return nil, false
|
|
214
|
+
}
|
|
215
|
+
matrix[i] = subList
|
|
216
|
+
}
|
|
217
|
+
|
|
218
|
+
return combinations(matrix, maxLiterals)
|
|
219
|
+
|
|
220
|
+
case syntax.OpAlternate:
|
|
221
|
+
results := []string{}
|
|
222
|
+
for _, sub := range re.Sub {
|
|
223
|
+
subList, ok := literalizeRegexp(sub, maxLiterals)
|
|
224
|
+
if !ok {
|
|
225
|
+
return nil, false
|
|
226
|
+
}
|
|
227
|
+
results = append(results, subList...)
|
|
228
|
+
}
|
|
229
|
+
|
|
230
|
+
if len(results) > maxLiterals {
|
|
231
|
+
return nil, false
|
|
232
|
+
}
|
|
233
|
+
|
|
234
|
+
return unwrapCase(re, results, maxLiterals)
|
|
235
|
+
|
|
236
|
+
default:
|
|
237
|
+
// Not supported.
|
|
238
|
+
return nil, false
|
|
239
|
+
}
|
|
240
|
+
}
|
|
241
|
+
|
|
242
|
+
// combinations produces all combinations of elements of matrix.
// Each sub-slice of matrix contributes one part of a resulting string, in
// order. An empty matrix has exactly one combination: the empty string.
// If the number of combinations is larger than maxLiterals, the function
// returns false.
func combinations(matrix [][]string, maxLiterals int) ([]string, bool) {
	switch len(matrix) {
	case 0:
		// Guard against indexing matrix[0] below.
		if maxLiterals < 1 {
			return nil, false
		}

		return []string{""}, true

	case 1:
		if len(matrix[0]) > maxLiterals {
			return nil, false
		}

		return matrix[0], true
	}

	prefixes := matrix[0]
	suffixes, ok := combinations(matrix[1:], maxLiterals)
	if !ok {
		return nil, false
	}

	size := len(prefixes) * len(suffixes)
	if size > maxLiterals {
		return nil, false
	}

	results := make([]string, 0, size)
	for _, prefix := range prefixes {
		for _, suffix := range suffixes {
			results = append(results, prefix+suffix)
		}
	}

	return results, true
}

// unwrapCase takes the regexp and the list of patterns expanded from it and
// further expands it for a case-insensitive regexp, if needed: every pattern
// is replaced with all its case variants. If re is case-sensitive, patterns
// is returned as is.
//
// Case variants of a rune are the members of its unicode.SimpleFold orbit.
// The returned list contains no duplicates. Argument maxLiterals regulates
// the maximum number of patterns to return. In case of an overflow, false is
// returned.
func unwrapCase(re *syntax.Regexp, patterns []string, maxLiterals int) ([]string, bool) {
	if re.Flags&syntax.FoldCase == 0 {
		return patterns, true
	}

	results := make([]string, 0, len(patterns))
	seen := make(map[string]struct{}, len(patterns))
	for _, pattern := range patterns {
		// Index by rune, not by byte offset: ranging over the string yields
		// byte offsets, which would leave gaps for multi-byte runes.
		runes := []rune(pattern)
		matrix := make([][]string, len(runes))
		for i, r := range runes {
			// Runes without case yield a single variant, so they don't
			// inflate the number of combinations.
			variants := []string{string(r)}
			for f := unicode.SimpleFold(r); f != r; f = unicode.SimpleFold(f) {
				variants = append(variants, string(f))
			}
			matrix[i] = variants
		}

		expanded, ok := combinations(matrix, maxLiterals)
		if !ok {
			return nil, false
		}

		for _, text := range expanded {
			if _, dup := seen[text]; dup {
				continue
			}
			seen[text] = struct{}{}
			results = append(results, text)
		}
		if len(results) > maxLiterals {
			return nil, false
		}
	}

	return results, true
}
|
|
310
|
+
|
|
311
|
+
// findLongestCommonLiteral finds the longest common literal in the regexp. It's
// such a string which is contained in any text matching the regexp. If such a
// literal can't be found, it returns an empty string.
//
// Anchors are reported as the literal characters "^" and "$". Case-insensitive
// literals are never returned, because the exact text they match is not fixed.
func findLongestCommonLiteral(re *syntax.Regexp) string {
	switch re.Op {
	case syntax.OpLiteral:
		// Only the literal's own flags matter here. Flags of enclosing nodes
		// (capture, repetition) don't change what a case-sensitive literal
		// beneath them matches, so they are not checked.
		if re.Flags&syntax.FoldCase != 0 {
			return ""
		}

		return string(re.Rune)

	case syntax.OpBeginLine, syntax.OpBeginText:
		return "^"

	case syntax.OpEndLine, syntax.OpEndText:
		return "$"

	case syntax.OpCapture, syntax.OpPlus:
		// The sub-expression occurs at least once in any match.
		return findLongestCommonLiteral(re.Sub[0])

	case syntax.OpRepeat:
		// With a minimum of zero repetitions nothing is guaranteed to occur.
		if re.Min >= 1 {
			return findLongestCommonLiteral(re.Sub[0])
		}

		return ""

	case syntax.OpConcat:
		// Every part is present in a match; pick the longest literal among
		// the parts (the first one wins on ties).
		longest := ""
		for _, sub := range re.Sub {
			str := findLongestCommonLiteral(sub)
			if len(str) > len(longest) {
				longest = str
			}
		}

		return longest

	default:
		// OpNoMatch, OpEmptyMatch, character classes, any-char, word
		// boundaries, OpStar, OpQuest, OpAlternate and anything unknown:
		// no text is guaranteed to be present.
		return ""
	}
}
|
|
375
|
+
|
|
376
|
+
type regexpPattern struct {
|
|
377
|
+
re *regexp.Regexp
|
|
378
|
+
index int
|
|
379
|
+
}
|
|
380
|
+
|
|
381
|
+
type matcher struct {
|
|
382
|
+
replacer *strings.Replacer
|
|
383
|
+
regexps []regexpPattern
|
|
384
|
+
}
|
|
385
|
+
|
|
386
|
+
var uniqueToken = hex.EncodeToString((&maphash.Hash{}).Sum(nil))
|
|
387
|
+
|
|
388
|
+
const (
|
|
389
|
+
uniqueTokenLen = 2 * 8
|
|
390
|
+
numLen = 5
|
|
391
|
+
literalLabel = '-'
|
|
392
|
+
regexpLabel = '*'
|
|
393
|
+
)
|
|
394
|
+
|
|
395
|
+
var m = func() matcher {
|
|
396
|
+
if len(uniqueToken) != uniqueTokenLen {
|
|
397
|
+
panic("len(uniqueToken) != uniqueTokenLen")
|
|
398
|
+
}
|
|
399
|
+
|
|
400
|
+
regexps := []regexpPattern{}
|
|
401
|
+
oldnew := make([]string, 0, len(Crawlers)*2)
|
|
402
|
+
|
|
403
|
+
// Put re-based patterns to the end to prevent AdsBot-Google from
|
|
404
|
+
// shadowing AdsBot-Google-Mobile.
|
|
405
|
+
var oldnew2 []string
|
|
406
|
+
|
|
85
407
|
for i, crawler := range Crawlers {
|
|
86
|
-
|
|
408
|
+
literals, re, err := analyzePattern(crawler.Pattern)
|
|
409
|
+
if err != nil {
|
|
410
|
+
panic(err)
|
|
411
|
+
}
|
|
412
|
+
|
|
413
|
+
label := literalLabel
|
|
414
|
+
num := i
|
|
415
|
+
if re != nil {
|
|
416
|
+
label = regexpLabel
|
|
417
|
+
num = len(regexps)
|
|
418
|
+
regexps = append(regexps, regexpPattern{
|
|
419
|
+
re: re,
|
|
420
|
+
index: i,
|
|
421
|
+
})
|
|
422
|
+
}
|
|
423
|
+
|
|
424
|
+
replaceWith := fmt.Sprintf(" %s%c%0*d ", uniqueToken, label, numLen, num)
|
|
425
|
+
|
|
426
|
+
for _, literal := range literals {
|
|
427
|
+
if re != nil {
|
|
428
|
+
oldnew2 = append(oldnew2, literal, replaceWith)
|
|
429
|
+
} else {
|
|
430
|
+
oldnew = append(oldnew, literal, replaceWith)
|
|
431
|
+
}
|
|
432
|
+
}
|
|
433
|
+
}
|
|
434
|
+
oldnew = append(oldnew, oldnew2...)
|
|
435
|
+
|
|
436
|
+
// Allocate another array with regexps of exact size to save memory.
|
|
437
|
+
regexps2 := make([]regexpPattern, len(regexps))
|
|
438
|
+
copy(regexps2, regexps)
|
|
439
|
+
|
|
440
|
+
r := strings.NewReplacer(oldnew...)
|
|
441
|
+
r.Replace("") // To cause internal build process.
|
|
442
|
+
|
|
443
|
+
return matcher{
|
|
444
|
+
replacer: r,
|
|
445
|
+
regexps: regexps2,
|
|
87
446
|
}
|
|
88
|
-
return regexps
|
|
89
447
|
}()
|
|
90
448
|
|
|
91
449
|
// IsCrawler reports whether the User Agent string matches any of the crawler patterns.
|
|
92
450
|
func IsCrawler(userAgent string) bool {
|
|
93
|
-
|
|
94
|
-
|
|
451
|
+
// This code is mostly copy-paste of MatchingCrawlers,
|
|
452
|
+
// but with early exit logic, so it works a bit faster.
|
|
453
|
+
|
|
454
|
+
text := "^" + userAgent + "$"
|
|
455
|
+
replaced := m.replacer.Replace(text)
|
|
456
|
+
if replaced == text {
|
|
457
|
+
return false
|
|
458
|
+
}
|
|
459
|
+
|
|
460
|
+
for {
|
|
461
|
+
uniquePos := strings.Index(replaced, uniqueToken)
|
|
462
|
+
if uniquePos == -1 {
|
|
463
|
+
break
|
|
464
|
+
}
|
|
465
|
+
|
|
466
|
+
start := uniquePos + uniqueTokenLen + 1
|
|
467
|
+
if start+numLen >= len(replaced) {
|
|
468
|
+
panic("corrupt replaced: " + replaced)
|
|
469
|
+
}
|
|
470
|
+
|
|
471
|
+
label := replaced[start-1]
|
|
472
|
+
switch label {
|
|
473
|
+
case literalLabel:
|
|
95
474
|
return true
|
|
475
|
+
case regexpLabel:
|
|
476
|
+
// Rare case. Run regexp to confirm the match.
|
|
477
|
+
indexStr := replaced[start : start+numLen]
|
|
478
|
+
index, err := strconv.Atoi(indexStr)
|
|
479
|
+
if err != nil {
|
|
480
|
+
panic("corrupt replaced: " + replaced)
|
|
481
|
+
}
|
|
482
|
+
rp := m.regexps[index]
|
|
483
|
+
if rp.re.MatchString(userAgent) {
|
|
484
|
+
return true
|
|
485
|
+
}
|
|
486
|
+
default:
|
|
487
|
+
panic("corrupt replaced: " + replaced)
|
|
96
488
|
}
|
|
489
|
+
|
|
490
|
+
replaced = replaced[start+numLen:]
|
|
97
491
|
}
|
|
492
|
+
|
|
98
493
|
return false
|
|
99
494
|
}
|
|
100
495
|
|
|
101
496
|
// Finds all crawlers matching the User Agent and returns the list of their indices in Crawlers.
|
|
102
497
|
func MatchingCrawlers(userAgent string) []int {
|
|
498
|
+
text := "^" + userAgent + "$"
|
|
499
|
+
replaced := m.replacer.Replace(text)
|
|
500
|
+
if replaced == text {
|
|
501
|
+
return []int{}
|
|
502
|
+
}
|
|
503
|
+
|
|
103
504
|
indices := []int{}
|
|
104
|
-
for
|
|
105
|
-
|
|
106
|
-
|
|
505
|
+
for {
|
|
506
|
+
uniquePos := strings.Index(replaced, uniqueToken)
|
|
507
|
+
if uniquePos == -1 {
|
|
508
|
+
break
|
|
107
509
|
}
|
|
510
|
+
|
|
511
|
+
start := uniquePos + uniqueTokenLen + 1
|
|
512
|
+
if start+numLen >= len(replaced) {
|
|
513
|
+
panic("corrupt replaced: " + replaced)
|
|
514
|
+
}
|
|
515
|
+
indexStr := replaced[start : start+numLen]
|
|
516
|
+
index, err := strconv.Atoi(indexStr)
|
|
517
|
+
if err != nil {
|
|
518
|
+
panic("corrupt replaced: " + replaced)
|
|
519
|
+
}
|
|
520
|
+
|
|
521
|
+
label := replaced[start-1]
|
|
522
|
+
switch label {
|
|
523
|
+
case literalLabel:
|
|
524
|
+
indices = append(indices, index)
|
|
525
|
+
case regexpLabel:
|
|
526
|
+
// Rare case. Run regexp to confirm the match.
|
|
527
|
+
rp := m.regexps[index]
|
|
528
|
+
if rp.re.MatchString(userAgent) {
|
|
529
|
+
indices = append(indices, rp.index)
|
|
530
|
+
}
|
|
531
|
+
default:
|
|
532
|
+
panic("corrupt replaced: " + replaced)
|
|
533
|
+
}
|
|
534
|
+
|
|
535
|
+
replaced = replaced[start+numLen:]
|
|
108
536
|
}
|
|
537
|
+
|
|
109
538
|
return indices
|
|
110
539
|
}
|
package/validate_test.go
CHANGED
|
@@ -4,9 +4,467 @@ import (
|
|
|
4
4
|
"encoding/json"
|
|
5
5
|
"fmt"
|
|
6
6
|
"net/http"
|
|
7
|
+
"reflect"
|
|
8
|
+
"regexp/syntax"
|
|
9
|
+
"sort"
|
|
10
|
+
"strings"
|
|
7
11
|
"testing"
|
|
8
12
|
)
|
|
9
13
|
|
|
14
|
+
// TestAnalyzePattern tests analyzePattern function on many cases, including
|
|
15
|
+
// edge cases.
|
|
16
|
+
func TestAnalyzePattern(t *testing.T) {
|
|
17
|
+
cases := []struct {
|
|
18
|
+
input string
|
|
19
|
+
wantError string
|
|
20
|
+
wantPatterns []string
|
|
21
|
+
wantRe bool
|
|
22
|
+
shouldMatchRe []string
|
|
23
|
+
shouldNotMatchRe []string
|
|
24
|
+
}{
|
|
25
|
+
{
|
|
26
|
+
input: "simple phrase",
|
|
27
|
+
wantPatterns: []string{"simple phrase"},
|
|
28
|
+
},
|
|
29
|
+
{
|
|
30
|
+
input: "^begin anchor",
|
|
31
|
+
wantPatterns: []string{"^begin anchor"},
|
|
32
|
+
},
|
|
33
|
+
{
|
|
34
|
+
input: "end anchor$",
|
|
35
|
+
wantPatterns: []string{"end anchor$"},
|
|
36
|
+
},
|
|
37
|
+
{
|
|
38
|
+
input: "^both anchors$",
|
|
39
|
+
wantPatterns: []string{"^both anchors$"},
|
|
40
|
+
},
|
|
41
|
+
{
|
|
42
|
+
input: "(alter|nation)",
|
|
43
|
+
wantPatterns: []string{"alter", "nation"},
|
|
44
|
+
},
|
|
45
|
+
{
|
|
46
|
+
input: "too many [aA][lL][tT][eE][rR][nN][aA][tT][iI][oO][nN][sS]",
|
|
47
|
+
wantPatterns: []string{"too many "},
|
|
48
|
+
wantRe: true,
|
|
49
|
+
shouldMatchRe: []string{"too many ALTERNATIONs"},
|
|
50
|
+
shouldNotMatchRe: []string{"too many combinations "},
|
|
51
|
+
},
|
|
52
|
+
{
|
|
53
|
+
input: "(alter|nation) concatenation (alter|nation)",
|
|
54
|
+
wantPatterns: []string{
|
|
55
|
+
"alter concatenation alter",
|
|
56
|
+
"alter concatenation nation",
|
|
57
|
+
"nation concatenation alter",
|
|
58
|
+
"nation concatenation nation",
|
|
59
|
+
},
|
|
60
|
+
},
|
|
61
|
+
{
|
|
62
|
+
input: "clas[sS] of [c]haract[eiu]rs",
|
|
63
|
+
wantPatterns: []string{
|
|
64
|
+
"clasS of characters",
|
|
65
|
+
"clasS of charactirs",
|
|
66
|
+
"clasS of characturs",
|
|
67
|
+
"class of characters",
|
|
68
|
+
"class of charactirs",
|
|
69
|
+
"class of characturs",
|
|
70
|
+
},
|
|
71
|
+
},
|
|
72
|
+
{
|
|
73
|
+
input: "ranges [0-3]x[a-c]",
|
|
74
|
+
wantPatterns: []string{
|
|
75
|
+
"ranges 0xa", "ranges 0xb", "ranges 0xc",
|
|
76
|
+
"ranges 1xa", "ranges 1xb", "ranges 1xc",
|
|
77
|
+
"ranges 2xa", "ranges 2xb", "ranges 2xc",
|
|
78
|
+
"ranges 3xa", "ranges 3xb", "ranges 3xc",
|
|
79
|
+
},
|
|
80
|
+
},
|
|
81
|
+
{
|
|
82
|
+
input: "Quest?",
|
|
83
|
+
wantPatterns: []string{"Ques", "Quest"},
|
|
84
|
+
},
|
|
85
|
+
{
|
|
86
|
+
input: "Q?ue(st)?",
|
|
87
|
+
wantPatterns: []string{"Que", "Quest", "ue", "uest"},
|
|
88
|
+
},
|
|
89
|
+
{
|
|
90
|
+
input: "too many combinations [0-9][a-z]",
|
|
91
|
+
wantPatterns: []string{"too many combinations "},
|
|
92
|
+
wantRe: true,
|
|
93
|
+
shouldMatchRe: []string{"too many combinations 0a"},
|
|
94
|
+
shouldNotMatchRe: []string{"too many combinations "},
|
|
95
|
+
},
|
|
96
|
+
{
|
|
97
|
+
input: "negation in char class [^x]",
|
|
98
|
+
wantPatterns: []string{"negation in char class "},
|
|
99
|
+
wantRe: true,
|
|
100
|
+
shouldMatchRe: []string{"negation in char class y"},
|
|
101
|
+
shouldNotMatchRe: []string{"negation in char class x"},
|
|
102
|
+
},
|
|
103
|
+
{
|
|
104
|
+
input: "any char .",
|
|
105
|
+
wantPatterns: []string{"any char "},
|
|
106
|
+
wantRe: true,
|
|
107
|
+
shouldMatchRe: []string{"any char x"},
|
|
108
|
+
shouldNotMatchRe: []string{"any char_x"},
|
|
109
|
+
},
|
|
110
|
+
{
|
|
111
|
+
input: `word \boundary`,
|
|
112
|
+
wantPatterns: []string{"oundary"},
|
|
113
|
+
wantRe: true,
|
|
114
|
+
shouldMatchRe: []string{"word oundary"},
|
|
115
|
+
shouldNotMatchRe: []string{"word boundary"},
|
|
116
|
+
},
|
|
117
|
+
{
|
|
118
|
+
input: "asterisk*",
|
|
119
|
+
wantPatterns: []string{"asteris"},
|
|
120
|
+
wantRe: true,
|
|
121
|
+
shouldMatchRe: []string{"asteris", "asterisk", "asteriskk"},
|
|
122
|
+
shouldNotMatchRe: []string{"asterik"},
|
|
123
|
+
},
|
|
124
|
+
{
|
|
125
|
+
input: "plus+",
|
|
126
|
+
wantPatterns: []string{"plu"},
|
|
127
|
+
wantRe: true,
|
|
128
|
+
shouldMatchRe: []string{"plus", "pluss"},
|
|
129
|
+
shouldNotMatchRe: []string{"plu"},
|
|
130
|
+
},
|
|
131
|
+
{
|
|
132
|
+
input: "repeat{3,5}$",
|
|
133
|
+
wantPatterns: []string{"repeattt$", "repeatttt$", "repeattttt$"},
|
|
134
|
+
},
|
|
135
|
+
{
|
|
136
|
+
input: "repeat{1,120}$",
|
|
137
|
+
wantPatterns: []string{"repea"},
|
|
138
|
+
wantRe: true,
|
|
139
|
+
shouldMatchRe: []string{"repeattt", "repeatttt", "repeattttt"},
|
|
140
|
+
shouldNotMatchRe: []string{"repea5"},
|
|
141
|
+
},
|
|
142
|
+
{
|
|
143
|
+
input: "broken re[",
|
|
144
|
+
wantError: "does not compile",
|
|
145
|
+
},
|
|
146
|
+
{
|
|
147
|
+
input: "n?o? ?l?o?n?g? ?l?i?t?e?r?a?l?",
|
|
148
|
+
wantError: "does not contain sufficiently long literal",
|
|
149
|
+
},
|
|
150
|
+
}
|
|
151
|
+
|
|
152
|
+
for _, tc := range cases {
|
|
153
|
+
tc := tc
|
|
154
|
+
|
|
155
|
+
t.Run(tc.input, func(t *testing.T) {
|
|
156
|
+
gotPatterns, re, err := analyzePattern(tc.input)
|
|
157
|
+
if tc.wantError != "" {
|
|
158
|
+
if err == nil {
|
|
159
|
+
t.Fatalf("expected to get an error, got success")
|
|
160
|
+
}
|
|
161
|
+
if !strings.Contains(err.Error(), tc.wantError) {
|
|
162
|
+
t.Fatalf("the error returned must contain text %q, got %q", tc.wantError, err.Error())
|
|
163
|
+
}
|
|
164
|
+
|
|
165
|
+
return
|
|
166
|
+
}
|
|
167
|
+
|
|
168
|
+
if err != nil {
|
|
169
|
+
t.Fatalf("unexpected error: %v", err)
|
|
170
|
+
}
|
|
171
|
+
|
|
172
|
+
sort.Strings(tc.wantPatterns)
|
|
173
|
+
sort.Strings(gotPatterns)
|
|
174
|
+
if !reflect.DeepEqual(tc.wantPatterns, gotPatterns) {
|
|
175
|
+
t.Fatalf("returned list of patterns (%#v) does not match the expected value (%#v)", gotPatterns, tc.wantPatterns)
|
|
176
|
+
}
|
|
177
|
+
|
|
178
|
+
if !tc.wantRe {
|
|
179
|
+
if re != nil {
|
|
180
|
+
t.Fatalf("unexpectedly got a re")
|
|
181
|
+
}
|
|
182
|
+
|
|
183
|
+
return
|
|
184
|
+
}
|
|
185
|
+
|
|
186
|
+
if re == nil {
|
|
187
|
+
t.Fatalf("expected to get a re, got nil")
|
|
188
|
+
}
|
|
189
|
+
for _, text := range tc.shouldMatchRe {
|
|
190
|
+
if !re.MatchString(text) {
|
|
191
|
+
t.Fatalf("test %q must match against the re, but it doesn't", text)
|
|
192
|
+
}
|
|
193
|
+
}
|
|
194
|
+
for _, text := range tc.shouldNotMatchRe {
|
|
195
|
+
if re.MatchString(text) {
|
|
196
|
+
t.Fatalf("test %q must not match against the re, but it does", text)
|
|
197
|
+
}
|
|
198
|
+
}
|
|
199
|
+
})
|
|
200
|
+
}
|
|
201
|
+
}
|
|
202
|
+
|
|
203
|
+
// TestLiteralizeRegexp tests expansion of a regexp to a list of literals.
|
|
204
|
+
func TestLiteralizeRegexp(t *testing.T) {
|
|
205
|
+
cases := []struct {
|
|
206
|
+
input string
|
|
207
|
+
maxLiterals int
|
|
208
|
+
wantOutput []string
|
|
209
|
+
wantOverflow bool
|
|
210
|
+
}{
|
|
211
|
+
{
|
|
212
|
+
input: "simple phrase",
|
|
213
|
+
maxLiterals: 100,
|
|
214
|
+
wantOutput: []string{"simple phrase"},
|
|
215
|
+
},
|
|
216
|
+
{
|
|
217
|
+
input: "cases [1-2x-z]",
|
|
218
|
+
maxLiterals: 100,
|
|
219
|
+
wantOutput: []string{"cases 1", "cases 2", "cases x", "cases y", "cases z"},
|
|
220
|
+
},
|
|
221
|
+
{
|
|
222
|
+
input: "[Ii]gnore case",
|
|
223
|
+
maxLiterals: 100,
|
|
224
|
+
wantOutput: []string{"Ignore case", "ignore case"},
|
|
225
|
+
},
|
|
226
|
+
{
|
|
227
|
+
input: "overflow [1-2x-z]",
|
|
228
|
+
maxLiterals: 2,
|
|
229
|
+
wantOverflow: true,
|
|
230
|
+
},
|
|
231
|
+
}
|
|
232
|
+
|
|
233
|
+
for _, tc := range cases {
|
|
234
|
+
tc := tc
|
|
235
|
+
|
|
236
|
+
t.Run(tc.input, func(t *testing.T) {
|
|
237
|
+
re, err := syntax.Parse(tc.input, syntax.Perl)
|
|
238
|
+
if err != nil {
|
|
239
|
+
t.Fatalf("failed to parse regexp %q: %v", tc.input, err)
|
|
240
|
+
}
|
|
241
|
+
|
|
242
|
+
gotPatterns, ok := literalizeRegexp(re, tc.maxLiterals)
|
|
243
|
+
if tc.wantOverflow {
|
|
244
|
+
if ok {
|
|
245
|
+
t.Fatalf("expected to get an overflow, got success")
|
|
246
|
+
}
|
|
247
|
+
|
|
248
|
+
return
|
|
249
|
+
}
|
|
250
|
+
|
|
251
|
+
if !ok {
|
|
252
|
+
t.Fatalf("unexpected overflow")
|
|
253
|
+
}
|
|
254
|
+
|
|
255
|
+
sort.Strings(tc.wantOutput)
|
|
256
|
+
sort.Strings(gotPatterns)
|
|
257
|
+
if !reflect.DeepEqual(tc.wantOutput, gotPatterns) {
|
|
258
|
+
t.Fatalf("returned list of patterns (%#v) does not match the expected value (%#v)", gotPatterns, tc.wantOutput)
|
|
259
|
+
}
|
|
260
|
+
})
|
|
261
|
+
}
|
|
262
|
+
}
|
|
263
|
+
|
|
264
|
+
// TestCombinations tests combinations() function.
|
|
265
|
+
func TestCombinations(t *testing.T) {
|
|
266
|
+
cases := []struct {
|
|
267
|
+
name string
|
|
268
|
+
input [][]string
|
|
269
|
+
maxLiterals int
|
|
270
|
+
wantOutput []string
|
|
271
|
+
wantOverflow bool
|
|
272
|
+
}{
|
|
273
|
+
{
|
|
274
|
+
name: "1x1",
|
|
275
|
+
input: [][]string{{"A"}, {"B"}},
|
|
276
|
+
maxLiterals: 100,
|
|
277
|
+
wantOutput: []string{"AB"},
|
|
278
|
+
},
|
|
279
|
+
{
|
|
280
|
+
name: "0x1",
|
|
281
|
+
input: [][]string{{}, {"B"}},
|
|
282
|
+
maxLiterals: 100,
|
|
283
|
+
wantOutput: []string{},
|
|
284
|
+
},
|
|
285
|
+
{
|
|
286
|
+
name: "1x2",
|
|
287
|
+
input: [][]string{{"A"}, {"1", "2"}},
|
|
288
|
+
maxLiterals: 100,
|
|
289
|
+
wantOutput: []string{"A1", "A2"},
|
|
290
|
+
},
|
|
291
|
+
{
|
|
292
|
+
name: "2x2",
|
|
293
|
+
input: [][]string{{"A", "B"}, {"1", "2"}},
|
|
294
|
+
maxLiterals: 100,
|
|
295
|
+
wantOutput: []string{"A1", "A2", "B1", "B2"},
|
|
296
|
+
},
|
|
297
|
+
{
|
|
298
|
+
name: "empty string as an option",
|
|
299
|
+
input: [][]string{{"A", ""}, {"1", "2"}},
|
|
300
|
+
maxLiterals: 100,
|
|
301
|
+
wantOutput: []string{"A1", "A2", "1", "2"},
|
|
302
|
+
},
|
|
303
|
+
{
|
|
304
|
+
name: "overflow",
|
|
305
|
+
input: [][]string{{"A", "B"}, {"1", "2"}},
|
|
306
|
+
maxLiterals: 3,
|
|
307
|
+
wantOverflow: true,
|
|
308
|
+
},
|
|
309
|
+
}
|
|
310
|
+
|
|
311
|
+
for _, tc := range cases {
|
|
312
|
+
tc := tc
|
|
313
|
+
|
|
314
|
+
t.Run(tc.name, func(t *testing.T) {
|
|
315
|
+
gotPatterns, ok := combinations(tc.input, tc.maxLiterals)
|
|
316
|
+
if tc.wantOverflow {
|
|
317
|
+
if ok {
|
|
318
|
+
t.Fatalf("expected to get an overflow, got success")
|
|
319
|
+
}
|
|
320
|
+
|
|
321
|
+
return
|
|
322
|
+
}
|
|
323
|
+
|
|
324
|
+
if !ok {
|
|
325
|
+
t.Fatalf("unexpected overflow")
|
|
326
|
+
}
|
|
327
|
+
|
|
328
|
+
sort.Strings(tc.wantOutput)
|
|
329
|
+
sort.Strings(gotPatterns)
|
|
330
|
+
if !reflect.DeepEqual(tc.wantOutput, gotPatterns) {
|
|
331
|
+
t.Fatalf("returned list of patterns (%#v) does not match the expected value (%#v)", gotPatterns, tc.wantOutput)
|
|
332
|
+
}
|
|
333
|
+
})
|
|
334
|
+
}
|
|
335
|
+
}
|
|
336
|
+
|
|
337
|
+
// TestUnwrapCase tests unwrapping literals of case-insensitive regexps.
|
|
338
|
+
func TestUnwrapCase(t *testing.T) {
|
|
339
|
+
cases := []struct {
|
|
340
|
+
name string
|
|
341
|
+
ignoreCase bool
|
|
342
|
+
inputPatterns []string
|
|
343
|
+
maxLiterals int
|
|
344
|
+
wantOutput []string
|
|
345
|
+
wantOverflow bool
|
|
346
|
+
}{
|
|
347
|
+
{
|
|
348
|
+
name: "simple phrase",
|
|
349
|
+
inputPatterns: []string{"simple phrase"},
|
|
350
|
+
maxLiterals: 100,
|
|
351
|
+
wantOutput: []string{"simple phrase"},
|
|
352
|
+
},
|
|
353
|
+
{
|
|
354
|
+
name: "ignore case",
|
|
355
|
+
ignoreCase: true,
|
|
356
|
+
inputPatterns: []string{"i"},
|
|
357
|
+
maxLiterals: 100,
|
|
358
|
+
wantOutput: []string{"i", "I"},
|
|
359
|
+
},
|
|
360
|
+
{
|
|
361
|
+
name: "ignore case two letters",
|
|
362
|
+
ignoreCase: true,
|
|
363
|
+
inputPatterns: []string{"ic"},
|
|
364
|
+
maxLiterals: 100,
|
|
365
|
+
wantOutput: []string{"IC", "Ic", "iC", "ic"},
|
|
366
|
+
},
|
|
367
|
+
{
|
|
368
|
+
name: "ignore case two words",
|
|
369
|
+
ignoreCase: true,
|
|
370
|
+
inputPatterns: []string{"i", "c"},
|
|
371
|
+
maxLiterals: 100,
|
|
372
|
+
wantOutput: []string{"C", "I", "c", "i"},
|
|
373
|
+
},
|
|
374
|
+
{
|
|
375
|
+
name: "ignore case overflow",
|
|
376
|
+
ignoreCase: true,
|
|
377
|
+
inputPatterns: []string{"long text"},
|
|
378
|
+
maxLiterals: 100,
|
|
379
|
+
wantOverflow: true,
|
|
380
|
+
},
|
|
381
|
+
}
|
|
382
|
+
|
|
383
|
+
for _, tc := range cases {
|
|
384
|
+
tc := tc
|
|
385
|
+
|
|
386
|
+
t.Run(tc.name, func(t *testing.T) {
|
|
387
|
+
re := &syntax.Regexp{}
|
|
388
|
+
if tc.ignoreCase {
|
|
389
|
+
re.Flags = syntax.FoldCase
|
|
390
|
+
}
|
|
391
|
+
|
|
392
|
+
gotPatterns, ok := unwrapCase(re, tc.inputPatterns, tc.maxLiterals)
|
|
393
|
+
if tc.wantOverflow {
|
|
394
|
+
if ok {
|
|
395
|
+
t.Fatalf("expected to get an overflow, got success")
|
|
396
|
+
}
|
|
397
|
+
|
|
398
|
+
return
|
|
399
|
+
}
|
|
400
|
+
|
|
401
|
+
if !ok {
|
|
402
|
+
t.Fatalf("unexpected overflow")
|
|
403
|
+
}
|
|
404
|
+
|
|
405
|
+
sort.Strings(tc.wantOutput)
|
|
406
|
+
sort.Strings(gotPatterns)
|
|
407
|
+
if !reflect.DeepEqual(tc.wantOutput, gotPatterns) {
|
|
408
|
+
t.Fatalf("returned list of patterns (%#v) does not match the expected value (%#v)", gotPatterns, tc.wantOutput)
|
|
409
|
+
}
|
|
410
|
+
})
|
|
411
|
+
}
|
|
412
|
+
}
|
|
413
|
+
|
|
414
|
+
// TestFindLongestCommonLiteral tests finding longest literal in a regexp.
|
|
415
|
+
func TestFindLongestCommonLiteral(t *testing.T) {
|
|
416
|
+
cases := []struct {
|
|
417
|
+
input string
|
|
418
|
+
wantOutput string
|
|
419
|
+
}{
|
|
420
|
+
{
|
|
421
|
+
input: "simple phrase",
|
|
422
|
+
wantOutput: "simple phrase",
|
|
423
|
+
},
|
|
424
|
+
{
|
|
425
|
+
input: "simple (phrase)?",
|
|
426
|
+
wantOutput: "simple ",
|
|
427
|
+
},
|
|
428
|
+
{
|
|
429
|
+
input: "[iI]",
|
|
430
|
+
wantOutput: "",
|
|
431
|
+
},
|
|
432
|
+
{
|
|
433
|
+
input: "[i]b",
|
|
434
|
+
wantOutput: "ib",
|
|
435
|
+
},
|
|
436
|
+
{
|
|
437
|
+
input: "simple (phrase)+",
|
|
438
|
+
wantOutput: "simple ",
|
|
439
|
+
},
|
|
440
|
+
{
|
|
441
|
+
input: "a*",
|
|
442
|
+
wantOutput: "",
|
|
443
|
+
},
|
|
444
|
+
{
|
|
445
|
+
input: "(abc)|(ab)",
|
|
446
|
+
wantOutput: "",
|
|
447
|
+
},
|
|
448
|
+
}
|
|
449
|
+
|
|
450
|
+
for _, tc := range cases {
|
|
451
|
+
tc := tc
|
|
452
|
+
|
|
453
|
+
t.Run(tc.input, func(t *testing.T) {
|
|
454
|
+
re, err := syntax.Parse(tc.input, syntax.Perl)
|
|
455
|
+
if err != nil {
|
|
456
|
+
t.Fatalf("failed to parse regexp %q: %v", tc.input, err)
|
|
457
|
+
}
|
|
458
|
+
|
|
459
|
+
gotOutput := findLongestCommonLiteral(re)
|
|
460
|
+
|
|
461
|
+
if gotOutput != tc.wantOutput {
|
|
462
|
+
t.Fatalf("returned value (%q) does not match the expected value (%q)", gotOutput, tc.wantOutput)
|
|
463
|
+
}
|
|
464
|
+
})
|
|
465
|
+
}
|
|
466
|
+
}
|
|
467
|
+
|
|
10
468
|
func contains(list []int, value int) bool {
|
|
11
469
|
for _, elem := range list {
|
|
12
470
|
if elem == value {
|