list_matcher 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,23 @@
1
+ # coding: utf-8
2
+ lib = File.expand_path('../lib', __FILE__)
3
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
4
+ require 'list_matcher/version'
5
+
6
+ Gem::Specification.new do |spec| # packaging metadata for the list_matcher gem
7
+ spec.name = "list_matcher"
8
+ spec.version = ListMatcher::VERSION # presumably defined by list_matcher/version, which is required before this block
9
+ spec.authors = ["dfhoughton"]
10
+ spec.email = ["dfhoughton@gmail.com"]
11
+ spec.summary = %q{List::Matcher automates the generation of efficient regular expressions.}
12
+ spec.description = spec.summary # the description reuses the summary text verbatim
13
+ spec.homepage = "https://github.com/dfhoughton/list_matcher"
14
+ spec.license = "MIT"
15
+
16
+ spec.files = `git ls-files -z`.split("\x0") # shells out to git; the -z output is split on NUL bytes
17
+ spec.executables = spec.files.grep(%r{^bin/}) { |f| File.basename(f) } # base names of the packaged files under bin/
18
+ spec.test_files = spec.files.grep(%r{^(test|spec|features)/}) # packaged files under test/, spec/ or features/
19
+ spec.require_paths = ["lib"]
20
+
21
+ spec.add_development_dependency "bundler", "~> 1.7"
22
+ spec.add_development_dependency "rake", "~> 10.0"
23
+ end
@@ -0,0 +1,248 @@
1
+ require "minitest/autorun"
2
+
3
+ require "list_matcher"
4
+
5
+ class BasicTest < Minitest::Test # pins the pattern strings List::Matcher produces under various options
6
+
7
+ def test_simple # a pattern built from a word list matches every word in that list
8
+ words = %w(cat dog camel)
9
+ rx = List::Matcher.pattern words
10
+ rx = Regexp.new rx
11
+ words.each do |w|
12
+ assert rx === w
13
+ end
14
+ end
15
+
16
+ def test_word_chars # all \w characters in codes 1..255 plus one extra character yield a class written with \w
17
+ word = (1..255).map(&:chr).select{ |c| /\w/ === c }
18
+ chars = word + ['+']
19
+ rx = List::Matcher.pattern chars
20
+ assert_equal '[+\w]', rx
21
+ rx = Regexp.new rx
22
+ chars.each do |c|
23
+ assert rx === c
24
+ end
25
+ chars = word + ['@']
26
+ rx = List::Matcher.pattern chars
27
+ assert_equal '[@\w]', rx
28
+ rx = Regexp.new rx
29
+ chars.each do |c|
30
+ assert rx === c
31
+ end
32
+ end
33
+
34
+ def test_word_chars_case_insensitive # same input as test_word_chars, with the case-insensitive group wrapper expected
35
+ word = (1..255).map(&:chr).select{ |c| /\w/ === c }
36
+ chars = word + ['+']
37
+ rx = List::Matcher.pattern chars, case_insensitive: true
38
+ assert_equal '(?i:[+\w])', rx
39
+ rx = Regexp.new rx
40
+ chars.each do |c|
41
+ assert rx === c
42
+ end
43
+ end
44
+
45
+ def test_num_chars # the ten single-digit strings collapse to \d
46
+ words = (0..9).map(&:to_s)
47
+ rx = List::Matcher.pattern words
48
+ assert_equal '\d', rx
49
+ rx = Regexp.new rx
50
+ words.each do |w|
51
+ assert rx === w
52
+ end
53
+ end
54
+
55
+ def test_space_chars # every \s character in codes 1..255 collapses to \s
56
+ words = (1..255).map(&:chr).select{ |c| c =~ /\s/ }
57
+ rx = List::Matcher.pattern words
58
+ assert_equal '\s', rx
59
+ rx = Regexp.new rx
60
+ words.each do |w|
61
+ assert rx === w
62
+ end
63
+ end
64
+
65
+ def test_bounds # bound: true puts \b on both sides of the alternation
66
+ words = %w(cat dog)
67
+ rx = List::Matcher.pattern words, bound: true
68
+ assert_equal '(?:\b(?:cat|dog)\b)', rx
69
+ rx = Regexp.new rx
70
+ words.each do |w|
71
+ assert rx === w
72
+ end
73
+ end
74
+
75
+ def test_repeats # repeated characters and repeated sequences are written with counted quantifiers
76
+ rx = List::Matcher.pattern %w(aaaaaaaaaa)
77
+ assert_equal '(?:a{10})', rx
78
+ rx = List::Matcher.pattern %w(bbbaaaaaaaaaabbbaaaaaaaaaa)
79
+ assert_equal '(?:(?:bbba{10}){2})', rx
80
+ end
81
+
82
+ def test_opt_suffix # a word and its one-letter extension become a stem with an optional last letter
83
+ words = %w(the them)
84
+ rx = List::Matcher.pattern words
85
+ assert_equal '(?:them?)', rx
86
+ rx = Regexp.new rx
87
+ words.each do |w|
88
+ assert rx === w
89
+ end
90
+ end
91
+
92
+ def test_opt_prefix # a word and its one-letter-prefixed form become an optional first letter
93
+ words = %w(at cat)
94
+ rx = List::Matcher.pattern words
95
+ assert_equal '(?:c?at)', rx
96
+ rx = Regexp.new rx
97
+ words.each do |w|
98
+ assert rx === w
99
+ end
100
+ end
101
+
102
+ def test_symbols_string # a string key in symbols: is replaced by its mapped pattern text
103
+ words = ['cat dog']
104
+ rx = List::Matcher.pattern words, symbols: { ' ' => '\s++' }
105
+ assert_equal '(?:cat\s++dog)', rx
106
+ rx = Regexp.new rx
107
+ words.each do |w|
108
+ assert rx === w
109
+ end
110
+ end
111
+
112
+ def test_symbols_rx # a Regexp key in symbols: with a nil value shows up in the pattern in its stringified form
113
+ words = %w(year year2000 year1999)
114
+ rx = List::Matcher.pattern words, symbols: { /(?<!\d)\d{4}(?!\d)/ => nil }
115
+ assert_equal '(?:year(?-mix:(?<!\d)\d{4}(?!\d))?)', rx
116
+ rx = Regexp.new rx
117
+ words.each do |w|
118
+ assert rx === w
119
+ end
120
+ end
121
+
122
+ def test_fancy_rx # bound plus normalize_whitespace: outer spaces are dropped and the inner space becomes \s++
123
+ words = [' cat dog ']
124
+ good = ['the cat dog is an odd beast']
125
+ bad = ['the catdog is an odd beast', 'the cat doggy is an odd beast', 'the scat dog is an odd beast']
126
+ rx = List::Matcher.pattern words, bound: true, normalize_whitespace: true
127
+ assert_equal '(?:\bcat\s++dog\b)', rx
128
+ rx = Regexp.new rx
129
+ assert good.all?{ |w| rx === w }, 'not bothered by odd space'
130
+ assert bad.none?{ |w| rx === w }, 'needs interior space and boundaries'
131
+ end
132
+
133
+ def test_symbols_borders # hash-valued bound: the numbers 1..31 match between letters but not between digits
134
+ words = (1..31).to_a
135
+ rx = List::Matcher.pattern words, bound: { test: /\d/, left: '(?<!\d)', right: '(?!\d)' }
136
+ rx = Regexp.new rx
137
+ good = words.map{ |n| "a#{n}b" }
138
+ bad = words.map{ |n| "0#{n}0" }
139
+ assert good.all?{ |w| rx === w }
140
+ assert bad.none?{ |w| rx === w }
141
+ end
142
+
143
+ def test_string_bound # bound: :string anchors the pattern with \A and \z
144
+ rx = List::Matcher.pattern ['cat'], bound: :string
145
+ assert_equal '(?:\Acat\z)', rx
146
+ rx = Regexp.new rx
147
+ assert rx === 'cat', 'matches whole string'
148
+ assert "cat\ndog" !~ rx, 'line breaks do not suffice'
149
+ assert ' cat ' !~ rx, 'word boundaries do not suffice'
150
+ end
151
+
152
+ def test_line_bound # bound: :line anchors the pattern with ^ and $
153
+ rx = List::Matcher.pattern ['cat'], bound: :line
154
+ assert_equal '(?:^cat$)', rx
155
+ rx = Regexp.new rx
156
+ assert rx === 'cat', 'matches whole string'
157
+ assert rx === "cat\ndog", 'line breaks suffice'
158
+ assert ' cat ' !~ rx, 'word boundaries do not suffice'
159
+ end
160
+
161
+ def test_dup_atomic # atomic: false given to the pattern call overrides atomic: true given to the constructor
162
+ m = List::Matcher.new atomic: true
163
+ rx = m.pattern %w( cat dog ), atomic: false
164
+ assert_equal "cat|dog", rx
165
+ end
166
+
167
+ def test_dup_backtracking # backtracking: false given to the pattern call overrides the constructor setting
168
+ m = List::Matcher.new backtracking: true
169
+ rx = m.pattern %w( cat dog ), backtracking: false
170
+ assert_equal "(?>cat|dog)", rx
171
+ end
172
+
173
+ def test_dup_bound # bound: true given to the pattern call on a matcher built with bound: false
174
+ m = List::Matcher.new bound: false, atomic: false
175
+ rx = m.pattern %w( cat dog ), bound: true
176
+ assert_equal '\b(?:cat|dog)\b', rx
177
+ end
178
+
179
+ def test_dup_bound_string # bound: :string given to the pattern call on a matcher built with bound: false
180
+ m = List::Matcher.new bound: false, atomic: false
181
+ rx = m.pattern %w( cat dog ), bound: :string
182
+ assert_equal '\A(?:cat|dog)\z', rx
183
+ end
184
+
185
+ def test_dup_bound_line # bound: :line given to the pattern call on a matcher built with bound: false
186
+ m = List::Matcher.new bound: false, atomic: false
187
+ rx = m.pattern %w( cat dog ), bound: :line
188
+ assert_equal '^(?:cat|dog)$', rx
189
+ end
190
+
191
+ def test_dup_bound_fancy # hash-valued bound given to the pattern call on a matcher built with bound: false
192
+ m = List::Matcher.new bound: false, atomic: false
193
+ rx = m.pattern %w( 1 2 ), bound: { test: /\d/, left: '(?<!\d)', right: '(?!\d)' }
194
+ assert_equal '(?<!\d)[12](?!\d)', rx
195
+ end
196
+
197
+ def test_dup_strip # strip: true given to the pattern call removes the surrounding spaces
198
+ m = List::Matcher.new atomic: false
199
+ rx = m.pattern [%( cat )], strip: true
200
+ assert_equal 'cat', rx
201
+ end
202
+
203
+ def test_dup_case_insensitive # case_insensitive: true given to the pattern call on a default matcher
204
+ m = List::Matcher.new
205
+ rx = m.pattern %w(cat), case_insensitive: true
206
+ assert_equal '(?i:cat)', rx
207
+ end
208
+
209
+ def test_dup_normalize_whitespace # normalize_whitespace: true given to the pattern call on a non-atomic matcher
210
+ m = List::Matcher.new atomic: false
211
+ rx = m.pattern [' cat dog '], normalize_whitespace: true
212
+ assert_equal 'cat\s++dog', rx
213
+ end
214
+
215
+ def test_dup_symbols # symbols: given to the pattern call on a non-atomic matcher
216
+ m = List::Matcher.new atomic: false
217
+ rx = m.pattern ['cat dog'], symbols: { ' ' => '\s++' }
218
+ assert_equal 'cat\s++dog', rx
219
+ end
220
+
221
+ def test_multiline # multiline: true wraps the alternation in the m-flag group
222
+ rx = List::Matcher.pattern %w( cat dog ), multiline: true
223
+ assert_equal '(?m:cat|dog)', rx
224
+ end
225
+
226
+ def test_dup_multiline # multiline: true given to the pattern call on a non-atomic matcher
227
+ m = List::Matcher.new atomic: false
228
+ rx = m.pattern %w( cat dog ), multiline: true
229
+ assert_equal '(?m:cat|dog)', rx
230
+ end
231
+
232
+ def test_name # name: :foo given to the constructor yields a named capture group
233
+ m = List::Matcher.new name: :foo
234
+ rx = m.pattern %w( cat dog )
235
+ assert_equal '(?<foo>cat|dog)', rx
236
+ end
237
+
238
+ def test_vetting_good # vet: true with a valid symbol pattern; the test passes as long as nothing is raised
239
+ List::Matcher.pattern %w(cat), symbols: { foo: 'bar' }, vet: true
240
+ assert true, 'good regexen are vetted appropriately'
241
+ end
242
+
243
+ def test_vetting_bad # vet: true with a lone + as symbol pattern is expected to raise SyntaxError
244
+ assert_raises SyntaxError do
245
+ List::Matcher.pattern %w(cat), symbols: { foo: '+' }, vet: true
246
+ end
247
+ end
248
+ end
@@ -0,0 +1,149 @@
1
+ require 'list_matcher'
2
+ require 'benchmark/ips'
3
+
4
+ size = 100 # word count for the first random-word round; multiplied by 10 after every round
5
+ magnitudes = 3 # number of random-word rounds
6
+ creation_iterations = 1000 # constructions performed per measurement in the creation benchmarks
7
+
8
+ def words(n, char_range, size_range, avoid=Set.new) # returns an Array of n distinct random strings, none of them present in avoid
9
+ set = Set.new
10
+ while set.size < n do # keeps drawing until n distinct words have been collected
11
+ w = (1..rand(size_range)).map{ rand(char_range).chr }.join # length drawn from size_range, each character code drawn from char_range
12
+ next if avoid.include? w
13
+ set << w
14
+ end
15
+ set.to_a
16
+ end
17
+
18
+ def simple_rx(words) # baseline Regexp: the words joined with | with no escaping, inside an atomic group anchored by \A and \z
19
+ rx = words.join "|"
20
+ Regexp.new "\\A(?>#{rx})\\z"
21
+ end
22
+
23
+ def list_rx(words) # the List::Matcher counterpart of simple_rx, built with bound: :string
24
+ List::Matcher.rx words, bound: :string
25
+ end
26
+
27
+ puts "RANDOM WORDS, VARIABLE LENGTH\n"
28
+
29
+ magnitudes.times do # one round per magnitude; size grows tenfold at the end of each round
30
+ good = words size, 97..122, 10..15 # character codes 97..122, lengths 10..15
31
+ bad = words size, 97..122, 10..15, good # same shape, but guaranteed not to be in good
32
+ set = Set[*good]
33
+ rx = simple_rx good
34
+ lrx = list_rx good
35
+ puts "\nnumber of words: #{size}"
36
+ Benchmark.ips do |bm| # lookups of words that are in the list
37
+ bm.report('simple rx good') do
38
+ good.each{ |w| rx === w }
39
+ end
40
+ bm.report('List::Matcher good') do
41
+ good.each{ |w| lrx === w }
42
+ end
43
+ bm.report('set good') do
44
+ good.each{ |w| set.include? w }
45
+ end
46
+ bm.report('list good') do
47
+ good.each{ |w| good.include? w }
48
+ end
49
+ bm.compare!
50
+ end
51
+ Benchmark.ips do |bm| # lookups of words that are not in the list
52
+ bm.report('simple rx bad') do
53
+ bad.each{ |w| rx === w }
54
+ end
55
+ bm.report('List::Matcher bad') do
56
+ bad.each{ |w| lrx === w }
57
+ end
58
+ bm.report('set bad') do
59
+ bad.each{ |w| set.include? w }
60
+ end
61
+ bm.report('list bad') do
62
+ bad.each{ |w| good.include? w }
63
+ end
64
+ bm.compare!
65
+ end
66
+ size *= 10
67
+ end
68
+
69
+ def nums(length) # every string of the given length over the digits 0..9
70
+ variants length, 0..9
71
+ end
72
+
73
+ def alphas(length) # every string of the given length over the letters a..j
74
+ variants length, 'a'..'j'
75
+ end
76
+
77
+ def variants(length, range) # returns every string made of length elements of range, range.size ** length in total
78
+ out = []
79
+ range = range.to_a
80
+ tumblers = Array.new length, 0 # one index into range per output position, all starting at 0
81
+ (range.size ** length).times do
82
+ out << tumblers.map{ |t| range[t] }.join
83
+ tumblers[0] += 1 # advance the first position, wrapping back to 0 at range.size
84
+ tumblers[0] %= range.size
85
+ (0...length-1).each do |i| # carry into the next position whenever the current one wrapped to 0
86
+ if tumblers[i] == 0
87
+ tumblers[i + 1] += 1
88
+ tumblers[i + 1] %= range.size
89
+ else
90
+ break
91
+ end
92
+ end
93
+ end
94
+ out
95
+ end
96
+
97
+ puts "\nFIXED LENGTH, FULL RANGE\n"
98
+
99
+ (1..4).each do |i|
100
+ good = nums i
101
+ bad = alphas i
102
+ lrx = list_rx good
103
+ set = Set[*good]
104
+ rx = simple_rx good
105
+ puts "\nnumber of words: #{10 ** i}; List::Matcher rx: #{lrx}"
106
+ Benchmark.ips do |bm|
107
+ bm.report('simple rx creation') do
108
+ creation_iterations.times{ simple_rx good }
109
+ end
110
+ bm.report('List::Matcher creation') do
111
+ creation_iterations.times{ simple_rx good }
112
+ end
113
+ bm.report('set creation') do
114
+ creation_iterations.times{ Set[*good] }
115
+ end
116
+ bm.compare!
117
+ end
118
+ Benchmark.ips do |bm|
119
+ bm.report('simple rx good') do
120
+ good.each{ |w| rx === w }
121
+ end
122
+ bm.report('List::Matcher good') do
123
+ good.each{ |w| lrx === w }
124
+ end
125
+ bm.report('set good') do
126
+ good.each{ |w| set.include? w }
127
+ end
128
+ bm.report('list good') do
129
+ good.each{ |w| good.include? w }
130
+ end
131
+ bm.compare!
132
+ end
133
+ Benchmark.ips do |bm|
134
+ bm.report('simple rx bad') do
135
+ bad.each{ |w| rx === w }
136
+ end
137
+ bm.report('List::Matcher bad') do
138
+ bad.each{ |w| lrx === w }
139
+ end
140
+ bm.report('set bad') do
141
+ bad.each{ |w| set.include? w }
142
+ end
143
+ bm.report('list bad') do
144
+ bad.each{ |w| good.include? w }
145
+ end
146
+ bm.compare!
147
+ end
148
+ size *= 10
149
+ end
data/test/stress.rb ADDED
@@ -0,0 +1,44 @@
1
+ require "minitest/autorun"
2
+
3
+ require "list_matcher"
4
+
5
+ class Stress < Minitest::Test # randomized checks of List::Matcher.rx against large generated word lists
6
+ def test_simple # ten rounds of 5000 words with lengths taken from 4..8
7
+ (1..10).each{ basic_test 5000, 97..122, 4..8 }
8
+ end
9
+
10
+ def test_fixed_size # ten rounds of 5000 words, the length range being 8..8
11
+ (1..10).each{ basic_test 5000, 97..122, 8..8 }
12
+ end
13
+
14
+ def test_really_big # a single round of 50000 words
15
+ basic_test 50000, 97..122, 4..8
16
+ end
17
+
18
+ def basic_test(n, range, max) # builds a regex from the first tenth of n unique words; the remaining words must not match
19
+ words = words n, range, max
20
+ good = words[0...n/10]
21
+ bad = words[n/10..-1]
22
+ rx = List::Matcher.rx( good, bound: true )
23
+ puts good.inspect unless good.all?{ |w| rx === w } # prints the whole list when some good word fails to match
24
+ good.each do |w|
25
+ assert rx === w, "#{w} is good for #{rx}"
26
+ end
27
+ bad.each do |w|
28
+ assert !( rx === w ), "#{w} is bad for #{rx}"
29
+ end
30
+ end
31
+
32
+ def words(n, range, max) # returns n unique random words, generated in batches of n/10
33
+ words = []
34
+ while words.size < n
35
+ words += (1..n/10).map{ random_word range, max } # NOTE(review): for n < 10 the batch is empty and this loop would never finish
36
+ words.uniq!
37
+ end
38
+ words[0...n]
39
+ end
40
+
41
+ def random_word(range, max) # max is handed to rand for the length; the callers in this class pass a Range
42
+ (1..rand(max)).map{ rand(range).chr }.join
43
+ end
44
+ end
metadata ADDED
@@ -0,0 +1,87 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: list_matcher
3
+ version: !ruby/object:Gem::Version
4
+ version: 1.0.0
5
+ platform: ruby
6
+ authors:
7
+ - dfhoughton
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2015-08-16 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: bundler
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - ~>
18
+ - !ruby/object:Gem::Version
19
+ version: '1.7'
20
+ type: :development
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - ~>
25
+ - !ruby/object:Gem::Version
26
+ version: '1.7'
27
+ - !ruby/object:Gem::Dependency
28
+ name: rake
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ~>
32
+ - !ruby/object:Gem::Version
33
+ version: '10.0'
34
+ type: :development
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - ~>
39
+ - !ruby/object:Gem::Version
40
+ version: '10.0'
41
+ description: List::Matcher automates the generation of efficient regular expressions.
42
+ email:
43
+ - dfhoughton@gmail.com
44
+ executables: []
45
+ extensions: []
46
+ extra_rdoc_files: []
47
+ files:
48
+ - .gitignore
49
+ - Gemfile
50
+ - LICENSE.txt
51
+ - README.md
52
+ - Rakefile
53
+ - examples/date_grammar.rb
54
+ - lib/list_matcher.rb
55
+ - lib/list_matcher/version.rb
56
+ - list_matcher.gemspec
57
+ - test/basic_test.rb
58
+ - test/benchmarks.rb
59
+ - test/stress.rb
60
+ homepage: https://github.com/dfhoughton/list_matcher
61
+ licenses:
62
+ - MIT
63
+ metadata: {}
64
+ post_install_message:
65
+ rdoc_options: []
66
+ require_paths:
67
+ - lib
68
+ required_ruby_version: !ruby/object:Gem::Requirement
69
+ requirements:
70
+ - - '>='
71
+ - !ruby/object:Gem::Version
72
+ version: '0'
73
+ required_rubygems_version: !ruby/object:Gem::Requirement
74
+ requirements:
75
+ - - '>='
76
+ - !ruby/object:Gem::Version
77
+ version: '0'
78
+ requirements: []
79
+ rubyforge_project:
80
+ rubygems_version: 2.2.2
81
+ signing_key:
82
+ specification_version: 4
83
+ summary: List::Matcher automates the generation of efficient regular expressions.
84
+ test_files:
85
+ - test/basic_test.rb
86
+ - test/benchmarks.rb
87
+ - test/stress.rb