regexp-examples 1.1.0 → 1.1.2

Sign up to get free protection for your applications and to get access to all the features.
@@ -8,7 +8,7 @@ module RegexpExamples
8
8
  def self.permutations_of_strings(arrays_of_strings)
9
9
  first = arrays_of_strings.shift
10
10
  return first if arrays_of_strings.empty?
11
- first.product( permutations_of_strings(arrays_of_strings) ).map do |result|
11
+ first.product(permutations_of_strings(arrays_of_strings)).map do |result|
12
12
  join_preserving_capture_groups(result)
13
13
  end
14
14
  end
@@ -16,8 +16,8 @@ module RegexpExamples
16
16
  def self.join_preserving_capture_groups(result)
17
17
  result.flatten!
18
18
  subgroups = result
19
- .map(&:all_subgroups)
20
- .flatten
19
+ .map(&:all_subgroups)
20
+ .flatten
21
21
 
22
22
  # Only save the LAST group from repeated capture groups, e.g. /([ab]){2}/
23
23
  subgroups.delete_if do |subgroup|
@@ -35,12 +35,12 @@ module RegexpExamples
35
35
  end
36
36
 
37
37
  private
38
+
38
39
  def self.generic_map_result(repeaters, method)
39
40
  repeaters
40
- .map {|repeater| repeater.public_send(method)}
41
+ .map { |repeater| repeater.public_send(method) }
41
42
  .instance_eval do |partial_results|
42
43
  RegexpExamples.permutations_of_strings(partial_results)
43
44
  end
44
45
  end
45
46
  end
46
-
@@ -15,9 +15,7 @@ module RegexpExamples
15
15
  repeaters = []
16
16
  until end_of_regexp
17
17
  group = parse_group(repeaters)
18
- if group.is_a? OrGroup
19
- return [OneTimeRepeater.new(group)]
20
- end
18
+ return [OneTimeRepeater.new(group)] if group.is_a? OrGroup
21
19
  @current_position += 1
22
20
  repeaters << parse_repeater(group)
23
21
  end
@@ -101,12 +99,16 @@ module RegexpExamples
101
99
  @current_position += 1
102
100
  case
103
101
  when rest_of_string =~ /\A(\d{1,3})/
104
- @current_position += ($1.length - 1) # In case of 10+ backrefs!
105
- group = parse_backreference_group($1)
102
+ @current_position += (Regexp.last_match(1).length - 1) # In case of 10+ backrefs!
103
+ group = parse_backreference_group(Regexp.last_match(1))
106
104
  when rest_of_string =~ /\Ak['<]([\w-]+)['>]/ # Named capture group
107
- @current_position += ($1.length + 2)
108
- # Check for RELATIVE group number, e.g. /(a)(b)(c)(d) \k<-2>/
109
- group_id = ($1.to_i < 0) ? (@num_groups + $1.to_i + 1) : $1
105
+ @current_position += (Regexp.last_match(1).length + 2)
106
+ group_id = if Regexp.last_match(1).to_i < 0
107
+ # RELATIVE group number, e.g. /(a)(b)(c)(d) \k<-2>/
108
+ @num_groups + Regexp.last_match(1).to_i + 1
109
+ else
110
+ Regexp.last_match(1)
111
+ end
110
112
  group = parse_backreference_group(group_id)
111
113
  when BackslashCharMap.keys.include?(next_char)
112
114
  group = CharGroup.new(
@@ -114,32 +116,39 @@ module RegexpExamples
114
116
  @ignorecase
115
117
  )
116
118
  when rest_of_string =~ /\A(c|C-)(.)/ # Control character
117
- @current_position += $1.length
118
- group = parse_single_char_group( parse_control_character($2) )
119
+ @current_position += Regexp.last_match(1).length
120
+ group = parse_single_char_group(parse_control_character(Regexp.last_match(2)))
119
121
  when rest_of_string =~ /\Ax(\h{1,2})/ # Escape sequence
120
- @current_position += $1.length
121
- group = parse_single_char_group( parse_escape_sequence($1) )
122
+ @current_position += Regexp.last_match(1).length
123
+ group = parse_single_char_group(parse_escape_sequence(Regexp.last_match(1)))
122
124
  when rest_of_string =~ /\Au(\h{4}|\{\h{1,4}\})/ # Unicode sequence
123
- @current_position += $1.length
124
- sequence = $1.match(/\h{1,4}/)[0] # Strip off "{" and "}"
125
- group = parse_single_char_group( parse_unicode_sequence(sequence) )
125
+ @current_position += Regexp.last_match(1).length
126
+ sequence = Regexp.last_match(1).match(/\h{1,4}/)[0] # Strip off "{" and "}"
127
+ group = parse_single_char_group(parse_unicode_sequence(sequence))
126
128
  when rest_of_string =~ /\A(p)\{(\^?)([^}]+)\}/i # Named properties
127
- @current_position += ($2.length + $3.length + 2)
128
- is_negative = ($1 == "P") ^ ($2 == "^") # Beware of double negatives! E.g. /\P{^Space}/
129
+ @current_position += (Regexp.last_match(2).length + # 0 or 1, of '^' is present
130
+ Regexp.last_match(3).length + # Length of the property name
131
+ 2) # Length of opening and closing brackets (always 2)
132
+ # Beware of double negatives! E.g. /\P{^Space}/
133
+ is_negative = (Regexp.last_match(1) == 'P') ^ (Regexp.last_match(2) == '^')
129
134
  group = CharGroup.new(
130
135
  if is_negative
131
- CharSets::Any.dup - NamedPropertyCharMap[$3.downcase]
136
+ CharSets::Any.dup - NamedPropertyCharMap[Regexp.last_match(3).downcase]
132
137
  else
133
- NamedPropertyCharMap[$3.downcase]
138
+ NamedPropertyCharMap[Regexp.last_match(3).downcase]
134
139
  end,
135
140
  @ignorecase
136
141
  )
137
142
  when next_char == 'K' # Keep (special lookbehind that CAN be supported safely!)
138
143
  group = PlaceHolderGroup.new
139
144
  when next_char == 'R' # Linebreak
140
- group = CharGroup.new(["\r\n", "\n", "\v", "\f", "\r"], @ignorecase) # A bit hacky...
145
+ group = CharGroup.new(
146
+ ["\r\n", "\n", "\v", "\f", "\r"],
147
+ @ignorecase
148
+ ) # Using "\r\n" as one character is little bit hacky...
141
149
  when next_char == 'g' # Subexpression call
142
- raise IllegalSyntaxError, "Subexpression calls (\\g) cannot be supported, as they are not regular"
150
+ fail IllegalSyntaxError,
151
+ 'Subexpression calls (\\g) cannot be supported, as they are not regular'
143
152
  when next_char =~ /[bB]/ # Anchors
144
153
  raise_anchors_exception!
145
154
  when next_char =~ /[AG]/ # Start of string
@@ -155,7 +164,7 @@ module RegexpExamples
155
164
  raise_anchors_exception!
156
165
  end
157
166
  else
158
- group = parse_single_char_group( next_char )
167
+ group = parse_single_char_group(next_char)
159
168
  end
160
169
  group
161
170
  end
@@ -193,7 +202,7 @@ module RegexpExamples
193
202
  comment_group = rest_of_string.match(/.*?[^\\](?:\\{2})*\)/)[0]
194
203
  @current_position += comment_group.length
195
204
  when match[2] =~ /\A(?=[mix-]+)([mix]*)-?([mix]*)/ # e.g. /(?i-mx)/
196
- regexp_options_toggle($1, $2)
205
+ regexp_options_toggle(Regexp.last_match(1), Regexp.last_match(2))
197
206
  @num_groups -= 1 # Toggle "groups" should not increase backref group count
198
207
  @current_position += $&.length + 1
199
208
  if next_char == ':' # e.g. /(?i:subexpr)/
@@ -202,9 +211,11 @@ module RegexpExamples
202
211
  return PlaceHolderGroup.new
203
212
  end
204
213
  when %w(! =).include?(match[2]) # e.g. /(?=lookahead)/, /(?!neglookahead)/
205
- raise IllegalSyntaxError, "Lookaheads are not regular; cannot generate examples"
214
+ fail IllegalSyntaxError,
215
+ 'Lookaheads are not regular; cannot generate examples'
206
216
  when %w(! =).include?(match[3]) # e.g. /(?<=lookbehind)/, /(?<!neglookbehind)/
207
- raise IllegalSyntaxError, "Lookbehinds are not regular; cannot generate examples"
217
+ fail IllegalSyntaxError,
218
+ 'Lookbehinds are not regular; cannot generate examples'
208
219
  else # e.g. /(?<name>namedgroup)/
209
220
  @current_position += (match[3].length + 3)
210
221
  group_id = match[3]
@@ -226,12 +237,12 @@ module RegexpExamples
226
237
  end
227
238
 
228
239
  def regexp_options_toggle(on, off)
229
- @ignorecase = true if (on.include? "i")
230
- @ignorecase = false if (off.include? "i")
231
- @multiline = true if (on.include? "m")
232
- @multiline = false if (off.include? "m")
233
- @extended = true if (on.include? "x")
234
- @extended = false if (off.include? "x")
240
+ @ignorecase = true if on.include? 'i'
241
+ @ignorecase = false if off.include? 'i'
242
+ @multiline = true if on.include? 'm'
243
+ @multiline = false if off.include? 'm'
244
+ @extended = true if on.include? 'x'
245
+ @extended = false if off.include? 'x'
235
246
  end
236
247
 
237
248
  def parse_char_group
@@ -252,7 +263,6 @@ module RegexpExamples
252
263
  OrGroup.new(left_repeaters, right_repeaters)
253
264
  end
254
265
 
255
-
256
266
  def parse_single_char_group(char)
257
267
  SingleCharGroup.new(char, @ignorecase)
258
268
  end
@@ -310,17 +320,18 @@ module RegexpExamples
310
320
  end
311
321
 
312
322
  def parse_reluctant_or_possessive_range_repeater(repeater, min, has_comma, max)
313
- # .{1}? should be equivalent to (?:.{1})?, i.e. NOT a "non-greedy quantifier"
314
- if min && !has_comma && !max && next_char == "?"
315
- repeater = parse_question_mark_repeater(repeater)
316
- else
317
- parse_reluctant_or_possessive_repeater
318
- end
319
- repeater
323
+ # .{1}? should be equivalent to (?:.{1})?, i.e. NOT a "non-greedy quantifier"
324
+ if min && !has_comma && !max && next_char == '?'
325
+ repeater = parse_question_mark_repeater(repeater)
326
+ else
327
+ parse_reluctant_or_possessive_repeater
328
+ end
329
+ repeater
320
330
  end
321
331
 
322
332
  def raise_anchors_exception!
323
- raise IllegalSyntaxError, "Anchors ('#{next_char}') cannot be supported, as they are not regular"
333
+ fail IllegalSyntaxError,
334
+ "Anchors ('#{next_char}') cannot be supported, as they are not regular"
324
335
  end
325
336
 
326
337
  def parse_one_time_repeater(group)
@@ -336,8 +347,7 @@ module RegexpExamples
336
347
  end
337
348
 
338
349
  def end_of_regexp
339
- next_char == ")" || @current_position >= regexp_string.length
350
+ next_char == ')' || @current_position >= regexp_string.length
340
351
  end
341
352
  end
342
353
  end
343
-
@@ -6,11 +6,11 @@ module RegexpExamples
6
6
  end
7
7
 
8
8
  def result
9
- group_results = group.result[0 .. RegexpExamples.MaxGroupResults-1]
9
+ group_results = group.result.first(RegexpExamples.MaxGroupResults)
10
10
  results = []
11
11
  min_repeats.upto(max_repeats) do |repeats|
12
12
  if repeats.zero?
13
- results << [ GroupResult.new('') ]
13
+ results << [GroupResult.new('')]
14
14
  else
15
15
  results << RegexpExamples.permutations_of_strings(
16
16
  [group_results] * repeats
@@ -23,8 +23,8 @@ module RegexpExamples
23
23
  def random_result
24
24
  result = []
25
25
  rand(min_repeats..max_repeats).times { result << group.random_result }
26
- result << [ GroupResult.new('') ] if result.empty? # in case of 0.times
27
- RegexpExamples::permutations_of_strings(result)
26
+ result << [GroupResult.new('')] if result.empty? # in case of 0.times
27
+ RegexpExamples.permutations_of_strings(result)
28
28
  end
29
29
  end
30
30
 
@@ -74,9 +74,9 @@ module RegexpExamples
74
74
  end
75
75
 
76
76
  private
77
+
77
78
  def smallest(x, y)
78
79
  (x < y) ? x : y
79
80
  end
80
81
  end
81
82
  end
82
-
@@ -0,0 +1,45 @@
1
require 'pstore'

module RegexpExamples
  # Read access to the pre-computed code points that match each named
  # character property. The data lives in a PStore file under db/.
  class UnicodeCharRanges
    # These values were generated by: scripts/unicode_lister.rb
    # Note: Only the first 128 results are listed, for performance.
    # Also, some groups seem to have no matches (weird!)
    # The file name uses only "major.minor" of the Ruby version (the micro
    # version is ignored); matching on digits keeps a two-digit minor intact.
    STORE_FILENAME = "unicode_ranges_#{RUBY_VERSION[/\A\d+\.\d+/]}.pstore".freeze

    attr_reader :range_store

    # filename - name of the PStore file, resolved relative to the db/
    #            directory three levels above this file.
    def initialize(filename = STORE_FILENAME)
      @range_store = PStore.new(File.expand_path("../../../db/#{filename}", __FILE__))
    end

    # Looks up `key` in a read-only transaction.
    #
    # Returns an Array of single-character Strings, one per stored code point.
    def get(key)
      range_store.transaction(true) do
        ranges_to_unicode(range_store[key])
      end
    end

    alias_method :[], :get

    private

    # Expands the compressed list kept in the store into characters.
    # Entries are either a lone Integer or a Range of Integers.
    #
    # Example input:  [65, 97..99]
    # Example output: ["A", "a", "b", "c"]
    def ranges_to_unicode(ranges)
      ranges.flat_map do |range|
        # Array() turns a lone Integer into [int] and a Range into its members.
        Array(range).map { |codepoint| hex_to_unicode(codepoint.to_s(16)) }
      end
    end

    # hex - String holding a code point in hexadecimal, e.g. "3b1".
    #
    # Returns the matching one-character UTF-8 String.
    def hex_to_unicode(hex)
      Integer(hex, 16).chr(Encoding::UTF_8)
    end
  end
end
@@ -1,3 +1,3 @@
1
1
  module RegexpExamples
2
- VERSION = '1.1.0'
2
+ VERSION = '1.1.2'
3
3
  end
@@ -1,4 +1,4 @@
1
- require File.expand_path("../lib/regexp-examples/version", __FILE__)
1
+ require File.expand_path('../lib/regexp-examples/version', __FILE__)
2
2
 
3
3
  Gem::Specification.new do |s|
4
4
  s.name = 'regexp-examples'
@@ -11,11 +11,11 @@ Gem::Specification.new do |s|
11
11
  s.files = `git ls-files -z`.split("\x0")
12
12
  s.executables = s.files.grep(%r{^bin/}) { |f| File.basename(f) }
13
13
  s.test_files = s.files.grep(%r{^(test|spec|features)/})
14
- s.require_paths = ["lib"]
14
+ s.require_paths = ['lib']
15
15
  s.homepage =
16
16
  'http://rubygems.org/gems/regexp-examples'
17
- s.add_development_dependency "bundler", "~> 1.7"
18
- s.add_development_dependency "rake", "~> 10.0"
17
+ s.add_development_dependency 'bundler', '~> 1.7'
18
+ s.add_development_dependency 'rake', '~> 10.0'
19
19
  s.license = 'MIT'
20
20
  s.required_ruby_version = '>= 2.0.0'
21
21
  end
@@ -1,180 +1,64 @@
1
+ require 'pstore'
2
+ require_relative '../lib/regexp-examples/unicode_char_ranges'
1
3
  # A script to generate lists of all unicode characters
2
4
  # that match all named group/character properties regexps.
3
5
  # For use in e.g. /\p{Arabic}/.examples
4
6
 
5
7
  # To (re-)generate this list, simply run this file!
6
8
  # > ruby scripts/unicode_lister.rb
7
- OutputFilename = 'unicode_result'
8
9
 
9
10
  # Taken from ruby documentation:
10
11
  # http://ruby-doc.org//core-2.2.0/Regexp.html#class-Regexp-label-Character+Properties
11
12
  NamedGroups = %w(
12
- Alnum
13
- Alpha
14
- Blank
15
- Cntrl
16
- Digit
17
- Graph
18
- Lower
19
- Print
20
- Punct
21
- Space
22
- Upper
23
- XDigit
24
- Word
25
- ASCII
26
- Any
27
- Assigned
13
+ Alnum Alpha Blank Cntrl Digit Graph Lower Print Punct Space Upper XDigit Word ASCII Any Assigned
28
14
 
29
- L
30
- Ll
31
- Lm
32
- Lo
33
- Lt
34
- Lu
35
- M
36
- Mn
37
- Mc
38
- Me
39
- N
40
- Nd
41
- Nl
42
- No
43
- P
44
- Pc
45
- Pd
46
- Ps
47
- Pe
48
- Pi
49
- Pf
50
- Po
51
- S
52
- Sm
53
- Sc
54
- Sk
55
- So
56
- Z
57
- Zs
58
- Zl
59
- Zp
60
- C
61
- Cc
62
- Cf
63
- Cn
64
- Co
65
- Cs
15
+ L Ll Lm Lo Lt Lu M Mn Mc Me N Nd Nl No P Pc Pd Ps Pe Pi Pf Po S Sm Sc Sk So Z Zs Zl Zp C Cc Cf Cn Co Cs
66
16
 
67
- Arabic
68
- Armenian
69
- Balinese
70
- Bengali
71
- Bopomofo
72
- Braille
73
- Buginese
74
- Buhid
75
- Canadian_Aboriginal
76
- Carian
77
- Cham
78
- Cherokee
79
- Common
80
- Coptic
81
- Cuneiform
82
- Cypriot
83
- Cyrillic
84
- Deseret
85
- Devanagari
86
- Ethiopic
87
- Georgian
88
- Glagolitic
89
- Gothic
90
- Greek
91
- Gujarati
92
- Gurmukhi
93
- Han
94
- Hangul
95
- Hanunoo
96
- Hebrew
97
- Hiragana
98
- Inherited
99
- Kannada
100
- Katakana
101
- Kayah_Li
102
- Kharoshthi
103
- Khmer
104
- Lao
105
- Latin
106
- Lepcha
107
- Limbu
108
- Linear_B
109
- Lycian
110
- Lydian
111
- Malayalam
112
- Mongolian
113
- Myanmar
114
- New_Tai_Lue
115
- Nko
116
- Ogham
117
- Ol_Chiki
118
- Old_Italic
119
- Old_Persian
120
- Oriya
121
- Osmanya
122
- Phags_Pa
123
- Phoenician
124
- Rejang
125
- Runic
126
- Saurashtra
127
- Shavian
128
- Sinhala
129
- Sundanese
130
- Syloti_Nagri
131
- Syriac
132
- Tagalog
133
- Tagbanwa
134
- Tai_Le
135
- Tamil
136
- Telugu
137
- Thaana
138
- Thai
139
- Tibetan
140
- Tifinagh
141
- Ugaritic
142
- Vai
143
- Yi
17
+ Arabic Armenian Balinese Bengali Bopomofo Braille Buginese Buhid Canadian_Aboriginal Carian Cham Cherokee
18
+ Common Coptic Cuneiform Cypriot Cyrillic Deseret Devanagari Ethiopic Georgian Glagolitic Gothic Greek
19
+ Gujarati Gurmukhi Han Hangul Hanunoo Hebrew Hiragana Inherited Kannada Katakana Kayah_Li Kharoshthi Khmer
20
+ Lao Latin Lepcha Limbu Linear_B Lycian Lydian Malayalam Mongolian Myanmar New_Tai_Lue Nko Ogham Ol_Chiki
21
+ Old_Italic Old_Persian Oriya Osmanya Phags_Pa Phoenician Rejang Runic Saurashtra Shavian Sinhala Sundanese
22
+ Syloti_Nagri Syriac Tagalog Tagbanwa Tai_Le Tamil Telugu Thaana Thai Tibetan Tifinagh Ugaritic Vai Yi
144
23
  )
145
24
 
146
- # Note: For some reason, a character encoding-related exception gets raised
147
- # when I do `/regex/ =~ eval("?\\u{#{x.to_s(16)}}")` in the range: 55296..57343
148
- # This means my calculation is MISSING results in the range: 55296..65535
149
- # However, for the sake of performance, I'm also being "lazy" and only calculating/saving
150
- # the first 128 matches anyway!
151
- # If anyone ever cares about this (I doubt it), I'll look into fixing/improving it.
25
# Note: The code points 55296..57343 (U+D800..U+DFFF) are reserved for UTF-16
# surrogates; on their own they are not legal unicode characters.
# I.e. a character encoding-related exception gets raised when you do:
# `/regex/ =~ eval("?\\u{#{x.to_s(16)}}")`
# TODO: Add a link to somewhere that explains this better.
152
30
 
153
- # Example input: [1, 2, 3, 4, 6, 7, 12, 14] (Array)
154
- # Example output: "1..4, 6..7, 12, 14" (String)
31
+ # "Compresses" the values in an array by using ranges.
32
+ # Example input: [1, 2, 3, 4, 6, 7, 12, 14]
33
+ # Example output: [1..4, 6..7, 12, 14]
155
34
  def calculate_ranges(matching_codes)
156
- return "" if matching_codes.empty?
35
+ return [] if matching_codes.empty?
157
36
  first = matching_codes.shift
158
- matching_codes.inject([first..first]) do |r,x|
37
+ matching_codes.inject([first..first]) do |r, x|
159
38
  if r.last.last.succ != x
160
39
  r << (x..x) # Start new range
161
40
  else
162
41
  r[0..-2] << (r.last.first..x) # Update last range
163
42
  end
164
43
  end
165
- .map { |range| range.size == 1 ? range.first : range}
166
- .join(", ")
44
+ .map { |range| range.size == 1 ? range.first : range } # Replace `int..int` with `int`
167
45
  end
168
46
 
169
47
  count = 0
170
- File.open(OutputFilename, 'w') do |f|
48
+ filename = RegexpExamples::UnicodeCharRanges::STORE_FILENAME
49
+ store = PStore.new(filename)
50
+ store.transaction do
171
51
  NamedGroups.each do |name|
172
- count += 1
173
- matching_codes = (0..55295).lazy.select { |x| /\p{#{name}}/ =~ eval("?\\u{#{x.to_s(16)}}") }.first(128)
174
- f.puts "'#{name.downcase}' => ranges_to_unicode(#{calculate_ranges(matching_codes)}),"
52
+ count += 1
53
+ # Only generating first 128 matches, for performance...
54
+ # (I have tried this with generating ALL examples, and it makes the ruby gem
55
+ # painfully slow and bloated... Especially the test suite.)
56
+ matching_codes = [(0..55_295), (57_344..65_535)].map(&:to_a).flatten.lazy
57
+ .find { |x| /\p{#{name}}/ =~ eval("?\\u{#{x.to_s(16)}}") }
58
+ (128)
59
+ store[name.downcase] = calculate_ranges(matching_codes)
175
60
  puts "(#{count}/#{NamedGroups.length}) Finished property: #{name}"
176
61
  end
177
- puts "*"*50
178
- puts "Finished! Result stored in: #{OutputFilename}"
62
+ puts '*' * 50
63
+ puts "Finished! Result stored in: ./db/#{filename}"
179
64
  end
180
-