regexp-examples 1.1.0 → 1.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -8,7 +8,7 @@ module RegexpExamples
8
8
  def self.permutations_of_strings(arrays_of_strings)
9
9
  first = arrays_of_strings.shift
10
10
  return first if arrays_of_strings.empty?
11
- first.product( permutations_of_strings(arrays_of_strings) ).map do |result|
11
+ first.product(permutations_of_strings(arrays_of_strings)).map do |result|
12
12
  join_preserving_capture_groups(result)
13
13
  end
14
14
  end
@@ -16,8 +16,8 @@ module RegexpExamples
16
16
  def self.join_preserving_capture_groups(result)
17
17
  result.flatten!
18
18
  subgroups = result
19
- .map(&:all_subgroups)
20
- .flatten
19
+ .map(&:all_subgroups)
20
+ .flatten
21
21
 
22
22
  # Only save the LAST group from repeated capture groups, e.g. /([ab]){2}/
23
23
  subgroups.delete_if do |subgroup|
@@ -35,12 +35,12 @@ module RegexpExamples
35
35
  end
36
36
 
37
37
  private
38
+
38
39
  def self.generic_map_result(repeaters, method)
39
40
  repeaters
40
- .map {|repeater| repeater.public_send(method)}
41
+ .map { |repeater| repeater.public_send(method) }
41
42
  .instance_eval do |partial_results|
42
43
  RegexpExamples.permutations_of_strings(partial_results)
43
44
  end
44
45
  end
45
46
  end
46
-
@@ -15,9 +15,7 @@ module RegexpExamples
15
15
  repeaters = []
16
16
  until end_of_regexp
17
17
  group = parse_group(repeaters)
18
- if group.is_a? OrGroup
19
- return [OneTimeRepeater.new(group)]
20
- end
18
+ return [OneTimeRepeater.new(group)] if group.is_a? OrGroup
21
19
  @current_position += 1
22
20
  repeaters << parse_repeater(group)
23
21
  end
@@ -101,12 +99,16 @@ module RegexpExamples
101
99
  @current_position += 1
102
100
  case
103
101
  when rest_of_string =~ /\A(\d{1,3})/
104
- @current_position += ($1.length - 1) # In case of 10+ backrefs!
105
- group = parse_backreference_group($1)
102
+ @current_position += (Regexp.last_match(1).length - 1) # In case of 10+ backrefs!
103
+ group = parse_backreference_group(Regexp.last_match(1))
106
104
  when rest_of_string =~ /\Ak['<]([\w-]+)['>]/ # Named capture group
107
- @current_position += ($1.length + 2)
108
- # Check for RELATIVE group number, e.g. /(a)(b)(c)(d) \k<-2>/
109
- group_id = ($1.to_i < 0) ? (@num_groups + $1.to_i + 1) : $1
105
+ @current_position += (Regexp.last_match(1).length + 2)
106
+ group_id = if Regexp.last_match(1).to_i < 0
107
+ # RELATIVE group number, e.g. /(a)(b)(c)(d) \k<-2>/
108
+ @num_groups + Regexp.last_match(1).to_i + 1
109
+ else
110
+ Regexp.last_match(1)
111
+ end
110
112
  group = parse_backreference_group(group_id)
111
113
  when BackslashCharMap.keys.include?(next_char)
112
114
  group = CharGroup.new(
@@ -114,32 +116,39 @@ module RegexpExamples
114
116
  @ignorecase
115
117
  )
116
118
  when rest_of_string =~ /\A(c|C-)(.)/ # Control character
117
- @current_position += $1.length
118
- group = parse_single_char_group( parse_control_character($2) )
119
+ @current_position += Regexp.last_match(1).length
120
+ group = parse_single_char_group(parse_control_character(Regexp.last_match(2)))
119
121
  when rest_of_string =~ /\Ax(\h{1,2})/ # Escape sequence
120
- @current_position += $1.length
121
- group = parse_single_char_group( parse_escape_sequence($1) )
122
+ @current_position += Regexp.last_match(1).length
123
+ group = parse_single_char_group(parse_escape_sequence(Regexp.last_match(1)))
122
124
  when rest_of_string =~ /\Au(\h{4}|\{\h{1,4}\})/ # Unicode sequence
123
- @current_position += $1.length
124
- sequence = $1.match(/\h{1,4}/)[0] # Strip off "{" and "}"
125
- group = parse_single_char_group( parse_unicode_sequence(sequence) )
125
+ @current_position += Regexp.last_match(1).length
126
+ sequence = Regexp.last_match(1).match(/\h{1,4}/)[0] # Strip off "{" and "}"
127
+ group = parse_single_char_group(parse_unicode_sequence(sequence))
126
128
  when rest_of_string =~ /\A(p)\{(\^?)([^}]+)\}/i # Named properties
127
- @current_position += ($2.length + $3.length + 2)
128
- is_negative = ($1 == "P") ^ ($2 == "^") # Beware of double negatives! E.g. /\P{^Space}/
129
+ @current_position += (Regexp.last_match(2).length + # 0 or 1, if '^' is present
130
+ Regexp.last_match(3).length + # Length of the property name
131
+ 2) # Length of opening and closing brackets (always 2)
132
+ # Beware of double negatives! E.g. /\P{^Space}/
133
+ is_negative = (Regexp.last_match(1) == 'P') ^ (Regexp.last_match(2) == '^')
129
134
  group = CharGroup.new(
130
135
  if is_negative
131
- CharSets::Any.dup - NamedPropertyCharMap[$3.downcase]
136
+ CharSets::Any.dup - NamedPropertyCharMap[Regexp.last_match(3).downcase]
132
137
  else
133
- NamedPropertyCharMap[$3.downcase]
138
+ NamedPropertyCharMap[Regexp.last_match(3).downcase]
134
139
  end,
135
140
  @ignorecase
136
141
  )
137
142
  when next_char == 'K' # Keep (special lookbehind that CAN be supported safely!)
138
143
  group = PlaceHolderGroup.new
139
144
  when next_char == 'R' # Linebreak
140
- group = CharGroup.new(["\r\n", "\n", "\v", "\f", "\r"], @ignorecase) # A bit hacky...
145
+ group = CharGroup.new(
146
+ ["\r\n", "\n", "\v", "\f", "\r"],
147
+ @ignorecase
148
+ ) # Using "\r\n" as one character is a little bit hacky...
141
149
  when next_char == 'g' # Subexpression call
142
- raise IllegalSyntaxError, "Subexpression calls (\\g) cannot be supported, as they are not regular"
150
+ fail IllegalSyntaxError,
151
+ 'Subexpression calls (\\g) cannot be supported, as they are not regular'
143
152
  when next_char =~ /[bB]/ # Anchors
144
153
  raise_anchors_exception!
145
154
  when next_char =~ /[AG]/ # Start of string
@@ -155,7 +164,7 @@ module RegexpExamples
155
164
  raise_anchors_exception!
156
165
  end
157
166
  else
158
- group = parse_single_char_group( next_char )
167
+ group = parse_single_char_group(next_char)
159
168
  end
160
169
  group
161
170
  end
@@ -193,7 +202,7 @@ module RegexpExamples
193
202
  comment_group = rest_of_string.match(/.*?[^\\](?:\\{2})*\)/)[0]
194
203
  @current_position += comment_group.length
195
204
  when match[2] =~ /\A(?=[mix-]+)([mix]*)-?([mix]*)/ # e.g. /(?i-mx)/
196
- regexp_options_toggle($1, $2)
205
+ regexp_options_toggle(Regexp.last_match(1), Regexp.last_match(2))
197
206
  @num_groups -= 1 # Toggle "groups" should not increase backref group count
198
207
  @current_position += $&.length + 1
199
208
  if next_char == ':' # e.g. /(?i:subexpr)/
@@ -202,9 +211,11 @@ module RegexpExamples
202
211
  return PlaceHolderGroup.new
203
212
  end
204
213
  when %w(! =).include?(match[2]) # e.g. /(?=lookahead)/, /(?!neglookahead)/
205
- raise IllegalSyntaxError, "Lookaheads are not regular; cannot generate examples"
214
+ fail IllegalSyntaxError,
215
+ 'Lookaheads are not regular; cannot generate examples'
206
216
  when %w(! =).include?(match[3]) # e.g. /(?<=lookbehind)/, /(?<!neglookbehind)/
207
- raise IllegalSyntaxError, "Lookbehinds are not regular; cannot generate examples"
217
+ fail IllegalSyntaxError,
218
+ 'Lookbehinds are not regular; cannot generate examples'
208
219
  else # e.g. /(?<name>namedgroup)/
209
220
  @current_position += (match[3].length + 3)
210
221
  group_id = match[3]
@@ -226,12 +237,12 @@ module RegexpExamples
226
237
  end
227
238
 
228
239
  def regexp_options_toggle(on, off)
229
- @ignorecase = true if (on.include? "i")
230
- @ignorecase = false if (off.include? "i")
231
- @multiline = true if (on.include? "m")
232
- @multiline = false if (off.include? "m")
233
- @extended = true if (on.include? "x")
234
- @extended = false if (off.include? "x")
240
+ @ignorecase = true if on.include? 'i'
241
+ @ignorecase = false if off.include? 'i'
242
+ @multiline = true if on.include? 'm'
243
+ @multiline = false if off.include? 'm'
244
+ @extended = true if on.include? 'x'
245
+ @extended = false if off.include? 'x'
235
246
  end
236
247
 
237
248
  def parse_char_group
@@ -252,7 +263,6 @@ module RegexpExamples
252
263
  OrGroup.new(left_repeaters, right_repeaters)
253
264
  end
254
265
 
255
-
256
266
  def parse_single_char_group(char)
257
267
  SingleCharGroup.new(char, @ignorecase)
258
268
  end
@@ -310,17 +320,18 @@ module RegexpExamples
310
320
  end
311
321
 
312
322
  def parse_reluctant_or_possessive_range_repeater(repeater, min, has_comma, max)
313
- # .{1}? should be equivalent to (?:.{1})?, i.e. NOT a "non-greedy quantifier"
314
- if min && !has_comma && !max && next_char == "?"
315
- repeater = parse_question_mark_repeater(repeater)
316
- else
317
- parse_reluctant_or_possessive_repeater
318
- end
319
- repeater
323
+ # .{1}? should be equivalent to (?:.{1})?, i.e. NOT a "non-greedy quantifier"
324
+ if min && !has_comma && !max && next_char == '?'
325
+ repeater = parse_question_mark_repeater(repeater)
326
+ else
327
+ parse_reluctant_or_possessive_repeater
328
+ end
329
+ repeater
320
330
  end
321
331
 
322
332
  def raise_anchors_exception!
323
- raise IllegalSyntaxError, "Anchors ('#{next_char}') cannot be supported, as they are not regular"
333
+ fail IllegalSyntaxError,
334
+ "Anchors ('#{next_char}') cannot be supported, as they are not regular"
324
335
  end
325
336
 
326
337
  def parse_one_time_repeater(group)
@@ -336,8 +347,7 @@ module RegexpExamples
336
347
  end
337
348
 
338
349
  def end_of_regexp
339
- next_char == ")" || @current_position >= regexp_string.length
350
+ next_char == ')' || @current_position >= regexp_string.length
340
351
  end
341
352
  end
342
353
  end
343
-
@@ -6,11 +6,11 @@ module RegexpExamples
6
6
  end
7
7
 
8
8
  def result
9
- group_results = group.result[0 .. RegexpExamples.MaxGroupResults-1]
9
+ group_results = group.result.first(RegexpExamples.MaxGroupResults)
10
10
  results = []
11
11
  min_repeats.upto(max_repeats) do |repeats|
12
12
  if repeats.zero?
13
- results << [ GroupResult.new('') ]
13
+ results << [GroupResult.new('')]
14
14
  else
15
15
  results << RegexpExamples.permutations_of_strings(
16
16
  [group_results] * repeats
@@ -23,8 +23,8 @@ module RegexpExamples
23
23
  def random_result
24
24
  result = []
25
25
  rand(min_repeats..max_repeats).times { result << group.random_result }
26
- result << [ GroupResult.new('') ] if result.empty? # in case of 0.times
27
- RegexpExamples::permutations_of_strings(result)
26
+ result << [GroupResult.new('')] if result.empty? # in case of 0.times
27
+ RegexpExamples.permutations_of_strings(result)
28
28
  end
29
29
  end
30
30
 
@@ -74,9 +74,9 @@ module RegexpExamples
74
74
  end
75
75
 
76
76
  private
77
+
77
78
  def smallest(x, y)
78
79
  (x < y) ? x : y
79
80
  end
80
81
  end
81
82
  end
82
-
@@ -0,0 +1,45 @@
1
+ require 'pstore'
2
+
3
module RegexpExamples
  # Reads the pre-computed character lists for named unicode properties
  # from a PStore file in the db/ directory.
  class UnicodeCharRanges
    # These values were generated by: scripts/unicode_lister.rb
    # Note: Only the first 128 results are listed, for performance.
    # Also, some groups seem to have no matches (weird!)
    # (Don't care about ruby micro version number)
    STORE_FILENAME = "unicode_ranges_#{RUBY_VERSION[0..2]}.pstore"

    attr_reader :range_store

    # filename - name of the PStore file, resolved relative to the db/ directory.
    def initialize(filename = STORE_FILENAME)
      @range_store = PStore.new(File.expand_path("../../../db/#{filename}", __FILE__))
    end

    # Looks up +key+ inside a read-only transaction and returns the stored
    # code points expanded into an Array of one-character Strings.
    def get(key)
      range_store.transaction(true) do
        ranges_to_unicode(range_store[key])
      end
    end

    alias_method :[], :get

    private

    # Expands the compact stored form into individual characters.
    # Example input:  [65, 97..99]
    # Example output: ["A", "a", "b", "c"]
    def ranges_to_unicode(ranges)
      codepoints = ranges.flat_map do |range|
        # A bare Integer stands for a one-element range (small hack to
        # increase data compression). Integer rather than Fixnum: Fixnum was
        # deprecated in Ruby 2.4 and removed in 3.2.
        range.is_a?(Integer) ? [range] : range.to_a
      end
      codepoints.map { |codepoint| codepoint_to_unicode(codepoint) }
    end

    # Encodes one code point as a UTF-8 String, without going through eval.
    def codepoint_to_unicode(codepoint)
      [codepoint].pack('U')
    end
  end
end
@@ -1,3 +1,3 @@
1
1
  module RegexpExamples
2
- VERSION = '1.1.0'
2
+ VERSION = '1.1.2'
3
3
  end
@@ -1,4 +1,4 @@
1
- require File.expand_path("../lib/regexp-examples/version", __FILE__)
1
+ require File.expand_path('../lib/regexp-examples/version', __FILE__)
2
2
 
3
3
  Gem::Specification.new do |s|
4
4
  s.name = 'regexp-examples'
@@ -11,11 +11,11 @@ Gem::Specification.new do |s|
11
11
  s.files = `git ls-files -z`.split("\x0")
12
12
  s.executables = s.files.grep(%r{^bin/}) { |f| File.basename(f) }
13
13
  s.test_files = s.files.grep(%r{^(test|spec|features)/})
14
- s.require_paths = ["lib"]
14
+ s.require_paths = ['lib']
15
15
  s.homepage =
16
16
  'http://rubygems.org/gems/regexp-examples'
17
- s.add_development_dependency "bundler", "~> 1.7"
18
- s.add_development_dependency "rake", "~> 10.0"
17
+ s.add_development_dependency 'bundler', '~> 1.7'
18
+ s.add_development_dependency 'rake', '~> 10.0'
19
19
  s.license = 'MIT'
20
20
  s.required_ruby_version = '>= 2.0.0'
21
21
  end
@@ -1,180 +1,64 @@
1
+ require 'pstore'
2
+ require_relative '../lib/regexp-examples/unicode_char_ranges'
1
3
  # A script to generate lists of all unicode characters
2
4
  # that match all named group/character properties regexps.
3
5
  # For use in e.g. /\p{Arabic}/.examples
4
6
 
5
7
  # To (re-)generate this list, simply run this file!
6
8
  # > ruby scripts/unicode_lister.rb
7
- OutputFilename = 'unicode_result'
8
9
 
9
10
  # Taken from ruby documentation:
10
11
  # http://ruby-doc.org//core-2.2.0/Regexp.html#class-Regexp-label-Character+Properties
11
12
  NamedGroups = %w(
12
- Alnum
13
- Alpha
14
- Blank
15
- Cntrl
16
- Digit
17
- Graph
18
- Lower
19
- Print
20
- Punct
21
- Space
22
- Upper
23
- XDigit
24
- Word
25
- ASCII
26
- Any
27
- Assigned
13
+ Alnum Alpha Blank Cntrl Digit Graph Lower Print Punct Space Upper XDigit Word ASCII Any Assigned
28
14
 
29
- L
30
- Ll
31
- Lm
32
- Lo
33
- Lt
34
- Lu
35
- M
36
- Mn
37
- Mc
38
- Me
39
- N
40
- Nd
41
- Nl
42
- No
43
- P
44
- Pc
45
- Pd
46
- Ps
47
- Pe
48
- Pi
49
- Pf
50
- Po
51
- S
52
- Sm
53
- Sc
54
- Sk
55
- So
56
- Z
57
- Zs
58
- Zl
59
- Zp
60
- C
61
- Cc
62
- Cf
63
- Cn
64
- Co
65
- Cs
15
+ L Ll Lm Lo Lt Lu M Mn Mc Me N Nd Nl No P Pc Pd Ps Pe Pi Pf Po S Sm Sc Sk So Z Zs Zl Zp C Cc Cf Cn Co Cs
66
16
 
67
- Arabic
68
- Armenian
69
- Balinese
70
- Bengali
71
- Bopomofo
72
- Braille
73
- Buginese
74
- Buhid
75
- Canadian_Aboriginal
76
- Carian
77
- Cham
78
- Cherokee
79
- Common
80
- Coptic
81
- Cuneiform
82
- Cypriot
83
- Cyrillic
84
- Deseret
85
- Devanagari
86
- Ethiopic
87
- Georgian
88
- Glagolitic
89
- Gothic
90
- Greek
91
- Gujarati
92
- Gurmukhi
93
- Han
94
- Hangul
95
- Hanunoo
96
- Hebrew
97
- Hiragana
98
- Inherited
99
- Kannada
100
- Katakana
101
- Kayah_Li
102
- Kharoshthi
103
- Khmer
104
- Lao
105
- Latin
106
- Lepcha
107
- Limbu
108
- Linear_B
109
- Lycian
110
- Lydian
111
- Malayalam
112
- Mongolian
113
- Myanmar
114
- New_Tai_Lue
115
- Nko
116
- Ogham
117
- Ol_Chiki
118
- Old_Italic
119
- Old_Persian
120
- Oriya
121
- Osmanya
122
- Phags_Pa
123
- Phoenician
124
- Rejang
125
- Runic
126
- Saurashtra
127
- Shavian
128
- Sinhala
129
- Sundanese
130
- Syloti_Nagri
131
- Syriac
132
- Tagalog
133
- Tagbanwa
134
- Tai_Le
135
- Tamil
136
- Telugu
137
- Thaana
138
- Thai
139
- Tibetan
140
- Tifinagh
141
- Ugaritic
142
- Vai
143
- Yi
17
+ Arabic Armenian Balinese Bengali Bopomofo Braille Buginese Buhid Canadian_Aboriginal Carian Cham Cherokee
18
+ Common Coptic Cuneiform Cypriot Cyrillic Deseret Devanagari Ethiopic Georgian Glagolitic Gothic Greek
19
+ Gujarati Gurmukhi Han Hangul Hanunoo Hebrew Hiragana Inherited Kannada Katakana Kayah_Li Kharoshthi Khmer
20
+ Lao Latin Lepcha Limbu Linear_B Lycian Lydian Malayalam Mongolian Myanmar New_Tai_Lue Nko Ogham Ol_Chiki
21
+ Old_Italic Old_Persian Oriya Osmanya Phags_Pa Phoenician Rejang Runic Saurashtra Shavian Sinhala Sundanese
22
+ Syloti_Nagri Syriac Tagalog Tagbanwa Tai_Le Tamil Telugu Thaana Thai Tibetan Tifinagh Ugaritic Vai Yi
144
23
  )
145
24
 
146
- # Note: For some reason, a character encoding-related exception gets raised
147
- # when I do `/regex/ =~ eval("?\\u{#{x.to_s(16)}}")` in the range: 55296..57343
148
- # This means my calculation is MISSING results in the range: 55296..65535
149
- # However, for the sake of performance, I'm also being "lazy" and only calculating/saving
150
- # the first 128 matches anyway!
151
- # If anyone ever cares about this (I doubt it), I'll look into fixing/improving it.
25
+ # Note: For the range 55296..57343, these are reserved values that are not legal
26
+ # unicode characters.
27
+ # I.e. a character encoding-related exception gets raised when you do:
28
+ # `/regex/ =~ eval("?\\u{#{x.to_s(16)}}")`
29
+ # TODO: Add a link to somewhere that explains this better.
152
30
 
153
- # Example input: [1, 2, 3, 4, 6, 7, 12, 14] (Array)
154
- # Example output: "1..4, 6..7, 12, 14" (String)
31
+ # "Compresses" the values in an array by using ranges.
32
+ # Example input: [1, 2, 3, 4, 6, 7, 12, 14]
33
+ # Example output: [1..4, 6..7, 12, 14]
155
34
  def calculate_ranges(matching_codes)
156
- return "" if matching_codes.empty?
35
+ return [] if matching_codes.empty?
157
36
  first = matching_codes.shift
158
- matching_codes.inject([first..first]) do |r,x|
37
+ matching_codes.inject([first..first]) do |r, x|
159
38
  if r.last.last.succ != x
160
39
  r << (x..x) # Start new range
161
40
  else
162
41
  r[0..-2] << (r.last.first..x) # Update last range
163
42
  end
164
43
  end
165
- .map { |range| range.size == 1 ? range.first : range}
166
- .join(", ")
44
+ .map { |range| range.size == 1 ? range.first : range } # Replace `int..int` with `int`
167
45
  end
168
46
 
169
47
  count = 0
170
- File.open(OutputFilename, 'w') do |f|
48
+ filename = RegexpExamples::UnicodeCharRanges::STORE_FILENAME
49
+ store = PStore.new(filename)
50
+ store.transaction do
171
51
  NamedGroups.each do |name|
172
- count += 1
173
- matching_codes = (0..55295).lazy.select { |x| /\p{#{name}}/ =~ eval("?\\u{#{x.to_s(16)}}") }.first(128)
174
- f.puts "'#{name.downcase}' => ranges_to_unicode(#{calculate_ranges(matching_codes)}),"
52
+ count += 1
53
+ # Only generating first 128 matches, for performance...
54
+ # (I have tried this with generating ALL examples, and it makes the ruby gem
55
+ # painfully slow and bloated... Especially the test suite.)
56
+ matching_codes = [(0..55_295), (57_344..65_535)].map(&:to_a).flatten.lazy
57
+ .find { |x| /\p{#{name}}/ =~ eval("?\\u{#{x.to_s(16)}}") }
58
+ (128)
59
+ store[name.downcase] = calculate_ranges(matching_codes)
175
60
  puts "(#{count}/#{NamedGroups.length}) Finished property: #{name}"
176
61
  end
177
- puts "*"*50
178
- puts "Finished! Result stored in: #{OutputFilename}"
62
+ puts '*' * 50
63
+ puts "Finished! Result stored in: ./db/#{filename}"
179
64
  end
180
-