regexp_parser 1.7.0 → 2.8.3

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
Files changed (165)
  1. checksums.yaml +4 -4
  2. data/Gemfile +8 -2
  3. data/LICENSE +1 -1
  4. data/Rakefile +6 -70
  5. data/lib/regexp_parser/error.rb +4 -0
  6. data/lib/regexp_parser/expression/base.rb +76 -0
  7. data/lib/regexp_parser/expression/classes/alternation.rb +1 -1
  8. data/lib/regexp_parser/expression/classes/anchor.rb +0 -2
  9. data/lib/regexp_parser/expression/classes/{backref.rb → backreference.rb} +22 -2
  10. data/lib/regexp_parser/expression/classes/{set → character_set}/range.rb +4 -8
  11. data/lib/regexp_parser/expression/classes/{set.rb → character_set.rb} +3 -4
  12. data/lib/regexp_parser/expression/classes/{type.rb → character_type.rb} +0 -2
  13. data/lib/regexp_parser/expression/classes/conditional.rb +11 -5
  14. data/lib/regexp_parser/expression/classes/{escape.rb → escape_sequence.rb} +15 -7
  15. data/lib/regexp_parser/expression/classes/free_space.rb +5 -5
  16. data/lib/regexp_parser/expression/classes/group.rb +28 -15
  17. data/lib/regexp_parser/expression/classes/keep.rb +2 -0
  18. data/lib/regexp_parser/expression/classes/literal.rb +1 -5
  19. data/lib/regexp_parser/expression/classes/posix_class.rb +5 -1
  20. data/lib/regexp_parser/expression/classes/root.rb +4 -19
  21. data/lib/regexp_parser/expression/classes/{property.rb → unicode_property.rb} +5 -3
  22. data/lib/regexp_parser/expression/methods/construct.rb +41 -0
  23. data/lib/regexp_parser/expression/methods/human_name.rb +43 -0
  24. data/lib/regexp_parser/expression/methods/match_length.rb +11 -7
  25. data/lib/regexp_parser/expression/methods/parts.rb +23 -0
  26. data/lib/regexp_parser/expression/methods/printing.rb +26 -0
  27. data/lib/regexp_parser/expression/methods/strfregexp.rb +1 -1
  28. data/lib/regexp_parser/expression/methods/tests.rb +47 -1
  29. data/lib/regexp_parser/expression/methods/traverse.rb +34 -18
  30. data/lib/regexp_parser/expression/quantifier.rb +57 -17
  31. data/lib/regexp_parser/expression/sequence.rb +11 -47
  32. data/lib/regexp_parser/expression/sequence_operation.rb +4 -9
  33. data/lib/regexp_parser/expression/shared.rb +111 -0
  34. data/lib/regexp_parser/expression/subexpression.rb +27 -19
  35. data/lib/regexp_parser/expression.rb +14 -141
  36. data/lib/regexp_parser/lexer.rb +83 -41
  37. data/lib/regexp_parser/parser.rb +371 -429
  38. data/lib/regexp_parser/scanner/char_type.rl +11 -11
  39. data/lib/regexp_parser/scanner/errors/premature_end_error.rb +8 -0
  40. data/lib/regexp_parser/scanner/errors/scanner_error.rb +6 -0
  41. data/lib/regexp_parser/scanner/errors/validation_error.rb +63 -0
  42. data/lib/regexp_parser/scanner/properties/long.csv +633 -0
  43. data/lib/regexp_parser/scanner/properties/short.csv +248 -0
  44. data/lib/regexp_parser/scanner/property.rl +4 -4
  45. data/lib/regexp_parser/scanner/scanner.rl +303 -368
  46. data/lib/regexp_parser/scanner.rb +1423 -1674
  47. data/lib/regexp_parser/syntax/any.rb +2 -7
  48. data/lib/regexp_parser/syntax/base.rb +92 -67
  49. data/lib/regexp_parser/syntax/token/anchor.rb +15 -0
  50. data/lib/regexp_parser/syntax/{tokens → token}/assertion.rb +2 -2
  51. data/lib/regexp_parser/syntax/token/backreference.rb +33 -0
  52. data/lib/regexp_parser/syntax/token/character_set.rb +16 -0
  53. data/lib/regexp_parser/syntax/{tokens → token}/character_type.rb +3 -3
  54. data/lib/regexp_parser/syntax/{tokens → token}/conditional.rb +3 -3
  55. data/lib/regexp_parser/syntax/token/escape.rb +33 -0
  56. data/lib/regexp_parser/syntax/{tokens → token}/group.rb +7 -7
  57. data/lib/regexp_parser/syntax/{tokens → token}/keep.rb +1 -1
  58. data/lib/regexp_parser/syntax/token/meta.rb +20 -0
  59. data/lib/regexp_parser/syntax/{tokens → token}/posix_class.rb +3 -3
  60. data/lib/regexp_parser/syntax/token/quantifier.rb +35 -0
  61. data/lib/regexp_parser/syntax/token/unicode_property.rb +733 -0
  62. data/lib/regexp_parser/syntax/token/virtual.rb +11 -0
  63. data/lib/regexp_parser/syntax/token.rb +45 -0
  64. data/lib/regexp_parser/syntax/version_lookup.rb +19 -36
  65. data/lib/regexp_parser/syntax/versions/1.8.6.rb +13 -20
  66. data/lib/regexp_parser/syntax/versions/1.9.1.rb +10 -17
  67. data/lib/regexp_parser/syntax/versions/1.9.3.rb +3 -10
  68. data/lib/regexp_parser/syntax/versions/2.0.0.rb +8 -15
  69. data/lib/regexp_parser/syntax/versions/2.2.0.rb +3 -9
  70. data/lib/regexp_parser/syntax/versions/2.3.0.rb +3 -9
  71. data/lib/regexp_parser/syntax/versions/2.4.0.rb +3 -9
  72. data/lib/regexp_parser/syntax/versions/2.4.1.rb +2 -8
  73. data/lib/regexp_parser/syntax/versions/2.5.0.rb +3 -9
  74. data/lib/regexp_parser/syntax/versions/2.6.0.rb +3 -9
  75. data/lib/regexp_parser/syntax/versions/2.6.2.rb +3 -9
  76. data/lib/regexp_parser/syntax/versions/2.6.3.rb +3 -9
  77. data/lib/regexp_parser/syntax/versions/3.1.0.rb +4 -0
  78. data/lib/regexp_parser/syntax/versions/3.2.0.rb +4 -0
  79. data/lib/regexp_parser/syntax/versions.rb +3 -1
  80. data/lib/regexp_parser/syntax.rb +8 -6
  81. data/lib/regexp_parser/token.rb +9 -20
  82. data/lib/regexp_parser/version.rb +1 -1
  83. data/lib/regexp_parser.rb +0 -2
  84. data/regexp_parser.gemspec +19 -23
  85. metadata +52 -171
  86. data/CHANGELOG.md +0 -349
  87. data/README.md +0 -470
  88. data/lib/regexp_parser/scanner/properties/long.yml +0 -594
  89. data/lib/regexp_parser/scanner/properties/short.yml +0 -237
  90. data/lib/regexp_parser/syntax/tokens/anchor.rb +0 -15
  91. data/lib/regexp_parser/syntax/tokens/backref.rb +0 -24
  92. data/lib/regexp_parser/syntax/tokens/character_set.rb +0 -13
  93. data/lib/regexp_parser/syntax/tokens/escape.rb +0 -30
  94. data/lib/regexp_parser/syntax/tokens/meta.rb +0 -13
  95. data/lib/regexp_parser/syntax/tokens/quantifier.rb +0 -35
  96. data/lib/regexp_parser/syntax/tokens/unicode_property.rb +0 -675
  97. data/lib/regexp_parser/syntax/tokens.rb +0 -45
  98. data/spec/expression/base_spec.rb +0 -94
  99. data/spec/expression/clone_spec.rb +0 -120
  100. data/spec/expression/conditional_spec.rb +0 -89
  101. data/spec/expression/free_space_spec.rb +0 -27
  102. data/spec/expression/methods/match_length_spec.rb +0 -161
  103. data/spec/expression/methods/match_spec.rb +0 -25
  104. data/spec/expression/methods/strfregexp_spec.rb +0 -224
  105. data/spec/expression/methods/tests_spec.rb +0 -99
  106. data/spec/expression/methods/traverse_spec.rb +0 -161
  107. data/spec/expression/options_spec.rb +0 -128
  108. data/spec/expression/root_spec.rb +0 -9
  109. data/spec/expression/sequence_spec.rb +0 -9
  110. data/spec/expression/subexpression_spec.rb +0 -50
  111. data/spec/expression/to_h_spec.rb +0 -26
  112. data/spec/expression/to_s_spec.rb +0 -100
  113. data/spec/lexer/all_spec.rb +0 -22
  114. data/spec/lexer/conditionals_spec.rb +0 -53
  115. data/spec/lexer/escapes_spec.rb +0 -14
  116. data/spec/lexer/keep_spec.rb +0 -10
  117. data/spec/lexer/literals_spec.rb +0 -89
  118. data/spec/lexer/nesting_spec.rb +0 -99
  119. data/spec/lexer/refcalls_spec.rb +0 -55
  120. data/spec/parser/all_spec.rb +0 -43
  121. data/spec/parser/alternation_spec.rb +0 -88
  122. data/spec/parser/anchors_spec.rb +0 -17
  123. data/spec/parser/conditionals_spec.rb +0 -179
  124. data/spec/parser/errors_spec.rb +0 -30
  125. data/spec/parser/escapes_spec.rb +0 -121
  126. data/spec/parser/free_space_spec.rb +0 -130
  127. data/spec/parser/groups_spec.rb +0 -108
  128. data/spec/parser/keep_spec.rb +0 -6
  129. data/spec/parser/posix_classes_spec.rb +0 -8
  130. data/spec/parser/properties_spec.rb +0 -115
  131. data/spec/parser/quantifiers_spec.rb +0 -51
  132. data/spec/parser/refcalls_spec.rb +0 -112
  133. data/spec/parser/set/intersections_spec.rb +0 -127
  134. data/spec/parser/set/ranges_spec.rb +0 -111
  135. data/spec/parser/sets_spec.rb +0 -178
  136. data/spec/parser/types_spec.rb +0 -18
  137. data/spec/scanner/all_spec.rb +0 -18
  138. data/spec/scanner/anchors_spec.rb +0 -21
  139. data/spec/scanner/conditionals_spec.rb +0 -128
  140. data/spec/scanner/errors_spec.rb +0 -68
  141. data/spec/scanner/escapes_spec.rb +0 -53
  142. data/spec/scanner/free_space_spec.rb +0 -133
  143. data/spec/scanner/groups_spec.rb +0 -52
  144. data/spec/scanner/keep_spec.rb +0 -10
  145. data/spec/scanner/literals_spec.rb +0 -49
  146. data/spec/scanner/meta_spec.rb +0 -18
  147. data/spec/scanner/properties_spec.rb +0 -64
  148. data/spec/scanner/quantifiers_spec.rb +0 -20
  149. data/spec/scanner/refcalls_spec.rb +0 -36
  150. data/spec/scanner/sets_spec.rb +0 -102
  151. data/spec/scanner/types_spec.rb +0 -14
  152. data/spec/spec_helper.rb +0 -15
  153. data/spec/support/runner.rb +0 -42
  154. data/spec/support/shared_examples.rb +0 -77
  155. data/spec/support/warning_extractor.rb +0 -60
  156. data/spec/syntax/syntax_spec.rb +0 -48
  157. data/spec/syntax/syntax_token_map_spec.rb +0 -23
  158. data/spec/syntax/versions/1.8.6_spec.rb +0 -17
  159. data/spec/syntax/versions/1.9.1_spec.rb +0 -10
  160. data/spec/syntax/versions/1.9.3_spec.rb +0 -9
  161. data/spec/syntax/versions/2.0.0_spec.rb +0 -13
  162. data/spec/syntax/versions/2.2.0_spec.rb +0 -9
  163. data/spec/syntax/versions/aliases_spec.rb +0 -37
  164. data/spec/token/token_spec.rb +0 -85
  165. data/lib/regexp_parser/expression/classes/{set → character_set}/intersection.rb +0 -0
@@ -3,6 +3,11 @@
3
3
  include re_char_type "char_type.rl";
4
4
  include re_property "property.rl";
5
5
 
6
+ utf8_2_byte = (0xc2..0xdf 0x80..0xbf);
7
+ utf8_3_byte = (0xe0..0xef 0x80..0xbf 0x80..0xbf);
8
+ utf8_4_byte = (0xf0..0xf4 0x80..0xbf 0x80..0xbf 0x80..0xbf);
9
+ utf8_multibyte = utf8_2_byte | utf8_3_byte | utf8_4_byte;
10
+
6
11
  dot = '.';
7
12
  backslash = '\\';
8
13
  alternation = '|';
@@ -15,26 +20,15 @@
15
20
 
16
21
  group_open = '(';
17
22
  group_close = ')';
18
- parantheses = group_open | group_close;
23
+ parentheses = group_open | group_close;
19
24
 
20
25
  set_open = '[';
21
26
  set_close = ']';
22
27
  brackets = set_open | set_close;
23
28
 
24
- comment = ('#' . [^\n]* . '\n');
25
-
26
- class_name_posix = 'alnum' | 'alpha' | 'blank' |
27
- 'cntrl' | 'digit' | 'graph' |
28
- 'lower' | 'print' | 'punct' |
29
- 'space' | 'upper' | 'xdigit' |
30
- 'word' | 'ascii';
31
-
32
- class_posix = ('[:' . '^'? . class_name_posix . ':]');
33
-
29
+ comment = ('#' . [^\n]* . '\n'?);
34
30
 
35
- # these are not supported in ruby, and need verification
36
- collating_sequence = '[.' . (alpha | [\-])+ . '.]';
37
- character_equivalent = '[=' . alpha . '=]';
31
+ class_posix = ('[:' . '^'? . [^\[\]]* . ':]');
38
32
 
39
33
  line_anchor = beginning_of_line | end_of_line;
40
34
  anchor_char = [AbBzZG];
@@ -53,21 +47,20 @@
53
47
 
54
48
  meta_sequence = 'M-' . (backslash . ('c' | 'C-'))? . backslash? . any;
55
49
 
50
+ sequence_char = [CMcux];
51
+
56
52
  zero_or_one = '?' | '??' | '?+';
57
53
  zero_or_more = '*' | '*?' | '*+';
58
54
  one_or_more = '+' | '+?' | '++';
59
55
 
60
56
  quantifier_greedy = '?' | '*' | '+';
61
- quantifier_reluctant = '??' | '*?' | '+?';
62
- quantifier_possessive = '?+' | '*+' | '++';
63
- quantifier_mode = '?' | '+';
64
-
65
- quantifier_interval = range_open . (digit+)? . ','? . (digit+)? .
66
- range_close . quantifier_mode?;
67
-
68
- quantifiers = quantifier_greedy | quantifier_reluctant |
69
- quantifier_possessive | quantifier_interval;
70
57
 
58
+ quantity_exact = (digit+);
59
+ quantity_minimum = (digit+) . ',';
60
+ quantity_maximum = ',' . (digit+);
61
+ quantity_range = (digit+) . ',' . (digit+);
62
+ quantifier_interval = range_open . ( quantity_exact | quantity_minimum |
63
+ quantity_maximum | quantity_range ) . range_close;
71
64
 
72
65
  conditional = '(?(';
73
66
 
@@ -85,22 +78,22 @@
85
78
  # try to treat every other group head as options group, like Ruby
86
79
  group_options = '?' . ( [^!#'():<=>~]+ . ':'? ) ?;
87
80
 
88
- group_ref = [gk];
89
- group_name_char = (alnum | '_');
90
- group_name_id = (group_name_char . (group_name_char+)?)?;
91
- group_number = '-'? . [1-9] . ([0-9]+)?;
81
+ group_name_id_ab = ([^!0-9\->] | utf8_multibyte) . ([^>] | utf8_multibyte)*;
82
+ group_name_id_sq = ([^0-9\-'] | utf8_multibyte) . ([^'] | utf8_multibyte)*;
83
+ group_number = '-'? . [0-9]+;
92
84
  group_level = [+\-] . [0-9]+;
93
85
 
94
- group_name = ('<' . group_name_id . '>') | ("'" . group_name_id . "'");
86
+ group_name = ('<' . group_name_id_ab? . '>') |
87
+ ("'" . group_name_id_sq? . "'");
95
88
  group_lookup = group_name | group_number;
96
89
 
97
90
  group_named = ('?' . group_name );
98
91
 
99
- group_name_ref = group_ref . (('<' . group_name_id . group_level? '>') |
100
- ("'" . group_name_id . group_level? "'"));
92
+ group_ref_body = (('<' . (group_name_id_ab? | group_number) . group_level? '>') |
93
+ ("'" . (group_name_id_sq? | group_number) . group_level? "'"));
101
94
 
102
- group_number_ref = group_ref . (('<' . group_number . group_level? '>') |
103
- ("'" . group_number . group_level? "'"));
95
+ group_ref = 'k' . group_ref_body;
96
+ group_call = 'g' . group_ref_body;
104
97
 
105
98
  group_type = group_atomic | group_passive | group_absence | group_named;
106
99
 
@@ -111,32 +104,33 @@
111
104
 
112
105
  # characters that 'break' a literal
113
106
  meta_char = dot | backslash | alternation |
114
- curlies | parantheses | brackets |
107
+ curlies | parentheses | brackets |
115
108
  line_anchor | quantifier_greedy;
116
109
 
117
- ascii_print = ((0x20..0x7e) - meta_char);
118
- ascii_nonprint = (0x01..0x1f | 0x7f);
110
+ literal_delimiters = ']' | '}';
119
111
 
120
- utf8_2_byte = (0xc2..0xdf 0x80..0xbf);
121
- utf8_3_byte = (0xe0..0xef 0x80..0xbf 0x80..0xbf);
122
- utf8_4_byte = (0xf0..0xf4 0x80..0xbf 0x80..0xbf 0x80..0xbf);
112
+ ascii_print = ((0x20..0x7e) - meta_char - '#');
113
+ ascii_nonprint = (0x01..0x1f | 0x7f);
123
114
 
124
115
  non_literal_escape = char_type_char | anchor_char | escaped_ascii |
125
- group_ref | keep_mark | [xucCM];
116
+ keep_mark | sequence_char;
117
+
118
+ # escapes that also work within a character set
119
+ set_escape = backslash | brackets | escaped_ascii |
120
+ octal_sequence | property_char |
121
+ sequence_char | single_codepoint_char_type;
126
122
 
127
- non_set_escape = (anchor_char - 'b') | group_ref | keep_mark |
128
- multi_codepoint_char_type | [0-9cCM];
129
123
 
130
124
  # EOF error, used where it can be detected
131
125
  action premature_end_error {
132
- text = ts ? copy(data, ts-1..-1) : data.pack('c*')
133
- raise PrematureEndError.new( text )
126
+ text = copy(data, ts ? ts-1 : 0, -1)
127
+ raise PrematureEndError.new(text)
134
128
  }
135
129
 
136
130
  # Invalid sequence error, used from sequences, like escapes and sets
137
131
  action invalid_sequence_error {
138
- text = ts ? copy(data, ts-1..-1) : data.pack('c*')
139
- validation_error(:sequence, 'sequence', text)
132
+ text = copy(data, ts ? ts-1 : 0, -1)
133
+ raise ValidationError.for(:sequence, 'sequence', text)
140
134
  }
141
135
 
142
136
  # group (nesting) and set open/close actions
@@ -150,7 +144,7 @@
150
144
  # --------------------------------------------------------------------------
151
145
  character_set := |*
152
146
  set_close > (set_meta, 2) @set_closed {
153
- emit(:set, :close, *text(data, ts, te))
147
+ emit(:set, :close, copy(data, ts, te))
154
148
  if in_set?
155
149
  fret;
156
150
  else
@@ -159,8 +153,8 @@
159
153
  };
160
154
 
161
155
  '-]' @set_closed { # special case, emits two tokens
162
- emit(:literal, :literal, copy(data, ts..te-2), ts, te - 1)
163
- emit(:set, :close, copy(data, ts+1..te-1), ts + 1, te)
156
+ emit(:literal, :literal, '-')
157
+ emit(:set, :close, ']')
164
158
  if in_set?
165
159
  fret;
166
160
  else
@@ -169,33 +163,32 @@
169
163
  };
170
164
 
171
165
  '-&&' { # special case, emits two tokens
172
- emit(:literal, :literal, '-', ts, te)
173
- emit(:set, :intersection, '&&', ts, te)
166
+ emit(:literal, :literal, '-')
167
+ emit(:set, :intersection, '&&')
174
168
  };
175
169
 
176
170
  '^' {
177
- text = text(data, ts, te).first
178
- if tokens.last[1] == :open
179
- emit(:set, :negate, text, ts, te)
171
+ if prev_token[1] == :open
172
+ emit(:set, :negate, '^')
180
173
  else
181
- emit(:literal, :literal, text, ts, te)
174
+ emit(:literal, :literal, '^')
182
175
  end
183
176
  };
184
177
 
185
178
  '-' {
186
- text = text(data, ts, te).first
187
- # ranges cant start with a subset or intersection/negation/range operator
188
- if tokens.last[0] == :set
189
- emit(:literal, :literal, text, ts, te)
179
+ # ranges cant start with the opening bracket, a subset, or
180
+ # intersection/negation/range operators
181
+ if prev_token[0] == :set
182
+ emit(:literal, :literal, '-')
190
183
  else
191
- emit(:set, :range, text, ts, te)
184
+ emit(:set, :range, '-')
192
185
  end
193
186
  };
194
187
 
195
188
  # Unlike ranges, intersections can start or end at set boundaries, whereupon
196
189
  # they match nothing: r = /[a&&]/; [r =~ ?a, r =~ ?&] # => [nil, nil]
197
190
  '&&' {
198
- emit(:set, :intersection, *text(data, ts, te))
191
+ emit(:set, :intersection, '&&')
199
192
  };
200
193
 
201
194
  backslash {
@@ -203,59 +196,60 @@
203
196
  };
204
197
 
205
198
  set_open >(open_bracket, 1) >set_opened {
206
- emit(:set, :open, *text(data, ts, te))
199
+ emit(:set, :open, '[')
207
200
  fcall character_set;
208
201
  };
209
202
 
210
- class_posix >(open_bracket, 1) @set_closed @eof(premature_end_error) {
211
- text = text(data, ts, te).first
203
+ class_posix >(open_bracket, 1) @set_closed @eof(premature_end_error) {
204
+ text = copy(data, ts, te)
212
205
 
213
206
  type = :posixclass
214
207
  class_name = text[2..-3]
215
- if class_name[0].chr == '^'
208
+ if class_name[0] == '^'
216
209
  class_name = class_name[1..-1]
217
210
  type = :nonposixclass
218
211
  end
219
212
 
220
- emit(type, class_name.to_sym, text, ts, te)
221
- };
222
-
223
- collating_sequence >(open_bracket, 1) @set_closed @eof(premature_end_error) {
224
- emit(:set, :collation, *text(data, ts, te))
225
- };
213
+ unless self.class.posix_classes.include?(class_name)
214
+ raise ValidationError.for(:posix_class, text)
215
+ end
226
216
 
227
- character_equivalent >(open_bracket, 1) @set_closed @eof(premature_end_error) {
228
- emit(:set, :equivalent, *text(data, ts, te))
217
+ emit(type, class_name.to_sym, text)
229
218
  };
230
219
 
231
220
  meta_char > (set_meta, 1) {
232
- emit(:literal, :literal, *text(data, ts, te))
221
+ emit(:literal, :literal, copy(data, ts, te))
233
222
  };
234
223
 
235
- any |
236
- ascii_nonprint |
237
- utf8_2_byte |
238
- utf8_3_byte |
239
- utf8_4_byte {
240
- char, *rest = *text(data, ts, te)
241
- char.force_encoding('utf-8') if char.respond_to?(:force_encoding)
242
- emit(:literal, :literal, char, *rest)
224
+ any | ascii_nonprint | utf8_multibyte {
225
+ text = copy(data, ts, te)
226
+ emit(:literal, :literal, text)
243
227
  };
244
228
  *|;
245
229
 
246
230
  # set escapes scanner
247
231
  # --------------------------------------------------------------------------
248
232
  set_escape_sequence := |*
249
- non_set_escape > (escaped_set_alpha, 2) {
250
- emit(:escape, :literal, *text(data, ts, te, 1))
233
+ # Special case: in sets, octal sequences have higher priority than backrefs
234
+ octal_sequence {
235
+ emit(:escape, :octal, copy(data, ts-1, te))
251
236
  fret;
252
237
  };
253
238
 
254
- any > (escaped_set_alpha, 1) {
239
+ # Scan all other escapes that work in sets with the generic escape scanner
240
+ set_escape > (escaped_set_alpha, 2) {
255
241
  fhold;
256
242
  fnext character_set;
257
243
  fcall escape_sequence;
258
244
  };
245
+
246
+ # Treat all remaining escapes - those not supported in sets - as literal.
247
+ # (This currently includes \^, \-, \&, \:, although these could potentially
248
+ # be meta chars when not escaped, depending on their position in the set.)
249
+ any > (escaped_set_alpha, 1) {
250
+ emit(:escape, :literal, copy(data, ts-1, te))
251
+ fret;
252
+ };
259
253
  *|;
260
254
 
261
255
 
@@ -263,33 +257,40 @@
263
257
  # --------------------------------------------------------------------------
264
258
  escape_sequence := |*
265
259
  [1-9] {
266
- text = text(data, ts, te, 1).first
267
- emit(:backref, :number, text, ts-1, te)
260
+ text = copy(data, ts-1, te)
261
+ emit(:backref, :number, text)
268
262
  fret;
269
263
  };
270
264
 
271
265
  octal_sequence {
272
- emit(:escape, :octal, *text(data, ts, te, 1))
266
+ emit(:escape, :octal, copy(data, ts-1, te))
267
+ fret;
268
+ };
269
+
270
+ [8-9] . [0-9] { # special case, emits two tokens
271
+ text = copy(data, ts-1, te)
272
+ emit(:escape, :literal, text[0, 2])
273
+ emit(:literal, :literal, text[2])
273
274
  fret;
274
275
  };
275
276
 
276
277
  meta_char {
277
- case text = text(data, ts, te, 1).first
278
- when '\.'; emit(:escape, :dot, text, ts-1, te)
279
- when '\|'; emit(:escape, :alternation, text, ts-1, te)
280
- when '\^'; emit(:escape, :bol, text, ts-1, te)
281
- when '\$'; emit(:escape, :eol, text, ts-1, te)
282
- when '\?'; emit(:escape, :zero_or_one, text, ts-1, te)
283
- when '\*'; emit(:escape, :zero_or_more, text, ts-1, te)
284
- when '\+'; emit(:escape, :one_or_more, text, ts-1, te)
285
- when '\('; emit(:escape, :group_open, text, ts-1, te)
286
- when '\)'; emit(:escape, :group_close, text, ts-1, te)
287
- when '\{'; emit(:escape, :interval_open, text, ts-1, te)
288
- when '\}'; emit(:escape, :interval_close, text, ts-1, te)
289
- when '\['; emit(:escape, :set_open, text, ts-1, te)
290
- when '\]'; emit(:escape, :set_close, text, ts-1, te)
278
+ case text = copy(data, ts-1, te)
279
+ when '\.'; emit(:escape, :dot, text)
280
+ when '\|'; emit(:escape, :alternation, text)
281
+ when '\^'; emit(:escape, :bol, text)
282
+ when '\$'; emit(:escape, :eol, text)
283
+ when '\?'; emit(:escape, :zero_or_one, text)
284
+ when '\*'; emit(:escape, :zero_or_more, text)
285
+ when '\+'; emit(:escape, :one_or_more, text)
286
+ when '\('; emit(:escape, :group_open, text)
287
+ when '\)'; emit(:escape, :group_close, text)
288
+ when '\{'; emit(:escape, :interval_open, text)
289
+ when '\}'; emit(:escape, :interval_close, text)
290
+ when '\['; emit(:escape, :set_open, text)
291
+ when '\]'; emit(:escape, :set_close, text)
291
292
  when "\\\\";
292
- emit(:escape, :backslash, text, ts-1, te)
293
+ emit(:escape, :backslash, text)
293
294
  end
294
295
  fret;
295
296
  };
@@ -297,31 +298,31 @@
297
298
  escaped_ascii > (escaped_alpha, 7) {
298
299
  # \b is emitted as backspace only when inside a character set, otherwise
299
300
  # it is a word boundary anchor. A syntax might "normalize" it if needed.
300
- case text = text(data, ts, te, 1).first
301
- when '\a'; emit(:escape, :bell, text, ts-1, te)
302
- when '\b'; emit(:escape, :backspace, text, ts-1, te)
303
- when '\e'; emit(:escape, :escape, text, ts-1, te)
304
- when '\f'; emit(:escape, :form_feed, text, ts-1, te)
305
- when '\n'; emit(:escape, :newline, text, ts-1, te)
306
- when '\r'; emit(:escape, :carriage, text, ts-1, te)
307
- when '\t'; emit(:escape, :tab, text, ts-1, te)
308
- when '\v'; emit(:escape, :vertical_tab, text, ts-1, te)
301
+ case text = copy(data, ts-1, te)
302
+ when '\a'; emit(:escape, :bell, text)
303
+ when '\b'; emit(:escape, :backspace, text)
304
+ when '\e'; emit(:escape, :escape, text)
305
+ when '\f'; emit(:escape, :form_feed, text)
306
+ when '\n'; emit(:escape, :newline, text)
307
+ when '\r'; emit(:escape, :carriage, text)
308
+ when '\t'; emit(:escape, :tab, text)
309
+ when '\v'; emit(:escape, :vertical_tab, text)
309
310
  end
310
311
  fret;
311
312
  };
312
313
 
313
314
  codepoint_sequence > (escaped_alpha, 6) $eof(premature_end_error) {
314
- text = text(data, ts, te, 1).first
315
- if text[2].chr == '{'
316
- emit(:escape, :codepoint_list, text, ts-1, te)
315
+ text = copy(data, ts-1, te)
316
+ if text[2] == '{'
317
+ emit(:escape, :codepoint_list, text)
317
318
  else
318
- emit(:escape, :codepoint, text, ts-1, te)
319
+ emit(:escape, :codepoint, text)
319
320
  end
320
321
  fret;
321
322
  };
322
323
 
323
- hex_sequence > (escaped_alpha, 5) $eof(premature_end_error) {
324
- emit(:escape, :hex, *text(data, ts, te, 1))
324
+ hex_sequence > (escaped_alpha, 5) @eof(premature_end_error) {
325
+ emit(:escape, :hex, copy(data, ts-1, te))
325
326
  fret;
326
327
  };
327
328
 
@@ -351,8 +352,8 @@
351
352
  fcall unicode_property;
352
353
  };
353
354
 
354
- (any -- non_literal_escape) > (escaped_alpha, 1) {
355
- emit(:escape, :literal, *text(data, ts, te, 1))
355
+ (any -- non_literal_escape) | utf8_multibyte > (escaped_alpha, 1) {
356
+ emit(:escape, :literal, copy(data, ts-1, te))
356
357
  fret;
357
358
  };
358
359
  *|;
@@ -362,9 +363,10 @@
362
363
  # --------------------------------------------------------------------------
363
364
  conditional_expression := |*
364
365
  group_lookup . ')' {
365
- text = text(data, ts, te-1).first
366
- emit(:conditional, :condition, text, ts, te-1)
367
- emit(:conditional, :condition_close, ')', te-1, te)
366
+ text = copy(data, ts, te-1)
367
+ text =~ /[^0]/ or raise ValidationError.for(:backref, 'condition', 'invalid ref ID')
368
+ emit(:conditional, :condition, text)
369
+ emit(:conditional, :condition_close, ')')
368
370
  };
369
371
 
370
372
  any {
@@ -381,46 +383,50 @@
381
383
  # Meta characters
382
384
  # ------------------------------------------------------------------------
383
385
  dot {
384
- emit(:meta, :dot, *text(data, ts, te))
386
+ emit(:meta, :dot, copy(data, ts, te))
385
387
  };
386
388
 
387
389
  alternation {
388
390
  if conditional_stack.last == group_depth
389
- emit(:conditional, :separator, *text(data, ts, te))
391
+ emit(:conditional, :separator, copy(data, ts, te))
390
392
  else
391
- emit(:meta, :alternation, *text(data, ts, te))
393
+ emit(:meta, :alternation, copy(data, ts, te))
392
394
  end
393
395
  };
394
396
 
395
397
  # Anchors
396
398
  # ------------------------------------------------------------------------
397
399
  beginning_of_line {
398
- emit(:anchor, :bol, *text(data, ts, te))
400
+ emit(:anchor, :bol, copy(data, ts, te))
399
401
  };
400
402
 
401
403
  end_of_line {
402
- emit(:anchor, :eol, *text(data, ts, te))
404
+ emit(:anchor, :eol, copy(data, ts, te))
403
405
  };
404
406
 
405
407
  backslash . keep_mark > (backslashed, 4) {
406
- emit(:keep, :mark, *text(data, ts, te))
408
+ emit(:keep, :mark, copy(data, ts, te))
407
409
  };
408
410
 
409
411
  backslash . anchor_char > (backslashed, 3) {
410
- case text = text(data, ts, te).first
411
- when '\\A'; emit(:anchor, :bos, text, ts, te)
412
- when '\\z'; emit(:anchor, :eos, text, ts, te)
413
- when '\\Z'; emit(:anchor, :eos_ob_eol, text, ts, te)
414
- when '\\b'; emit(:anchor, :word_boundary, text, ts, te)
415
- when '\\B'; emit(:anchor, :nonword_boundary, text, ts, te)
416
- when '\\G'; emit(:anchor, :match_start, text, ts, te)
412
+ case text = copy(data, ts, te)
413
+ when '\A'; emit(:anchor, :bos, text)
414
+ when '\z'; emit(:anchor, :eos, text)
415
+ when '\Z'; emit(:anchor, :eos_ob_eol, text)
416
+ when '\b'; emit(:anchor, :word_boundary, text)
417
+ when '\B'; emit(:anchor, :nonword_boundary, text)
418
+ when '\G'; emit(:anchor, :match_start, text)
417
419
  end
418
420
  };
419
421
 
422
+ literal_delimiters {
423
+ append_literal(data, ts, te)
424
+ };
425
+
420
426
  # Character sets
421
427
  # ------------------------------------------------------------------------
422
428
  set_open >set_opened {
423
- emit(:set, :open, *text(data, ts, te))
429
+ emit(:set, :open, copy(data, ts, te))
424
430
  fcall character_set;
425
431
  };
426
432
 
@@ -429,23 +435,22 @@
429
435
  # (?(condition)Y|N) conditional expression
430
436
  # ------------------------------------------------------------------------
431
437
  conditional {
432
- text = text(data, ts, te).first
438
+ text = copy(data, ts, te)
433
439
 
434
440
  conditional_stack << group_depth
435
441
 
436
- emit(:conditional, :open, text[0..-2], ts, te-1)
437
- emit(:conditional, :condition_open, '(', te-1, te)
442
+ emit(:conditional, :open, text[0..-2])
443
+ emit(:conditional, :condition_open, '(')
438
444
  fcall conditional_expression;
439
445
  };
440
446
 
441
447
 
442
448
  # (?#...) comments: parsed as a single expression, without introducing a
443
449
  # new nesting level. Comments may not include parentheses, escaped or not.
444
- # special case for close, action performed on all transitions to get the
445
- # correct closing count.
450
+ # special case for close to get the correct closing count.
446
451
  # ------------------------------------------------------------------------
447
- group_open . group_comment $group_closed {
448
- emit(:group, :comment, *text(data, ts, te))
452
+ (group_open . group_comment) @group_closed {
453
+ emit(:group, :comment, copy(data, ts, te))
449
454
  };
450
455
 
451
456
  # Expression options:
@@ -459,12 +464,12 @@
459
464
  #
460
465
  # (?imxdau-imx:subexp) option on/off for subexp
461
466
  # ------------------------------------------------------------------------
462
- group_open . group_options >group_opened {
463
- text = text(data, ts, te).first
467
+ (group_open . group_options) >group_opened {
468
+ text = copy(data, ts, te)
464
469
  if text[2..-1] =~ /([^\-mixdau:]|^$)|-.*([dau])/
465
- raise InvalidGroupOption.new($1 || "-#{$2}", text)
470
+ raise ValidationError.for(:group_option, $1 || "-#{$2}", text)
466
471
  end
467
- emit_options(text, ts, te)
472
+ emit_options(text)
468
473
  };
469
474
 
470
475
  # Assertions
@@ -473,12 +478,12 @@
473
478
  # (?<=subexp) look-behind
474
479
  # (?<!subexp) negative look-behind
475
480
  # ------------------------------------------------------------------------
476
- group_open . assertion_type >group_opened {
477
- case text = text(data, ts, te).first
478
- when '(?='; emit(:assertion, :lookahead, text, ts, te)
479
- when '(?!'; emit(:assertion, :nlookahead, text, ts, te)
480
- when '(?<='; emit(:assertion, :lookbehind, text, ts, te)
481
- when '(?<!'; emit(:assertion, :nlookbehind, text, ts, te)
481
+ (group_open . assertion_type) >group_opened {
482
+ case text = copy(data, ts, te)
483
+ when '(?='; emit(:assertion, :lookahead, text)
484
+ when '(?!'; emit(:assertion, :nlookahead, text)
485
+ when '(?<='; emit(:assertion, :lookbehind, text)
486
+ when '(?<!'; emit(:assertion, :nlookbehind, text)
482
487
  end
483
488
  };
484
489
 
@@ -490,106 +495,78 @@
490
495
  # (?'name'subexp) named group (single quoted version)
491
496
  # (subexp) captured group
492
497
  # ------------------------------------------------------------------------
493
- group_open . group_type >group_opened {
494
- case text = text(data, ts, te).first
495
- when '(?:'; emit(:group, :passive, text, ts, te)
496
- when '(?>'; emit(:group, :atomic, text, ts, te)
497
- when '(?~'; emit(:group, :absence, text, ts, te)
498
+ (group_open . group_type) >group_opened {
499
+ case text = copy(data, ts, te)
500
+ when '(?:'; emit(:group, :passive, text)
501
+ when '(?>'; emit(:group, :atomic, text)
502
+ when '(?~'; emit(:group, :absence, text)
498
503
 
499
504
  when /^\(\?(?:<>|'')/
500
- validation_error(:group, 'named group', 'name is empty')
505
+ raise ValidationError.for(:group, 'named group', 'name is empty')
501
506
 
502
- when /^\(\?<\w*>/
503
- emit(:group, :named_ab, text, ts, te)
507
+ when /^\(\?<[^>]+>/
508
+ emit(:group, :named_ab, text)
504
509
 
505
- when /^\(\?'\w*'/
506
- emit(:group, :named_sq, text, ts, te)
510
+ when /^\(\?'[^']+'/
511
+ emit(:group, :named_sq, text)
507
512
 
508
513
  end
509
514
  };
510
515
 
511
516
  group_open @group_opened {
512
- text = text(data, ts, te).first
513
- emit(:group, :capture, text, ts, te)
517
+ text = copy(data, ts, te)
518
+ emit(:group, :capture, text)
514
519
  };
515
520
 
516
521
  group_close @group_closed {
517
522
  if conditional_stack.last == group_depth + 1
518
523
  conditional_stack.pop
519
- emit(:conditional, :close, *text(data, ts, te))
520
- else
524
+ emit(:conditional, :close, ')')
525
+ elsif group_depth >= 0
521
526
  if spacing_stack.length > 1 &&
522
527
  spacing_stack.last[:depth] == group_depth + 1
523
528
  spacing_stack.pop
524
529
  self.free_spacing = spacing_stack.last[:free_spacing]
525
530
  end
526
531
 
527
- emit(:group, :close, *text(data, ts, te))
532
+ emit(:group, :close, ')')
533
+ else
534
+ raise ValidationError.for(:group, 'group', 'unmatched close parenthesis')
528
535
  end
529
536
  };
530
537
 
531
538
 
532
539
  # Group backreference, named and numbered
533
540
  # ------------------------------------------------------------------------
534
- backslash . (group_name_ref | group_number_ref) > (backslashed, 4) {
535
- case text = text(data, ts, te).first
536
- when /^\\([gk])(<>|'')/ # angle brackets
537
- validation_error(:backref, 'ref/call', 'ref ID is empty')
538
-
539
- when /^\\([gk])<[^\d+-]\w*>/ # angle-brackets
540
- if $1 == 'k'
541
- emit(:backref, :name_ref_ab, text, ts, te)
542
- else
543
- emit(:backref, :name_call_ab, text, ts, te)
544
- end
545
-
546
- when /^\\([gk])'[^\d+-]\w*'/ #single quotes
547
- if $1 == 'k'
548
- emit(:backref, :name_ref_sq, text, ts, te)
549
- else
550
- emit(:backref, :name_call_sq, text, ts, te)
551
- end
552
-
553
- when /^\\([gk])<\d+>/ # angle-brackets
554
- if $1 == 'k'
555
- emit(:backref, :number_ref_ab, text, ts, te)
556
- else
557
- emit(:backref, :number_call_ab, text, ts, te)
558
- end
559
-
560
- when /^\\([gk])'\d+'/ # single quotes
561
- if $1 == 'k'
562
- emit(:backref, :number_ref_sq, text, ts, te)
563
- else
564
- emit(:backref, :number_call_sq, text, ts, te)
565
- end
566
-
567
- when /^\\(?:g<\+|g<-|(k)<-)\d+>/ # angle-brackets
568
- if $1 == 'k'
569
- emit(:backref, :number_rel_ref_ab, text, ts, te)
570
- else
571
- emit(:backref, :number_rel_call_ab, text, ts, te)
572
- end
573
-
574
- when /^\\(?:g'\+|g'-|(k)'-)\d+'/ # single quotes
575
- if $1 == 'k'
576
- emit(:backref, :number_rel_ref_sq, text, ts, te)
577
- else
578
- emit(:backref, :number_rel_call_sq, text, ts, te)
579
- end
580
-
581
- when /^\\k<[^\d+\-]\w*[+\-]\d+>/ # angle-brackets
582
- emit(:backref, :name_recursion_ref_ab, text, ts, te)
583
-
584
- when /^\\k'[^\d+\-]\w*[+\-]\d+'/ # single-quotes
585
- emit(:backref, :name_recursion_ref_sq, text, ts, te)
586
-
587
- when /^\\([gk])<[+\-]?\d+[+\-]\d+>/ # angle-brackets
588
- emit(:backref, :number_recursion_ref_ab, text, ts, te)
589
-
590
- when /^\\([gk])'[+\-]?\d+[+\-]\d+'/ # single-quotes
591
- emit(:backref, :number_recursion_ref_sq, text, ts, te)
541
+ backslash . (group_ref) > (backslashed, 4) {
542
+ case text = copy(data, ts, te)
543
+ when /^\\k(.)[^0-9\-][^+\-]*['>]$/
544
+ emit(:backref, $1 == '<' ? :name_ref_ab : :name_ref_sq, text)
545
+ when /^\\k(.)0*[1-9]\d*['>]$/
546
+ emit(:backref, $1 == '<' ? :number_ref_ab : :number_ref_sq, text)
547
+ when /^\\k(.)-0*[1-9]\d*['>]$/
548
+ emit(:backref, $1 == '<' ? :number_rel_ref_ab : :number_rel_ref_sq, text)
549
+ when /^\\k(.)[^0-9\-].*[+\-]\d+['>]$/
550
+ emit(:backref, $1 == '<' ? :name_recursion_ref_ab : :name_recursion_ref_sq, text)
551
+ when /^\\k(.)-?0*[1-9]\d*[+\-]\d+['>]$/
552
+ emit(:backref, $1 == '<' ? :number_recursion_ref_ab : :number_recursion_ref_sq, text)
553
+ else
554
+ raise ValidationError.for(:backref, 'backreference', 'invalid ref ID')
555
+ end
556
+ };
592
557
 
558
+ # Group call, named and numbered
559
+ # ------------------------------------------------------------------------
560
+ backslash . (group_call) > (backslashed, 4) {
561
+ case text = copy(data, ts, te)
562
+ when /^\\g(.)[^0-9+\-].*['>]$/
563
+ emit(:backref, $1 == '<' ? :name_call_ab : :name_call_sq, text)
564
+ when /^\\g(.)(?:0|0*[1-9]\d*)['>]$/
565
+ emit(:backref, $1 == '<' ? :number_call_ab : :number_call_sq, text)
566
+ when /^\\g(.)[+-]0*[1-9]\d*/
567
+ emit(:backref, $1 == '<' ? :number_rel_call_ab : :number_rel_call_sq, text)
568
+ else
569
+ raise ValidationError.for(:backref, 'subexpression call', 'invalid ref ID')
593
570
  end
594
571
  };
595
572
 
@@ -597,31 +574,36 @@
597
574
  # Quantifiers
598
575
  # ------------------------------------------------------------------------
599
576
  zero_or_one {
600
- case text = text(data, ts, te).first
601
- when '?' ; emit(:quantifier, :zero_or_one, text, ts, te)
602
- when '??'; emit(:quantifier, :zero_or_one_reluctant, text, ts, te)
603
- when '?+'; emit(:quantifier, :zero_or_one_possessive, text, ts, te)
577
+ case text = copy(data, ts, te)
578
+ when '?' ; emit(:quantifier, :zero_or_one, text)
579
+ when '??'; emit(:quantifier, :zero_or_one_reluctant, text)
580
+ when '?+'; emit(:quantifier, :zero_or_one_possessive, text)
604
581
  end
605
582
  };
606
583
 
607
584
  zero_or_more {
608
- case text = text(data, ts, te).first
609
- when '*' ; emit(:quantifier, :zero_or_more, text, ts, te)
610
- when '*?'; emit(:quantifier, :zero_or_more_reluctant, text, ts, te)
611
- when '*+'; emit(:quantifier, :zero_or_more_possessive, text, ts, te)
585
+ case text = copy(data, ts, te)
586
+ when '*' ; emit(:quantifier, :zero_or_more, text)
587
+ when '*?'; emit(:quantifier, :zero_or_more_reluctant, text)
588
+ when '*+'; emit(:quantifier, :zero_or_more_possessive, text)
612
589
  end
613
590
  };
614
591
 
615
592
  one_or_more {
616
- case text = text(data, ts, te).first
617
- when '+' ; emit(:quantifier, :one_or_more, text, ts, te)
618
- when '+?'; emit(:quantifier, :one_or_more_reluctant, text, ts, te)
619
- when '++'; emit(:quantifier, :one_or_more_possessive, text, ts, te)
593
+ case text = copy(data, ts, te)
594
+ when '+' ; emit(:quantifier, :one_or_more, text)
595
+ when '+?'; emit(:quantifier, :one_or_more_reluctant, text)
596
+ when '++'; emit(:quantifier, :one_or_more_possessive, text)
620
597
  end
621
598
  };
622
599
 
623
- quantifier_interval @err(premature_end_error) {
624
- emit(:quantifier, :interval, *text(data, ts, te))
600
+ quantifier_interval {
601
+ emit(:quantifier, :interval, copy(data, ts, te))
602
+ };
603
+
604
+ # Catch unmatched curly braces as literals
605
+ range_open {
606
+ append_literal(data, ts, te)
625
607
  };
626
608
 
627
609
  # Escaped sequences
@@ -632,15 +614,17 @@
632
614
 
633
615
  comment {
634
616
  if free_spacing
635
- emit(:free_space, :comment, *text(data, ts, te))
617
+ emit(:free_space, :comment, copy(data, ts, te))
636
618
  else
637
- append_literal(data, ts, te)
619
+ # consume only the pound sign (#) and backtrack to do regular scanning
620
+ append_literal(data, ts, ts + 1)
621
+ fexec ts + 1;
638
622
  end
639
623
  };
640
624
 
641
625
  space+ {
642
626
  if free_spacing
643
- emit(:free_space, :whitespace, *text(data, ts, te))
627
+ emit(:free_space, :whitespace, copy(data, ts, te))
644
628
  else
645
629
  append_literal(data, ts, te)
646
630
  end
@@ -649,105 +633,47 @@
649
633
  # Literal: any run of ASCII (printable or non-printable), and/or UTF-8,
650
634
  # except meta characters.
651
635
  # ------------------------------------------------------------------------
652
- (ascii_print -- space)+ |
653
- ascii_nonprint+ |
654
- utf8_2_byte+ |
655
- utf8_3_byte+ |
656
- utf8_4_byte+ {
636
+ (ascii_print -- space)+ | ascii_nonprint+ | utf8_multibyte+ {
657
637
  append_literal(data, ts, te)
658
638
  };
659
639
 
660
640
  *|;
661
641
  }%%
662
642
 
663
- # THIS IS A GENERATED FILE, DO NOT EDIT DIRECTLY
664
- # This file was generated from lib/regexp_parser/scanner/scanner.rl
643
+ require 'regexp_parser/scanner/errors/scanner_error'
644
+ require 'regexp_parser/scanner/errors/premature_end_error'
645
+ require 'regexp_parser/scanner/errors/validation_error'
665
646
 
666
647
  class Regexp::Scanner
667
- # General scanner error (catch all)
668
- class ScannerError < StandardError; end
669
-
670
- # Base for all scanner validation errors
671
- class ValidationError < StandardError
672
- def initialize(reason)
673
- super reason
674
- end
675
- end
676
-
677
- # Unexpected end of pattern
678
- class PrematureEndError < ScannerError
679
- def initialize(where = '')
680
- super "Premature end of pattern at #{where}"
681
- end
682
- end
683
-
684
- # Invalid sequence format. Used for escape sequences, mainly.
685
- class InvalidSequenceError < ValidationError
686
- def initialize(what = 'sequence', where = '')
687
- super "Invalid #{what} at #{where}"
688
- end
689
- end
690
-
691
- # Invalid group. Used for named groups.
692
- class InvalidGroupError < ValidationError
693
- def initialize(what, reason)
694
- super "Invalid #{what}, #{reason}."
695
- end
696
- end
697
-
698
- # Invalid groupOption. Used for inline options.
699
- class InvalidGroupOption < ValidationError
700
- def initialize(option, text)
701
- super "Invalid group option #{option} in #{text}"
702
- end
703
- end
704
-
705
- # Invalid back reference. Used for name a number refs/calls.
706
- class InvalidBackrefError < ValidationError
707
- def initialize(what, reason)
708
- super "Invalid back reference #{what}, #{reason}"
709
- end
710
- end
711
-
712
- # The property name was not recognized by the scanner.
713
- class UnknownUnicodePropertyError < ValidationError
714
- def initialize(name)
715
- super "Unknown unicode character property name #{name}"
716
- end
717
- end
718
-
719
648
  # Scans the given regular expression text, or Regexp object and collects the
720
649
  # emitted token into an array that gets returned at the end. If a block is
721
650
  # given, it gets called for each emitted token.
722
651
  #
723
652
  # This method may raise errors if a syntax error is encountered.
724
653
  # --------------------------------------------------------------------------
725
- def self.scan(input_object, &block)
726
- new.scan(input_object, &block)
654
+ def self.scan(input_object, options: nil, collect_tokens: true, &block)
655
+ new.scan(input_object, options: options, collect_tokens: collect_tokens, &block)
727
656
  end
728
657
 
729
- def scan(input_object, &block)
730
- self.literal = nil
658
+ def scan(input_object, options: nil, collect_tokens: true, &block)
659
+ self.collect_tokens = collect_tokens
660
+ self.literal_run = nil
731
661
  stack = []
732
662
 
733
- if input_object.is_a?(Regexp)
734
- input = input_object.source
735
- self.free_spacing = (input_object.options & Regexp::EXTENDED != 0)
736
- else
737
- input = input_object
738
- self.free_spacing = false
739
- end
663
+ input = input_object.is_a?(Regexp) ? input_object.source : input_object
664
+ self.free_spacing = free_spacing?(input_object, options)
740
665
  self.spacing_stack = [{:free_spacing => free_spacing, :depth => 0}]
741
666
 
742
- data = input.unpack("c*") if input.is_a?(String)
667
+ data = input.unpack("c*")
743
668
  eof = data.length
744
669
 
745
670
  self.tokens = []
746
- self.block = block_given? ? block : nil
671
+ self.block = block
747
672
 
748
673
  self.set_depth = 0
749
674
  self.group_depth = 0
750
675
  self.conditional_stack = []
676
+ self.char_pos = 0
751
677
 
752
678
  %% write data;
753
679
  %% write init;
@@ -757,7 +683,7 @@ class Regexp::Scanner
757
683
  testEof = testEof
758
684
 
759
685
  if cs == re_scanner_error
760
- text = ts ? copy(data, ts-1..-1) : data.pack('c*')
686
+ text = copy(data, ts ? ts-1 : 0, -1)
761
687
  raise ScannerError.new("Scan error at '#{text}'")
762
688
  end
763
689
 
@@ -767,40 +693,76 @@ class Regexp::Scanner
767
693
  "[#{set_depth}]") if in_set?
768
694
 
769
695
  # when the entire expression is a literal run
770
- emit_literal if literal
696
+ emit_literal if literal_run
771
697
 
772
698
  tokens
773
699
  end
774
700
 
775
701
  # lazy-load property maps when first needed
776
- require 'yaml'
777
- PROP_MAPS_DIR = File.expand_path('../scanner/properties', __FILE__)
778
-
779
702
  def self.short_prop_map
780
- @short_prop_map ||= YAML.load_file("#{PROP_MAPS_DIR}/short.yml")
703
+ @short_prop_map ||= parse_prop_map('short')
781
704
  end
782
705
 
783
706
  def self.long_prop_map
784
- @long_prop_map ||= YAML.load_file("#{PROP_MAPS_DIR}/long.yml")
707
+ @long_prop_map ||= parse_prop_map('long')
708
+ end
709
+
710
+ def self.parse_prop_map(name)
711
+ File.read("#{__dir__}/scanner/properties/#{name}.csv").scan(/(.+),(.+)/).to_h
712
+ end
713
+
714
+ def self.posix_classes
715
+ %w[alnum alpha ascii blank cntrl digit graph
716
+ lower print punct space upper word xdigit]
785
717
  end
786
718
 
787
719
  # Emits an array with the details of the scanned pattern
788
- def emit(type, token, text, ts, te)
720
+ def emit(type, token, text)
789
721
  #puts "EMIT: type: #{type}, token: #{token}, text: #{text}, ts: #{ts}, te: #{te}"
790
722
 
791
- emit_literal if literal
723
+ emit_literal if literal_run
724
+
725
+ # Ragel runs with byte-based indices (ts, te). These are of little value to
726
+ # end-users, so we keep track of char-based indices and emit those instead.
727
+ ts_char_pos = char_pos
728
+ te_char_pos = char_pos + text.length
729
+
730
+ tok = [type, token, text, ts_char_pos, te_char_pos]
731
+
732
+ self.prev_token = tok
733
+
734
+ self.char_pos = te_char_pos
792
735
 
793
736
  if block
794
- block.call type, token, text, ts, te
737
+ block.call type, token, text, ts_char_pos, te_char_pos
738
+ # TODO: in v3.0.0, remove `collect_tokens:` kwarg and only collect if no block given
739
+ tokens << tok if collect_tokens
740
+ elsif collect_tokens
741
+ tokens << tok
795
742
  end
796
-
797
- tokens << [type, token, text, ts, te]
798
743
  end
799
744
 
745
+ attr_accessor :literal_run # only public for #||= to work on ruby <= 2.5
746
+
800
747
  private
801
748
 
802
- attr_accessor :tokens, :literal, :block, :free_spacing, :spacing_stack,
803
- :group_depth, :set_depth, :conditional_stack
749
+ attr_accessor :block,
750
+ :collect_tokens, :tokens, :prev_token,
751
+ :free_spacing, :spacing_stack,
752
+ :group_depth, :set_depth, :conditional_stack,
753
+ :char_pos
754
+
755
+ def free_spacing?(input_object, options)
756
+ if options && !input_object.is_a?(String)
757
+ raise ArgumentError, 'options cannot be supplied unless scanning a String'
758
+ end
759
+
760
+ options = input_object.options if input_object.is_a?(::Regexp)
761
+
762
+ return false unless options
763
+
764
+ options & Regexp::EXTENDED != 0
765
+ end
804
766
 
805
767
  def in_group?
806
768
  group_depth > 0
@@ -811,36 +773,24 @@ class Regexp::Scanner
811
773
  end
812
774
 
813
775
  # Copy from ts to te from data as text
814
- def copy(data, range)
815
- data[range].pack('c*')
816
- end
817
-
818
- # Copy from ts to te from data as text, returning an array with the text
819
- # and the offsets used to copy it.
820
- def text(data, ts, te, soff = 0)
821
- [copy(data, ts-soff..te-1), ts-soff, te]
776
+ def copy(data, ts, te)
777
+ data[ts...te].pack('c*').force_encoding('utf-8')
822
778
  end
823
779
 
824
780
  # Appends one or more characters to the literal buffer, to be emitted later
825
- # by a call to emit_literal. Contents can be a mix of ASCII and UTF-8.
781
+ # by a call to emit_literal.
826
782
  def append_literal(data, ts, te)
827
- self.literal = literal || []
828
- literal << text(data, ts, te)
783
+ (self.literal_run ||= []) << copy(data, ts, te)
829
784
  end
830
785
 
831
- # Emits the literal run collected by calls to the append_literal method,
832
- # using the total start (ts) and end (te) offsets of the run.
786
+ # Emits the literal run collected by calls to the append_literal method.
833
787
  def emit_literal
834
- ts, te = literal.first[1], literal.last[2]
835
- text = literal.map {|t| t[0]}.join
836
-
837
- text.force_encoding('utf-8') if text.respond_to?(:force_encoding)
838
-
839
- self.literal = nil
840
- emit(:literal, :literal, text, ts, te)
788
+ text = literal_run.join
789
+ self.literal_run = nil
790
+ emit(:literal, :literal, text)
841
791
  end
842
792
 
843
- def emit_options(text, ts, te)
793
+ def emit_options(text)
844
794
  token = nil
845
795
 
846
796
  # Ruby allows things like '(?-xxxx)' or '(?xx-xx--xx-:abc)'.
@@ -866,28 +816,13 @@ class Regexp::Scanner
866
816
  token = :options_switch
867
817
  end
868
818
 
869
- emit(:group, token, text, ts, te)
819
+ emit(:group, token, text)
870
820
  end
871
821
 
872
822
  def emit_meta_control_sequence(data, ts, te, token)
873
823
  if data.last < 0x00 || data.last > 0x7F
874
- validation_error(:sequence, 'escape', token.to_s)
875
- end
876
- emit(:escape, token, *text(data, ts, te, 1))
877
- end
878
-
879
- # Centralizes and unifies the handling of validation related
880
- # errors.
881
- def validation_error(type, what, reason)
882
- case type
883
- when :group
884
- error = InvalidGroupError.new(what, reason)
885
- when :backref
886
- error = InvalidBackrefError.new(what, reason)
887
- when :sequence
888
- error = InvalidSequenceError.new(what, reason)
824
+ raise ValidationError.for(:sequence, 'escape', token.to_s)
889
825
  end
890
-
891
- raise error # unless @@config.validation_ignore
826
+ emit(:escape, token, copy(data, ts-1, te))
892
827
  end
893
828
  end # class Regexp::Scanner