regexp_parser 1.7.0 → 2.8.3

Sign up to get free protection for your applications and to get access to all the features.
Files changed (165) hide show
  1. checksums.yaml +4 -4
  2. data/Gemfile +8 -2
  3. data/LICENSE +1 -1
  4. data/Rakefile +6 -70
  5. data/lib/regexp_parser/error.rb +4 -0
  6. data/lib/regexp_parser/expression/base.rb +76 -0
  7. data/lib/regexp_parser/expression/classes/alternation.rb +1 -1
  8. data/lib/regexp_parser/expression/classes/anchor.rb +0 -2
  9. data/lib/regexp_parser/expression/classes/{backref.rb → backreference.rb} +22 -2
  10. data/lib/regexp_parser/expression/classes/{set → character_set}/range.rb +4 -8
  11. data/lib/regexp_parser/expression/classes/{set.rb → character_set.rb} +3 -4
  12. data/lib/regexp_parser/expression/classes/{type.rb → character_type.rb} +0 -2
  13. data/lib/regexp_parser/expression/classes/conditional.rb +11 -5
  14. data/lib/regexp_parser/expression/classes/{escape.rb → escape_sequence.rb} +15 -7
  15. data/lib/regexp_parser/expression/classes/free_space.rb +5 -5
  16. data/lib/regexp_parser/expression/classes/group.rb +28 -15
  17. data/lib/regexp_parser/expression/classes/keep.rb +2 -0
  18. data/lib/regexp_parser/expression/classes/literal.rb +1 -5
  19. data/lib/regexp_parser/expression/classes/posix_class.rb +5 -1
  20. data/lib/regexp_parser/expression/classes/root.rb +4 -19
  21. data/lib/regexp_parser/expression/classes/{property.rb → unicode_property.rb} +5 -3
  22. data/lib/regexp_parser/expression/methods/construct.rb +41 -0
  23. data/lib/regexp_parser/expression/methods/human_name.rb +43 -0
  24. data/lib/regexp_parser/expression/methods/match_length.rb +11 -7
  25. data/lib/regexp_parser/expression/methods/parts.rb +23 -0
  26. data/lib/regexp_parser/expression/methods/printing.rb +26 -0
  27. data/lib/regexp_parser/expression/methods/strfregexp.rb +1 -1
  28. data/lib/regexp_parser/expression/methods/tests.rb +47 -1
  29. data/lib/regexp_parser/expression/methods/traverse.rb +34 -18
  30. data/lib/regexp_parser/expression/quantifier.rb +57 -17
  31. data/lib/regexp_parser/expression/sequence.rb +11 -47
  32. data/lib/regexp_parser/expression/sequence_operation.rb +4 -9
  33. data/lib/regexp_parser/expression/shared.rb +111 -0
  34. data/lib/regexp_parser/expression/subexpression.rb +27 -19
  35. data/lib/regexp_parser/expression.rb +14 -141
  36. data/lib/regexp_parser/lexer.rb +83 -41
  37. data/lib/regexp_parser/parser.rb +371 -429
  38. data/lib/regexp_parser/scanner/char_type.rl +11 -11
  39. data/lib/regexp_parser/scanner/errors/premature_end_error.rb +8 -0
  40. data/lib/regexp_parser/scanner/errors/scanner_error.rb +6 -0
  41. data/lib/regexp_parser/scanner/errors/validation_error.rb +63 -0
  42. data/lib/regexp_parser/scanner/properties/long.csv +633 -0
  43. data/lib/regexp_parser/scanner/properties/short.csv +248 -0
  44. data/lib/regexp_parser/scanner/property.rl +4 -4
  45. data/lib/regexp_parser/scanner/scanner.rl +303 -368
  46. data/lib/regexp_parser/scanner.rb +1423 -1674
  47. data/lib/regexp_parser/syntax/any.rb +2 -7
  48. data/lib/regexp_parser/syntax/base.rb +92 -67
  49. data/lib/regexp_parser/syntax/token/anchor.rb +15 -0
  50. data/lib/regexp_parser/syntax/{tokens → token}/assertion.rb +2 -2
  51. data/lib/regexp_parser/syntax/token/backreference.rb +33 -0
  52. data/lib/regexp_parser/syntax/token/character_set.rb +16 -0
  53. data/lib/regexp_parser/syntax/{tokens → token}/character_type.rb +3 -3
  54. data/lib/regexp_parser/syntax/{tokens → token}/conditional.rb +3 -3
  55. data/lib/regexp_parser/syntax/token/escape.rb +33 -0
  56. data/lib/regexp_parser/syntax/{tokens → token}/group.rb +7 -7
  57. data/lib/regexp_parser/syntax/{tokens → token}/keep.rb +1 -1
  58. data/lib/regexp_parser/syntax/token/meta.rb +20 -0
  59. data/lib/regexp_parser/syntax/{tokens → token}/posix_class.rb +3 -3
  60. data/lib/regexp_parser/syntax/token/quantifier.rb +35 -0
  61. data/lib/regexp_parser/syntax/token/unicode_property.rb +733 -0
  62. data/lib/regexp_parser/syntax/token/virtual.rb +11 -0
  63. data/lib/regexp_parser/syntax/token.rb +45 -0
  64. data/lib/regexp_parser/syntax/version_lookup.rb +19 -36
  65. data/lib/regexp_parser/syntax/versions/1.8.6.rb +13 -20
  66. data/lib/regexp_parser/syntax/versions/1.9.1.rb +10 -17
  67. data/lib/regexp_parser/syntax/versions/1.9.3.rb +3 -10
  68. data/lib/regexp_parser/syntax/versions/2.0.0.rb +8 -15
  69. data/lib/regexp_parser/syntax/versions/2.2.0.rb +3 -9
  70. data/lib/regexp_parser/syntax/versions/2.3.0.rb +3 -9
  71. data/lib/regexp_parser/syntax/versions/2.4.0.rb +3 -9
  72. data/lib/regexp_parser/syntax/versions/2.4.1.rb +2 -8
  73. data/lib/regexp_parser/syntax/versions/2.5.0.rb +3 -9
  74. data/lib/regexp_parser/syntax/versions/2.6.0.rb +3 -9
  75. data/lib/regexp_parser/syntax/versions/2.6.2.rb +3 -9
  76. data/lib/regexp_parser/syntax/versions/2.6.3.rb +3 -9
  77. data/lib/regexp_parser/syntax/versions/3.1.0.rb +4 -0
  78. data/lib/regexp_parser/syntax/versions/3.2.0.rb +4 -0
  79. data/lib/regexp_parser/syntax/versions.rb +3 -1
  80. data/lib/regexp_parser/syntax.rb +8 -6
  81. data/lib/regexp_parser/token.rb +9 -20
  82. data/lib/regexp_parser/version.rb +1 -1
  83. data/lib/regexp_parser.rb +0 -2
  84. data/regexp_parser.gemspec +19 -23
  85. metadata +52 -171
  86. data/CHANGELOG.md +0 -349
  87. data/README.md +0 -470
  88. data/lib/regexp_parser/scanner/properties/long.yml +0 -594
  89. data/lib/regexp_parser/scanner/properties/short.yml +0 -237
  90. data/lib/regexp_parser/syntax/tokens/anchor.rb +0 -15
  91. data/lib/regexp_parser/syntax/tokens/backref.rb +0 -24
  92. data/lib/regexp_parser/syntax/tokens/character_set.rb +0 -13
  93. data/lib/regexp_parser/syntax/tokens/escape.rb +0 -30
  94. data/lib/regexp_parser/syntax/tokens/meta.rb +0 -13
  95. data/lib/regexp_parser/syntax/tokens/quantifier.rb +0 -35
  96. data/lib/regexp_parser/syntax/tokens/unicode_property.rb +0 -675
  97. data/lib/regexp_parser/syntax/tokens.rb +0 -45
  98. data/spec/expression/base_spec.rb +0 -94
  99. data/spec/expression/clone_spec.rb +0 -120
  100. data/spec/expression/conditional_spec.rb +0 -89
  101. data/spec/expression/free_space_spec.rb +0 -27
  102. data/spec/expression/methods/match_length_spec.rb +0 -161
  103. data/spec/expression/methods/match_spec.rb +0 -25
  104. data/spec/expression/methods/strfregexp_spec.rb +0 -224
  105. data/spec/expression/methods/tests_spec.rb +0 -99
  106. data/spec/expression/methods/traverse_spec.rb +0 -161
  107. data/spec/expression/options_spec.rb +0 -128
  108. data/spec/expression/root_spec.rb +0 -9
  109. data/spec/expression/sequence_spec.rb +0 -9
  110. data/spec/expression/subexpression_spec.rb +0 -50
  111. data/spec/expression/to_h_spec.rb +0 -26
  112. data/spec/expression/to_s_spec.rb +0 -100
  113. data/spec/lexer/all_spec.rb +0 -22
  114. data/spec/lexer/conditionals_spec.rb +0 -53
  115. data/spec/lexer/escapes_spec.rb +0 -14
  116. data/spec/lexer/keep_spec.rb +0 -10
  117. data/spec/lexer/literals_spec.rb +0 -89
  118. data/spec/lexer/nesting_spec.rb +0 -99
  119. data/spec/lexer/refcalls_spec.rb +0 -55
  120. data/spec/parser/all_spec.rb +0 -43
  121. data/spec/parser/alternation_spec.rb +0 -88
  122. data/spec/parser/anchors_spec.rb +0 -17
  123. data/spec/parser/conditionals_spec.rb +0 -179
  124. data/spec/parser/errors_spec.rb +0 -30
  125. data/spec/parser/escapes_spec.rb +0 -121
  126. data/spec/parser/free_space_spec.rb +0 -130
  127. data/spec/parser/groups_spec.rb +0 -108
  128. data/spec/parser/keep_spec.rb +0 -6
  129. data/spec/parser/posix_classes_spec.rb +0 -8
  130. data/spec/parser/properties_spec.rb +0 -115
  131. data/spec/parser/quantifiers_spec.rb +0 -51
  132. data/spec/parser/refcalls_spec.rb +0 -112
  133. data/spec/parser/set/intersections_spec.rb +0 -127
  134. data/spec/parser/set/ranges_spec.rb +0 -111
  135. data/spec/parser/sets_spec.rb +0 -178
  136. data/spec/parser/types_spec.rb +0 -18
  137. data/spec/scanner/all_spec.rb +0 -18
  138. data/spec/scanner/anchors_spec.rb +0 -21
  139. data/spec/scanner/conditionals_spec.rb +0 -128
  140. data/spec/scanner/errors_spec.rb +0 -68
  141. data/spec/scanner/escapes_spec.rb +0 -53
  142. data/spec/scanner/free_space_spec.rb +0 -133
  143. data/spec/scanner/groups_spec.rb +0 -52
  144. data/spec/scanner/keep_spec.rb +0 -10
  145. data/spec/scanner/literals_spec.rb +0 -49
  146. data/spec/scanner/meta_spec.rb +0 -18
  147. data/spec/scanner/properties_spec.rb +0 -64
  148. data/spec/scanner/quantifiers_spec.rb +0 -20
  149. data/spec/scanner/refcalls_spec.rb +0 -36
  150. data/spec/scanner/sets_spec.rb +0 -102
  151. data/spec/scanner/types_spec.rb +0 -14
  152. data/spec/spec_helper.rb +0 -15
  153. data/spec/support/runner.rb +0 -42
  154. data/spec/support/shared_examples.rb +0 -77
  155. data/spec/support/warning_extractor.rb +0 -60
  156. data/spec/syntax/syntax_spec.rb +0 -48
  157. data/spec/syntax/syntax_token_map_spec.rb +0 -23
  158. data/spec/syntax/versions/1.8.6_spec.rb +0 -17
  159. data/spec/syntax/versions/1.9.1_spec.rb +0 -10
  160. data/spec/syntax/versions/1.9.3_spec.rb +0 -9
  161. data/spec/syntax/versions/2.0.0_spec.rb +0 -13
  162. data/spec/syntax/versions/2.2.0_spec.rb +0 -9
  163. data/spec/syntax/versions/aliases_spec.rb +0 -37
  164. data/spec/token/token_spec.rb +0 -85
  165. /data/lib/regexp_parser/expression/classes/{set → character_set}/intersection.rb +0 -0
@@ -3,6 +3,11 @@
3
3
  include re_char_type "char_type.rl";
4
4
  include re_property "property.rl";
5
5
 
6
+ utf8_2_byte = (0xc2..0xdf 0x80..0xbf);
7
+ utf8_3_byte = (0xe0..0xef 0x80..0xbf 0x80..0xbf);
8
+ utf8_4_byte = (0xf0..0xf4 0x80..0xbf 0x80..0xbf 0x80..0xbf);
9
+ utf8_multibyte = utf8_2_byte | utf8_3_byte | utf8_4_byte;
10
+
6
11
  dot = '.';
7
12
  backslash = '\\';
8
13
  alternation = '|';
@@ -15,26 +20,15 @@
15
20
 
16
21
  group_open = '(';
17
22
  group_close = ')';
18
- parantheses = group_open | group_close;
23
+ parentheses = group_open | group_close;
19
24
 
20
25
  set_open = '[';
21
26
  set_close = ']';
22
27
  brackets = set_open | set_close;
23
28
 
24
- comment = ('#' . [^\n]* . '\n');
25
-
26
- class_name_posix = 'alnum' | 'alpha' | 'blank' |
27
- 'cntrl' | 'digit' | 'graph' |
28
- 'lower' | 'print' | 'punct' |
29
- 'space' | 'upper' | 'xdigit' |
30
- 'word' | 'ascii';
31
-
32
- class_posix = ('[:' . '^'? . class_name_posix . ':]');
33
-
29
+ comment = ('#' . [^\n]* . '\n'?);
34
30
 
35
- # these are not supported in ruby, and need verification
36
- collating_sequence = '[.' . (alpha | [\-])+ . '.]';
37
- character_equivalent = '[=' . alpha . '=]';
31
+ class_posix = ('[:' . '^'? . [^\[\]]* . ':]');
38
32
 
39
33
  line_anchor = beginning_of_line | end_of_line;
40
34
  anchor_char = [AbBzZG];
@@ -53,21 +47,20 @@
53
47
 
54
48
  meta_sequence = 'M-' . (backslash . ('c' | 'C-'))? . backslash? . any;
55
49
 
50
+ sequence_char = [CMcux];
51
+
56
52
  zero_or_one = '?' | '??' | '?+';
57
53
  zero_or_more = '*' | '*?' | '*+';
58
54
  one_or_more = '+' | '+?' | '++';
59
55
 
60
56
  quantifier_greedy = '?' | '*' | '+';
61
- quantifier_reluctant = '??' | '*?' | '+?';
62
- quantifier_possessive = '?+' | '*+' | '++';
63
- quantifier_mode = '?' | '+';
64
-
65
- quantifier_interval = range_open . (digit+)? . ','? . (digit+)? .
66
- range_close . quantifier_mode?;
67
-
68
- quantifiers = quantifier_greedy | quantifier_reluctant |
69
- quantifier_possessive | quantifier_interval;
70
57
 
58
+ quantity_exact = (digit+);
59
+ quantity_minimum = (digit+) . ',';
60
+ quantity_maximum = ',' . (digit+);
61
+ quantity_range = (digit+) . ',' . (digit+);
62
+ quantifier_interval = range_open . ( quantity_exact | quantity_minimum |
63
+ quantity_maximum | quantity_range ) . range_close;
71
64
 
72
65
  conditional = '(?(';
73
66
 
@@ -85,22 +78,22 @@
85
78
  # try to treat every other group head as options group, like Ruby
86
79
  group_options = '?' . ( [^!#'():<=>~]+ . ':'? ) ?;
87
80
 
88
- group_ref = [gk];
89
- group_name_char = (alnum | '_');
90
- group_name_id = (group_name_char . (group_name_char+)?)?;
91
- group_number = '-'? . [1-9] . ([0-9]+)?;
81
+ group_name_id_ab = ([^!0-9\->] | utf8_multibyte) . ([^>] | utf8_multibyte)*;
82
+ group_name_id_sq = ([^0-9\-'] | utf8_multibyte) . ([^'] | utf8_multibyte)*;
83
+ group_number = '-'? . [0-9]+;
92
84
  group_level = [+\-] . [0-9]+;
93
85
 
94
- group_name = ('<' . group_name_id . '>') | ("'" . group_name_id . "'");
86
+ group_name = ('<' . group_name_id_ab? . '>') |
87
+ ("'" . group_name_id_sq? . "'");
95
88
  group_lookup = group_name | group_number;
96
89
 
97
90
  group_named = ('?' . group_name );
98
91
 
99
- group_name_ref = group_ref . (('<' . group_name_id . group_level? '>') |
100
- ("'" . group_name_id . group_level? "'"));
92
+ group_ref_body = (('<' . (group_name_id_ab? | group_number) . group_level? '>') |
93
+ ("'" . (group_name_id_sq? | group_number) . group_level? "'"));
101
94
 
102
- group_number_ref = group_ref . (('<' . group_number . group_level? '>') |
103
- ("'" . group_number . group_level? "'"));
95
+ group_ref = 'k' . group_ref_body;
96
+ group_call = 'g' . group_ref_body;
104
97
 
105
98
  group_type = group_atomic | group_passive | group_absence | group_named;
106
99
 
@@ -111,32 +104,33 @@
111
104
 
112
105
  # characters that 'break' a literal
113
106
  meta_char = dot | backslash | alternation |
114
- curlies | parantheses | brackets |
107
+ curlies | parentheses | brackets |
115
108
  line_anchor | quantifier_greedy;
116
109
 
117
- ascii_print = ((0x20..0x7e) - meta_char);
118
- ascii_nonprint = (0x01..0x1f | 0x7f);
110
+ literal_delimiters = ']' | '}';
119
111
 
120
- utf8_2_byte = (0xc2..0xdf 0x80..0xbf);
121
- utf8_3_byte = (0xe0..0xef 0x80..0xbf 0x80..0xbf);
122
- utf8_4_byte = (0xf0..0xf4 0x80..0xbf 0x80..0xbf 0x80..0xbf);
112
+ ascii_print = ((0x20..0x7e) - meta_char - '#');
113
+ ascii_nonprint = (0x01..0x1f | 0x7f);
123
114
 
124
115
  non_literal_escape = char_type_char | anchor_char | escaped_ascii |
125
- group_ref | keep_mark | [xucCM];
116
+ keep_mark | sequence_char;
117
+
118
+ # escapes that also work within a character set
119
+ set_escape = backslash | brackets | escaped_ascii |
120
+ octal_sequence | property_char |
121
+ sequence_char | single_codepoint_char_type;
126
122
 
127
- non_set_escape = (anchor_char - 'b') | group_ref | keep_mark |
128
- multi_codepoint_char_type | [0-9cCM];
129
123
 
130
124
  # EOF error, used where it can be detected
131
125
  action premature_end_error {
132
- text = ts ? copy(data, ts-1..-1) : data.pack('c*')
133
- raise PrematureEndError.new( text )
126
+ text = copy(data, ts ? ts-1 : 0, -1)
127
+ raise PrematureEndError.new(text)
134
128
  }
135
129
 
136
130
  # Invalid sequence error, used from sequences, like escapes and sets
137
131
  action invalid_sequence_error {
138
- text = ts ? copy(data, ts-1..-1) : data.pack('c*')
139
- validation_error(:sequence, 'sequence', text)
132
+ text = copy(data, ts ? ts-1 : 0, -1)
133
+ raise ValidationError.for(:sequence, 'sequence', text)
140
134
  }
141
135
 
142
136
  # group (nesting) and set open/close actions
@@ -150,7 +144,7 @@
150
144
  # --------------------------------------------------------------------------
151
145
  character_set := |*
152
146
  set_close > (set_meta, 2) @set_closed {
153
- emit(:set, :close, *text(data, ts, te))
147
+ emit(:set, :close, copy(data, ts, te))
154
148
  if in_set?
155
149
  fret;
156
150
  else
@@ -159,8 +153,8 @@
159
153
  };
160
154
 
161
155
  '-]' @set_closed { # special case, emits two tokens
162
- emit(:literal, :literal, copy(data, ts..te-2), ts, te - 1)
163
- emit(:set, :close, copy(data, ts+1..te-1), ts + 1, te)
156
+ emit(:literal, :literal, '-')
157
+ emit(:set, :close, ']')
164
158
  if in_set?
165
159
  fret;
166
160
  else
@@ -169,33 +163,32 @@
169
163
  };
170
164
 
171
165
  '-&&' { # special case, emits two tokens
172
- emit(:literal, :literal, '-', ts, te)
173
- emit(:set, :intersection, '&&', ts, te)
166
+ emit(:literal, :literal, '-')
167
+ emit(:set, :intersection, '&&')
174
168
  };
175
169
 
176
170
  '^' {
177
- text = text(data, ts, te).first
178
- if tokens.last[1] == :open
179
- emit(:set, :negate, text, ts, te)
171
+ if prev_token[1] == :open
172
+ emit(:set, :negate, '^')
180
173
  else
181
- emit(:literal, :literal, text, ts, te)
174
+ emit(:literal, :literal, '^')
182
175
  end
183
176
  };
184
177
 
185
178
  '-' {
186
- text = text(data, ts, te).first
187
- # ranges cant start with a subset or intersection/negation/range operator
188
- if tokens.last[0] == :set
189
- emit(:literal, :literal, text, ts, te)
179
+ # ranges cant start with the opening bracket, a subset, or
180
+ # intersection/negation/range operators
181
+ if prev_token[0] == :set
182
+ emit(:literal, :literal, '-')
190
183
  else
191
- emit(:set, :range, text, ts, te)
184
+ emit(:set, :range, '-')
192
185
  end
193
186
  };
194
187
 
195
188
  # Unlike ranges, intersections can start or end at set boundaries, whereupon
196
189
  # they match nothing: r = /[a&&]/; [r =~ ?a, r =~ ?&] # => [nil, nil]
197
190
  '&&' {
198
- emit(:set, :intersection, *text(data, ts, te))
191
+ emit(:set, :intersection, '&&')
199
192
  };
200
193
 
201
194
  backslash {
@@ -203,59 +196,60 @@
203
196
  };
204
197
 
205
198
  set_open >(open_bracket, 1) >set_opened {
206
- emit(:set, :open, *text(data, ts, te))
199
+ emit(:set, :open, '[')
207
200
  fcall character_set;
208
201
  };
209
202
 
210
- class_posix >(open_bracket, 1) @set_closed @eof(premature_end_error) {
211
- text = text(data, ts, te).first
203
+ class_posix >(open_bracket, 1) @set_closed @eof(premature_end_error) {
204
+ text = copy(data, ts, te)
212
205
 
213
206
  type = :posixclass
214
207
  class_name = text[2..-3]
215
- if class_name[0].chr == '^'
208
+ if class_name[0] == '^'
216
209
  class_name = class_name[1..-1]
217
210
  type = :nonposixclass
218
211
  end
219
212
 
220
- emit(type, class_name.to_sym, text, ts, te)
221
- };
222
-
223
- collating_sequence >(open_bracket, 1) @set_closed @eof(premature_end_error) {
224
- emit(:set, :collation, *text(data, ts, te))
225
- };
213
+ unless self.class.posix_classes.include?(class_name)
214
+ raise ValidationError.for(:posix_class, text)
215
+ end
226
216
 
227
- character_equivalent >(open_bracket, 1) @set_closed @eof(premature_end_error) {
228
- emit(:set, :equivalent, *text(data, ts, te))
217
+ emit(type, class_name.to_sym, text)
229
218
  };
230
219
 
231
220
  meta_char > (set_meta, 1) {
232
- emit(:literal, :literal, *text(data, ts, te))
221
+ emit(:literal, :literal, copy(data, ts, te))
233
222
  };
234
223
 
235
- any |
236
- ascii_nonprint |
237
- utf8_2_byte |
238
- utf8_3_byte |
239
- utf8_4_byte {
240
- char, *rest = *text(data, ts, te)
241
- char.force_encoding('utf-8') if char.respond_to?(:force_encoding)
242
- emit(:literal, :literal, char, *rest)
224
+ any | ascii_nonprint | utf8_multibyte {
225
+ text = copy(data, ts, te)
226
+ emit(:literal, :literal, text)
243
227
  };
244
228
  *|;
245
229
 
246
230
  # set escapes scanner
247
231
  # --------------------------------------------------------------------------
248
232
  set_escape_sequence := |*
249
- non_set_escape > (escaped_set_alpha, 2) {
250
- emit(:escape, :literal, *text(data, ts, te, 1))
233
+ # Special case: in sets, octal sequences have higher priority than backrefs
234
+ octal_sequence {
235
+ emit(:escape, :octal, copy(data, ts-1, te))
251
236
  fret;
252
237
  };
253
238
 
254
- any > (escaped_set_alpha, 1) {
239
+ # Scan all other escapes that work in sets with the generic escape scanner
240
+ set_escape > (escaped_set_alpha, 2) {
255
241
  fhold;
256
242
  fnext character_set;
257
243
  fcall escape_sequence;
258
244
  };
245
+
246
+ # Treat all remaining escapes - those not supported in sets - as literal.
247
+ # (This currently includes \^, \-, \&, \:, although these could potentially
248
+ # be meta chars when not escaped, depending on their position in the set.)
249
+ any > (escaped_set_alpha, 1) {
250
+ emit(:escape, :literal, copy(data, ts-1, te))
251
+ fret;
252
+ };
259
253
  *|;
260
254
 
261
255
 
@@ -263,33 +257,40 @@
263
257
  # --------------------------------------------------------------------------
264
258
  escape_sequence := |*
265
259
  [1-9] {
266
- text = text(data, ts, te, 1).first
267
- emit(:backref, :number, text, ts-1, te)
260
+ text = copy(data, ts-1, te)
261
+ emit(:backref, :number, text)
268
262
  fret;
269
263
  };
270
264
 
271
265
  octal_sequence {
272
- emit(:escape, :octal, *text(data, ts, te, 1))
266
+ emit(:escape, :octal, copy(data, ts-1, te))
267
+ fret;
268
+ };
269
+
270
+ [8-9] . [0-9] { # special case, emits two tokens
271
+ text = copy(data, ts-1, te)
272
+ emit(:escape, :literal, text[0, 2])
273
+ emit(:literal, :literal, text[2])
273
274
  fret;
274
275
  };
275
276
 
276
277
  meta_char {
277
- case text = text(data, ts, te, 1).first
278
- when '\.'; emit(:escape, :dot, text, ts-1, te)
279
- when '\|'; emit(:escape, :alternation, text, ts-1, te)
280
- when '\^'; emit(:escape, :bol, text, ts-1, te)
281
- when '\$'; emit(:escape, :eol, text, ts-1, te)
282
- when '\?'; emit(:escape, :zero_or_one, text, ts-1, te)
283
- when '\*'; emit(:escape, :zero_or_more, text, ts-1, te)
284
- when '\+'; emit(:escape, :one_or_more, text, ts-1, te)
285
- when '\('; emit(:escape, :group_open, text, ts-1, te)
286
- when '\)'; emit(:escape, :group_close, text, ts-1, te)
287
- when '\{'; emit(:escape, :interval_open, text, ts-1, te)
288
- when '\}'; emit(:escape, :interval_close, text, ts-1, te)
289
- when '\['; emit(:escape, :set_open, text, ts-1, te)
290
- when '\]'; emit(:escape, :set_close, text, ts-1, te)
278
+ case text = copy(data, ts-1, te)
279
+ when '\.'; emit(:escape, :dot, text)
280
+ when '\|'; emit(:escape, :alternation, text)
281
+ when '\^'; emit(:escape, :bol, text)
282
+ when '\$'; emit(:escape, :eol, text)
283
+ when '\?'; emit(:escape, :zero_or_one, text)
284
+ when '\*'; emit(:escape, :zero_or_more, text)
285
+ when '\+'; emit(:escape, :one_or_more, text)
286
+ when '\('; emit(:escape, :group_open, text)
287
+ when '\)'; emit(:escape, :group_close, text)
288
+ when '\{'; emit(:escape, :interval_open, text)
289
+ when '\}'; emit(:escape, :interval_close, text)
290
+ when '\['; emit(:escape, :set_open, text)
291
+ when '\]'; emit(:escape, :set_close, text)
291
292
  when "\\\\";
292
- emit(:escape, :backslash, text, ts-1, te)
293
+ emit(:escape, :backslash, text)
293
294
  end
294
295
  fret;
295
296
  };
@@ -297,31 +298,31 @@
297
298
  escaped_ascii > (escaped_alpha, 7) {
298
299
  # \b is emitted as backspace only when inside a character set, otherwise
299
300
  # it is a word boundary anchor. A syntax might "normalize" it if needed.
300
- case text = text(data, ts, te, 1).first
301
- when '\a'; emit(:escape, :bell, text, ts-1, te)
302
- when '\b'; emit(:escape, :backspace, text, ts-1, te)
303
- when '\e'; emit(:escape, :escape, text, ts-1, te)
304
- when '\f'; emit(:escape, :form_feed, text, ts-1, te)
305
- when '\n'; emit(:escape, :newline, text, ts-1, te)
306
- when '\r'; emit(:escape, :carriage, text, ts-1, te)
307
- when '\t'; emit(:escape, :tab, text, ts-1, te)
308
- when '\v'; emit(:escape, :vertical_tab, text, ts-1, te)
301
+ case text = copy(data, ts-1, te)
302
+ when '\a'; emit(:escape, :bell, text)
303
+ when '\b'; emit(:escape, :backspace, text)
304
+ when '\e'; emit(:escape, :escape, text)
305
+ when '\f'; emit(:escape, :form_feed, text)
306
+ when '\n'; emit(:escape, :newline, text)
307
+ when '\r'; emit(:escape, :carriage, text)
308
+ when '\t'; emit(:escape, :tab, text)
309
+ when '\v'; emit(:escape, :vertical_tab, text)
309
310
  end
310
311
  fret;
311
312
  };
312
313
 
313
314
  codepoint_sequence > (escaped_alpha, 6) $eof(premature_end_error) {
314
- text = text(data, ts, te, 1).first
315
- if text[2].chr == '{'
316
- emit(:escape, :codepoint_list, text, ts-1, te)
315
+ text = copy(data, ts-1, te)
316
+ if text[2] == '{'
317
+ emit(:escape, :codepoint_list, text)
317
318
  else
318
- emit(:escape, :codepoint, text, ts-1, te)
319
+ emit(:escape, :codepoint, text)
319
320
  end
320
321
  fret;
321
322
  };
322
323
 
323
- hex_sequence > (escaped_alpha, 5) $eof(premature_end_error) {
324
- emit(:escape, :hex, *text(data, ts, te, 1))
324
+ hex_sequence > (escaped_alpha, 5) @eof(premature_end_error) {
325
+ emit(:escape, :hex, copy(data, ts-1, te))
325
326
  fret;
326
327
  };
327
328
 
@@ -351,8 +352,8 @@
351
352
  fcall unicode_property;
352
353
  };
353
354
 
354
- (any -- non_literal_escape) > (escaped_alpha, 1) {
355
- emit(:escape, :literal, *text(data, ts, te, 1))
355
+ (any -- non_literal_escape) | utf8_multibyte > (escaped_alpha, 1) {
356
+ emit(:escape, :literal, copy(data, ts-1, te))
356
357
  fret;
357
358
  };
358
359
  *|;
@@ -362,9 +363,10 @@
362
363
  # --------------------------------------------------------------------------
363
364
  conditional_expression := |*
364
365
  group_lookup . ')' {
365
- text = text(data, ts, te-1).first
366
- emit(:conditional, :condition, text, ts, te-1)
367
- emit(:conditional, :condition_close, ')', te-1, te)
366
+ text = copy(data, ts, te-1)
367
+ text =~ /[^0]/ or raise ValidationError.for(:backref, 'condition', 'invalid ref ID')
368
+ emit(:conditional, :condition, text)
369
+ emit(:conditional, :condition_close, ')')
368
370
  };
369
371
 
370
372
  any {
@@ -381,46 +383,50 @@
381
383
  # Meta characters
382
384
  # ------------------------------------------------------------------------
383
385
  dot {
384
- emit(:meta, :dot, *text(data, ts, te))
386
+ emit(:meta, :dot, copy(data, ts, te))
385
387
  };
386
388
 
387
389
  alternation {
388
390
  if conditional_stack.last == group_depth
389
- emit(:conditional, :separator, *text(data, ts, te))
391
+ emit(:conditional, :separator, copy(data, ts, te))
390
392
  else
391
- emit(:meta, :alternation, *text(data, ts, te))
393
+ emit(:meta, :alternation, copy(data, ts, te))
392
394
  end
393
395
  };
394
396
 
395
397
  # Anchors
396
398
  # ------------------------------------------------------------------------
397
399
  beginning_of_line {
398
- emit(:anchor, :bol, *text(data, ts, te))
400
+ emit(:anchor, :bol, copy(data, ts, te))
399
401
  };
400
402
 
401
403
  end_of_line {
402
- emit(:anchor, :eol, *text(data, ts, te))
404
+ emit(:anchor, :eol, copy(data, ts, te))
403
405
  };
404
406
 
405
407
  backslash . keep_mark > (backslashed, 4) {
406
- emit(:keep, :mark, *text(data, ts, te))
408
+ emit(:keep, :mark, copy(data, ts, te))
407
409
  };
408
410
 
409
411
  backslash . anchor_char > (backslashed, 3) {
410
- case text = text(data, ts, te).first
411
- when '\\A'; emit(:anchor, :bos, text, ts, te)
412
- when '\\z'; emit(:anchor, :eos, text, ts, te)
413
- when '\\Z'; emit(:anchor, :eos_ob_eol, text, ts, te)
414
- when '\\b'; emit(:anchor, :word_boundary, text, ts, te)
415
- when '\\B'; emit(:anchor, :nonword_boundary, text, ts, te)
416
- when '\\G'; emit(:anchor, :match_start, text, ts, te)
412
+ case text = copy(data, ts, te)
413
+ when '\A'; emit(:anchor, :bos, text)
414
+ when '\z'; emit(:anchor, :eos, text)
415
+ when '\Z'; emit(:anchor, :eos_ob_eol, text)
416
+ when '\b'; emit(:anchor, :word_boundary, text)
417
+ when '\B'; emit(:anchor, :nonword_boundary, text)
418
+ when '\G'; emit(:anchor, :match_start, text)
417
419
  end
418
420
  };
419
421
 
422
+ literal_delimiters {
423
+ append_literal(data, ts, te)
424
+ };
425
+
420
426
  # Character sets
421
427
  # ------------------------------------------------------------------------
422
428
  set_open >set_opened {
423
- emit(:set, :open, *text(data, ts, te))
429
+ emit(:set, :open, copy(data, ts, te))
424
430
  fcall character_set;
425
431
  };
426
432
 
@@ -429,23 +435,22 @@
429
435
  # (?(condition)Y|N) conditional expression
430
436
  # ------------------------------------------------------------------------
431
437
  conditional {
432
- text = text(data, ts, te).first
438
+ text = copy(data, ts, te)
433
439
 
434
440
  conditional_stack << group_depth
435
441
 
436
- emit(:conditional, :open, text[0..-2], ts, te-1)
437
- emit(:conditional, :condition_open, '(', te-1, te)
442
+ emit(:conditional, :open, text[0..-2])
443
+ emit(:conditional, :condition_open, '(')
438
444
  fcall conditional_expression;
439
445
  };
440
446
 
441
447
 
442
448
  # (?#...) comments: parsed as a single expression, without introducing a
443
449
  # new nesting level. Comments may not include parentheses, escaped or not.
444
- # special case for close, action performed on all transitions to get the
445
- # correct closing count.
450
+ # special case for close to get the correct closing count.
446
451
  # ------------------------------------------------------------------------
447
- group_open . group_comment $group_closed {
448
- emit(:group, :comment, *text(data, ts, te))
452
+ (group_open . group_comment) @group_closed {
453
+ emit(:group, :comment, copy(data, ts, te))
449
454
  };
450
455
 
451
456
  # Expression options:
@@ -459,12 +464,12 @@
459
464
  #
460
465
  # (?imxdau-imx:subexp) option on/off for subexp
461
466
  # ------------------------------------------------------------------------
462
- group_open . group_options >group_opened {
463
- text = text(data, ts, te).first
467
+ (group_open . group_options) >group_opened {
468
+ text = copy(data, ts, te)
464
469
  if text[2..-1] =~ /([^\-mixdau:]|^$)|-.*([dau])/
465
- raise InvalidGroupOption.new($1 || "-#{$2}", text)
470
+ raise ValidationError.for(:group_option, $1 || "-#{$2}", text)
466
471
  end
467
- emit_options(text, ts, te)
472
+ emit_options(text)
468
473
  };
469
474
 
470
475
  # Assertions
@@ -473,12 +478,12 @@
473
478
  # (?<=subexp) look-behind
474
479
  # (?<!subexp) negative look-behind
475
480
  # ------------------------------------------------------------------------
476
- group_open . assertion_type >group_opened {
477
- case text = text(data, ts, te).first
478
- when '(?='; emit(:assertion, :lookahead, text, ts, te)
479
- when '(?!'; emit(:assertion, :nlookahead, text, ts, te)
480
- when '(?<='; emit(:assertion, :lookbehind, text, ts, te)
481
- when '(?<!'; emit(:assertion, :nlookbehind, text, ts, te)
481
+ (group_open . assertion_type) >group_opened {
482
+ case text = copy(data, ts, te)
483
+ when '(?='; emit(:assertion, :lookahead, text)
484
+ when '(?!'; emit(:assertion, :nlookahead, text)
485
+ when '(?<='; emit(:assertion, :lookbehind, text)
486
+ when '(?<!'; emit(:assertion, :nlookbehind, text)
482
487
  end
483
488
  };
484
489
 
@@ -490,106 +495,78 @@
490
495
  # (?'name'subexp) named group (single quoted version)
491
496
  # (subexp) captured group
492
497
  # ------------------------------------------------------------------------
493
- group_open . group_type >group_opened {
494
- case text = text(data, ts, te).first
495
- when '(?:'; emit(:group, :passive, text, ts, te)
496
- when '(?>'; emit(:group, :atomic, text, ts, te)
497
- when '(?~'; emit(:group, :absence, text, ts, te)
498
+ (group_open . group_type) >group_opened {
499
+ case text = copy(data, ts, te)
500
+ when '(?:'; emit(:group, :passive, text)
501
+ when '(?>'; emit(:group, :atomic, text)
502
+ when '(?~'; emit(:group, :absence, text)
498
503
 
499
504
  when /^\(\?(?:<>|'')/
500
- validation_error(:group, 'named group', 'name is empty')
505
+ raise ValidationError.for(:group, 'named group', 'name is empty')
501
506
 
502
- when /^\(\?<\w*>/
503
- emit(:group, :named_ab, text, ts, te)
507
+ when /^\(\?<[^>]+>/
508
+ emit(:group, :named_ab, text)
504
509
 
505
- when /^\(\?'\w*'/
506
- emit(:group, :named_sq, text, ts, te)
510
+ when /^\(\?'[^']+'/
511
+ emit(:group, :named_sq, text)
507
512
 
508
513
  end
509
514
  };
510
515
 
511
516
  group_open @group_opened {
512
- text = text(data, ts, te).first
513
- emit(:group, :capture, text, ts, te)
517
+ text = copy(data, ts, te)
518
+ emit(:group, :capture, text)
514
519
  };
515
520
 
516
521
  group_close @group_closed {
517
522
  if conditional_stack.last == group_depth + 1
518
523
  conditional_stack.pop
519
- emit(:conditional, :close, *text(data, ts, te))
520
- else
524
+ emit(:conditional, :close, ')')
525
+ elsif group_depth >= 0
521
526
  if spacing_stack.length > 1 &&
522
527
  spacing_stack.last[:depth] == group_depth + 1
523
528
  spacing_stack.pop
524
529
  self.free_spacing = spacing_stack.last[:free_spacing]
525
530
  end
526
531
 
527
- emit(:group, :close, *text(data, ts, te))
532
+ emit(:group, :close, ')')
533
+ else
534
+ raise ValidationError.for(:group, 'group', 'unmatched close parenthesis')
528
535
  end
529
536
  };
530
537
 
531
538
 
532
539
  # Group backreference, named and numbered
533
540
  # ------------------------------------------------------------------------
534
- backslash . (group_name_ref | group_number_ref) > (backslashed, 4) {
535
- case text = text(data, ts, te).first
536
- when /^\\([gk])(<>|'')/ # angle brackets
537
- validation_error(:backref, 'ref/call', 'ref ID is empty')
538
-
539
- when /^\\([gk])<[^\d+-]\w*>/ # angle-brackets
540
- if $1 == 'k'
541
- emit(:backref, :name_ref_ab, text, ts, te)
542
- else
543
- emit(:backref, :name_call_ab, text, ts, te)
544
- end
545
-
546
- when /^\\([gk])'[^\d+-]\w*'/ #single quotes
547
- if $1 == 'k'
548
- emit(:backref, :name_ref_sq, text, ts, te)
549
- else
550
- emit(:backref, :name_call_sq, text, ts, te)
551
- end
552
-
553
- when /^\\([gk])<\d+>/ # angle-brackets
554
- if $1 == 'k'
555
- emit(:backref, :number_ref_ab, text, ts, te)
556
- else
557
- emit(:backref, :number_call_ab, text, ts, te)
558
- end
559
-
560
- when /^\\([gk])'\d+'/ # single quotes
561
- if $1 == 'k'
562
- emit(:backref, :number_ref_sq, text, ts, te)
563
- else
564
- emit(:backref, :number_call_sq, text, ts, te)
565
- end
566
-
567
- when /^\\(?:g<\+|g<-|(k)<-)\d+>/ # angle-brackets
568
- if $1 == 'k'
569
- emit(:backref, :number_rel_ref_ab, text, ts, te)
570
- else
571
- emit(:backref, :number_rel_call_ab, text, ts, te)
572
- end
573
-
574
- when /^\\(?:g'\+|g'-|(k)'-)\d+'/ # single quotes
575
- if $1 == 'k'
576
- emit(:backref, :number_rel_ref_sq, text, ts, te)
577
- else
578
- emit(:backref, :number_rel_call_sq, text, ts, te)
579
- end
580
-
581
- when /^\\k<[^\d+\-]\w*[+\-]\d+>/ # angle-brackets
582
- emit(:backref, :name_recursion_ref_ab, text, ts, te)
583
-
584
- when /^\\k'[^\d+\-]\w*[+\-]\d+'/ # single-quotes
585
- emit(:backref, :name_recursion_ref_sq, text, ts, te)
586
-
587
- when /^\\([gk])<[+\-]?\d+[+\-]\d+>/ # angle-brackets
588
- emit(:backref, :number_recursion_ref_ab, text, ts, te)
589
-
590
- when /^\\([gk])'[+\-]?\d+[+\-]\d+'/ # single-quotes
591
- emit(:backref, :number_recursion_ref_sq, text, ts, te)
541
+ backslash . (group_ref) > (backslashed, 4) {
542
+ case text = copy(data, ts, te)
543
+ when /^\\k(.)[^0-9\-][^+\-]*['>]$/
544
+ emit(:backref, $1 == '<' ? :name_ref_ab : :name_ref_sq, text)
545
+ when /^\\k(.)0*[1-9]\d*['>]$/
546
+ emit(:backref, $1 == '<' ? :number_ref_ab : :number_ref_sq, text)
547
+ when /^\\k(.)-0*[1-9]\d*['>]$/
548
+ emit(:backref, $1 == '<' ? :number_rel_ref_ab : :number_rel_ref_sq, text)
549
+ when /^\\k(.)[^0-9\-].*[+\-]\d+['>]$/
550
+ emit(:backref, $1 == '<' ? :name_recursion_ref_ab : :name_recursion_ref_sq, text)
551
+ when /^\\k(.)-?0*[1-9]\d*[+\-]\d+['>]$/
552
+ emit(:backref, $1 == '<' ? :number_recursion_ref_ab : :number_recursion_ref_sq, text)
553
+ else
554
+ raise ValidationError.for(:backref, 'backreference', 'invalid ref ID')
555
+ end
556
+ };
592
557
 
558
+ # Group call, named and numbered
559
+ # ------------------------------------------------------------------------
560
+ backslash . (group_call) > (backslashed, 4) {
561
+ case text = copy(data, ts, te)
562
+ when /^\\g(.)[^0-9+\-].*['>]$/
563
+ emit(:backref, $1 == '<' ? :name_call_ab : :name_call_sq, text)
564
+ when /^\\g(.)(?:0|0*[1-9]\d*)['>]$/
565
+ emit(:backref, $1 == '<' ? :number_call_ab : :number_call_sq, text)
566
+ when /^\\g(.)[+-]0*[1-9]\d*/
567
+ emit(:backref, $1 == '<' ? :number_rel_call_ab : :number_rel_call_sq, text)
568
+ else
569
+ raise ValidationError.for(:backref, 'subexpression call', 'invalid ref ID')
593
570
  end
594
571
  };
595
572
 
@@ -597,31 +574,36 @@
597
574
  # Quantifiers
598
575
  # ------------------------------------------------------------------------
599
576
  zero_or_one {
600
- case text = text(data, ts, te).first
601
- when '?' ; emit(:quantifier, :zero_or_one, text, ts, te)
602
- when '??'; emit(:quantifier, :zero_or_one_reluctant, text, ts, te)
603
- when '?+'; emit(:quantifier, :zero_or_one_possessive, text, ts, te)
577
+ case text = copy(data, ts, te)
578
+ when '?' ; emit(:quantifier, :zero_or_one, text)
579
+ when '??'; emit(:quantifier, :zero_or_one_reluctant, text)
580
+ when '?+'; emit(:quantifier, :zero_or_one_possessive, text)
604
581
  end
605
582
  };
606
583
 
607
584
  zero_or_more {
608
- case text = text(data, ts, te).first
609
- when '*' ; emit(:quantifier, :zero_or_more, text, ts, te)
610
- when '*?'; emit(:quantifier, :zero_or_more_reluctant, text, ts, te)
611
- when '*+'; emit(:quantifier, :zero_or_more_possessive, text, ts, te)
585
+ case text = copy(data, ts, te)
586
+ when '*' ; emit(:quantifier, :zero_or_more, text)
587
+ when '*?'; emit(:quantifier, :zero_or_more_reluctant, text)
588
+ when '*+'; emit(:quantifier, :zero_or_more_possessive, text)
612
589
  end
613
590
  };
614
591
 
615
592
  one_or_more {
616
- case text = text(data, ts, te).first
617
- when '+' ; emit(:quantifier, :one_or_more, text, ts, te)
618
- when '+?'; emit(:quantifier, :one_or_more_reluctant, text, ts, te)
619
- when '++'; emit(:quantifier, :one_or_more_possessive, text, ts, te)
593
+ case text = copy(data, ts, te)
594
+ when '+' ; emit(:quantifier, :one_or_more, text)
595
+ when '+?'; emit(:quantifier, :one_or_more_reluctant, text)
596
+ when '++'; emit(:quantifier, :one_or_more_possessive, text)
620
597
  end
621
598
  };
622
599
 
623
- quantifier_interval @err(premature_end_error) {
624
- emit(:quantifier, :interval, *text(data, ts, te))
600
+ quantifier_interval {
601
+ emit(:quantifier, :interval, copy(data, ts, te))
602
+ };
603
+
604
+ # Catch unmatched curly braces as literals
605
+ range_open {
606
+ append_literal(data, ts, te)
625
607
  };
626
608
 
627
609
  # Escaped sequences
@@ -632,15 +614,17 @@
632
614
 
633
615
  comment {
634
616
  if free_spacing
635
- emit(:free_space, :comment, *text(data, ts, te))
617
+ emit(:free_space, :comment, copy(data, ts, te))
636
618
  else
637
- append_literal(data, ts, te)
619
+ # consume only the pound sign (#) and backtrack to do regular scanning
620
+ append_literal(data, ts, ts + 1)
621
+ fexec ts + 1;
638
622
  end
639
623
  };
640
624
 
641
625
  space+ {
642
626
  if free_spacing
643
- emit(:free_space, :whitespace, *text(data, ts, te))
627
+ emit(:free_space, :whitespace, copy(data, ts, te))
644
628
  else
645
629
  append_literal(data, ts, te)
646
630
  end
@@ -649,105 +633,47 @@
649
633
  # Literal: any run of ASCII (printable or non-printable), and/or UTF-8,
650
634
  # except meta characters.
651
635
  # ------------------------------------------------------------------------
652
- (ascii_print -- space)+ |
653
- ascii_nonprint+ |
654
- utf8_2_byte+ |
655
- utf8_3_byte+ |
656
- utf8_4_byte+ {
636
+ (ascii_print -- space)+ | ascii_nonprint+ | utf8_multibyte+ {
657
637
  append_literal(data, ts, te)
658
638
  };
659
639
 
660
640
  *|;
661
641
  }%%
662
642
 
663
- # THIS IS A GENERATED FILE, DO NOT EDIT DIRECTLY
664
- # This file was generated from lib/regexp_parser/scanner/scanner.rl
643
+ require 'regexp_parser/scanner/errors/scanner_error'
644
+ require 'regexp_parser/scanner/errors/premature_end_error'
645
+ require 'regexp_parser/scanner/errors/validation_error'
665
646
 
666
647
  class Regexp::Scanner
667
- # General scanner error (catch all)
668
- class ScannerError < StandardError; end
669
-
670
- # Base for all scanner validation errors
671
- class ValidationError < StandardError
672
- def initialize(reason)
673
- super reason
674
- end
675
- end
676
-
677
- # Unexpected end of pattern
678
- class PrematureEndError < ScannerError
679
- def initialize(where = '')
680
- super "Premature end of pattern at #{where}"
681
- end
682
- end
683
-
684
- # Invalid sequence format. Used for escape sequences, mainly.
685
- class InvalidSequenceError < ValidationError
686
- def initialize(what = 'sequence', where = '')
687
- super "Invalid #{what} at #{where}"
688
- end
689
- end
690
-
691
- # Invalid group. Used for named groups.
692
- class InvalidGroupError < ValidationError
693
- def initialize(what, reason)
694
- super "Invalid #{what}, #{reason}."
695
- end
696
- end
697
-
698
- # Invalid groupOption. Used for inline options.
699
- class InvalidGroupOption < ValidationError
700
- def initialize(option, text)
701
- super "Invalid group option #{option} in #{text}"
702
- end
703
- end
704
-
705
- # Invalid back reference. Used for name a number refs/calls.
706
- class InvalidBackrefError < ValidationError
707
- def initialize(what, reason)
708
- super "Invalid back reference #{what}, #{reason}"
709
- end
710
- end
711
-
712
- # The property name was not recognized by the scanner.
713
- class UnknownUnicodePropertyError < ValidationError
714
- def initialize(name)
715
- super "Unknown unicode character property name #{name}"
716
- end
717
- end
718
-
719
648
  # Scans the given regular expression text, or Regexp object and collects the
720
649
  # emitted token into an array that gets returned at the end. If a block is
721
650
  # given, it gets called for each emitted token.
722
651
  #
723
652
  # This method may raise errors if a syntax error is encountered.
724
653
  # --------------------------------------------------------------------------
725
- def self.scan(input_object, &block)
726
- new.scan(input_object, &block)
654
+ def self.scan(input_object, options: nil, collect_tokens: true, &block)
655
+ new.scan(input_object, options: options, collect_tokens: collect_tokens, &block)
727
656
  end
728
657
 
729
- def scan(input_object, &block)
730
- self.literal = nil
658
+ def scan(input_object, options: nil, collect_tokens: true, &block)
659
+ self.collect_tokens = collect_tokens
660
+ self.literal_run = nil
731
661
  stack = []
732
662
 
733
- if input_object.is_a?(Regexp)
734
- input = input_object.source
735
- self.free_spacing = (input_object.options & Regexp::EXTENDED != 0)
736
- else
737
- input = input_object
738
- self.free_spacing = false
739
- end
663
+ input = input_object.is_a?(Regexp) ? input_object.source : input_object
664
+ self.free_spacing = free_spacing?(input_object, options)
740
665
  self.spacing_stack = [{:free_spacing => free_spacing, :depth => 0}]
741
666
 
742
- data = input.unpack("c*") if input.is_a?(String)
667
+ data = input.unpack("c*")
743
668
  eof = data.length
744
669
 
745
670
  self.tokens = []
746
- self.block = block_given? ? block : nil
671
+ self.block = block
747
672
 
748
673
  self.set_depth = 0
749
674
  self.group_depth = 0
750
675
  self.conditional_stack = []
676
+ self.char_pos = 0
751
677
 
752
678
  %% write data;
753
679
  %% write init;
@@ -757,7 +683,7 @@ class Regexp::Scanner
757
683
  testEof = testEof
758
684
 
759
685
  if cs == re_scanner_error
760
- text = ts ? copy(data, ts-1..-1) : data.pack('c*')
686
+ text = copy(data, ts ? ts-1 : 0, -1)
761
687
  raise ScannerError.new("Scan error at '#{text}'")
762
688
  end
763
689
 
@@ -767,40 +693,76 @@ class Regexp::Scanner
767
693
  "[#{set_depth}]") if in_set?
768
694
 
769
695
  # when the entire expression is a literal run
770
- emit_literal if literal
696
+ emit_literal if literal_run
771
697
 
772
698
  tokens
773
699
  end
774
700
 
775
701
  # lazy-load property maps when first needed
776
- require 'yaml'
777
- PROP_MAPS_DIR = File.expand_path('../scanner/properties', __FILE__)
778
-
779
702
  def self.short_prop_map
780
- @short_prop_map ||= YAML.load_file("#{PROP_MAPS_DIR}/short.yml")
703
+ @short_prop_map ||= parse_prop_map('short')
781
704
  end
782
705
 
783
706
  def self.long_prop_map
784
- @long_prop_map ||= YAML.load_file("#{PROP_MAPS_DIR}/long.yml")
707
+ @long_prop_map ||= parse_prop_map('long')
708
+ end
709
+
710
+ def self.parse_prop_map(name)
711
+ File.read("#{__dir__}/scanner/properties/#{name}.csv").scan(/(.+),(.+)/).to_h
712
+ end
713
+
714
+ def self.posix_classes
715
+ %w[alnum alpha ascii blank cntrl digit graph
716
+ lower print punct space upper word xdigit]
785
717
  end
786
718
 
787
719
  # Emits an array with the details of the scanned pattern
788
- def emit(type, token, text, ts, te)
720
+ def emit(type, token, text)
789
721
  #puts "EMIT: type: #{type}, token: #{token}, text: #{text}, ts: #{ts}, te: #{te}"
790
722
 
791
- emit_literal if literal
723
+ emit_literal if literal_run
724
+
725
+ # Ragel runs with byte-based indices (ts, te). These are of little value to
726
+ # end-users, so we keep track of char-based indices and emit those instead.
727
+ ts_char_pos = char_pos
728
+ te_char_pos = char_pos + text.length
729
+
730
+ tok = [type, token, text, ts_char_pos, te_char_pos]
731
+
732
+ self.prev_token = tok
733
+
734
+ self.char_pos = te_char_pos
792
735
 
793
736
  if block
794
- block.call type, token, text, ts, te
737
+ block.call type, token, text, ts_char_pos, te_char_pos
738
+ # TODO: in v3.0.0, remove `collect_tokens:` kwarg and only collect if no block given
739
+ tokens << tok if collect_tokens
740
+ elsif collect_tokens
741
+ tokens << tok
795
742
  end
796
-
797
- tokens << [type, token, text, ts, te]
798
743
  end
799
744
 
745
+ attr_accessor :literal_run # only public for #||= to work on ruby <= 2.5
746
+
800
747
  private
801
748
 
802
- attr_accessor :tokens, :literal, :block, :free_spacing, :spacing_stack,
803
- :group_depth, :set_depth, :conditional_stack
749
+ attr_accessor :block,
750
+ :collect_tokens, :tokens, :prev_token,
751
+ :free_spacing, :spacing_stack,
752
+ :group_depth, :set_depth, :conditional_stack,
753
+ :char_pos
754
+
755
+ def free_spacing?(input_object, options)
756
+ if options && !input_object.is_a?(String)
757
+ raise ArgumentError, 'options cannot be supplied unless scanning a String'
758
+ end
759
+
760
+ options = input_object.options if input_object.is_a?(::Regexp)
761
+
762
+ return false unless options
763
+
764
+ options & Regexp::EXTENDED != 0
765
+ end
804
766
 
805
767
  def in_group?
806
768
  group_depth > 0
@@ -811,36 +773,24 @@ class Regexp::Scanner
811
773
  end
812
774
 
813
775
  # Copy from ts to te from data as text
814
- def copy(data, range)
815
- data[range].pack('c*')
816
- end
817
-
818
- # Copy from ts to te from data as text, returning an array with the text
819
- # and the offsets used to copy it.
820
- def text(data, ts, te, soff = 0)
821
- [copy(data, ts-soff..te-1), ts-soff, te]
776
+ def copy(data, ts, te)
777
+ data[ts...te].pack('c*').force_encoding('utf-8')
822
778
  end
823
779
 
824
780
  # Appends one or more characters to the literal buffer, to be emitted later
825
- # by a call to emit_literal. Contents can be a mix of ASCII and UTF-8.
781
+ # by a call to emit_literal.
826
782
  def append_literal(data, ts, te)
827
- self.literal = literal || []
828
- literal << text(data, ts, te)
783
+ (self.literal_run ||= []) << copy(data, ts, te)
829
784
  end
830
785
 
831
- # Emits the literal run collected by calls to the append_literal method,
832
- # using the total start (ts) and end (te) offsets of the run.
786
+ # Emits the literal run collected by calls to the append_literal method.
833
787
  def emit_literal
834
- ts, te = literal.first[1], literal.last[2]
835
- text = literal.map {|t| t[0]}.join
836
-
837
- text.force_encoding('utf-8') if text.respond_to?(:force_encoding)
838
-
839
- self.literal = nil
840
- emit(:literal, :literal, text, ts, te)
788
+ text = literal_run.join
789
+ self.literal_run = nil
790
+ emit(:literal, :literal, text)
841
791
  end
842
792
 
843
- def emit_options(text, ts, te)
793
+ def emit_options(text)
844
794
  token = nil
845
795
 
846
796
  # Ruby allows things like '(?-xxxx)' or '(?xx-xx--xx-:abc)'.
@@ -866,28 +816,13 @@ class Regexp::Scanner
866
816
  token = :options_switch
867
817
  end
868
818
 
869
- emit(:group, token, text, ts, te)
819
+ emit(:group, token, text)
870
820
  end
871
821
 
872
822
  def emit_meta_control_sequence(data, ts, te, token)
873
823
  if data.last < 0x00 || data.last > 0x7F
874
- validation_error(:sequence, 'escape', token.to_s)
875
- end
876
- emit(:escape, token, *text(data, ts, te, 1))
877
- end
878
-
879
- # Centralizes and unifies the handling of validation related
880
- # errors.
881
- def validation_error(type, what, reason)
882
- case type
883
- when :group
884
- error = InvalidGroupError.new(what, reason)
885
- when :backref
886
- error = InvalidBackrefError.new(what, reason)
887
- when :sequence
888
- error = InvalidSequenceError.new(what, reason)
824
+ raise ValidationError.for(:sequence, 'escape', token.to_s)
889
825
  end
890
-
891
- raise error # unless @@config.validation_ignore
826
+ emit(:escape, token, copy(data, ts-1, te))
892
827
  end
893
828
  end # class Regexp::Scanner