regexp_parser 0.1.1 → 0.1.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (61)
  1. checksums.yaml +7 -0
  2. data/ChangeLog +45 -0
  3. data/Rakefile +12 -44
  4. data/VERSION.yml +5 -0
  5. data/lib/regexp_parser.rb +5 -38
  6. data/lib/regexp_parser/expression.rb +68 -221
  7. data/lib/regexp_parser/expression/classes/alternation.rb +47 -0
  8. data/lib/regexp_parser/expression/classes/anchor.rb +26 -0
  9. data/lib/regexp_parser/expression/classes/backref.rb +42 -0
  10. data/lib/regexp_parser/expression/classes/escape.rb +27 -0
  11. data/lib/regexp_parser/expression/classes/group.rb +67 -0
  12. data/lib/regexp_parser/expression/classes/literal.rb +7 -0
  13. data/lib/regexp_parser/expression/{property.rb → classes/property.rb} +1 -1
  14. data/lib/regexp_parser/expression/classes/root.rb +26 -0
  15. data/lib/regexp_parser/expression/classes/set.rb +100 -0
  16. data/lib/regexp_parser/expression/classes/type.rb +17 -0
  17. data/lib/regexp_parser/expression/quantifier.rb +26 -0
  18. data/lib/regexp_parser/expression/subexpression.rb +69 -0
  19. data/lib/regexp_parser/lexer.rb +4 -4
  20. data/lib/regexp_parser/parser.rb +31 -13
  21. data/lib/regexp_parser/scanner.rb +1849 -1488
  22. data/lib/regexp_parser/scanner/property.rl +7 -2
  23. data/lib/regexp_parser/scanner/scanner.rl +377 -191
  24. data/lib/regexp_parser/syntax.rb +7 -0
  25. data/lib/regexp_parser/syntax/ruby/1.8.6.rb +4 -4
  26. data/lib/regexp_parser/syntax/ruby/1.9.1.rb +9 -9
  27. data/lib/regexp_parser/syntax/ruby/2.0.0.rb +16 -0
  28. data/lib/regexp_parser/syntax/ruby/2.1.0.rb +13 -0
  29. data/lib/regexp_parser/syntax/tokens.rb +21 -320
  30. data/lib/regexp_parser/syntax/tokens/anchor.rb +17 -0
  31. data/lib/regexp_parser/syntax/tokens/assertion.rb +15 -0
  32. data/lib/regexp_parser/syntax/tokens/backref.rb +26 -0
  33. data/lib/regexp_parser/syntax/tokens/character_set.rb +48 -0
  34. data/lib/regexp_parser/syntax/tokens/character_type.rb +16 -0
  35. data/lib/regexp_parser/syntax/tokens/escape.rb +29 -0
  36. data/lib/regexp_parser/syntax/tokens/group.rb +22 -0
  37. data/lib/regexp_parser/syntax/tokens/meta.rb +15 -0
  38. data/lib/regexp_parser/syntax/tokens/quantifier.rb +37 -0
  39. data/lib/regexp_parser/syntax/tokens/unicode_property.rb +204 -0
  40. data/lib/regexp_parser/token.rb +37 -0
  41. data/test/expression/test_all.rb +7 -0
  42. data/test/expression/test_base.rb +72 -0
  43. data/test/expression/test_clone.rb +144 -0
  44. data/test/{parser/test_expression.rb → expression/test_to_s.rb} +10 -10
  45. data/test/helpers.rb +1 -0
  46. data/test/parser/test_all.rb +1 -1
  47. data/test/parser/test_alternation.rb +35 -0
  48. data/test/parser/test_anchors.rb +2 -2
  49. data/test/parser/test_refcalls.rb +1 -1
  50. data/test/parser/test_sets.rb +54 -8
  51. data/test/scanner/test_anchors.rb +2 -2
  52. data/test/scanner/test_conditionals.rb +31 -0
  53. data/test/scanner/test_errors.rb +88 -8
  54. data/test/scanner/test_escapes.rb +4 -4
  55. data/test/scanner/test_groups.rb +7 -0
  56. data/test/scanner/test_quoting.rb +29 -0
  57. data/test/scanner/test_sets.rb +1 -0
  58. data/test/syntax/ruby/test_1.8.rb +3 -3
  59. data/test/test_all.rb +1 -1
  60. metadata +62 -48
  61. data/lib/regexp_parser/expression/set.rb +0 -59
@@ -50,6 +50,13 @@ module Regexp::Syntax
50
50
  # alias for the latest 1.9 implementation
51
51
  when 'ruby/1.9'; syntax = Regexp::Syntax::Ruby::V19.new
52
52
 
53
+ when 'ruby/2.0.0'; syntax = Regexp::Syntax::Ruby::V20.new
54
+ when 'ruby/2.1.0'; syntax = Regexp::Syntax::Ruby::V21.new
55
+
56
+ # aliases for the latest 2.x implementations
57
+ when 'ruby/2.0'; syntax = Regexp::Syntax::Ruby::V20.new
58
+ when 'ruby/2.1'; syntax = Regexp::Syntax::Ruby::V21.new
59
+
53
60
  else
54
61
  raise UnknownSyntaxError.new(name)
55
62
  end
@@ -8,10 +8,10 @@ module Regexp::Syntax
8
8
  super
9
9
 
10
10
  implements :anchor, Anchor::All
11
- implements :assertion, Group::Assertion::Lookahead
11
+ implements :assertion, Assertion::Lookahead
12
12
  implements :backref, [:number]
13
13
 
14
- implements :escape,
14
+ implements :escape,
15
15
  Escape::Basic + Escape::Backreference +
16
16
  Escape::ASCII + Escape::Meta + Escape::Control
17
17
 
@@ -19,13 +19,13 @@ module Regexp::Syntax
19
19
 
20
20
  implements :meta, Meta::Extended
21
21
 
22
- implements :quantifier,
22
+ implements :quantifier,
23
23
  Quantifier::Greedy + Quantifier::Reluctant +
24
24
  Quantifier::Interval + Quantifier::IntervalReluctant
25
25
 
26
26
  implements :set, CharacterSet::OpenClose +
27
27
  CharacterSet::Extended + CharacterSet::Types +
28
- CharacterSet::POSIX::Standard
28
+ CharacterSet::POSIX::Standard
29
29
 
30
30
  implements :type,
31
31
  CharacterType::Extended
@@ -9,24 +9,24 @@ module Regexp::Syntax
9
9
  def initialize
10
10
  super
11
11
 
12
- implements :assertion, Group::Assertion::Lookbehind +
13
- Group::SubexpressionCall::All
12
+ implements :assertion, Assertion::Lookbehind +
13
+ SubexpressionCall::All
14
14
 
15
- implements :backref, Group::Backreference::All +
16
- Group::SubexpressionCall::All
15
+ implements :backref, Backreference::All +
16
+ SubexpressionCall::All
17
17
 
18
18
  implements :escape, CharacterType::Hex
19
19
 
20
- implements :property,
20
+ implements :property,
21
21
  UnicodeProperty::All
22
22
 
23
- implements :nonproperty,
23
+ implements :nonproperty,
24
24
  UnicodeProperty::All
25
25
 
26
- implements :quantifier,
26
+ implements :quantifier,
27
27
  Quantifier::Possessive + Quantifier::IntervalPossessive
28
28
 
29
- implements :set,
29
+ implements :set,
30
30
  CharacterSet::POSIX::StandardNegative +
31
31
  CharacterSet::POSIX::Extensions +
32
32
  CharacterSet::POSIX::ExtensionsNegative +
@@ -34,7 +34,7 @@ module Regexp::Syntax
34
34
 
35
35
  implements :subset, CharacterSet::OpenClose +
36
36
  CharacterSet::Extended + CharacterSet::Types +
37
- CharacterSet::POSIX::Standard
37
+ CharacterSet::POSIX::Standard
38
38
  end
39
39
 
40
40
  end
@@ -0,0 +1,16 @@
1
+ require File.expand_path('../1.9.3', __FILE__)
2
+
3
+ module Regexp::Syntax
4
+ module Ruby
5
+
6
+ # use the last 1.9 release as the base
7
+ class V20 < Regexp::Syntax::Ruby::V193
8
+ def initialize
9
+ super
10
+
11
+ #implements :escape, CharacterType::Hex
12
+ end
13
+ end
14
+
15
+ end
16
+ end
@@ -0,0 +1,13 @@
1
+ require File.expand_path('../2.0.0', __FILE__)
2
+
3
+ module Regexp::Syntax
4
+ module Ruby
5
+
6
+ class V21 < Regexp::Syntax::Ruby::V20
7
+ def initialize
8
+ super
9
+ end
10
+ end
11
+
12
+ end
13
+ end
@@ -1,334 +1,35 @@
1
+ # Define the base module and the simplest of tokens.
1
2
  module Regexp::Syntax
2
-
3
3
  module Token
4
+ Map = {}
4
5
 
5
- # -------------------------------------------------------------------------
6
- module Anchor
7
- Basic = [:beginning_of_line, :end_of_line]
8
- Extended = Basic + [:word_boundary, :nonword_boundary]
9
- String = [:bos, :eos, :eos_ob_eol]
10
- MatchStart = [:match_start]
11
-
12
- All = Extended + String + MatchStart
13
- end
14
-
15
-
16
- # -------------------------------------------------------------------------
17
- module CharacterSet
18
- OpenClose = [:open, :close]
19
-
20
- Basic = [:negate, :member, :range]
21
- Extended = Basic + [:escape, :intersection, :range_hex, :backspace]
22
-
23
- Types = [:type_digit, :type_nondigit, :type_hex, :type_nonhex,
24
- :type_space, :type_nonspace, :type_word, :type_nonword]
25
-
26
- module POSIX
27
- Standard = [:class_alnum, :class_alpha, :class_blank, :class_cntrl,
28
- :class_digit, :class_graph, :class_lower, :class_print,
29
- :class_punct, :class_space, :class_upper, :class_xdigit]
30
-
31
- StandardNegative = [
32
- :class_nonalnum, :class_nonalpha, :class_nonblank,
33
- :class_noncntrl, :class_nondigit, :class_nongraph,
34
- :class_nonlower, :class_nonprint, :class_nonpunct,
35
- :class_nonspace, :class_nonupper, :class_nonxdigit]
36
-
37
- Extensions = [:class_ascii, :class_word]
38
- ExtensionsNegative = [:class_nonascii, :class_nonword]
39
-
40
- All = Standard + StandardNegative +
41
- Extensions + ExtensionsNegative
42
- end
43
-
44
- All = Basic + Extended + Types + POSIX::All
45
-
46
- module SubSet
47
- OpenClose = [:open, :close]
48
- All = CharacterSet::All
49
- end
50
- end
51
-
52
-
53
- # -------------------------------------------------------------------------
54
- module CharacterType
55
- Basic = []
56
- Extended = [:digit, :nondigit, :space, :nonspace, :word, :nonword]
57
- Hex = [:hex, :nonhex]
58
-
59
- All = Basic + Extended + Hex
60
- end
61
-
62
-
63
- # -------------------------------------------------------------------------
64
- module Escape
65
- Basic = [:backslash, :literal]
66
-
67
- Backreference = [:digit]
68
-
69
- Control = [:control, :meta_sequence]
70
-
71
- ASCII = [:bell, :backspace, :escape, :form_feed, :newline, :carriage,
72
- :space, :tab, :vertical_tab]
73
-
74
- Meta = [:dot, :alternation, :zero_or_one, :zero_or_more, :one_or_more,
75
- :beginning_of_line, :end_of_line, :group_open, :group_close,
76
- :interval_open, :interval_close, :set_open, :set_close, :baclslash]
77
-
78
- All = Basic + Backreference + ASCII + Meta
79
- end
80
-
81
-
82
- # -------------------------------------------------------------------------
83
- module Group
84
- Basic = [:capture, :close]
85
- Extended = Basic + [:options]
86
-
87
- Named = [:named]
88
- Atomic = [:atomic]
89
- Passive = [:passive]
90
- Comment = [:comment]
91
-
92
- module Assertion
93
- Lookahead = [:lookahead, :nlookahead]
94
- Lookbehind = [:lookbehind, :nlookbehind]
95
-
96
- All = Lookahead + Lookbehind
97
- end
98
-
99
- module Backreference
100
- Name = [:name_ref]
101
- Number = [:number_ref, :number_rel_ref]
102
-
103
- NestLevel = [:name_nest_ref, :number_nest_ref]
104
-
105
- All = Name + Number + NestLevel
106
- end
107
-
108
- module SubexpressionCall
109
- Name = [:name_call]
110
- Number = [:number_call, :number_rel_call]
111
-
112
- All = Name + Number
113
- end
114
-
115
- All = Group::Extended + Group::Named + Group::Atomic +
116
- Group::Passive + Group::Comment
6
+ module Literal
7
+ All = [:literal]
8
+ Type = :literal
117
9
  end
118
10
 
11
+ Map[Literal::Type] = Literal::All
12
+ end
13
+ end
119
14
 
120
- # -------------------------------------------------------------------------
121
- module Meta
122
- Basic = [:dot]
123
- Extended = Basic + [:alternation]
124
- end
125
15
 
16
+ # Load all the token files, they will populate the Map constant.
17
+ Dir[File.dirname(__FILE__) + '/tokens/*.rb'].each {|f| require f }
126
18
 
127
- # -------------------------------------------------------------------------
128
- module Quantifier
129
- Greedy = [:zero_or_one, :zero_or_more, :one_or_more]
130
- Reluctant = [:zero_or_one_reluctant, :zero_or_more_reluctant, :one_or_more_reluctant]
131
- Possessive = [:zero_or_one_possessive, :zero_or_more_possessive, :one_or_more_possessive]
132
19
 
133
- Interval = [:interval]
134
- IntervalReluctant = [:interval_reluctant]
135
- IntervalPossessive = [:interval_possessive]
20
+ # After loading all the tokens the map is full. Extract all tokens and types
21
+ # into the All and Types constants.
22
+ module Regexp::Syntax
23
+ module Token
24
+ if RUBY_VERSION >= '1.9'
25
+ All = Map.map {|k,v| v}.flatten.uniq.sort
26
+ else
27
+ All = Map.map {|k,v| v}.flatten.uniq
136
28
  end
137
29
 
30
+ Types = Map.keys
138
31
 
139
- # -------------------------------------------------------------------------
140
- module UnicodeProperty
141
- Type = [:alnum, :alpha, :ascii, :blank, :cntrl, :digit, :graph, :lower,
142
- :print, :punct, :space, :upper, :word, :xdigit]
143
-
144
- POSIX = [:any, :assigned, :newline]
145
-
146
- module Category
147
- Letter = [:letter_any, :letter_uppercase, :letter_lowercase,
148
- :letter_titlecase, :letter_modifier, :letter_other]
149
-
150
- Mark = [:mark_any, :mark_nonspacing, :mark_spacing,
151
- :mark_enclosing]
152
-
153
- Number = [:number_any, :number_decimal, :number_letter,
154
- :number_other]
155
-
156
- Punctuation = [:punct_any, :punct_connector, :punct_dash,
157
- :punct_open, :punct_close, :punct_initial,
158
- :punct_final, :punct_other]
159
-
160
- Symbol = [:symbol_any, :symbol_math, :symbol_currency,
161
- :symbol_modifier, :symbol_other]
162
-
163
- Separator = [:separator_any, :separator_space, :separator_line,
164
- :separator_para]
165
-
166
- Codepoint = [:other, :control, :format,
167
- :surrogate, :private_use, :unassigned]
168
-
169
- All = Letter + Mark + Number + Punctuation +
170
- Symbol + Separator + Codepoint
171
- end
172
-
173
- Age = [:age_1_1, :age_2_0, :age_2_1, :age_3_0, :age_3_1,
174
- :age_3_2, :age_4_0, :age_4_1, :age_5_0, :age_5_1,
175
- :age_5_2, :age_6_0]
176
-
177
- Derived = [
178
- :ascii_hex,
179
- :alphabetic,
180
- :cased,
181
- :changes_when_casefolded,
182
- :changes_when_casemapped,
183
- :changes_when_lowercased,
184
- :changes_when_titlecased,
185
- :changes_when_uppercased,
186
- :case_ignorable,
187
- :bidi_control,
188
- :dash,
189
- :deprecated,
190
- :default_ignorable_cp,
191
- :diacritic,
192
- :extender,
193
- :grapheme_base,
194
- :grapheme_extend,
195
- :grapheme_link,
196
- :hex_digit,
197
- :hyphen,
198
- :id_continue,
199
- :ideographic,
200
- :id_start,
201
- :ids_binary_op,
202
- :ids_trinary_op,
203
- :join_control,
204
- :logical_order_exception,
205
- :lowercase,
206
- :math,
207
- :non_character_cp,
208
- :other_alphabetic,
209
- :other_default_ignorable_cp,
210
- :other_grapheme_extended,
211
- :other_id_continue,
212
- :other_id_start,
213
- :other_lowercase,
214
- :other_math,
215
- :other_uppercase,
216
- :pattern_syntax,
217
- :pattern_whitespace,
218
- :quotation_mark,
219
- :radical,
220
- :soft_dotted,
221
- :sentence_terminal,
222
- :terminal_punctuation,
223
- :unified_ideograph,
224
- :uppercase,
225
- :variation_selector,
226
- :whitespace,
227
- :xid_start,
228
- :xid_continue,
229
- ]
230
-
231
- Script =[
232
- :script_arabic,
233
- :script_imperial_aramaic,
234
- :script_armenian,
235
- :script_avestan,
236
- :script_balinese,
237
- :script_bamum,
238
- :script_bengali,
239
- :script_bopomofo,
240
- :script_braille,
241
- :script_buginese,
242
- :script_buhid,
243
- :script_canadian_aboriginal,
244
- :script_carian,
245
- :script_cham,
246
- :script_cherokee,
247
- :script_coptic,
248
- :script_cypriot,
249
- :script_cyrillic,
250
- :script_devanagari,
251
- :script_deseret,
252
- :script_egyptian_hieroglyphs,
253
- :script_ethiopic,
254
- :script_georgian,
255
- :script_glagolitic,
256
- :script_gothic,
257
- :script_greek,
258
- :script_gujarati,
259
- :script_gurmukhi,
260
- :script_hangul,
261
- :script_han,
262
- :script_hanunoo,
263
- :script_hebrew,
264
- :script_hiragana,
265
- :script_katakana_or_hiragana,
266
- :script_old_italic,
267
- :script_javanese,
268
- :script_kayah_li,
269
- :script_katakana,
270
- :script_kharoshthi,
271
- :script_khmer,
272
- :script_kannada,
273
- :script_kaithi,
274
- :script_tai_tham,
275
- :script_lao,
276
- :script_latin,
277
- :script_lepcha,
278
- :script_limbu,
279
- :script_linear_b,
280
- :script_lisu,
281
- :script_lycian,
282
- :script_lydian,
283
- :script_malayalam,
284
- :script_mongolian,
285
- :script_meetei_mayek,
286
- :script_myanmar,
287
- :script_nko,
288
- :script_ogham,
289
- :script_ol_chiki,
290
- :script_old_turkic,
291
- :script_oriya,
292
- :script_osmanya,
293
- :script_phags_pa,
294
- :script_inscriptional_pahlavi,
295
- :script_phoenician,
296
- :script_inscriptional_parthian,
297
- :script_rejang,
298
- :script_runic,
299
- :script_samaritan,
300
- :script_old_south_arabian,
301
- :script_saurashtra,
302
- :script_shavian,
303
- :script_sinhala,
304
- :script_sundanese,
305
- :script_syloti_nagri,
306
- :script_syriac,
307
- :script_tagbanwa,
308
- :script_tai_le,
309
- :script_new_tai_lue,
310
- :script_tamil,
311
- :script_tai_viet,
312
- :script_telugu,
313
- :script_tifinagh,
314
- :script_tagalog,
315
- :script_thaana,
316
- :script_thai,
317
- :script_tibetan,
318
- :script_ugaritic,
319
- :script_vai,
320
- :script_old_persian,
321
- :script_cuneiform,
322
- :script_yi,
323
- :script_inherited,
324
- :script_common,
325
- :script_unknown
326
- ]
327
-
328
- Script_6_0 = [:script_brahmi, :script_batak, :script_mandaic]
329
-
330
- All = Type + POSIX + Category::All + Age + Derived + Script
331
- end
32
+ All.freeze
33
+ Types.freeze
332
34
  end
333
-
334
35
  end