regexp_parser 0.1.1 → 0.1.5

Sign up to get free protection for your applications and to get access to all the features.
Files changed (61)
  1. checksums.yaml +7 -0
  2. data/ChangeLog +45 -0
  3. data/Rakefile +12 -44
  4. data/VERSION.yml +5 -0
  5. data/lib/regexp_parser.rb +5 -38
  6. data/lib/regexp_parser/expression.rb +68 -221
  7. data/lib/regexp_parser/expression/classes/alternation.rb +47 -0
  8. data/lib/regexp_parser/expression/classes/anchor.rb +26 -0
  9. data/lib/regexp_parser/expression/classes/backref.rb +42 -0
  10. data/lib/regexp_parser/expression/classes/escape.rb +27 -0
  11. data/lib/regexp_parser/expression/classes/group.rb +67 -0
  12. data/lib/regexp_parser/expression/classes/literal.rb +7 -0
  13. data/lib/regexp_parser/expression/{property.rb → classes/property.rb} +1 -1
  14. data/lib/regexp_parser/expression/classes/root.rb +26 -0
  15. data/lib/regexp_parser/expression/classes/set.rb +100 -0
  16. data/lib/regexp_parser/expression/classes/type.rb +17 -0
  17. data/lib/regexp_parser/expression/quantifier.rb +26 -0
  18. data/lib/regexp_parser/expression/subexpression.rb +69 -0
  19. data/lib/regexp_parser/lexer.rb +4 -4
  20. data/lib/regexp_parser/parser.rb +31 -13
  21. data/lib/regexp_parser/scanner.rb +1849 -1488
  22. data/lib/regexp_parser/scanner/property.rl +7 -2
  23. data/lib/regexp_parser/scanner/scanner.rl +377 -191
  24. data/lib/regexp_parser/syntax.rb +7 -0
  25. data/lib/regexp_parser/syntax/ruby/1.8.6.rb +4 -4
  26. data/lib/regexp_parser/syntax/ruby/1.9.1.rb +9 -9
  27. data/lib/regexp_parser/syntax/ruby/2.0.0.rb +16 -0
  28. data/lib/regexp_parser/syntax/ruby/2.1.0.rb +13 -0
  29. data/lib/regexp_parser/syntax/tokens.rb +21 -320
  30. data/lib/regexp_parser/syntax/tokens/anchor.rb +17 -0
  31. data/lib/regexp_parser/syntax/tokens/assertion.rb +15 -0
  32. data/lib/regexp_parser/syntax/tokens/backref.rb +26 -0
  33. data/lib/regexp_parser/syntax/tokens/character_set.rb +48 -0
  34. data/lib/regexp_parser/syntax/tokens/character_type.rb +16 -0
  35. data/lib/regexp_parser/syntax/tokens/escape.rb +29 -0
  36. data/lib/regexp_parser/syntax/tokens/group.rb +22 -0
  37. data/lib/regexp_parser/syntax/tokens/meta.rb +15 -0
  38. data/lib/regexp_parser/syntax/tokens/quantifier.rb +37 -0
  39. data/lib/regexp_parser/syntax/tokens/unicode_property.rb +204 -0
  40. data/lib/regexp_parser/token.rb +37 -0
  41. data/test/expression/test_all.rb +7 -0
  42. data/test/expression/test_base.rb +72 -0
  43. data/test/expression/test_clone.rb +144 -0
  44. data/test/{parser/test_expression.rb → expression/test_to_s.rb} +10 -10
  45. data/test/helpers.rb +1 -0
  46. data/test/parser/test_all.rb +1 -1
  47. data/test/parser/test_alternation.rb +35 -0
  48. data/test/parser/test_anchors.rb +2 -2
  49. data/test/parser/test_refcalls.rb +1 -1
  50. data/test/parser/test_sets.rb +54 -8
  51. data/test/scanner/test_anchors.rb +2 -2
  52. data/test/scanner/test_conditionals.rb +31 -0
  53. data/test/scanner/test_errors.rb +88 -8
  54. data/test/scanner/test_escapes.rb +4 -4
  55. data/test/scanner/test_groups.rb +7 -0
  56. data/test/scanner/test_quoting.rb +29 -0
  57. data/test/scanner/test_sets.rb +1 -0
  58. data/test/syntax/ruby/test_1.8.rb +3 -3
  59. data/test/test_all.rb +1 -1
  60. metadata +62 -48
  61. data/lib/regexp_parser/expression/set.rb +0 -59
@@ -50,6 +50,13 @@ module Regexp::Syntax
50
50
  # alias for the latest 1.9 implementation
51
51
  when 'ruby/1.9'; syntax = Regexp::Syntax::Ruby::V19.new
52
52
 
53
+ when 'ruby/2.0.0'; syntax = Regexp::Syntax::Ruby::V20.new
54
+ when 'ruby/2.1.0'; syntax = Regexp::Syntax::Ruby::V21.new
55
+
56
+ # aliases for the latest 2.x implementations
57
+ when 'ruby/2.0'; syntax = Regexp::Syntax::Ruby::V20.new
58
+ when 'ruby/2.1'; syntax = Regexp::Syntax::Ruby::V21.new
59
+
53
60
  else
54
61
  raise UnknownSyntaxError.new(name)
55
62
  end
@@ -8,10 +8,10 @@ module Regexp::Syntax
8
8
  super
9
9
 
10
10
  implements :anchor, Anchor::All
11
- implements :assertion, Group::Assertion::Lookahead
11
+ implements :assertion, Assertion::Lookahead
12
12
  implements :backref, [:number]
13
13
 
14
- implements :escape,
14
+ implements :escape,
15
15
  Escape::Basic + Escape::Backreference +
16
16
  Escape::ASCII + Escape::Meta + Escape::Control
17
17
 
@@ -19,13 +19,13 @@ module Regexp::Syntax
19
19
 
20
20
  implements :meta, Meta::Extended
21
21
 
22
- implements :quantifier,
22
+ implements :quantifier,
23
23
  Quantifier::Greedy + Quantifier::Reluctant +
24
24
  Quantifier::Interval + Quantifier::IntervalReluctant
25
25
 
26
26
  implements :set, CharacterSet::OpenClose +
27
27
  CharacterSet::Extended + CharacterSet::Types +
28
- CharacterSet::POSIX::Standard
28
+ CharacterSet::POSIX::Standard
29
29
 
30
30
  implements :type,
31
31
  CharacterType::Extended
@@ -9,24 +9,24 @@ module Regexp::Syntax
9
9
  def initialize
10
10
  super
11
11
 
12
- implements :assertion, Group::Assertion::Lookbehind +
13
- Group::SubexpressionCall::All
12
+ implements :assertion, Assertion::Lookbehind +
13
+ SubexpressionCall::All
14
14
 
15
- implements :backref, Group::Backreference::All +
16
- Group::SubexpressionCall::All
15
+ implements :backref, Backreference::All +
16
+ SubexpressionCall::All
17
17
 
18
18
  implements :escape, CharacterType::Hex
19
19
 
20
- implements :property,
20
+ implements :property,
21
21
  UnicodeProperty::All
22
22
 
23
- implements :nonproperty,
23
+ implements :nonproperty,
24
24
  UnicodeProperty::All
25
25
 
26
- implements :quantifier,
26
+ implements :quantifier,
27
27
  Quantifier::Possessive + Quantifier::IntervalPossessive
28
28
 
29
- implements :set,
29
+ implements :set,
30
30
  CharacterSet::POSIX::StandardNegative +
31
31
  CharacterSet::POSIX::Extensions +
32
32
  CharacterSet::POSIX::ExtensionsNegative +
@@ -34,7 +34,7 @@ module Regexp::Syntax
34
34
 
35
35
  implements :subset, CharacterSet::OpenClose +
36
36
  CharacterSet::Extended + CharacterSet::Types +
37
- CharacterSet::POSIX::Standard
37
+ CharacterSet::POSIX::Standard
38
38
  end
39
39
 
40
40
  end
@@ -0,0 +1,16 @@
1
+ require File.expand_path('../1.9.3', __FILE__)
2
+
3
+ module Regexp::Syntax
4
+ module Ruby
5
+
6
+ # use the last 1.9 release as the base
7
+ class V20 < Regexp::Syntax::Ruby::V193
8
+ def initialize
9
+ super
10
+
11
+ #implements :escape, CharacterType::Hex
12
+ end
13
+ end
14
+
15
+ end
16
+ end
@@ -0,0 +1,13 @@
1
+ require File.expand_path('../2.0.0', __FILE__)
2
+
3
+ module Regexp::Syntax
4
+ module Ruby
5
+
6
+ class V21 < Regexp::Syntax::Ruby::V20
7
+ def initialize
8
+ super
9
+ end
10
+ end
11
+
12
+ end
13
+ end
@@ -1,334 +1,35 @@
1
+ # Define the base module and the simplest of tokens.
1
2
  module Regexp::Syntax
2
-
3
3
  module Token
4
+ Map = {}
4
5
 
5
- # -------------------------------------------------------------------------
6
- module Anchor
7
- Basic = [:beginning_of_line, :end_of_line]
8
- Extended = Basic + [:word_boundary, :nonword_boundary]
9
- String = [:bos, :eos, :eos_ob_eol]
10
- MatchStart = [:match_start]
11
-
12
- All = Extended + String + MatchStart
13
- end
14
-
15
-
16
- # -------------------------------------------------------------------------
17
- module CharacterSet
18
- OpenClose = [:open, :close]
19
-
20
- Basic = [:negate, :member, :range]
21
- Extended = Basic + [:escape, :intersection, :range_hex, :backspace]
22
-
23
- Types = [:type_digit, :type_nondigit, :type_hex, :type_nonhex,
24
- :type_space, :type_nonspace, :type_word, :type_nonword]
25
-
26
- module POSIX
27
- Standard = [:class_alnum, :class_alpha, :class_blank, :class_cntrl,
28
- :class_digit, :class_graph, :class_lower, :class_print,
29
- :class_punct, :class_space, :class_upper, :class_xdigit]
30
-
31
- StandardNegative = [
32
- :class_nonalnum, :class_nonalpha, :class_nonblank,
33
- :class_noncntrl, :class_nondigit, :class_nongraph,
34
- :class_nonlower, :class_nonprint, :class_nonpunct,
35
- :class_nonspace, :class_nonupper, :class_nonxdigit]
36
-
37
- Extensions = [:class_ascii, :class_word]
38
- ExtensionsNegative = [:class_nonascii, :class_nonword]
39
-
40
- All = Standard + StandardNegative +
41
- Extensions + ExtensionsNegative
42
- end
43
-
44
- All = Basic + Extended + Types + POSIX::All
45
-
46
- module SubSet
47
- OpenClose = [:open, :close]
48
- All = CharacterSet::All
49
- end
50
- end
51
-
52
-
53
- # -------------------------------------------------------------------------
54
- module CharacterType
55
- Basic = []
56
- Extended = [:digit, :nondigit, :space, :nonspace, :word, :nonword]
57
- Hex = [:hex, :nonhex]
58
-
59
- All = Basic + Extended + Hex
60
- end
61
-
62
-
63
- # -------------------------------------------------------------------------
64
- module Escape
65
- Basic = [:backslash, :literal]
66
-
67
- Backreference = [:digit]
68
-
69
- Control = [:control, :meta_sequence]
70
-
71
- ASCII = [:bell, :backspace, :escape, :form_feed, :newline, :carriage,
72
- :space, :tab, :vertical_tab]
73
-
74
- Meta = [:dot, :alternation, :zero_or_one, :zero_or_more, :one_or_more,
75
- :beginning_of_line, :end_of_line, :group_open, :group_close,
76
- :interval_open, :interval_close, :set_open, :set_close, :baclslash]
77
-
78
- All = Basic + Backreference + ASCII + Meta
79
- end
80
-
81
-
82
- # -------------------------------------------------------------------------
83
- module Group
84
- Basic = [:capture, :close]
85
- Extended = Basic + [:options]
86
-
87
- Named = [:named]
88
- Atomic = [:atomic]
89
- Passive = [:passive]
90
- Comment = [:comment]
91
-
92
- module Assertion
93
- Lookahead = [:lookahead, :nlookahead]
94
- Lookbehind = [:lookbehind, :nlookbehind]
95
-
96
- All = Lookahead + Lookbehind
97
- end
98
-
99
- module Backreference
100
- Name = [:name_ref]
101
- Number = [:number_ref, :number_rel_ref]
102
-
103
- NestLevel = [:name_nest_ref, :number_nest_ref]
104
-
105
- All = Name + Number + NestLevel
106
- end
107
-
108
- module SubexpressionCall
109
- Name = [:name_call]
110
- Number = [:number_call, :number_rel_call]
111
-
112
- All = Name + Number
113
- end
114
-
115
- All = Group::Extended + Group::Named + Group::Atomic +
116
- Group::Passive + Group::Comment
6
+ module Literal
7
+ All = [:literal]
8
+ Type = :literal
117
9
  end
118
10
 
11
+ Map[Literal::Type] = Literal::All
12
+ end
13
+ end
119
14
 
120
- # -------------------------------------------------------------------------
121
- module Meta
122
- Basic = [:dot]
123
- Extended = Basic + [:alternation]
124
- end
125
15
 
16
+ # Load all the token files, they will populate the Map constant.
17
+ Dir[File.dirname(__FILE__) + '/tokens/*.rb'].each {|f| require f }
126
18
 
127
- # -------------------------------------------------------------------------
128
- module Quantifier
129
- Greedy = [:zero_or_one, :zero_or_more, :one_or_more]
130
- Reluctant = [:zero_or_one_reluctant, :zero_or_more_reluctant, :one_or_more_reluctant]
131
- Possessive = [:zero_or_one_possessive, :zero_or_more_possessive, :one_or_more_possessive]
132
19
 
133
- Interval = [:interval]
134
- IntervalReluctant = [:interval_reluctant]
135
- IntervalPossessive = [:interval_possessive]
20
+ # After loading all the tokens the map is full. Extract all tokens and types
21
+ # into the All and Types constants.
22
+ module Regexp::Syntax
23
+ module Token
24
+ if RUBY_VERSION >= '1.9'
25
+ All = Map.map {|k,v| v}.flatten.uniq.sort
26
+ else
27
+ All = Map.map {|k,v| v}.flatten.uniq
136
28
  end
137
29
 
30
+ Types = Map.keys
138
31
 
139
- # -------------------------------------------------------------------------
140
- module UnicodeProperty
141
- Type = [:alnum, :alpha, :ascii, :blank, :cntrl, :digit, :graph, :lower,
142
- :print, :punct, :space, :upper, :word, :xdigit]
143
-
144
- POSIX = [:any, :assigned, :newline]
145
-
146
- module Category
147
- Letter = [:letter_any, :letter_uppercase, :letter_lowercase,
148
- :letter_titlecase, :letter_modifier, :letter_other]
149
-
150
- Mark = [:mark_any, :mark_nonspacing, :mark_spacing,
151
- :mark_enclosing]
152
-
153
- Number = [:number_any, :number_decimal, :number_letter,
154
- :number_other]
155
-
156
- Punctuation = [:punct_any, :punct_connector, :punct_dash,
157
- :punct_open, :punct_close, :punct_initial,
158
- :punct_final, :punct_other]
159
-
160
- Symbol = [:symbol_any, :symbol_math, :symbol_currency,
161
- :symbol_modifier, :symbol_other]
162
-
163
- Separator = [:separator_any, :separator_space, :separator_line,
164
- :separator_para]
165
-
166
- Codepoint = [:other, :control, :format,
167
- :surrogate, :private_use, :unassigned]
168
-
169
- All = Letter + Mark + Number + Punctuation +
170
- Symbol + Separator + Codepoint
171
- end
172
-
173
- Age = [:age_1_1, :age_2_0, :age_2_1, :age_3_0, :age_3_1,
174
- :age_3_2, :age_4_0, :age_4_1, :age_5_0, :age_5_1,
175
- :age_5_2, :age_6_0]
176
-
177
- Derived = [
178
- :ascii_hex,
179
- :alphabetic,
180
- :cased,
181
- :changes_when_casefolded,
182
- :changes_when_casemapped,
183
- :changes_when_lowercased,
184
- :changes_when_titlecased,
185
- :changes_when_uppercased,
186
- :case_ignorable,
187
- :bidi_control,
188
- :dash,
189
- :deprecated,
190
- :default_ignorable_cp,
191
- :diacritic,
192
- :extender,
193
- :grapheme_base,
194
- :grapheme_extend,
195
- :grapheme_link,
196
- :hex_digit,
197
- :hyphen,
198
- :id_continue,
199
- :ideographic,
200
- :id_start,
201
- :ids_binary_op,
202
- :ids_trinary_op,
203
- :join_control,
204
- :logical_order_exception,
205
- :lowercase,
206
- :math,
207
- :non_character_cp,
208
- :other_alphabetic,
209
- :other_default_ignorable_cp,
210
- :other_grapheme_extended,
211
- :other_id_continue,
212
- :other_id_start,
213
- :other_lowercase,
214
- :other_math,
215
- :other_uppercase,
216
- :pattern_syntax,
217
- :pattern_whitespace,
218
- :quotation_mark,
219
- :radical,
220
- :soft_dotted,
221
- :sentence_terminal,
222
- :terminal_punctuation,
223
- :unified_ideograph,
224
- :uppercase,
225
- :variation_selector,
226
- :whitespace,
227
- :xid_start,
228
- :xid_continue,
229
- ]
230
-
231
- Script =[
232
- :script_arabic,
233
- :script_imperial_aramaic,
234
- :script_armenian,
235
- :script_avestan,
236
- :script_balinese,
237
- :script_bamum,
238
- :script_bengali,
239
- :script_bopomofo,
240
- :script_braille,
241
- :script_buginese,
242
- :script_buhid,
243
- :script_canadian_aboriginal,
244
- :script_carian,
245
- :script_cham,
246
- :script_cherokee,
247
- :script_coptic,
248
- :script_cypriot,
249
- :script_cyrillic,
250
- :script_devanagari,
251
- :script_deseret,
252
- :script_egyptian_hieroglyphs,
253
- :script_ethiopic,
254
- :script_georgian,
255
- :script_glagolitic,
256
- :script_gothic,
257
- :script_greek,
258
- :script_gujarati,
259
- :script_gurmukhi,
260
- :script_hangul,
261
- :script_han,
262
- :script_hanunoo,
263
- :script_hebrew,
264
- :script_hiragana,
265
- :script_katakana_or_hiragana,
266
- :script_old_italic,
267
- :script_javanese,
268
- :script_kayah_li,
269
- :script_katakana,
270
- :script_kharoshthi,
271
- :script_khmer,
272
- :script_kannada,
273
- :script_kaithi,
274
- :script_tai_tham,
275
- :script_lao,
276
- :script_latin,
277
- :script_lepcha,
278
- :script_limbu,
279
- :script_linear_b,
280
- :script_lisu,
281
- :script_lycian,
282
- :script_lydian,
283
- :script_malayalam,
284
- :script_mongolian,
285
- :script_meetei_mayek,
286
- :script_myanmar,
287
- :script_nko,
288
- :script_ogham,
289
- :script_ol_chiki,
290
- :script_old_turkic,
291
- :script_oriya,
292
- :script_osmanya,
293
- :script_phags_pa,
294
- :script_inscriptional_pahlavi,
295
- :script_phoenician,
296
- :script_inscriptional_parthian,
297
- :script_rejang,
298
- :script_runic,
299
- :script_samaritan,
300
- :script_old_south_arabian,
301
- :script_saurashtra,
302
- :script_shavian,
303
- :script_sinhala,
304
- :script_sundanese,
305
- :script_syloti_nagri,
306
- :script_syriac,
307
- :script_tagbanwa,
308
- :script_tai_le,
309
- :script_new_tai_lue,
310
- :script_tamil,
311
- :script_tai_viet,
312
- :script_telugu,
313
- :script_tifinagh,
314
- :script_tagalog,
315
- :script_thaana,
316
- :script_thai,
317
- :script_tibetan,
318
- :script_ugaritic,
319
- :script_vai,
320
- :script_old_persian,
321
- :script_cuneiform,
322
- :script_yi,
323
- :script_inherited,
324
- :script_common,
325
- :script_unknown
326
- ]
327
-
328
- Script_6_0 = [:script_brahmi, :script_batak, :script_mandaic]
329
-
330
- All = Type + POSIX + Category::All + Age + Derived + Script
331
- end
32
+ All.freeze
33
+ Types.freeze
332
34
  end
333
-
334
35
  end