regexp_parser 0.1.1 → 0.1.5
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/ChangeLog +45 -0
- data/Rakefile +12 -44
- data/VERSION.yml +5 -0
- data/lib/regexp_parser.rb +5 -38
- data/lib/regexp_parser/expression.rb +68 -221
- data/lib/regexp_parser/expression/classes/alternation.rb +47 -0
- data/lib/regexp_parser/expression/classes/anchor.rb +26 -0
- data/lib/regexp_parser/expression/classes/backref.rb +42 -0
- data/lib/regexp_parser/expression/classes/escape.rb +27 -0
- data/lib/regexp_parser/expression/classes/group.rb +67 -0
- data/lib/regexp_parser/expression/classes/literal.rb +7 -0
- data/lib/regexp_parser/expression/{property.rb → classes/property.rb} +1 -1
- data/lib/regexp_parser/expression/classes/root.rb +26 -0
- data/lib/regexp_parser/expression/classes/set.rb +100 -0
- data/lib/regexp_parser/expression/classes/type.rb +17 -0
- data/lib/regexp_parser/expression/quantifier.rb +26 -0
- data/lib/regexp_parser/expression/subexpression.rb +69 -0
- data/lib/regexp_parser/lexer.rb +4 -4
- data/lib/regexp_parser/parser.rb +31 -13
- data/lib/regexp_parser/scanner.rb +1849 -1488
- data/lib/regexp_parser/scanner/property.rl +7 -2
- data/lib/regexp_parser/scanner/scanner.rl +377 -191
- data/lib/regexp_parser/syntax.rb +7 -0
- data/lib/regexp_parser/syntax/ruby/1.8.6.rb +4 -4
- data/lib/regexp_parser/syntax/ruby/1.9.1.rb +9 -9
- data/lib/regexp_parser/syntax/ruby/2.0.0.rb +16 -0
- data/lib/regexp_parser/syntax/ruby/2.1.0.rb +13 -0
- data/lib/regexp_parser/syntax/tokens.rb +21 -320
- data/lib/regexp_parser/syntax/tokens/anchor.rb +17 -0
- data/lib/regexp_parser/syntax/tokens/assertion.rb +15 -0
- data/lib/regexp_parser/syntax/tokens/backref.rb +26 -0
- data/lib/regexp_parser/syntax/tokens/character_set.rb +48 -0
- data/lib/regexp_parser/syntax/tokens/character_type.rb +16 -0
- data/lib/regexp_parser/syntax/tokens/escape.rb +29 -0
- data/lib/regexp_parser/syntax/tokens/group.rb +22 -0
- data/lib/regexp_parser/syntax/tokens/meta.rb +15 -0
- data/lib/regexp_parser/syntax/tokens/quantifier.rb +37 -0
- data/lib/regexp_parser/syntax/tokens/unicode_property.rb +204 -0
- data/lib/regexp_parser/token.rb +37 -0
- data/test/expression/test_all.rb +7 -0
- data/test/expression/test_base.rb +72 -0
- data/test/expression/test_clone.rb +144 -0
- data/test/{parser/test_expression.rb → expression/test_to_s.rb} +10 -10
- data/test/helpers.rb +1 -0
- data/test/parser/test_all.rb +1 -1
- data/test/parser/test_alternation.rb +35 -0
- data/test/parser/test_anchors.rb +2 -2
- data/test/parser/test_refcalls.rb +1 -1
- data/test/parser/test_sets.rb +54 -8
- data/test/scanner/test_anchors.rb +2 -2
- data/test/scanner/test_conditionals.rb +31 -0
- data/test/scanner/test_errors.rb +88 -8
- data/test/scanner/test_escapes.rb +4 -4
- data/test/scanner/test_groups.rb +7 -0
- data/test/scanner/test_quoting.rb +29 -0
- data/test/scanner/test_sets.rb +1 -0
- data/test/syntax/ruby/test_1.8.rb +3 -3
- data/test/test_all.rb +1 -1
- metadata +62 -48
- data/lib/regexp_parser/expression/set.rb +0 -59
data/lib/regexp_parser/syntax.rb
CHANGED
@@ -50,6 +50,13 @@ module Regexp::Syntax
|
|
50
50
|
# alias for the latest 1.9 implementation
|
51
51
|
when 'ruby/1.9'; syntax = Regexp::Syntax::Ruby::V19.new
|
52
52
|
|
53
|
+
when 'ruby/2.0.0'; syntax = Regexp::Syntax::Ruby::V20.new
|
54
|
+
when 'ruby/2.1.0'; syntax = Regexp::Syntax::Ruby::V21.new
|
55
|
+
|
56
|
+
# aliases for the latest 2.x implementations
|
57
|
+
when 'ruby/2.0'; syntax = Regexp::Syntax::Ruby::V20.new
|
58
|
+
when 'ruby/2.1'; syntax = Regexp::Syntax::Ruby::V21.new
|
59
|
+
|
53
60
|
else
|
54
61
|
raise UnknownSyntaxError.new(name)
|
55
62
|
end
|
@@ -8,10 +8,10 @@ module Regexp::Syntax
|
|
8
8
|
super
|
9
9
|
|
10
10
|
implements :anchor, Anchor::All
|
11
|
-
implements :assertion,
|
11
|
+
implements :assertion, Assertion::Lookahead
|
12
12
|
implements :backref, [:number]
|
13
13
|
|
14
|
-
implements :escape,
|
14
|
+
implements :escape,
|
15
15
|
Escape::Basic + Escape::Backreference +
|
16
16
|
Escape::ASCII + Escape::Meta + Escape::Control
|
17
17
|
|
@@ -19,13 +19,13 @@ module Regexp::Syntax
|
|
19
19
|
|
20
20
|
implements :meta, Meta::Extended
|
21
21
|
|
22
|
-
implements :quantifier,
|
22
|
+
implements :quantifier,
|
23
23
|
Quantifier::Greedy + Quantifier::Reluctant +
|
24
24
|
Quantifier::Interval + Quantifier::IntervalReluctant
|
25
25
|
|
26
26
|
implements :set, CharacterSet::OpenClose +
|
27
27
|
CharacterSet::Extended + CharacterSet::Types +
|
28
|
-
CharacterSet::POSIX::Standard
|
28
|
+
CharacterSet::POSIX::Standard
|
29
29
|
|
30
30
|
implements :type,
|
31
31
|
CharacterType::Extended
|
@@ -9,24 +9,24 @@ module Regexp::Syntax
|
|
9
9
|
def initialize
|
10
10
|
super
|
11
11
|
|
12
|
-
implements :assertion,
|
13
|
-
|
12
|
+
implements :assertion, Assertion::Lookbehind +
|
13
|
+
SubexpressionCall::All
|
14
14
|
|
15
|
-
implements :backref,
|
16
|
-
|
15
|
+
implements :backref, Backreference::All +
|
16
|
+
SubexpressionCall::All
|
17
17
|
|
18
18
|
implements :escape, CharacterType::Hex
|
19
19
|
|
20
|
-
implements :property,
|
20
|
+
implements :property,
|
21
21
|
UnicodeProperty::All
|
22
22
|
|
23
|
-
implements :nonproperty,
|
23
|
+
implements :nonproperty,
|
24
24
|
UnicodeProperty::All
|
25
25
|
|
26
|
-
implements :quantifier,
|
26
|
+
implements :quantifier,
|
27
27
|
Quantifier::Possessive + Quantifier::IntervalPossessive
|
28
28
|
|
29
|
-
implements :set,
|
29
|
+
implements :set,
|
30
30
|
CharacterSet::POSIX::StandardNegative +
|
31
31
|
CharacterSet::POSIX::Extensions +
|
32
32
|
CharacterSet::POSIX::ExtensionsNegative +
|
@@ -34,7 +34,7 @@ module Regexp::Syntax
|
|
34
34
|
|
35
35
|
implements :subset, CharacterSet::OpenClose +
|
36
36
|
CharacterSet::Extended + CharacterSet::Types +
|
37
|
-
CharacterSet::POSIX::Standard
|
37
|
+
CharacterSet::POSIX::Standard
|
38
38
|
end
|
39
39
|
|
40
40
|
end
|
@@ -0,0 +1,16 @@
|
|
1
|
+
require File.expand_path('../1.9.3', __FILE__)
|
2
|
+
|
3
|
+
module Regexp::Syntax
|
4
|
+
module Ruby
|
5
|
+
|
6
|
+
# use the last 1.9 release as the base
|
7
|
+
class V20 < Regexp::Syntax::Ruby::V193
|
8
|
+
def initialize
|
9
|
+
super
|
10
|
+
|
11
|
+
#implements :escape, CharacterType::Hex
|
12
|
+
end
|
13
|
+
end
|
14
|
+
|
15
|
+
end
|
16
|
+
end
|
@@ -1,334 +1,35 @@
|
|
1
|
+
# Define the base module and the simplest of tokens.
|
1
2
|
module Regexp::Syntax
|
2
|
-
|
3
3
|
module Token
|
4
|
+
Map = {}
|
4
5
|
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
Extended = Basic + [:word_boundary, :nonword_boundary]
|
9
|
-
String = [:bos, :eos, :eos_ob_eol]
|
10
|
-
MatchStart = [:match_start]
|
11
|
-
|
12
|
-
All = Extended + String + MatchStart
|
13
|
-
end
|
14
|
-
|
15
|
-
|
16
|
-
# -------------------------------------------------------------------------
|
17
|
-
module CharacterSet
|
18
|
-
OpenClose = [:open, :close]
|
19
|
-
|
20
|
-
Basic = [:negate, :member, :range]
|
21
|
-
Extended = Basic + [:escape, :intersection, :range_hex, :backspace]
|
22
|
-
|
23
|
-
Types = [:type_digit, :type_nondigit, :type_hex, :type_nonhex,
|
24
|
-
:type_space, :type_nonspace, :type_word, :type_nonword]
|
25
|
-
|
26
|
-
module POSIX
|
27
|
-
Standard = [:class_alnum, :class_alpha, :class_blank, :class_cntrl,
|
28
|
-
:class_digit, :class_graph, :class_lower, :class_print,
|
29
|
-
:class_punct, :class_space, :class_upper, :class_xdigit]
|
30
|
-
|
31
|
-
StandardNegative = [
|
32
|
-
:class_nonalnum, :class_nonalpha, :class_nonblank,
|
33
|
-
:class_noncntrl, :class_nondigit, :class_nongraph,
|
34
|
-
:class_nonlower, :class_nonprint, :class_nonpunct,
|
35
|
-
:class_nonspace, :class_nonupper, :class_nonxdigit]
|
36
|
-
|
37
|
-
Extensions = [:class_ascii, :class_word]
|
38
|
-
ExtensionsNegative = [:class_nonascii, :class_nonword]
|
39
|
-
|
40
|
-
All = Standard + StandardNegative +
|
41
|
-
Extensions + ExtensionsNegative
|
42
|
-
end
|
43
|
-
|
44
|
-
All = Basic + Extended + Types + POSIX::All
|
45
|
-
|
46
|
-
module SubSet
|
47
|
-
OpenClose = [:open, :close]
|
48
|
-
All = CharacterSet::All
|
49
|
-
end
|
50
|
-
end
|
51
|
-
|
52
|
-
|
53
|
-
# -------------------------------------------------------------------------
|
54
|
-
module CharacterType
|
55
|
-
Basic = []
|
56
|
-
Extended = [:digit, :nondigit, :space, :nonspace, :word, :nonword]
|
57
|
-
Hex = [:hex, :nonhex]
|
58
|
-
|
59
|
-
All = Basic + Extended + Hex
|
60
|
-
end
|
61
|
-
|
62
|
-
|
63
|
-
# -------------------------------------------------------------------------
|
64
|
-
module Escape
|
65
|
-
Basic = [:backslash, :literal]
|
66
|
-
|
67
|
-
Backreference = [:digit]
|
68
|
-
|
69
|
-
Control = [:control, :meta_sequence]
|
70
|
-
|
71
|
-
ASCII = [:bell, :backspace, :escape, :form_feed, :newline, :carriage,
|
72
|
-
:space, :tab, :vertical_tab]
|
73
|
-
|
74
|
-
Meta = [:dot, :alternation, :zero_or_one, :zero_or_more, :one_or_more,
|
75
|
-
:beginning_of_line, :end_of_line, :group_open, :group_close,
|
76
|
-
:interval_open, :interval_close, :set_open, :set_close, :baclslash]
|
77
|
-
|
78
|
-
All = Basic + Backreference + ASCII + Meta
|
79
|
-
end
|
80
|
-
|
81
|
-
|
82
|
-
# -------------------------------------------------------------------------
|
83
|
-
module Group
|
84
|
-
Basic = [:capture, :close]
|
85
|
-
Extended = Basic + [:options]
|
86
|
-
|
87
|
-
Named = [:named]
|
88
|
-
Atomic = [:atomic]
|
89
|
-
Passive = [:passive]
|
90
|
-
Comment = [:comment]
|
91
|
-
|
92
|
-
module Assertion
|
93
|
-
Lookahead = [:lookahead, :nlookahead]
|
94
|
-
Lookbehind = [:lookbehind, :nlookbehind]
|
95
|
-
|
96
|
-
All = Lookahead + Lookbehind
|
97
|
-
end
|
98
|
-
|
99
|
-
module Backreference
|
100
|
-
Name = [:name_ref]
|
101
|
-
Number = [:number_ref, :number_rel_ref]
|
102
|
-
|
103
|
-
NestLevel = [:name_nest_ref, :number_nest_ref]
|
104
|
-
|
105
|
-
All = Name + Number + NestLevel
|
106
|
-
end
|
107
|
-
|
108
|
-
module SubexpressionCall
|
109
|
-
Name = [:name_call]
|
110
|
-
Number = [:number_call, :number_rel_call]
|
111
|
-
|
112
|
-
All = Name + Number
|
113
|
-
end
|
114
|
-
|
115
|
-
All = Group::Extended + Group::Named + Group::Atomic +
|
116
|
-
Group::Passive + Group::Comment
|
6
|
+
module Literal
|
7
|
+
All = [:literal]
|
8
|
+
Type = :literal
|
117
9
|
end
|
118
10
|
|
11
|
+
Map[Literal::Type] = Literal::All
|
12
|
+
end
|
13
|
+
end
|
119
14
|
|
120
|
-
# -------------------------------------------------------------------------
|
121
|
-
module Meta
|
122
|
-
Basic = [:dot]
|
123
|
-
Extended = Basic + [:alternation]
|
124
|
-
end
|
125
15
|
|
16
|
+
# Load all the token files, they will populate the Map constant.
|
17
|
+
Dir[File.dirname(__FILE__) + '/tokens/*.rb'].each {|f| require f }
|
126
18
|
|
127
|
-
# -------------------------------------------------------------------------
|
128
|
-
module Quantifier
|
129
|
-
Greedy = [:zero_or_one, :zero_or_more, :one_or_more]
|
130
|
-
Reluctant = [:zero_or_one_reluctant, :zero_or_more_reluctant, :one_or_more_reluctant]
|
131
|
-
Possessive = [:zero_or_one_possessive, :zero_or_more_possessive, :one_or_more_possessive]
|
132
19
|
|
133
|
-
|
134
|
-
|
135
|
-
|
20
|
+
# After loading all the tokens the map is full. Extract all tokens and types
|
21
|
+
# into the All and Types constants.
|
22
|
+
module Regexp::Syntax
|
23
|
+
module Token
|
24
|
+
if RUBY_VERSION >= '1.9'
|
25
|
+
All = Map.map {|k,v| v}.flatten.uniq.sort
|
26
|
+
else
|
27
|
+
All = Map.map {|k,v| v}.flatten.uniq
|
136
28
|
end
|
137
29
|
|
30
|
+
Types = Map.keys
|
138
31
|
|
139
|
-
|
140
|
-
|
141
|
-
Type = [:alnum, :alpha, :ascii, :blank, :cntrl, :digit, :graph, :lower,
|
142
|
-
:print, :punct, :space, :upper, :word, :xdigit]
|
143
|
-
|
144
|
-
POSIX = [:any, :assigned, :newline]
|
145
|
-
|
146
|
-
module Category
|
147
|
-
Letter = [:letter_any, :letter_uppercase, :letter_lowercase,
|
148
|
-
:letter_titlecase, :letter_modifier, :letter_other]
|
149
|
-
|
150
|
-
Mark = [:mark_any, :mark_nonspacing, :mark_spacing,
|
151
|
-
:mark_enclosing]
|
152
|
-
|
153
|
-
Number = [:number_any, :number_decimal, :number_letter,
|
154
|
-
:number_other]
|
155
|
-
|
156
|
-
Punctuation = [:punct_any, :punct_connector, :punct_dash,
|
157
|
-
:punct_open, :punct_close, :punct_initial,
|
158
|
-
:punct_final, :punct_other]
|
159
|
-
|
160
|
-
Symbol = [:symbol_any, :symbol_math, :symbol_currency,
|
161
|
-
:symbol_modifier, :symbol_other]
|
162
|
-
|
163
|
-
Separator = [:separator_any, :separator_space, :separator_line,
|
164
|
-
:separator_para]
|
165
|
-
|
166
|
-
Codepoint = [:other, :control, :format,
|
167
|
-
:surrogate, :private_use, :unassigned]
|
168
|
-
|
169
|
-
All = Letter + Mark + Number + Punctuation +
|
170
|
-
Symbol + Separator + Codepoint
|
171
|
-
end
|
172
|
-
|
173
|
-
Age = [:age_1_1, :age_2_0, :age_2_1, :age_3_0, :age_3_1,
|
174
|
-
:age_3_2, :age_4_0, :age_4_1, :age_5_0, :age_5_1,
|
175
|
-
:age_5_2, :age_6_0]
|
176
|
-
|
177
|
-
Derived = [
|
178
|
-
:ascii_hex,
|
179
|
-
:alphabetic,
|
180
|
-
:cased,
|
181
|
-
:changes_when_casefolded,
|
182
|
-
:changes_when_casemapped,
|
183
|
-
:changes_when_lowercased,
|
184
|
-
:changes_when_titlecased,
|
185
|
-
:changes_when_uppercased,
|
186
|
-
:case_ignorable,
|
187
|
-
:bidi_control,
|
188
|
-
:dash,
|
189
|
-
:deprecated,
|
190
|
-
:default_ignorable_cp,
|
191
|
-
:diacritic,
|
192
|
-
:extender,
|
193
|
-
:grapheme_base,
|
194
|
-
:grapheme_extend,
|
195
|
-
:grapheme_link,
|
196
|
-
:hex_digit,
|
197
|
-
:hyphen,
|
198
|
-
:id_continue,
|
199
|
-
:ideographic,
|
200
|
-
:id_start,
|
201
|
-
:ids_binary_op,
|
202
|
-
:ids_trinary_op,
|
203
|
-
:join_control,
|
204
|
-
:logical_order_exception,
|
205
|
-
:lowercase,
|
206
|
-
:math,
|
207
|
-
:non_character_cp,
|
208
|
-
:other_alphabetic,
|
209
|
-
:other_default_ignorable_cp,
|
210
|
-
:other_grapheme_extended,
|
211
|
-
:other_id_continue,
|
212
|
-
:other_id_start,
|
213
|
-
:other_lowercase,
|
214
|
-
:other_math,
|
215
|
-
:other_uppercase,
|
216
|
-
:pattern_syntax,
|
217
|
-
:pattern_whitespace,
|
218
|
-
:quotation_mark,
|
219
|
-
:radical,
|
220
|
-
:soft_dotted,
|
221
|
-
:sentence_terminal,
|
222
|
-
:terminal_punctuation,
|
223
|
-
:unified_ideograph,
|
224
|
-
:uppercase,
|
225
|
-
:variation_selector,
|
226
|
-
:whitespace,
|
227
|
-
:xid_start,
|
228
|
-
:xid_continue,
|
229
|
-
]
|
230
|
-
|
231
|
-
Script =[
|
232
|
-
:script_arabic,
|
233
|
-
:script_imperial_aramaic,
|
234
|
-
:script_armenian,
|
235
|
-
:script_avestan,
|
236
|
-
:script_balinese,
|
237
|
-
:script_bamum,
|
238
|
-
:script_bengali,
|
239
|
-
:script_bopomofo,
|
240
|
-
:script_braille,
|
241
|
-
:script_buginese,
|
242
|
-
:script_buhid,
|
243
|
-
:script_canadian_aboriginal,
|
244
|
-
:script_carian,
|
245
|
-
:script_cham,
|
246
|
-
:script_cherokee,
|
247
|
-
:script_coptic,
|
248
|
-
:script_cypriot,
|
249
|
-
:script_cyrillic,
|
250
|
-
:script_devanagari,
|
251
|
-
:script_deseret,
|
252
|
-
:script_egyptian_hieroglyphs,
|
253
|
-
:script_ethiopic,
|
254
|
-
:script_georgian,
|
255
|
-
:script_glagolitic,
|
256
|
-
:script_gothic,
|
257
|
-
:script_greek,
|
258
|
-
:script_gujarati,
|
259
|
-
:script_gurmukhi,
|
260
|
-
:script_hangul,
|
261
|
-
:script_han,
|
262
|
-
:script_hanunoo,
|
263
|
-
:script_hebrew,
|
264
|
-
:script_hiragana,
|
265
|
-
:script_katakana_or_hiragana,
|
266
|
-
:script_old_italic,
|
267
|
-
:script_javanese,
|
268
|
-
:script_kayah_li,
|
269
|
-
:script_katakana,
|
270
|
-
:script_kharoshthi,
|
271
|
-
:script_khmer,
|
272
|
-
:script_kannada,
|
273
|
-
:script_kaithi,
|
274
|
-
:script_tai_tham,
|
275
|
-
:script_lao,
|
276
|
-
:script_latin,
|
277
|
-
:script_lepcha,
|
278
|
-
:script_limbu,
|
279
|
-
:script_linear_b,
|
280
|
-
:script_lisu,
|
281
|
-
:script_lycian,
|
282
|
-
:script_lydian,
|
283
|
-
:script_malayalam,
|
284
|
-
:script_mongolian,
|
285
|
-
:script_meetei_mayek,
|
286
|
-
:script_myanmar,
|
287
|
-
:script_nko,
|
288
|
-
:script_ogham,
|
289
|
-
:script_ol_chiki,
|
290
|
-
:script_old_turkic,
|
291
|
-
:script_oriya,
|
292
|
-
:script_osmanya,
|
293
|
-
:script_phags_pa,
|
294
|
-
:script_inscriptional_pahlavi,
|
295
|
-
:script_phoenician,
|
296
|
-
:script_inscriptional_parthian,
|
297
|
-
:script_rejang,
|
298
|
-
:script_runic,
|
299
|
-
:script_samaritan,
|
300
|
-
:script_old_south_arabian,
|
301
|
-
:script_saurashtra,
|
302
|
-
:script_shavian,
|
303
|
-
:script_sinhala,
|
304
|
-
:script_sundanese,
|
305
|
-
:script_syloti_nagri,
|
306
|
-
:script_syriac,
|
307
|
-
:script_tagbanwa,
|
308
|
-
:script_tai_le,
|
309
|
-
:script_new_tai_lue,
|
310
|
-
:script_tamil,
|
311
|
-
:script_tai_viet,
|
312
|
-
:script_telugu,
|
313
|
-
:script_tifinagh,
|
314
|
-
:script_tagalog,
|
315
|
-
:script_thaana,
|
316
|
-
:script_thai,
|
317
|
-
:script_tibetan,
|
318
|
-
:script_ugaritic,
|
319
|
-
:script_vai,
|
320
|
-
:script_old_persian,
|
321
|
-
:script_cuneiform,
|
322
|
-
:script_yi,
|
323
|
-
:script_inherited,
|
324
|
-
:script_common,
|
325
|
-
:script_unknown
|
326
|
-
]
|
327
|
-
|
328
|
-
Script_6_0 = [:script_brahmi, :script_batak, :script_mandaic]
|
329
|
-
|
330
|
-
All = Type + POSIX + Category::All + Age + Derived + Script
|
331
|
-
end
|
32
|
+
All.freeze
|
33
|
+
Types.freeze
|
332
34
|
end
|
333
|
-
|
334
35
|
end
|