regexp_parser 0.1.1 → 0.1.5
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/ChangeLog +45 -0
- data/Rakefile +12 -44
- data/VERSION.yml +5 -0
- data/lib/regexp_parser.rb +5 -38
- data/lib/regexp_parser/expression.rb +68 -221
- data/lib/regexp_parser/expression/classes/alternation.rb +47 -0
- data/lib/regexp_parser/expression/classes/anchor.rb +26 -0
- data/lib/regexp_parser/expression/classes/backref.rb +42 -0
- data/lib/regexp_parser/expression/classes/escape.rb +27 -0
- data/lib/regexp_parser/expression/classes/group.rb +67 -0
- data/lib/regexp_parser/expression/classes/literal.rb +7 -0
- data/lib/regexp_parser/expression/{property.rb → classes/property.rb} +1 -1
- data/lib/regexp_parser/expression/classes/root.rb +26 -0
- data/lib/regexp_parser/expression/classes/set.rb +100 -0
- data/lib/regexp_parser/expression/classes/type.rb +17 -0
- data/lib/regexp_parser/expression/quantifier.rb +26 -0
- data/lib/regexp_parser/expression/subexpression.rb +69 -0
- data/lib/regexp_parser/lexer.rb +4 -4
- data/lib/regexp_parser/parser.rb +31 -13
- data/lib/regexp_parser/scanner.rb +1849 -1488
- data/lib/regexp_parser/scanner/property.rl +7 -2
- data/lib/regexp_parser/scanner/scanner.rl +377 -191
- data/lib/regexp_parser/syntax.rb +7 -0
- data/lib/regexp_parser/syntax/ruby/1.8.6.rb +4 -4
- data/lib/regexp_parser/syntax/ruby/1.9.1.rb +9 -9
- data/lib/regexp_parser/syntax/ruby/2.0.0.rb +16 -0
- data/lib/regexp_parser/syntax/ruby/2.1.0.rb +13 -0
- data/lib/regexp_parser/syntax/tokens.rb +21 -320
- data/lib/regexp_parser/syntax/tokens/anchor.rb +17 -0
- data/lib/regexp_parser/syntax/tokens/assertion.rb +15 -0
- data/lib/regexp_parser/syntax/tokens/backref.rb +26 -0
- data/lib/regexp_parser/syntax/tokens/character_set.rb +48 -0
- data/lib/regexp_parser/syntax/tokens/character_type.rb +16 -0
- data/lib/regexp_parser/syntax/tokens/escape.rb +29 -0
- data/lib/regexp_parser/syntax/tokens/group.rb +22 -0
- data/lib/regexp_parser/syntax/tokens/meta.rb +15 -0
- data/lib/regexp_parser/syntax/tokens/quantifier.rb +37 -0
- data/lib/regexp_parser/syntax/tokens/unicode_property.rb +204 -0
- data/lib/regexp_parser/token.rb +37 -0
- data/test/expression/test_all.rb +7 -0
- data/test/expression/test_base.rb +72 -0
- data/test/expression/test_clone.rb +144 -0
- data/test/{parser/test_expression.rb → expression/test_to_s.rb} +10 -10
- data/test/helpers.rb +1 -0
- data/test/parser/test_all.rb +1 -1
- data/test/parser/test_alternation.rb +35 -0
- data/test/parser/test_anchors.rb +2 -2
- data/test/parser/test_refcalls.rb +1 -1
- data/test/parser/test_sets.rb +54 -8
- data/test/scanner/test_anchors.rb +2 -2
- data/test/scanner/test_conditionals.rb +31 -0
- data/test/scanner/test_errors.rb +88 -8
- data/test/scanner/test_escapes.rb +4 -4
- data/test/scanner/test_groups.rb +7 -0
- data/test/scanner/test_quoting.rb +29 -0
- data/test/scanner/test_sets.rb +1 -0
- data/test/syntax/ruby/test_1.8.rb +3 -3
- data/test/test_all.rb +1 -1
- metadata +62 -48
- data/lib/regexp_parser/expression/set.rb +0 -59
@@ -0,0 +1,17 @@
|
|
1
|
+
module Regexp::Syntax
  module Token

    # Token names of type :anchor, grouped into named lists that are
    # concatenated into All and registered in Map (defined outside this file).
    module Anchor
      Type       = :anchor

      Basic      = [:bol, :eol]
      Extended   = Basic + [:word_boundary, :nonword_boundary]

      # Note: inside Anchor this constant shadows the core String class.
      String     = [:bos, :eos, :eos_ob_eol]
      MatchStart = [:match_start]

      # Extended already contains Basic, so Basic is not added separately.
      All        = Extended + String + MatchStart
    end

    Map[Anchor::Type] = Anchor::All

  end
end
|
@@ -0,0 +1,15 @@
|
|
1
|
+
module Regexp::Syntax
  module Token

    # Token names of type :assertion: the lookahead and lookbehind lists,
    # concatenated into All and registered in Map (defined outside this file).
    module Assertion
      Type       = :assertion

      Lookahead  = [:lookahead,  :nlookahead]
      Lookbehind = [:lookbehind, :nlookbehind]

      All        = Lookahead + Lookbehind
    end

    Map[Assertion::Type] = Assertion::All

  end
end
|
@@ -0,0 +1,26 @@
|
|
1
|
+
module Regexp::Syntax
  module Token

    # Token names of type :backref, grouped into Name, Number and NestLevel
    # lists.
    module Backreference
      Type      = :backref

      Name      = [:name_ref]
      Number    = [:number_ref, :number_rel_ref]
      NestLevel = [:name_nest_ref, :number_nest_ref]

      All       = Name + Number + NestLevel
    end

    # Subexpression calls are registered under the same type as
    # Backreference, so they are kept in this file for now.
    module SubexpressionCall
      Name   = [:name_call]
      Number = [:number_call, :number_rel_call]

      All    = Name + Number
    end

    # A single Map entry (Map is defined outside this file) holds both the
    # backreference and the subexpression call token names.
    Map[Backreference::Type] = Backreference::All + SubexpressionCall::All

  end
end
|
@@ -0,0 +1,48 @@
|
|
1
|
+
module Regexp::Syntax
  module Token

    # Token names of type :set (and :subset), grouped into named lists.
    module CharacterSet
      Type      = :set

      # Defined here but not part of All.
      OpenClose = [:open, :close]

      Basic     = [:negate, :member, :range]
      Extended  = Basic + [:escape, :intersection, :range_hex, :backspace]

      Types     = [
        :type_digit, :type_nondigit,
        :type_hex,   :type_nonhex,
        :type_space, :type_nonspace,
        :type_word,  :type_nonword
      ]

      # Token names for the POSIX-style :class_* entries and their
      # :class_non* counterparts.
      module POSIX
        Standard = [
          :class_alnum, :class_alpha, :class_blank,
          :class_cntrl, :class_digit, :class_graph,
          :class_lower, :class_print, :class_punct,
          :class_space, :class_upper, :class_xdigit
        ]

        StandardNegative = [
          :class_nonalnum, :class_nonalpha, :class_nonblank,
          :class_noncntrl, :class_nondigit, :class_nongraph,
          :class_nonlower, :class_nonprint, :class_nonpunct,
          :class_nonspace, :class_nonupper, :class_nonxdigit
        ]

        Extensions         = [:class_ascii, :class_word]
        ExtensionsNegative = [:class_nonascii, :class_nonword]

        All = Standard + StandardNegative + Extensions + ExtensionsNegative
      end

      # NOTE(review): Extended already starts with Basic, so the entries of
      # Basic appear twice in this list. Kept as-is to preserve the contents.
      All = Basic + Extended + Types + POSIX::All

      # A subset reuses the very same token list as its enclosing set.
      module SubSet
        Type      = :subset
        OpenClose = [:open, :close]

        All       = CharacterSet::All
      end
    end

    # Both types are registered with the same list in Map (defined outside
    # this file).
    Map[CharacterSet::Type]         = CharacterSet::All
    Map[CharacterSet::SubSet::Type] = CharacterSet::All

  end
end
|
@@ -0,0 +1,16 @@
|
|
1
|
+
module Regexp::Syntax
  module Token

    # Token names of type :type, concatenated into All and registered in Map
    # (defined outside this file).
    module CharacterType
      Type     = :type

      # Intentionally left as an empty list in this version.
      Basic    = []
      Extended = [
        :digit, :nondigit,
        :space, :nonspace,
        :word,  :nonword
      ]
      Hex      = [:hex, :nonhex]

      All      = Basic + Extended + Hex
    end

    Map[CharacterType::Type] = CharacterType::All

  end
end
|
@@ -0,0 +1,29 @@
|
|
1
|
+
module Regexp::Syntax
  module Token

    # Token names of type :escape, grouped into named lists that are
    # combined into All and registered in Map (defined outside this file).
    module Escape
      Basic         = [:backslash, :literal]

      Backreference = [:digit]

      # NOTE(review): Control is defined here but is not part of All below;
      # this looks deliberate (callers can add it explicitly) - confirm.
      Control       = [:control, :meta_sequence]

      ASCII         = [:bell, :backspace, :escape, :form_feed, :newline,
                       :carriage, :space, :tab, :vertical_tab]

      # The last entry used to be misspelled as :baclslash, a token name that
      # appears nowhere else; it is spelled :backslash like the entry in Basic.
      Meta          = [:dot, :alternation,
                       :zero_or_one, :zero_or_more, :one_or_more,
                       :bol, :eol,
                       :group_open, :group_close,
                       :interval_open, :interval_close,
                       :set_open, :set_close,
                       :backslash]

      # :backslash is listed in both Basic and Meta, so duplicates are removed
      # to keep All free of repeated entries.
      All           = (Basic + Backreference + ASCII + Meta).uniq
      Type          = :escape
    end

    Map[Escape::Type] = Escape::All

  end
end
|
@@ -0,0 +1,22 @@
|
|
1
|
+
module Regexp::Syntax
  module Token

    # Token names of type :group, grouped into named lists that are
    # concatenated into All and registered in Map (defined outside this file).
    module Group
      Type     = :group

      Basic    = [:capture, :close]
      Extended = Basic + [:options]

      Named    = [:named]
      Atomic   = [:atomic]
      Passive  = [:passive]
      Comment  = [:comment]

      # Extended already contains Basic, so Basic is not added separately.
      All      = Extended + Named + Atomic + Passive + Comment
    end

    Map[Group::Type] = Group::All

  end
end
|
@@ -0,0 +1,37 @@
|
|
1
|
+
module Regexp::Syntax
  module Token

    # Token names of type :quantifier. Each of the three base quantifiers and
    # the interval has a plain, a *_reluctant and a *_possessive name.
    module Quantifier
      Type       = :quantifier

      Greedy     = [:zero_or_one, :zero_or_more, :one_or_more]

      Reluctant  = [:zero_or_one_reluctant,
                    :zero_or_more_reluctant,
                    :one_or_more_reluctant]

      Possessive = [:zero_or_one_possessive,
                    :zero_or_more_possessive,
                    :one_or_more_possessive]

      Interval           = [:interval]
      IntervalReluctant  = [:interval_reluctant]
      IntervalPossessive = [:interval_possessive]

      IntervalAll = Interval + IntervalReluctant + IntervalPossessive

      All = Greedy + Reluctant + Possessive + IntervalAll
    end

    # Map is defined outside this file.
    Map[Quantifier::Type] = Quantifier::All

  end
end
|
@@ -0,0 +1,204 @@
|
|
1
|
+
module Regexp::Syntax
  module Token

    # Token names of types :property and :nonproperty, grouped into named
    # lists that are concatenated into All and registered in Map (defined
    # outside this file) under both types.
    module UnicodeProperty
      Type    = :property
      NonType = :nonproperty

      CharType = [
        :alnum, :alpha, :ascii, :blank, :cntrl, :digit, :graph,
        :lower, :print, :punct, :space, :upper, :word,  :xdigit
      ]

      POSIX = [:any, :assigned, :newline]

      # Note: Number and Symbol below shadow the core classes of the same
      # name inside this module.
      module Category
        Letter = [
          :letter_any, :letter_uppercase, :letter_lowercase,
          :letter_titlecase, :letter_modifier, :letter_other
        ]

        Mark = [:mark_any, :mark_nonspacing, :mark_spacing, :mark_enclosing]

        Number = [:number_any, :number_decimal, :number_letter, :number_other]

        Punctuation = [
          :punct_any, :punct_connector, :punct_dash, :punct_open,
          :punct_close, :punct_initial, :punct_final, :punct_other
        ]

        Symbol = [
          :symbol_any, :symbol_math, :symbol_currency,
          :symbol_modifier, :symbol_other
        ]

        Separator = [
          :separator_any, :separator_space, :separator_line, :separator_para
        ]

        Codepoint = [
          :other, :control, :format, :surrogate, :private_use, :unassigned
        ]

        All = Letter + Mark + Number + Punctuation +
              Symbol + Separator + Codepoint
      end

      Age = [
        :age_1_1, :age_2_0, :age_2_1, :age_3_0,
        :age_3_1, :age_3_2, :age_4_0, :age_4_1,
        :age_5_0, :age_5_1, :age_5_2, :age_6_0
      ]

      Derived = [
        :ascii_hex, :alphabetic, :cased,
        :changes_when_casefolded, :changes_when_casemapped,
        :changes_when_lowercased, :changes_when_titlecased,
        :changes_when_uppercased,
        :case_ignorable, :bidi_control, :dash,
        :deprecated, :default_ignorable_cp, :diacritic,
        :extender, :grapheme_base, :grapheme_extend,
        :grapheme_link, :hex_digit, :hyphen,
        :id_continue, :ideographic, :id_start,
        :ids_binary_op, :ids_trinary_op, :join_control,
        :logical_order_exception, :lowercase, :math,
        :non_character_cp, :other_alphabetic, :other_default_ignorable_cp,
        :other_grapheme_extended, :other_id_continue, :other_id_start,
        :other_lowercase, :other_math, :other_uppercase,
        :pattern_syntax, :pattern_whitespace, :quotation_mark,
        :radical, :soft_dotted, :sentence_terminal,
        :terminal_punctuation, :unified_ideograph, :uppercase,
        :variation_selector, :whitespace, :xid_start,
        :xid_continue
      ]

      Script = [
        :script_arabic, :script_imperial_aramaic, :script_armenian,
        :script_avestan, :script_balinese, :script_bamum,
        :script_bengali, :script_bopomofo, :script_braille,
        :script_buginese, :script_buhid, :script_canadian_aboriginal,
        :script_carian, :script_cham, :script_cherokee,
        :script_coptic, :script_cypriot, :script_cyrillic,
        :script_devanagari, :script_deseret, :script_egyptian_hieroglyphs,
        :script_ethiopic, :script_georgian, :script_glagolitic,
        :script_gothic, :script_greek, :script_gujarati,
        :script_gurmukhi, :script_hangul, :script_han,
        :script_hanunoo, :script_hebrew, :script_hiragana,
        :script_katakana_or_hiragana, :script_old_italic, :script_javanese,
        :script_kayah_li, :script_katakana, :script_kharoshthi,
        :script_khmer, :script_kannada, :script_kaithi,
        :script_tai_tham, :script_lao, :script_latin,
        :script_lepcha, :script_limbu, :script_linear_b,
        :script_lisu, :script_lycian, :script_lydian,
        :script_malayalam, :script_mongolian, :script_meetei_mayek,
        :script_myanmar, :script_nko, :script_ogham,
        :script_ol_chiki, :script_old_turkic, :script_oriya,
        :script_osmanya, :script_phags_pa, :script_inscriptional_pahlavi,
        :script_phoenician, :script_inscriptional_parthian, :script_rejang,
        :script_runic, :script_samaritan, :script_old_south_arabian,
        :script_saurashtra, :script_shavian, :script_sinhala,
        :script_sundanese, :script_syloti_nagri, :script_syriac,
        :script_tagbanwa, :script_tai_le, :script_new_tai_lue,
        :script_tamil, :script_tai_viet, :script_telugu,
        :script_tifinagh, :script_tagalog, :script_thaana,
        :script_thai, :script_tibetan, :script_ugaritic,
        :script_vai, :script_old_persian, :script_cuneiform,
        :script_yi, :script_inherited, :script_common,
        :script_unknown
      ]

      # NOTE(review): Script_6_0 is not part of All below - presumably it is
      # added only by the syntax versions that support it; confirm.
      Script_6_0 = [:script_brahmi, :script_batak, :script_mandaic]

      All = CharType + POSIX + Category::All + Age + Derived + Script
    end

    # The same list is registered for both the property and the nonproperty
    # type.
    Map[UnicodeProperty::Type]    = UnicodeProperty::All
    Map[UnicodeProperty::NonType] = UnicodeProperty::All

  end
end
|
@@ -0,0 +1,37 @@
|
|
1
|
+
class Regexp

  # Member names of the Token struct, in positional order.
  TOKEN_KEYS = [:type, :token, :text, :ts, :te, :level, :set_level].freeze

  # A Struct with one member per entry of TOKEN_KEYS, plus optional links to
  # a following and a preceding object that are kept in instance variables
  # (they are not struct members and do not show up in to_h).
  Token = Struct.new(*TOKEN_KEYS) do
    # Returns the two-element array [ts, te].
    # NOTE(review): ts/te are presumably the start and end positions of the
    # token text - confirm against the code that builds tokens.
    def offset
      [ts, te]
    end

    # Returns te - ts. Note that this replaces Struct#length, which would
    # otherwise report the number of members.
    def length
      te - ts
    end

    # Returns a Hash mapping every member name (as a Symbol) to its value.
    # Member names are converted with to_sym because some Ruby versions
    # report them as Strings.
    def to_h
      Hash[members.map { |name| name.to_sym }.zip(values)]
    end

    # Combined reader/writer: with a truthy argument, stores it as the next
    # object and returns it; without one, returns the stored object (nil if
    # none was stored). A stored value cannot be reset to nil this way.
    def next(exp = nil)
      @next = exp if exp
      @next
    end

    # Combined reader/writer for the previous object; behaves like #next.
    def previous(exp = nil)
      @previous = exp if exp
      @previous
    end
  end

end
|