regexp_parser 0.1.1 → 0.1.5

Sign up to get free protection for your applications and to get access to all the features.
Files changed (61) hide show
  1. checksums.yaml +7 -0
  2. data/ChangeLog +45 -0
  3. data/Rakefile +12 -44
  4. data/VERSION.yml +5 -0
  5. data/lib/regexp_parser.rb +5 -38
  6. data/lib/regexp_parser/expression.rb +68 -221
  7. data/lib/regexp_parser/expression/classes/alternation.rb +47 -0
  8. data/lib/regexp_parser/expression/classes/anchor.rb +26 -0
  9. data/lib/regexp_parser/expression/classes/backref.rb +42 -0
  10. data/lib/regexp_parser/expression/classes/escape.rb +27 -0
  11. data/lib/regexp_parser/expression/classes/group.rb +67 -0
  12. data/lib/regexp_parser/expression/classes/literal.rb +7 -0
  13. data/lib/regexp_parser/expression/{property.rb → classes/property.rb} +1 -1
  14. data/lib/regexp_parser/expression/classes/root.rb +26 -0
  15. data/lib/regexp_parser/expression/classes/set.rb +100 -0
  16. data/lib/regexp_parser/expression/classes/type.rb +17 -0
  17. data/lib/regexp_parser/expression/quantifier.rb +26 -0
  18. data/lib/regexp_parser/expression/subexpression.rb +69 -0
  19. data/lib/regexp_parser/lexer.rb +4 -4
  20. data/lib/regexp_parser/parser.rb +31 -13
  21. data/lib/regexp_parser/scanner.rb +1849 -1488
  22. data/lib/regexp_parser/scanner/property.rl +7 -2
  23. data/lib/regexp_parser/scanner/scanner.rl +377 -191
  24. data/lib/regexp_parser/syntax.rb +7 -0
  25. data/lib/regexp_parser/syntax/ruby/1.8.6.rb +4 -4
  26. data/lib/regexp_parser/syntax/ruby/1.9.1.rb +9 -9
  27. data/lib/regexp_parser/syntax/ruby/2.0.0.rb +16 -0
  28. data/lib/regexp_parser/syntax/ruby/2.1.0.rb +13 -0
  29. data/lib/regexp_parser/syntax/tokens.rb +21 -320
  30. data/lib/regexp_parser/syntax/tokens/anchor.rb +17 -0
  31. data/lib/regexp_parser/syntax/tokens/assertion.rb +15 -0
  32. data/lib/regexp_parser/syntax/tokens/backref.rb +26 -0
  33. data/lib/regexp_parser/syntax/tokens/character_set.rb +48 -0
  34. data/lib/regexp_parser/syntax/tokens/character_type.rb +16 -0
  35. data/lib/regexp_parser/syntax/tokens/escape.rb +29 -0
  36. data/lib/regexp_parser/syntax/tokens/group.rb +22 -0
  37. data/lib/regexp_parser/syntax/tokens/meta.rb +15 -0
  38. data/lib/regexp_parser/syntax/tokens/quantifier.rb +37 -0
  39. data/lib/regexp_parser/syntax/tokens/unicode_property.rb +204 -0
  40. data/lib/regexp_parser/token.rb +37 -0
  41. data/test/expression/test_all.rb +7 -0
  42. data/test/expression/test_base.rb +72 -0
  43. data/test/expression/test_clone.rb +144 -0
  44. data/test/{parser/test_expression.rb → expression/test_to_s.rb} +10 -10
  45. data/test/helpers.rb +1 -0
  46. data/test/parser/test_all.rb +1 -1
  47. data/test/parser/test_alternation.rb +35 -0
  48. data/test/parser/test_anchors.rb +2 -2
  49. data/test/parser/test_refcalls.rb +1 -1
  50. data/test/parser/test_sets.rb +54 -8
  51. data/test/scanner/test_anchors.rb +2 -2
  52. data/test/scanner/test_conditionals.rb +31 -0
  53. data/test/scanner/test_errors.rb +88 -8
  54. data/test/scanner/test_escapes.rb +4 -4
  55. data/test/scanner/test_groups.rb +7 -0
  56. data/test/scanner/test_quoting.rb +29 -0
  57. data/test/scanner/test_sets.rb +1 -0
  58. data/test/syntax/ruby/test_1.8.rb +3 -3
  59. data/test/test_all.rb +1 -1
  60. metadata +62 -48
  61. data/lib/regexp_parser/expression/set.rb +0 -59
@@ -0,0 +1,17 @@
module Regexp::Syntax
  module Token

    # Symbols naming the anchor tokens, grouped into sets, plus the token
    # type they are registered under.
    module Anchor
      Type       = :anchor

      Basic      = [:bol, :eol]
      Extended   = Basic + [:word_boundary, :nonword_boundary]
      String     = [:bos, :eos, :eos_ob_eol]
      MatchStart = [:match_start]

      # Every anchor token: the extended set followed by the string and
      # match-start sets.
      All = [Extended, String, MatchStart].flatten
    end

    # Map is defined outside this file (presumably the type => tokens
    # registry of Regexp::Syntax::Token -- confirm against tokens.rb).
    Map[Anchor::Type] = Anchor::All

  end
end
@@ -0,0 +1,15 @@
module Regexp::Syntax
  module Token

    # Symbols naming the lookaround assertion tokens and the token type
    # they are registered under.
    module Assertion
      Type       = :assertion

      Lookahead  = [:lookahead,  :nlookahead]
      Lookbehind = [:lookbehind, :nlookbehind]

      # Lookahead tokens first, then lookbehind tokens.
      All = [Lookahead, Lookbehind].flatten
    end

    # Map is defined outside this file.
    Map[Assertion::Type] = Assertion::All

  end
end
@@ -0,0 +1,26 @@
module Regexp::Syntax
  module Token

    # Symbols naming the back-reference tokens: by name, by number
    # (including the relative form), and the nest-level variants.
    module Backreference
      Type      = :backref

      Name      = [:name_ref]
      Number    = [:number_ref, :number_rel_ref]
      NestLevel = [:name_nest_ref, :number_nest_ref]

      All = [Name, Number, NestLevel].flatten
    end

    # Subexpression calls have no Type of their own; they are registered
    # under Backreference::Type below, so they live in this file for now.
    module SubexpressionCall
      Name   = [:name_call]
      Number = [:number_call, :number_rel_call]

      All = [Name, Number].flatten
    end

    # Both groups share the :backref entry. Map is defined outside this file.
    Map[Backreference::Type] = Backreference::All + SubexpressionCall::All

  end
end
@@ -0,0 +1,48 @@
module Regexp::Syntax
  module Token

    # Symbols naming the tokens that can appear inside a character set,
    # plus the token types used for sets and nested sub-sets.
    module CharacterSet
      OpenClose = [:open, :close]

      Basic     = [:negate, :member, :range]
      Extended  = Basic + [:escape, :intersection, :range_hex, :backspace]

      Types     = [:type_digit, :type_nondigit, :type_hex, :type_nonhex,
                   :type_space, :type_nonspace, :type_word, :type_nonword]

      # POSIX bracket-class tokens and their negated counterparts.
      module POSIX
        Standard = [
          :class_alnum, :class_alpha, :class_blank, :class_cntrl,
          :class_digit, :class_graph, :class_lower, :class_print,
          :class_punct, :class_space, :class_upper, :class_xdigit,
        ]

        StandardNegative = [
          :class_nonalnum,  :class_nonalpha,  :class_nonblank,
          :class_noncntrl,  :class_nondigit,  :class_nongraph,
          :class_nonlower,  :class_nonprint,  :class_nonpunct,
          :class_nonspace,  :class_nonupper,  :class_nonxdigit,
        ]

        Extensions         = [:class_ascii, :class_word]
        ExtensionsNegative = [:class_nonascii, :class_nonword]

        All = Standard + StandardNegative + Extensions + ExtensionsNegative
      end

      # Extended already begins with Basic, so Basic is not added again;
      # the previous "Basic + Extended + ..." listed :negate, :member and
      # :range twice. OpenClose is not part of All, as before.
      All  = Extended + Types + POSIX::All
      Type = :set

      # Nested sets accept the same tokens as top-level sets.
      module SubSet
        OpenClose = [:open, :close]

        All  = CharacterSet::All
        Type = :subset
      end
    end

    # Map is defined outside this file.
    Map[CharacterSet::Type]         = CharacterSet::All
    Map[CharacterSet::SubSet::Type] = CharacterSet::SubSet::All

  end
end
@@ -0,0 +1,16 @@
module Regexp::Syntax
  module Token

    # Symbols naming the character-type tokens and the token type they
    # are registered under.
    module CharacterType
      Type     = :type

      # The basic set is intentionally empty.
      Basic    = []
      Extended = [:digit, :nondigit, :space, :nonspace, :word, :nonword]
      Hex      = [:hex, :nonhex]

      All = [Basic, Extended, Hex].flatten
    end

    # Map is defined outside this file.
    Map[CharacterType::Type] = CharacterType::All

  end
end
@@ -0,0 +1,29 @@
module Regexp::Syntax
  module Token

    # Symbols naming the escape-sequence tokens, grouped into sets, plus
    # the token type they are registered under.
    module Escape
      Basic         = [:backslash, :literal]

      Backreference = [:digit]

      Control       = [:control, :meta_sequence]

      ASCII = [:bell, :backspace, :escape, :form_feed, :newline, :carriage,
               :space, :tab, :vertical_tab]

      # Escaped meta characters. The last entry used to be misspelled
      # :baclslash, a symbol that appears nowhere else in this file;
      # Basic spells it :backslash.
      Meta  = [:dot, :alternation,
               :zero_or_one, :zero_or_more, :one_or_more,
               :bol, :eol,
               :group_open, :group_close,
               :interval_open, :interval_close,
               :set_open, :set_close,
               :backslash]

      # Control was defined but missing from All, so :control and
      # :meta_sequence were never registered under the :escape type.
      # uniq drops the second :backslash (it is in both Basic and Meta).
      All   = (Basic + Backreference + Control + ASCII + Meta).uniq
      Type  = :escape
    end

    # Map is defined outside this file.
    Map[Escape::Type] = Escape::All

  end
end
@@ -0,0 +1,22 @@
module Regexp::Syntax
  module Token

    # Symbols naming the group tokens and the token type they are
    # registered under.
    module Group
      Type     = :group

      Basic    = [:capture, :close]
      Extended = Basic + [:options]

      Named    = [:named]
      Atomic   = [:atomic]
      Passive  = [:passive]
      Comment  = [:comment]

      # Extended already contains Basic, so Basic is not listed separately.
      All = [Extended, Named, Atomic, Passive, Comment].flatten
    end

    # Map is defined outside this file.
    Map[Group::Type] = Group::All

  end
end
@@ -0,0 +1,15 @@
module Regexp::Syntax
  module Token

    # Symbols naming the meta tokens and the token type they are
    # registered under.
    module Meta
      Type     = :meta

      Basic    = [:dot]
      Extended = Basic + [:alternation]

      # All is the very same array object as Extended.
      All = Extended
    end

    # Map is defined outside this file.
    Map[Meta::Type] = Meta::All

  end
end
@@ -0,0 +1,37 @@
module Regexp::Syntax
  module Token

    # Symbols naming the quantifier tokens in their greedy, reluctant and
    # possessive forms, plus the token type they are registered under.
    module Quantifier
      Type = :quantifier

      Greedy = [:zero_or_one, :zero_or_more, :one_or_more]

      # The reluctant and possessive names are the greedy names with a
      # suffix appended, e.g. :zero_or_one => :zero_or_one_reluctant.
      Reluctant  = Greedy.map { |name| :"#{name}_reluctant" }
      Possessive = Greedy.map { |name| :"#{name}_possessive" }

      Interval           = [:interval]
      IntervalReluctant  = [:interval_reluctant]
      IntervalPossessive = [:interval_possessive]

      IntervalAll = [Interval, IntervalReluctant, IntervalPossessive].flatten

      All = [Greedy, Reluctant, Possessive, IntervalAll].flatten
    end

    # Map is defined outside this file.
    Map[Quantifier::Type] = Quantifier::All

  end
end
@@ -0,0 +1,204 @@
module Regexp::Syntax
  module Token

    # Symbols naming the unicode property tokens, grouped into sets, plus
    # the two token types (:property and :nonproperty) they are
    # registered under.
    module UnicodeProperty
      Type    = :property
      NonType = :nonproperty

      CharType = %w[
        alnum alpha ascii blank cntrl digit graph lower
        print punct space upper word xdigit
      ].map { |name| name.to_sym }

      POSIX = [:any, :assigned, :newline]

      # General-category tokens; each list is built as <prefix>_<kind>.
      module Category
        Letter = %w[any uppercase lowercase titlecase modifier other].
          map { |kind| :"letter_#{kind}" }

        Mark = %w[any nonspacing spacing enclosing].
          map { |kind| :"mark_#{kind}" }

        Number = %w[any decimal letter other].
          map { |kind| :"number_#{kind}" }

        Punctuation = %w[any connector dash open close initial final other].
          map { |kind| :"punct_#{kind}" }

        Symbol = %w[any math currency modifier other].
          map { |kind| :"symbol_#{kind}" }

        Separator = %w[any space line para].
          map { |kind| :"separator_#{kind}" }

        Codepoint = [:other, :control, :format,
                     :surrogate, :private_use, :unassigned]

        All = [Letter, Mark, Number, Punctuation,
               Symbol, Separator, Codepoint].flatten
      end

      # Age tokens, :age_1_1 through :age_6_0.
      Age = %w[1_1 2_0 2_1 3_0 3_1 3_2 4_0 4_1 5_0 5_1 5_2 6_0].
        map { |version| :"age_#{version}" }

      Derived = %w[
        ascii_hex
        alphabetic
        cased
        changes_when_casefolded
        changes_when_casemapped
        changes_when_lowercased
        changes_when_titlecased
        changes_when_uppercased
        case_ignorable
        bidi_control
        dash
        deprecated
        default_ignorable_cp
        diacritic
        extender
        grapheme_base
        grapheme_extend
        grapheme_link
        hex_digit
        hyphen
        id_continue
        ideographic
        id_start
        ids_binary_op
        ids_trinary_op
        join_control
        logical_order_exception
        lowercase
        math
        non_character_cp
        other_alphabetic
        other_default_ignorable_cp
        other_grapheme_extended
        other_id_continue
        other_id_start
        other_lowercase
        other_math
        other_uppercase
        pattern_syntax
        pattern_whitespace
        quotation_mark
        radical
        soft_dotted
        sentence_terminal
        terminal_punctuation
        unified_ideograph
        uppercase
        variation_selector
        whitespace
        xid_start
        xid_continue
      ].map { |name| name.to_sym }

      # Script tokens; every entry becomes :script_<name>.
      Script = %w[
        arabic
        imperial_aramaic
        armenian
        avestan
        balinese
        bamum
        bengali
        bopomofo
        braille
        buginese
        buhid
        canadian_aboriginal
        carian
        cham
        cherokee
        coptic
        cypriot
        cyrillic
        devanagari
        deseret
        egyptian_hieroglyphs
        ethiopic
        georgian
        glagolitic
        gothic
        greek
        gujarati
        gurmukhi
        hangul
        han
        hanunoo
        hebrew
        hiragana
        katakana_or_hiragana
        old_italic
        javanese
        kayah_li
        katakana
        kharoshthi
        khmer
        kannada
        kaithi
        tai_tham
        lao
        latin
        lepcha
        limbu
        linear_b
        lisu
        lycian
        lydian
        malayalam
        mongolian
        meetei_mayek
        myanmar
        nko
        ogham
        ol_chiki
        old_turkic
        oriya
        osmanya
        phags_pa
        inscriptional_pahlavi
        phoenician
        inscriptional_parthian
        rejang
        runic
        samaritan
        old_south_arabian
        saurashtra
        shavian
        sinhala
        sundanese
        syloti_nagri
        syriac
        tagbanwa
        tai_le
        new_tai_lue
        tamil
        tai_viet
        telugu
        tifinagh
        tagalog
        thaana
        thai
        tibetan
        ugaritic
        vai
        old_persian
        cuneiform
        yi
        inherited
        common
        unknown
      ].map { |name| :"script_#{name}" }

      # NOTE(review): Script_6_0 is not part of All below, so these three
      # tokens are not registered in Map here -- confirm that is intended.
      Script_6_0 = [:script_brahmi, :script_batak, :script_mandaic]

      All = [CharType, POSIX, Category::All, Age, Derived, Script].flatten
    end

    # Both the property and the negated-property type accept the same
    # tokens. Map is defined outside this file.
    Map[UnicodeProperty::Type]    = UnicodeProperty::All
    Map[UnicodeProperty::NonType] = UnicodeProperty::All

  end
end
@@ -0,0 +1,37 @@
class Regexp

  # Member names of the Token struct, in positional order.
  TOKEN_KEYS = [:type, :token, :text, :ts, :te, :level, :set_level].freeze

  # A token record. ts and te are presumably the start and end offsets of
  # the token text in the scanned input -- confirm against the scanner.
  Token = Struct.new(*TOKEN_KEYS) do
    # Returns the [ts, te] pair.
    def offset
      [ts, te]
    end

    # Returns te minus ts.
    def length
      te - ts
    end

    # Returns a hash of member name (as a symbol) => member value.
    def to_h
      members.inject({}) do |result, member|
        key = member.to_sym
        result[key] = send(key)
        result
      end
    end

    # Combined reader/writer: with a truthy argument, stores it as the
    # next token and returns it; otherwise returns the stored value
    # (nil if none was set).
    def next(exp = nil)
      return @next unless exp
      @next = exp
    end

    # Combined reader/writer for the previous token; same contract as #next.
    def previous(exp = nil)
      return @previous unless exp
      @previous = exp
    end
  end

end