regexp_parser 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/ChangeLog +4 -0
- data/LICENSE +22 -0
- data/README.rdoc +307 -0
- data/Rakefile +91 -0
- data/lib/regexp_parser/ctype.rb +48 -0
- data/lib/regexp_parser/expression/property.rb +108 -0
- data/lib/regexp_parser/expression/set.rb +59 -0
- data/lib/regexp_parser/expression.rb +287 -0
- data/lib/regexp_parser/lexer.rb +105 -0
- data/lib/regexp_parser/parser.rb +417 -0
- data/lib/regexp_parser/scanner/property.rl +534 -0
- data/lib/regexp_parser/scanner/scanner.rl +712 -0
- data/lib/regexp_parser/scanner.rb +3325 -0
- data/lib/regexp_parser/syntax/ruby/1.8.6.rb +14 -0
- data/lib/regexp_parser/syntax/ruby/1.8.7.rb +14 -0
- data/lib/regexp_parser/syntax/ruby/1.8.rb +39 -0
- data/lib/regexp_parser/syntax/ruby/1.9.1.rb +39 -0
- data/lib/regexp_parser/syntax/ruby/1.9.2.rb +10 -0
- data/lib/regexp_parser/syntax/ruby/1.9.3.rb +24 -0
- data/lib/regexp_parser/syntax/ruby/1.9.rb +8 -0
- data/lib/regexp_parser/syntax/tokens.rb +332 -0
- data/lib/regexp_parser/syntax.rb +172 -0
- data/lib/regexp_parser.rb +45 -0
- data/test/helpers.rb +8 -0
- data/test/lexer/test_all.rb +26 -0
- data/test/lexer/test_literals.rb +120 -0
- data/test/lexer/test_nesting.rb +107 -0
- data/test/lexer/test_refcalls.rb +45 -0
- data/test/parser/test_all.rb +44 -0
- data/test/parser/test_alternation.rb +46 -0
- data/test/parser/test_anchors.rb +35 -0
- data/test/parser/test_errors.rb +59 -0
- data/test/parser/test_escapes.rb +48 -0
- data/test/parser/test_expression.rb +51 -0
- data/test/parser/test_groups.rb +69 -0
- data/test/parser/test_properties.rb +346 -0
- data/test/parser/test_quantifiers.rb +236 -0
- data/test/parser/test_refcalls.rb +101 -0
- data/test/parser/test_sets.rb +99 -0
- data/test/scanner/test_all.rb +30 -0
- data/test/scanner/test_anchors.rb +35 -0
- data/test/scanner/test_errors.rb +36 -0
- data/test/scanner/test_escapes.rb +49 -0
- data/test/scanner/test_groups.rb +41 -0
- data/test/scanner/test_literals.rb +85 -0
- data/test/scanner/test_meta.rb +36 -0
- data/test/scanner/test_properties.rb +315 -0
- data/test/scanner/test_quantifiers.rb +38 -0
- data/test/scanner/test_refcalls.rb +45 -0
- data/test/scanner/test_scripts.rb +314 -0
- data/test/scanner/test_sets.rb +80 -0
- data/test/scanner/test_types.rb +30 -0
- data/test/syntax/ruby/test_1.8.rb +57 -0
- data/test/syntax/ruby/test_1.9.1.rb +39 -0
- data/test/syntax/ruby/test_1.9.3.rb +38 -0
- data/test/syntax/ruby/test_all.rb +12 -0
- data/test/syntax/test_all.rb +19 -0
- data/test/test_all.rb +4 -0
- metadata +160 -0
@@ -0,0 +1,108 @@
|
|
1
|
+
module Regexp::Expression

  # Expression classes for property escapes written as \p{...} / \P{...}.
  # All of them derive from UnicodeProperty::Base and add no behaviour of
  # their own; the class itself identifies the kind of property.
  module UnicodeProperty

    # Shared behaviour for every property expression.
    class Base < Regexp::Expression::Base
      # Returns true when the expression's type is :nonproperty.
      def negative?
        @type == :nonproperty
      end

      # Returns the text found between the braces of a \p{...} or \P{...}
      # escape, or nil when the expression text does not have that shape.
      #
      # NOTE(review): for a caret-negated form such as \p{^Alpha} the value
      # returned here would include the leading '^' - confirm that this is
      # what callers expect.
      def name
        found = /\A\\[pP]\{([^}]+)\}\z/.match(@text)
        found && found[1]
      end
    end

    # Properties that live directly in the UnicodeProperty namespace.
    class Alnum   < Base; end
    class Alpha   < Base; end
    class Any     < Base; end
    class Ascii   < Base; end
    class Blank   < Base; end
    class Cntrl   < Base; end
    class Digit   < Base; end
    class Graph   < Base; end
    class Lower   < Base; end
    class Newline < Base; end
    class Print   < Base; end
    class Punct   < Base; end
    class Space   < Base; end
    class Upper   < Base; end
    class Word    < Base; end
    class Xdigit  < Base; end

    # Letter properties.
    module Letter
      class Base < UnicodeProperty::Base; end

      class Any       < Letter::Base; end
      class Uppercase < Letter::Base; end
      class Lowercase < Letter::Base; end
      class Titlecase < Letter::Base; end
      class Modifier  < Letter::Base; end
      class Other     < Letter::Base; end
    end

    # Mark properties.
    module Mark
      class Base < UnicodeProperty::Base; end

      class Any        < Mark::Base; end
      class Nonspacing < Mark::Base; end
      class Spacing    < Mark::Base; end
      class Enclosing  < Mark::Base; end
    end

    # Number properties.
    module Number
      class Base < UnicodeProperty::Base; end

      class Any     < Number::Base; end
      class Decimal < Number::Base; end
      class Letter  < Number::Base; end
      class Other   < Number::Base; end
    end

    # Punctuation properties.
    module Punctuation
      class Base < UnicodeProperty::Base; end

      class Any       < Punctuation::Base; end
      class Connector < Punctuation::Base; end
      class Dash      < Punctuation::Base; end
      class Open      < Punctuation::Base; end
      class Close     < Punctuation::Base; end
      class Initial   < Punctuation::Base; end
      class Final     < Punctuation::Base; end
      class Other     < Punctuation::Base; end
    end

    # Separator properties.
    module Separator
      class Base < UnicodeProperty::Base; end

      class Any       < Separator::Base; end
      class Space     < Separator::Base; end
      class Line      < Separator::Base; end
      class Paragraph < Separator::Base; end
    end

    # Symbol properties. Inside this module the constant Symbol resolves to
    # this namespace, not to Ruby's core ::Symbol class.
    module Symbol
      class Base < UnicodeProperty::Base; end

      class Any      < Symbol::Base; end
      class Math     < Symbol::Base; end
      class Currency < Symbol::Base; end
      class Modifier < Symbol::Base; end
      class Other    < Symbol::Base; end
    end

    # Codepoint properties.
    module Codepoint
      class Base < UnicodeProperty::Base; end

      class Any        < Codepoint::Base; end
      class Control    < Codepoint::Base; end
      class Format     < Codepoint::Base; end
      class Surrogate  < Codepoint::Base; end
      class PrivateUse < Codepoint::Base; end
      class Unassigned < Codepoint::Base; end
    end

    # Remaining property families, each represented by a single class.
    class Age     < UnicodeProperty::Base; end
    class Derived < UnicodeProperty::Base; end
    class Script  < UnicodeProperty::Base; end
  end

end # module Regexp::Expression
|
@@ -0,0 +1,59 @@
|
|
1
|
+
module Regexp::Expression

  # A character set expression. Members are collected in #members; a member
  # may itself be a CharacterSubSet, in which case additions and negation
  # are delegated to that trailing subset.
  class CharacterSet < Regexp::Expression::Base
    attr_accessor :members

    # token - forwarded unchanged to Regexp::Expression::Base#initialize.
    def initialize(token)
      @members  = []
      @negative = false
      super
    end

    # Adds a member to the set. While the most recently added member is a
    # CharacterSubSet, new members are appended to that subset instead.
    def <<(member)
      if @members.last.is_a?(CharacterSubSet)
        @members.last << member
      else
        @members << member
      end
    end

    # Returns true when +member+ equals the string form of one of the
    # members, or is included in one of the nested subsets.
    def include?(member)
      @members.any? do |m|
        m.is_a?(CharacterSubSet) ? m.include?(member) : member == m.to_s
      end
    end

    # Marks the set as negated. If the last member is a CharacterSubSet the
    # negation is applied to that subset rather than to this set.
    def negate
      if @members.last.is_a?(CharacterSubSet)
        @members.last.negate
      else
        @negative = true
      end
    end

    # Returns true once #negate has flagged this set itself as negative.
    def negative?
      @negative
    end
    alias :negated? :negative?

    # Returns the textual form of the set: the token text, '^' when the set
    # is negative, the joined members, a closing ']' and the quantifier text
    # when one is attached.
    #
    # The string is built on a copy of @text. Appending to @text itself
    # (as `s = @text; s << ...` does) would grow the stored token text on
    # every call, so repeated #to_s / #matches? calls would return ever
    # longer, invalid output, and it would raise on frozen token text.
    def to_s
      s = @text.dup
      s << '^' if negative?
      s << @members.join
      s << ']'
      s << @quantifier.to_s if quantified?
      s
    end

    # Returns true when +input+ matches the regular expression built from
    # this set's #to_s output, false otherwise.
    def matches?(input)
      input =~ /#{to_s}/ ? true : false
    end
  end

  # A set nested inside another set; behaves exactly like CharacterSet.
  class CharacterSubSet < CharacterSet; end

end # module Regexp::Expression
|
@@ -0,0 +1,287 @@
|
|
1
|
+
module Regexp::Expression

  # Common behaviour of every expression node: it remembers the type, token
  # and text of the token it was built from, holds child expressions, and
  # may carry a quantifier and an options hash.
  class Base
    attr_reader :type, :token, :text
    attr_reader :quantifier
    attr_reader :expressions

    attr_accessor :options

    # token - any object responding to #type, #token and #text.
    def initialize(token)
      @type        = token.type
      @token       = token.token
      @text        = token.text
      @options     = nil
      @expressions = []
    end

    # Returns the node's text followed by the string form of its children
    # and, when quantified, the quantifier text.
    #
    # The result is built on a copy of @text: appending to @text itself
    # would make the stored text grow on every call (and would raise on
    # frozen token text).
    def to_s
      s = @text.dup
      s << @expressions.map { |e| e.to_s }.join
      s << @quantifier.to_s if quantified?
      s
    end

    # Appends a child expression.
    def <<(exp)
      @expressions << exp
    end

    # Yields each child expression in order.
    def each(&block)
      @expressions.each(&block)
    end

    # Returns the child expression at +index+.
    def [](index)
      @expressions[index]
    end

    # Attaches a quantifier built from the given token, text, bounds and
    # mode, replacing any previous one.
    def quantify(token, text, min = nil, max = nil, mode = :greedy)
      @quantifier = Quantifier.new(token, text, min, max, mode)
    end

    # Returns true when a quantifier is attached.
    def quantified?
      !@quantifier.nil?
    end

    # Returns [min, max] of the quantifier, or [nil, nil] when the
    # expression is not quantified.
    def quantity
      return [nil, nil] unless quantified?
      [@quantifier.min, @quantifier.max]
    end

    # The mode predicates below return false for unquantified expressions
    # instead of raising NoMethodError on a nil quantifier.

    def greedy?
      quantified? && @quantifier.mode == :greedy
    end

    def reluctant?
      quantified? && @quantifier.mode == :reluctant
    end
    alias :lazy? :reluctant?

    def possessive?
      quantified? && @quantifier.mode == :possessive
    end

    # Option predicates: true only when an options hash is set and holds a
    # truthy value under the respective key.

    def multiline?
      (@options && @options[:m]) ? true : false
    end
    alias :m? :multiline?

    def case_insensitive?
      (@options && @options[:i]) ? true : false
    end
    alias :i? :case_insensitive?
    alias :ignore_case? :case_insensitive?

    def free_spacing?
      (@options && @options[:x]) ? true : false
    end
    alias :x? :free_spacing?
    alias :extended? :free_spacing?
  end

  # Top-level node of an expression tree. Its option predicates report the
  # options of the first child expression, and false when there is none.
  class Root < Regexp::Expression::Base
    def initialize
      super Regexp::Token.new(:expression, :root, '')
    end

    def multiline?
      first = @expressions[0]
      first ? first.m? : false
    end
    alias :m? :multiline?

    def case_insensitive?
      first = @expressions[0]
      first ? first.i? : false
    end
    alias :i? :case_insensitive?
    # `alias` binds to the method body that exists when it is evaluated, so
    # the long-form aliases inherited from Base would still call Base's
    # implementations; re-declare them to keep them in step with i? / x?.
    alias :ignore_case? :case_insensitive?

    def free_spacing?
      first = @expressions[0]
      first ? first.x? : false
    end
    alias :x? :free_spacing?
    alias :extended? :free_spacing?
  end

  # Value object describing a quantifier: its token, source text, bounds
  # and mode.
  class Quantifier
    attr_reader :token, :text, :min, :max, :mode

    def initialize(token, text, min, max, mode)
      @token = token
      @text  = text
      @mode  = mode
      @min   = min
      @max   = max
    end

    # Returns the quantifier's source text.
    def to_s
      @text
    end
    # to_str allows a Quantifier to be used where Ruby performs an implicit
    # String conversion (for example as the argument of String#<<).
    alias :to_str :to_s
  end

  class Literal < Regexp::Expression::Base; end

  # Back-reference and subexpression-call expressions.
  module Backreference
    class Base < Regexp::Expression::Base; end

    class Name              < Backreference::Base; end
    class Number            < Backreference::Base; end
    class NumberRelative    < Backreference::Base; end

    class NameNestLevel     < Backreference::Base; end
    class NumberNestLevel   < Backreference::Base; end

    class NameCall          < Backreference::Base; end
    class NumberCall        < Backreference::Base; end
    class NumberCallRelative < Backreference::Base; end
  end

  # Anchor expressions, plus short constant aliases for the longer names.
  module Anchor
    class Base < Regexp::Expression::Base; end

    class BeginningOfLine               < Anchor::Base; end
    class EndOfLine                     < Anchor::Base; end

    class BeginningOfString             < Anchor::Base; end
    class EndOfString                   < Anchor::Base; end

    class EndOfStringOrBeforeEndOfLine  < Anchor::Base; end

    class WordBoundary                  < Anchor::Base; end
    class NonWordBoundary               < Anchor::Base; end

    class MatchStart                    < Anchor::Base; end

    BOL      = BeginningOfLine
    EOL      = EndOfLine
    BOS      = BeginningOfString
    EOS      = EndOfString
    EOSobEOL = EndOfStringOrBeforeEndOfLine
  end

  # Character type expressions.
  module CharacterType
    class Base < Regexp::Expression::Base; end

    class Any      < CharacterType::Base; end
    class Digit    < CharacterType::Base; end
    class NonDigit < CharacterType::Base; end
    class Hex      < CharacterType::Base; end
    class NonHex   < CharacterType::Base; end
    class Word     < CharacterType::Base; end
    class NonWord  < CharacterType::Base; end
    class Space    < CharacterType::Base; end
    class NonSpace < CharacterType::Base; end
  end

  # Escape sequence expressions.
  module EscapeSequence
    class Base < Regexp::Expression::Base; end

    class Literal     < EscapeSequence::Base; end

    class AsciiEscape < EscapeSequence::Base; end
    class Backspace   < EscapeSequence::Base; end
    class Bell        < EscapeSequence::Base; end
    class FormFeed    < EscapeSequence::Base; end
    class Newline     < EscapeSequence::Base; end
    class Return      < EscapeSequence::Base; end
    class Space       < EscapeSequence::Base; end
    class Tab         < EscapeSequence::Base; end
    class VerticalTab < EscapeSequence::Base; end

    class Octal       < EscapeSequence::Base; end
    class Hex         < EscapeSequence::Base; end
    class HexWide     < EscapeSequence::Base; end

    class Control     < EscapeSequence::Base; end
    class Meta        < EscapeSequence::Base; end
    class MetaControl < EscapeSequence::Base; end
  end

  # A list of alternatives; each alternative is stored as a child
  # expression (a Sequence unless the caller supplies something else).
  class Alternation < Regexp::Expression::Base
    # Appends +exp+ to the most recently started alternative.
    def <<(exp)
      @expressions.last << exp
    end

    # Starts a new alternative: +exp+ when given, otherwise an empty
    # Sequence.
    def alternative(exp = nil)
      @expressions << (exp || Sequence.new)
    end

    # Returns the list of alternatives.
    def alternatives
      @expressions
    end

    # Quantifies the last expression of the last alternative.
    def quantify(token, text, min = nil, max = nil, mode = :greedy)
      @expressions.last.last.quantify(token, text, min, max, mode)
    end

    # Returns the alternatives' string forms joined with '|'.
    def to_s
      @expressions.map { |e| e.to_s }.join('|')
    end
  end

  # a sequence of expressions, used by alternations
  class Sequence < Regexp::Expression::Base
    def initialize
      super Regexp::Token.new(:expression, :sequence, '')
    end

    # Appends an expression to the end of the sequence.
    def <<(exp)
      @expressions << exp
    end

    # Inserts an expression at the front of the sequence.
    def insert(exp)
      @expressions.insert 0, exp
    end

    def first
      @expressions.first
    end

    def last
      @expressions.last
    end

    # Quantifies the last expression in the sequence.
    def quantify(token, text, min = nil, max = nil, mode = :greedy)
      last.quantify(token, text, min, max, mode)
    end
  end

  # Group expressions.
  module Group
    class Base < Regexp::Expression::Base
      # Returns true when the group's token is :capture or :named.
      def capturing?
        [:capture, :named].include? @token
      end

      # Returns true when the group's type is :comment.
      def comment?; @type == :comment end

      # Returns the group's text, its children, a closing ')' and the
      # quantifier text when one is attached. Built on a copy of @text so
      # that repeated calls do not keep appending to the stored text.
      def to_s
        s = @text.dup
        s << @expressions.join
        s << ')'
        s << @quantifier.to_s if quantified?
        s
      end
    end

    class Atomic  < Group::Base; end
    class Capture < Group::Base; end
    class Named   < Group::Base; end
    class Passive < Group::Base; end

    class Options < Group::Base; end

    # A comment group's string form is its text alone, with nothing
    # appended.
    class Comment < Group::Base
      def to_s; @text end
    end
  end

  # Assertion expressions. Declared with `class`, unlike the sibling
  # namespaces which are modules; kept that way so code that reopens it
  # with `class Assertion` continues to load.
  class Assertion
    class Base < Regexp::Expression::Group::Base; end

    class Lookahead          < Assertion::Base; end
    class NegativeLookahead  < Assertion::Base; end

    class Lookbehind         < Assertion::Base; end
    class NegativeLookbehind < Assertion::Base; end
  end

end # module Regexp::Expression
|
284
|
+
|
285
|
+
# Load the expression classes kept in separate files under expression/
# (property.rb and set.rb). This runs after the module body above so that
# Regexp::Expression::Base already exists when those files subclass it.
%w{property set}.each { |file| require File.expand_path("../expression/#{file}", __FILE__) }
|