regexp_parser 0.1.0

Files changed (59)
  1. data/ChangeLog +4 -0
  2. data/LICENSE +22 -0
  3. data/README.rdoc +307 -0
  4. data/Rakefile +91 -0
  5. data/lib/regexp_parser/ctype.rb +48 -0
  6. data/lib/regexp_parser/expression/property.rb +108 -0
  7. data/lib/regexp_parser/expression/set.rb +59 -0
  8. data/lib/regexp_parser/expression.rb +287 -0
  9. data/lib/regexp_parser/lexer.rb +105 -0
  10. data/lib/regexp_parser/parser.rb +417 -0
  11. data/lib/regexp_parser/scanner/property.rl +534 -0
  12. data/lib/regexp_parser/scanner/scanner.rl +712 -0
  13. data/lib/regexp_parser/scanner.rb +3325 -0
  14. data/lib/regexp_parser/syntax/ruby/1.8.6.rb +14 -0
  15. data/lib/regexp_parser/syntax/ruby/1.8.7.rb +14 -0
  16. data/lib/regexp_parser/syntax/ruby/1.8.rb +39 -0
  17. data/lib/regexp_parser/syntax/ruby/1.9.1.rb +39 -0
  18. data/lib/regexp_parser/syntax/ruby/1.9.2.rb +10 -0
  19. data/lib/regexp_parser/syntax/ruby/1.9.3.rb +24 -0
  20. data/lib/regexp_parser/syntax/ruby/1.9.rb +8 -0
  21. data/lib/regexp_parser/syntax/tokens.rb +332 -0
  22. data/lib/regexp_parser/syntax.rb +172 -0
  23. data/lib/regexp_parser.rb +45 -0
  24. data/test/helpers.rb +8 -0
  25. data/test/lexer/test_all.rb +26 -0
  26. data/test/lexer/test_literals.rb +120 -0
  27. data/test/lexer/test_nesting.rb +107 -0
  28. data/test/lexer/test_refcalls.rb +45 -0
  29. data/test/parser/test_all.rb +44 -0
  30. data/test/parser/test_alternation.rb +46 -0
  31. data/test/parser/test_anchors.rb +35 -0
  32. data/test/parser/test_errors.rb +59 -0
  33. data/test/parser/test_escapes.rb +48 -0
  34. data/test/parser/test_expression.rb +51 -0
  35. data/test/parser/test_groups.rb +69 -0
  36. data/test/parser/test_properties.rb +346 -0
  37. data/test/parser/test_quantifiers.rb +236 -0
  38. data/test/parser/test_refcalls.rb +101 -0
  39. data/test/parser/test_sets.rb +99 -0
  40. data/test/scanner/test_all.rb +30 -0
  41. data/test/scanner/test_anchors.rb +35 -0
  42. data/test/scanner/test_errors.rb +36 -0
  43. data/test/scanner/test_escapes.rb +49 -0
  44. data/test/scanner/test_groups.rb +41 -0
  45. data/test/scanner/test_literals.rb +85 -0
  46. data/test/scanner/test_meta.rb +36 -0
  47. data/test/scanner/test_properties.rb +315 -0
  48. data/test/scanner/test_quantifiers.rb +38 -0
  49. data/test/scanner/test_refcalls.rb +45 -0
  50. data/test/scanner/test_scripts.rb +314 -0
  51. data/test/scanner/test_sets.rb +80 -0
  52. data/test/scanner/test_types.rb +30 -0
  53. data/test/syntax/ruby/test_1.8.rb +57 -0
  54. data/test/syntax/ruby/test_1.9.1.rb +39 -0
  55. data/test/syntax/ruby/test_1.9.3.rb +38 -0
  56. data/test/syntax/ruby/test_all.rb +12 -0
  57. data/test/syntax/test_all.rb +19 -0
  58. data/test/test_all.rb +4 -0
  59. metadata +160 -0
data/lib/regexp_parser/lexer.rb
@@ -0,0 +1,105 @@
+ # A very thin wrapper around the scanner that breaks quantified literal runs,
+ # collects emitted tokens into an array, calculates their nesting depth,
+ # normalizes tokens for the parser, and checks that they are implemented by
+ # the given syntax flavor.
+ module Regexp::Lexer
+
+   OPENING_TOKENS = [:capture, :options, :passive, :atomic, :named,
+     :lookahead, :nlookahead, :lookbehind, :nlookbehind
+   ].freeze
+
+   CLOSING_TOKENS = [:close].freeze
+
+   def self.scan(input, syntax = 'ruby/1.9', &block)
+     syntax = Regexp::Syntax.new(syntax)
+
+     @tokens = []
+     @nesting, @set_nesting = 0, 0
+
+     last = nil
+     Regexp::Scanner.scan(input) do |type, token, text, ts, te|
+       type, token = *syntax.normalize(type, token)
+       syntax.check! type, token
+
+       self.ascend(type, token)
+
+       self.break_literal(last) if type == :quantifier and
+         last and last.type == :literal
+
+       current = Regexp::Token.new(type, token, text, ts, te,
+         @nesting, @set_nesting)
+
+       current = self.merge_literal(current) if type == :literal and
+         last and last.type == :literal
+
+       last.next(current) if last
+       current.previous(last) if last
+
+       @tokens << current
+       last = current
+
+       self.descend(type, token)
+     end
+
+     if block_given?
+       @tokens.each {|t| block.call(t)}
+     else
+       @tokens
+     end
+   end
+
+   def self.ascend(type, token)
+     if type == :group or type == :assertion
+       @nesting -= 1 if CLOSING_TOKENS.include?(token)
+     end
+
+     if type == :set or type == :subset
+       @set_nesting -= 1 if token == :close
+     end
+   end
+
+   def self.descend(type, token)
+     if type == :group or type == :assertion
+       @nesting += 1 if OPENING_TOKENS.include?(token)
+     end
+
+     if type == :set or type == :subset
+       @set_nesting += 1 if token == :open
+     end
+   end
+
+   # called by scan to break a literal run that is longer than one character
+   # into two separate tokens when it is followed by a quantifier
+   def self.break_literal(token)
+     text = token.text
+     if text.scan(/./mu).length > 1
+       lead = text.sub(/.\z/mu, "")
+       last = text[/.\z/mu] || ''
+
+       if RUBY_VERSION >= '1.9'
+         lead_length = lead.bytesize
+         last_length = last.bytesize
+       else
+         lead_length = lead.length
+         last_length = last.length
+       end
+
+       @tokens.pop
+       @tokens << Regexp::Token.new(:literal, :literal, lead, token.ts,
+         (token.te - last_length), @nesting, @set_nesting)
+
+       @tokens << Regexp::Token.new(:literal, :literal, last,
+         (token.ts + lead_length),
+         token.te, @nesting, @set_nesting)
+     end
+   end
+
+   # called by scan to merge two consecutive literals. this happens when tokens
+   # get normalized (as in the case of posix/bre) and end up becoming literals.
+   def self.merge_literal(current)
+     last = @tokens.pop
+     replace = Regexp::Token.new(:literal, :literal, last.text + current.text,
+       last.ts, current.te, @nesting, @set_nesting)
+   end
+
+ end # module Regexp::Lexer
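A minimal usage sketch for the lexer above, assuming the gem is installed and loaded with require 'regexp_parser' (the require path and the sample patterns are assumptions; the scan signature and the token fields are taken from the code in this diff):

    require 'regexp_parser'

    # Yields one Regexp::Token per scanned element; the trailing quantifier
    # triggers break_literal, so the literal run "ab" is split and the "+"
    # applies only to its last character.
    Regexp::Lexer.scan('ab+', 'ruby/1.9') do |token|
      puts [token.type, token.token, token.text, token.ts, token.te].inspect
    end

    # Without a block, scan returns the collected token array instead.
    tokens = Regexp::Lexer.scan('(foo|bar)')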
data/lib/regexp_parser/parser.rb
@@ -0,0 +1,417 @@
+ require File.expand_path('../expression', __FILE__)
+
+ module Regexp::Parser
+   include Regexp::Expression
+   include Regexp::Syntax
+
+   class ParserError < StandardError
+     def initialize(what)
+       super what
+     end
+   end
+
+   class UnknownTokenTypeError < ParserError
+     def initialize(type, token)
+       super "Unknown #{type} type #{token.inspect}"
+     end
+   end
+
+   class UnknownTokenError < ParserError
+     def initialize(type, token)
+       super "Unknown #{type} token #{token.token}"
+     end
+   end
+
+   def self.parse(input, syntax = :any, &block)
+     @nesting = [@root = @node = Root.new]
+
+     Regexp::Lexer.scan(input, syntax) do |token|
+       self.parse_token token
+     end
+
+     if block_given?
+       block.call @root
+     else
+       @root
+     end
+   end
+
+   def self.nest(exp)
+     @nesting.push exp
+
+     @node << exp
+     @node = exp
+   end
+
+   def self.parse_token(token)
+     case token.type
+     when :meta; self.meta(token)
+     when :quantifier; self.quantifier(token)
+     when :anchor; self.anchor(token)
+     when :escape; self.escape(token)
+     when :group; self.group(token)
+     when :assertion; self.group(token)
+     when :set, :subset; self.set(token)
+     when :type; self.type(token)
+     when :backref; self.backref(token)
+
+     when :property, :nonproperty
+       self.property(token)
+
+     when :literal
+       @node << Literal.new(token)
+
+     else
+       raise UnknownTokenTypeError.new(token.type, token)
+     end
+   end
+
+   def self.set(token)
+     case token.token
+     when :open
+       self.open_set(token)
+     when :close
+       self.close_set
+     when :negate
+       self.negate_set
+     when :member, :range, :escape, :collation, :equivalent
+       self.append_set(token)
+     when *Token::Escape::All
+       self.append_set(token)
+     when *Token::CharacterSet::All
+       self.append_set(token)
+     when *Token::UnicodeProperty::All
+       self.append_set(token)
+     else
+       raise UnknownTokenError.new('CharacterSet', token)
+     end
+   end
+
+   def self.meta(token)
+     case token.token
+     when :dot
+       @node << CharacterType::Any.new(token)
+     when :alternation
+       unless @node.token == :alternation
+         alt = Alternation.new(token)
+         seq = Sequence.new
+         while @node.expressions.last
+           seq.insert @node.expressions.pop
+         end
+         alt.alternative(seq)
+
+         @node << alt
+         @node = alt
+         @node.alternative
+       else
+         @node.alternative
+       end
+     else
+       raise UnknownTokenError.new('Meta', token)
+     end
+   end
+
+   def self.backref(token)
+     case token.token
+     when :name_ref
+       @node << Backreference::Name.new(token)
+     when :name_nest_ref
+       @node << Backreference::NameNestLevel.new(token)
+     when :name_call
+       @node << Backreference::NameCall.new(token)
+     when :number, :number_ref
+       @node << Backreference::Number.new(token)
+     when :number_rel_ref
+       @node << Backreference::NumberRelative.new(token)
+     when :number_nest_ref
+       @node << Backreference::NumberNestLevel.new(token)
+     when :number_call
+       @node << Backreference::NumberCall.new(token)
+     when :number_rel_call
+       @node << Backreference::NumberCallRelative.new(token)
+     else
+       raise UnknownTokenError.new('Backreference', token)
+     end
+   end
+
+   def self.type(token)
+     case token.token
+     when :digit
+       @node << CharacterType::Digit.new(token)
+     when :nondigit
+       @node << CharacterType::NonDigit.new(token)
+     when :hex
+       @node << CharacterType::Hex.new(token)
+     when :nonhex
+       @node << CharacterType::NonHex.new(token)
+     when :space
+       @node << CharacterType::Space.new(token)
+     when :nonspace
+       @node << CharacterType::NonSpace.new(token)
+     when :word
+       @node << CharacterType::Word.new(token)
+     when :nonword
+       @node << CharacterType::NonWord.new(token)
+     else
+       raise UnknownTokenError.new('CharacterType', token)
+     end
+   end
+
+   def self.property(token)
+     include Regexp::Expression::UnicodeProperty
+
+     case token.token
+     when :alnum; @node << Alnum.new(token)
+     when :alpha; @node << Alpha.new(token)
+     when :any; @node << Any.new(token)
+     when :ascii; @node << Ascii.new(token)
+     when :blank; @node << Blank.new(token)
+     when :cntrl; @node << Cntrl.new(token)
+     when :digit; @node << Digit.new(token)
+     when :graph; @node << Graph.new(token)
+     when :lower; @node << Lower.new(token)
+     when :print; @node << Print.new(token)
+     when :punct; @node << Punct.new(token)
+     when :space; @node << Space.new(token)
+     when :upper; @node << Upper.new(token)
+     when :word; @node << Word.new(token)
+     when :xdigit; @node << Xdigit.new(token)
+     when :newline; @node << Newline.new(token)
+
+     when :letter_any; @node << Letter::Any.new(token)
+     when :letter_uppercase; @node << Letter::Uppercase.new(token)
+     when :letter_lowercase; @node << Letter::Lowercase.new(token)
+     when :letter_titlecase; @node << Letter::Titlecase.new(token)
+     when :letter_modifier; @node << Letter::Modifier.new(token)
+     when :letter_other; @node << Letter::Other.new(token)
+
+     when :mark_any; @node << Mark::Any.new(token)
+     when :mark_nonspacing; @node << Mark::Nonspacing.new(token)
+     when :mark_spacing; @node << Mark::Spacing.new(token)
+     when :mark_enclosing; @node << Mark::Enclosing.new(token)
+
+     when :number_any; @node << Number::Any.new(token)
+     when :number_decimal; @node << Number::Decimal.new(token)
+     when :number_letter; @node << Number::Letter.new(token)
+     when :number_other; @node << Number::Other.new(token)
+
+     when :punct_any; @node << Punctuation::Any.new(token)
+     when :punct_connector; @node << Punctuation::Connector.new(token)
+     when :punct_dash; @node << Punctuation::Dash.new(token)
+     when :punct_open; @node << Punctuation::Open.new(token)
+     when :punct_close; @node << Punctuation::Close.new(token)
+     when :punct_initial; @node << Punctuation::Initial.new(token)
+     when :punct_final; @node << Punctuation::Final.new(token)
+     when :punct_other; @node << Punctuation::Other.new(token)
+
+     when :separator_any; @node << Separator::Any.new(token)
+     when :separator_space; @node << Separator::Space.new(token)
+     when :separator_line; @node << Separator::Line.new(token)
+     when :separator_para; @node << Separator::Paragraph.new(token)
+
+     when :symbol_any; @node << Symbol::Any.new(token)
+     when :symbol_math; @node << Symbol::Math.new(token)
+     when :symbol_currency; @node << Symbol::Currency.new(token)
+     when :symbol_modifier; @node << Symbol::Modifier.new(token)
+     when :symbol_other; @node << Symbol::Other.new(token)
+
+     when :other; @node << Codepoint::Any.new(token)
+     when :control; @node << Codepoint::Control.new(token)
+     when :format; @node << Codepoint::Format.new(token)
+     when :surrogate; @node << Codepoint::Surrogate.new(token)
+     when :private_use; @node << Codepoint::PrivateUse.new(token)
+     when :unassigned; @node << Codepoint::Unassigned.new(token)
+
+     when *Token::UnicodeProperty::Age
+       @node << Age.new(token)
+
+     when *Token::UnicodeProperty::Derived
+       @node << Derived.new(token)
+
+     when *Regexp::Syntax::Token::UnicodeProperty::Script
+       @node << Script.new(token)
+
+     else
+       raise UnknownTokenError.new('UnicodeProperty', token)
+     end
+   end
+
+   def self.anchor(token)
+     case token.token
+     when :beginning_of_line
+       @node << Anchor::BeginningOfLine.new(token)
+     when :end_of_line
+       @node << Anchor::EndOfLine.new(token)
+     when :bos
+       @node << Anchor::BOS.new(token)
+     when :eos
+       @node << Anchor::EOS.new(token)
+     when :eos_ob_eol
+       @node << Anchor::EOSobEOL.new(token)
+     when :word_boundary
+       @node << Anchor::WordBoundary.new(token)
+     when :nonword_boundary
+       @node << Anchor::NonWordBoundary.new(token)
+     when :match_start
+       @node << Anchor::MatchStart.new(token)
+     else
+       raise UnknownTokenError.new('Anchor', token)
+     end
+   end
+
+   def self.escape(token)
+     case token.token
+
+     when :backspace
+       @node << EscapeSequence::Backspace.new(token)
+
+     when :escape
+       @node << EscapeSequence::AsciiEscape.new(token)
+     when :bell
+       @node << EscapeSequence::Bell.new(token)
+     when :form_feed
+       @node << EscapeSequence::FormFeed.new(token)
+     when :newline
+       @node << EscapeSequence::Newline.new(token)
+     when :carriage
+       @node << EscapeSequence::Return.new(token)
+     when :space
+       @node << EscapeSequence::Space.new(token)
+     when :tab
+       @node << EscapeSequence::Tab.new(token)
+     when :vertical_tab
+       @node << EscapeSequence::VerticalTab.new(token)
+
+     when :control
+       @node << EscapeSequence::Control.new(token)
+
+     else
+       # treating everything else as a literal
+       @node << EscapeSequence::Literal.new(token)
+     end
+   end
+
+   def self.quantifier(token)
+     case token.token
+     when :zero_or_one
+       @node.expressions.last.quantify(:zero_or_one, token.text, 0, 1, :greedy)
+     when :zero_or_one_reluctant
+       @node.expressions.last.quantify(:zero_or_one, token.text, 0, 1, :reluctant)
+     when :zero_or_one_possessive
+       @node.expressions.last.quantify(:zero_or_one, token.text, 0, 1, :possessive)
+
+     when :zero_or_more
+       @node.expressions.last.quantify(:zero_or_more, token.text, 0, -1, :greedy)
+     when :zero_or_more_reluctant
+       @node.expressions.last.quantify(:zero_or_more, token.text, 0, -1, :reluctant)
+     when :zero_or_more_possessive
+       @node.expressions.last.quantify(:zero_or_more, token.text, 0, -1, :possessive)
+
+     when :one_or_more
+       @node.expressions.last.quantify(:one_or_more, token.text, 1, -1, :greedy)
+     when :one_or_more_reluctant
+       @node.expressions.last.quantify(:one_or_more, token.text, 1, -1, :reluctant)
+     when :one_or_more_possessive
+       @node.expressions.last.quantify(:one_or_more, token.text, 1, -1, :possessive)
+
+     when :interval
+       self.interval(token.text)
+
+     else
+       raise UnknownTokenError.new('Quantifier', token)
+     end
+   end
+
+   def self.interval(text)
+     mchr = text[text.length-1].chr =~ /[?+]/ ? text[text.length-1].chr : nil
+     mode = case mchr
+       when '?'; text.chop!; :reluctant
+       when '+'; text.chop!; :possessive
+       else :greedy
+     end
+
+     range = text.gsub(/\{|\}/, '').split(',', 2).each {|i| i.strip}
+     min = range[0].empty? ? 0 : range[0]
+     max = range[1] ? (range[1].empty? ? -1 : range[1]) : min
+
+     @node.expressions.last.quantify(:interval, text, min.to_i, max.to_i, mode)
+   end
+
+   def self.group(token)
+     case token.token
+     when :options
+       self.options(token)
+     when :close
+       self.close_group
+     when :comment
+       @node << Group::Comment.new(token)
+     else
+       self.open_group(token)
+     end
+   end
+
+   def self.options(token)
+     opt = token.text.split('-', 2)
+
+     exp = Group::Options.new(token)
+     exp.options = {
+       :m => opt[0].include?('m') ? true : false,
+       :i => opt[0].include?('i') ? true : false,
+       :x => opt[0].include?('x') ? true : false
+     }
+
+     self.nest exp
+   end
+
+   def self.open_group(token)
+     case token.token
+     when :passive
+       exp = Group::Passive.new(token)
+     when :atomic
+       exp = Group::Atomic.new(token)
+     when :named
+       exp = Group::Named.new(token)
+     when :capture
+       exp = Group::Capture.new(token)
+
+     when :lookahead
+       exp = Assertion::Lookahead.new(token)
+     when :nlookahead
+       exp = Assertion::NegativeLookahead.new(token)
+     when :lookbehind
+       exp = Assertion::Lookbehind.new(token)
+     when :nlookbehind
+       exp = Assertion::NegativeLookbehind.new(token)
+
+     else
+       raise UnknownTokenError.new('Group type open', token)
+     end
+
+     self.nest exp
+   end
+
+   def self.close_group
+     last_group = @nesting.pop
+     @node = @nesting.last
+   end
+
+   def self.open_set(token)
+     if token.type == :subset
+       @set << CharacterSubSet.new(token)
+     else
+       @node << (@set = CharacterSet.new(token))
+     end
+   end
+
+   def self.negate_set
+     @set.negate
+   end
+
+   def self.append_set(token)
+     @set << token.text
+   end
+
+   def self.close_set
+   end
+
+ end # module Regexp::Parser
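A similar sketch for the parser, under the same assumptions (Root, the expressions accessor, and the parse signature appear in the code above; the sample patterns are illustrative only):

    require 'regexp_parser'

    # parse builds a tree rooted at a Root expression: group and assertion
    # tokens nest child nodes via nest/close_group, quantifier tokens attach
    # to the expression parsed immediately before them, and set tokens are
    # collected into CharacterSet nodes.
    root = Regexp::Parser.parse('a(b|c)+\d', 'ruby/1.9')
    root.expressions.each { |exp| p exp }

    # With a block, parse yields the root node instead of returning it.
    Regexp::Parser.parse('a?') { |tree| p tree.expressions.length }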