regexp_parser 0.1.0

This diff represents the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (59)
  1. data/ChangeLog +4 -0
  2. data/LICENSE +22 -0
  3. data/README.rdoc +307 -0
  4. data/Rakefile +91 -0
  5. data/lib/regexp_parser/ctype.rb +48 -0
  6. data/lib/regexp_parser/expression/property.rb +108 -0
  7. data/lib/regexp_parser/expression/set.rb +59 -0
  8. data/lib/regexp_parser/expression.rb +287 -0
  9. data/lib/regexp_parser/lexer.rb +105 -0
  10. data/lib/regexp_parser/parser.rb +417 -0
  11. data/lib/regexp_parser/scanner/property.rl +534 -0
  12. data/lib/regexp_parser/scanner/scanner.rl +712 -0
  13. data/lib/regexp_parser/scanner.rb +3325 -0
  14. data/lib/regexp_parser/syntax/ruby/1.8.6.rb +14 -0
  15. data/lib/regexp_parser/syntax/ruby/1.8.7.rb +14 -0
  16. data/lib/regexp_parser/syntax/ruby/1.8.rb +39 -0
  17. data/lib/regexp_parser/syntax/ruby/1.9.1.rb +39 -0
  18. data/lib/regexp_parser/syntax/ruby/1.9.2.rb +10 -0
  19. data/lib/regexp_parser/syntax/ruby/1.9.3.rb +24 -0
  20. data/lib/regexp_parser/syntax/ruby/1.9.rb +8 -0
  21. data/lib/regexp_parser/syntax/tokens.rb +332 -0
  22. data/lib/regexp_parser/syntax.rb +172 -0
  23. data/lib/regexp_parser.rb +45 -0
  24. data/test/helpers.rb +8 -0
  25. data/test/lexer/test_all.rb +26 -0
  26. data/test/lexer/test_literals.rb +120 -0
  27. data/test/lexer/test_nesting.rb +107 -0
  28. data/test/lexer/test_refcalls.rb +45 -0
  29. data/test/parser/test_all.rb +44 -0
  30. data/test/parser/test_alternation.rb +46 -0
  31. data/test/parser/test_anchors.rb +35 -0
  32. data/test/parser/test_errors.rb +59 -0
  33. data/test/parser/test_escapes.rb +48 -0
  34. data/test/parser/test_expression.rb +51 -0
  35. data/test/parser/test_groups.rb +69 -0
  36. data/test/parser/test_properties.rb +346 -0
  37. data/test/parser/test_quantifiers.rb +236 -0
  38. data/test/parser/test_refcalls.rb +101 -0
  39. data/test/parser/test_sets.rb +99 -0
  40. data/test/scanner/test_all.rb +30 -0
  41. data/test/scanner/test_anchors.rb +35 -0
  42. data/test/scanner/test_errors.rb +36 -0
  43. data/test/scanner/test_escapes.rb +49 -0
  44. data/test/scanner/test_groups.rb +41 -0
  45. data/test/scanner/test_literals.rb +85 -0
  46. data/test/scanner/test_meta.rb +36 -0
  47. data/test/scanner/test_properties.rb +315 -0
  48. data/test/scanner/test_quantifiers.rb +38 -0
  49. data/test/scanner/test_refcalls.rb +45 -0
  50. data/test/scanner/test_scripts.rb +314 -0
  51. data/test/scanner/test_sets.rb +80 -0
  52. data/test/scanner/test_types.rb +30 -0
  53. data/test/syntax/ruby/test_1.8.rb +57 -0
  54. data/test/syntax/ruby/test_1.9.1.rb +39 -0
  55. data/test/syntax/ruby/test_1.9.3.rb +38 -0
  56. data/test/syntax/ruby/test_all.rb +12 -0
  57. data/test/syntax/test_all.rb +19 -0
  58. data/test/test_all.rb +4 -0
  59. metadata +160 -0
data/lib/regexp_parser/lexer.rb
@@ -0,0 +1,105 @@
+ # A very thin wrapper around the scanner that breaks quantified literal runs,
+ # collects emitted tokens into an array, calculates their nesting depth,
+ # normalizes tokens for the parser, and checks whether they are implemented
+ # by the given syntax flavor.
+ module Regexp::Lexer
+
+   OPENING_TOKENS = [:capture, :options, :passive, :atomic, :named,
+                     :lookahead, :nlookahead, :lookbehind, :nlookbehind
+                    ].freeze
+
+   CLOSING_TOKENS = [:close].freeze
+
+   def self.scan(input, syntax = 'ruby/1.9', &block)
+     syntax = Regexp::Syntax.new(syntax)
+
+     @tokens = []
+     @nesting, @set_nesting = 0, 0
+
+     last = nil
+     Regexp::Scanner.scan(input) do |type, token, text, ts, te|
+       type, token = *syntax.normalize(type, token)
+       syntax.check! type, token
+
+       self.ascend(type, token)
+
+       self.break_literal(last) if type == :quantifier and
+         last and last.type == :literal
+
+       current = Regexp::Token.new(type, token, text, ts, te,
+                                    @nesting, @set_nesting)
+
+       current = self.merge_literal(current) if type == :literal and
+         last and last.type == :literal
+
+       last.next(current) if last
+       current.previous(last) if last
+
+       @tokens << current
+       last = current
+
+       self.descend(type, token)
+     end
+
+     if block_given?
+       @tokens.each {|t| block.call(t)}
+     else
+       @tokens
+     end
+   end
+
+   def self.ascend(type, token)
+     if type == :group or type == :assertion
+       @nesting -= 1 if CLOSING_TOKENS.include?(token)
+     end
+
+     if type == :set or type == :subset
+       @set_nesting -= 1 if token == :close
+     end
+   end
+
+   def self.descend(type, token)
+     if type == :group or type == :assertion
+       @nesting += 1 if OPENING_TOKENS.include?(token)
+     end
+
+     if type == :set or type == :subset
+       @set_nesting += 1 if token == :open
+     end
+   end
+
+   # called by scan to break a literal run that is longer than one character
+   # into two separate tokens when it is followed by a quantifier
+   def self.break_literal(token)
+     text = token.text
+     if text.scan(/./mu).length > 1
+       lead = text.sub(/.\z/mu, "")
+       last = text[/.\z/mu] || ''
+
+       if RUBY_VERSION >= '1.9'
+         lead_length = lead.bytesize
+         last_length = last.bytesize
+       else
+         lead_length = lead.length
+         last_length = last.length
+       end
+
+       @tokens.pop
+       @tokens << Regexp::Token.new(:literal, :literal, lead, token.ts,
+                                    (token.te - last_length), @nesting, @set_nesting)
+
+       @tokens << Regexp::Token.new(:literal, :literal, last,
+                                    (token.ts + lead_length),
+                                    token.te, @nesting, @set_nesting)
+     end
+   end
+
+   # called by scan to merge two consecutive literals. this happens when tokens
+   # get normalized (as in the case of posix/bre) and end up becoming literals.
+   def self.merge_literal(current)
+     last = @tokens.pop
+     replace = Regexp::Token.new(:literal, :literal, last.text + current.text,
+                                 last.ts, current.te, @nesting, @set_nesting)
+   end
+
+ end # module Regexp::Lexer
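For orientation, here is a minimal usage sketch of the lexer shown above. It is not part of the package diff; the pattern (passed as a plain source string), the syntax name, and the printed fields are illustrative assumptions based on the code in this hunk.

    require 'regexp_parser'

    # Lex a pattern against the ruby/1.9 syntax flavor. Per the scan method
    # above, the return value is the collected token array unless a block is
    # given, in which case each token is yielded instead.
    tokens = Regexp::Lexer.scan('ab+|c', 'ruby/1.9')

    tokens.each do |t|
      # type, token, text, ts, and te are the fields the lexer populates above.
      puts [t.type, t.token, t.text, t.ts, t.te].inspect
    end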
data/lib/regexp_parser/parser.rb
@@ -0,0 +1,417 @@
+ require File.expand_path('../expression', __FILE__)
+
+ module Regexp::Parser
+   include Regexp::Expression
+   include Regexp::Syntax
+
+   class ParserError < StandardError
+     def initialize(what)
+       super what
+     end
+   end
+
+   class UnknownTokenTypeError < ParserError
+     def initialize(type, token)
+       super "Unknown #{type} type #{token.inspect}"
+     end
+   end
+
+   class UnknownTokenError < ParserError
+     def initialize(type, token)
+       super "Unknown #{type} token #{token.token}"
+     end
+   end
+
+   def self.parse(input, syntax = :any, &block)
+     @nesting = [@root = @node = Root.new]
+
+     Regexp::Lexer.scan(input, syntax) do |token|
+       self.parse_token token
+     end
+
+     if block_given?
+       block.call @root
+     else
+       @root
+     end
+   end
+
+   def self.nest(exp)
+     @nesting.push exp
+
+     @node << exp
+     @node = exp
+   end
+
+   def self.parse_token(token)
+     case token.type
+     when :meta; self.meta(token)
+     when :quantifier; self.quantifier(token)
+     when :anchor; self.anchor(token)
+     when :escape; self.escape(token)
+     when :group; self.group(token)
+     when :assertion; self.group(token)
+     when :set, :subset; self.set(token)
+     when :type; self.type(token)
+     when :backref; self.backref(token)
+
+     when :property, :nonproperty
+       self.property(token)
+
+     when :literal
+       @node << Literal.new(token)
+
+     else
+       raise UnknownTokenTypeError.new(token.type, token)
+     end
+   end
+
+   def self.set(token)
+     case token.token
+     when :open
+       self.open_set(token)
+     when :close
+       self.close_set
+     when :negate
+       self.negate_set
+     when :member, :range, :escape, :collation, :equivalent
+       self.append_set(token)
+     when *Token::Escape::All
+       self.append_set(token)
+     when *Token::CharacterSet::All
+       self.append_set(token)
+     when *Token::UnicodeProperty::All
+       self.append_set(token)
+     else
+       raise UnknownTokenError.new('CharacterSet', token)
+     end
+   end
+
+   def self.meta(token)
+     case token.token
+     when :dot
+       @node << CharacterType::Any.new(token)
+     when :alternation
+       unless @node.token == :alternation
+         alt = Alternation.new(token)
+         seq = Sequence.new
+         while @node.expressions.last
+           seq.insert @node.expressions.pop
+         end
+         alt.alternative(seq)
+
+         @node << alt
+         @node = alt
+         @node.alternative
+       else
+         @node.alternative
+       end
+     else
+       raise UnknownTokenError.new('Meta', token)
+     end
+   end
+
+   def self.backref(token)
+     case token.token
+     when :name_ref
+       @node << Backreference::Name.new(token)
+     when :name_nest_ref
+       @node << Backreference::NameNestLevel.new(token)
+     when :name_call
+       @node << Backreference::NameCall.new(token)
+     when :number, :number_ref
+       @node << Backreference::Number.new(token)
+     when :number_rel_ref
+       @node << Backreference::NumberRelative.new(token)
+     when :number_nest_ref
+       @node << Backreference::NumberNestLevel.new(token)
+     when :number_call
+       @node << Backreference::NumberCall.new(token)
+     when :number_rel_call
+       @node << Backreference::NumberCallRelative.new(token)
+     else
+       raise UnknownTokenError.new('Backreference', token)
+     end
+   end
+
+   def self.type(token)
+     case token.token
+     when :digit
+       @node << CharacterType::Digit.new(token)
+     when :nondigit
+       @node << CharacterType::NonDigit.new(token)
+     when :hex
+       @node << CharacterType::Hex.new(token)
+     when :nonhex
+       @node << CharacterType::NonHex.new(token)
+     when :space
+       @node << CharacterType::Space.new(token)
+     when :nonspace
+       @node << CharacterType::NonSpace.new(token)
+     when :word
+       @node << CharacterType::Word.new(token)
+     when :nonword
+       @node << CharacterType::NonWord.new(token)
+     else
+       raise UnknownTokenError.new('CharacterType', token)
+     end
+   end
+
+   def self.property(token)
+     include Regexp::Expression::UnicodeProperty
+
+     case token.token
+     when :alnum; @node << Alnum.new(token)
+     when :alpha; @node << Alpha.new(token)
+     when :any; @node << Any.new(token)
+     when :ascii; @node << Ascii.new(token)
+     when :blank; @node << Blank.new(token)
+     when :cntrl; @node << Cntrl.new(token)
+     when :digit; @node << Digit.new(token)
+     when :graph; @node << Graph.new(token)
+     when :lower; @node << Lower.new(token)
+     when :print; @node << Print.new(token)
+     when :punct; @node << Punct.new(token)
+     when :space; @node << Space.new(token)
+     when :upper; @node << Upper.new(token)
+     when :word; @node << Word.new(token)
+     when :xdigit; @node << Xdigit.new(token)
+     when :newline; @node << Newline.new(token)
+
+     when :letter_any; @node << Letter::Any.new(token)
+     when :letter_uppercase; @node << Letter::Uppercase.new(token)
+     when :letter_lowercase; @node << Letter::Lowercase.new(token)
+     when :letter_titlecase; @node << Letter::Titlecase.new(token)
+     when :letter_modifier; @node << Letter::Modifier.new(token)
+     when :letter_other; @node << Letter::Other.new(token)
+
+     when :mark_any; @node << Mark::Any.new(token)
+     when :mark_nonspacing; @node << Mark::Nonspacing.new(token)
+     when :mark_spacing; @node << Mark::Spacing.new(token)
+     when :mark_enclosing; @node << Mark::Enclosing.new(token)
+
+     when :number_any; @node << Number::Any.new(token)
+     when :number_decimal; @node << Number::Decimal.new(token)
+     when :number_letter; @node << Number::Letter.new(token)
+     when :number_other; @node << Number::Other.new(token)
+
+     when :punct_any; @node << Punctuation::Any.new(token)
+     when :punct_connector; @node << Punctuation::Connector.new(token)
+     when :punct_dash; @node << Punctuation::Dash.new(token)
+     when :punct_open; @node << Punctuation::Open.new(token)
+     when :punct_close; @node << Punctuation::Close.new(token)
+     when :punct_initial; @node << Punctuation::Initial.new(token)
+     when :punct_final; @node << Punctuation::Final.new(token)
+     when :punct_other; @node << Punctuation::Other.new(token)
+
+     when :separator_any; @node << Separator::Any.new(token)
+     when :separator_space; @node << Separator::Space.new(token)
+     when :separator_line; @node << Separator::Line.new(token)
+     when :separator_para; @node << Separator::Paragraph.new(token)
+
+     when :symbol_any; @node << Symbol::Any.new(token)
+     when :symbol_math; @node << Symbol::Math.new(token)
+     when :symbol_currency; @node << Symbol::Currency.new(token)
+     when :symbol_modifier; @node << Symbol::Modifier.new(token)
+     when :symbol_other; @node << Symbol::Other.new(token)
+
+     when :other; @node << Codepoint::Any.new(token)
+     when :control; @node << Codepoint::Control.new(token)
+     when :format; @node << Codepoint::Format.new(token)
+     when :surrogate; @node << Codepoint::Surrogate.new(token)
+     when :private_use; @node << Codepoint::PrivateUse.new(token)
+     when :unassigned; @node << Codepoint::Unassigned.new(token)
+
+     when *Token::UnicodeProperty::Age
+       @node << Age.new(token)
+
+     when *Token::UnicodeProperty::Derived
+       @node << Derived.new(token)
+
+     when *Regexp::Syntax::Token::UnicodeProperty::Script
+       @node << Script.new(token)
+
+     else
+       raise UnknownTokenError.new('UnicodeProperty', token)
+     end
+   end
+
+   def self.anchor(token)
+     case token.token
+     when :beginning_of_line
+       @node << Anchor::BeginningOfLine.new(token)
+     when :end_of_line
+       @node << Anchor::EndOfLine.new(token)
+     when :bos
+       @node << Anchor::BOS.new(token)
+     when :eos
+       @node << Anchor::EOS.new(token)
+     when :eos_ob_eol
+       @node << Anchor::EOSobEOL.new(token)
+     when :word_boundary
+       @node << Anchor::WordBoundary.new(token)
+     when :nonword_boundary
+       @node << Anchor::NonWordBoundary.new(token)
+     when :match_start
+       @node << Anchor::MatchStart.new(token)
+     else
+       raise UnknownTokenError.new('Anchor', token)
+     end
+   end
+
+   def self.escape(token)
+     case token.token
+
+     when :backspace
+       @node << EscapeSequence::Backspace.new(token)
+
+     when :escape
+       @node << EscapeSequence::AsciiEscape.new(token)
+     when :bell
+       @node << EscapeSequence::Bell.new(token)
+     when :form_feed
+       @node << EscapeSequence::FormFeed.new(token)
+     when :newline
+       @node << EscapeSequence::Newline.new(token)
+     when :carriage
+       @node << EscapeSequence::Return.new(token)
+     when :space
+       @node << EscapeSequence::Space.new(token)
+     when :tab
+       @node << EscapeSequence::Tab.new(token)
+     when :vertical_tab
+       @node << EscapeSequence::VerticalTab.new(token)
+
+     when :control
+       @node << EscapeSequence::Control.new(token)
+
+     else
+       # treating everything else as a literal
+       @node << EscapeSequence::Literal.new(token)
+     end
+   end
+
+   def self.quantifier(token)
+     case token.token
+     when :zero_or_one
+       @node.expressions.last.quantify(:zero_or_one, token.text, 0, 1, :greedy)
+     when :zero_or_one_reluctant
+       @node.expressions.last.quantify(:zero_or_one, token.text, 0, 1, :reluctant)
+     when :zero_or_one_possessive
+       @node.expressions.last.quantify(:zero_or_one, token.text, 0, 1, :possessive)
+
+     when :zero_or_more
+       @node.expressions.last.quantify(:zero_or_more, token.text, 0, -1, :greedy)
+     when :zero_or_more_reluctant
+       @node.expressions.last.quantify(:zero_or_more, token.text, 0, -1, :reluctant)
+     when :zero_or_more_possessive
+       @node.expressions.last.quantify(:zero_or_more, token.text, 0, -1, :possessive)
+
+     when :one_or_more
+       @node.expressions.last.quantify(:one_or_more, token.text, 1, -1, :greedy)
+     when :one_or_more_reluctant
+       @node.expressions.last.quantify(:one_or_more, token.text, 1, -1, :reluctant)
+     when :one_or_more_possessive
+       @node.expressions.last.quantify(:one_or_more, token.text, 1, -1, :possessive)
+
+     when :interval
+       self.interval(token.text)
+
+     else
+       raise UnknownTokenError.new('Quantifier', token)
+     end
+   end
+
+   def self.interval(text)
+     mchr = text[text.length-1].chr =~ /[?+]/ ? text[text.length-1].chr : nil
+     mode = case mchr
+            when '?'; text.chop!; :reluctant
+            when '+'; text.chop!; :possessive
+            else :greedy
+            end
+
+     range = text.gsub(/\{|\}/, '').split(',', 2).each {|i| i.strip}
+     min = range[0].empty? ? 0 : range[0]
+     max = range[1] ? (range[1].empty? ? -1 : range[1]) : min
+
+     @node.expressions.last.quantify(:interval, text, min.to_i, max.to_i, mode)
+   end
+
+   def self.group(token)
+     case token.token
+     when :options
+       self.options(token)
+     when :close
+       self.close_group
+     when :comment
+       @node << Group::Comment.new(token)
+     else
+       self.open_group(token)
+     end
+   end
+
+   def self.options(token)
+     opt = token.text.split('-', 2)
+
+     exp = Group::Options.new(token)
+     exp.options = {
+       :m => opt[0].include?('m') ? true : false,
+       :i => opt[0].include?('i') ? true : false,
+       :x => opt[0].include?('x') ? true : false
+     }
+
+     self.nest exp
+   end
+
+   def self.open_group(token)
+     case token.token
+     when :passive
+       exp = Group::Passive.new(token)
+     when :atomic
+       exp = Group::Atomic.new(token)
+     when :named
+       exp = Group::Named.new(token)
+     when :capture
+       exp = Group::Capture.new(token)
+
+     when :lookahead
+       exp = Assertion::Lookahead.new(token)
+     when :nlookahead
+       exp = Assertion::NegativeLookahead.new(token)
+     when :lookbehind
+       exp = Assertion::Lookbehind.new(token)
+     when :nlookbehind
+       exp = Assertion::NegativeLookbehind.new(token)
+
+     else
+       raise UnknownTokenError.new('Group type open', token)
+     end
+
+     self.nest exp
+   end
+
+   def self.close_group
+     last_group = @nesting.pop
+     @node = @nesting.last
+   end
+
+   def self.open_set(token)
+     if token.type == :subset
+       @set << CharacterSubSet.new(token)
+     else
+       @node << (@set = CharacterSet.new(token))
+     end
+   end
+
+   def self.negate_set
+     @set.negate
+   end
+
+   def self.append_set(token)
+     @set << token.text
+   end
+
+   def self.close_set
+   end
+
+ end # module Regexp::Parser
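Likewise, a minimal usage sketch of the parser shown above, again not part of the package diff; the pattern and the traversal are illustrative assumptions based on the code in this hunk.

    require 'regexp_parser'

    # Parse a pattern into an expression tree rooted at a Root expression.
    root = Regexp::Parser.parse('(ab?|cd)+', 'ruby/1.9')

    # Each node built by the parser above is appended to its parent's
    # expressions collection, so the top level of the tree can be walked
    # directly.
    root.expressions.each do |exp|
      puts exp.class
    end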