arboretum 0.0.3 → 0.0.4

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,882 @@
1
+ module Arboretum
2
+ module Scandent
3
+
4
# Error type for failures while splitting a Scandent string into tokens.
class TokenizationException < StandardError
  # msg - optional custom message; a generic one is used when omitted.
  def initialize(msg = "An error occurred while tokenizing input")
    # Bare super forwards the current value of msg to StandardError.
    super
  end
end
9
+
10
# Error type for failures while turning tokens into rules, paths and steps.
class ParseException < StandardError
  # msg - optional custom message; a generic one is used when omitted.
  def initialize(msg = "An error occurred while parsing input")
    # Bare super forwards the current value of msg to StandardError.
    super
  end
end
15
+
16
# Error type for Scandent expressions that are not well formed.
class InvalidExpressionException < StandardError
  # msg - optional custom message; a generic one is used when omitted.
  def initialize(msg = "Invalid Scandent expression")
    # Bare super forwards the current value of msg to StandardError.
    super
  end
end
21
+
22
# A rule is a list of alternative paths; it holds for an element as soon as
# any one of its paths holds for that element.
class ScandentRule
  attr_accessor :paths

  # rule_paths - the path objects of this rule. Each one is sent
  #              valid_on?(element) and to_s by the methods below.
  def initialize(rule_paths)
    @paths = rule_paths
  end

  # Returns true when at least one path is valid on the element, else false.
  def valid_on?(element)
    @paths.any? { |path| path.valid_on?(element) }
  end
  alias_method :selects?, :valid_on?
  alias_method :matches?, :valid_on?

  # Returns the string forms of all paths joined with ', '.
  # The separator is only written once something has already been emitted,
  # so leading paths with an empty string form leave no stray separator.
  def to_s
    @paths.each_with_object(''.dup) do |path, joined|
      joined << ', ' unless joined.empty?
      joined << path.to_s
    end
  end
end
45
+
46
# Rule type built by Parser for :PATH_LISTENER expressions.
# Adds nothing of its own; all behavior comes from ScandentRule.
class ListenerRule < ScandentRule; end
48
+
49
# Rule type built by Parser for :PATH_LOCATOR expressions.
class LocatorRule < ScandentRule
  # Collects the elements located by every path of the rule, starting from
  # the given element. An element found by several paths is reported once.
  #
  # Yields each newly found element when a block is given.
  # Returns the Array of distinct located elements, in discovery order.
  def locate(element)
    found = []
    @paths.each do |path|
      path.locate(element) do |hit|
        next if found.include?(hit) # already reported by an earlier path
        yield hit if block_given?
        found << hit
      end
    end
    found
  end
end
63
+
64
# A listener path is an ordered list of steps that is checked backwards:
# starting from a candidate element, the last step is matched first and the
# remaining steps are matched against whatever that step yields.
class ListenerPath
  attr_accessor :steps

  # steps - Array of step objects (may be empty). The array is modified in
  #         place when an implied leading step has to be added.
  #
  # Raises InvalidExpressionException when a step other than the first one
  # refers to :ELEMENT_SELF.
  def initialize(steps=[])
    # Make sure that the listener stems from the root element
    first_step = steps.first
    if !first_step.nil? && first_step.element_ref != :ELEMENT_ROOT
      if first_step.element_ref.nil?
        # No explicit anchor: prepend the step parsed from '~//'
        implied_step = [
          [:T_TILDE, '~', :STATE_ROOT_PATH],
          [:T_SLASH2, '//', :STATE_ROOT_PATH]
        ]
        steps.unshift(Parser.parse_step_tokens(implied_step, :PATH_LISTENER))
      elsif first_step.element_ref == :ELEMENT_SELF
        # A leading self reference is treated as the root
        first_step.element_ref = :ELEMENT_ROOT
      end
    end
    # Make sure there are no misplaced :ELEMENT_SELF references
    steps.each do |step|
      raise InvalidExpressionException.new if step.element_ref == :ELEMENT_SELF && !step.eql?(steps.first)
    end

    @steps = steps
  end

  # Returns true when the whole path can be satisfied ending at candidate.
  def valid_on?(candidate)
    steps_valid_on?(@steps, candidate)
  end

  # Recursive helper: matches the last of the given steps against candidate
  # and tries the remaining steps on every element that step yields.
  # An empty step list is trivially valid.
  def steps_valid_on?(steps, candidate)
    return true if steps.empty?
    steps.last.match(candidate) do |element|
      return true if steps_valid_on?(steps[0..-2], element)
    end
    false
  end

  # Returns the concatenated string forms of all steps.
  def to_s
    @steps.each_with_object(''.dup) { |step, path_string| path_string << step.to_s }
  end
end
105
+
106
# A locator path is an ordered list of steps that is followed forwards:
# starting from a context element, each step produces the elements the next
# step starts from.
class LocatorPath
  attr_accessor :steps

  # steps - Array of step objects (may be empty). The array is modified in
  #         place when an implied leading step has to be added.
  #
  # Raises InvalidExpressionException when a step other than the first one
  # refers to :ELEMENT_SELF.
  def initialize(steps=[])
    # Make sure that the locator stems from the root or current context element.
    # An empty step list has nothing to anchor, so it is left untouched.
    first_step = steps.first
    if !first_step.nil? && first_step.element_ref.nil?
      first_step.action = :ACTION_CHILD if first_step.action == :ACTION_SELF
      # No explicit anchor: prepend the step parsed from '.'
      implied_step = [
        [:T_DOT, '.', :STATE_ROOT_PATH],
      ]
      steps.unshift(Parser.parse_step_tokens(implied_step, :PATH_LOCATOR))
    end
    # Make sure there are no misplaced :ELEMENT_SELF references
    steps.each do |step|
      raise InvalidExpressionException.new if step.element_ref == :ELEMENT_SELF && !step.eql?(steps.first)
    end

    @steps = steps
  end

  # Follows every step in order, starting from element.
  #
  # Yields each finally located element when a block is given. Nothing is
  # yielded when some step produces no elements at all.
  # Returns the Array of located elements (empty when a step found nothing).
  def locate(element)
    matches = [element]
    @steps.each do |step|
      matches = matches.flat_map { |matched_element| step.match(matched_element) }
      return matches if matches.empty?
    end
    matches.each { |match| yield match } if block_given?
    matches
  end

  # Returns true when the steps can be followed to completion from candidate.
  def valid_on?(candidate)
    steps_valid_on?(@steps, candidate)
  end

  # Recursive helper: matches the first of the given steps against candidate
  # and tries the remaining steps on every element that step yields.
  # An empty step list is trivially valid.
  def steps_valid_on?(steps, candidate)
    return true if steps.empty?
    steps.first.match(candidate) do |element|
      return true if steps_valid_on?(steps[1..-1], element)
    end
    false
  end

  # Returns the concatenated string forms of all steps.
  def to_s
    @steps.each_with_object(''.dup) { |step, path_string| path_string << step.to_s }
  end
end
157
+
158
# One step of a Scandent path: an action (how to move away from the current
# element) together with a description an element has to satisfy (tag,
# namespace, id, attribute tests, pseudo expressions and nested rules).
# Subclasses decide how the action is applied; see their #match.
class PathStep
  # Action => action that undoes it
  @@inverse_actions = {
    :ACTION_CHILD => :ACTION_PARENT,
    :ACTION_DESCENDANT => :ACTION_ANCESTOR,
    :ACTION_PARENT => :ACTION_CHILD,
    :ACTION_ANCESTOR => :ACTION_DESCENDANT,
    :ACTION_FOLLOWING_SIBLING => :ACTION_PRECEDING_SIBLING,
    :ACTION_FOLLOWING => :ACTION_PRECEDING,
    :ACTION_PRECEDING_SIBLING => :ACTION_FOLLOWING_SIBLING,
    :ACTION_PRECEDING => :ACTION_FOLLOWING,
    :ACTION_SELF => :ACTION_SELF
  }
  # Action => name of the method sent to an element to obtain the group of
  # elements that action leads to
  @@action_group = {
    :ACTION_CHILD => :content,
    :ACTION_DESCENDANT => :descendants,
    :ACTION_PARENT => :parent,
    :ACTION_ANCESTOR => :ancestors,
    :ACTION_FOLLOWING_SIBLING => :sibling_next,
    :ACTION_FOLLOWING => :following_siblings,
    :ACTION_PRECEDING_SIBLING => :sibling_prev,
    :ACTION_PRECEDING => :preceding_siblings,
    :ACTION_SELF => :itself
  }
  # Action => its textual form in a Scandent string
  @@action_str = {
    :ACTION_CHILD => '/'.freeze,
    :ACTION_DESCENDANT => '//'.freeze,
    :ACTION_PARENT => '/..'.freeze,
    :ACTION_ANCESTOR => '/...'.freeze,
    :ACTION_FOLLOWING_SIBLING => '/>'.freeze,
    :ACTION_FOLLOWING => '/>>'.freeze,
    :ACTION_PRECEDING_SIBLING => '/<'.freeze,
    :ACTION_PRECEDING => '/<<'.freeze,
    :ACTION_SELF => ''.freeze
  }

  attr_accessor :action, :element_ref, :tag, :namespace, :id, :attrs, :pseudo_exps, :valid_rules

  # action      - one of the :ACTION_* symbols
  # element_ref - :ELEMENT_ROOT, :ELEMENT_SELF or nil
  # tag         - tag name or nil; stored as a Symbol
  # namespace   - namespace name or nil; stored as a Symbol
  # id          - required id value or nil
  # attrs       - Hash with the keys :has (Array of names) and :contains,
  #               :equals, :matches (each a Hash of name => Array of values)
  # pseudo_exps - Array of [pseudo_name, pseudo_arg] pairs
  # valid_rules - Array of rules that must be valid on a described element
  def initialize(action, element_ref, tag, namespace, id, attrs, pseudo_exps, valid_rules)
    @action = action
    @element_ref = element_ref
    @tag = tag.nil? ? nil : tag.to_sym
    @namespace = namespace.nil? ? nil : namespace.to_sym
    @id = id
    @attrs = attrs
    @pseudo_exps = pseudo_exps
    @valid_rules = valid_rules
  end

  # Check if all fields match that of the given element, with no attention paid to the action
  # FIXME: Search time improves by 25% when a tag is given. Reject non-tagged Elements more quickly
  # to get a similar performance boost across the board
  #
  # Returns true when the element satisfies every restriction of this step.
  def describes?(element)
    if element.kind_of?(Arboretum::DocTree::Elements::TaggedElement)
      return false if !@tag.nil? && element.tag != @tag
      return false if !@namespace.nil? && element.namespace != @namespace
      return false if !@id.nil? && !element.equals_attr_val?(:id, [@id])
      @attrs[:has].each { |attr_name| return false unless element.has_attr?(attr_name) }
      @attrs[:contains].each { |name, values| values.each { |value| return false unless element.contains_attr_val?(name, value) } }
      @attrs[:equals].each { |name, values| values.each { |value| return false unless element.equals_attr_val?(name, value) } }
      @attrs[:matches].each { |name, values| values.each { |value| return false unless element.matches_attr_val?(name, value) } }
      return false if @element_ref == :ELEMENT_ROOT && !element.parent.nil?
    else
      # Anything that is not a tagged element cannot satisfy a tag,
      # namespace, id or attribute restriction
      restricted = !@tag.nil? || !@namespace.nil? || !@id.nil? ||
                   [:has, :contains, :equals, :matches].any? { |kind| !@attrs[kind].empty? }
      return false if restricted
      return false if @element_ref == :ELEMENT_ROOT && !element.parent.nil?
    end
    @pseudo_exps.each do |pseudo_name, pseudo_arg|
      return false unless PseudoElements.match(element, pseudo_name, pseudo_arg)
    end
    @valid_rules.each do |rule|
      return false unless rule.valid_on?(element)
    end
    true
  end

  # Returns the textual form of the description, without the action part.
  def to_s_sans_action
    step_str = ''.dup
    if @element_ref == :ELEMENT_ROOT
      step_str << '~'
    elsif @element_ref == :ELEMENT_SELF
      step_str << '.'
    end
    step_str << "%#{@tag.to_s}" unless tag.nil?
    step_str << "@#{@namespace.to_s}" unless namespace.nil?
    step_str << "##{@id}" unless @id.nil?
    @attrs[:has].each { |attr_name| step_str << "[#{attr_name}]" }
    @attrs[:contains].each { |name, values| values.each { |value| step_str << "[#{name}=\"#{value}\"]" } }
    # :equals values are Arrays of words, so they are joined back with spaces
    @attrs[:equals].each { |name, values| values.each { |value| step_str << "[#{name}==\"#{value.join(' ')}\"]" } }
    @attrs[:matches].each { |name, values| values.each { |value| step_str << "[#{name}~=|/#{value}/|]" } }
    @valid_rules.each { |rule| step_str << "{#{rule.to_s}}" }
    @pseudo_exps.each { |pseudo, arg| step_str << ":#{pseudo}(#{arg.to_s})" }
    step_str
  end
end
259
+
260
# Step of a listener path. Because listener paths are checked backwards from
# a candidate element, the step searches in the direction of the inverse of
# its action.
class ListenerStep < PathStep
  # Take the inverse of the step action from the given element and check the
  # elements found there against this step's description.
  #
  # Yields each described element when a block is given.
  # Returns the Array of described elements.
  def match(element)
    result = []
    inverse_action = @@inverse_actions[@action]
    search_group = element.public_send(@@action_group[inverse_action])
    search_group.listing.each do |searched_element|
      # describes? is the expensive part of a search, so evaluate it once
      next unless describes?(searched_element)
      yield searched_element if block_given?
      result << searched_element
    end
    result
  end

  # Returns the description followed by the action, e.g. '%div/'.
  def to_s
    to_s_sans_action << @@action_str[@action]
  end
end
278
+
279
# Step of a locator path. Locator paths are followed forwards, so the step
# searches in the direction of its own action.
class LocatorStep < PathStep
  # Take the step action from the given element and check the elements found
  # there against this step's description.
  #
  # Yields each described element when a block is given.
  # Returns the Array of described elements.
  def match(element)
    result = []
    search_group = element.public_send(@@action_group[@action])
    search_group.listing.each do |searched_element|
      # describes? is the expensive part of a search, so evaluate it once
      next unless describes?(searched_element)
      yield searched_element if block_given?
      result << searched_element
    end
    result
  end

  # Returns the action followed by the description, e.g. '/%div'.
  def to_s
    # Start from a fresh string so the frozen entry of @@action_str is never appended to
    ''.dup << @@action_str[@action] << to_s_sans_action
  end
end
297
+
298
# A linear formula of the form `a*n + b`, built from the argument tokens of
# a pseudo expression (for example `even`, `odd`, `2n+1` or `-n+3`).
class Formula
  # form_tokens - Array of tokens; only the token type (index 0) and the
  #               matched text (index 1) of each token are used.
  #
  # Raises InvalidExpressionException when a token other than a sign, an
  # integer literal or `n` is found.
  def initialize(form_tokens)
    @coefficient = 0
    @intercept = 0
    first_type = form_tokens.empty? ? nil : form_tokens[0][0]
    case first_type
    when :T_KEY_EVEN
      @coefficient = 2
    when :T_KEY_ODD
      @coefficient = 2
      @intercept = 1
    else
      accumulate_terms(form_tokens)
    end
  end

  # Returns the formula as '<a>n+<b>' (or '<a>n-<b>' for a negative b).
  def to_s
    "#{@coefficient}n#{'+' if @intercept >= 0}#{@intercept}"
  end

  private

  # Walks the tokens term by term. A term is closed by the next sign or by
  # the end of the tokens, and is then added to the coefficient (when it
  # contained `n`) or to the intercept.
  def accumulate_terms(form_tokens)
    negative = false
    has_n = false
    magnitude = nil # nil until the current term has shown an integer literal

    form_tokens.each do |type, text|
      case type
      when :T_FORM_PLUS, :T_FORM_MINUS
        add_term(magnitude, has_n, negative)
        # Reset with new sign
        negative = (type == :T_FORM_MINUS)
        has_n = false
        magnitude = nil
      when :LITERAL_INT
        magnitude = text.to_i
      when :T_FORM_N
        has_n = true
      else
        raise InvalidExpressionException.new
      end
    end
    # Resolve one last time
    add_term(magnitude, has_n, negative)
  end

  # Adds one finished term. A bare `n` counts as 1n; an explicit 0 stays 0.
  # The sign is applied here so that it also covers a bare `n`.
  def add_term(magnitude, has_n, negative)
    value = magnitude || (has_n ? 1 : 0)
    value = -value if negative
    if has_n
      @coefficient += value
    else
      @intercept += value
    end
  end
end
363
+
364
# Dispatcher for pseudo expressions. Each supported pseudo name is expected
# to exist as a class-level method taking (element, pseudo_arg).
class PseudoElements
  # Evaluates the pseudo expression named pseudo_name on the element.
  #
  # element     - the element being tested
  # pseudo_name - name of the pseudo expression (String or Symbol)
  # pseudo_arg  - the parsed argument of the expression, or nil
  #
  # Returns whatever the handler method returns.
  # Raises InvalidExpressionException when no handler with that name exists.
  def self.match(element, pseudo_name, pseudo_arg)
    handler = pseudo_name.to_sym
    # The name comes from the parsed expression, so only dispatch to handlers
    # defined for this class, never to inherited methods such as
    # instance_eval, and never back into match itself.
    unless handler != :match && singleton_methods.include?(handler)
      raise InvalidExpressionException.new("Unknown pseudo-class ':#{pseudo_name}'")
    end
    public_send(handler, element, pseudo_arg)
  end
end
369
+
370
# Parser interprets the tokens formed from a Scandent string and forms the
# ScandentRules that represent the interpreted form of the string.
# ScandentRules can then be matched to Elements in a DocTree.
class Parser
  # Step-delimiter token type => action of the step it belongs to
  @@actions = {
    :T_SLASH => :ACTION_CHILD,
    :T_SLASH2 => :ACTION_DESCENDANT,
    :T_SLASHDOT2 => :ACTION_PARENT,
    :T_SLASHDOT3 => :ACTION_ANCESTOR,
    :T_SLASHGT => :ACTION_FOLLOWING_SIBLING,
    :T_SLASHGT2 => :ACTION_FOLLOWING,
    :T_SLASHLT => :ACTION_PRECEDING_SIBLING,
    :T_SLASHLT2 => :ACTION_PRECEDING
  }

  # Parse a Scandent string by giving it to the Tokenizer and then parsing the results
  #
  # type - :PATH_LISTENER or :PATH_LOCATOR
  def self.parse_rule_string(rule_string, type)
    Parser.parse_rule_tokens(Tokenizer.tokenize(rule_string), type)
  end

  # Directly parse Scandent string tokens
  #
  # Returns a ListenerRule or a LocatorRule, depending on type.
  # Raises ParseException for any other type.
  def self.parse_rule_tokens(rule_tokens, type)
    # Separate the rule into its comma-delimited paths and remove the delimiter
    rule_paths_tokens = rule_tokens.slice_after { |token| token[0] == :T_COMMA && token[2] == :STATE_ROOT_PATH }.to_a
    rule_paths_tokens.each { |path_tokens| path_tokens.pop if path_tokens.last[0] == :T_COMMA }

    # Parse each path individually
    rule_paths = rule_paths_tokens.map { |path| Parser.parse_path_tokens(path, type) }

    case type
    when :PATH_LISTENER then ListenerRule.new(rule_paths)
    when :PATH_LOCATOR then LocatorRule.new(rule_paths)
    else raise ParseException.new("Unknown step type")
    end
  end

  # Parse an individual path of a rule
  #
  # Returns a ListenerPath or a LocatorPath, depending on type.
  # Raises InvalidExpressionException for an empty path or a path that does
  # not end in :STATE_ROOT_PATH, and ParseException for an unknown type.
  def self.parse_path_tokens(path_tokens, type)
    # A path without tokens (e.g. produced by a leading comma) has no end state to check
    raise InvalidExpressionException.new("Path is empty") if path_tokens.empty?
    # Double check that the end state of the path is valid
    raise InvalidExpressionException.new("End state of path is '#{path_tokens.last[2]}' instead of :STATE_ROOT_PATH") if path_tokens.last[2] != :STATE_ROOT_PATH

    step_delimiters = @@actions.keys
    at_delimiter = lambda { |token| step_delimiters.include?(token[0]) && token[2] == :STATE_ROOT_PATH }

    # Separate the path into its steps: a listener step ends with its
    # delimiter, a locator step starts with it
    path_steps_tokens =
      case type
      when :PATH_LISTENER then path_tokens.slice_after(&at_delimiter).to_a
      when :PATH_LOCATOR then path_tokens.slice_before(&at_delimiter).to_a
      else raise ParseException.new("Unknown step type")
      end

    # Parse each step individually
    path_steps = path_steps_tokens.map { |step| Parser.parse_step_tokens(step, type) }

    if type == :PATH_LISTENER
      ListenerPath.new(path_steps)
    else
      LocatorPath.new(path_steps)
    end
  end

  # Parse an individual step of a path
  #
  # step_tokens - Array of tokens of one step; whitespace tokens are removed
  #               from this array in place.
  #
  # Returns a ListenerStep or a LocatorStep, depending on type.
  # Raises InvalidExpressionException when the tokens do not form a valid
  # step (including when they end too early), and ParseException for an
  # unknown type or a token that cannot start any part of a step.
  def self.parse_step_tokens(step_tokens, type)
    # Remove whitespace tokens
    step_tokens.delete_if { |token| token[0] == :T_WHITESPACE }

    # The action is given by the trailing (listener) or leading (locator) delimiter
    action_token =
      case type
      when :PATH_LISTENER then step_tokens.last
      when :PATH_LOCATOR then step_tokens.first
      else raise ParseException.new("Unknown step type")
      end
    raise InvalidExpressionException.new("Step is empty") if action_token.nil?
    action = @@actions.fetch(action_token[0], :ACTION_SELF)

    element_ref = []
    tag = []
    namespace = []
    id = []
    attrs = {
      :has => [],
      :contains => Hash.new { |hash, key| hash[key] = [] },
      :equals => Hash.new { |hash, key| hash[key] = [] },
      :matches => Hash.new { |hash, key| hash[key] = [] }
    }
    pseudo_exps = []
    valid_rules = []

    index = 0
    while index < step_tokens.length
      # Consume current token and increment
      index_token = step_tokens[index]
      index += 1

      case index_token[0]
      when :T_PCT
        tag << expect_token(step_tokens, index, :LITERAL_IDENT)[1]
        index += 1
      when :T_AT
        namespace << expect_token(step_tokens, index, :LITERAL_IDENT)[1]
        index += 1
      when :T_PND
        id << expect_token(step_tokens, index, :LITERAL_IDENT)[1]
        index += 1
      when :T_COLON
        pseudo_exp, index = parse_pseudo(step_tokens, index)
        pseudo_exps << pseudo_exp
      when :T_ASTERISK # Adds no restrictions, so do nothing
      when :T_TILDE
        element_ref << :ELEMENT_ROOT
      when :T_DOT
        element_ref << :ELEMENT_SELF
      when :T_LBRAK
        operation, attr_name, attr_value, index = parse_attr(step_tokens, index)
        if operation.nil?
          attrs[:has] << attr_name
        else
          attrs[operation][attr_name] << attr_value
        end
      when :T_LBRACE
        nested_rule, index = parse_nested_rule(step_tokens, index)
        valid_rules << nested_rule
      when *@@actions.keys # Delimiters were already turned into the action above
      else
        raise ParseException.new("Consumed unexpected token: #{index_token}")
      end
    end # All tokens consumed

    # Validate results
    raise InvalidExpressionException.new if tag.length > 1
    raise InvalidExpressionException.new if namespace.length > 1
    raise InvalidExpressionException.new if element_ref.length > 1

    step_class = (type == :PATH_LISTENER) ? ListenerStep : LocatorStep
    step_class.new(action, element_ref[0], tag[0], namespace[0], id[0], attrs, pseudo_exps, valid_rules)
  end

  # Parse an argument given to a pseudo-class
  #
  # Returns a String, Regexp, Formula, Integer or Float depending on the
  # shape of the tokens, or nil when there are no tokens.
  # Raises InvalidExpressionException for any other token sequence.
  def self.parse_arg(arg_tokens)
    arg_token_types = arg_tokens.map { |token| token[0] }
    single = arg_tokens.length == 1
    triple = arg_tokens.length == 3
    quoted = (arg_token_types[0] == :T_SQUOTE && arg_token_types[2] == :T_SQUOTE) ||
             (arg_token_types[0] == :T_DQUOTE && arg_token_types[2] == :T_DQUOTE)

    if quoted && triple
      arg_tokens[1][1]
    elsif arg_token_types[0] == :T_VBARSLASH && arg_token_types[2] == :T_SLASHVBAR && triple
      Regexp.new(arg_tokens[1][1])
    elsif (!(arg_token_types & [:T_KEY_EVEN, :T_KEY_ODD]).empty? && single) || arg_token_types.include?(:T_FORM_N)
      Formula.new(arg_tokens)
    elsif arg_token_types[0] == :LITERAL_INT && single
      arg_tokens[0][1].to_i
    elsif arg_token_types[0] == :LITERAL_FLOAT && single
      arg_tokens[0][1].to_f
    elsif arg_tokens.empty?
      nil
    else
      arg_str = arg_tokens.map { |token| token[1] }.join
      raise InvalidExpressionException.new("Invalid argument '#{arg_str}'")
    end
  end

  # Returns the token at index when it exists and has one of the given
  # types; raises InvalidExpressionException (with message, if given) otherwise.
  def self.expect_token(tokens, index, *types, message: nil)
    token = tokens[index]
    return token if !token.nil? && types.include?(token[0])
    raise(message.nil? ? InvalidExpressionException.new : InvalidExpressionException.new(message))
  end
  private_class_method :expect_token

  # Parses what follows a T_COLON: the pseudo name and, when present, its
  # parenthesized argument. index points at the token after the colon.
  # Returns [[pseudo_name, parsed_arg], index_after_expression].
  def self.parse_pseudo(tokens, index)
    pseudo_name = expect_token(tokens, index, :LITERAL_IDENT)[1].to_sym
    index += 1
    arg_tokens = []

    # The argument list is optional, and the name may be the very last token
    opener = tokens[index]
    if !opener.nil? && opener[0] == :T_LPAREN
      index += 1 # To consume the LPAREN
      loop do
        token = tokens[index]
        raise InvalidExpressionException.new if token.nil? # Ran out of tokens before the RPAREN
        index += 1 # Consumes the argument token or, finally, the RPAREN
        break if token[0] == :T_RPAREN
        arg_tokens << token
      end
    end
    [[pseudo_name, Parser.parse_arg(arg_tokens)], index]
  end
  private_class_method :parse_pseudo

  # Parses what follows a T_LBRAK: `name]`, `name="v"]`, `name=="v"]` or
  # `name~=|/v/|]`. index points at the token after the bracket.
  # Returns [operation, attr_name, attr_value, index_after_expression];
  # operation and attr_value are nil for the bare `[name]` form.
  def self.parse_attr(tokens, index)
    literal_types = [:LITERAL_IDENT, :LITERAL_STRING, :LITERAL_INT, :LITERAL_FLOAT]
    quote_types = [:T_DQUOTE, :T_SQUOTE]

    attr_name = expect_token(tokens, index, :LITERAL_IDENT)[1].to_sym
    operator = expect_token(tokens, index + 1, :T_EQL, :T_EQL2, :T_TILDE_EQL, :T_RBRAK)
    index += 2
    return [nil, attr_name, nil, index] if operator[0] == :T_RBRAK

    operation, opener_types, opener_message =
      case operator[0]
      when :T_EQL then [:contains, quote_types, "Expected a string after '='"]
      when :T_EQL2 then [:equals, quote_types, nil]
      else [:matches, [:T_VBARSLASH], nil]
      end

    opener = expect_token(tokens, index, *opener_types, message: opener_message)
    literal = expect_token(tokens, index + 1, *literal_types)
    attr_value =
      case operation
      when :contains then literal[1]
      when :equals then literal[1].split # compared as a list of words
      else Regexp.new(literal[1])
      end
    # A string closes with the quote that opened it, a regex with '/|'
    closer_type = (opener[0] == :T_VBARSLASH) ? :T_SLASHVBAR : opener[0]
    expect_token(tokens, index + 2, closer_type)
    expect_token(tokens, index + 3, :T_RBRAK)

    [operation, attr_name, attr_value, index + 4]
  end
  private_class_method :parse_attr

  # Parses what follows a T_LBRACE: everything up to the matching T_RBRACE
  # is glued back into a string and parsed as a locator rule. index points
  # at the token after the opening brace.
  # Returns [rule, index_after_closing_brace].
  def self.parse_nested_rule(tokens, index)
    depth = 1
    reformed_path_string = ''.dup
    loop do
      token = tokens[index]
      raise InvalidExpressionException.new("Could not find matching R_BRACE in #{reformed_path_string}") if token.nil?
      index += 1
      depth += 1 if token[0] == :T_LBRACE
      depth -= 1 if token[0] == :T_RBRACE
      break if depth.zero? # The matching brace itself is not part of the nested rule
      reformed_path_string << token[1]
    end
    [Parser.parse_rule_string(reformed_path_string, :PATH_LOCATOR), index]
  end
  private_class_method :parse_nested_rule

end # Parser
644
+
645
+ # A class with class methods used to tokenize a Scandent string
646
+ # Has information regarding which character patterns match which tokens
647
+ # Has information regarding which tokens trigger which state in the tokenizer
648
+ class Tokenizer
649
+ # A Hash with keys cooresponding to states, and value Hashes that match patterns in that state to tokens
650
+ @@tokens = {
651
+ :STATE_ROOT_PATH => {
652
+ ',' => :T_COMMA,
653
+ '~' => :T_TILDE,
654
+ '//' => :T_SLASH2,
655
+ '/' => :T_SLASH,
656
+ '.' => :T_DOT,
657
+ '/..' => :T_SLASHDOT2,
658
+ '/...' => :T_SLASHDOT3,
659
+ '/>' => :T_SLASHGT,
660
+ '/>>' => :T_SLASHGT2,
661
+ '/<' => :T_SLASHLT,
662
+ '/<<' => :T_SLASHLT2,
663
+ '*' => :T_ASTERISK,
664
+ '{' => :T_LBRACE,
665
+ '[' => :T_LBRAK,
666
+ '(' => :T_LPAREN,
667
+ ':' => :T_COLON,
668
+ '#' => :T_PND,
669
+ '%' => :T_PCT,
670
+ '@' => :T_AT,
671
+ # Whitespace
672
+ "\n" => :T_WHITESPACE,
673
+ "\t" => :T_WHITESPACE,
674
+ ' ' => :T_WHITESPACE
675
+ },
676
+ :STATE_SSTRING => {
677
+ '\'' => :T_SQUOTE # Closing Token
678
+ },
679
+ :STATE_DSTRING => {
680
+ '"' => :T_DQUOTE, # Closing Token
681
+ },
682
+ :STATE_REGEX => {
683
+ '/|' => :T_SLASHVBAR # Closing Token
684
+ },
685
+ :STATE_ATTR_EXP => {
686
+ ']' => :T_RBRAK, # Closing Token
687
+ '=' => :T_EQL,
688
+ '==' => :T_EQL2,
689
+ '~=' => :T_TILDE_EQL,
690
+ '"' => :T_DQUOTE,
691
+ '\'' => :T_SQUOTE,
692
+ '|/' => :T_VBARSLASH,
693
+ # Whitespace
694
+ "\n" => :T_WHITESPACE,
695
+ "\t" => :T_WHITESPACE,
696
+ ' ' => :T_WHITESPACE
697
+ },
698
+ :STATE_PATH_EXP => {
699
+ '}' => :T_RBRACE, # Closing Token
700
+ # From ROOT_PATH
701
+ ',' => :T_COMMA,
702
+ '~' => :T_TILDE,
703
+ '//' => :T_SLASH2,
704
+ '/' => :T_SLASH,
705
+ '.' => :T_DOT,
706
+ '/..' => :T_SLASHDOT2,
707
+ '/...' => :T_SLASHDOT3,
708
+ '/>' => :T_SLASHGT,
709
+ '/>>' => :T_SLASHGT2,
710
+ '/<' => :T_SLASHLT,
711
+ '/<<' => :T_SLASHLT2,
712
+ '*' => :T_ASTERISK,
713
+ '{' => :T_LBRACE,
714
+ '[' => :T_LBRAK,
715
+ '(' => :T_LPAREN,
716
+ ':' => :T_COLON,
717
+ '#' => :T_PND,
718
+ '%' => :T_PCT,
719
+ '@' => :T_AT,
720
+ # Whitespace
721
+ "\n" => :T_WHITESPACE,
722
+ "\t" => :T_WHITESPACE,
723
+ ' ' => :T_WHITESPACE
724
+ },
725
+ :STATE_ARGS => {
726
+ ')' => :T_RPAREN, # Closing Token
727
+ '|/' => :T_VBARSLASH,
728
+ 'n' => :T_FORM_N,
729
+ '+' => :T_FORM_PLUS,
730
+ '-' => :T_FORM_MINUS,
731
+ 'even' => :T_KEY_EVEN,
732
+ 'odd' => :T_KEY_ODD,
733
+ '"' => :T_DQUOTE,
734
+ '\'' => :T_SQUOTE,
735
+ # Whitespace
736
+ "\n" => :T_WHITESPACE,
737
+ "\t" => :T_WHITESPACE,
738
+ ' ' => :T_WHITESPACE
739
+ },
740
+ }
741
+ # A Hash with keys cooresponding to states, and value Hashes that describe what tokens trigger the closing
742
+ # of the *current* state or the opening of a *different* state
743
+ @@triggers = {
744
+ :STATE_ROOT_PATH => {
745
+ :open => {
746
+ :T_LBRACE => :STATE_PATH_EXP,
747
+ :T_LBRAK => :STATE_ATTR_EXP,
748
+ :T_LPAREN => :STATE_ARGS
749
+ },
750
+ :close => []
751
+ },
752
+ :STATE_SSTRING => {
753
+ :open => {},
754
+ :close => [:T_SQUOTE]
755
+ },
756
+ :STATE_DSTRING => {
757
+ :open => {},
758
+ :close => [:T_DQUOTE]
759
+ },
760
+ :STATE_REGEX => {
761
+ :open => {},
762
+ :close => [:T_SLASHVBAR]
763
+ },
764
+ :STATE_ATTR_EXP => {
765
+ :open => {
766
+ :T_SQUOTE => :STATE_SSTRING,
767
+ :T_DQUOTE => :STATE_DSTRING,
768
+ :T_VBARSLASH => :STATE_REGEX
769
+ },
770
+ :close => [:T_RBRAK]
771
+ },
772
+ :STATE_PATH_EXP => {
773
+ :open => {
774
+ :T_LBRACE => :STATE_PATH_EXP,
775
+ :T_LBRAK => :STATE_ATTR_EXP,
776
+ :T_LPAREN => :STATE_ARGS
777
+ },
778
+ :close => [:T_RBRACE]
779
+ },
780
+ :STATE_ARGS => {
781
+ :open => {
782
+ :T_SQUOTE => :STATE_SSTRING,
783
+ :T_DQUOTE => :STATE_DSTRING,
784
+ :T_VBARSLASH => :STATE_REGEX
785
+ },
786
+ :close => [:T_RPAREN]
787
+ },
788
+ }
789
+
790
+ # Returns the type of a string of characters that does not match a pattern
791
+ def self.literal_type(literal)
792
+ return :LITERAL_IDENT if literal =~ /^[[:alpha:]][[:alnum:]-_]*$/
793
+ return :LITERAL_INT if literal =~ /^\d+$/
794
+ return :LITERAL_FLOAT if literal =~ /^\d*\.?\d+$/
795
+ return :LITERAL_STRING if literal =~ /^.+$/
796
+ return :LITERAL_UNKNOWN
797
+ end
798
+
799
+ # If no matches found for a letter, move to next letter, even if the possible patterns would match more
800
+ def self.tokenize(input)
801
+ state = [:STATE_ROOT_PATH] # State stack for the tokenizer, state.last will return the current state
802
+ match_start = 0
803
+ match_end = 0
804
+ unmatched_buffer = ''
805
+ largest_full_match = nil
806
+ token_list = [] # The list of tokens in the input, each item is an Array in the form:
807
+ # [Token type, Pattern that matched, State of the tokenizer after the token]
808
+
809
+ # Until we have checked and matched at or on every single character
810
+ while match_start < input.length
811
+ # Start building a substring from a single character
812
+ match_end = match_start
813
+ # Start will all patterns for current state as candidates
814
+ candidates = @@tokens[state.last].keys
815
+ # Start will no full match detected
816
+ largest_full_match = nil
817
+
818
+ # Until nothing can match substring or end of input has been reached
819
+ until candidates.empty? or match_end >= input.length
820
+ # String that candidate patterns will have to match
821
+ matched_string = input[match_start..match_end]
822
+ # Check to see if each remaining candidate pattern matches
823
+ # - If a full match, set as largest full match
824
+ # - Delete if the pattern does not match the string
825
+ candidates.delete_if do |pattern|
826
+ largest_full_match = [pattern, match_start, match_end] if pattern.eql?(matched_string)
827
+ !pattern.start_with?(matched_string) # Element deleted if true is the last statement in block
828
+ end
829
+ # Increase size of match by one if further matching is to be done
830
+ match_end += 1 if not candidates.empty?
831
+ end # Substring is now one character too large to be matched to
832
+
833
+ # If no full match found, add the last checked character as unmatched
834
+ # Otherwise:
835
+ # - Parse the unmatched_buffer as a literal and store
836
+ # - Activate state triggers associated with the largest fully matched token
837
+ # - Store the largest fully matched token
838
+ # - Start again where the matched token completes
839
+ if largest_full_match.nil?
840
+ # Add last checked character to the unmatched buffer
841
+ unmatched_buffer << input[match_start]
842
+ # Start matching again on the next letter
843
+ match_start += 1
844
+ else
845
+
846
+ # Do not activate state triggers associate with the parsed literal
847
+ # Is there a use case?
848
+
849
+ # Parse the unmatched_buffer as a literal and store (if it exists), then clear the buffer
850
+ token_list << [Tokenizer.literal_type(unmatched_buffer), unmatched_buffer, state.last] if unmatched_buffer.length > 0
851
+ unmatched_buffer = ''
852
+
853
+ # Info from the largest fully matched token
854
+ matched_pattern, pattern_start, pattern_end = largest_full_match
855
+ matched_token_type = @@tokens[state.last][matched_pattern]
856
+
857
+ # Activate state triggers associated with the largest fully matched token
858
+ current_state_triggers = @@triggers[state.last]
859
+ if current_state_triggers[:open].has_key?(matched_token_type)
860
+ state.push(current_state_triggers[:open][matched_token_type])
861
+ elsif current_state_triggers[:close].include?(matched_token_type)
862
+ state.pop
863
+ end
864
+
865
+ # Store the largest fully matched token
866
+ token_list << [matched_token_type, matched_pattern, state.last]
867
+
868
+ # Start again where the matched token completes
869
+ match_start = pattern_end + 1
870
+
871
+ end # Next token has been added to list
872
+ end # Input has been fully tokenized
873
+
874
+ # Parse and store the unmatched_buffer one last time
875
+ token_list << [Tokenizer.literal_type(unmatched_buffer), unmatched_buffer, state.last] if unmatched_buffer.length > 0
876
+
877
+ token_list
878
+ end # tokenize_path
879
+
880
+ end # Tokenizer
881
+ end # Scandent
882
+ end # Arboretum