arboretum 0.0.3 → 0.0.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/arboretum/doctree.rb +1566 -0
- data/lib/arboretum/scandent.rb +882 -0
- data/lib/arboretum/xml.rb +169 -0
- metadata +4 -1
@@ -0,0 +1,882 @@
+module Arboretum
+  module Scandent
+
+    class TokenizationException < StandardError
+      def initialize(msg="An error occurred while tokenizing input")
+        super(msg)
+      end
+    end
+
+    class ParseException < StandardError
+      def initialize(msg="An error occurred while parsing input")
+        super(msg)
+      end
+    end
+
+    class InvalidExpressionException < StandardError
+      def initialize(msg="Invalid Scandent expression")
+        super(msg)
+      end
+    end
+
+    class ScandentRule
+      attr_accessor :paths
+
+      def initialize(rule_paths)
+        @paths = rule_paths
+      end
+
+      def valid_on?(element)
+        @paths.each {|path| return true if path.valid_on?(element)}
+        return false
+      end
+      alias_method :selects?, :valid_on?
+      alias_method :matches?, :valid_on?
+
+      def to_s
+        rule_str = ''
+        @paths.each do |path|
+          rule_str << ', ' if !rule_str.empty?
+          rule_str << path.to_s
+        end
+        rule_str
+      end
+    end
+
+    class ListenerRule < ScandentRule
+    end
+
+    class LocatorRule < ScandentRule
+      def locate(element)
+        all_located = []
+        @paths.each do |path|
+          path.locate(element) do |located_element|
+            unless all_located.include?(located_element)
+              yield located_element if block_given?
+              all_located << located_element
+            end
+          end
+        end
+        all_located
+      end
+    end
+
+    class ListenerPath
+      attr_accessor :steps
+
+      def initialize(steps=[])
+        # Make sure that the listener stems from the root element
+        if !steps[0].nil? and steps[0].element_ref != :ELEMENT_ROOT
+          if steps[0].element_ref.nil?
+            implied_step = [
+              [:T_TILDE, '~', :STATE_ROOT_PATH],
+              [:T_SLASH2, '//', :STATE_ROOT_PATH]
+            ]
+            steps.unshift(Parser.parse_step_tokens(implied_step, :PATH_LISTENER))
+          else steps[0].element_ref == :ELEMENT_SELF
+            steps[0].element_ref = :ELEMENT_ROOT
+          end
+        end
+        # Make sure there are no misplaced :ELEMENT_SELF references
+        steps.each {|step| raise InvalidExpressionException.new if step.element_ref == :ELEMENT_SELF and !step.eql?(steps.first)}
+
+        @steps = steps
+      end
+
+      def valid_on?(candidate)
+        steps_valid_on?(@steps, candidate)
+      end
+
+      def steps_valid_on?(steps, candidate)
+        return true if steps.empty?
+        steps.last.match(candidate) do |element|
+          return true if steps_valid_on?(steps[0..-2], element)
+        end
+        return false
+      end
+
+      def to_s
+        path_string = ''
+        @steps.each{|step| path_string << step.to_s}
+        path_string
+      end
+
+    end
+
+    class LocatorPath
+      attr_accessor :steps
+
+      def initialize(steps=[])
+        # Make sure that the locators stems from the root or current context element
+        if steps.first.element_ref.nil?
+          steps.first.action = :ACTION_CHILD if steps.first.action == :ACTION_SELF
+          implied_step = [
+            [:T_DOT, '.', :STATE_ROOT_PATH],
+          ]
+          steps.unshift(Parser.parse_step_tokens(implied_step, :PATH_LOCATOR))
+        end
+        # Make sure there are no misplaced :ELEMENT_SELF references
+        steps.each {|step| raise InvalidExpressionException.new if step.element_ref == :ELEMENT_SELF and !step.eql?(steps.first)}
+
+        @steps = steps
+      end
+
+      def locate(element)
+        matches = [element]
+        next_round = []
+        @steps.each do |step|
+          next_round = []
+          matches.each do |matched_element|
+            next_round += step.match(matched_element)
+          end
+          return next_round if next_round.empty?
+          matches = next_round
+        end
+        matches.each {|match| yield match if block_given?}
+        return matches
+      end
+
+      def valid_on?(candidate)
+        steps_valid_on?(@steps, candidate)
+      end
+
+      def steps_valid_on?(steps, candidate)
+        return true if steps.empty?
+        steps.first.match(candidate) do |element|
+          return true if steps_valid_on?(steps[1..-1], element)
+        end
+        return false
+      end
+
+      def to_s
+        path_string = ''
+        @steps.each{|step| path_string << step.to_s}
+        path_string
+      end
+    end
+
+    class PathStep
+      @@inverse_actions = {
+        :ACTION_CHILD => :ACTION_PARENT,
+        :ACTION_DESCENDANT => :ACTION_ANCESTOR,
+        :ACTION_PARENT => :ACTION_CHILD,
+        :ACTION_ANCESTOR => :ACTION_DESCENDANT,
+        :ACTION_FOLLOWING_SIBLING => :ACTION_PRECEDING_SIBLING,
+        :ACTION_FOLLOWING => :ACTION_PRECEDING,
+        :ACTION_PRECEDING_SIBLING => :ACTION_FOLLOWING_SIBLING,
+        :ACTION_PRECEDING => :ACTION_FOLLOWING,
+        :ACTION_SELF => :ACTION_SELF
+      }
+      @@action_group = {
+        :ACTION_CHILD => :content,
+        :ACTION_DESCENDANT => :descendants,
+        :ACTION_PARENT => :parent,
+        :ACTION_ANCESTOR => :ancestors,
+        :ACTION_FOLLOWING_SIBLING => :sibling_next,
+        :ACTION_FOLLOWING => :following_siblings,
+        :ACTION_PRECEDING_SIBLING => :sibling_prev,
+        :ACTION_PRECEDING => :preceding_siblings,
+        :ACTION_SELF => :itself
+      }
+      @@action_str = {
+        :ACTION_CHILD => '/'.freeze,
+        :ACTION_DESCENDANT => '//'.freeze,
+        :ACTION_PARENT => '/..'.freeze,
+        :ACTION_ANCESTOR => '/...'.freeze,
+        :ACTION_FOLLOWING_SIBLING => '/>'.freeze,
+        :ACTION_FOLLOWING => '/>>'.freeze,
+        :ACTION_PRECEDING_SIBLING => '/<'.freeze,
+        :ACTION_PRECEDING => '/<<'.freeze,
+        :ACTION_SELF => ''.freeze
+      }
+
+      attr_accessor :action, :element_ref, :tag, :namespace, :id, :attrs, :pseudo_exps, :valid_rules
+
+      def initialize(action, element_ref, tag, namespace, id, attrs, pseudo_exps, valid_rules)
+        @action = action
+        @element_ref = element_ref
+        @tag = tag.nil? ? nil : tag.to_sym
+        @namespace = namespace.nil? ? nil : namespace.to_sym
+        @id = id
+        @attrs = attrs
+        @pseudo_exps = pseudo_exps
+        @valid_rules = valid_rules
+      end
+
+      # Check if all fields match that of the given element, with no attention paid to the action
+      # FIXME: Search time improves by 25% when a tag is given. Reject non-tagged Elements more quickly
+      # to get a similar performance boost across the board
+      def describes?(element)
+        if element.kind_of?(Arboretum::DocTree::Elements::TaggedElement)
+          return false if !@tag.nil? and element.tag != @tag
+          return false if !@namespace.nil? and element.namespace != @namespace
+          return false if !@id.nil? and !element.equals_attr_val?(:id, [@id])
+          @attrs[:has].each {|attr_name| return false if !element.has_attr?(attr_name)}
+          @attrs[:contains].each {|name,values| values.each {|value| return false if !element.contains_attr_val?(name,value)}}
+          @attrs[:equals].each {|name,values| values.each {|value| return false if !element.equals_attr_val?(name,value)}}
+          @attrs[:matches].each {|name,values| values.each {|value| return false if !element.matches_attr_val?(name,value)}}
+          return false if element_ref == :ELEMENT_ROOT and !element.parent.nil?
+        else
+          return false if !@tag.nil? or
+                          !@namespace.nil? or
+                          !@id.nil? or
+                          !@attrs[:has].empty? or
+                          !@attrs[:contains].empty? or
+                          !@attrs[:equals].empty? or
+                          !@attrs[:matches].empty? or
+                          (element_ref == :ELEMENT_ROOT and !element.parent.nil?)
+        end
+        @pseudo_exps.each do |psuedo_name, pseudo_arg|
+          return false if !PseudoElements.match(element, pseudo_name, pseudo_arg)
+        end
+        @valid_rules.each do |rule|
+          return false if !rule.valid_on?(element)
+        end
+        return true
+      end
+
+      def to_s_sans_action
+        step_str = ''
+        if !@element_ref.nil?
+          if @element_ref == :ELEMENT_ROOT
+            step_str << '~'
+          elsif @element_ref == :ELEMENT_SELF
+            step_str << '.'
+          end
+        end
+        step_str << "%#{@tag.to_s}" if !tag.nil?
+        step_str << "@#{@namespace.to_s}" if !namespace.nil?
+        step_str << "##{@id}" if !@id.nil?
+        @attrs[:has].each {|attr_name| step_str << "[#{attr_name}]"}
+        @attrs[:contains].each {|name,values| values.each {|value| step_str << "[#{name}=\"#{value}\"]"}}
+        @attrs[:equals].each {|name,values| values.each {|value| step_str << "[#{name}==\"#{value.join(' ')}\"]"}}
+        @attrs[:matches].each {|name,values| values.each {|value| step_str << "[#{name}~=|/#{value}/|]"}}
+        @valid_rules.each {|rule| step_str << "{#{rule.to_s}}"}
+        @pseudo_exps.each {|pseudo, arg| step_str << ":#{pseudo}(#{arg.to_s})"}
+        step_str
+      end
+    end
+
+    class ListenerStep < PathStep
+      # Take step action and check for elements matching description
+      def match(element)
+        result = []
+        inverse_action = @@inverse_actions[@action]
+        search_group = element.public_send(@@action_group[inverse_action])
+        search_group.listing.each do |searched_element|
+          yield searched_element if self.describes?(searched_element) and block_given?
+          result << searched_element if self.describes?(searched_element)
+        end
+        result
+      end
+
+      def to_s
+        step_str = self.to_s_sans_action
+        step_str << @@action_str[@action]
+      end
+    end
+
+    class LocatorStep < PathStep
+      # Take step action and check for elements matching description
+      def match(element)
+        result = []
+        search_group = element.public_send(@@action_group[@action])
+        search_group.listing.each do |searched_element|
+          yield searched_element if self.describes?(searched_element) and block_given?
+          result << searched_element if self.describes?(searched_element)
+        end
+        result
+      end
+
+      def to_s
+        step_str = '' # Don't copy reference to @@action_str[@action]
+        step_str << @@action_str[@action]
+        step_str << self.to_s_sans_action
+      end
+    end
+
+    class Formula
+      def initialize(form_tokens)
+        @coefficient = 0
+        @intercept = 0
+        form_token_types = form_tokens.map {|token| token[0]}
+        if form_token_types[0] == :T_KEY_EVEN
+          @coefficient = 2
+          @intercept = 0
+        elsif form_token_types[0] == :T_KEY_ODD
+          @coefficient = 2
+          @intercept = 1
+        else
+          term_negative = false
+          term_coef = false
+          value = 0
+
+          index = 0
+          while index < form_tokens.length
+            case form_token_types[index]
+            when :T_FORM_PLUS
+              # Resolve Term
+              if term_coef
+                @coefficient += value
+              else
+                @intercept += value
+              end
+              # Reset with new sign
+              term_negative = false
+              term_coef = false
+              value = 0
+            when :T_FORM_MINUS
+              # Resolve Term
+              if term_coef
+                @coefficient += value
+              else
+                @intercept += value
+              end
+              # Reset with new sign
+              term_negative = true
+              term_coef = false
+              value = 0
+            when :LITERAL_INT
+              value = form_tokens[index][1].to_i
+              value *= -1 if term_negative
+            when :T_FORM_N
+              term_coef = true
+              value = 1 if value.zero?
+            else
+              raise InvalidExpressionException.new
+            end
+            index += 1
+          end
+          # Resolve one last time
+          if term_coef
+            @coefficient += value
+          else
+            @intercept += value
+          end
+        end
+      end
+
+      def to_s
+        "#{@coefficient}n#{'+' if @intercept >= 0}#{@intercept}"
+      end
+    end
+
+    class PseudoElements
+      def self.match(element, pseudo_name, pseudo_arg)
+        PseudoElements.public_send(:pseudo_name, element, pseudo_arg)
+      end
+    end
+
+    # Parser interprets tokens formed by a Scandent string and formed ScandentRules
+    # that represent the interpreted form of the string
+    # ScandentRules can then be matched to Elements in a DocTree
+    class Parser
+      @@actions = {
+        :T_SLASH => :ACTION_CHILD,
+        :T_SLASH2 => :ACTION_DESCENDANT,
+        :T_SLASHDOT2 => :ACTION_PARENT,
+        :T_SLASHDOT3 => :ACTION_ANCESTOR,
+        :T_SLASHGT => :ACTION_FOLLOWING_SIBLING,
+        :T_SLASHGT2 => :ACTION_FOLLOWING,
+        :T_SLASHLT => :ACTION_PRECEDING_SIBLING,
+        :T_SLASHLT2 => :ACTION_PRECEDING
+      }
+
+      # Parse a Scandent string by giving it to the Tokenizer and then parsing the results
+      def self.parse_rule_string(rule_string, type)
+        Parser.parse_rule_tokens(Tokenizer.tokenize(rule_string), type)
+      end
+
+      # Directly parse Scandent string tokens
+      def self.parse_rule_tokens(rule_tokens, type)
+        # Separate rules into its comma-delimited paths and remove the delimiter
+        rule_paths_tokens = rule_tokens.slice_after {|token| token[0] == :T_COMMA and token[2] == :STATE_ROOT_PATH}.to_a
+        rule_paths_tokens.each {|path_tokens| path_tokens.pop if path_tokens.last[0] == :T_COMMA}
+
+        # Parse each path individually
+        rule_paths = rule_paths_tokens.map{|path| Parser.parse_path_tokens(path, type)}
+
+        if type == :PATH_LISTENER
+          ListenerRule.new(rule_paths)
+        elsif type == :PATH_LOCATOR
+          LocatorRule.new(rule_paths)
+        else
+          raise ParseException.new("Unknown step type")
+        end
+      end
+
+      # Parse an individual path of a rule
+      def self.parse_path_tokens(path_tokens, type)
+        # Double check that the end state of the path is valid
+        raise InvalidExpressionException.new("End state of path is '#{path_tokens.last[2]}' instead of :STATE_ROOT_PATH") if path_tokens.last[2] != :STATE_ROOT_PATH
+
+        step_delimiters = [:T_SLASH, :T_SLASH2, :T_SLASHDOT2, :T_SLASHDOT3, :T_SLASHGT, :T_SLASHGT2, :T_SLASHLT, :T_SLASHLT2]
+
+        # Seperate paths into its steps
+        if type == :PATH_LISTENER
+          path_steps_tokens = path_tokens.slice_after {|token| step_delimiters.include? token[0] and token[2] == :STATE_ROOT_PATH}.to_a
+        elsif type == :PATH_LOCATOR
+          path_steps_tokens = path_tokens.slice_before {|token| step_delimiters.include? token[0] and token[2] == :STATE_ROOT_PATH}.to_a
+        else
+          raise ParseException.new("Unknown step type")
+        end
+
+        # Parse each step individually
+        path_steps = path_steps_tokens.map{|step| Parser.parse_step_tokens(step, type)}
+
+        if type == :PATH_LISTENER
+          ListenerPath.new(path_steps)
+        else
+          LocatorPath.new(path_steps)
+        end
+      end
+
+      # Parse an individual step of a path
+      def self.parse_step_tokens(step_tokens, type)
+        # Remove whitespace tokens
+        step_tokens.delete_if {|token| token[0] == :T_WHITESPACE}
+
+        if type == :PATH_LISTENER
+          action = @@actions.has_key?(step_tokens.last[0]) ? @@actions[step_tokens.last[0]] : :ACTION_SELF
+        elsif type == :PATH_LOCATOR
+          action = @@actions.has_key?(step_tokens.first[0]) ? @@actions[step_tokens.first[0]] : :ACTION_SELF
+        else
+          raise ParseException.new("Unknown step type")
+        end
+
+        element_ref = []
+        tag = []
+        namespace = []
+        id = []
+        attrs = {
+          :has => [],
+          :contains => Hash.new{|k, v| k[v] = Array.new},
+          :equals => Hash.new{|k, v| k[v] = Array.new},
+          :matches => Hash.new{|k, v| k[v] = Array.new}
+        }
+        pseudo_exps = []
+        valid_rules = []
+
+        index = 0
+        state = :STATE_ROOT_PATH
+        while index < step_tokens.length
+          # Consume current token and increment
+          index_token = step_tokens[index]
+          index += 1
+
+          case index_token[0]
+          when :T_PCT
+            following_token = step_tokens[index]
+            index += 1
+            raise InvalidExpressionException.new if following_token[0] != :LITERAL_IDENT
+            tag << following_token[1]
+          when :T_AT
+            following_token = step_tokens[index]
+            index += 1
+            raise InvalidExpressionException.new if following_token[0] != :LITERAL_IDENT
+            namespace << following_token[1]
+          when :T_PND
+            following_token = step_tokens[index]
+            index += 1
+            raise InvalidExpressionException.new if following_token[0] != :LITERAL_IDENT
+            id << following_token[1]
+          when :T_COLON
+            following_token = step_tokens[index]
+            index += 1
+            raise InvalidExpressionException.new if following_token[0] != :LITERAL_IDENT
+            pseudo_name = following_token[1].to_sym
+            arg_tokens = []
+
+            following_token = step_tokens[index]
+            if following_token[0] == :T_LPAREN
+              index += 1 # To consume the LPAREN
+              following_token = step_tokens[index]
+              index += 1
+              until following_token[0] == :T_RPAREN or index > step_tokens.length
+                arg_tokens << following_token
+                following_token = step_tokens[index]
+                index += 1
+              end
+              raise InvalidExpressionException.new if index > step_tokens.length # Undesirable exit condition to above loop
+              index += 1 # To consume the RPAREN
+            end
+            pseudo_exps << [pseudo_name, Parser.parse_arg(arg_tokens)]
+          when :T_ASTERISK # Adds no restrictions, so do nothing
+          when :T_TILDE
+            element_ref << :ELEMENT_ROOT
+          when :T_DOT
+            element_ref << :ELEMENT_SELF
+          when :T_LBRAK
+            following_token = step_tokens[index]
+            index += 1
+            raise InvalidExpressionException.new if following_token[0] != :LITERAL_IDENT
+            attr_name = following_token[1].to_sym
+            attr_value = nil
+            operation = nil
+
+            following_token = step_tokens[index]
+            index += 1
+            case following_token[0]
+            when :T_EQL
+              operation = :contains
+
+              following_token = step_tokens[index]
+              index += 1
+              raise InvalidExpressionException.new("Expected a string after '='") if ![:T_DQUOTE, :T_SQUOTE].include?(following_token[0])
+              string_limiter = following_token[0]
+
+              following_token = step_tokens[index]
+              index += 1
+              raise InvalidExpressionException.new if not [:LITERAL_IDENT, :LITERAL_STRING, :LITERAL_INT, :LITERAL_FLOAT].include?(following_token[0])
+              attr_value = following_token[1]
+
+              following_token = step_tokens[index]
+              index += 1
+              raise InvalidExpressionException.new if following_token[0] != string_limiter
+
+              following_token = step_tokens[index]
+              index += 1
+              raise InvalidExpressionException.new if following_token[0] != :T_RBRAK
+            when :T_EQL2
+              operation = :equals
+              following_token = step_tokens[index]
+              index += 1
+              raise InvalidExpressionException.new if ![:T_DQUOTE, :T_SQUOTE].include?(following_token[0])
+              string_limiter = following_token[0]
+
+              following_token = step_tokens[index]
+              index += 1
+              raise InvalidExpressionException.new if not [:LITERAL_IDENT, :LITERAL_STRING, :LITERAL_INT, :LITERAL_FLOAT].include?(following_token[0])
+              attr_value = following_token[1].split
+
+              following_token = step_tokens[index]
+              index += 1
+              raise InvalidExpressionException.new if following_token[0] != string_limiter
+
+              following_token = step_tokens[index]
+              index += 1
+              raise InvalidExpressionException.new if following_token[0] != :T_RBRAK
+            when :T_TILDE_EQL
+              operation = :matches
+
+              following_token = step_tokens[index]
+              index += 1
+              raise InvalidExpressionException.new if following_token[0] != :T_VBARSLASH
+
+              following_token = step_tokens[index]
+              index += 1
+              raise InvalidExpressionException.new if not [:LITERAL_IDENT, :LITERAL_STRING, :LITERAL_INT, :LITERAL_FLOAT].include?(following_token[0])
+              attr_value = Regexp.new(following_token[1])
+
+              following_token = step_tokens[index]
+              index += 1
+              raise InvalidExpressionException.new if following_token[0] != :T_SLASHVBAR
+
+              following_token = step_tokens[index]
+              index += 1
+              raise InvalidExpressionException.new if following_token[0] != :T_RBRAK
+            when :T_RBRAK
+              operation = nil
+            else
+              raise InvalidExpressionException.new
+            end
+            if operation.nil?
+              attrs[:has] << attr_name
+            else
+              attrs[operation][attr_name] << attr_value
+            end
+          when :T_LBRACE
+            equilibrium = 1
+            reformed_path_string = ''
+            following_token = step_tokens[index]
+            index += 1
+            until (following_token[0] == :T_RBRACE and equilibrium.zero?) or index > step_tokens.length
+              reformed_path_string << following_token[1]
+              following_token = step_tokens[index]
+              index += 1
+              equilibrium += 1 if following_token[0] == :T_LBRACE
+              equilibrium -= 1 if following_token[0] == :T_RBRACE
+            end
+            raise InvalidExpressionException.new("Could not find matching R_BRACE in #{reformed_path_string}") if index > step_tokens.length # Undesirable exit condition to above loop
+            valid_rules << Parser.parse_rule_string(reformed_path_string, :PATH_LOCATOR)
+          when :T_SLASH, :T_SLASH2, :T_SLASHDOT2, :T_SLASHDOT3, :T_SLASHGT, :T_SLASHGT2, :T_SLASHLT, :T_SLASHLT2
+          else
+            raise ParseException.new("Consumed unexpected token: #{index_token}")
+          end
+        end # All tokens consumed
+
+        # Validate results
+        raise InvalidExpressionException.new if tag.length > 1
+        raise InvalidExpressionException.new if namespace.length > 1
+        raise InvalidExpressionException.new if element_ref.length > 1
+
+        if type == :PATH_LISTENER
+          ListenerStep.new(action, element_ref[0], tag[0], namespace[0], id[0], attrs, pseudo_exps, valid_rules)
+        else
+          LocatorStep.new(action, element_ref[0], tag[0], namespace[0], id[0], attrs, pseudo_exps, valid_rules)
+        end
+      end
+
+      # Parse an argument given to a pseudo-class
+      def self.parse_arg(arg_tokens)
+        arg_token_types = arg_tokens.map {|token| token[0]}
+        if ((arg_token_types[0] == :T_SQUOTE and arg_token_types[2] == :T_SQUOTE) or (arg_token_types[0] == :T_DQUOTE and arg_token_types[2] == :T_DQUOTE)) and arg_tokens.length == 3
+          arg_tokens[1][1]
+        elsif (arg_token_types[0] == :T_VBARSLASH and arg_token_types[2] == :T_SLASHVBAR) and arg_tokens.length == 3
+          Regexp.new(arg_tokens[1][1])
+        elsif (!(arg_token_types & [:T_KEY_EVEN, :T_KEY_ODD]).empty? and arg_tokens.length == 1) or arg_token_types.include? :T_FORM_N
+          Formula.new(arg_tokens)
+        elsif arg_token_types[0] == :LITERAL_INT and arg_tokens.length == 1
+          arg_tokens[0][1].to_i
+        elsif arg_token_types[0] == :LITERAL_FLOAT and arg_tokens.length == 1
+          arg_tokens[0][1].to_f
+        elsif arg_tokens.length.zero?
+          nil
+        else
+          arg_str = ''
+          arg_tokens.each {|token| arg_str << token[1]}
+          raise InvalidExpressionException.new("Invalid argument '#{arg_str}'")
+          nil
+        end
+      end
+
+    end # Parser
+
+    # A class with class methods used to tokenize a Scandent string
+    # Has information regarding which character patterns match which tokens
+    # Has information regarding which tokens trigger which state in the tokenizer
+    class Tokenizer
+      # A Hash with keys cooresponding to states, and value Hashes that match patterns in that state to tokens
+      @@tokens = {
+        :STATE_ROOT_PATH => {
+          ',' => :T_COMMA,
+          '~' => :T_TILDE,
+          '//' => :T_SLASH2,
+          '/' => :T_SLASH,
+          '.' => :T_DOT,
+          '/..' => :T_SLASHDOT2,
+          '/...' => :T_SLASHDOT3,
+          '/>' => :T_SLASHGT,
+          '/>>' => :T_SLASHGT2,
+          '/<' => :T_SLASHLT,
+          '/<<' => :T_SLASHLT2,
+          '*' => :T_ASTERISK,
+          '{' => :T_LBRACE,
+          '[' => :T_LBRAK,
+          '(' => :T_LPAREN,
+          ':' => :T_COLON,
+          '#' => :T_PND,
+          '%' => :T_PCT,
+          '@' => :T_AT,
+          # Whitespace
+          "\n" => :T_WHITESPACE,
+          "\t" => :T_WHITESPACE,
+          ' ' => :T_WHITESPACE
+        },
+        :STATE_SSTRING => {
+          '\'' => :T_SQUOTE # Closing Token
+        },
+        :STATE_DSTRING => {
+          '"' => :T_DQUOTE, # Closing Token
+        },
+        :STATE_REGEX => {
+          '/|' => :T_SLASHVBAR # Closing Token
+        },
+        :STATE_ATTR_EXP => {
+          ']' => :T_RBRAK, # Closing Token
+          '=' => :T_EQL,
+          '==' => :T_EQL2,
+          '~=' => :T_TILDE_EQL,
+          '"' => :T_DQUOTE,
+          '\'' => :T_SQUOTE,
+          '|/' => :T_VBARSLASH,
+          # Whitespace
+          "\n" => :T_WHITESPACE,
+          "\t" => :T_WHITESPACE,
+          ' ' => :T_WHITESPACE
+        },
+        :STATE_PATH_EXP => {
+          '}' => :T_RBRACE, # Closing Token
+          # From ROOT_PATH
+          ',' => :T_COMMA,
+          '~' => :T_TILDE,
+          '//' => :T_SLASH2,
+          '/' => :T_SLASH,
+          '.' => :T_DOT,
+          '/..' => :T_SLASHDOT2,
+          '/...' => :T_SLASHDOT3,
+          '/>' => :T_SLASHGT,
+          '/>>' => :T_SLASHGT2,
+          '/<' => :T_SLASHLT,
+          '/<<' => :T_SLASHLT2,
+          '*' => :T_ASTERISK,
+          '{' => :T_LBRACE,
+          '[' => :T_LBRAK,
+          '(' => :T_LPAREN,
+          ':' => :T_COLON,
+          '#' => :T_PND,
+          '%' => :T_PCT,
+          '@' => :T_AT,
+          # Whitespace
+          "\n" => :T_WHITESPACE,
+          "\t" => :T_WHITESPACE,
+          ' ' => :T_WHITESPACE
+        },
+        :STATE_ARGS => {
+          ')' => :T_RPAREN, # Closing Token
+          '|/' => :T_VBARSLASH,
+          'n' => :T_FORM_N,
+          '+' => :T_FORM_PLUS,
+          '-' => :T_FORM_MINUS,
+          'even' => :T_KEY_EVEN,
+          'odd' => :T_KEY_ODD,
+          '"' => :T_DQUOTE,
+          '\'' => :T_SQUOTE,
+          # Whitespace
+          "\n" => :T_WHITESPACE,
+          "\t" => :T_WHITESPACE,
+          ' ' => :T_WHITESPACE
+        },
+      }
+      # A Hash with keys cooresponding to states, and value Hashes that describe what tokens trigger the closing
+      # of the *current* state or the opening of a *different* state
+      @@triggers = {
+        :STATE_ROOT_PATH => {
+          :open => {
+            :T_LBRACE => :STATE_PATH_EXP,
+            :T_LBRAK => :STATE_ATTR_EXP,
+            :T_LPAREN => :STATE_ARGS
+          },
+          :close => []
+        },
+        :STATE_SSTRING => {
+          :open => {},
+          :close => [:T_SQUOTE]
+        },
+        :STATE_DSTRING => {
+          :open => {},
+          :close => [:T_DQUOTE]
+        },
+        :STATE_REGEX => {
+          :open => {},
+          :close => [:T_SLASHVBAR]
+        },
+        :STATE_ATTR_EXP => {
+          :open => {
+            :T_SQUOTE => :STATE_SSTRING,
+            :T_DQUOTE => :STATE_DSTRING,
+            :T_VBARSLASH => :STATE_REGEX
+          },
+          :close => [:T_RBRAK]
+        },
+        :STATE_PATH_EXP => {
+          :open => {
+            :T_LBRACE => :STATE_PATH_EXP,
+            :T_LBRAK => :STATE_ATTR_EXP,
+            :T_LPAREN => :STATE_ARGS
+          },
+          :close => [:T_RBRACE]
+        },
+        :STATE_ARGS => {
+          :open => {
+            :T_SQUOTE => :STATE_SSTRING,
+            :T_DQUOTE => :STATE_DSTRING,
+            :T_VBARSLASH => :STATE_REGEX
+          },
+          :close => [:T_RPAREN]
+        },
+      }
+
+      # Returns the type of a string of characters that does not match a pattern
+      def self.literal_type(literal)
+        return :LITERAL_IDENT if literal =~ /^[[:alpha:]][[:alnum:]-_]*$/
+        return :LITERAL_INT if literal =~ /^\d+$/
+        return :LITERAL_FLOAT if literal =~ /^\d*\.?\d+$/
+        return :LITERAL_STRING if literal =~ /^.+$/
+        return :LITERAL_UNKNOWN
+      end
+
+      # If no matches found for a letter, move to next letter, even if the possible patterns would match more
+      def self.tokenize(input)
+        state = [:STATE_ROOT_PATH] # State stack for the tokenizer, state.last will return the current state
+        match_start = 0
+        match_end = 0
+        unmatched_buffer = ''
+        largest_full_match = nil
+        token_list = [] # The list of tokens in the input, each item is an Array in the form:
+                        # [Token type, Pattern that matched, State of the tokenizer after the token]
+
+        # Until we have checked and matched at or on every single character
+        while match_start < input.length
+          # Start building a substring from a single character
+          match_end = match_start
+          # Start will all patterns for current state as candidates
+          candidates = @@tokens[state.last].keys
+          # Start will no full match detected
+          largest_full_match = nil
+
+          # Until nothing can match substring or end of input has been reached
+          until candidates.empty? or match_end >= input.length
+            # String that candidate patterns will have to match
+            matched_string = input[match_start..match_end]
+            # Check to see if each remaining candidate pattern matches
+            # - If a full match, set as largest full match
+            # - Delete if the pattern does not match the string
+            candidates.delete_if do |pattern|
+              largest_full_match = [pattern, match_start, match_end] if pattern.eql?(matched_string)
+              !pattern.start_with?(matched_string) # Element deleted if true is the last statement in block
+            end
+            # Increase size of match by one if further matching is to be done
+            match_end += 1 if not candidates.empty?
+          end # Substring is now one character too large to be matched to
+
+          # If no full match found, add the last checked character as unmatched
+          # Otherwise:
+          # - Parse the unmatched_buffer as a literal and store
+          # - Activate state triggers associated with the largest fully matched token
+          # - Store the largest fully matched token
+          # - Start again where the matched token completes
+          if largest_full_match.nil?
+            # Add last checked character to the unmatched buffer
+            unmatched_buffer << input[match_start]
+            # Start matching again on the next letter
+            match_start += 1
+          else
+
+            # Do not activate state triggers associate with the parsed literal
+            # Is there a use case?
+
+            # Parse the unmatched_buffer as a literal and store (if it exists), then clear the buffer
+            token_list << [Tokenizer.literal_type(unmatched_buffer), unmatched_buffer, state.last] if unmatched_buffer.length > 0
+            unmatched_buffer = ''
+
+            # Info from the largest fully matched token
+            matched_pattern, pattern_start, pattern_end = largest_full_match
+            matched_token_type = @@tokens[state.last][matched_pattern]
+
+            # Activate state triggers associated with the largest fully matched token
+            current_state_triggers = @@triggers[state.last]
+            if current_state_triggers[:open].has_key?(matched_token_type)
+              state.push(current_state_triggers[:open][matched_token_type])
+            elsif current_state_triggers[:close].include?(matched_token_type)
+              state.pop
+            end
+
+            # Store the largest fully matched token
+            token_list << [matched_token_type, matched_pattern, state.last]
+
+            # Start again where the matched token completes
+            match_start = pattern_end + 1
+
+          end # Next token has been added to list
+        end # Input has been fully tokenized
+
+        # Parse and store the unmatched_buffer one last time
+        token_list << [Tokenizer.literal_type(unmatched_buffer), unmatched_buffer, state.last] if unmatched_buffer.length > 0
+
+        token_list
+      end # tokenize_path
+
+    end # Tokenizer
+  end # Scandent
+end # Arboretum