fabulator-grammar 0.0.1 → 0.0.3
Sign up to get free protection for your applications and to get access to all the features.
- data/History.txt +22 -0
- data/Rakefile +3 -1
- data/VERSION +1 -1
- data/features/grammar.feature +116 -12
- data/features/step_definitions/expression_steps.rb +2 -2
- data/features/step_definitions/grammar_steps.rb +46 -2
- data/features/step_definitions/xml_steps.rb +5 -16
- data/features/support/env.rb +1 -0
- data/lib/fabulator-grammar.rb +1 -0
- data/lib/fabulator/grammar.rb +12 -3
- data/lib/fabulator/grammar/actions.rb +17 -7
- data/lib/fabulator/grammar/actions/context.rb +18 -0
- data/lib/fabulator/grammar/actions/grammar.rb +76 -0
- data/lib/fabulator/grammar/actions/rule.rb +51 -0
- data/lib/fabulator/grammar/actions/token.rb +27 -0
- data/lib/fabulator/grammar/actions/when.rb +35 -0
- data/lib/fabulator/grammar/cursor.rb +118 -0
- data/lib/fabulator/grammar/expr/anchor.rb +28 -0
- data/lib/fabulator/grammar/expr/char_set.rb +67 -18
- data/lib/fabulator/grammar/expr/look_ahead.rb +44 -0
- data/lib/fabulator/grammar/expr/rule.rb +33 -28
- data/lib/fabulator/grammar/expr/rule_alternative.rb +45 -0
- data/lib/fabulator/grammar/expr/rule_mode.rb +16 -0
- data/lib/fabulator/grammar/expr/rule_ref.rb +15 -4
- data/lib/fabulator/grammar/expr/rule_sequence.rb +59 -0
- data/lib/fabulator/grammar/expr/sequence.rb +7 -1
- data/lib/fabulator/grammar/expr/set_skip.rb +16 -0
- data/lib/fabulator/grammar/expr/text.rb +8 -0
- data/lib/fabulator/grammar/expr/{rules.rb → token.rb} +12 -1
- data/lib/fabulator/grammar/expr/token_alternative.rb +42 -0
- data/lib/fabulator/grammar/rule_parser.rb +667 -0
- data/lib/fabulator/grammar/token_parser.rb +733 -0
- data/rules.racc +249 -0
- data/tokens.racc +257 -0
- metadata +29 -12
- data/lib/fabulator/grammar/parser.rb +0 -548
- data/regex.racc +0 -183
data/rules.racc
ADDED
@@ -0,0 +1,249 @@
|
|
1
|
+
class Fabulator::Grammar::RuleParser
|
2
|
+
|
3
|
+
start rules
|
4
|
+
|
5
|
+
rule
|
6
|
+
|
7
|
+
rules: rule { result = Fabulator::Grammar::Expr::Rule.new; result.add_alternative(val[0]) }
|
8
|
+
| rules PIPE rule { result = val[0]; result.add_alternative(val[2]) }
|
9
|
+
|
10
|
+
rule: { result = Fabulator::Grammar::Expr::RuleAlternative.new; }
|
11
|
+
| rule rule_bit { result = val[0]; result.add_sequence(val[1]) }
|
12
|
+
|
13
|
+
rule_bit: directives
|
14
|
+
| sequence
|
15
|
+
|
16
|
+
directives: directive { result = [ val[0] ] }
|
17
|
+
| directives directive { result = val[0] + [ val[1] ] }
|
18
|
+
|
19
|
+
directive: LLB MODE NCNAME RB { result = Fabulator::Grammar::Expr::RuleMode.new(val[2]) }
|
20
|
+
| LLB COMMIT RB
|
21
|
+
| LLB UNCOMMIT RB
|
22
|
+
| LLB REJECT RB
|
23
|
+
| LLB SKIP opt_separator RB { result = Fabulator::Grammar::Expr::SetSkip.new(val[2]) }
|
24
|
+
| LLB RESYNC opt_separator RB { result = Fabulator::Grammar::Expr::Resync.new(val[2]) }
|
25
|
+
| DOT_DOT_DOT sequence { result = Fabulator::Grammar::Expr::LookAhead.new(val[1]) }
|
26
|
+
| DOT_DOT_DOT_BANG sequence { result = Fabulator::Grammar::Expr::NegLookAhead.new(val[1]) }
|
27
|
+
| CARET { result = Fabulator::Grammar::Expr::Anchor.new(:start_of_line) }
|
28
|
+
| CARET_CARET { result = Fabulator::Grammar::Expr::Anchor.new(:start_of_string) }
|
29
|
+
| DOLLAR { result = Fabulator::Grammar::Expr::Anchor.new(:end_of_line) }
|
30
|
+
| DOLLAR_DOLLAR { result = Fabulator::Grammar::Expr::Anchor.new(:end_of_string) }
|
31
|
+
|
32
|
+
sequence: atom sequence_qualifiers { result = Fabulator::Grammar::Expr::RuleSequence.new(nil, val[0], val[1]) }
|
33
|
+
| atom { result = Fabulator::Grammar::Expr::RuleSequence.new(nil, val[0]) }
|
34
|
+
| hypothetical atom sequence_qualifiers { result = Fabulator::Grammar::Expr::RuleSequence.new(val[0], val[1], val[2]) }
|
35
|
+
| hypothetical atom { result = Fabulator::Grammar::Expr::RuleSequence.new(val[0], val[1]) }
|
36
|
+
|
37
|
+
hypothetical: NCNAME COLON_EQUAL { result = val[0] }
|
38
|
+
|
39
|
+
# /\((\?|s|s\?|\d+(\.\.(\d+)?)?|\.\.\d+)(\s+ncname)?\)/
|
40
|
+
# and with no prior space -- and ncname should be specifiable as
|
41
|
+
# a text constant
|
42
|
+
# the ncname specifies a token that separates instances
|
43
|
+
# shorthand: if we see '(s[ )]', '(s?[ )]', '(?)', '(\d', then we have
|
44
|
+
# something for the following instead of LP rules RP
|
45
|
+
sequence_qualifiers: LLP QUESTION RP { result = [ '?'.to_sym ] }
|
46
|
+
| LLP S opt_separator RP { result = [ :s, val[2] ] }
|
47
|
+
| LLP S QUESTION opt_separator RP { result = [ 's?'.to_sym, val[3] ] }
|
48
|
+
| LLP INTEGER opt_separator RP { result = [ :count, val[1], val[2] ] }
|
49
|
+
| LLP INTEGER DOT_DOT INTEGER opt_separator RP { result = [ :range, [ val[1], val[3] ], val[4] ] }
|
50
|
+
| LLP DOT_DOT INTEGER opt_separator RP { result = [ :upto, val[2], val[3] ] }
|
51
|
+
| LLP INTEGER DOT_DOT opt_separator RP { result = [ :atleast, val[1],val[3] ] }
|
52
|
+
|
53
|
+
opt_separator:
|
54
|
+
| atom
|
55
|
+
|
56
|
+
atom: LITERAL { result = Fabulator::Grammar::Expr::Text.new(val[0]) }
|
57
|
+
| LP rules RP { result = val[1] }
|
58
|
+
| NCNAME opt_params { result = Fabulator::Grammar::Expr::RuleRef.new(val[0]) }
|
59
|
+
|
60
|
+
opt_params:
|
61
|
+
| LB params RB
|
62
|
+
|
63
|
+
params: param
|
64
|
+
| params COMMA param
|
65
|
+
| params COMMA
|
66
|
+
|
67
|
+
param: hypothetical relative_path
|
68
|
+
| hypothetical LITERAL
|
69
|
+
|
70
|
+
relative_path: NCNAME
|
71
|
+
| NCNAME SLASH NCNAME
|
72
|
+
|
73
|
+
|
74
|
+
---- inner
|
75
|
+
require 'fabulator/grammar'
|
76
|
+
|
77
|
+
# Parses the rule expression +t+ and returns the value produced by do_parse.
#
# Every piece of lexer state read by next_token is reset here, so a parser
# instance can be reused for several expressions without state from an
# earlier (possibly failed) parse leaking into the next one.
def parse(t)
  @source = t
  @curpos = 0
  @line = 0
  @col = 0

  @in_quantifier = false
  # next_token sets this after a directive bracket and only clears it when it
  # sees the directive keyword; without the reset, an aborted parse would make
  # the first name of the next parse be lexed as a directive keyword.
  @in_directive = false

  # Racc debugging flag (see the Racc documentation for when it has an effect).
  @yydebug = true

  @last_token = nil

  do_parse
end
|
91
|
+
|
92
|
+
# Error hook: reports a parse failure as a Fabulator::Grammar::ParserError.
# The message quotes the second argument handed to the hook and the lexer's
# current position (lines are reported 1-based, the column as tracked).
def on_error(*args)
  message = "unable to parse '#{args[1]}' near line #{@line + 1}, column #{@col}"
  raise Fabulator::Grammar::ParserError, message
end
|
95
|
+
|
96
|
+
# Operator lexemes recognised by the rule lexer, mapped to the terminal names
# returned by next_token.
@@ops = {
  ':='   => :COLON_EQUAL,
  '['    => :LB,
  ']'    => :RB,
  '('    => :LP,
  ')'    => :RP,
  '{'    => :LC,
  '}'    => :RC,
  '?'    => :QUESTION,
  '.'    => :DOT,
  '..'   => :DOT_DOT,
  '...'  => :DOT_DOT_DOT,
  '...!' => :DOT_DOT_DOT_BANG,
  '|'    => :PIPE,
  ','    => :COMMA,
  ':'    => :COLON,
  '^'    => :CARET,
  '^^'   => :CARET_CARET,
  '$'    => :DOLLAR,
  '$$'   => :DOLLAR_DOLLAR,
  '/'    => :SLASH,
}

# Lexer patterns.
#
# Patterns that are matched against "the rest of the input" are anchored with
# \A rather than ^: ^ also matches right after any newline and Regexp#match
# scans forward, so with ^ a multi-line source could yield a token that lies
# somewhere after the cursor instead of at it.
@@regex = {
  # Longest operators first so that '...!' wins over '...', '..' and '.'.
  :simple_tokens => %r{\A(#{Regexp.union(@@ops.keys.sort_by { |op| -op.length })})},
  :ncname => %r{(?:[a-zA-Z_][-a-zA-Z0-9_.]*)},
  :integer => %r{(\d+)},
  :literal => %r{((?:"(?:[^\\"]*(?:\\.[^\\"]*)*)")|(?:'(?:[^\\']*(?:\\.[^\\']*)*)'))},
}

# Capture groups: 1 = ncname, 2 = integer, 3 = quoted literal (quotes included).
# The alternatives are grouped so the anchor applies to all of them, not just
# to the ncname branch.
@@regex[:general] = %r{\A(?:(#{@@regex[:ncname]})|#{@@regex[:integer]}|#{@@regex[:literal]})}
|
127
|
+
|
128
|
+
# Lexer hook used by the parser: returns the next token of @source as
# [type, text], or [false, false] once the input is exhausted.
#
# Whitespace and (: ... :) comments are skipped first. Operators listed in
# @@ops are tried before names, integers and quoted literals. Literal tokens
# carry their text without the surrounding quotes.
#
# Two context-sensitive adjustments are made after a token is recognised:
# * '(' directly preceded by something other than a space or '(' is reported
#   as :LLP (start of a repetition qualifier); the token after it, when it is
#   the name 's', is reported as :S.
# * '[' at the very start of the source or directly preceded by a space is
#   reported as :LLB (start of a directive); the next name token is reported
#   under its upper-cased text as the token type (e.g. 'mode' => :MODE).
#
# NOTE(review): only a literal space counts as "preceded by space" here; a tab
# or newline before a bracket is treated like any other character. This
# matches the previous behaviour -- confirm whether that is intended.
#
# Raises RuntimeError when the text at the cursor is not a known token.
def next_token
  @token = nil
  skip_ignorable

  if @curpos >= @source.length
    @last_token = nil
    return [false, false]
  end

  rest = @source[@curpos..-1]
  op = match_at_start(@@regex[:simple_tokens], rest)
  if op
    @token = [@@ops[op[1]], op[1]]
  else
    lexeme = match_at_start(@@regex[:general], rest)
    if lexeme.nil?
      raise "Failed to parse '#{@source}' at #{@curpos}': #{rest}"
    end

    if lexeme[1]
      @token = [:NCNAME, lexeme[1].to_s]
    elsif lexeme[2]
      @token = [:INTEGER, lexeme[2].to_s]
    else
      # Strip the quotes from the token text but still step over them.
      @token = [:LITERAL, lexeme[3].to_s[1..-2]]
      @curpos += 2
      @col += 2
    end
  end

  @curpos += @token[1].length
  @col += @token[1].length

  # Character right before a one-character bracket token (nil when the
  # bracket is the first character of the source).
  preceding = @curpos > 1 ? @source[@curpos - 2, 1] : nil

  if @token[0] == :LP
    unless preceding.nil? || preceding == ' ' || preceding == '('
      @token[0] = :LLP
      @in_quantifier = true
    end
  elsif @in_quantifier
    @in_quantifier = false
    @token[0] = :S if @token[0] == :NCNAME && @token[1] == 's'
  end

  if @token[0] == :LB
    if preceding.nil? || preceding == ' '
      @token[0] = :LLB
      @in_directive = true
    end
  elsif @in_directive && @token[0] == :NCNAME
    @token[0] = @token[1].upcase.to_sym
    @in_directive = false
  end

  @token
end

# Moves the cursor past any run of whitespace and (: ... :) comments.
def skip_ignorable
  while @curpos < @source.length
    if @source[@curpos, 2] == '(:'
      skip_comment
    elsif @source[@curpos, 1] =~ /\s/
      advance_over_char
    else
      break
    end
  end
end

# Skips one comment starting at the cursor, honouring nested (: ... :) pairs.
# An unterminated comment runs to the end of the input.
def skip_comment
  depth = 0
  while @curpos < @source.length
    pair = @source[@curpos, 2]
    if pair == '(:'
      depth += 1
      @curpos += 2
      @col += 2
    elsif pair == ':)'
      depth -= 1
      @curpos += 2
      @col += 2
      return if depth.zero?
    else
      advance_over_char
    end
  end
end

# Steps over a single character, keeping @line and @col in step.
def advance_over_char
  if @source[@curpos, 1] == "\n"
    @line += 1
    @col = 0
  else
    @col += 1
  end
  @curpos += 1
end

# Returns the MatchData of +pattern+ against +text+ only when the match begins
# at the very first character; a match found further along is not a token at
# the cursor and is rejected.
def match_at_start(pattern, text)
  found = pattern.match(text)
  found if found && found.begin(0).zero?
end

private :skip_ignorable, :skip_comment, :advance_over_char, :match_at_start
|
data/tokens.racc
ADDED
@@ -0,0 +1,257 @@
|
|
1
|
+
class Fabulator::Grammar::TokenParser
|
2
|
+
|
3
|
+
start rules
|
4
|
+
|
5
|
+
rule
|
6
|
+
rules: anchored_rule { result = Fabulator::Grammar::Expr::Token.new; result.add_alternative(val[0]) }
|
7
|
+
| rules PIPE anchored_rule { result = val[0]; result.add_alternative(val[2]) }
|
8
|
+
|
9
|
+
anchored_rule: rule { result = val[0] }
|
10
|
+
| left_anchor rule { result = val[1]; result.anchor_start(val[0]) }
|
11
|
+
| rule right_anchor { result = val[0]; result.anchor_end(val[1]) }
|
12
|
+
| left_anchor rule right_anchor { result = val[1]; result.anchor_start(val[0]); result.anchor_end(val[2]) }
|
13
|
+
|
14
|
+
left_anchor: CARET { result = '^' }
|
15
|
+
| CARET CARET { result = '^^' }
|
16
|
+
|
17
|
+
right_anchor: DOLLAR { result = '$' }
|
18
|
+
| DOLLAR DOLLAR { result = '$$' }
|
19
|
+
|
20
|
+
rule: { result = Fabulator::Grammar::Expr::TokenAlternative.new; }
|
21
|
+
| rule sequence { result = val[0]; result.add_sequence(val[1]); }
|
22
|
+
|
23
|
+
sequence: atom sequence_qualifiers { result = Fabulator::Grammar::Expr::Sequence.new(nil, val[0], val[1]) }
|
24
|
+
| atom { result = Fabulator::Grammar::Expr::Sequence.new(nil, val[0]) }
|
25
|
+
|
26
|
+
atom: text { result = Fabulator::Grammar::Expr::Text.new(val[0]) }
|
27
|
+
| DOT { result = Fabulator::Grammar::Expr::Any.new }
|
28
|
+
| LP rules RP { result = val[1] }
|
29
|
+
| LB atom_expr RB { result = val[1] }
|
30
|
+
|
31
|
+
atom_expr: char_set_expr
|
32
|
+
|
33
|
+
#{ result = Fabulator::Grammar::Expr::CharSet.new; result.universal }
|
34
|
+
char_set_expr: char_set { result = val[0] }
|
35
|
+
| MINUS char_set { result = Fabulator::Grammar::Expr::CharSet.new; result.universal; result.but_not(val[1]) }
|
36
|
+
| char_set_expr PLUS char_set { result = val[0].or(val[2]) }
|
37
|
+
| char_set_expr MINUS char_set { result = val[0].but_not(val[2]) }
|
38
|
+
|
39
|
+
char_set: LB char_set_text RB { result = Fabulator::Grammar::Expr::CharSet.new(val[1]) }
|
40
|
+
| COLON NCNAME COLON { result = Fabulator::Grammar::Expr::CharClass.new(val[1]) }
|
41
|
+
| LP char_set_expr RP { result = val[1] }
|
42
|
+
|
43
|
+
char_set_text: { result = '' }
|
44
|
+
| char_set_text CHAR_TEXT { result = val[0] + val[1] }
|
45
|
+
| char_set_text MINUS CHAR_TEXT { result = val[0] + '-' + val[2] }
|
46
|
+
| char_set_text PLUS CHAR_TEXT { result = val[0] + '+' + val[2] }
|
47
|
+
|
48
|
+
text: qname { result = val[0] }
|
49
|
+
| TEXT { result = val[0] }
|
50
|
+
| INTEGER { result = val[0] }
|
51
|
+
| COMMA { result = val[0] }
|
52
|
+
|
53
|
+
qname: NCNAME { result = val[0] }
|
54
|
+
| NCNAME COLON NCNAME { result = val[0] + ':' + val[2] }
|
55
|
+
|
56
|
+
sequence_qualifiers: STAR { result = [ :zero_or_more ] }
|
57
|
+
| STAR QUESTION { result = [ :zero_or_more, :min ] }
|
58
|
+
| PLUS { result = [ :one_or_more ] }
|
59
|
+
| PLUS QUESTION { result = [ :one_or_more, :min ] }
|
60
|
+
| QUESTION { result = [ :zero_or_one ] }
|
61
|
+
| QUESTION QUESTION { result = [ :zero_or_one, :min ] }
|
62
|
+
| LLB INTEGER RB { result = [ :exact, val[1].to_i ] }
|
63
|
+
| LLB INTEGER COMMA INTEGER RB { result = [ :range, val[1].to_i, val[3].to_i ] }
|
64
|
+
| LLB INTEGER COMMA RB { result = [ :range, val[1], '' ] }
|
65
|
+
| LLB INTEGER COMMA RB QUESTION { result = [ :min, :range, val[1], '' ] }
|
66
|
+
| LLB INTEGER COMMA INTEGER RB QUESTION { result = [ :min, :range, val[1].to_i, val[3].to_i ] }
|
67
|
+
|
68
|
+
|
69
|
+
---- inner
|
70
|
+
require 'fabulator/grammar'
|
71
|
+
|
72
|
+
# Parses the token expression +t+: resets the lexer's cursor, position
# counters and bracket depth, then hands control to do_parse and returns its
# result.
def parse(t)
  @source = t
  @curpos = @col = @line = 0
  @yydebug = true
  @last_token = nil
  @brackets = 0
  do_parse
end
|
86
|
+
|
87
|
+
# Error hook: turns a parse failure into a Fabulator::Grammar::ParserError
# whose message quotes the second hook argument and the lexer position
# (1-based line, column as tracked by next_token).
def on_error(*args)
  failure = Fabulator::Grammar::ParserError.new(
    "unable to parse '#{args[1]}' near line #{@line + 1}, column #{@col}"
  )
  raise failure
end
|
90
|
+
|
91
|
+
# Operator characters of the token language, mapped to the terminal names
# returned by next_token. The commented-out entries are currently disabled.
@@ops = {
  #'[{' => :LB_LC,
  #'}]' => :RC_RB,
  #'[[' => :LB_LB,
  #']]' => :RB_RB,
  '[' => :LB,
  ']' => :RB,
  '(' => :LP,
  ')' => :RP,
  #'{' => :LC,
  #'}' => :RC,
  #'#' => :HASH,
  '$' => :DOLLAR,
  '^' => :CARET,
  #'&' => :AND,
  '*' => :STAR,
  '+' => :PLUS,
  '-' => :MINUS,
  '?' => :QUESTION,
  '.' => :DOT,
  '|' => :PIPE,
  ',' => :COMMA,
  ':' => :COLON
}

# Lexer patterns.
#
# Patterns matched against "the rest of the input" are anchored with \A rather
# than ^: ^ also matches right after any newline and Regexp#match scans
# forward, so with ^ a multi-line source could yield an operator that lies
# somewhere after the cursor instead of at it.
@@regex = {
  # Longest operators first, so a future multi-character operator would win
  # over its own prefix.
  :simple_tokens => %r{\A(#{Regexp.union(@@ops.keys.sort_by { |op| -op.length })})},
  :ncname => %r{(?:[a-zA-Z_][-a-zA-Z0-9_.]*)}
}

#puts @@regex[:simple_tokens]

@@regex[:qname] = %r{((?:#{@@regex[:ncname]}:)?#{@@regex[:ncname]})}
# Capture groups: 1 = qname, 2 = operator. Grouped so the anchor covers both
# alternatives. Not referenced by the lexer code visible in this file.
@@regex[:general] = %r{\A(?:#{@@regex[:qname]}|#{@@regex[:simple_tokens]})}
|
126
|
+
|
127
|
+
# Lexer hook used by the parser: returns the next token of @source as
# [type, text], or [false, false] once the input is exhausted.
#
# Whitespace and (: ... :) comments are skipped first. Operators listed in
# @@ops are recognised next; anything else is lexed as a run of literal text
# (see lex_text) and reported as :INTEGER, :NCNAME or :TEXT.
#
# Bracket context is tracked in @brackets:
# * '[' seen at depth 0 and followed by optional whitespace and a digit is
#   reported as :LLB (start of a counted repetition);
# * while more than one bracket is open, every token other than '[' and ']'
#   is reported as :CHAR_TEXT.
#
# Raises Fabulator::Grammar::ParserError when the character at the cursor
# cannot start any token (previously this surfaced as a NoMethodError).
def next_token
  @token = nil
  skip_ignorable

  if @curpos >= @source.length
    @last_token = nil
    return [false, false]
  end

  rest = @source[@curpos..-1]
  op = @@regex[:simple_tokens].match(rest)
  # Only accept an operator that starts exactly at the cursor.
  if op && op.begin(0).zero?
    @token = [@@ops[op[1]], op[1]]
    consumed = op[1].length
  else
    consumed = lex_text(rest)
  end

  @curpos += consumed
  @col += consumed

  case @token[0]
  when :LB
    @token[0] = :LLB if @brackets == 0 && @source[@curpos..-1] =~ /\A\s*\d/
    @brackets += 1
  when :RB
    @brackets -= 1
  else
    @token[0] = :CHAR_TEXT if @brackets > 1
  end

  @token
end

# Lexes a run of literal characters from the start of +rest+, sets @token and
# returns the number of source characters consumed.
#
# A backslash escapes the character after it; the backslash is dropped from
# the token text but still counted as consumed. When the run is directly
# followed by one of * ? + {, its last character (together with its escaping
# backslash, if any) is left in the input so it can be lexed on its own right
# before the qualifier, and the remainder is reported as :TEXT. A run of a
# single character is reported whole.
#
# NOTE(review): '[' is not part of that look-ahead, so a counted repetition
# written with brackets follows the whole run -- confirm this is intended.
def lex_text(rest)
  raw = rest[/\A(?:\\.|[^ \$\^\[\]\{\}\(\):,|*+.?])+/]
  if raw.nil?
    raise Fabulator::Grammar::ParserError.new("unable to parse '#{rest[0, 1]}' near line #{@line + 1}, column #{@col}")
  end

  # One unit per logical character: either an escape pair or a plain character.
  units = raw.scan(/\\.|./m)
  qualified = rest[raw.length, 1] =~ /[*?+\{]/
  units.pop if qualified && units.size > 1

  text = units.map { |unit| unit.length == 2 ? unit[1, 1] : unit }.join
  type =
    if qualified
      :TEXT
    elsif text =~ /\A\d+\z/
      :INTEGER
    elsif text =~ /\A#{@@regex[:ncname]}\z/
      :NCNAME
    else
      :TEXT
    end

  @token = [type, text]
  units.join.length
end

# Moves the cursor past any run of whitespace and (: ... :) comments.
def skip_ignorable
  while @curpos < @source.length
    if @source[@curpos, 2] == '(:'
      skip_comment
    elsif @source[@curpos, 1] =~ /\s/
      advance_over_char
    else
      break
    end
  end
end

# Skips one comment starting at the cursor, honouring nested (: ... :) pairs.
# An unterminated comment runs to the end of the input.
def skip_comment
  depth = 0
  while @curpos < @source.length
    pair = @source[@curpos, 2]
    if pair == '(:'
      depth += 1
      @curpos += 2
      @col += 2
    elsif pair == ':)'
      depth -= 1
      @curpos += 2
      @col += 2
      return if depth.zero?
    else
      advance_over_char
    end
  end
end

# Steps over a single character, keeping @line and @col in step.
def advance_over_char
  if @source[@curpos, 1] == "\n"
    @line += 1
    @col = 0
  else
    @col += 1
  end
  @curpos += 1
end

private :lex_text, :skip_ignorable, :skip_comment, :advance_over_char
|