fabulator-grammar 0.0.1 → 0.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/History.txt +22 -0
- data/Rakefile +3 -1
- data/VERSION +1 -1
- data/features/grammar.feature +116 -12
- data/features/step_definitions/expression_steps.rb +2 -2
- data/features/step_definitions/grammar_steps.rb +46 -2
- data/features/step_definitions/xml_steps.rb +5 -16
- data/features/support/env.rb +1 -0
- data/lib/fabulator-grammar.rb +1 -0
- data/lib/fabulator/grammar.rb +12 -3
- data/lib/fabulator/grammar/actions.rb +17 -7
- data/lib/fabulator/grammar/actions/context.rb +18 -0
- data/lib/fabulator/grammar/actions/grammar.rb +76 -0
- data/lib/fabulator/grammar/actions/rule.rb +51 -0
- data/lib/fabulator/grammar/actions/token.rb +27 -0
- data/lib/fabulator/grammar/actions/when.rb +35 -0
- data/lib/fabulator/grammar/cursor.rb +118 -0
- data/lib/fabulator/grammar/expr/anchor.rb +28 -0
- data/lib/fabulator/grammar/expr/char_set.rb +67 -18
- data/lib/fabulator/grammar/expr/look_ahead.rb +44 -0
- data/lib/fabulator/grammar/expr/rule.rb +33 -28
- data/lib/fabulator/grammar/expr/rule_alternative.rb +45 -0
- data/lib/fabulator/grammar/expr/rule_mode.rb +16 -0
- data/lib/fabulator/grammar/expr/rule_ref.rb +15 -4
- data/lib/fabulator/grammar/expr/rule_sequence.rb +59 -0
- data/lib/fabulator/grammar/expr/sequence.rb +7 -1
- data/lib/fabulator/grammar/expr/set_skip.rb +16 -0
- data/lib/fabulator/grammar/expr/text.rb +8 -0
- data/lib/fabulator/grammar/expr/{rules.rb → token.rb} +12 -1
- data/lib/fabulator/grammar/expr/token_alternative.rb +42 -0
- data/lib/fabulator/grammar/rule_parser.rb +667 -0
- data/lib/fabulator/grammar/token_parser.rb +733 -0
- data/rules.racc +249 -0
- data/tokens.racc +257 -0
- metadata +29 -12
- data/lib/fabulator/grammar/parser.rb +0 -548
- data/regex.racc +0 -183
data/rules.racc
ADDED
@@ -0,0 +1,249 @@
|
|
1
|
+
# Racc grammar for rule bodies. Tokens are supplied by next_token in the
# "---- inner" section of this file.
class Fabulator::Grammar::RuleParser
|
2
|
+
|
3
|
+
start rules
|
4
|
+
|
5
|
+
rule
|
6
|
+
|
7
|
+
# A rule body is one or more alternatives separated by PIPE; they are
# collected into a single Expr::Rule via add_alternative.
rules: rule { result = Fabulator::Grammar::Expr::Rule.new; result.add_alternative(val[0]) }
|
8
|
+
| rules PIPE rule { result = val[0]; result.add_alternative(val[2]) }
|
9
|
+
|
10
|
+
# An alternative starts out as an empty RuleAlternative and receives one
# add_sequence call per rule_bit.
rule: { result = Fabulator::Grammar::Expr::RuleAlternative.new; }
|
11
|
+
| rule rule_bit { result = val[0]; result.add_sequence(val[1]) }
|
12
|
+
|
13
|
+
rule_bit: directives
|
14
|
+
| sequence
|
15
|
+
|
16
|
+
# Consecutive directives are gathered into a plain Ruby array.
directives: directive { result = [ val[0] ] }
|
17
|
+
| directives directive { result = val[0] + [ val[1] ] }
|
18
|
+
|
19
|
+
# NOTE(review): the COMMIT, UNCOMMIT and REJECT alternatives have no action
# block, so they yield racc's default value (val[0]) instead of an Expr
# object -- confirm this is intended.
directive: LLB MODE NCNAME RB { result = Fabulator::Grammar::Expr::RuleMode.new(val[2]) }
|
20
|
+
| LLB COMMIT RB
|
21
|
+
| LLB UNCOMMIT RB
|
22
|
+
| LLB REJECT RB
|
23
|
+
| LLB SKIP opt_separator RB { result = Fabulator::Grammar::Expr::SetSkip.new(val[2]) }
|
24
|
+
| LLB RESYNC opt_separator RB { result = Fabulator::Grammar::Expr::Resync.new(val[2]) }
|
25
|
+
| DOT_DOT_DOT sequence { result = Fabulator::Grammar::Expr::LookAhead.new(val[1]) }
|
26
|
+
| DOT_DOT_DOT_BANG sequence { result = Fabulator::Grammar::Expr::NegLookAhead.new(val[1]) }
|
27
|
+
| CARET { result = Fabulator::Grammar::Expr::Anchor.new(:start_of_line) }
|
28
|
+
| CARET_CARET { result = Fabulator::Grammar::Expr::Anchor.new(:start_of_string) }
|
29
|
+
| DOLLAR { result = Fabulator::Grammar::Expr::Anchor.new(:end_of_line) }
|
30
|
+
| DOLLAR_DOLLAR { result = Fabulator::Grammar::Expr::Anchor.new(:end_of_string) }
|
31
|
+
|
32
|
+
# RuleSequence.new receives, in order: the hypothetical name (nil when there
# is no "NCNAME :=" prefix), the atom, and -- only when present -- the
# qualifier array built by sequence_qualifiers.
sequence: atom sequence_qualifiers { result = Fabulator::Grammar::Expr::RuleSequence.new(nil, val[0], val[1]) }
|
33
|
+
| atom { result = Fabulator::Grammar::Expr::RuleSequence.new(nil, val[0]) }
|
34
|
+
| hypothetical atom sequence_qualifiers { result = Fabulator::Grammar::Expr::RuleSequence.new(val[0], val[1], val[2]) }
|
35
|
+
| hypothetical atom { result = Fabulator::Grammar::Expr::RuleSequence.new(val[0], val[1]) }
|
36
|
+
|
37
|
+
hypothetical: NCNAME COLON_EQUAL { result = val[0] }
|
38
|
+
|
39
|
+
# /\((\?|s|s\?|\d+(\.\.(\d+)?)?|\.\.\d+)(\s+ncname)?\)/
|
40
|
+
# and with no prior space -- and ncname should be specifiable as
|
41
|
+
# a text constant
|
42
|
+
# the ncname specifies a token that separates instances
|
43
|
+
# shorthand: if we see '(s[ )]', '(s?[ )]', '(?)', '(\d', then we have
|
44
|
+
# something for the following instead of LP rules RP
|
45
|
+
# Each alternative yields an array whose first element names the repetition
# kind (:'?', :s, :'s?', :count, :range, :upto, :atleast); any bounds follow
# (as the INTEGER token values), then the optional separator atom.
sequence_qualifiers: LLP QUESTION RP { result = [ '?'.to_sym ] }
|
46
|
+
| LLP S opt_separator RP { result = [ :s, val[2] ] }
|
47
|
+
| LLP S QUESTION opt_separator RP { result = [ 's?'.to_sym, val[3] ] }
|
48
|
+
| LLP INTEGER opt_separator RP { result = [ :count, val[1], val[2] ] }
|
49
|
+
| LLP INTEGER DOT_DOT INTEGER opt_separator RP { result = [ :range, [ val[1], val[3] ], val[4] ] }
|
50
|
+
| LLP DOT_DOT INTEGER opt_separator RP { result = [ :upto, val[2], val[3] ] }
|
51
|
+
| LLP INTEGER DOT_DOT opt_separator RP { result = [ :atleast, val[1],val[3] ] }
|
52
|
+
|
53
|
+
opt_separator:
|
54
|
+
| atom
|
55
|
+
|
56
|
+
# NOTE(review): opt_params is parsed for a rule reference, but its value is
# not passed to RuleRef.new -- only the NCNAME is.
atom: LITERAL { result = Fabulator::Grammar::Expr::Text.new(val[0]) }
|
57
|
+
| LP rules RP { result = val[1] }
|
58
|
+
| NCNAME opt_params { result = Fabulator::Grammar::Expr::RuleRef.new(val[0]) }
|
59
|
+
|
60
|
+
# The parameter productions below carry no action blocks.
opt_params:
|
61
|
+
| LB params RB
|
62
|
+
|
63
|
+
params: param
|
64
|
+
| params COMMA param
|
65
|
+
| params COMMA
|
66
|
+
|
67
|
+
param: hypothetical relative_path
|
68
|
+
| hypothetical LITERAL
|
69
|
+
|
70
|
+
relative_path: NCNAME
|
71
|
+
| NCNAME SLASH NCNAME
|
72
|
+
|
73
|
+
|
74
|
+
---- inner
|
75
|
+
require 'fabulator/grammar'
|
76
|
+
|
77
|
+
# Parses the rule source +t+ and returns whatever do_parse (the
# racc-generated driver) returns.
#
# All lexer state is reset first so that a parser instance can be reused,
# including after an earlier parse aborted half-way through.
def parse(t)
  @source = t
  @curpos = 0
  @line = 0
  @col = 0

  # Context flags maintained by next_token. @in_directive used to be left
  # untouched here, so a parse that failed right after a directive '[' made
  # the next parse misread its first name as a directive keyword.
  @in_quantifier = false
  @in_directive = false

  # Only has an effect when the parser was generated with racc's debug option.
  @yydebug = true

  @last_token = nil

  do_parse
end
|
91
|
+
|
92
|
+
# Error hook invoked by the racc driver. The second argument is reported as
# the offending text, together with the lexer's current line (shown 1-based)
# and column counter.
#
# Always raises Fabulator::Grammar::ParserError.
def on_error(*args)
  offending = args[1]
  message = "unable to parse '#{offending}' near line #{@line + 1}, column #{@col}"
  raise Fabulator::Grammar::ParserError.new(message)
end
|
95
|
+
|
96
|
+
# Operator/punctuation lexemes mapped to the token symbol reported to the
# grammar. The keys are also the alternatives of @@regex[:simple_tokens].
# NOTE(review): LC, RC, DOT and COLON are produced here, but no production in
# this file's grammar section consumes them.
@@ops = {
|
97
|
+
':=' => :COLON_EQUAL,
|
98
|
+
'[' => :LB,
|
99
|
+
']' => :RB,
|
100
|
+
'(' => :LP,
|
101
|
+
')' => :RP,
|
102
|
+
'{' => :LC,
|
103
|
+
'}' => :RC,
|
104
|
+
'?' => :QUESTION,
|
105
|
+
'.' => :DOT,
|
106
|
+
'..' => :DOT_DOT,
|
107
|
+
'...'=> :DOT_DOT_DOT,
|
108
|
+
'...!'=> :DOT_DOT_DOT_BANG,
|
109
|
+
'|' => :PIPE,
|
110
|
+
',' => :COMMA,
|
111
|
+
':' => :COLON,
|
112
|
+
'^' => :CARET,
|
113
|
+
'^^' => :CARET_CARET,
|
114
|
+
'$' => :DOLLAR,
|
115
|
+
'$$' => :DOLLAR_DOLLAR,
|
116
|
+
'/' => :SLASH,
|
117
|
+
}
|
118
|
+
|
119
|
+
# Lexer patterns used by next_token.
#
# :simple_tokens  any key of @@ops, longest first so that '...!' wins over
#                 '...', '..' and '.'; capture 1 is the operator text.
# :ncname         a name: letter or underscore, then letters, digits, '-',
#                 '_' or '.'.
# :integer        a run of digits (capture 1).
# :literal        a single- or double-quoted string with backslash escapes
#                 (capture 1 includes the quotes).
#
# Every pattern applied to the remaining input is anchored with \A: '^' also
# matches after any newline inside the input, which would let a token further
# down the text be reported at the current position.
@@regex = {
  :simple_tokens => %r{\A(#{Regexp.union(@@ops.keys.sort_by { |op| op.length }.reverse)})},
  :ncname => %r{(?:[a-zA-Z_][-a-zA-Z0-9_.]*)},
  :integer => %r{(\d+)},
  :literal => %r{((?:"(?:[^\\"]*(?:\\.[^\\"]*)*)")|(?:'(?:[^\\']*(?:\\.[^\\']*)*)'))},
}

# Captures: 1 = ncname, 2 = integer, 3 = literal. The alternation is wrapped
# in a group so that the anchor applies to all three alternatives rather than
# to the ncname alternative only.
@@regex[:general] = %r{\A(?:(#{@@regex[:ncname]})|#{@@regex[:integer]}|#{@@regex[:literal]})}
|
127
|
+
|
128
|
+
# Lexer callback for the racc driver: returns the next [type, text] pair read
# from @source, or [ false, false ] once the input is exhausted.
#
# Whitespace and one (: ... :) comment are skipped first. Operators listed in
# @@ops are tried before names, integers and quoted literals. Afterwards the
# token type is adjusted by context:
#   * '(' directly preceded by something other than ' ' or '(' becomes LLP
#     (opens a repetition qualifier), and an 's' right after it becomes S;
#   * '[' at the very start of the input or preceded by a space becomes LLB
#     (opens a directive), and the next name is turned into a keyword token
#     by upcasing it (e.g. 'mode' becomes :MODE).
#
# Raises RuntimeError when the text at the cursor is not a recognisable token.
def next_token
  @token = nil

  skip_whitespace
  skip_comment
  skip_whitespace

  if @curpos >= @source.length
    @last_token = nil
    return [ false, false ]
  end

  rest = @source[@curpos..@source.length-1]

  # A match is only usable when it starts at the cursor; a match further into
  # the text would be consumed from the wrong position.
  res = @@regex[:simple_tokens].match(rest)
  if !res.nil? && res.begin(0) == 0 && !res[1].nil?
    @token = [ @@ops[res[1]], res[1] ]
  end

  if @token.nil?
    res = @@regex[:general].match(rest)
    if res.nil? || res.begin(0) != 0
      raise "Failed to parse '#{@source}' at #{@curpos}': #{@source[@curpos..@source.length-1]}"
    end
    # capture 1 = ncname, 2 = integer, 3 = quoted literal
    if !res[1].nil?
      @token = [:NCNAME, res[1].to_s]
    elsif !res[2].nil?
      @token = [:INTEGER, res[2].to_s]
    elsif !res[3].nil?
      # The token value drops the surrounding quotes, so account for the two
      # quote characters here; the rest is added with the value's length below.
      @token = [:LITERAL, res[3].to_s[1..-2]]
      @col += 2
      @curpos += 2
    end
  end

  if @token.nil?
    puts "Uh oh... we don't know what to do: #{@source[@curpos .. @source.length-1]}"
    return [ nil, nil ]
  end

  @curpos += @token[1].length
  @col += @token[1].length

  if @token[0] == :LP
    # shorthand: '(' glued to the preceding atom starts a qualifier such as
    # '(s)', '(s?)', '(?)' or '(3..5)' rather than a parenthesised sub-rule
    if @curpos > 1 && ![' ', '('].include?(@source[@curpos-2 .. @curpos-2])
      @token[0] = :LLP
      @in_quantifier = true
    end
  elsif @in_quantifier
    # only the token immediately after LLP can be the 's' keyword
    @in_quantifier = false
    @token[0] = :S if @token[0] == :NCNAME && @token[1] == 's'
  end

  if @token[0] == :LB
    if @curpos == 1 || @source[@curpos-2 .. @curpos - 2] == ' '
      @token[0] = :LLB
      @in_directive = true
    end
  elsif @in_directive && @token[0] == :NCNAME
    @token[0] = @token[1].upcase.to_sym
    @in_directive = false
  end

  # puts "token: #{@token.join(' => ')}"
  return @token
end

# Moves @curpos past a run of whitespace, keeping @line and @col in step.
def skip_whitespace
  while @curpos < @source.length && @source[@curpos..@curpos] =~ /\s/
    if @source[@curpos..@curpos] =~ /\n/
      @line += 1
      @col = 0
    else
      @col += 1
    end
    @curpos += 1
  end
end

# Skips one XPath 2.0 style comment, "(: ... :)", starting at the cursor.
# Comments may be nested; an unterminated comment runs to the end of input.
def skip_comment
  return unless @curpos < @source.length && @source[@curpos..@curpos+1] == '(:'

  comment_depth = 1
  @curpos += 2
  @col += 2
  while comment_depth > 0 && @curpos < @source.length
    if @source[@curpos..@curpos+1] == '(:'
      comment_depth += 1
      @curpos += 1
      @col += 1
    end
    if @source[@curpos..@curpos+1] == ':)'
      comment_depth -= 1
      @curpos += 1
      @col += 1
    end
    # Keep line numbers right for errors reported after a multi-line comment.
    if @source[@curpos..@curpos] == "\n"
      @line += 1
      @col = 0
    else
      @col += 1
    end
    @curpos += 1
  end
end

private :skip_whitespace, :skip_comment
|
data/tokens.racc
ADDED
@@ -0,0 +1,257 @@
|
|
1
|
+
# Racc grammar for token (regex-like) definitions. Tokens are supplied by
# next_token in the "---- inner" section of this file.
class Fabulator::Grammar::TokenParser
|
2
|
+
|
3
|
+
start rules
|
4
|
+
|
5
|
+
rule
|
6
|
+
# A token body is one or more alternatives separated by PIPE; they are
# collected into a single Expr::Token via add_alternative.
rules: anchored_rule { result = Fabulator::Grammar::Expr::Token.new; result.add_alternative(val[0]) }
|
7
|
+
| rules PIPE anchored_rule { result = val[0]; result.add_alternative(val[2]) }
|
8
|
+
|
9
|
+
# An alternative may carry a leading and/or trailing anchor; the anchor text
# is handed to anchor_start / anchor_end on the alternative.
anchored_rule: rule { result = val[0] }
|
10
|
+
| left_anchor rule { result = val[1]; result.anchor_start(val[0]) }
|
11
|
+
| rule right_anchor { result = val[0]; result.anchor_end(val[1]) }
|
12
|
+
| left_anchor rule right_anchor { result = val[1]; result.anchor_start(val[0]); result.anchor_end(val[2]) }
|
13
|
+
|
14
|
+
# Doubled anchors arrive as two separate CARET / DOLLAR tokens and are
# reported as the strings '^^' and '$$'.
left_anchor: CARET { result = '^' }
|
15
|
+
| CARET CARET { result = '^^' }
|
16
|
+
|
17
|
+
right_anchor: DOLLAR { result = '$' }
|
18
|
+
| DOLLAR DOLLAR { result = '$$' }
|
19
|
+
|
20
|
+
# An alternative starts out as an empty TokenAlternative and receives one
# add_sequence call per sequence.
rule: { result = Fabulator::Grammar::Expr::TokenAlternative.new; }
|
21
|
+
| rule sequence { result = val[0]; result.add_sequence(val[1]); }
|
22
|
+
|
23
|
+
# Sequence.new receives nil, the atom, and -- only when present -- the
# qualifier array built by sequence_qualifiers.
sequence: atom sequence_qualifiers { result = Fabulator::Grammar::Expr::Sequence.new(nil, val[0], val[1]) }
|
24
|
+
| atom { result = Fabulator::Grammar::Expr::Sequence.new(nil, val[0]) }
|
25
|
+
|
26
|
+
atom: text { result = Fabulator::Grammar::Expr::Text.new(val[0]) }
|
27
|
+
| DOT { result = Fabulator::Grammar::Expr::Any.new }
|
28
|
+
| LP rules RP { result = val[1] }
|
29
|
+
| LB atom_expr RB { result = val[1] }
|
30
|
+
|
31
|
+
atom_expr: char_set_expr
|
32
|
+
|
33
|
+
#{ result = Fabulator::Grammar::Expr::CharSet.new; result.universal }
|
34
|
+
# Character-set algebra: a leading MINUS starts from a CharSet on which
# universal has been called and removes the operand with but_not; PLUS
# combines operands with or; infix MINUS uses but_not.
char_set_expr: char_set { result = val[0] }
|
35
|
+
| MINUS char_set { result = Fabulator::Grammar::Expr::CharSet.new; result.universal; result.but_not(val[1]) }
|
36
|
+
| char_set_expr PLUS char_set { result = val[0].or(val[2]) }
|
37
|
+
| char_set_expr MINUS char_set { result = val[0].but_not(val[2]) }
|
38
|
+
|
39
|
+
char_set: LB char_set_text RB { result = Fabulator::Grammar::Expr::CharSet.new(val[1]) }
|
40
|
+
| COLON NCNAME COLON { result = Fabulator::Grammar::Expr::CharClass.new(val[1]) }
|
41
|
+
| LP char_set_expr RP { result = val[1] }
|
42
|
+
|
43
|
+
# The text between the inner brackets is concatenated back into one string,
# with '-' and '+' re-inserted where they were tokenised separately.
char_set_text: { result = '' }
|
44
|
+
| char_set_text CHAR_TEXT { result = val[0] + val[1] }
|
45
|
+
| char_set_text MINUS CHAR_TEXT { result = val[0] + '-' + val[2] }
|
46
|
+
| char_set_text PLUS CHAR_TEXT { result = val[0] + '+' + val[2] }
|
47
|
+
|
48
|
+
text: qname { result = val[0] }
|
49
|
+
| TEXT { result = val[0] }
|
50
|
+
| INTEGER { result = val[0] }
|
51
|
+
| COMMA { result = val[0] }
|
52
|
+
|
53
|
+
qname: NCNAME { result = val[0] }
|
54
|
+
| NCNAME COLON NCNAME { result = val[0] + ':' + val[2] }
|
55
|
+
|
56
|
+
# Repetition suffixes. A trailing QUESTION adds :min to the result array.
# NOTE(review): the open-ended forms (LLB INTEGER COMMA RB) pass val[1]
# through as the token string and use '' for the missing upper bound, while
# the other numeric forms call to_i -- confirm the consumer accepts both.
sequence_qualifiers: STAR { result = [ :zero_or_more ] }
|
57
|
+
| STAR QUESTION { result = [ :zero_or_more, :min ] }
|
58
|
+
| PLUS { result = [ :one_or_more ] }
|
59
|
+
| PLUS QUESTION { result = [ :one_or_more, :min ] }
|
60
|
+
| QUESTION { result = [ :zero_or_one ] }
|
61
|
+
| QUESTION QUESTION { result = [ :zero_or_one, :min ] }
|
62
|
+
| LLB INTEGER RB { result = [ :exact, val[1].to_i ] }
|
63
|
+
| LLB INTEGER COMMA INTEGER RB { result = [ :range, val[1].to_i, val[3].to_i ] }
|
64
|
+
| LLB INTEGER COMMA RB { result = [ :range, val[1], '' ] }
|
65
|
+
| LLB INTEGER COMMA RB QUESTION { result = [ :min, :range, val[1], '' ] }
|
66
|
+
| LLB INTEGER COMMA INTEGER RB QUESTION { result = [ :min, :range, val[1].to_i, val[3].to_i ] }
|
67
|
+
|
68
|
+
|
69
|
+
---- inner
|
70
|
+
require 'fabulator/grammar'
|
71
|
+
|
72
|
+
# Parses the token definition +t+ and returns whatever do_parse (the
# racc-generated driver) returns. All lexer state is reset before the run.
def parse(t)
  @source = t
  @last_token = nil

  # cursor, column, line and bracket nesting depth all start at zero
  @curpos = @col = @line = @brackets = 0

  @yydebug = true

  do_parse
end
|
86
|
+
|
87
|
+
# Error hook invoked by the racc driver. Reports the second argument as the
# text that could not be parsed, with the lexer's current line (shown
# 1-based) and column counter.
#
# Always raises Fabulator::Grammar::ParserError.
def on_error(*args)
  unparsed = args[1]
  where = "line #{@line + 1}, column #{@col}"
  raise Fabulator::Grammar::ParserError.new("unable to parse '#{unparsed}' near #{where}")
end
|
90
|
+
|
91
|
+
# Single-character operator lexemes mapped to the token symbol reported to
# the grammar. The keys are also the alternatives of @@regex[:simple_tokens].
# Entries that are commented out are not recognised as operators.
@@ops = {
|
92
|
+
#'[{' => :LB_LC,
|
93
|
+
#'}]' => :RC_RB,
|
94
|
+
#'[[' => :LB_LB,
|
95
|
+
#']]' => :RB_RB,
|
96
|
+
'[' => :LB,
|
97
|
+
']' => :RB,
|
98
|
+
'(' => :LP,
|
99
|
+
')' => :RP,
|
100
|
+
#'{' => :LC,
|
101
|
+
#'}' => :RC,
|
102
|
+
#'#' => :HASH,
|
103
|
+
'$' => :DOLLAR,
|
104
|
+
'^' => :CARET,
|
105
|
+
#'&' => :AND,
|
106
|
+
'*' => :STAR,
|
107
|
+
'+' => :PLUS,
|
108
|
+
'-' => :MINUS,
|
109
|
+
'?' => :QUESTION,
|
110
|
+
'.' => :DOT,
|
111
|
+
'|' => :PIPE,
|
112
|
+
',' => :COMMA,
|
113
|
+
':' => :COLON
|
114
|
+
}
|
115
|
+
|
116
|
+
|
117
|
+
# Lexer patterns used by next_token.
#
# :simple_tokens  any key of @@ops (longest first); capture 1 is the
#                 operator text.
# :ncname         a name: letter or underscore, then letters, digits, '-',
#                 '_' or '.'.
# :qname          an ncname with an optional "prefix:" (capture 1).
# :general        a qname (capture 1) or an operator (capture 2).
#
# Patterns applied to the remaining input are anchored with \A: '^' also
# matches after any newline inside the input, which would let a token further
# down the text be reported at the current position.
@@regex = {
  :simple_tokens => %r{\A(#{Regexp.union(@@ops.keys.sort_by { |op| op.length }.reverse)})},
  :ncname => %r{(?:[a-zA-Z_][-a-zA-Z0-9_.]*)}
}

@@regex[:qname] = %r{((?:#{@@regex[:ncname]}:)?#{@@regex[:ncname]})}

# The alternation is grouped so that the anchor covers both alternatives.
@@regex[:general] = %r{\A(?:#{@@regex[:qname]}|#{@@regex[:simple_tokens]})}
|
126
|
+
|
127
|
+
# Lexer callback for the racc driver: returns the next [type, text] pair read
# from @source, or [ false, false ] once the input is exhausted.
#
# Whitespace and one (: ... :) comment are skipped first. A character listed
# in @@ops becomes its operator token. Anything else is read as the longest
# run of ordinary characters, where a backslash escapes the character after
# it. The run is reported as INTEGER (all digits), NCNAME (name-shaped) or
# TEXT. When the run is directly followed by '*', '?', '+' or '{', it is
# always TEXT and its last character is left for the next call, so that the
# repetition applies to that character only.
#
# Bracket context then adjusts the type: a '[' at nesting depth 0 that is
# followed by a digit becomes LLB (a repetition count), and every
# non-bracket token inside nested brackets becomes CHAR_TEXT.
#
# Raises Fabulator::Grammar::ParserError when the character at the cursor
# cannot start any token (for example '{' or '}').
def next_token
  @token = nil

  skip_whitespace
  skip_comment
  skip_whitespace

  if @curpos >= @source.length
    @last_token = nil
    return [ false, false ]
  end

  rest = @source[@curpos..@source.length-1]
  consumed = 0

  res = @@regex[:simple_tokens].match(rest)
  if !res.nil? && res.begin(0) == 0 && !res[1].nil?
    @token = [ @@ops[res[1]], res[1] ]
    consumed = res[1].length
  else
    # longest run of escaped or non-special characters at the cursor
    raw = rest[/\A(?:\\.|[^ \$\^\[\]\{\}\(\):,|*+.?])+/]
    if !raw.nil?
      # One unit per logical character: either "\x" or a single character.
      units = raw.scan(/\\.|./m)
      quantified = !(rest[raw.length, 1] =~ /[*?+\{]/).nil?
      # Leave the last logical character (with its backslash, if escaped)
      # in the input so the following quantifier binds to it alone.
      units.pop if quantified && units.size > 1
      consumed = units.join('').length
      text = units.collect { |u| (u.length > 1 && u[0, 1] == '\\') ? u[1..-1] : u }.join('')

      if quantified
        @token = [ :TEXT, text ]
      elsif text =~ /\A\d+\z/
        @token = [ :INTEGER, text ]
      elsif text =~ /\A#{@@regex[:ncname]}\z/
        @token = [ :NCNAME, text ]
      else
        @token = [ :TEXT, text ]
      end
    end
  end

  if @token.nil?
    # Same error type and wording as on_error, instead of failing later with
    # a NoMethodError on nil.
    raise Fabulator::Grammar::ParserError.new("unable to parse '#{rest}' near line #{@line + 1}, column #{@col}")
  end

  # Advance by the raw input consumed, which is longer than the token text
  # whenever escapes were removed.
  @curpos += consumed
  @col += consumed

  if @token[0] == :LB
    if @brackets == 0 && @source[@curpos..@source.length-1] =~ /\A\s*\d/
      @token[0] = :LLB
    end
    @brackets += 1
  elsif @token[0] == :RB
    @brackets -= 1
  elsif @brackets > 1
    @token[0] = :CHAR_TEXT
  end

  return @token
end

# Moves @curpos past a run of whitespace, keeping @line and @col in step.
def skip_whitespace
  while @curpos < @source.length && @source[@curpos..@curpos] =~ /\s/
    if @source[@curpos..@curpos] =~ /\n/
      @line += 1
      @col = 0
    else
      @col += 1
    end
    @curpos += 1
  end
end

# Skips one XPath 2.0 style comment, "(: ... :)", starting at the cursor.
# Comments may be nested; an unterminated comment runs to the end of input.
def skip_comment
  return unless @curpos < @source.length && @source[@curpos..@curpos+1] == '(:'

  comment_depth = 1
  @curpos += 2
  @col += 2
  while comment_depth > 0 && @curpos < @source.length
    if @source[@curpos..@curpos+1] == '(:'
      comment_depth += 1
      @curpos += 1
      @col += 1
    end
    if @source[@curpos..@curpos+1] == ':)'
      comment_depth -= 1
      @curpos += 1
      @col += 1
    end
    # Keep line numbers right for errors reported after a multi-line comment.
    if @source[@curpos..@curpos] == "\n"
      @line += 1
      @col = 0
    else
      @col += 1
    end
    @curpos += 1
  end
end

private :skip_whitespace, :skip_comment
|