rubylexer 0.6.2

Sign up to get free protection for your applications and to get access to all the features.
data/rulexer.rb ADDED
@@ -0,0 +1,532 @@
1
+ =begin copyright
2
+ rubylexer - a ruby lexer written in ruby
3
+ Copyright (C) 2004,2005 Caleb Clausen
4
+
5
+ This library is free software; you can redistribute it and/or
6
+ modify it under the terms of the GNU Lesser General Public
7
+ License as published by the Free Software Foundation; either
8
+ version 2.1 of the License, or (at your option) any later version.
9
+
10
+ This library is distributed in the hope that it will be useful,
11
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
12
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13
+ Lesser General Public License for more details.
14
+
15
+ You should have received a copy of the GNU Lesser General Public
16
+ License along with this library; if not, write to the Free Software
17
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
18
+ =end
19
+
20
+
21
+
22
+ require "assert"
23
+ require "charhandler"
24
+ #require "term"
25
+ require "rubycode"
26
+ require "io.each_til_charset"
27
+
28
+ #------------------------------------
29
+ class RuLexer
30
+
31
#horizontal whitespace: every whitespace char except the line feed
WHSP = " \t\r\v\f"
#the same set plus the line feed
WHSPLF = WHSP + "\n"
#maybe \r should be in WHSPLF instead

#printable ascii (space thru tilde) plus the whitespace chars above
LEGALCHARS = /[ -~#{WHSPLF}]/

#opening bracket => its closing partner (used to pick %-string delimiters)
PAIRS = {
  '{' => '}',
  '[' => ']',
  '(' => ')',
  '<' => '>'
}
38
+
39
+ attr_reader :linenum,:last_operative_token
40
+
41
+ #-----------------------------------
42
#set up a lexer reading from +file+.
#  filename - name stored in the FileAndLineToken markers
#  file     - object to read from; a String is wrapped in IOext::FakeFile
#  line     - line number the input starts at
#the token queue starts out holding a single FileAndLineToken for the
#current position.
def initialize(filename, file, line)
  @filename = filename
  file = IOext::FakeFile.new(file) if String === file
  file.binmode
  @file = file
  @linenum = line
  @toptable = nil #descendants must fill this out
  @moretokens = [FileAndLineToken.new(@filename, @linenum, @file.pos)]
  @last_operative_token = nil
end
52
+
53
+ #-----------------------------------
54
#build the end-of-input token. +s+ is text already consumed that belongs
#to the token; the token's offset is the current file position minus
#the size of +s+.
def endoffile_detected(s='')
  start = @file.pos - s.size
  EoiToken.new(s, @file, start)
end
57
+
58
+ #-----------------------------------
59
#return the next token: a queued token if there is one, an end-of-input
#token if the file is exhausted, otherwise whatever the top-level
#dispatch table produces for the next character.
def get1token
  return @moretokens.shift unless @moretokens.empty?

  #@moretokens<<nil
  return endoffile_detected() if @file.eof?

  @toptable.go(nextchar)
end
69
+
70
+ #-----------------------------------
71
#true when the token queue holds nothing but FileAndLineToken markers
#(an empty queue counts as true).
def no_more?
  @moretokens.all? { |queued| FileAndLineToken === queued }
end
75
+
76
+ private
77
+ #-----------------------------------
78
#error policy that records the problem on the token itself: when +str+ is
#given, +tok+ is extended with ErrorToken and its error set to +str+.
#+tok+ is returned either way. +file+ and +line+ are accepted but unused.
def lexerror_errortoken(tok,str,file=@filename,line=@linenum)
  if str
    tok.extend(ErrorToken)
    tok.error = str
  end
  tok
end
83
+
84
+ #-----------------------------------
85
#error policy that raises instead of tagging the token: when +str+ is
#given, raises a RuntimeError whose message is "file:line: str".
#without +str+ the token is returned untouched.
def lexerror_exception(tok,str,file=@filename,line=@linenum)
  return tok unless str

  message = [file, line, ' ' + str].join(':')
  raise message
end
89
+
90
+ #-----------------------------------
91
+ alias lexerror lexerror_errortoken
92
+
93
+ #-----------------------------------
94
#feed the remaining bytes of the file to +handler+, one at a time,
#stopping as soon as handler.go returns a false value.
def handler_loop(handler)
  @file.each_byte do |byte|
    break unless handler.go(byte)
  end
end
97
+
98
+ #-----------------------------------
99
#lex a /regex/ literal: the body is read by double_quote("/") and
#appended to a RenderExactlyStringToken for '/'. +ch+ is unused.
def regex(ch=nil)
  wrapper = RenderExactlyStringToken.new('/')
  wrapper.append_token(double_quote("/"))
end
105
+
106
+ #-----------------------------------
107
#consume one char from the file and return it as a string (+str+ is unused)
def single_char_token(str)
  @file.getc.chr
end
108
+
109
+ #-----------------------------------
110
#produce an error-flagged WsToken for a char that should not be here.
#if +ch+ is in LEGALCHARS after all, only that one char is consumed;
#otherwise everything up to the next legal char is swallowed.
def illegal_char(ch)
  pos=@file.pos
  if LEGALCHARS===ch
    lexerror(WsToken.new(getchar,pos), "legal (?!) bad char (code: #{ch[0]})")
  else
    lexerror(WsToken.new(til_charset(LEGALCHARS),pos), "bad char (code: #{ch[0]})")
  end
end
115
+
116
+ #-----------------------------------
117
#lex a %-quoted literal (%q %w %Q %W %r %x %s, or a bare %).
#  ch - must be '%'; it is consumed from the input here
#the letter after the % selects the quote type handed to double_quote; an
#unknown letter or digit is lexed like a plain string and reported via
#lexerror. %w, %W and %r results are wrapped in a RenderExactlyStringToken,
#%s becomes a SymbolToken. the result's offset is the position of the '%'.
#raises if the next input char is not +ch+.
def fancy_quote(ch)
  assert ch=='%'
  oldpos=@file.pos
  eat_next_if(ch) or raise "fancy_quote, no "+ch

  error=nil
  ch=getchar
  #ch.tr!('qwQWrx','"["{/`')
  type=case ch
    when 'q' then "'"
    when 'w' then "[" #word array
    when 'Q' then '"' #regular string
    when 'W' then '{' #dquotish word array
    when 'r' then '/' #regex
    when 'x' then '`' #exec it
    when 's' then '"' #symbol
    #other letters, nums are illegal here
    when /^[a-z0-9]$/oi
      error= "unrecognized %string type: "+ch; '"'
    when ''
      return lexerror( StringToken.new('', oldpos), "unexpected eof in %string")
    else back1char; '"' #no letter means string too
  end

  #the char after the type letter opens the string; a line ending used
  #as the opener is matched with the newline regexp instead
  beg=nextchar.chr
  if /^[\r\n]$/===beg then
    beg=INET_NL_REX
  end

  #bracket-like openers close with their partner, anything else with itself
  result=double_quote(beg, type, (PAIRS[beg] or beg))
  #'when ... then' is used here; the 'when ...:' form is a syntax error
  #on ruby 1.9 and later
  case ch
  when /^[Wwr]$/ then result=RenderExactlyStringToken.new(type).append_token(result)
  when 's' then result=SymbolToken.new(result.to_s)
  end
  result.offset=oldpos
  return lexerror(result,error)
end
153
+
154
+ #-----------------------------------
155
+ #this method is now misnamed, since it handles single quotes as well
156
#thin front end for all_quote: +type+ and +delimiter+ both default to
#+nester+, and all_quote picks the backslash handler from +type+.
def double_quote(nester, type=nester, delimiter=nester)
  all_quote nester, type, delimiter
end
159
+
160
+ #-----------------------------------
161
#lex the body of any quoted literal.
#  nester     - opening char (or INET_NL_REX); the input must start with it
#  type       - quote type char; picks the default backslash handler and
#               decides whether '#' may start an inclusion
#  delimiter  - closing char; when it differs from nester, nested pairs
#               are counted
#  bs_handler - name of the method called for backslash sequences;
#               chosen from +type+ when nil
#returns nil if the input does not start with +nester+, the StringToken
#when the closing delimiter is found, or the result of lexerror if the
#file ends first. raises for a +type+ with no known handler.
def all_quote(nester, type, delimiter, bs_handler=nil)
  #string must start with nester
  opened = if nester==INET_NL_REX
    readnl
  else
    eat_next_if(nester)
  end
  opened or return nil

  bs_handler ||= case type
    when '/','{' then :regex_esc_seq
    when '"','`',':' then :dquote_esc_seq
    when "'" then :squote_esc_seq
    when "[" then :wquote_esc_seq
    else raise "unknown quote type: #{type}"
  end

  str=StringToken.new type
  old_linenum=@linenum
  nestlevel=1
  #'#' is only special in quote types other than ' and [
  maybe_crunch= "'["[type] ? nil : "#"
  @file.each_byte {|b|
    b=b.chr
    #line endings are re-read via readnl so @linenum stays right
    if /^[\r\n]$/===b
      back1char
      b=readnl
    end
    case b
    when delimiter
      if (nestlevel-=1)==0
        #trailing option letters belong to a regex
        str.modifiers=til_charset(/[^eioumnsx]/) if '/'==type
        #emit eol marker later if line has changed
        @linenum != old_linenum and @moretokens <<
          FileAndLineToken.new(@filename,@linenum,@file.pos)
        return str
      end
    when nester
      #this branch ignored if nester==delimiter
      assert(nester!=delimiter)
      nestlevel+=1
    when "\\"
      b= send(bs_handler,'\\',nester,delimiter)
    when nil then raise "nil char from each_byte?" #never happens
    when maybe_crunch
      #'#' followed by {, @ or $ starts embedded ruby code
      nc=nextchar.chr
      nc[/^[{@$]$/] and b=ruby_code(nc)
    end
    str.append b
  }

  #ran off the end of the file without seeing the delimiter
  assert @file.eof?
  lexerror str,"unterminated #{delimiter}-string"
end
213
+
214
+ #-----------------------------------
215
#letters usable after a backslash, and the chars they stand for
#(same order in both strings; used with String#tr below)
ESCAPECHRS="abefnrstv"
ESCAPESEQS="\a\b\e\f\n\r\s\t\v"

#decode one backslash sequence inside a double-quotish string.
#  ch - must be the backslash; the rest is read from the input
#  nester, delimiter - accepted for handler compatibility, unused
#returns the decoded text: "" for a backslash-newline (which also bumps
#@linenum), the backslash plus the char for unknown sequences.
#a malformed \M, \C or \x raises via lexerror_exception. the older code
#called lexerror with only one argument there, which fails with an
#ArgumentError given the two-argument lexerror defined in this file.
def dquote_esc_seq(ch,nester=nil,delimiter=nil)
  assert ch == '\\'
  #see ruby book, p 205 for documentation of escape sequences
  k=getchar
  case k
  when "\n" then @linenum+=1; ""
  when "\\" then "\\"
  when '"' then '"'
  when '#' then '#'
  when /^[#{ESCAPECHRS}]$/o
    k.tr(ESCAPECHRS,ESCAPESEQS)

  when "M"
    eat_next_if(?-) or lexerror_exception(nil, 'bad \\M sequence')
    (getchar_maybe_escape | 0x80).chr

  when "C"
    eat_next_if(?-) or lexerror_exception(nil, 'bad \\C sequence')
    (getchar_maybe_escape & 0x9F).chr

  when "c"
    (getchar_maybe_escape & 0x9F).chr

  when /^[0-7]$/
    #up to three octal digits, the first already read
    str=k
    while str.length < 3
      str << (eat_next_if(/^[0-7]$/) or break)
    end
    (str.oct&0xFF).chr

  when "x"
    #one or two hex digits
    str=''
    while str.length < 2
      str << (eat_next_if(/^[0-9A-F]$/i) or break)
    end
    str=='' and lexerror_exception(nil, "bad \\x sequence")
    str.hex.chr

  else
    '\\'+k
  end
end
268
+
269
+ #-----------------------------------
270
#handle one backslash sequence inside a regex-like string.
#  ch - must be the backslash; the escaped char is read from the input
#a backslash-newline yields '' and bumps @linenum; an escaped nester,
#delimiter or "/" yields just that char; anything else is returned with
#its backslash intact.
def regex_esc_seq(ch,nester,delimiter)
  assert ch == '\\'
  escaped = getchar
  if "\n" === escaped
    @linenum += 1
    ''
  elsif nester === escaped || delimiter === escaped || "/" === escaped
    escaped
  else
    #\c, \M- and \C- get no special treatment here; they pass thru too
    ch + escaped
  end
end
292
+
293
+ #-----------------------------------
294
#handle one backslash sequence inside a %w word array.
#  ch - must be the backslash; the escaped char is read from the input
#an escaped delimiter or nester loses its backslash; every other
#sequence is passed thru unchanged. an escaped newline is passed thru
#as well, but is counted in @linenum.
def wquote_esc_seq(ch,nester,delimiter)
  assert(ch=='\\')

  escaped = getchar
  lead =
    if delimiter === escaped || nester === escaped
      ''
    else
      @linenum += 1 if "\n" === escaped
      '\\'
    end
  lead + escaped
end
310
+
311
+ #-----------------------------------
312
#handle one backslash sequence inside a single-quoted string.
#  ch - must be the backslash; the escaped char is read from the input
#an escaped delimiter, nester or backslash loses its leading backslash;
#every other sequence is passed thru unchanged. an escaped newline is
#passed thru as well, but is counted in @linenum.
def squote_esc_seq(ch,nester,delimiter)
  assert(ch=='\\')

  escaped = getchar
  lead =
    if delimiter === escaped || nester === escaped || '\\' === escaped
      ''
    else
      @linenum += 1 if "\n" === escaped
      '\\'
    end
  lead + escaped
end
328
+
329
+ #-----------------------------------
330
#lex ruby code embedded in a string (after '#').
#  ch - the char that introduced it: '{' or '(' for a bracketed
#       expression, '@' or '$' for a bare variable
#a second lexer (the receiver's class if it is a RubyLexer, otherwise
#RubyLexer) is run over the same file. for brackets, tokens are
#collected until the matching closer is seen with nothing left queued
#and braces balanced. returns a RubyCode holding the tokens and takes
#over the sub-lexer's line number.
#raises if +ch+ is none of the above, or if the sub-lexer still has
#real tokens queued when the inclusion ends.
def ruby_code(ch='{')
  assert ch[/^[{(@$]$/]
  klass= RubyLexer===self ? self.class : RubyLexer
  rl=klass.new(@filename,@file,@linenum)

  case ch
  when '@'
    tokens=[rl.at_identifier]
  when '$'
    tokens=[rl.dollar_identifier]
  when '{','('
    tokens=[]
    loop {
      tok=rl.get1token
      if EoiToken===tok
        #end of input inside the inclusion: report it and stop. no
        #closer can follow, so looping on would only collect end-of-input
        #tokens forever when lexerror returns instead of raising.
        lexerror tok,"unterminated string inclusion"
        tokens << tok
        break
      end
      tokens << tok
      break if tok===PAIRS[ch] and rl.no_more? and rl.balanced_braces?
    }
  else
    raise 'hell'
  end

  #if the inclusion spanned lines, slip a line marker in before its
  #last token
  if @linenum != rl.linenum
    last=tokens.pop
    fal=FileAndLineToken.new(@filename,@linenum, last.offset)
    tokens.push fal,last
  end

  #need to verify that rl's @moretokens, @incomplete_here_tokens are empty
  rl.incomplete_here_tokens.empty? or
    here_spread_over_ruby_code rl,tokens.last
  rl.no_more? or
    raise 'uh-oh, ruby tokens were lexed past end of ruby code'

  result=RubyCode.new(tokens,@filename,@linenum)
  @linenum=rl.linenum
  return result
end
370
+
371
+ #-----------------------------------
372
#called by ruby_code when the sub-lexer +rl+ still has unfinished here
#documents at the end of a string inclusion; flags +tok+ via lexerror.
def here_spread_over_ruby_code(rl,tok)
  lexerror(tok, 'here body outside string inclusion')
end
375
+
376
+
377
+ #-----------------------------------
378
#digit sets accepted by #number for binary, octal, decimal and hex
BINCHARS = (?0..?1)
OCTCHARS = (?0..?7)
DECCHARS = (?0..?9)
HEXCHARS = CharSet[?0..?9, ?A..?F, ?a..?f]
382
+ #0-9
383
+ #-----------------------------------
384
#lex a numeric literal.
#  str - the char that triggered the call; unless it is a digit or a
#        sign, nil is returned and nothing is consumed
#reads an optional sign and the first digit, then picks the digit set:
#a leading 0 not followed by '.' selects binary/hex/octal (octal when no
#b/x/o letter follows), anything else is decimal. '_' is accepted
#between digits. for decimal numbers a fraction and an exponent are
#accepted, each of which must be followed by a digit.
#returns lexerror(NumberToken, error): integers are converted with
#to_i/oct, floats are kept as text.
def number(str)

  return nil unless /^[0-9+\-]$/===str

  interp=:to_i
  str= (eat_next_if(/^[+\-]$/)or'')
  str<< (eat_next_if(/^[0-9]$/)or'')

  if str[-1] == ?0 and nextchar != ?.
    typechar=eat_next_if(/^[BOX]$/i)||'o'
    str << typechar
    interp=:oct
    #'when ... then' is used here; the 'when ...:' form is a syntax
    #error on ruby 1.9 and later
    allowed=case typechar
      when 'b','B' then BINCHARS
      when 'x','X' then HEXCHARS
      when 'o','O' then OCTCHARS
      else raise "impossible" #raise needs a String or exception, not a Symbol
    end
  else
    interp=:to_i
    allowed =DECCHARS
  end

  #how many more digit sequences (fraction, exponent) may follow;
  #none once a radix prefix was seen
  addl_dig_seqs= ((!typechar)? 2 : 0) #den 210
  error=nil
  @file.each_byte { |b|
    if allowed === b or ?_ == b
      str << b
    else
      #digits must follow and precede . and e
      if ?.==b and addl_dig_seqs==2 and allowed===nextchar
        addl_dig_seqs=1
        str << b
      #digits must follow and precede . and e
      elsif (?e==b or ?E==b) and addl_dig_seqs>=1 and
            readahead(2)[/^[-+]?[0-9]/]
        addl_dig_seqs=0
        str << b
        str << (eat_next_if(/[+\-]/)or'')
      else
        #not part of the number: put it back and stop
        back1char
        #return(str.send(interp))
        break
      end
      #OCTCHARS allowed here to permit constants like this: 01.2
      allowed == DECCHARS or allowed == OCTCHARS or error= "floats are always decimal (currently)"
      allowed = DECCHARS
      interp=:to_s
    end
  }

  assert(str[/[0-9]/])
  lexerror NumberToken.new(str.send(interp)), error
end
438
+
439
+ #-----------------------------------
440
#lex a comment. returns nil (consuming nothing) unless the next input
#char is '#'. otherwise the text up to, but not including, the next
#line ending is returned in an IgnoreToken; a second '#' also ends the
#comment and is included in it. +str+ is ignored.
#(a never-executed 'if false' byte loop was removed from here.)
def comment(str=nil)
  #assert str == '#'
  str=eat_next_if(?#) or return nil
  #debugging aid: a comment starting with "breakpoint" interrupts the
  #process when the debugger is loaded
  Process.kill("INT",0) if
    readahead(10)==%/breakpoint/ and defined? DEBUGGER__
  #leave \n's on input for newline to eat
  str<<til_charset(/[\r\n#]/)
  eat_next_if(?#) and str << ?#
  return IgnoreToken.new(str)
end
461
+
462
+ #-----------------------------------
463
#lex a run of horizontal whitespace (no line feeds) into a WsToken
#positioned at the offset where the run began.
#  ch - the first char of the run; must be one of WHSP
def whitespace(ch)
  assert ch[/^[#{WHSP}]$/o]
  start = @file.pos
  WsToken.new(til_charset(/[^#{WHSP}]/o), start)
end
469
+
470
+ #-----------------------------------
471
#a line ending at the start of the text: \n, \r, \r\n or \n\r
INET_NL_REX=/^(\r\n?|\n\r?)/

#consume one line ending and count it in @linenum.
#returns what @file.read gives back for it, or nil (consuming nothing)
#if the input is not at a line ending.
def readnl
  #compatible with dos/mac style newlines...
  nl = readahead(2)[INET_NL_REX]
  return nil unless nl

  assert((1..2)===nl.length)
  @linenum += 1
  @file.read(nl.length)
end
480
+
481
+ #-----------------------------------
482
#lex a line ending: consumes it via readnl, queues a FileAndLineToken
#for the line that follows, and returns a NewlineToken positioned where
#the line ending began. +ch+ is unused.
def newline(ch)
  start = @file.pos
  text = readnl
  @moretokens << FileAndLineToken.new(@filename, @linenum, @file.pos)
  NewlineToken.new(text, start)
end
488
+
489
+
490
+ #-----------------------------------
491
#read one char (as returned by @file.getc); if it is a backslash, decode
#the escape sequence that follows and use the last char of the result,
#or a newline if the sequence decoded to nothing.
#raises via lexerror_exception when called at end of file. the older
#code referred to an undefined name there, so it failed with a
#NameError rather than reporting the unterminated string.
def getchar_maybe_escape
  @file.eof? and lexerror_exception(nil, "unterminated dq string")
  c=@file.getc

  c == ?\\ and
    (c = (dquote_esc_seq('\\')[-1] or ?\n))

  return c
end
500
+
501
+ #-----------------------------------
502
#define, for each of +names+, an instance method that forwards all of
#its arguments to the method of the same name on +obj+.
#  obj   - text of the expression to delegate to (eg :@file)
#  names - names of the methods to generate
#NOTE(review): +obj+ and +names+ are pasted into eval'd source, so they
#must never come from untrusted input.
def RuLexer.delegate_to(obj,*names)
  #join, not to_s: Array#to_s only concatenates on ruby 1.8; on later
  #versions it returns the inspect form, which is not valid code
  eval names.collect {|name|
    "define_method(:#{name}) do|*args|
       #{obj}.#{name}(*args)
     end
    "
  }.join
end
510
+
511
+ protected
512
+ delegate_to :@file, :eat_next_if,:prevchar,:nextchar,:getchar,:back1char,:readahead,:readback,:til_charset
513
+
514
+ #-----------------------------------
515
#wrap each method named in +funcnames+ so that the token it returns has
#an offset: the old method is kept under the name <fn>__no_offset and
#the replacement fills in result.offset with the file position from
#before the call, unless an offset is already set. the result must be
#a Token.
#NOTE(review): +funcnames+ are pasted into eval'd source, so they must
#never come from untrusted input.
def RuLexer.save_offsets_in(*funcnames)
  #join, not to_s: Array#to_s only concatenates on ruby 1.8; on later
  #versions it returns the inspect form, which is not valid code
  eval funcnames.collect{|fn| <<-endeval }.join
    class ::#{self}
      alias #{fn}__no_offset #{fn}   #rename old ver of fn
      def #{fn}(*args)               #create new version
        pos=@file.pos
        result=#{fn}__no_offset(*args)
        assert Token===result
        result.offset||=pos
        return result
      end
    end
  endeval
end
529
+
530
+
531
+
532
+ end