rubylexer 0.6.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/rulexer.rb ADDED
@@ -0,0 +1,532 @@
1
+ =begin copyright
2
+ rubylexer - a ruby lexer written in ruby
3
+ Copyright (C) 2004,2005 Caleb Clausen
4
+
5
+ This library is free software; you can redistribute it and/or
6
+ modify it under the terms of the GNU Lesser General Public
7
+ License as published by the Free Software Foundation; either
8
+ version 2.1 of the License, or (at your option) any later version.
9
+
10
+ This library is distributed in the hope that it will be useful,
11
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
12
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13
+ Lesser General Public License for more details.
14
+
15
+ You should have received a copy of the GNU Lesser General Public
16
+ License along with this library; if not, write to the Free Software
17
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
18
+ =end
19
+
20
+
21
+
22
+ require "assert"
23
+ require "charhandler"
24
+ #require "term"
25
+ require "rubycode"
26
+ require "io.each_til_charset"
27
+
28
+ #------------------------------------
29
+ class RuLexer
30
+
31
# Horizontal whitespace characters (no linefeed).
WHSP = " \t\r\v\f"

# Whitespace including linefeed.
# (original author's note: maybe \r should be in WHSPLF instead)
WHSPLF = "#{WHSP}\n"

# Matches a single legal source character: printable ASCII or whitespace.
LEGALCHARS = /[ -~#{WHSPLF}]/

# Opening bracket => matching closing bracket, for nestable delimiters.
PAIRS = {
  '{' => '}',
  '[' => ']',
  '(' => ')',
  '<' => '>'
}
38
+
39
# Read-only access to the @linenum and @last_operative_token instance variables.
attr_reader :linenum, :last_operative_token
40
+
41
+ #-----------------------------------
42
# Sets up a lexer reading from +file+.
#
# filename:: stored in @filename and passed to every FileAndLineToken
# file::     the input object; a String is wrapped in IOext::FakeFile first
# line::     initial value of @linenum
#
# The token queue starts out holding one FileAndLineToken for the
# starting position.
def initialize(filename, file, line)
  @filename = filename
  file = IOext::FakeFile.new(file) if String === file
  file.binmode
  @file = file
  @linenum = line
  @toptable = nil # descendants must fill this out
  first_marker = FileAndLineToken.new(@filename, @linenum, @file.pos)
  @moretokens = [first_marker]
  @last_operative_token = nil
end
52
+
53
+ #-----------------------------------
54
# Builds the end-of-input token.
#
# s:: text belonging to the token (default ''); the token offset is the
#     current file position minus the size of +s+.
def endoffile_detected(s='')
  start = @file.pos - s.size
  EoiToken.new(s, @file, start)
end
57
+
58
+ #-----------------------------------
59
# Returns the next token.
#
# Tokens already queued in @moretokens are handed out first. Otherwise an
# end-of-input token is returned when the file is exhausted, and in all
# other cases the next character is dispatched through @toptable.
def get1token
  return @moretokens.shift unless @moretokens.empty?
  return endoffile_detected() if @file.eof?

  @toptable.go(nextchar)
end
69
+
70
+ #-----------------------------------
71
# True when the pending-token queue holds nothing but FileAndLineTokens
# (an empty queue counts as true).
def no_more?
  @moretokens.all? { |pending| FileAndLineToken === pending }
end
75
+
76
+ private
77
+ #-----------------------------------
78
# Error strategy that marks the token instead of raising.
#
# tok:: the token to return
# str:: error message; when nil/false the token is returned untouched
# file, line:: accepted for signature parity with lexerror_exception; unused
#
# With a message, +tok+ is extended with ErrorToken and its +error+ is set.
def lexerror_errortoken(tok, str, file=@filename, line=@linenum)
  if str
    flagged = tok.extend(ErrorToken)
    flagged.error = str
  end
  tok
end
83
+
84
+ #-----------------------------------
85
# Error strategy that raises instead of marking the token.
#
# tok:: returned unchanged when there is no error
# str:: error message; nil/false means "no error"
# file, line:: location used in the message (default: current position)
#
# Raises RuntimeError with the message "file:line: str".
def lexerror_exception(tok, str, file=@filename, line=@linenum)
  return tok unless str

  location = [file, line].join(':')
  raise location + ': ' + str
end
89
+
90
+ #-----------------------------------
91
# lexerror is the error strategy in effect; here it is the token-marking one.
alias_method :lexerror, :lexerror_errortoken
92
+
93
+ #-----------------------------------
94
# Feeds bytes from @file to handler.go one at a time, stopping as soon as
# go returns nil or false.
def handler_loop(handler)
  @file.each_byte do |byte|
    break unless handler.go(byte)
  end
end
97
+
98
+ #-----------------------------------
99
# Lexes a /.../ literal: a RenderExactlyStringToken for '/' with the result
# of double_quote("/") appended. The +ch+ argument is not used.
def regex(ch=nil)
  wrapper = RenderExactlyStringToken.new('/')
  wrapper.append_token(double_quote("/"))
end
105
+
106
+ #-----------------------------------
107
# Consumes one character from @file and returns it as a one-character
# string. The +str+ argument is not used.
def single_char_token(str)
  consumed = @file.getc
  return consumed.chr
end
108
+
109
+ #-----------------------------------
110
# Produces an error-flagged WsToken for a character that should not appear
# in the source.
#
# If +ch+ is actually in LEGALCHARS, only that one character is consumed;
# otherwise everything up to the next legal character is swallowed.
def illegal_char(ch)
  pos = @file.pos
  if LEGALCHARS === ch
    return lexerror(WsToken.new(getchar, pos), "legal (?!) bad char (code: #{ch[0]})")
  end

  skipped = til_charset(LEGALCHARS)
  lexerror(WsToken.new(skipped, pos), "bad char (code: #{ch[0]})")
end
115
+
116
+ #-----------------------------------
117
# Lexes a %-style literal (%q, %w, %Q, %W, %r, %x, %s, or bare %).
#
# ch:: must be '%'; it is consumed here.
#
# The letter after '%' selects the quote type handed to double_quote; a
# non-alphanumeric character means a plain double-quotish string and is
# pushed back to serve as the delimiter. An unrecognized letter or digit
# still lexes as a string, but the result is flagged through lexerror.
# Returns the resulting token with its offset set to the position of '%'.
def fancy_quote(ch)
  assert ch == '%'
  oldpos = @file.pos
  eat_next_if(ch) or raise "fancy_quote, no " + ch

  error = nil
  ch = getchar
  type = case ch
         when 'q' then "'"
         when 'w' then "["  # word array
         when 'Q' then '"'  # regular string
         when 'W' then '{'  # dquotish word array
         when 'r' then '/'  # regex
         when 'x' then '`'  # exec it
         when 's' then '"'  # symbol
         # other letters, nums are illegal here
         when /^[a-z0-9]$/oi
           error = "unrecognized %string type: " + ch
           '"'
         when ''
           return lexerror(StringToken.new('', oldpos), "unexpected eof in %string")
         else
           back1char
           '"' # no letter means string too
         end

  # Input may end right after the type letter; report that instead of
  # calling .chr on whatever nextchar yields at end of file.
  @file.eof? and
    return lexerror(StringToken.new('', oldpos), "unexpected eof in %string")

  beg = nextchar.chr
  # a newline delimiter is matched through the newline regex
  beg = INET_NL_REX if /^[\r\n]$/ === beg

  result = double_quote(beg, type, (PAIRS[beg] or beg))
  # `then` instead of the `when x:` colon form, which Ruby 1.9 dropped
  case ch
  when /^[Wwr]$/ then result = RenderExactlyStringToken.new(type).append_token(result)
  when 's' then result = SymbolToken.new(result.to_s)
  end
  result.offset = oldpos
  return lexerror(result, error)
end
153
+
154
+ #-----------------------------------
155
# Lexes a quoted string by delegating to all_quote.
# The name is historical: single-quoted strings come through here as well.
#
# nester::    opening delimiter
# type::      quote type (defaults to the opening delimiter)
# delimiter:: closing delimiter (defaults to the opening delimiter)
def double_quote(nester, type=nester, delimiter=nester)
  all_quote(nester, type, delimiter)
end
159
+
160
+ #-----------------------------------
161
# Lexes one quoted string of any flavour and returns a StringToken.
#
# nester::     opening delimiter (or INET_NL_REX for a newline delimiter)
# type::       quote type character; selects the backslash handler and
#              whether '#' inclusions are recognised
# delimiter::  closing delimiter
# bs_handler:: name of the method handling backslash sequences; chosen
#              from +type+ when nil
#
# Returns nil if the input does not start with +nester+. If the input ends
# before the closing delimiter, the partial token is passed to lexerror.
# Raises RuntimeError for an unknown +type+ when no bs_handler is given.
def all_quote(nester, type, delimiter, bs_handler=nil)
  # string must start with nester
  opened = if nester == INET_NL_REX
             readnl
           else
             eat_next_if(nester)
           end
  opened or return nil

  bs_handler ||= case type
                 when '/', '{' then :regex_esc_seq
                 when '"', '`', ':' then :dquote_esc_seq
                 when "'" then :squote_esc_seq
                 when "[" then :wquote_esc_seq
                 else raise "unknown quote type: #{type}"
                 end

  str = StringToken.new type
  old_linenum = @linenum
  nestlevel = 1
  # '#' starts an inclusion except in ' and [ type strings
  maybe_crunch = "'["[type] ? nil : "#"
  @file.each_byte {|b|
    b = b.chr
    if /^[\r\n]$/ === b
      # re-read the newline through readnl so the line count is updated
      back1char
      b = readnl
    end
    case b
    when delimiter
      if (nestlevel -= 1) == 0
        str.modifiers = til_charset(/[^eioumnsx]/) if '/' == type
        # emit eol marker later if line has changed
        @linenum != old_linenum and @moretokens <<
          FileAndLineToken.new(@filename, @linenum, @file.pos)
        return str
      end
    when nester
      # this branch ignored if nester==delimiter
      assert(nester != delimiter)
      nestlevel += 1
    when "\\"
      b = send(bs_handler, '\\', nester, delimiter)
    when nil then raise "nil char from each_byte?" # never happens
    when maybe_crunch
      # A '#' may be the very last byte of the input; skip the lookahead
      # then so the unterminated-string error below is what gets reported.
      unless @file.eof?
        nc = nextchar.chr
        nc[/^[{@$]$/] and b = ruby_code(nc)
      end
    end
    str.append b
  }

  assert @file.eof?
  lexerror str, "unterminated #{delimiter}-string"
end
213
+
214
+ #-----------------------------------
215
# Letters of the single-character escapes and, position for position, the
# characters they stand for.
ESCAPECHRS = "abefnrstv"
ESCAPESEQS = "\a\b\e\f\n\r\s\t\v"

# Interprets the backslash sequence following a '\' in a double-quotish
# string and returns the string it stands for.
#
# ch:: must be '\\' (already consumed by the caller)
# nester, delimiter:: accepted for signature parity with the other
#                     *_esc_seq handlers; unused
#
# A backslash-newline yields '' and bumps @linenum. Unknown sequences are
# returned verbatim, backslash included. Malformed \M, \C and \x sequences
# raise through lexerror_exception. There is no token to attach an error
# to here, and the two-argument lexerror cannot be called with a message
# alone.
# (see ruby book, p 205 for documentation of escape sequences)
def dquote_esc_seq(ch, nester=nil, delimiter=nil)
  assert ch == '\\'
  return case k = getchar
         when "\n" then @linenum += 1; ""
         when "\\" then "\\"
         when '"' then '"'
         when '#' then '#'
         when /^[#{ESCAPECHRS}]$/o
           k.tr(ESCAPECHRS, ESCAPESEQS)

         when "M"
           eat_next_if(?-) or lexerror_exception(nil, 'bad \\M sequence')
           (getchar_maybe_escape | 0x80).chr

         when "C"
           eat_next_if(?-) or lexerror_exception(nil, 'bad \\C sequence')
           (getchar_maybe_escape & 0x9F).chr

         when "c"
           (getchar_maybe_escape & 0x9F).chr

         when /^[0-7]$/
           # up to three octal digits, the first already read
           str = k
           while str.length < 3
             str << (eat_next_if(/^[0-7]$/) or break)
           end
           (str.oct & 0xFF).chr

         when "x"
           # one or two hex digits
           str = ''
           while str.length < 2
             str << (eat_next_if(/^[0-9A-F]$/i) or break)
           end
           str == '' and lexerror_exception(nil, "bad \\x sequence")
           str.hex.chr

         else
           '\\' + k
         end
end
268
+
269
+ #-----------------------------------
270
# Backslash handler for regex-like strings.
#
# ch:: must be '\\'
# nester, delimiter:: the string's delimiters
#
# Backslash-newline yields '' (and counts the line); an escaped delimiter,
# nester or '/' yields just that character; anything else is returned with
# its backslash intact. \c, \M- and \C- get no special treatment here.
def regex_esc_seq(ch, nester, delimiter)
  assert ch == '\\'
  escaped = getchar
  if "\n" === escaped
    @linenum += 1
    return ''
  end
  return escaped if nester === escaped or delimiter === escaped or "/" === escaped

  ch + escaped
end
292
+
293
+ #-----------------------------------
294
# Backslash handler for %w-style strings.
#
# ch:: must be '\\'
# nester, delimiter:: the string's delimiters
#
# An escaped delimiter or nester loses its backslash; every other sequence
# is passed through unchanged. Escaped newlines are counted in @linenum
# but not changed.
def wquote_esc_seq(ch, nester, delimiter)
  assert(ch == '\\')

  escaped = getchar
  if [delimiter, nester].any? { |mark| mark === escaped }
    return '' + escaped
  end

  @linenum += 1 if "\n" === escaped
  '\\' + escaped
end
310
+
311
+ #-----------------------------------
312
# Backslash handler for single-quoted strings.
#
# ch:: must be '\\'
# nester, delimiter:: the string's delimiters
#
# An escaped delimiter, nester or backslash loses its leading backslash;
# every other sequence is passed through unchanged. Escaped newlines are
# counted in @linenum but not changed.
def squote_esc_seq(ch, nester, delimiter)
  assert(ch == '\\')

  escaped = getchar
  if [delimiter, nester, '\\'].any? { |mark| mark === escaped }
    return '' + escaped
  end

  @linenum += 1 if "\n" === escaped
  '\\' + escaped
end
328
+
329
+ #-----------------------------------
330
# Lexes Ruby code embedded in a string (the part after '#') with a nested
# lexer over the same file, and returns it as a RubyCode.
#
# ch:: the character that follows '#': '@' or '$' for a single variable,
#      '{' or '(' for a bracketed run of tokens.
#
# Afterwards @linenum is brought up to date with the nested lexer's count.
# Raises if the nested lexer still has real tokens queued when it is done.
def ruby_code(ch='{')
  assert ch[/^[{(@$]$/]
  klass = RubyLexer === self ? self.class : RubyLexer
  rl = klass.new(@filename, @file, @linenum)

  case ch
  when '@'
    tokens = [rl.at_identifier]
  when '$'
    tokens = [rl.dollar_identifier]
  when '{', '('
    tokens = []
    loop {
      tok = rl.get1token
      if EoiToken === tok
        # End of input inside the inclusion: flag the token and stop.
        # Looping on would fetch another end-of-input token each pass
        # and never terminate.
        lexerror tok, "unterminated string inclusion"
        tokens << tok
        break
      end
      tokens << tok
      break if tok === PAIRS[ch] and rl.no_more? and rl.balanced_braces?
    }
  else
    raise 'hell'
  end

  if @linenum != rl.linenum
    # slip a line marker in ahead of the last token
    last = tokens.pop
    fal = FileAndLineToken.new(@filename, @linenum, last.offset)
    tokens.push fal, last
  end

  # need to verify that rl's @moretokens, @incomplete_here_tokens are empty
  rl.incomplete_here_tokens.empty? or
    here_spread_over_ruby_code rl, tokens.last
  rl.no_more? or
    raise 'uh-oh, ruby tokens were lexed past end of ruby code'

  result = RubyCode.new(tokens, @filename, @linenum)
  @linenum = rl.linenum
  return result
end
370
+
371
+ #-----------------------------------
372
# Called when the nested lexer +rl+ still has incomplete here-document
# tokens after a string inclusion; flags +tok+ as an error.
def here_spread_over_ruby_code(rl, tok)
  lexerror(tok, 'here body outside string inclusion')
end
375
+
376
+
377
+ #-----------------------------------
378
# Digit sets for each numeric base.
BINCHARS = ?0..?1
OCTCHARS = ?0..?7
DECCHARS = ?0..?9
HEXCHARS = CharSet[?0..?9, ?A..?F, ?a..?f]

# Lexes a numeric literal and returns a NumberToken (passed through
# lexerror, which flags it if an error message was recorded).
#
# str:: the character that triggered the call; nil is returned unless it
#       is a digit, '+' or '-'. The literal itself is re-read from the
#       input.
#
# A leading 0 not followed by '.' selects a base: b/B binary, x/X hex,
# o/O or nothing octal. Underscores are accepted between digits. Decimal
# literals may carry a fraction and/or an exponent, in which case the
# token holds the text itself rather than an integer.
def number(str)
  return nil unless /^[0-9+\-]$/ === str

  str = (eat_next_if(/^[+\-]$/) or '')
  str << (eat_next_if(/^[0-9]$/) or '')

  typechar = nil
  if str[-1] == ?0 and nextchar != ?.
    typechar = eat_next_if(/^[BOX]$/i) || 'o'
    str << typechar
    interp = :oct
    # `then` instead of the `when x:` colon form, which Ruby 1.9 dropped
    allowed = case typechar
              when 'b', 'B' then BINCHARS
              when 'x', 'X' then HEXCHARS
              when 'o', 'O' then OCTCHARS
              else raise "impossible"
              end
  else
    interp = :to_i
    allowed = DECCHARS
  end

  # how many more digit sequences ('.' part, 'e' part) may still follow;
  # none once a base prefix was seen
  addl_dig_seqs = ((!typechar) ? 2 : 0)
  error = nil
  @file.each_byte { |b|
    if allowed === b or ?_ == b
      str << b
    else
      # digits must follow and precede . and e
      if ?. == b and addl_dig_seqs == 2 and allowed === nextchar
        addl_dig_seqs = 1
        str << b
      # digits must follow and precede . and e
      elsif (?e == b or ?E == b) and addl_dig_seqs >= 1 and
            readahead(2)[/^[-+]?[0-9]/]
        addl_dig_seqs = 0
        str << b
        str << (eat_next_if(/[+\-]/) or '')
      else
        # not part of the number: put it back and stop
        back1char
        break
      end
      # OCTCHARS allowed here to permit constants like this: 01.2
      allowed == DECCHARS or allowed == OCTCHARS or error = "floats are always decimal (currently)"
      allowed = DECCHARS
      interp = :to_s
    end
  }

  assert(str[/[0-9]/])
  lexerror NumberToken.new(str.send(interp)), error
end
438
+
439
+ #-----------------------------------
440
# Lexes a '#' comment and returns it as an IgnoreToken, or nil when the
# input does not start with '#'. The +str+ argument is not used.
#
# The comment text runs up to (not including) the next \r or \n, or up to
# and including the next '#'. Newlines are left on the input.
#
# Debugging hook: when DEBUGGER__ is defined and the comment text starts
# with "breakpoint", SIGINT is sent to process group 0.
def comment(str=nil)
  str = eat_next_if(?#)
  return nil unless str

  if readahead(10) == %/breakpoint/ and defined? DEBUGGER__
    Process.kill("INT", 0)
  end

  str << til_charset(/[\r\n#]/)
  str << ?# if eat_next_if(?#)
  return IgnoreToken.new(str)
end
461
+
462
+ #-----------------------------------
463
# Lexes a run of horizontal whitespace (the WHSP characters) into a
# WsToken positioned at the run's starting offset.
#
# ch:: the whitespace character that triggered the call
def whitespace(ch)
  assert ch[/^[#{WHSP}]$/o]
  start = @file.pos
  blanks = til_charset(/[^#{WHSP}]/o)
  WsToken.new(blanks, start)
end
469
+
470
+ #-----------------------------------
471
# Matches one newline in unix (\n), mac (\r), dos (\r\n) or \n\r form.
INET_NL_REX = /^(\r\n?|\n\r?)/

# Consumes one newline (in any of the forms above) from the input,
# increments @linenum, and returns the result of reading it from @file.
# Returns nil, consuming nothing, when the input is not at a newline.
def readnl
  nl = readahead(2)[INET_NL_REX]
  return nil unless nl

  assert((1..2) === nl.length)
  @linenum += 1
  @file.read(nl.length)
end
480
+
481
+ #-----------------------------------
482
# Lexes a newline: consumes it with readnl, queues a FileAndLineToken for
# the line that follows, and returns a NewlineToken positioned where the
# newline began. The +ch+ argument is not used.
def newline(ch)
  start = @file.pos
  eol = readnl
  marker = FileAndLineToken.new(@filename, @linenum, @file.pos)
  @moretokens.push(marker)
  NewlineToken.new(eol, start)
end
488
+
489
+
490
+ #-----------------------------------
491
# Reads the next character for a \M-, \C- or \c sequence, resolving a
# nested backslash escape if there is one, and returns what @file.getc /
# String#[] yield for it (a character code on Ruby 1.8).
#
# Raises through lexerror_exception at end of input. The original passed
# an undefined name as the token, and no token exists here to flag.
def getchar_maybe_escape
  @file.eof? and lexerror_exception(nil, "unterminated dq string")
  c = @file.getc

  # An escaped newline comes back from dquote_esc_seq as '', whose [-1]
  # is nil; fall back to the newline character itself in that case.
  c == ?\\ and
    (c = (dquote_esc_seq('\\')[-1] or ?\n))

  return c
end
500
+
501
+ #-----------------------------------
502
# Defines, for each of +names+, an instance method that forwards all of
# its arguments to the same-named method of +obj+.
#
# obj::   source text of the receiver expression (e.g. :@file); it is
#         interpolated into the generated code, so pass only trusted
#         values
# names:: method names to forward
def RuLexer.delegate_to(obj, *names)
  # join, not to_s: Array#to_s only concatenates the elements on Ruby 1.8
  code = names.collect {|name|
    "define_method(:#{name}) do|*args|
       #{obj}.#{name}(*args)
     end
    "
  }.join
  eval code
end
510
+
511
protected
# low-level character reading helpers are forwarded to the @file object
delegate_to :@file,
            :eat_next_if, :prevchar, :nextchar, :getchar, :back1char,
            :readahead, :readback, :til_charset
513
+
514
+ #-----------------------------------
515
# Wraps each named method of this class so that the token it returns has
# its offset filled in (unless already set) with the file position at the
# time the method was called. The original method stays reachable as
# <name>__no_offset.
#
# funcnames:: names of token-returning instance methods to wrap
def RuLexer.save_offsets_in(*funcnames)
  # join, not to_s: Array#to_s only concatenates the elements on Ruby 1.8
  code = funcnames.collect {|fn| <<-endeval }.join
    class ::#{self}
      alias #{fn}__no_offset #{fn} #rename old ver of fn
      def #{fn}(*args) #create new version
        pos=@file.pos
        result=#{fn}__no_offset(*args)
        assert Token===result
        result.offset||=pos
        return result
      end
    end
  endeval
  eval code
end
529
+
530
+
531
+
532
+ end