rubylexer 0.6.2
Sign up to get free protection for your applications and to get access to all the features.
- data/COPYING +510 -0
- data/README +134 -0
- data/Rantfile +37 -0
- data/assert.rb +31 -0
- data/charhandler.rb +84 -0
- data/charset.rb +76 -0
- data/context.rb +174 -0
- data/howtouse.txt +136 -0
- data/io.each_til_charset.rb +247 -0
- data/require.rb +103 -0
- data/rlold.rb +12 -0
- data/rubycode.rb +44 -0
- data/rubylexer.rb +1589 -0
- data/rulexer.rb +532 -0
- data/symboltable.rb +65 -0
- data/testcode/deletewarns.rb +39 -0
- data/testcode/dumptokens.rb +38 -0
- data/testcode/locatetest +12 -0
- data/testcode/rubylexervsruby.rb +104 -0
- data/testcode/rubylexervsruby.sh +51 -0
- data/testcode/tokentest.rb +237 -0
- data/testcode/torment +51 -0
- data/testdata/1.rb.broken +729 -0
- data/testdata/23.rb +24 -0
- data/testdata/g.rb +15 -0
- data/testdata/newsyntax.rb +18 -0
- data/testdata/noeolatend.rb +1 -0
- data/testdata/p.rb +1227 -0
- data/testdata/pleac.rb.broken +6282 -0
- data/testdata/pre.rb +33 -0
- data/testdata/pre.unix.rb +33 -0
- data/testdata/regtest.rb +621 -0
- data/testdata/tokentest.assert.rb.can +7 -0
- data/testdata/untitled1.rb +1 -0
- data/testdata/w.rb +22 -0
- data/testdata/wsdlDriver.rb +499 -0
- data/testing.txt +130 -0
- data/testresults/placeholder +0 -0
- data/token.rb +486 -0
- data/tokenprinter.rb +152 -0
- metadata +76 -0
data/howtouse.txt
ADDED
@@ -0,0 +1,136 @@
|
|
1
|
+
|
2
|
+
Using rubylexer:
|
3
|
+
require "rubylexer.rb"
|
4
|
+
,then
|
5
|
+
lexer=RubyLexer.new(a_file_name, opened_File_or_String)
|
6
|
+
until EoiToken===(tok=lexer.get1token)
|
7
|
+
...do stuff w/ toks...
|
8
|
+
end
|
9
|
+
|
10
|
+
For a slightly expanded version of this example, see testcode/dumptokens.rb.
|
11
|
+
|
12
|
+
tok will be a subclass of Token. there are many token classes (see token.rb)
|
13
|
+
however, all tokens have some common methods:
|
14
|
+
to_s #return a string containing ruby code representing that token
|
15
|
+
ident #return internal form of token; use with caution
|
16
|
+
offset #offset in file of start of token
|
17
|
+
error #returns a string if there was a lex error at this position, else nil
|
18
|
+
|
19
|
+
here's a list of token subclasses and their meaning:
|
20
|
+
(note: indentation indicates inheritance)
|
21
|
+
|
22
|
+
WToken #(mostly useless?) abstract superclass for KeywordToken,
|
23
|
+
#OperatorToken, VarNameToken, and HerePlaceholderToken
|
24
|
+
#but not (confusingly) MethNameToken (perhaps that'll change)
|
25
|
+
KeywordToken #a ruby keyword or non-overridable punctuation char(s)
|
26
|
+
OperatorToken #overridable operators
|
27
|
+
VarNameToken #a name that represents a variable
|
28
|
+
HerePlaceholderToken #represents the header of a here string. subclass of WToken
|
29
|
+
MethNameToken #the name of a method: the uncoloned
|
30
|
+
#symbols allowed in 'alias' and 'undef' statements and all names
|
31
|
+
#which follow a 'def',
|
32
|
+
#'::', or '.', as well as other call sites. operators used as
|
33
|
+
#method names will appear as methnametokens.
|
34
|
+
#confusingly, this is not a WToken.
|
35
|
+
NumberToken #a literal number, including character constants
|
36
|
+
SymbolToken #a symbol
|
37
|
+
NewlineToken #represents an (unescaped) newline.
|
38
|
+
StringToken #represents a string. unlike all other tokens, strings might contain
|
39
|
+
#other tokens. if the string used interpolation, tokens inside #{ }
|
40
|
+
#are considered subtokens of the string. StringToken#elems returns
|
41
|
+
#an array whose elements are sections of uninterpolated string (in
|
42
|
+
#the even indices) and arrays of subtokens (in the odd indices).
|
43
|
+
#this notion of subtokens is an unfortunate one and will go away in
|
44
|
+
#a future release.
|
45
|
+
RenderExactlyStringToken #a subclass of StringToken; used to represent regexes and other string-like thingys
|
46
|
+
|
47
|
+
ErrorToken #actually a module that may be mixed in to any token. indicates an error in the input at (or
|
48
|
+
#near) that position. You may continue getting tokens after an error token is encountered,
|
49
|
+
#and I try to make this work as well as possible, but I can not guarantee correctness after
|
50
|
+
#an error.
|
51
|
+
#note: any token may be an ErrorToken, including IgnoreToken, EoiToken, a subtoken of a
|
52
|
+
#StringToken, etc. Please take this into account in your error processing.
|
53
|
+
|
54
|
+
IgnoreToken #superclass for tokens without semantic meaning to a parser
|
55
|
+
WsToken #whitespace
|
56
|
+
EscNlToken #implicitly or explicitly escaped newline
|
57
|
+
EoiToken #end of source file. always the last token
|
58
|
+
HereBodyToken #the actual body of the here string. subclass of IgnoreToken
|
59
|
+
OutlinedHereBodyToken #hacky subclass of HereBodyToken... will disappear once strings are done right.
|
60
|
+
|
61
|
+
ZwToken #informational IgnoreTokens. (parsers might need to look at some of these, actually.)
|
62
|
+
NoWsToken #no whitespace was on either side of this token. kind of a hack
|
63
|
+
#to help TokenPrinter work correctly in certain cases.
|
64
|
+
|
65
|
+
ImplicitParamListStartToken #if you leave the parentheses out in a function
|
66
|
+
ImplicitParamListEndToken #call, a pair of these will be generated instead
|
67
|
+
|
68
|
+
KwParamListStartToken #the when,for,and rescue keywords take a comma-
|
69
|
+
KwParamListEndToken #delimited list. these tokens enclose those lists.
|
70
|
+
|
71
|
+
AssignmentRhsListStartToken #encloses the right hand side of an assignment,
|
72
|
+
AssignmentRhsListEndToken #including both single and multiple assignment.
|
73
|
+
|
74
|
+
FileAndLineToken #generated at every newline, escaped or unescaped. the file
|
75
|
+
#and line methods of this class return the file and line at
|
76
|
+
#that point in the token stream. (not always working right now.)
|
77
|
+
|
78
|
+
|
79
|
+
Subclasses of WToken provide an === method for comparing the token to a String or Regexp.
|
80
|
+
|
81
|
+
The different types of string and how to distinguish:
|
82
|
+
For the most part you can tell what was what by looking at StringToken#char.
|
83
|
+
Single and double quotes can't be distinguished this way, and neither can you tell a fancy
|
84
|
+
string (starting with %) from the regular kind. If you want to have more... one option
|
85
|
+
with the current code is to just go and look in your input what was at StringToken#offset.
|
86
|
+
|
87
|
+
Eventually, (in version 0.8) string boundaries, bodies, and inclusions will all be separate
|
88
|
+
tokens laid out linearly in the token stream. This is the way matz handles things, and it's
|
89
|
+
much cleaner. I'll make sure that the string start token at that time contains all the
|
90
|
+
info you could want. If you really can't wait for 0.8 and can't stand #offset, I can hack
|
91
|
+
in a method to StringToken that tells you exactly what char(s) opened the string.
|
92
|
+
|
93
|
+
Certain keywords (if, unless, while, until, do) may or may not have an associated end keyword.
|
94
|
+
For instance, these have ends associated:
|
95
|
+
|
96
|
+
if something then
|
97
|
+
do_something
|
98
|
+
end
|
99
|
+
|
100
|
+
a.each do|x|
|
101
|
+
x.something_about_it
|
102
|
+
end
|
103
|
+
|
104
|
+
And these do not:
|
105
|
+
|
106
|
+
do_something if something
|
107
|
+
|
108
|
+
for x in a do
|
109
|
+
x.something_about_it
|
110
|
+
end #paired to the for, not the do!
|
111
|
+
|
112
|
+
A KeywordToken is generated by rubylexer in either case, but you can now use the has_end?
|
113
|
+
method of KeywordToken to determine whether an end should be expected for a particular if or not.
|
114
|
+
|
115
|
+
api stability:
|
116
|
+
Future changes to the user-visible api will happen in a backwards-compatible way, so that
|
117
|
+
if the interface changes, there will be a (probably quite long) transition period during
|
118
|
+
which both the old and new interfaces are supported. The idea is to give users plenty of
|
119
|
+
time to adapt to changes. That promise goes for all the changes described below.
|
120
|
+
|
121
|
+
In cases where the 2 are incompatible, (inspired by rubygems) I've come up with this:
|
122
|
+
|
123
|
+
RubyLexer.version(0.6).new(...args...) #request the 0.6 api
|
124
|
+
|
125
|
+
This actually works currently; it enables the old api where errors cause an exception instead
|
126
|
+
of generating ErrorTokens. The default will always be to use the new api.
|
127
|
+
|
128
|
+
StringToken will go away; replaced by multiple token types, like in ruby. StringToken
|
129
|
+
subclasses will need reorganization at this point too... tokens in an interpolation
|
130
|
+
will no longer be 'subtokens' but full-fledged tokens in their own right.
|
131
|
+
i intend to make a namespace for all rubylexer classes at some point... shouldn't
|
132
|
+
be a big deal; old clients can just include the namespace module.
|
133
|
+
Token#ident may be taken away or change without notice.
|
134
|
+
MethNameToken may become a WToken
|
135
|
+
HereBodyToken should really be a string subclass...
|
136
|
+
|
@@ -0,0 +1,247 @@
|
|
1
|
+
=begin copyright
|
2
|
+
rubylexer - a ruby lexer written in ruby
|
3
|
+
Copyright (C) 2004,2005 Caleb Clausen
|
4
|
+
|
5
|
+
This library is free software; you can redistribute it and/or
|
6
|
+
modify it under the terms of the GNU Lesser General Public
|
7
|
+
License as published by the Free Software Foundation; either
|
8
|
+
version 2.1 of the License, or (at your option) any later version.
|
9
|
+
|
10
|
+
This library is distributed in the hope that it will be useful,
|
11
|
+
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
12
|
+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
13
|
+
Lesser General Public License for more details.
|
14
|
+
|
15
|
+
You should have received a copy of the GNU Lesser General Public
|
16
|
+
License along with this library; if not, write to the Free Software
|
17
|
+
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
18
|
+
=end
|
19
|
+
|
20
|
+
|
21
|
+
#helper methods for character-at-a-time scanning of a stream. anything that
#mixes this in must supply read, getc, eof?, pos and pos= (IO does; FakeFile,
#below, emulates them on top of a String).
module IOext
  #read until a character in a user-supplied set is found.
  #charrex must be a regexp that contains _only_ a single character class
  #(the rewind arithmetic below assumes the match is exactly 1 char long).
  #blocksize is the number of chars requested from the stream per read.
  #returns (as a String) the text in front of the matching char, or everything
  #up to eof if nothing matched. the stream is left positioned on the matching
  #char, so that char is the next thing to be read.
  def til_charset(charrex,blocksize=16)
    blocks=[]
    until eof?
      block=read blocksize
      #if near eof, less than a full block may have been read

      if m=charrex.match(block)
        #un-read the matching char and whatever followed it in this block
        self.pos-=m.post_match.length+1
        #'self.' shouldn't be needed... but is

        blocks.push m.pre_match if m.pre_match.length>0
        break
      end
      blocks<<block
    end
    #join rather than to_s: Array#to_s only concatenates on ruby 1.8; later
    #rubies return the inspect form (e.g. '["ab", "cd"]') instead.
    return blocks.join
  end

  #-----------------------------------
  #read and return next char if it matches ch
  #else, leave input unread and return nil or false
  #ch may be a String, a Regexp or an Integer character code.
  def eat_next_if(ch)
    oldpos=pos
    c=read(1)

    ch.kind_of? Integer and ch=ch.chr

    return case c
           when ch then c
           when '' then self.pos=oldpos; nil
           else         self.pos=oldpos; false
           end
  end

  #-----------------------------------
  #read and return the longest run of chars that each match pat.
  #pat may be a String, a Regexp or an Integer character code.
  #the first non-matching char is pushed back. returns '' if nothing matched.
  def eat_while(pat)
    pat.kind_of? Integer and pat=pat.chr

    result=''.dup
    loop {
      ch=read(1)
      #eof: IO#read(1) reports it as nil, FakeFile#read as ''. either way
      #nothing was consumed, so there is nothing to push back.
      return result if ch.nil? or ch.empty?
      unless pat===ch
        back1char
        return result
      end
      result << ch
    }
  end

  #-----------------------------------
  #returns previous character in stream
  #without changing stream position
  #or '' if at beginning
  def prevchar
    pos==0 and return ''

    back1char
    return getc.chr
  end

  #-----------------------------------
  #returns next character in stream
  #without changing stream position
  #or nil if at end
  #(the result is whatever getc returns)
  def nextchar
    eof? and return nil

    result=getc
    back1char
    return result
  end

  #-----------------------------------
  #this should really be in class File...
  #read and return the next char as a String, or '' at eof.
  def getchar
    eof? and return ''
    return getc.chr
  end

  #-----------------------------------
  #move the stream position back by one.
  def back1char() self.pos-=1 end

  #-----------------------------------
  #return the next len chars without changing the stream position.
  def readahead(len)
    oldpos=pos
    result=read(len)
    self.pos=oldpos

    return result
  end

  #-----------------------------------
  #return the len chars just before the current position, without changing
  #the stream position.
  def readback(len)
    oldpos=pos
    self.pos-=len
    result=read(len)
    self.pos=oldpos

    return result
  end

  #-----------------------------------
  #read up to and including the separator pat; returns the first chunk that
  #each(pat) yields.
  #NOTE(review): if each yields nothing, the value of the each call itself is
  #returned -- confirm callers never hit that case.
  def readuntil(pat)
    each(pat) { |match|
      return match
    }
  end



  #-----------------------------------------------------------------------
  #a String with the duck-type of a File
  #just enough is emulated to fool RubyLexer
  class FakeFile < ::String #thanks to murphy for this lovely.

    def initialize(*)
      super
      @pos = 0
    end

    #current read position, as an index into the string
    attr_accessor :pos

    #return up to x chars starting at the current position and advance past
    #them. the position is clipped to the end of the string; at eof the result
    #is '' (not nil).
    def read x
      pos = @pos
      @pos += x
      @pos>size and @pos=size
      self[pos ... @pos]
    end

    #return the element at the current position (self[pos]) and advance by
    #one; nil at eof. String#[] with an integer index gives a Fixnum on ruby
    #1.8 and a 1-char String on later versions.
    def getc
      eof? and return nil
      pos = @pos
      @pos += 1
      self[pos]
    end

    def eof?
      @pos >= size
    end

    #yield each remaining getc result, consuming the rest of the string.
    def each_byte
      until eof?
        yield getc
      end
    end

    def stat #cheezy cheat to make #stat.size work
      self
    end

    def close; end

    def binmode; end

    include IOext


    #-----------------------------------
    #read and return next char if it matches ch
    #else, leave input unread and return nil or false
    def eat_next_if(ch)
      c=self[@pos,1]

      ch.kind_of? Integer and ch=ch.chr

      case c
      when ch then @pos+=1;c
      when '' then nil
      else false
      end
    end

    #-----------------------------------
    #returns previous character in stream
    #without changing stream position
    #or '' if at beginning
    #(self[pos-1]: a Fixnum on ruby 1.8, a 1-char String on later versions)
    def prevchar
      pos==0 ? '' : self[@pos-1]
    end

    #-----------------------------------
    #returns next character in stream
    #without changing stream position
    #or nil if at end
    #(self[pos]: a Fixnum on ruby 1.8, a 1-char String on later versions)
    def nextchar
      self[@pos]
    end

    #-----------------------------------
    def getchar #returns String
      eof? and return ''
      pos = @pos
      @pos += 1
      self[pos,1]
    end

    #-----------------------------------
    def back1char() @pos-=1 end

    #-----------------------------------
    #return the next len chars without changing the position.
    def readahead(len)
      self[@pos,len]
    end

    #-----------------------------------
    #return the len chars just before the current position.
    #assert is not defined in this file; presumably it comes from assert.rb.
    def readback(len)
      assert @pos-len>=0
      self[@pos-len,len]
    end


  end

end
|
244
|
+
|
245
|
+
#give every real IO object (files, pipes, ...) the IOext scanning helpers.
IO.send(:include, IOext)
|
data/require.rb
ADDED
@@ -0,0 +1,103 @@
|
|
1
|
+
|
2
|
+
#wrapper versions of all commands that import code into a running program:
|
3
|
+
#require, load, eval and friends. the wrapped versions pass the code to
|
4
|
+
#import to rubylexervsruby, to test whether it gets lexed correctly. an
|
5
|
+
#exception is raised if a lex error happens, else the code should behave
|
6
|
+
#as normal, just much slower.
|
7
|
+
#Kernel-level wrappers around require, load and eval. each wrapper keeps the
#original implementation reachable under a stdlib__ alias (stdlib__require,
#stdlib__load, stdlib__eval). eval, when given a binding, passes the code to
#rubylexervsruby before handing it on to stdlib__eval.
#NOTE(review): this class looks unfinished; things to confirm before relying on it:
# - 'huh' (in load, in eval and at the end of the class body) is not defined
#   anywhere in this file; presumably a placeholder marking open questions.
# - Kernel is reopened with 'class'; Kernel is normally a module -- confirm
#   that this loads at all.
# - 'when ...:' (colon in place of 'then') is, as far as I know, only accepted
#   by ruby 1.8 -- confirm the target ruby version.
# - require: $".grep(feat) returns an Array, which is truthy even when empty,
#   so the 'return(true)' guard appears to fire every time -- confirm.
# - eval: 'filename' is not a parameter or local of the method (the parameter
#   is called 'name').
# - require_name_resolve: add_ext is assigned but not read again in the method.
# - File.abs_path?, Binding.of_caller and rubylexervsruby are not defined in
#   this file; presumably provided elsewhere.
class Kernel
|
8
|
+
|
9
|
+
System_extension_extension=
|
10
|
+
case RUBY_PLATFORM
|
11
|
+
when /darwin/: 'o'
|
12
|
+
when /windows/i: 'dll'
|
13
|
+
else 'so'
|
14
|
+
end
|
15
|
+
System_ext_rex=/\.#{System_extension_extension}$/o
|
16
|
+
|
17
|
+
def require_name_resolve(name)
|
18
|
+
add_ext=case name
|
19
|
+
when System_ext_rex,/\.rb$/:
|
20
|
+
else name=/(#{name})(#{System_ext_rex}|\.rb)?/
|
21
|
+
end
|
22
|
+
name=/#{File::SEPARATOR}#{name}$/
|
23
|
+
$:.find{|dir|
|
24
|
+
dir.chomp File::SEPARATOR
|
25
|
+
Dir[/#{dir}#{name}/]
|
26
|
+
}
|
27
|
+
end
|
28
|
+
|
29
|
+
|
30
|
+
#really jonesing for :wrap here
|
31
|
+
alias stdlib__require require
|
32
|
+
def require feat
|
33
|
+
name=feat
|
34
|
+
name=require_name_resolve(name) unless File.abs_path?(name)
|
35
|
+
return(false) unless name
|
36
|
+
return(true) if $".grep(feat)
|
37
|
+
$"<<feat
|
38
|
+
return stdlib__require(name) if name[System_ext_rex]
|
39
|
+
load name
|
40
|
+
end
|
41
|
+
|
42
|
+
alias stdlib__load load
|
43
|
+
def load name,wrap=false
|
44
|
+
name=$:.find{|dir| Dir[dir,name]} unless File.abs_path?(name)
|
45
|
+
if wrap then Module.new {
|
46
|
+
eval File.read(name), huh binding, name,1
|
47
|
+
} else eval File.read(name), huh binding, name,1
|
48
|
+
end
|
49
|
+
true
|
50
|
+
end
|
51
|
+
|
52
|
+
@@evalpos=1 #eval saves a position for the next eval sometimes... when?
|
53
|
+
alias stdlib__eval eval
|
54
|
+
def eval code,binding=nil,name='(eval)',linenum=1
|
55
|
+
if binding
|
56
|
+
rubylexervsruby(code, :name=>name, :linenum=>linenum, :locals=>eval("local_variables",binding))
|
57
|
+
|
58
|
+
return stdlib__eval code,binding,filename,linenum
|
59
|
+
end
|
60
|
+
huh Binding.of_caller{|bg| eval code,bg,name,linenum}
|
61
|
+
end
|
62
|
+
|
63
|
+
huh#got to do module_eval, class_eval, instance_eval, etc
|
64
|
+
end
|
65
|
+
|
66
|
+
class Object
  alias stdlib__instance_eval instance_eval

  #replacement for instance_eval that sends string code through eval (so that
  #a wrapped eval gets to see it) instead of evaluating it directly.
  #code:  the source string; omit it when a block is given.
  #rest:  optional file name and line number, passed on to eval unchanged.
  #block: if given, it is run by the original instance_eval and code is ignored.
  #raises ArgumentError when neither code nor a block is supplied.
  def instance_eval(code=nil,*rest,&block)
    block and return stdlib__instance_eval(&block)
    code or raise ArgumentError, "instance_eval needs a code string or a block"
    #a binding captured inside the original instance_eval has self (and the
    #method definition target) set up the way the string form expects
    eval code, stdlib__instance_eval{binding}, *rest
  end
end
|
73
|
+
|
74
|
+
class Module
  alias stdlib__module_eval module_eval

  #replacement for module_eval that sends string code through eval (so that a
  #wrapped eval gets to see it). this used to be a plain alias of
  #instance_eval, which made 'def' inside the evaluated code create singleton
  #methods on the module instead of instance methods.
  #code:  the source string; omit it when a block is given.
  #rest:  optional file name and line number, passed on to eval unchanged.
  #block: if given, it is run by the original module_eval and code is ignored.
  #raises ArgumentError when neither code nor a block is supplied.
  def module_eval(code=nil,*rest,&block)
    block and return stdlib__module_eval(&block)
    code or raise ArgumentError, "module_eval needs a code string or a block"
    #a binding captured inside the original module_eval keeps this module as
    #both self and the target of 'def'
    eval code, stdlib__module_eval{binding}, *rest
  end
end
|
78
|
+
|
79
|
+
class Class
  alias stdlib__class_eval class_eval

  #replacement for class_eval that sends string code through eval (so that a
  #wrapped eval gets to see it). this used to be a plain alias of
  #instance_eval, which made 'def' inside the evaluated code create singleton
  #methods on the class instead of instance methods.
  #code:  the source string; omit it when a block is given.
  #rest:  optional file name and line number, passed on to eval unchanged.
  #block: if given, it is run by the original class_eval and code is ignored.
  #raises ArgumentError when neither code nor a block is supplied.
  def class_eval(code=nil,*rest,&block)
    block and return stdlib__class_eval(&block)
    code or raise ArgumentError, "class_eval needs a code string or a block"
    #a binding captured inside the original class_eval keeps this class as
    #both self and the target of 'def'
    eval code, stdlib__class_eval{binding}, *rest
  end
end
|
83
|
+
|
84
|
+
|
85
|
+
class Binding
  alias stdlib__eval eval

  #replacement for Binding#eval: passes the code to rubylexervsruby first,
  #then evaluates it in this binding with the original implementation.
  #code:    the source string to evaluate
  #name:    file name to report for the evaluated code
  #linenum: line number to report for the first line of the code
  def eval code,name='(eval)',linenum=1
    #local_variables has to be fetched through stdlib__eval: a plain 'eval'
    #here would re-enter this very method and never return.
    rubylexervsruby(code, :name=>name, :linenum=>linenum, :locals=>stdlib__eval("local_variables"))

    #'huh' is left in place on purpose: it is not defined in this file and
    #appears to be the author's marker for the unfinished step described next.
    huh #should set code to (effectively) output of tokentest
        #how to do that within rubylexervsruby

    #the original Binding#eval takes (code, filename, lineno); the binding is
    #the receiver, not an argument.
    return stdlib__eval(code,name,linenum)
  end
end
|
96
|
+
|
97
|
+
=begin
|
98
|
+
def Module
|
99
|
+
def new
|
100
|
+
o=Object.extend self
|
101
|
+
end
|
102
|
+
end
|
103
|
+
=end
|