rubylexer 0.6.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/COPYING +510 -0
- data/README +134 -0
- data/Rantfile +37 -0
- data/assert.rb +31 -0
- data/charhandler.rb +84 -0
- data/charset.rb +76 -0
- data/context.rb +174 -0
- data/howtouse.txt +136 -0
- data/io.each_til_charset.rb +247 -0
- data/require.rb +103 -0
- data/rlold.rb +12 -0
- data/rubycode.rb +44 -0
- data/rubylexer.rb +1589 -0
- data/rulexer.rb +532 -0
- data/symboltable.rb +65 -0
- data/testcode/deletewarns.rb +39 -0
- data/testcode/dumptokens.rb +38 -0
- data/testcode/locatetest +12 -0
- data/testcode/rubylexervsruby.rb +104 -0
- data/testcode/rubylexervsruby.sh +51 -0
- data/testcode/tokentest.rb +237 -0
- data/testcode/torment +51 -0
- data/testdata/1.rb.broken +729 -0
- data/testdata/23.rb +24 -0
- data/testdata/g.rb +15 -0
- data/testdata/newsyntax.rb +18 -0
- data/testdata/noeolatend.rb +1 -0
- data/testdata/p.rb +1227 -0
- data/testdata/pleac.rb.broken +6282 -0
- data/testdata/pre.rb +33 -0
- data/testdata/pre.unix.rb +33 -0
- data/testdata/regtest.rb +621 -0
- data/testdata/tokentest.assert.rb.can +7 -0
- data/testdata/untitled1.rb +1 -0
- data/testdata/w.rb +22 -0
- data/testdata/wsdlDriver.rb +499 -0
- data/testing.txt +130 -0
- data/testresults/placeholder +0 -0
- data/token.rb +486 -0
- data/tokenprinter.rb +152 -0
- metadata +76 -0
data/howtouse.txt
ADDED
@@ -0,0 +1,136 @@
|
|
1
|
+
|
2
|
+
Using rubylexer:
|
3
|
+
require "rubylexer.rb"
|
4
|
+
,then
|
5
|
+
lexer=RubyLexer.new(a_file_name, opened_File_or_String)
|
6
|
+
until EoiToken===(tok=lexer.get1token)
|
7
|
+
...do stuff w/ toks...
|
8
|
+
end
|
9
|
+
|
10
|
+
For a slightly expanded version of this example, see testcode/dumptokens.rb.
|
11
|
+
|
12
|
+
tok will be a subclass of Token. there are many token classes (see token.rb)
|
13
|
+
however, all tokens have some common methods:
|
14
|
+
to_s #return a string containing ruby code representing that token
|
15
|
+
ident #return internal form of token; use with caution
|
16
|
+
offset #offset in file of start of token
|
17
|
+
error #returns a string if there was a lex error at this position, else nil
|
18
|
+
|
19
|
+
here's a list of token subclasses and their meaning:
|
20
|
+
(note: indentation indicates inheritance)
|
21
|
+
|
22
|
+
WToken #(mostly useless?) abstract superclass for KeywordToken,
|
23
|
+
#OperatorToken, VarNameToken, and HerePlaceholderToken
|
24
|
+
#but not (confusingly) MethNameToken (perhaps that'll change)
|
25
|
+
KeywordToken #a ruby keyword or non-overridable punctuation char(s)
|
26
|
+
OperatorToken #overridable operators
|
27
|
+
VarNameToken #a name that represents a variable
|
28
|
+
HerePlaceholderToken #represents the header of a here string. subclass of WToken
|
29
|
+
MethNameToken #the name of a method: the uncoloned
|
30
|
+
#symbols allowed in 'alias' and 'undef' statements and all names
|
31
|
+
#which follow a 'def',
|
32
|
+
#'::', or '.', as well as other call sites. operators used as
|
33
|
+
#method names will appear as methnametokens.
|
34
|
+
#confusingly, this is not a WToken.
|
35
|
+
NumberToken #a literal number, including character constants
|
36
|
+
SymbolToken #a symbol
|
37
|
+
NewlineToken #represents an (unescaped) newline.
|
38
|
+
StringToken #represents a string. unlike all other tokens, strings might contain
|
39
|
+
#other tokens. if the string used interpolation, tokens inside #{ }
|
40
|
+
#are considered subtokens of the string. StringToken#elems returns
|
41
|
+
#an array whose elements are sections of uninterpolated string (in
|
42
|
+
#the even indices) and arrays of subtokens (in the odd indices).
|
43
|
+
#this notion of subtokens is an unfortunate one and will go away in
|
44
|
+
#a future release.
|
45
|
+
RenderExactlyStringToken #a subclass of StringToken; used to represent regexes and other string-like thingys
|
46
|
+
|
47
|
+
ErrorToken #actually a module that may be mixed in to any token. indicates an error in the input at (or
|
48
|
+
#near) that position. You may continue getting tokens after an error token is encountered,
|
49
|
+
#and I try to make this work as well as possible, but I can not guarantee correctness after
|
50
|
+
#an error.
|
51
|
+
#note: any token may be an ErrorToken, including IgnoreToken, EoiToken, a subtoken of a
|
52
|
+
#StringToken, etc. Please take this into account in your error processing.
|
53
|
+
|
54
|
+
IgnoreToken #superclass for tokens without semantic meaning to a parser
|
55
|
+
WsToken #whitespace
|
56
|
+
EscNlToken #implicitly or explicitly escaped newline
|
57
|
+
EoiToken #end of source file. always the last token
|
58
|
+
HereBodyToken #the actual body of the here string. subclass of IgnoreToken
|
59
|
+
OutlinedHereBodyToken #hacky subclass of HereBodyToken... will disappear once strings are done right.
|
60
|
+
|
61
|
+
ZwToken #informational IgnoreTokens. (parsers might need to look at some of these, actually.)
|
62
|
+
NoWsToken #no whitespace was on either side of this token. kind of a hack
|
63
|
+
#to help TokenPrinter work correctly in certain cases.
|
64
|
+
|
65
|
+
ImplicitParamListStartToken #if you leave the parentheses out in a function
|
66
|
+
ImplicitParamListEndToken #call, a pair of these will be generated instead
|
67
|
+
|
68
|
+
KwParamListStartToken #the when,for,and rescue keywords take a comma-
|
69
|
+
KwParamListEndToken #delimited list. these tokens enclose those lists.
|
70
|
+
|
71
|
+
AssignmentRhsListStartToken #encloses the right hand side of an assignment,
|
72
|
+
AssignmentRhsListEndToken #including both single and multiple assignment.
|
73
|
+
|
74
|
+
FileAndLineToken #generated at every newline, escaped or unescaped. the file
|
75
|
+
#and line methods of this class return the file and line at
|
76
|
+
#that point in the token stream. (not always working right now.)
|
77
|
+
|
78
|
+
|
79
|
+
Subclasses of WToken provide an === method for comparing the token to a String or Regexp.
|
80
|
+
|
81
|
+
The different types of string and how to distinguish:
|
82
|
+
For the most part you can tell what was what by looking at StringToken#char.
|
83
|
+
Single and double quotes can't be distinguished this way, and neither can you tell a fancy
|
84
|
+
string (starting with %) from the regular kind. If you want to have more... one option
|
85
|
+
with the current code is to just go and look in your input what was at StringToken#offset.
|
86
|
+
|
87
|
+
Eventually, (in version 0.8) string boundaries, bodies, and inclusions will all be separate
|
88
|
+
tokens laid out linearly in the token stream. This is the way matz handles things, and it's
|
89
|
+
much cleaner. I'll make sure that the string start token at that time contains all the
|
90
|
+
info you could want. If you really can't wait for 0.8 and can't stand #offset, I can hack
|
91
|
+
in a method to StringToken that tells you exactly what char(s) opened the string.
|
92
|
+
|
93
|
+
Certain keywords (if, unless, while, until, do) may or may not have an associated end keyword.
|
94
|
+
For instance, these have ends associated:
|
95
|
+
|
96
|
+
if something then
|
97
|
+
do_something
|
98
|
+
end
|
99
|
+
|
100
|
+
a.each do|x|
|
101
|
+
x.something_about_it
|
102
|
+
end
|
103
|
+
|
104
|
+
And these do not:
|
105
|
+
|
106
|
+
do_something if something
|
107
|
+
|
108
|
+
for x in a do
|
109
|
+
x.something_about_it
|
110
|
+
end #paired to the for, not the do!
|
111
|
+
|
112
|
+
A KeywordToken is generated by rubylexer in either case, but you can now use the has_end?
|
113
|
+
method of KeywordToken to determine whether an end should be expected for a particular if or not.
|
114
|
+
|
115
|
+
api stability:
|
116
|
+
Future changes to the user-visible api will happen in a backwards-compatible way, so that
|
117
|
+
if the interface changes, there will be a (probably quite long) transition period during
|
118
|
+
which both the old and new interfaces are supported. The idea is to give users plenty of
|
119
|
+
time to adapt to changes. That promise goes for all the changes described below.
|
120
|
+
|
121
|
+
In cases where the 2 are incompatible, (inspired by rubygems) I've come up with this:
|
122
|
+
|
123
|
+
RubyLexer.version(0.6).new(...args...) #request the 0.6 api
|
124
|
+
|
125
|
+
This actually works currently; it enables the old api where errors cause an exception instead
|
126
|
+
of generating ErrorTokens. The default will always be to use the new api.
|
127
|
+
|
128
|
+
StringToken will go away; replaced by multiple token types, like in ruby. StringToken
|
129
|
+
subclasses will need reorganization at this point too... tokens in an interpolation
|
130
|
+
will no longer be 'subtokens' but full-fledged tokens in their own right.
|
131
|
+
i intend to make a namespace for all rubylexer classes at some point... shouldn't
|
132
|
+
be a big deal; old clients can just include the namespace module.
|
133
|
+
Token#ident may be taken away or change without notice.
|
134
|
+
MethNameToken may become a WToken
|
135
|
+
HereBodyToken should really be a string subclass...
|
136
|
+
|
@@ -0,0 +1,247 @@
|
|
1
|
+
=begin copyright
|
2
|
+
rubylexer - a ruby lexer written in ruby
|
3
|
+
Copyright (C) 2004,2005 Caleb Clausen
|
4
|
+
|
5
|
+
This library is free software; you can redistribute it and/or
|
6
|
+
modify it under the terms of the GNU Lesser General Public
|
7
|
+
License as published by the Free Software Foundation; either
|
8
|
+
version 2.1 of the License, or (at your option) any later version.
|
9
|
+
|
10
|
+
This library is distributed in the hope that it will be useful,
|
11
|
+
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
12
|
+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
13
|
+
Lesser General Public License for more details.
|
14
|
+
|
15
|
+
You should have received a copy of the GNU Lesser General Public
|
16
|
+
License along with this library; if not, write to the Free Software
|
17
|
+
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
18
|
+
=end
|
19
|
+
|
20
|
+
|
21
|
+
module IOext
|
22
|
+
#read until a character in a user-supplied set is found.
|
23
|
+
#charrex must be a regexp that contains _only_ a single character class
|
24
|
+
#read forward, blocksize chars at a time, until a char matching charrex
#turns up or the input runs out. returns everything read before the
#matching char as one String ('' if nothing was read); the stream is left
#positioned on the matching char itself, so it is the next thing read.
#charrex must be a regexp that contains _only_ a single character class;
#the rewind below relies on the match being exactly one char wide.
def til_charset(charrex, blocksize = 16)
  blocks = []
  until eof?
    #if near eof, less than a full block may have been read
    block = read(blocksize)

    if (m = charrex.match(block))
      #give back the matched char plus everything read after it
      self.pos -= m.post_match.length + 1

      blocks << m.pre_match unless m.pre_match.empty?
      break
    end
    blocks << block
  end
  #join, not to_s: Array#to_s only concatenates on ruby 1.8; later
  #versions return the inspect form of the array instead
  blocks.join
end
|
42
|
+
|
43
|
+
|
44
|
+
|
45
|
+
|
46
|
+
|
47
|
+
#-----------------------------------
|
48
|
+
#read and return next char if it matches ch
|
49
|
+
#else, leave input unread and return nil or false
|
50
|
+
#read and return the next char if it matches ch (compared with ===, so
#ch may be a String or a Regexp; an Integer is converted with chr first).
#otherwise the input is left unread and the result is nil when there was
#nothing left to read, or false when the next char simply did not match.
def eat_next_if(ch)
  oldpos = pos
  c = read(1)

  ch = ch.chr if ch.kind_of?(Integer)

  return c if ch === c

  self.pos = oldpos
  #end of input shows up as nil from IO#read(1); '' is also accepted
  #for stand-ins whose read returns an empty string at the end
  (c.nil? || c.empty?) ? nil : false
end
|
62
|
+
|
63
|
+
#-----------------------------------
|
64
|
+
#read chars for as long as each one matches pat (compared with ===; an
#Integer pat is converted with chr first) and return them as one String.
#the first non-matching char is pushed back so it will be read again.
def eat_while(pat)
  pat = pat.chr if pat.kind_of?(Integer)

  result = ''
  loop {
    ch = read(1)
    unless pat === ch
      #nil (or '', from stand-ins whose read returns an empty string at
      #the end) means eof: nothing was consumed, so there is nothing to
      #push back
      back1char unless ch.nil? || ch.empty?
      return result
    end
    result << ch
  }
end
|
78
|
+
|
79
|
+
#-----------------------------------
|
80
|
+
#returns previous character in stream
|
81
|
+
#without changing stream position
|
82
|
+
#or '' if at beginning
|
83
|
+
#peek backwards: give the char just before the current position as a
#String ('' when already at the very start). one step back followed by
#one getc leaves the position as it was for single-byte chars.
def prevchar
  return '' if pos.zero?

  back1char
  getc.chr
end
|
89
|
+
|
90
|
+
#-----------------------------------
|
91
|
+
#returns next character in stream
|
92
|
+
#without changing stream position
|
93
|
+
#or nil if at end
|
94
|
+
def nextchar
|
95
|
+
eof? and return nil
|
96
|
+
|
97
|
+
result=getc
|
98
|
+
back1char
|
99
|
+
return result
|
100
|
+
end
|
101
|
+
|
102
|
+
#-----------------------------------
|
103
|
+
#this should really be in class File...
|
104
|
+
#consume one char and hand it back as a String; '' once the input is
#exhausted
def getchar
  eof? ? '' : getc.chr
end
|
108
|
+
|
109
|
+
#-----------------------------------
|
110
|
+
#step the stream position back by one; the result is the new position
def back1char
  self.pos = pos - 1
end
|
111
|
+
|
112
|
+
#-----------------------------------
|
113
|
+
#look at the next len chars without consuming them: read, then put the
#position back where it started. the result is whatever read(len) gave.
def readahead(len)
  mark = pos
  lookahead = read(len)
  self.pos = mark

  lookahead
end
|
120
|
+
|
121
|
+
#-----------------------------------
|
122
|
+
#look at the len chars just before the current position: back up by len,
#read them, then restore the position that was current on entry.
def readback(len)
  mark = pos
  self.pos = mark - len
  behind = read(len)
  self.pos = mark

  behind
end
|
130
|
+
|
131
|
+
#-----------------------------------
|
132
|
+
#hand back the first chunk that each(pat) yields, i.e. the input up to
#and including the next occurrence of the separator pat. if each yields
#nothing at all, the result is whatever each itself returns.
def readuntil(pat)
  each(pat) do |chunk|
    return chunk
  end
end
|
137
|
+
|
138
|
+
|
139
|
+
|
140
|
+
#-----------------------------------------------------------------------
|
141
|
+
#a String with the duck-type of a File
|
142
|
+
#just enough is emulated to fool RubyLexer
|
143
|
+
class FakeFile < ::String #thanks to murphy for this lovely.
  #the generic helpers come from IOext; the methods defined further down
  #replace some of them with versions that index the string directly
  include IOext

  #current read offset into the string
  attr_accessor :pos

  #takes whatever String.new takes; reading starts at offset 0
  def initialize(*)
    super
    @pos = 0
  end

  #return the slice starting at the current offset and at most x long,
  #and advance past it. the offset is never moved beyond the end of the
  #string, so at the end this gives '' rather than nil.
  def read(x)
    start = @pos
    @pos = [start + x, size].min
    self[start...@pos]
  end

  #return self[offset] and step past it; nil once the end is reached
  def getc
    return nil if eof?

    @pos += 1
    self[@pos - 1]
  end

  #true once the offset has reached (or passed) the end of the string
  def eof?
    size <= @pos
  end

  #yield the result of getc over and over until the end is reached
  def each_byte
    yield getc until eof?
  end

  def stat #cheezy cheat to make #stat.size work
    self
  end

  #accepted for File compatibility; they do nothing here
  def close; end

  def binmode; end

  #-----------------------------------
  #read and return next char if it matches ch
  #else, leave input unread and return nil (nothing left) or false
  def eat_next_if(ch)
    here = self[@pos, 1]

    ch = ch.chr if ch.kind_of?(Integer)

    if ch === here
      @pos += 1
      here
    elsif '' === here
      nil
    else
      false
    end
  end

  #-----------------------------------
  #returns previous character in stream
  #without changing stream position
  #or '' if at beginning
  #(original note: returns Fixnum -- that matches String#[] with an
  #integer index on ruby 1.8; verify on newer rubies)
  def prevchar
    return '' if pos == 0

    self[@pos - 1]
  end

  #-----------------------------------
  #returns next character in stream
  #without changing stream position
  #or nil if at end
  #(original note: returns Fixnum -- see prevchar)
  def nextchar
    self[@pos]
  end

  #-----------------------------------
  #consume one char and return it as a String; '' at the end
  def getchar
    return '' if eof?

    @pos += 1
    self[@pos - 1, 1]
  end

  #-----------------------------------
  #step the offset back by one; no check against going below 0
  def back1char
    @pos -= 1
  end

  #-----------------------------------
  #the next len chars, without moving the offset
  def readahead(len)
    self[@pos, len]
  end

  #-----------------------------------
  #the len chars just before the offset, without moving it.
  #asserts that the offset is at least len.
  def readback(len)
    assert @pos - len >= 0
    self[@pos - len, len]
  end
end
|
242
|
+
|
243
|
+
end
|
244
|
+
|
245
|
+
#reopen the built-in IO class and mix IOext in, so that the helper
#methods of that module become callable on IO objects
class IO
|
246
|
+
include IOext
|
247
|
+
end
|
data/require.rb
ADDED
@@ -0,0 +1,103 @@
|
|
1
|
+
|
2
|
+
#wrapper versions of all commands that import code into a running program:
|
3
|
+
#require, load, eval and friends. the wrapped versions pass the code to
|
4
|
+
#import to rubylexervsruby, to test whether it gets lexed correctly. an
|
5
|
+
#exception is raised if a lex error happens, else the code should behave
|
6
|
+
#as normal, just much slower.
|
7
|
+
class Kernel
|
8
|
+
|
9
|
+
#file extension (without the dot) that a compiled extension library is
#expected to carry on the running platform.
#the case branches use 'then'; the older 'when x: y' spelling is no
#longer valid syntax from ruby 1.9 on.
#NOTE(review): RUBY_PLATFORM usually reads like 'mswin32'/'mingw32' on
#windows and the extension recorded in RbConfig::CONFIG['DLEXT'] may
#differ from the values chosen here -- confirm before relying on them.
System_extension_extension =
  case RUBY_PLATFORM
  when /darwin/ then 'o'
  when /windows/i then 'dll'
  else 'so'
  end
#matches a name that ends in a dot followed by that extension
System_ext_rex = /\.#{System_extension_extension}$/o
|
16
|
+
|
17
|
+
#find the file that a require of name refers to by walking the load
#path ($:). returns the path of the first existing file as a String, or
#nil when no directory on the load path has one.
#a name that already ends in .rb or in the platform's extension-library
#suffix is looked up as given; any other name is tried with .rb, then
#with the extension-library suffix, then bare.
#(the earlier version handed Regexps to Dir[], which only takes glob
#strings, and returned the matching directory instead of the file.)
def require_name_resolve(name)
  candidates =
    case name
    when System_ext_rex, /\.rb$/ then [name]
    else ["#{name}.rb", "#{name}.#{System_extension_extension}", name]
    end

  $:.each do |dir|
    candidates.each do |candidate|
      path = File.join(dir.to_s, candidate)
      return path if File.file?(path)
    end
  end
  nil
end
|
28
|
+
|
29
|
+
|
30
|
+
#really jonesing for :wrap here
alias stdlib__require require

#replacement for require that routes ruby source through the wrapped
#load in this file (so it gets lexed as well); the original require
#stays available as stdlib__require and is still used for names ending
#in the platform's extension-library suffix.
#returns false when the feature cannot be located, true when it is
#already listed in $", otherwise the result of the underlying load.
#NOTE(review): File.abs_path? is not defined in this file -- presumably
#supplied elsewhere; confirm. the true/false results also differ from
#the stock require (which raises when nothing is found) -- confirm that
#this is intended.
def require feat
  name = feat
  name = require_name_resolve(name) unless File.abs_path?(name)
  return(false) unless name
  #membership test: Array#grep always returns an Array, which is truthy
  #even when empty, so testing its result would skip every load
  return(true) if $".include?(feat)
  $" << feat
  return stdlib__require(name) if name[System_ext_rex]
  load name
end
|
41
|
+
|
42
|
+
#replacement for load: reads the named file and passes its text to eval
#(the wrapped eval, when the wrapper in this file is in effect), then
#returns true. the original load stays available as stdlib__load.
#with wrap set, the eval happens inside the block given to Module.new.
#NOTE(review): 'huh' is not defined anywhere in this file; it looks like
#a marker for the still-undecided binding to evaluate in -- confirm.
#NOTE(review): Dir[dir,name] returns an Array, which is truthy even when
#empty, so the find below yields the first load-path entry (a directory,
#not the file to read) -- needs attention before this can work.
#NOTE(review): File.abs_path? is not defined in this file -- confirm
#where it comes from.
alias stdlib__load load
|
43
|
+
def load name,wrap=false
|
44
|
+
name=$:.find{|dir| Dir[dir,name]} unless File.abs_path?(name)
|
45
|
+
if wrap then Module.new {
|
46
|
+
eval File.read(name), huh binding, name,1
|
47
|
+
} else eval File.read(name), huh binding, name,1
|
48
|
+
end
|
49
|
+
true
|
50
|
+
end
|
51
|
+
|
52
|
+
@@evalpos=1 #eval saves a position for the next eval sometimes... when?
|
53
|
+
alias stdlib__eval eval

#replacement for eval: when a binding is given, the code is first passed
#to rubylexervsruby (together with its name, starting line and the local
#variable names visible in that binding) and is then evaluated by the
#original eval, whose result is returned.
#without a binding, the caller's binding is obtained via
#Binding.of_caller and the call is repeated with it.
#NOTE(review): 'huh' is not defined in this file; it looks like a marker
#for unfinished code -- confirm.
def eval code,binding=nil,name='(eval)',linenum=1
  if binding
    #must be the original eval: calling the wrapper here would re-enter
    #this branch again and again
    locals = stdlib__eval("local_variables", binding)
    rubylexervsruby(code, :name=>name, :linenum=>linenum, :locals=>locals)

    #name is the file-name argument of this method
    return stdlib__eval(code, binding, name, linenum)
  end
  huh Binding.of_caller{|bg| eval code,bg,name,linenum}
end
|
62
|
+
|
63
|
+
huh#got to do module_eval, class_eval, instance_eval, etc
|
64
|
+
end
|
65
|
+
|
66
|
+
class Object
  alias stdlib__instance_eval instance_eval

  #replacement for instance_eval. with a block, the original
  #implementation (kept as stdlib__instance_eval) is used unchanged.
  #with a string, the code goes through eval using a binding captured
  #inside the receiver, so self is the receiver while it runs; any
  #further arguments (file name, line number) are handed on to eval.
  #code is optional so that the block-only form keeps working.
  def instance_eval(code=nil, *rest, &block)
    return stdlib__instance_eval(&block) if block

    eval(code, stdlib__instance_eval { binding }, *rest)
  end
end
|
73
|
+
|
74
|
+
#keep the built-in module_eval reachable as stdlib__module_eval, then
#make module_eval another name for the instance_eval visible in Module.
#NOTE(review): presumably this is meant to pick up the instance_eval
#wrapper that this file defines on Object -- confirm; code evaluated
#through instance_eval and through module_eval does not treat 'def'
#the same way.
class Module
|
75
|
+
alias stdlib__module_eval module_eval
|
76
|
+
alias module_eval instance_eval
|
77
|
+
end
|
78
|
+
|
79
|
+
#keep the built-in class_eval reachable as stdlib__class_eval, then make
#class_eval another name for the instance_eval visible in Class.
#NOTE(review): presumably this is meant to pick up the instance_eval
#wrapper that this file defines on Object -- confirm; code evaluated
#through instance_eval and through class_eval does not treat 'def' the
#same way.
class Class
|
80
|
+
alias stdlib__class_eval class_eval
|
81
|
+
alias class_eval instance_eval
|
82
|
+
end
|
83
|
+
|
84
|
+
|
85
|
+
#wrapper for eval on Binding objects: the code is passed to
#rubylexervsruby first and then to the eval that was aliased to
#stdlib__eval, with self as the binding argument.
#NOTE(review): 'filename' is not defined in this method; the file-name
#parameter here is called 'name' -- needs attention.
#NOTE(review): the eval("local_variables",binding) call below has no
#explicit receiver, so inside this class it resolves to this very
#method (with the binding landing in the name parameter) -- it looks
#like the aliased original was intended; confirm.
#NOTE(review): stdlib__eval is called with four arguments (code,
#binding, file, line); whether the aliased method accepts that depends
#on which eval the alias picks up on the running ruby -- confirm.
#NOTE(review): 'huh' is not defined in this file; it looks like a marker
#for unfinished code -- confirm.
class Binding
|
86
|
+
alias stdlib__eval eval
|
87
|
+
def eval code,name='(eval)',linenum=1
|
88
|
+
rubylexervsruby(code, :name=>name, :linenum=>linenum, :locals=>eval("local_variables",binding))
|
89
|
+
|
90
|
+
huh #should set code to (effectively) output of tokentest
|
91
|
+
#how to do that within rubylexervsruby
|
92
|
+
|
93
|
+
return stdlib__eval code,self,filename,linenum
|
94
|
+
end
|
95
|
+
end
|
96
|
+
|
97
|
+
=begin
|
98
|
+
def Module
|
99
|
+
def new
|
100
|
+
o=Object.extend self
|
101
|
+
end
|
102
|
+
end
|
103
|
+
=end
|