rubylexer 0.7.7 → 0.8.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -0
- data/History.txt +64 -0
- data/Makefile +2 -2
- data/README.txt +13 -9
- data/bin/rubylexer +113 -0
- data/lib/assert.rb +1 -1
- data/lib/rubylexer.rb +856 -305
- data/lib/rubylexer/charhandler.rb +1 -1
- data/lib/rubylexer/charset.rb +15 -7
- data/lib/rubylexer/context.rb +10 -2
- data/lib/rubylexer/lextable.rb +1 -0
- data/lib/rubylexer/rubycode.rb +1 -1
- data/lib/rubylexer/rulexer.rb +106 -32
- data/lib/rubylexer/symboltable.rb +1 -1
- data/lib/rubylexer/test/oneliners.rb +15 -5
- data/lib/rubylexer/test/oneliners_1.9.rb +116 -92
- data/lib/rubylexer/test/stanzas.rb +49 -27
- data/lib/rubylexer/test/testcases.rb +2 -2
- data/lib/rubylexer/token.rb +153 -23
- data/lib/rubylexer/tokenprinter.rb +9 -6
- data/lib/rubylexer/version.rb +1 -1
- data/rubylexer.gemspec +12 -8
- data/test/bad/ruby_lexer.rb +7 -0
- data/test/code/deletewarns.rb +1 -1
- data/test/code/dumptokens.rb +1 -81
- data/test/code/heredoc_blast_test.rb +112 -0
- data/test/code/locatetest.rb +1 -1
- data/test/code/regression.rb +23 -23
- data/test/code/rubylexervsruby.rb +59 -12
- data/test/code/tokentest.rb +62 -52
- data/test/data/23.rb +0 -1
- data/test/data/g.rb +0 -1
- data/test/data/heremonsters.rb +1 -1
- data/test/data/heremonsters_dos.rb +1 -1
- data/test/data/pre.rb +0 -1
- data/test/data/pre.unix.rb +0 -1
- data/test/data/putstext.rb +4 -0
- data/test/data/regtest.rb +0 -1
- data/test/data/stuffydog.rb +5 -0
- data/test/data/stuffydog2.rb +5 -0
- data/test/data/wsdlDriver.rb +0 -1
- data/test/test.sh +1 -1
- data/test/test_all.rb +3 -0
- data/test/test_bad_rubylexer.rb +16 -0
- data/test/test_rubylexer_bad.rb +12 -0
- data/testing.txt +40 -20
- metadata +51 -38
data/test/code/tokentest.rb
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
#!/usr/bin/ruby
|
2
|
-
=begin
|
2
|
+
=begin legalia
|
3
3
|
rubylexer - a ruby lexer written in ruby
|
4
|
-
Copyright (C) 2004,2005,2008 Caleb Clausen
|
4
|
+
Copyright (C) 2004,2005,2008, 2011 Caleb Clausen
|
5
5
|
|
6
6
|
This library is free software; you can redistribute it and/or
|
7
7
|
modify it under the terms of the GNU Lesser General Public
|
@@ -24,7 +24,7 @@ require "pp"
|
|
24
24
|
|
25
25
|
class RubyLexer
|
26
26
|
class Token
|
27
|
-
def verify_offset(fd); false end
|
27
|
+
def verify_offset(fd,lexer); false end
|
28
28
|
|
29
29
|
def check_for_error; end
|
30
30
|
end
|
@@ -36,17 +36,17 @@ module ErrorToken
|
|
36
36
|
end
|
37
37
|
|
38
38
|
class FileAndLineToken
|
39
|
-
def verify_offset(fd); true end
|
39
|
+
def verify_offset(fd,lexer); true end
|
40
40
|
end
|
41
41
|
class ImplicitParamListStartToken
|
42
|
-
def verify_offset(fd); true end
|
42
|
+
def verify_offset(fd,lexer); true end
|
43
43
|
end
|
44
44
|
class ImplicitParamListEndToken
|
45
|
-
def verify_offset(fd); true end
|
45
|
+
def verify_offset(fd,lexer); true end
|
46
46
|
end
|
47
47
|
|
48
48
|
module SimpleVerify
|
49
|
-
def verify_offset(fd)
|
49
|
+
def verify_offset(fd,lexer)
|
50
50
|
fd.read(@ident.length)==@ident
|
51
51
|
end
|
52
52
|
end
|
@@ -57,13 +57,13 @@ class MethNameToken; include SimpleVerify; end
|
|
57
57
|
|
58
58
|
class NewlineToken
|
59
59
|
include SimpleVerify
|
60
|
-
def verify_offset(fd)
|
60
|
+
def verify_offset(fd,lexer)
|
61
61
|
super or fd.eof?
|
62
62
|
end
|
63
63
|
end
|
64
64
|
|
65
65
|
class SymbolToken
|
66
|
-
def verify_offset(fd)
|
66
|
+
def verify_offset(fd,lexer)
|
67
67
|
la=fd.read(2)
|
68
68
|
case la
|
69
69
|
when '%s'
|
@@ -98,15 +98,15 @@ end
|
|
98
98
|
|
99
99
|
class EoiToken
|
100
100
|
include SimpleVerify
|
101
|
-
def verify_offset(fd)
|
102
|
-
result=super(fd)
|
101
|
+
def verify_offset(fd,lexer)
|
102
|
+
result=super(fd,lexer)
|
103
103
|
fd.eof?
|
104
104
|
return result
|
105
105
|
end
|
106
106
|
end
|
107
107
|
|
108
108
|
class NoWsToken
|
109
|
-
def verify_offset(fd)
|
109
|
+
def verify_offset(fd,lexer)
|
110
110
|
orig=fd.pos
|
111
111
|
fd.pos=orig-1
|
112
112
|
result= (/^[^\s\v\t\n\r\f]{2}$/===fd.read(2))
|
@@ -116,13 +116,13 @@ class NoWsToken
|
|
116
116
|
end
|
117
117
|
|
118
118
|
class HereBodyToken
|
119
|
-
def verify_offset(fd)
|
120
|
-
@ident.verify_subtoken_offsets(fd)
|
119
|
+
def verify_offset(fd,lexer)
|
120
|
+
@ident.verify_subtoken_offsets(fd,lexer)
|
121
121
|
end
|
122
122
|
end
|
123
123
|
|
124
124
|
class HerePlaceholderToken
|
125
|
-
def verify_offset(fd)
|
125
|
+
def verify_offset(fd,lexer)
|
126
126
|
'<<'==fd.read(2) or return false
|
127
127
|
@dash and ('-'==fd.read(1) or return false)
|
128
128
|
case ch=fd.read(1)[0]
|
@@ -143,14 +143,14 @@ end
|
|
143
143
|
class StringToken
|
144
144
|
FANCY_QUOTE_BEGINNINGS= {'`'=>'%x', '['=>'%w', '{'=>'%W',
|
145
145
|
'"'=>/('|%[^a-pr-z0-9])/i, '/'=>'%r'}
|
146
|
-
def verify_offset(fd)
|
146
|
+
def verify_offset(fd,lexer)
|
147
147
|
fd.read(open.size)==open or return false
|
148
148
|
# str=fd.read(2)
|
149
149
|
# @char==str[0,1] or FANCY_QUOTE_BEGINNINGS[@char]===str or return false
|
150
|
-
verify_subtoken_offsets(fd)
|
150
|
+
verify_subtoken_offsets(fd,lexer)
|
151
151
|
end
|
152
152
|
|
153
|
-
def verify_subtoken_offsets(fd)
|
153
|
+
def verify_subtoken_offsets(fd,lexer)
|
154
154
|
#verify offsets of subtokens
|
155
155
|
@elems.each{|elem|
|
156
156
|
case elem
|
@@ -173,7 +173,7 @@ class StringToken
|
|
173
173
|
#assert now_at<=goal+1 #not needed
|
174
174
|
saw[goal..-1]='' unless goal==now_at
|
175
175
|
saw==elem or return false
|
176
|
-
else elem.verify_offset(fd) or raise LexerError
|
176
|
+
else elem.verify_offset(fd,lexer) or raise LexerError
|
177
177
|
end
|
178
178
|
}
|
179
179
|
return true
|
@@ -188,12 +188,12 @@ class StringToken
|
|
188
188
|
end
|
189
189
|
|
190
190
|
class RubyCode
|
191
|
-
def verify_offset(fd)
|
191
|
+
def verify_offset(fd,lexer)
|
192
192
|
thistok=nexttok=endpos=nil
|
193
193
|
@ident.each_index{ |tok_i|
|
194
194
|
thistok,nexttok=@ident[tok_i,2]
|
195
195
|
endpos=nexttok ? nexttok.offset : thistok.offset+thistok.to_s.size
|
196
|
-
check_offset(thistok,fd,endpos)
|
196
|
+
lexer.check_offset(thistok,fd,endpos)
|
197
197
|
}
|
198
198
|
assert nexttok.nil?
|
199
199
|
assert thistok.object_id==@ident.last.object_id
|
@@ -208,7 +208,7 @@ end
|
|
208
208
|
|
209
209
|
|
210
210
|
class NumberToken
|
211
|
-
def verify_offset(fd)
|
211
|
+
def verify_offset(fd,lexer)
|
212
212
|
/^[0-9?+-]$/===fd.read(1)
|
213
213
|
end
|
214
214
|
end
|
@@ -221,43 +221,48 @@ end
|
|
221
221
|
#end
|
222
222
|
end
|
223
223
|
|
224
|
-
|
225
|
-
|
226
|
-
|
227
|
-
|
228
|
-
|
229
|
-
file
|
230
|
-
|
231
|
-
|
232
|
-
|
233
|
-
|
234
|
-
|
235
|
-
|
236
|
-
|
237
|
-
|
238
|
-
|
239
|
-
|
224
|
+
class RubyLexer
|
225
|
+
public
|
226
|
+
|
227
|
+
attr_reader :offset_failures, :offset_first_failure
|
228
|
+
|
229
|
+
def check_offset(tok,file=nil,endpos=nil)
|
230
|
+
#the errors detected here are now reduced to warnings....
|
231
|
+
file||=@original_file
|
232
|
+
String===file and file=file.to_sequence
|
233
|
+
allow_ooo= @moretokens&&@moretokens[0]&&@moretokens[0].allow_ooo_offset unless endpos
|
234
|
+
endpos||=((@moretokens.empty?)? input_position : @moretokens[0].offset)
|
235
|
+
oldpos=file.pos
|
236
|
+
|
237
|
+
assert Integer===tok.offset
|
238
|
+
assert Integer===endpos
|
239
|
+
if endpos<tok.offset and !allow_ooo
|
240
|
+
$stderr.puts "expected #{endpos} to be >= #{tok.offset} token #{tok.to_s.gsub("\n","\n ")}:#{tok.class}"
|
241
|
+
end
|
240
242
|
|
241
|
-
|
242
|
-
|
243
|
-
|
244
|
-
|
245
|
-
|
246
|
-
|
247
|
-
|
248
|
-
|
249
|
-
|
250
|
-
|
251
|
-
|
243
|
+
file.pos=tok.offset
|
244
|
+
unless tok.verify_offset(file,self)
|
245
|
+
@offset_failures ||= 0
|
246
|
+
@offset_failures += 1
|
247
|
+
@offset_first_failure ||= tok
|
248
|
+
end
|
249
|
+
case tok
|
250
|
+
when RubyLexer::StringToken,RubyLexer::NumberToken,
|
251
|
+
RubyLexer::HereBodyToken,RubyLexer::SymbolToken,
|
252
|
+
RubyLexer::HerePlaceholderToken,
|
253
|
+
RubyLexer::FileAndLineToken #do nothing
|
254
|
+
else
|
255
|
+
file.pos==endpos or allow_ooo or
|
256
|
+
$stderr.puts "positions don't line up, expected #{endpos}, got #{file.pos}, token: #{tok.to_s.gsub("\n","\n ") }"
|
257
|
+
end
|
258
|
+
file.pos=oldpos
|
259
|
+
return
|
252
260
|
end
|
253
|
-
file.pos=oldpos
|
254
|
-
return
|
255
261
|
end
|
256
262
|
|
257
263
|
|
258
264
|
|
259
265
|
|
260
|
-
|
261
266
|
def tokentest(name,lexertype,pprinter,input=File.open(name),output=$stdout)
|
262
267
|
input ||= File.open(name)
|
263
268
|
if output!=$stdout
|
@@ -277,6 +282,11 @@ def tokentest(name,lexertype,pprinter,input=File.open(name),output=$stdout)
|
|
277
282
|
pprinter.pprint(tok,output)
|
278
283
|
end until RubyLexer::EoiToken===tok
|
279
284
|
|
285
|
+
if lxr.offset_failures
|
286
|
+
first=lxr.offset_first_failure
|
287
|
+
$stderr.puts "failed to check offset in #{lxr.offset_failures} cases. first=#{first.class}: #{first.to_s.gsub("\n","\n ")} at #{first.offset}"
|
288
|
+
end
|
289
|
+
|
280
290
|
#hack for SimpleTokenPrinter....
|
281
291
|
print "\n" if RubyLexer::NewlineToken===lxr.last_operative_token and
|
282
292
|
RubyLexer::SimpleTokenPrinter===pprinter
|
data/test/data/23.rb
CHANGED
@@ -14,7 +14,6 @@ r[0]^=r[1]
|
|
14
14
|
$=||=0;$_||=0_0
|
15
15
|
i,v=['o',*local_variables].flatten.grep(/\A.\Z/).sort_by{|x|x[0].-(?j).abs.-@}.reverse,false
|
16
16
|
y=methods.grep(U).sort_by{|x|x[0].-(?e).abs.-@}
|
17
|
-
#breakpoint
|
18
17
|
i=[i,y].join(I=Love(%q=(2.**(5).chr)=)).gsub(' '){w=$=^=1;$x[w.inspect[1][0]*F-1,1]}.gsub(/[eyd]/){(?e.+?y.-($&[0])).chr }.delete('z')
|
19
18
|
i=i.scan(%r:#{I}|.*?(?=#{I})|.*:).inject([]){|r,p|unless(v=!v if(v=!v)...(v))..(v=!v):r<<[]end;r.last<<p;r};v^=!v
|
20
19
|
i,@v,@u=i.reject{true if (v=!v)..v}.join.capitalize,local_variables.inject(?X){|p,v|p^v[0]}.chr,O.constants.grep(U).pop
|
data/test/data/g.rb
CHANGED
data/test/data/heremonsters.rb
CHANGED
data/test/data/pre.rb
CHANGED
data/test/data/pre.unix.rb
CHANGED
data/test/data/regtest.rb
CHANGED
@@ -382,7 +382,6 @@ if ($Slow||=nil)
|
|
382
382
|
end
|
383
383
|
|
384
384
|
assert_eee Reg[lhqqbe+0], [ :begin, :"\\", :rand, :end ]
|
385
|
-
#breakpoint
|
386
385
|
assert_eee +[be], [:begin, :"\\", :"\\", :end]
|
387
386
|
assert_eee +[be], [:begin, :"\\", :begin, :end]
|
388
387
|
assert_eee +[be], [:begin, :"\\", :end, :end]
|
data/test/data/wsdlDriver.rb
CHANGED
data/test/test.sh
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
#!/bin/sh
|
2
2
|
# rubylexer - a ruby lexer written in ruby
|
3
|
-
# Copyright (C) 2004,2005 Caleb Clausen
|
3
|
+
# Copyright (C) 2004,2005, 2011 Caleb Clausen
|
4
4
|
#
|
5
5
|
# This library is free software; you can redistribute it and/or
|
6
6
|
# modify it under the terms of the GNU Lesser General Public
|
data/test/test_all.rb
CHANGED
@@ -0,0 +1,16 @@
|
|
1
|
+
require 'rbconfig'
|
2
|
+
conf=RbConfig::CONFIG
|
3
|
+
ruby=conf['bindir']+"/"+conf['RUBY_INSTALL_NAME']
|
4
|
+
ruby='ruby' unless File.exist? ruby
|
5
|
+
|
6
|
+
|
7
|
+
fail unless system(ruby, "-e", <<END)
|
8
|
+
begin;
|
9
|
+
require '#{File.expand_path(File.join( File.dirname(__FILE__),'bad/ruby_lexer' ))}';
|
10
|
+
require 'rubygems'
|
11
|
+
require 'rubylexer';
|
12
|
+
rl=RubyLexer.new('eval','eval');
|
13
|
+
fail if $the_wrong_rubylexer==1;
|
14
|
+
end;
|
15
|
+
END
|
16
|
+
|
data/testing.txt
CHANGED
@@ -1,21 +1,22 @@
|
|
1
1
|
Running the tests:
|
2
|
-
|
3
|
-
|
4
|
-
|
2
|
+
|
3
|
+
The simplest thing to do is run "make test". This tests the lexer with a
|
4
|
+
list of known interesting ruby expressions. It will take several minutes
|
5
|
+
to run. Currently, there are 8-11 (minor) failures, depending on ruby
|
6
|
+
version. The fact that there are a few failures is more a testament to the
|
7
|
+
thoroughness of the test suite than an indictment of the lexer. Both lexer
|
8
|
+
and test suite are very thorough, but a few more (obscure and unlikely)
|
9
|
+
expressions are supported by the latter than the former.
|
10
|
+
|
11
|
+
Most of the tests in the suite use rubylexervsruby, described below.
|
5
12
|
|
6
13
|
If you're ambitious, try this command: "ruby -Ilib test/code/locatetest.rb".
|
7
14
|
This will use locate to find as much ruby code as possible on your system and test
|
8
15
|
each specimen to see if it can be tokenized correctly (by feeding it to
|
9
|
-
|
16
|
+
test/code/rubylexervsruby.rb, the operation of which is outlined below
|
10
17
|
under 'testing strategy').
|
11
18
|
|
12
|
-
Interpreting
|
13
|
-
In rubylexervsruby, I've tried to follow the philosophy that the test program
|
14
|
-
doesn't print anything unless there's an error. Perhaps I haven't followed
|
15
|
-
this far enough; every run of rubylexervsruby produces a little output, and
|
16
|
-
sometimes a run will produce output that doesn't actually indicate a problem,
|
17
|
-
or only a low-priority problem. (Since locatetest runs rubylexervsruby over
|
18
|
-
and over, it produces lots of (mostly harmless) output. Sorry.)
|
19
|
+
Interpreting output of rubylexervsruby (and locatetest and 'make test'):
|
19
20
|
|
20
21
|
The following types of output should be ignored:
|
21
22
|
|
@@ -31,17 +32,21 @@ indicate that a warning was added or deleted. Ultimately, these should
|
|
31
32
|
go away, but right now it's a low-priority issue.
|
32
33
|
|
33
34
|
If you ever see ruby stack dump in rubylexervsruby output, that's certainly
|
34
|
-
|
35
|
+
a test failure.
|
35
36
|
|
36
37
|
Something that looks like a unidiff chunk body (not header) may indicate
|
37
|
-
an
|
38
|
+
a test failure as well. To understand more about how the unidiff output is
|
38
39
|
created, see the section on testing strategy below.
|
39
40
|
|
41
|
+
locatetest produces lots of (mostly harmless) output. Sorry.
|
42
|
+
|
40
43
|
htree/template.rb should be ok now.
|
41
44
|
|
42
|
-
currently, lots of warnings are printed about token offsets being off
|
43
|
-
|
44
|
-
ignoring it.
|
45
|
+
currently, lots of warnings are printed about token offsets being off.
|
46
|
+
(like: "failed to check offset in N cases...") This is a problem, but for
|
47
|
+
now I'm ignoring it. (Most lexer applications don't need token offsets to
|
48
|
+
be correct, and it's only a minority of cases, near here documents, where
|
49
|
+
this problem occurs.)
|
45
50
|
|
46
51
|
Diff chunks like this indicate a minor problem with the placement of (empty)
|
47
52
|
string fragments. Ignore it for now:
|
@@ -58,10 +63,26 @@ string fragments. Ignore it for now:
|
|
58
63
|
Shifting token tSTRING_DBEG ()
|
59
64
|
|
60
65
|
|
66
|
+
Diff chunks like this indicate a minor problem with the placement of newlines.
|
67
|
+
Ignore it for now:
|
68
|
+
@@ -8,3 +8,2 @@
|
69
|
+
Shifting token tSTRING_END ()
|
70
|
+
-Shifting token '\n' ()
|
71
|
+
Shifting token "end-of-input" ()
|
72
|
+
@@ -8,3 +8,2 @@
|
73
|
+
Shifting token tSTRING_END ()
|
74
|
+
-Shifting token '\n' ()
|
75
|
+
Shifting token "end-of-input" ()
|
76
|
+
|
77
|
+
|
78
|
+
There are a few other problems in the test suite as well. Current test status
|
79
|
+
is less clean than I'd like, tho the conformance level of rubylexer is still
|
80
|
+
very high.
|
61
81
|
|
62
82
|
if you find any output that doesn't look like one of the above exceptions,
|
63
|
-
|
64
|
-
to my arsenal of
|
83
|
+
(for cases that aren't in the existing snippet set) and the input file was
|
84
|
+
valid ruby, please send it to me so that i can add it to my arsenal of
|
85
|
+
tests.
|
65
86
|
|
66
87
|
there are a number of 'ruby' files that i know of out there that actually
|
67
88
|
contain syntax errors:
|
@@ -117,5 +138,4 @@ it is possible, however, that rubylexer is emitting as a single token things tha
|
|
117
138
|
thinks should be 2 tokens. and in fact, this is the case with strings: ruby divides a
|
118
139
|
string into string open, string body, and string close tokens with optional interpolations,
|
119
140
|
whereas rubylexer has just a single string token (with subtokens, if interpolations are
|
120
|
-
present.)
|
121
|
-
to correctly lex certain very complicated strings.
|
141
|
+
present.)
|