rubylexer 0.7.7 → 0.8.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -0
- data/History.txt +64 -0
- data/Makefile +2 -2
- data/README.txt +13 -9
- data/bin/rubylexer +113 -0
- data/lib/assert.rb +1 -1
- data/lib/rubylexer.rb +856 -305
- data/lib/rubylexer/charhandler.rb +1 -1
- data/lib/rubylexer/charset.rb +15 -7
- data/lib/rubylexer/context.rb +10 -2
- data/lib/rubylexer/lextable.rb +1 -0
- data/lib/rubylexer/rubycode.rb +1 -1
- data/lib/rubylexer/rulexer.rb +106 -32
- data/lib/rubylexer/symboltable.rb +1 -1
- data/lib/rubylexer/test/oneliners.rb +15 -5
- data/lib/rubylexer/test/oneliners_1.9.rb +116 -92
- data/lib/rubylexer/test/stanzas.rb +49 -27
- data/lib/rubylexer/test/testcases.rb +2 -2
- data/lib/rubylexer/token.rb +153 -23
- data/lib/rubylexer/tokenprinter.rb +9 -6
- data/lib/rubylexer/version.rb +1 -1
- data/rubylexer.gemspec +12 -8
- data/test/bad/ruby_lexer.rb +7 -0
- data/test/code/deletewarns.rb +1 -1
- data/test/code/dumptokens.rb +1 -81
- data/test/code/heredoc_blast_test.rb +112 -0
- data/test/code/locatetest.rb +1 -1
- data/test/code/regression.rb +23 -23
- data/test/code/rubylexervsruby.rb +59 -12
- data/test/code/tokentest.rb +62 -52
- data/test/data/23.rb +0 -1
- data/test/data/g.rb +0 -1
- data/test/data/heremonsters.rb +1 -1
- data/test/data/heremonsters_dos.rb +1 -1
- data/test/data/pre.rb +0 -1
- data/test/data/pre.unix.rb +0 -1
- data/test/data/putstext.rb +4 -0
- data/test/data/regtest.rb +0 -1
- data/test/data/stuffydog.rb +5 -0
- data/test/data/stuffydog2.rb +5 -0
- data/test/data/wsdlDriver.rb +0 -1
- data/test/test.sh +1 -1
- data/test/test_all.rb +3 -0
- data/test/test_bad_rubylexer.rb +16 -0
- data/test/test_rubylexer_bad.rb +12 -0
- data/testing.txt +40 -20
- metadata +51 -38
data/test/code/tokentest.rb
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
#!/usr/bin/ruby
|
2
|
-
=begin
|
2
|
+
=begin legalia
|
3
3
|
rubylexer - a ruby lexer written in ruby
|
4
|
-
Copyright (C) 2004,2005,2008 Caleb Clausen
|
4
|
+
Copyright (C) 2004,2005,2008, 2011 Caleb Clausen
|
5
5
|
|
6
6
|
This library is free software; you can redistribute it and/or
|
7
7
|
modify it under the terms of the GNU Lesser General Public
|
@@ -24,7 +24,7 @@ require "pp"
|
|
24
24
|
|
25
25
|
class RubyLexer
|
26
26
|
class Token
|
27
|
-
def verify_offset(fd); false end
|
27
|
+
def verify_offset(fd,lexer); false end
|
28
28
|
|
29
29
|
def check_for_error; end
|
30
30
|
end
|
@@ -36,17 +36,17 @@ module ErrorToken
|
|
36
36
|
end
|
37
37
|
|
38
38
|
class FileAndLineToken
|
39
|
-
def verify_offset(fd); true end
|
39
|
+
def verify_offset(fd,lexer); true end
|
40
40
|
end
|
41
41
|
class ImplicitParamListStartToken
|
42
|
-
def verify_offset(fd); true end
|
42
|
+
def verify_offset(fd,lexer); true end
|
43
43
|
end
|
44
44
|
class ImplicitParamListEndToken
|
45
|
-
def verify_offset(fd); true end
|
45
|
+
def verify_offset(fd,lexer); true end
|
46
46
|
end
|
47
47
|
|
48
48
|
module SimpleVerify
|
49
|
-
def verify_offset(fd)
|
49
|
+
def verify_offset(fd,lexer)
|
50
50
|
fd.read(@ident.length)==@ident
|
51
51
|
end
|
52
52
|
end
|
@@ -57,13 +57,13 @@ class MethNameToken; include SimpleVerify; end
|
|
57
57
|
|
58
58
|
class NewlineToken
|
59
59
|
include SimpleVerify
|
60
|
-
def verify_offset(fd)
|
60
|
+
def verify_offset(fd,lexer)
|
61
61
|
super or fd.eof?
|
62
62
|
end
|
63
63
|
end
|
64
64
|
|
65
65
|
class SymbolToken
|
66
|
-
def verify_offset(fd)
|
66
|
+
def verify_offset(fd,lexer)
|
67
67
|
la=fd.read(2)
|
68
68
|
case la
|
69
69
|
when '%s'
|
@@ -98,15 +98,15 @@ end
|
|
98
98
|
|
99
99
|
class EoiToken
|
100
100
|
include SimpleVerify
|
101
|
-
def verify_offset(fd)
|
102
|
-
result=super(fd)
|
101
|
+
def verify_offset(fd,lexer)
|
102
|
+
result=super(fd,lexer)
|
103
103
|
fd.eof?
|
104
104
|
return result
|
105
105
|
end
|
106
106
|
end
|
107
107
|
|
108
108
|
class NoWsToken
|
109
|
-
def verify_offset(fd)
|
109
|
+
def verify_offset(fd,lexer)
|
110
110
|
orig=fd.pos
|
111
111
|
fd.pos=orig-1
|
112
112
|
result= (/^[^\s\v\t\n\r\f]{2}$/===fd.read(2))
|
@@ -116,13 +116,13 @@ class NoWsToken
|
|
116
116
|
end
|
117
117
|
|
118
118
|
class HereBodyToken
|
119
|
-
def verify_offset(fd)
|
120
|
-
@ident.verify_subtoken_offsets(fd)
|
119
|
+
def verify_offset(fd,lexer)
|
120
|
+
@ident.verify_subtoken_offsets(fd,lexer)
|
121
121
|
end
|
122
122
|
end
|
123
123
|
|
124
124
|
class HerePlaceholderToken
|
125
|
-
def verify_offset(fd)
|
125
|
+
def verify_offset(fd,lexer)
|
126
126
|
'<<'==fd.read(2) or return false
|
127
127
|
@dash and ('-'==fd.read(1) or return false)
|
128
128
|
case ch=fd.read(1)[0]
|
@@ -143,14 +143,14 @@ end
|
|
143
143
|
class StringToken
|
144
144
|
FANCY_QUOTE_BEGINNINGS= {'`'=>'%x', '['=>'%w', '{'=>'%W',
|
145
145
|
'"'=>/('|%[^a-pr-z0-9])/i, '/'=>'%r'}
|
146
|
-
def verify_offset(fd)
|
146
|
+
def verify_offset(fd,lexer)
|
147
147
|
fd.read(open.size)==open or return false
|
148
148
|
# str=fd.read(2)
|
149
149
|
# @char==str[0,1] or FANCY_QUOTE_BEGINNINGS[@char]===str or return false
|
150
|
-
verify_subtoken_offsets(fd)
|
150
|
+
verify_subtoken_offsets(fd,lexer)
|
151
151
|
end
|
152
152
|
|
153
|
-
def verify_subtoken_offsets(fd)
|
153
|
+
def verify_subtoken_offsets(fd,lexer)
|
154
154
|
#verify offsets of subtokens
|
155
155
|
@elems.each{|elem|
|
156
156
|
case elem
|
@@ -173,7 +173,7 @@ class StringToken
|
|
173
173
|
#assert now_at<=goal+1 #not needed
|
174
174
|
saw[goal..-1]='' unless goal==now_at
|
175
175
|
saw==elem or return false
|
176
|
-
else elem.verify_offset(fd) or raise LexerError
|
176
|
+
else elem.verify_offset(fd,lexer) or raise LexerError
|
177
177
|
end
|
178
178
|
}
|
179
179
|
return true
|
@@ -188,12 +188,12 @@ class StringToken
|
|
188
188
|
end
|
189
189
|
|
190
190
|
class RubyCode
|
191
|
-
def verify_offset(fd)
|
191
|
+
def verify_offset(fd,lexer)
|
192
192
|
thistok=nexttok=endpos=nil
|
193
193
|
@ident.each_index{ |tok_i|
|
194
194
|
thistok,nexttok=@ident[tok_i,2]
|
195
195
|
endpos=nexttok ? nexttok.offset : thistok.offset+thistok.to_s.size
|
196
|
-
check_offset(thistok,fd,endpos)
|
196
|
+
lexer.check_offset(thistok,fd,endpos)
|
197
197
|
}
|
198
198
|
assert nexttok.nil?
|
199
199
|
assert thistok.object_id==@ident.last.object_id
|
@@ -208,7 +208,7 @@ end
|
|
208
208
|
|
209
209
|
|
210
210
|
class NumberToken
|
211
|
-
def verify_offset(fd)
|
211
|
+
def verify_offset(fd,lexer)
|
212
212
|
/^[0-9?+-]$/===fd.read(1)
|
213
213
|
end
|
214
214
|
end
|
@@ -221,43 +221,48 @@ end
|
|
221
221
|
#end
|
222
222
|
end
|
223
223
|
|
224
|
-
|
225
|
-
|
226
|
-
|
227
|
-
|
228
|
-
|
229
|
-
file
|
230
|
-
|
231
|
-
|
232
|
-
|
233
|
-
|
234
|
-
|
235
|
-
|
236
|
-
|
237
|
-
|
238
|
-
|
239
|
-
|
224
|
+
class RubyLexer
|
225
|
+
public
|
226
|
+
|
227
|
+
attr_reader :offset_failures, :offset_first_failure
|
228
|
+
|
229
|
+
def check_offset(tok,file=nil,endpos=nil)
|
230
|
+
#the errors detected here are now reduced to warnings....
|
231
|
+
file||=@original_file
|
232
|
+
String===file and file=file.to_sequence
|
233
|
+
allow_ooo= @moretokens&&@moretokens[0]&&@moretokens[0].allow_ooo_offset unless endpos
|
234
|
+
endpos||=((@moretokens.empty?)? input_position : @moretokens[0].offset)
|
235
|
+
oldpos=file.pos
|
236
|
+
|
237
|
+
assert Integer===tok.offset
|
238
|
+
assert Integer===endpos
|
239
|
+
if endpos<tok.offset and !allow_ooo
|
240
|
+
$stderr.puts "expected #{endpos} to be >= #{tok.offset} token #{tok.to_s.gsub("\n","\n ")}:#{tok.class}"
|
241
|
+
end
|
240
242
|
|
241
|
-
|
242
|
-
|
243
|
-
|
244
|
-
|
245
|
-
|
246
|
-
|
247
|
-
|
248
|
-
|
249
|
-
|
250
|
-
|
251
|
-
|
243
|
+
file.pos=tok.offset
|
244
|
+
unless tok.verify_offset(file,self)
|
245
|
+
@offset_failures ||= 0
|
246
|
+
@offset_failures += 1
|
247
|
+
@offset_first_failure ||= tok
|
248
|
+
end
|
249
|
+
case tok
|
250
|
+
when RubyLexer::StringToken,RubyLexer::NumberToken,
|
251
|
+
RubyLexer::HereBodyToken,RubyLexer::SymbolToken,
|
252
|
+
RubyLexer::HerePlaceholderToken,
|
253
|
+
RubyLexer::FileAndLineToken #do nothing
|
254
|
+
else
|
255
|
+
file.pos==endpos or allow_ooo or
|
256
|
+
$stderr.puts "positions don't line up, expected #{endpos}, got #{file.pos}, token: #{tok.to_s.gsub("\n","\n ") }"
|
257
|
+
end
|
258
|
+
file.pos=oldpos
|
259
|
+
return
|
252
260
|
end
|
253
|
-
file.pos=oldpos
|
254
|
-
return
|
255
261
|
end
|
256
262
|
|
257
263
|
|
258
264
|
|
259
265
|
|
260
|
-
|
261
266
|
def tokentest(name,lexertype,pprinter,input=File.open(name),output=$stdout)
|
262
267
|
input ||= File.open(name)
|
263
268
|
if output!=$stdout
|
@@ -277,6 +282,11 @@ def tokentest(name,lexertype,pprinter,input=File.open(name),output=$stdout)
|
|
277
282
|
pprinter.pprint(tok,output)
|
278
283
|
end until RubyLexer::EoiToken===tok
|
279
284
|
|
285
|
+
if lxr.offset_failures
|
286
|
+
first=lxr.offset_first_failure
|
287
|
+
$stderr.puts "failed to check offset in #{lxr.offset_failures} cases. first=#{first.class}: #{first.to_s.gsub("\n","\n ")} at #{first.offset}"
|
288
|
+
end
|
289
|
+
|
280
290
|
#hack for SimpleTokenPrinter....
|
281
291
|
print "\n" if RubyLexer::NewlineToken===lxr.last_operative_token and
|
282
292
|
RubyLexer::SimpleTokenPrinter===pprinter
|
data/test/data/23.rb
CHANGED
@@ -14,7 +14,6 @@ r[0]^=r[1]
|
|
14
14
|
$=||=0;$_||=0_0
|
15
15
|
i,v=['o',*local_variables].flatten.grep(/\A.\Z/).sort_by{|x|x[0].-(?j).abs.-@}.reverse,false
|
16
16
|
y=methods.grep(U).sort_by{|x|x[0].-(?e).abs.-@}
|
17
|
-
#breakpoint
|
18
17
|
i=[i,y].join(I=Love(%q=(2.**(5).chr)=)).gsub(' '){w=$=^=1;$x[w.inspect[1][0]*F-1,1]}.gsub(/[eyd]/){(?e.+?y.-($&[0])).chr }.delete('z')
|
19
18
|
i=i.scan(%r:#{I}|.*?(?=#{I})|.*:).inject([]){|r,p|unless(v=!v if(v=!v)...(v))..(v=!v):r<<[]end;r.last<<p;r};v^=!v
|
20
19
|
i,@v,@u=i.reject{true if (v=!v)..v}.join.capitalize,local_variables.inject(?X){|p,v|p^v[0]}.chr,O.constants.grep(U).pop
|
data/test/data/g.rb
CHANGED
data/test/data/heremonsters.rb
CHANGED
data/test/data/pre.rb
CHANGED
data/test/data/pre.unix.rb
CHANGED
data/test/data/regtest.rb
CHANGED
@@ -382,7 +382,6 @@ if ($Slow||=nil)
|
|
382
382
|
end
|
383
383
|
|
384
384
|
assert_eee Reg[lhqqbe+0], [ :begin, :"\\", :rand, :end ]
|
385
|
-
#breakpoint
|
386
385
|
assert_eee +[be], [:begin, :"\\", :"\\", :end]
|
387
386
|
assert_eee +[be], [:begin, :"\\", :begin, :end]
|
388
387
|
assert_eee +[be], [:begin, :"\\", :end, :end]
|
data/test/data/wsdlDriver.rb
CHANGED
data/test/test.sh
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
#!/bin/sh
|
2
2
|
# rubylexer - a ruby lexer written in ruby
|
3
|
-
# Copyright (C) 2004,2005 Caleb Clausen
|
3
|
+
# Copyright (C) 2004,2005, 2011 Caleb Clausen
|
4
4
|
#
|
5
5
|
# This library is free software; you can redistribute it and/or
|
6
6
|
# modify it under the terms of the GNU Lesser General Public
|
data/test/test_all.rb
CHANGED
@@ -0,0 +1,16 @@
|
|
1
|
+
require 'rbconfig'
|
2
|
+
conf=RbConfig::CONFIG
|
3
|
+
ruby=conf['bindir']+"/"+conf['RUBY_INSTALL_NAME']
|
4
|
+
ruby='ruby' unless File.exist? ruby
|
5
|
+
|
6
|
+
|
7
|
+
fail unless system(ruby, "-e", <<END)
|
8
|
+
begin;
|
9
|
+
require '#{File.expand_path(File.join( File.dirname(__FILE__),'bad/ruby_lexer' ))}';
|
10
|
+
require 'rubygems'
|
11
|
+
require 'rubylexer';
|
12
|
+
rl=RubyLexer.new('eval','eval');
|
13
|
+
fail if $the_wrong_rubylexer==1;
|
14
|
+
end;
|
15
|
+
END
|
16
|
+
|
data/testing.txt
CHANGED
@@ -1,21 +1,22 @@
|
|
1
1
|
Running the tests:
|
2
|
-
|
3
|
-
|
4
|
-
|
2
|
+
|
3
|
+
The simplest thing to do is run "make test". This tests the lexer with a
|
4
|
+
list of known interesting ruby expressions. It will take several minutes
|
5
|
+
to run. Currently, there are 8-11 (minor) failures, depending on ruby
|
6
|
+
version. The fact that there are a few failures is more a testament to the
|
7
|
+
thoroughness of the test suite than an indictment of the lexer. Both lexer
|
8
|
+
and test suite are very thorough, but a few more (obscure and unlikely)
|
9
|
+
expressions are supported by the latter than the former.
|
10
|
+
|
11
|
+
Most of the tests in the suite use rubylexervsruby, described below.
|
5
12
|
|
6
13
|
If you're ambitious, try this command: "ruby -Ilib test/code/locatetest.rb".
|
7
14
|
This will use locate to find as much ruby code on your system and test
|
8
15
|
each specimen to see if it can be tokenized correctly (by feeding it to
|
9
|
-
|
16
|
+
test/code/rubylexervsruby.rb, the operation of which is outlined below
|
10
17
|
under 'testing strategy').
|
11
18
|
|
12
|
-
Interpreting
|
13
|
-
In rubylexervsruby, I've tried to follow the philosophy that the test program
|
14
|
-
doesn't print anything unless there's an error. Perhaps I haven't followed
|
15
|
-
this far enough; every run of rubylexervsruby produces a little output, and
|
16
|
-
sometimes a run will produce output that doesn't actually indicate a problem,
|
17
|
-
or only a low-priority problem. (Since locatetest runs rubylexervsruby over
|
18
|
-
and over, it produces lots of (mostly harmless) output. Sorry.)
|
19
|
+
Interpreting output of rubylexervsruby (and locatetest and 'make test'):
|
19
20
|
|
20
21
|
The following types of output should be ignored:
|
21
22
|
|
@@ -31,17 +32,21 @@ indicate that a warning was added or deleted. Ultimately, these should
|
|
31
32
|
go away, but right now it's a low-priority issue.
|
32
33
|
|
33
34
|
If you ever see a ruby stack dump in rubylexervsruby output, that's certainly
|
34
|
-
|
35
|
+
a test failure.
|
35
36
|
|
36
37
|
Something that looks like a unidiff chunk body (not header) may indicate
|
37
|
-
an
|
38
|
+
a test failure as well. To understand more about how the unidiff output is
|
38
39
|
created, see the section on testing strategy below.
|
39
40
|
|
41
|
+
locatetest produces lots of (mostly harmless) output. Sorry.
|
42
|
+
|
40
43
|
htree/template.rb should be ok now.
|
41
44
|
|
42
|
-
currently, lots of warnings are printed about token offsets being off
|
43
|
-
|
44
|
-
ignoring it.
|
45
|
+
currently, lots of warnings are printed about token offsets being off.
|
46
|
+
(like: "failed to check offset in N cases...") This is a problem, but for
|
47
|
+
now I'm ignoring it. (Most lexer applications don't need token offsets to
|
48
|
+
be correct, and it's only a minority of cases, near here documents, where
|
49
|
+
this problem occurs.)
|
45
50
|
|
46
51
|
Diff chunks like this indicate a minor problem with the placement of (empty)
|
47
52
|
string fragments. Ignore it for now:
|
@@ -58,10 +63,26 @@ string fragments. Ignore it for now:
|
|
58
63
|
Shifting token tSTRING_DBEG ()
|
59
64
|
|
60
65
|
|
66
|
+
Diff chunks like this indicate a minor problem with the placement of newlines.
|
67
|
+
Ignore it for now:
|
68
|
+
@@ -8,3 +8,2 @@
|
69
|
+
Shifting token tSTRING_END ()
|
70
|
+
-Shifting token '\n' ()
|
71
|
+
Shifting token "end-of-input" ()
|
72
|
+
@@ -8,3 +8,2 @@
|
73
|
+
Shifting token tSTRING_END ()
|
74
|
+
-Shifting token '\n' ()
|
75
|
+
Shifting token "end-of-input" ()
|
76
|
+
|
77
|
+
|
78
|
+
There are a few other problems in the test suite as well. Current test status
|
79
|
+
is less clean than I'd like, tho the conformance level of rubylexer is still
|
80
|
+
very high.
|
61
81
|
|
62
82
|
if you find any output that doesn't look like one of the above exceptions,
|
63
|
-
|
64
|
-
to my arsenal of
|
83
|
+
(for cases that aren't in the existing snippet set) and the input file was
|
84
|
+
valid ruby, please send it to me so that i can add it to my arsenal of
|
85
|
+
tests.
|
65
86
|
|
66
87
|
there are a number of 'ruby' files that i know of out there that actually
|
67
88
|
contain syntax errors:
|
@@ -117,5 +138,4 @@ it is possible, however, that rubylexer is emitting as a single token things tha
|
|
117
138
|
thinks should be 2 tokens. and in fact, this is the case with strings: ruby divides a
|
118
139
|
string into string open, string body, and string close tokens with option interpolations,
|
119
140
|
whereas rubylexer has just a single string token (with subtokens, if interpolations are
|
120
|
-
present.)
|
121
|
-
to correctly lex certain very complicated strings.
|
141
|
+
present.)
|