rubylexer 0.7.7 → 0.8.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47) hide show
  1. checksums.yaml +4 -0
  2. data/History.txt +64 -0
  3. data/Makefile +2 -2
  4. data/README.txt +13 -9
  5. data/bin/rubylexer +113 -0
  6. data/lib/assert.rb +1 -1
  7. data/lib/rubylexer.rb +856 -305
  8. data/lib/rubylexer/charhandler.rb +1 -1
  9. data/lib/rubylexer/charset.rb +15 -7
  10. data/lib/rubylexer/context.rb +10 -2
  11. data/lib/rubylexer/lextable.rb +1 -0
  12. data/lib/rubylexer/rubycode.rb +1 -1
  13. data/lib/rubylexer/rulexer.rb +106 -32
  14. data/lib/rubylexer/symboltable.rb +1 -1
  15. data/lib/rubylexer/test/oneliners.rb +15 -5
  16. data/lib/rubylexer/test/oneliners_1.9.rb +116 -92
  17. data/lib/rubylexer/test/stanzas.rb +49 -27
  18. data/lib/rubylexer/test/testcases.rb +2 -2
  19. data/lib/rubylexer/token.rb +153 -23
  20. data/lib/rubylexer/tokenprinter.rb +9 -6
  21. data/lib/rubylexer/version.rb +1 -1
  22. data/rubylexer.gemspec +12 -8
  23. data/test/bad/ruby_lexer.rb +7 -0
  24. data/test/code/deletewarns.rb +1 -1
  25. data/test/code/dumptokens.rb +1 -81
  26. data/test/code/heredoc_blast_test.rb +112 -0
  27. data/test/code/locatetest.rb +1 -1
  28. data/test/code/regression.rb +23 -23
  29. data/test/code/rubylexervsruby.rb +59 -12
  30. data/test/code/tokentest.rb +62 -52
  31. data/test/data/23.rb +0 -1
  32. data/test/data/g.rb +0 -1
  33. data/test/data/heremonsters.rb +1 -1
  34. data/test/data/heremonsters_dos.rb +1 -1
  35. data/test/data/pre.rb +0 -1
  36. data/test/data/pre.unix.rb +0 -1
  37. data/test/data/putstext.rb +4 -0
  38. data/test/data/regtest.rb +0 -1
  39. data/test/data/stuffydog.rb +5 -0
  40. data/test/data/stuffydog2.rb +5 -0
  41. data/test/data/wsdlDriver.rb +0 -1
  42. data/test/test.sh +1 -1
  43. data/test/test_all.rb +3 -0
  44. data/test/test_bad_rubylexer.rb +16 -0
  45. data/test/test_rubylexer_bad.rb +12 -0
  46. data/testing.txt +40 -20
  47. metadata +51 -38
@@ -1,7 +1,7 @@
1
1
  #!/usr/bin/ruby
2
- =begin legal crap
2
+ =begin legalia
3
3
  rubylexer - a ruby lexer written in ruby
4
- Copyright (C) 2004,2005,2008 Caleb Clausen
4
+ Copyright (C) 2004,2005,2008, 2011 Caleb Clausen
5
5
 
6
6
  This library is free software; you can redistribute it and/or
7
7
  modify it under the terms of the GNU Lesser General Public
@@ -24,7 +24,7 @@ require "pp"
24
24
 
25
25
  class RubyLexer
26
26
  class Token
27
- def verify_offset(fd); false end
27
+ def verify_offset(fd,lexer); false end
28
28
 
29
29
  def check_for_error; end
30
30
  end
@@ -36,17 +36,17 @@ module ErrorToken
36
36
  end
37
37
 
38
38
  class FileAndLineToken
39
- def verify_offset(fd); true end
39
+ def verify_offset(fd,lexer); true end
40
40
  end
41
41
  class ImplicitParamListStartToken
42
- def verify_offset(fd); true end
42
+ def verify_offset(fd,lexer); true end
43
43
  end
44
44
  class ImplicitParamListEndToken
45
- def verify_offset(fd); true end
45
+ def verify_offset(fd,lexer); true end
46
46
  end
47
47
 
48
48
  module SimpleVerify
49
- def verify_offset(fd)
49
+ def verify_offset(fd,lexer)
50
50
  fd.read(@ident.length)==@ident
51
51
  end
52
52
  end
@@ -57,13 +57,13 @@ class MethNameToken; include SimpleVerify; end
57
57
 
58
58
  class NewlineToken
59
59
  include SimpleVerify
60
- def verify_offset(fd)
60
+ def verify_offset(fd,lexer)
61
61
  super or fd.eof?
62
62
  end
63
63
  end
64
64
 
65
65
  class SymbolToken
66
- def verify_offset(fd)
66
+ def verify_offset(fd,lexer)
67
67
  la=fd.read(2)
68
68
  case la
69
69
  when '%s'
@@ -98,15 +98,15 @@ end
98
98
 
99
99
  class EoiToken
100
100
  include SimpleVerify
101
- def verify_offset(fd)
102
- result=super(fd)
101
+ def verify_offset(fd,lexer)
102
+ result=super(fd,lexer)
103
103
  fd.eof?
104
104
  return result
105
105
  end
106
106
  end
107
107
 
108
108
  class NoWsToken
109
- def verify_offset(fd)
109
+ def verify_offset(fd,lexer)
110
110
  orig=fd.pos
111
111
  fd.pos=orig-1
112
112
  result= (/^[^\s\v\t\n\r\f]{2}$/===fd.read(2))
@@ -116,13 +116,13 @@ class NoWsToken
116
116
  end
117
117
 
118
118
  class HereBodyToken
119
- def verify_offset(fd)
120
- @ident.verify_subtoken_offsets(fd)
119
+ def verify_offset(fd,lexer)
120
+ @ident.verify_subtoken_offsets(fd,lexer)
121
121
  end
122
122
  end
123
123
 
124
124
  class HerePlaceholderToken
125
- def verify_offset(fd)
125
+ def verify_offset(fd,lexer)
126
126
  '<<'==fd.read(2) or return false
127
127
  @dash and ('-'==fd.read(1) or return false)
128
128
  case ch=fd.read(1)[0]
@@ -143,14 +143,14 @@ end
143
143
  class StringToken
144
144
  FANCY_QUOTE_BEGINNINGS= {'`'=>'%x', '['=>'%w', '{'=>'%W',
145
145
  '"'=>/('|%[^a-pr-z0-9])/i, '/'=>'%r'}
146
- def verify_offset(fd)
146
+ def verify_offset(fd,lexer)
147
147
  fd.read(open.size)==open or return false
148
148
  # str=fd.read(2)
149
149
  # @char==str[0,1] or FANCY_QUOTE_BEGINNINGS[@char]===str or return false
150
- verify_subtoken_offsets(fd)
150
+ verify_subtoken_offsets(fd,lexer)
151
151
  end
152
152
 
153
- def verify_subtoken_offsets(fd)
153
+ def verify_subtoken_offsets(fd,lexer)
154
154
  #verify offsets of subtokens
155
155
  @elems.each{|elem|
156
156
  case elem
@@ -173,7 +173,7 @@ class StringToken
173
173
  #assert now_at<=goal+1 #not needed
174
174
  saw[goal..-1]='' unless goal==now_at
175
175
  saw==elem or return false
176
- else elem.verify_offset(fd) or raise LexerError
176
+ else elem.verify_offset(fd,lexer) or raise LexerError
177
177
  end
178
178
  }
179
179
  return true
@@ -188,12 +188,12 @@ class StringToken
188
188
  end
189
189
 
190
190
  class RubyCode
191
- def verify_offset(fd)
191
+ def verify_offset(fd,lexer)
192
192
  thistok=nexttok=endpos=nil
193
193
  @ident.each_index{ |tok_i|
194
194
  thistok,nexttok=@ident[tok_i,2]
195
195
  endpos=nexttok ? nexttok.offset : thistok.offset+thistok.to_s.size
196
- check_offset(thistok,fd,endpos)
196
+ lexer.check_offset(thistok,fd,endpos)
197
197
  }
198
198
  assert nexttok.nil?
199
199
  assert thistok.object_id==@ident.last.object_id
@@ -208,7 +208,7 @@ end
208
208
 
209
209
 
210
210
  class NumberToken
211
- def verify_offset(fd)
211
+ def verify_offset(fd,lexer)
212
212
  /^[0-9?+-]$/===fd.read(1)
213
213
  end
214
214
  end
@@ -221,43 +221,48 @@ end
221
221
  #end
222
222
  end
223
223
 
224
- public
225
-
226
-
227
- def check_offset(tok,file=nil,endpos=nil)
228
- #the errors detected here are now reduced to warnings....
229
- file||=@original_file
230
- String===file and file=file.to_sequence
231
- allow_ooo= @moretokens&&@moretokens[0]&&@moretokens[0].allow_ooo_offset unless endpos
232
- endpos||=((@moretokens.empty?)? input_position : @moretokens[0].offset)
233
- oldpos=file.pos
234
-
235
- assert Integer===tok.offset
236
- assert Integer===endpos
237
- if endpos<tok.offset and !allow_ooo
238
- $stderr.puts "expected #{endpos} to be >= #{tok.offset} token #{tok.to_s.gsub("\n","\n ")}:#{tok.class}"
239
- end
224
+ class RubyLexer
225
+ public
226
+
227
+ attr_reader :offset_failures, :offset_first_failure
228
+
229
+ def check_offset(tok,file=nil,endpos=nil)
230
+ #the errors detected here are now reduced to warnings....
231
+ file||=@original_file
232
+ String===file and file=file.to_sequence
233
+ allow_ooo= @moretokens&&@moretokens[0]&&@moretokens[0].allow_ooo_offset unless endpos
234
+ endpos||=((@moretokens.empty?)? input_position : @moretokens[0].offset)
235
+ oldpos=file.pos
236
+
237
+ assert Integer===tok.offset
238
+ assert Integer===endpos
239
+ if endpos<tok.offset and !allow_ooo
240
+ $stderr.puts "expected #{endpos} to be >= #{tok.offset} token #{tok.to_s.gsub("\n","\n ")}:#{tok.class}"
241
+ end
240
242
 
241
- file.pos=tok.offset
242
- tok.verify_offset(file) or
243
- $stderr.puts "couldn't check offset of token #{tok.class}: #{tok.to_s.gsub("\n","\n ")} at #{tok.offset}"
244
- case tok
245
- when RubyLexer::StringToken,RubyLexer::NumberToken,
246
- RubyLexer::HereBodyToken,RubyLexer::SymbolToken,
247
- RubyLexer::HerePlaceholderToken,
248
- RubyLexer::FileAndLineToken #do nothing
249
- else
250
- file.pos==endpos or allow_ooo or
251
- $stderr.puts "positions don't line up, expected #{endpos}, got #{file.pos}, token: #{tok.to_s.gsub("\n","\n ") }"
243
+ file.pos=tok.offset
244
+ unless tok.verify_offset(file,self)
245
+ @offset_failures ||= 0
246
+ @offset_failures += 1
247
+ @offset_first_failure ||= tok
248
+ end
249
+ case tok
250
+ when RubyLexer::StringToken,RubyLexer::NumberToken,
251
+ RubyLexer::HereBodyToken,RubyLexer::SymbolToken,
252
+ RubyLexer::HerePlaceholderToken,
253
+ RubyLexer::FileAndLineToken #do nothing
254
+ else
255
+ file.pos==endpos or allow_ooo or
256
+ $stderr.puts "positions don't line up, expected #{endpos}, got #{file.pos}, token: #{tok.to_s.gsub("\n","\n ") }"
257
+ end
258
+ file.pos=oldpos
259
+ return
252
260
  end
253
- file.pos=oldpos
254
- return
255
261
  end
256
262
 
257
263
 
258
264
 
259
265
 
260
-
261
266
  def tokentest(name,lexertype,pprinter,input=File.open(name),output=$stdout)
262
267
  input ||= File.open(name)
263
268
  if output!=$stdout
@@ -277,6 +282,11 @@ def tokentest(name,lexertype,pprinter,input=File.open(name),output=$stdout)
277
282
  pprinter.pprint(tok,output)
278
283
  end until RubyLexer::EoiToken===tok
279
284
 
285
+ if lxr.offset_failures
286
+ first=lxr.offset_first_failure
287
+ $stderr.puts "failed to check offset in #{lxr.offset_failures} cases. first=#{first.class}: #{first.to_s.gsub("\n","\n ")} at #{first.offset}"
288
+ end
289
+
280
290
  #hack for SimpleTokenPrinter....
281
291
  print "\n" if RubyLexer::NewlineToken===lxr.last_operative_token and
282
292
  RubyLexer::SimpleTokenPrinter===pprinter
@@ -14,7 +14,6 @@ r[0]^=r[1]
14
14
  $=||=0;$_||=0_0
15
15
  i,v=['o',*local_variables].flatten.grep(/\A.\Z/).sort_by{|x|x[0].-(?j).abs.-@}.reverse,false
16
16
  y=methods.grep(U).sort_by{|x|x[0].-(?e).abs.-@}
17
- #breakpoint
18
17
  i=[i,y].join(I=Love(%q=(2.**(5).chr)=)).gsub(' '){w=$=^=1;$x[w.inspect[1][0]*F-1,1]}.gsub(/[eyd]/){(?e.+?y.-($&[0])).chr }.delete('z')
19
18
  i=i.scan(%r:#{I}|.*?(?=#{I})|.*:).inject([]){|r,p|unless(v=!v if(v=!v)...(v))..(v=!v):r<<[]end;r.last<<p;r};v^=!v
20
19
  i,@v,@u=i.reject{true if (v=!v)..v}.join.capitalize,local_variables.inject(?X){|p,v|p^v[0]}.chr,O.constants.grep(U).pop
@@ -1,6 +1,5 @@
1
1
  j=9;def i(n) [n ?"d" : "e" , n] end
2
2
 
3
- #breakpoint
4
3
  p(i ?")
5
4
  p(j ?"d" : "e")
6
5
 
@@ -70,7 +70,7 @@ baz
70
70
  simple2
71
71
 
72
72
  p <<oof+"gfert"
73
- #{gleeble #breakpoint
73
+ #{gleeble
74
74
  }
75
75
  oof
76
76
 
@@ -70,7 +70,7 @@ baz
70
70
  simple2
71
71
 
72
72
  p <<oof+"gfert"
73
- #{gleeble #breakpoint
73
+ #{gleeble
74
74
  }
75
75
  oof
76
76
 
@@ -16,7 +16,6 @@ delete( %%% <<'>>
16
16
  ):'
17
17
  ) ) : ( (
18
18
  [ [ # ] ]
19
- #breakpoint
20
19
  sub( %r
21
20
  ^ #{ %q
22
21
  (
@@ -16,7 +16,6 @@ delete( %%% <<'>>
16
16
  ):'
17
17
  ) ) : ( (
18
18
  [ [ # ] ]
19
- #breakpoint
20
19
  sub( %r
21
20
  ^ #{ %q
22
21
  (
@@ -0,0 +1,4 @@
1
+ # encoding: utf-8
2
+ #àè
3
+ puts "text"
4
+ # '
@@ -382,7 +382,6 @@ if ($Slow||=nil)
382
382
  end
383
383
 
384
384
  assert_eee Reg[lhqqbe+0], [ :begin, :"\\", :rand, :end ]
385
- #breakpoint
386
385
  assert_eee +[be], [:begin, :"\\", :"\\", :end]
387
386
  assert_eee +[be], [:begin, :"\\", :begin, :end]
388
387
  assert_eee +[be], [:begin, :"\\", :end, :end]
@@ -0,0 +1,5 @@
1
+ p <<stuff+'foobar'.tr('j-l','d-f')\
2
+ +"more stuff"
3
+ 12345678
4
+ the quick brown fox jumped over the lazy dog
5
+ stuff
@@ -0,0 +1,5 @@
1
+ p <<stuff+'foobar'.tr('j-l','d-f')
2
+ +"more stuff"
3
+ 12345678
4
+ the quick brown fox jumped over the lazy dog
5
+ stuff
@@ -229,7 +229,6 @@ class WSDLDriver
229
229
  Mapping.fault2exception(e)
230
230
  end
231
231
 
232
- #breakpoint
233
232
  ret = res_body.response ?
234
233
  Mapping.soap2obj(res_body.response, @mapping_registry) : nil
235
234
 
@@ -1,6 +1,6 @@
1
1
  #!/bin/sh
2
2
  # rubylexer - a ruby lexer written in ruby
3
- # Copyright (C) 2004,2005 Caleb Clausen
3
+ # Copyright (C) 2004,2005, 2011 Caleb Clausen
4
4
  #
5
5
  # This library is free software; you can redistribute it and/or
6
6
  # modify it under the terms of the GNU Lesser General Public
@@ -1,2 +1,5 @@
1
+ $:<<File.expand_path(File.dirname(File.dirname(__FILE__)))
1
2
  require 'test/code/regression'
2
3
  require 'test/code/test_1.9'
4
+
5
+ Dir['test/test_*.rb'].each{|test| require test }
@@ -0,0 +1,16 @@
1
+ require 'rbconfig'
2
+ conf=RbConfig::CONFIG
3
+ ruby=conf['bindir']+"/"+conf['RUBY_INSTALL_NAME']
4
+ ruby='ruby' unless File.exist? ruby
5
+
6
+
7
+ fail unless system(ruby, "-e", <<END)
8
+ begin;
9
+ require '#{File.expand_path(File.join( File.dirname(__FILE__),'bad/ruby_lexer' ))}';
10
+ require 'rubygems'
11
+ require 'rubylexer';
12
+ rl=RubyLexer.new('eval','eval');
13
+ fail if $the_wrong_rubylexer==1;
14
+ end;
15
+ END
16
+
@@ -0,0 +1,12 @@
1
+ alias orig_warn warn
2
+ def warn x; end;
3
+
4
+ begin;
5
+ require 'rubylexer';
6
+ require 'test/bad/ruby_lexer';
7
+ rl=RubyLexer.new('eval','eval');
8
+ rescue Exception;
9
+ else fail if $the_wrong_rubylexer
10
+ end
11
+
12
+ alias warn orig_warn
@@ -1,21 +1,22 @@
1
1
  Running the tests:
2
- The simplest thing to do is run "ruby -Ilib test/code/regression.rb". This
3
- tests against a list of known ruby expressions. It will take several minutes
4
- to run. Currently, there are 4 (minor) failures.
2
+
3
+ The simplest thing to do is run "make test". This tests the lexer with a
4
+ list of known interesting ruby expressions. It will take several minutes
5
+ to run. Currently, there are 8-11 (minor) failures, depending on ruby
6
+ version. The fact that there are a few failures is more a testament to the
7
+ thoroughness of the test suite than an indictment of the lexer. Both lexer
8
+ and test suite are very thorough, but a few more (obscure and unlikely)
9
+ expressions are supported by the latter than the former.
10
+
11
+ Most of the tests in the suite use rubylexervsruby, described below.
5
12
 
6
13
  If you're ambitious, try this command: "ruby -Ilib test/code/locatetest.rb".
7
14
  This will use locate to find as much ruby code on your system and test
8
15
  each specimen to see if it can be tokenized correctly (by feeding it to
9
- testcode/rubylexervsruby.rb, the operation of which is outlined below
16
+ test/code/rubylexervsruby.rb, the operation of which is outlined below
10
17
  under 'testing strategy').
11
18
 
12
- Interpreting the output of rubylexervsruby.rb (and locatetest):
13
- In rubylexervsruby, I've tried to follow the philosophy that the test program
14
- doesn't print anything unless there's an error. Perhaps I haven't followed
15
- this far enough; every run of rubylexervsruby produces a little output, and
16
- sometimes a run will produce output that doesn't actually indicate a problem,
17
- or only a low-priority problem. (Since locatetest runs rubylexervsruby over
18
- and over, it produces lots of (mostly harmless) output. Sorry.)
19
+ Interpreting output of rubylexervsruby (and locatetest and 'make test'):
19
20
 
20
21
  The following types of output should be ignored:
21
22
 
@@ -31,17 +32,21 @@ indicate that a warning was added or deleted. Ultimately, these should
31
32
  go away, but right now it's a low-priority issue.
32
33
 
33
34
  If you ever see ruby stack dump in rubylexervsruby output, that's certainly
34
- an error.
35
+ a test failure.
35
36
 
36
37
  Something that looks like a unidiff chunk body (not header) may indicate
37
- an error as well. To understand more about how the unidiff output is
38
+ a test failure as well. To understand more about how the unidiff output is
38
39
  created, see the section on testing strategy below.
39
40
 
41
+ locatetest produces lots of (mostly harmless) output. Sorry.
42
+
40
43
  htree/template.rb should be ok now.
41
44
 
42
- currently, lots of warnings are printed about token offsets being off by 1,
43
- particularly the AssignmentRhsListToken. This is a problem, but for now I'm
44
- ignoring it.
45
+ currently, lots of warnings are printed about token offsets being off.
46
+ (like: "failed to check offset in N cases...") This is a problem, but for
47
+ now I'm ignoring it. (Most lexer applications don't need token offsets to
48
+ be correct, and it's only a minority of cases, near here documents, where
49
+ this problem occurs.)
45
50
 
46
51
  Diff chunks like this indicate a minor problem with the placement of (empty)
47
52
  string fragments. Ignore it for now:
@@ -58,10 +63,26 @@ string fragments. Ignore it for now:
58
63
  Shifting token tSTRING_DBEG ()
59
64
 
60
65
 
66
+ Diff chunks like this indicate a minor problem with the placement of newlines.
67
+ Ignore it for now:
68
+ @@ -8,3 +8,2 @@
69
+ Shifting token tSTRING_END ()
70
+ -Shifting token '\n' ()
71
+ Shifting token "end-of-input" ()
72
+ @@ -8,3 +8,2 @@
73
+ Shifting token tSTRING_END ()
74
+ -Shifting token '\n' ()
75
+ Shifting token "end-of-input" ()
76
+
77
+
78
+ There are a few other problems in the test suite as well. Current test status
79
+ is less clean than I'd like, tho the conformance level of rubylexer is still
80
+ very high.
61
81
 
62
82
  if you find any output that doesn't look like one of the above exceptions,
63
- and the input file was valid ruby, please send it to me so that i can add it
64
- to my arsenal of tests.
83
+ (for cases that aren't in the existing snippet set) and the input file was
84
+ valid ruby, please send it to me so that i can add it to my arsenal of
85
+ tests.
65
86
 
66
87
  there are a number of 'ruby' files that i know of out there that actually
67
88
  contain syntax errors:
@@ -117,5 +138,4 @@ it is possible, however, that rubylexer is emitting as a single token things tha
117
138
  thinks should be 2 tokens. and in fact, this is the case with strings: ruby divides a
118
139
  string into string open, string body, and string close tokens with option interpolations,
119
140
  whereas rubylexer has just a single string token (with subtokens, if interpolations are
120
- present.) this difference in handling accounts in part for rubylexer's inability
121
- to correctly lex certain very complicated strings.
141
+ present.)