ruby_parser 3.0.0 → 3.19.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- checksums.yaml.gz.sig +0 -0
- data/.autotest +36 -19
- data/History.rdoc +1297 -0
- data/Manifest.txt +35 -7
- data/{README.txt → README.rdoc} +44 -14
- data/Rakefile +308 -110
- data/bin/ruby_parse +3 -1
- data/bin/ruby_parse_extract_error +36 -16
- data/compare/normalize.rb +218 -0
- data/debugging.md +190 -0
- data/gauntlet.md +107 -0
- data/lib/.document +1 -0
- data/lib/rp_extensions.rb +53 -0
- data/lib/rp_stringscanner.rb +33 -0
- data/lib/ruby20_parser.rb +10973 -0
- data/lib/ruby20_parser.y +2683 -0
- data/lib/ruby21_parser.rb +10980 -0
- data/lib/ruby21_parser.y +2700 -0
- data/lib/ruby22_parser.rb +11123 -0
- data/lib/ruby22_parser.y +2711 -0
- data/lib/ruby23_parser.rb +11132 -0
- data/lib/ruby23_parser.y +2713 -0
- data/lib/ruby24_parser.rb +11231 -0
- data/lib/ruby24_parser.y +2721 -0
- data/lib/ruby25_parser.rb +11231 -0
- data/lib/ruby25_parser.y +2721 -0
- data/lib/ruby26_parser.rb +11253 -0
- data/lib/ruby26_parser.y +2736 -0
- data/lib/ruby27_parser.rb +12980 -0
- data/lib/ruby27_parser.y +3324 -0
- data/lib/ruby30_parser.rb +13242 -0
- data/lib/ruby30_parser.y +3447 -0
- data/lib/ruby31_parser.rb +13622 -0
- data/lib/ruby31_parser.y +3481 -0
- data/lib/ruby3_parser.yy +3536 -0
- data/lib/ruby_lexer.rb +933 -1232
- data/lib/ruby_lexer.rex +185 -0
- data/lib/ruby_lexer.rex.rb +399 -0
- data/lib/ruby_lexer_strings.rb +638 -0
- data/lib/ruby_parser.rb +97 -3
- data/lib/ruby_parser.yy +3465 -0
- data/lib/ruby_parser_extras.rb +1216 -687
- data/test/test_ruby_lexer.rb +2249 -1092
- data/test/test_ruby_parser.rb +5156 -975
- data/test/test_ruby_parser_extras.rb +47 -77
- data/tools/munge.rb +250 -0
- data/tools/ripper.rb +44 -0
- data.tar.gz.sig +1 -1
- metadata +200 -155
- metadata.gz.sig +0 -0
- data/.gemtest +0 -0
- data/History.txt +0 -482
- data/lib/gauntlet_rubyparser.rb +0 -120
- data/lib/ruby18_parser.rb +0 -5747
- data/lib/ruby18_parser.y +0 -1873
- data/lib/ruby19_parser.rb +0 -6110
- data/lib/ruby19_parser.y +0 -2078
@@ -1,27 +1,27 @@
|
|
1
1
|
#!/usr/bin/ruby -ws
|
2
2
|
|
3
|
-
$d ||= false
|
4
|
-
$
|
5
|
-
$
|
6
|
-
$
|
7
|
-
$
|
8
|
-
$q ||= ENV["QUIET"]
|
3
|
+
$d ||= ENV["DELETE"] || false
|
4
|
+
$t ||= ENV["DELETE_TIMEOUT"] || false
|
5
|
+
$m ||= ENV["MOVE_TIMEOUT"] || false
|
6
|
+
$q ||= ENV["QUIET"] || false
|
7
|
+
$s ||= ENV["SPEED"] || false
|
9
8
|
|
10
9
|
require 'rubygems'
|
11
10
|
require 'ruby_parser'
|
11
|
+
require 'fileutils'
|
12
12
|
|
13
13
|
ARGV.push "-" if ARGV.empty?
|
14
14
|
|
15
|
-
class
|
15
|
+
class RubyParser
|
16
16
|
def extract_defs
|
17
|
-
ss = lexer.
|
17
|
+
ss = current.lexer.ss
|
18
18
|
|
19
19
|
raise "can't access source. possible encoding issue" unless ss
|
20
20
|
|
21
21
|
src = ss.string
|
22
22
|
pre_error = src[0...ss.pos]
|
23
23
|
|
24
|
-
defs = pre_error.grep(/^ *def/)
|
24
|
+
defs = pre_error.lines.grep(/^ *(?:def|it)/)
|
25
25
|
|
26
26
|
raise "can't figure out where the bad code starts" unless defs.last
|
27
27
|
|
@@ -34,7 +34,7 @@ class Racc::Parser
|
|
34
34
|
|
35
35
|
src = pre_error + post_error[0..idx+$&.length]
|
36
36
|
|
37
|
-
src.scan(/^(( *)def .*?^\2end)/m)
|
37
|
+
src.scan(/^(( *)(?:def|it) .*?^\2end)/m)
|
38
38
|
end
|
39
39
|
|
40
40
|
def retest_for_errors defs
|
@@ -56,7 +56,7 @@ def expand path
|
|
56
56
|
files << f if File.file? f
|
57
57
|
end
|
58
58
|
|
59
|
-
files
|
59
|
+
files.sort
|
60
60
|
else
|
61
61
|
Dir.glob path
|
62
62
|
end
|
@@ -82,24 +82,44 @@ def process_error parser
|
|
82
82
|
end
|
83
83
|
rescue RuntimeError, Racc::ParseError => e
|
84
84
|
warn "# process error: #{e.message.strip}"
|
85
|
+
warn "# #{e.backtrace.first}"
|
85
86
|
end
|
86
87
|
|
87
88
|
def process file
|
88
|
-
ruby = file == "-" ? $stdin.
|
89
|
+
ruby = file == "-" ? $stdin.binread : File.binread(file)
|
90
|
+
time = (ENV["RP_TIMEOUT"] || 10).to_i
|
89
91
|
|
90
92
|
$stderr.print "# Validating #{file}: "
|
91
|
-
parser =
|
92
|
-
|
93
|
-
|
93
|
+
parser = RubyParser.new
|
94
|
+
t0 = Time.now if $s
|
95
|
+
parser.process(ruby, file, time)
|
96
|
+
if $s then
|
97
|
+
warn "good: #{Time.now - t0}"
|
98
|
+
else
|
99
|
+
warn "good"
|
100
|
+
end
|
94
101
|
File.unlink file if $d
|
95
102
|
rescue Timeout::Error
|
96
103
|
$exit = 1
|
97
104
|
warn "TIMEOUT parsing #{file}. Skipping."
|
98
|
-
|
105
|
+
|
106
|
+
if $m then
|
107
|
+
base_dir, *rest = file.split("/")
|
108
|
+
base_dir.sub!(/\.slow\.?.*/, "")
|
109
|
+
base_dir += ".slow.#{time}"
|
110
|
+
|
111
|
+
new_file = File.join(base_dir, *rest)
|
112
|
+
|
113
|
+
FileUtils.mkdir_p File.dirname(new_file)
|
114
|
+
FileUtils.move file, new_file, verbose:true
|
115
|
+
elsif $t then
|
116
|
+
File.unlink file
|
117
|
+
end
|
99
118
|
rescue StandardError, SyntaxError, Racc::ParseError => e
|
100
119
|
$exit = 1
|
101
120
|
warn ""
|
102
121
|
warn "# error: #{e.message.strip}" unless $q
|
122
|
+
warn "# #{e.backtrace.first}"
|
103
123
|
warn ""
|
104
124
|
return if $q
|
105
125
|
|
@@ -0,0 +1,218 @@
|
|
1
|
+
#!/usr/bin/env ruby -w
|
2
|
+
|
3
|
+
good = false
|
4
|
+
|
5
|
+
rules = Hash.new { |h,k| h[k] = [] }
|
6
|
+
rule = nil
|
7
|
+
order = []
|
8
|
+
|
9
|
+
def munge s
|
10
|
+
renames = [
|
11
|
+
# unquote... wtf?
|
12
|
+
/`(.+?)'/, proc { $1 },
|
13
|
+
/"'(.+?)'"/, proc { "\"#{$1}\"" },
|
14
|
+
|
15
|
+
"'='", "tEQL",
|
16
|
+
"'!'", "tBANG",
|
17
|
+
"'%'", "tPERCENT",
|
18
|
+
"'&'", "tAMPER2",
|
19
|
+
"'('", "tLPAREN2",
|
20
|
+
"')'", "tRPAREN",
|
21
|
+
"'*'", "tSTAR2",
|
22
|
+
"'+'", "tPLUS",
|
23
|
+
"','", "tCOMMA",
|
24
|
+
"'-'", "tMINUS",
|
25
|
+
"'.'", "tDOT",
|
26
|
+
"'/'", "tDIVIDE",
|
27
|
+
"';'", "tSEMI",
|
28
|
+
"':'", "tCOLON",
|
29
|
+
"'<'", "tLT",
|
30
|
+
"'>'", "tGT",
|
31
|
+
"'?'", "tEH",
|
32
|
+
"'['", "tLBRACK",
|
33
|
+
"'\\n'", "tNL",
|
34
|
+
"']'", "tRBRACK",
|
35
|
+
"'^'", "tCARET",
|
36
|
+
"'`'", "tBACK_REF2",
|
37
|
+
"'{'", "tLCURLY",
|
38
|
+
"'|'", "tPIPE",
|
39
|
+
"'}'", "tRCURLY",
|
40
|
+
"'~'", "tTILDE",
|
41
|
+
'"["', "tLBRACK",
|
42
|
+
|
43
|
+
# 2.0 changes?
|
44
|
+
'"<=>"', "tCMP",
|
45
|
+
'"=="', "tEQ",
|
46
|
+
'"==="', "tEQQ",
|
47
|
+
'"!~"', "tNMATCH",
|
48
|
+
'"=~"', "tMATCH",
|
49
|
+
'">="', "tGEQ",
|
50
|
+
'"<="', "tLEQ",
|
51
|
+
'"!="', "tNEQ",
|
52
|
+
'"<<"', "tLSHFT",
|
53
|
+
'">>"', "tRSHFT",
|
54
|
+
'"*"', "tSTAR",
|
55
|
+
|
56
|
+
'".."', "tDOT2",
|
57
|
+
|
58
|
+
'"&"', "tAMPER",
|
59
|
+
'"&&"', "tANDOP",
|
60
|
+
'"&."', "tLONELY",
|
61
|
+
'"||"', "tOROP",
|
62
|
+
|
63
|
+
'"..."', "tDOT3",
|
64
|
+
'"**"', "tPOW",
|
65
|
+
'"unary+"', "tUPLUS",
|
66
|
+
'"unary-"', "tUMINUS",
|
67
|
+
'"[]"', "tAREF",
|
68
|
+
'"[]="', "tASET",
|
69
|
+
'"::"', "tCOLON2",
|
70
|
+
'"{ arg"', "tLBRACE_ARG",
|
71
|
+
'"( arg"', "tLPAREN_ARG",
|
72
|
+
'"("', "tLPAREN",
|
73
|
+
'rparen', "tRPAREN",
|
74
|
+
'"{"', "tLBRACE",
|
75
|
+
'"=>"', "tASSOC",
|
76
|
+
'"->"', "tLAMBDA",
|
77
|
+
'":: at EXPR_BEG"', "tCOLON3",
|
78
|
+
'"**arg"', "tDSTAR",
|
79
|
+
'","', "tCOMMA",
|
80
|
+
|
81
|
+
# other
|
82
|
+
|
83
|
+
'tLBRACK2', "tLBRACK", # HACK
|
84
|
+
|
85
|
+
"' '", "tSPACE", # needs to be later to avoid bad hits
|
86
|
+
|
87
|
+
"%empty", "none", # newer bison
|
88
|
+
"/* empty */", "none",
|
89
|
+
/^\s*$/, "none",
|
90
|
+
|
91
|
+
"keyword_BEGIN", "klBEGIN",
|
92
|
+
"keyword_END", "klEND",
|
93
|
+
/keyword_(\w+)/, proc { "k#{$1.upcase}" },
|
94
|
+
/\bk_([a-z_]+)/, proc { "k#{$1.upcase}" },
|
95
|
+
/modifier_(\w+)/, proc { "k#{$1.upcase}_MOD" },
|
96
|
+
"kVARIABLE", "keyword_variable", # ugh
|
97
|
+
|
98
|
+
# 2.6 collapses klBEGIN to kBEGIN
|
99
|
+
"klBEGIN", "kBEGIN",
|
100
|
+
"klEND", "kEND",
|
101
|
+
|
102
|
+
/keyword_(\w+)/, proc { "k#{$1.upcase}" },
|
103
|
+
/\bk_([^_][a-z_]+)/, proc { "k#{$1.upcase}" },
|
104
|
+
/modifier_(\w+)/, proc { "k#{$1.upcase}_MOD" },
|
105
|
+
|
106
|
+
"kVARIABLE", "keyword_variable", # ugh: this is a rule name
|
107
|
+
|
108
|
+
# 2.7 changes:
|
109
|
+
|
110
|
+
'"global variable"', "tGVAR",
|
111
|
+
'"operator-assignment"', "tOP_ASGN",
|
112
|
+
'"back reference"', "tBACK_REF",
|
113
|
+
'"numbered reference"', "tNTH_REF",
|
114
|
+
'"local variable or method"', "tIDENTIFIER",
|
115
|
+
'"constant"', "tCONSTANT",
|
116
|
+
|
117
|
+
'"(.."', "tBDOT2",
|
118
|
+
'"(..."', "tBDOT3",
|
119
|
+
'"char literal"', "tCHAR",
|
120
|
+
'"literal content"', "tSTRING_CONTENT",
|
121
|
+
'"string literal"', "tSTRING_BEG",
|
122
|
+
'"symbol literal"', "tSYMBEG",
|
123
|
+
'"backtick literal"', "tXSTRING_BEG",
|
124
|
+
'"regexp literal"', "tREGEXP_BEG",
|
125
|
+
'"word list"', "tWORDS_BEG",
|
126
|
+
'"verbatim word list"', "tQWORDS_BEG",
|
127
|
+
'"symbol list"', "tSYMBOLS_BEG",
|
128
|
+
'"verbatim symbol list"', "tQSYMBOLS_BEG",
|
129
|
+
|
130
|
+
'"float literal"', "tFLOAT",
|
131
|
+
'"imaginary literal"', "tIMAGINARY",
|
132
|
+
'"integer literal"', "tINTEGER",
|
133
|
+
'"rational literal"', "tRATIONAL",
|
134
|
+
|
135
|
+
'"instance variable"', "tIVAR",
|
136
|
+
'"class variable"', "tCVAR",
|
137
|
+
'"terminator"', "tSTRING_END", # TODO: switch this?
|
138
|
+
'"method"', "tFID",
|
139
|
+
'"}"', "tSTRING_DEND",
|
140
|
+
|
141
|
+
'"do for block"', "kDO_BLOCK",
|
142
|
+
'"do for condition"', "kDO_COND",
|
143
|
+
'"do for lambda"', "kDO_LAMBDA",
|
144
|
+
"tLABEL", "kLABEL",
|
145
|
+
|
146
|
+
# UGH
|
147
|
+
"k_LINE__", "k__LINE__",
|
148
|
+
"k_FILE__", "k__FILE__",
|
149
|
+
"k_ENCODING__", "k__ENCODING__",
|
150
|
+
|
151
|
+
'"defined?"', "kDEFINED",
|
152
|
+
|
153
|
+
'"do (for condition)"', "kDO_COND",
|
154
|
+
'"do (for lambda)"', "kDO_LAMBDA",
|
155
|
+
'"do (for block)"', "kDO_BLOCK",
|
156
|
+
|
157
|
+
/\"(\w+) \(?modifier\)?\"/, proc { |x| "k#{$1.upcase}_MOD" },
|
158
|
+
/\"(\w+)\"/, proc { |x| "k#{$1.upcase}" },
|
159
|
+
|
160
|
+
/\$?@(\d+)(\s+|$)/, "", # newer bison
|
161
|
+
|
162
|
+
# TODO: remove for 3.0 work:
|
163
|
+
"lex_ctxt ", "" # 3.0 production that's mostly noise right now
|
164
|
+
]
|
165
|
+
|
166
|
+
renames.each_slice(2) do |(a, b)|
|
167
|
+
if Proc === b then
|
168
|
+
s.gsub!(a, &b)
|
169
|
+
else
|
170
|
+
s.gsub!(a, b)
|
171
|
+
end
|
172
|
+
end
|
173
|
+
|
174
|
+
s.strip
|
175
|
+
end
|
176
|
+
|
177
|
+
ARGF.each_line do |line|
|
178
|
+
next unless good or line =~ /^-* ?Grammar|\$accept : /
|
179
|
+
|
180
|
+
case line.strip
|
181
|
+
when /^$/ then
|
182
|
+
when /^(\d+) (\$?[@\w]+): (.*)/ then # yacc
|
183
|
+
rule = $2
|
184
|
+
order << rule unless rules.has_key? rule
|
185
|
+
rules[rule] << munge($3)
|
186
|
+
when /^(\d+) \s+\| (.*)/ then # yacc
|
187
|
+
rules[rule] << munge($2)
|
188
|
+
when /^(\d+) (@\d+): (.*)/ then # yacc
|
189
|
+
rule = $2
|
190
|
+
order << rule unless rules.has_key? rule
|
191
|
+
rules[rule] << munge($3)
|
192
|
+
when /^rule (\d+) (@?\w+):(.*)/ then # racc
|
193
|
+
rule = $2
|
194
|
+
order << rule unless rules.has_key? rule
|
195
|
+
rules[rule] << munge($3)
|
196
|
+
when /\$accept/ then # byacc?
|
197
|
+
good = true
|
198
|
+
when /Grammar/ then # both
|
199
|
+
good = true
|
200
|
+
when /^-+ Symbols/ then # racc
|
201
|
+
break
|
202
|
+
when /^Terminals/ then # yacc
|
203
|
+
break
|
204
|
+
when /^\cL/ then # byacc
|
205
|
+
break
|
206
|
+
else
|
207
|
+
warn "unparsed: #{$.}: #{line.strip.inspect}"
|
208
|
+
end
|
209
|
+
end
|
210
|
+
|
211
|
+
require 'yaml'
|
212
|
+
|
213
|
+
order.each do |k|
|
214
|
+
next if k =~ /@/
|
215
|
+
puts
|
216
|
+
puts "#{k}:"
|
217
|
+
puts rules[k].map { |r| " #{r}" }.join "\n"
|
218
|
+
end
|
data/debugging.md
ADDED
@@ -0,0 +1,190 @@
|
|
1
|
+
# Quick Notes to Help with Debugging
|
2
|
+
|
3
|
+
## Reducing
|
4
|
+
|
5
|
+
One of the most important steps is reducing the code sample to a
|
6
|
+
minimal reproduction. For example, one thing I'm debugging right now
|
7
|
+
was reported as:
|
8
|
+
|
9
|
+
```ruby
|
10
|
+
a, b, c, d, e, f, g, h, i, j = 1, *[p1, p2, p3], *[p1, p2, p3], *[p4, p5, p6]
|
11
|
+
```
|
12
|
+
|
13
|
+
This original sample has 10 items on the left-hand-side (LHS) and 1 +
|
14
|
+
3 groups of 3 (calls) on the RHS + 3 arrays + 3 splats. That's a lot.
|
15
|
+
|
16
|
+
It's already been reported (perhaps incorrectly) that this has to do
|
17
|
+
with multiple splats on the RHS, so let's focus on that. At a minimum
|
18
|
+
the code can be reduced to 2 splats on the RHS and some
|
19
|
+
experimentation shows that it needs a non-splat item to fail:
|
20
|
+
|
21
|
+
```
|
22
|
+
_, _, _ = 1, *[2], *[3]
|
23
|
+
```
|
24
|
+
|
25
|
+
and some intuition further removed the arrays:
|
26
|
+
|
27
|
+
```
|
28
|
+
_, _, _ = 1, *2, *3
|
29
|
+
```
|
30
|
+
|
31
|
+
The difference is huge and will make a ton of difference when
|
32
|
+
debugging.
|
33
|
+
|
34
|
+
## Getting something to compare
|
35
|
+
|
36
|
+
```
|
37
|
+
% rake debug3 F=file.rb
|
38
|
+
```
|
39
|
+
|
40
|
+
TODO
|
41
|
+
|
42
|
+
## Comparing against ruby / ripper:
|
43
|
+
|
44
|
+
```
|
45
|
+
% rake cmp3 F=file.rb
|
46
|
+
```
|
47
|
+
|
48
|
+
This compiles the parser & lexer and then parses file.rb using
|
49
|
+
ruby, ripper, and ruby_parser in debug modes. The output is munged to
|
50
|
+
be as uniform as possible and diffable. I'm using emacs'
|
51
|
+
`ediff-files3` to compare these files (via `rake cmp3`) all at once,
|
52
|
+
but regular `diff -u tmp/{ruby,rp}` will suffice for most tasks.
|
53
|
+
|
54
|
+
From there? Good luck. I'm currently trying to backtrack from rule
|
55
|
+
reductions to state change differences. I'd like to figure out a way
|
56
|
+
to go from this sort of diff to a reasonable test that checks state
|
57
|
+
changes but I don't have that set up at this point.
|
58
|
+
|
59
|
+
## Adding New Grammar Productions
|
60
|
+
|
61
|
+
Ruby adds stuff to the parser ALL THE TIME. It's actually hard to keep
|
62
|
+
up with, but I've added some tools and shown what a typical workflow
|
63
|
+
looks like. Let's say you want to add ruby 2.7's "beginless range" (e.g.,
|
64
|
+
`..42`).
|
65
|
+
|
66
|
+
Whenever there's a language feature missing, I start with comparing
|
67
|
+
the parse trees between MRI and RP:
|
68
|
+
|
69
|
+
### Structural Comparing
|
70
|
+
|
71
|
+
There are a bunch of rake tasks (`compare27`, `compare26`, etc.) that try
|
72
|
+
to normalize and diff MRI's parse.y parse tree (just the structure of
|
73
|
+
the tree in yacc) to ruby\_parser's parse tree (racc). It's the first
|
74
|
+
thing I do when I'm adding a new version. Stub out all the version
|
75
|
+
differences, and then start to diff the structure and move
|
76
|
+
ruby\_parser towards the new changes.
|
77
|
+
|
78
|
+
Some differences are just gonna be there... but here's an example of a
|
79
|
+
real diff between MRI 2.7 and ruby_parser as of today:
|
80
|
+
|
81
|
+
```diff
|
82
|
+
arg tDOT3 arg
|
83
|
+
arg tDOT2
|
84
|
+
arg tDOT3
|
85
|
+
- tBDOT2 arg
|
86
|
+
- tBDOT3 arg
|
87
|
+
arg tPLUS arg
|
88
|
+
arg tMINUS arg
|
89
|
+
arg tSTAR2 arg
|
90
|
+
```
|
91
|
+
|
92
|
+
This is a new language feature that ruby_parser doesn't handle yet.
|
93
|
+
It's in MRI (the left hand side of the diff) but not ruby\_parser (the
|
94
|
+
right hand side) so it is a `-` or missing line.
|
95
|
+
|
96
|
+
Some other diffs will have both `+` and `-` lines. That usually
|
97
|
+
happens when MRI has been refactoring the grammar. Sometimes I choose
|
98
|
+
to adapt those refactorings and sometimes it starts to get too
|
99
|
+
difficult to maintain multiple versions of ruby parsing in a single
|
100
|
+
file.
|
101
|
+
|
102
|
+
But! This structural comparing is always a place you should look when
|
103
|
+
ruby_parser is failing to parse something. Maybe it just hasn't been
|
104
|
+
implemented yet and the easiest place to look is the diff.
|
105
|
+
|
106
|
+
### Starting Test First
|
107
|
+
|
108
|
+
The next thing I do is to add a parser test to cover that feature. I
|
109
|
+
usually start with the parser and work backwards towards the lexer as
|
110
|
+
needed, as I find it structures things properly and keeps things goal
|
111
|
+
oriented.
|
112
|
+
|
113
|
+
So, make a new parser test, usually in the versioned section of the
|
114
|
+
parser tests.
|
115
|
+
|
116
|
+
```
|
117
|
+
def test_beginless2
|
118
|
+
rb = "..10\n; ..a\n; c"
|
119
|
+
pt = s(:block,
|
120
|
+
s(:dot2, nil, s(:lit, 0).line(1)).line(1),
|
121
|
+
s(:dot2, nil, s(:call, nil, :a).line(2)).line(2),
|
122
|
+
s(:call, nil, :c).line(3)).line(1)
|
123
|
+
|
124
|
+
assert_parse_line rb, pt, 1
|
125
|
+
|
126
|
+
flunk "not done yet"
|
127
|
+
end
|
128
|
+
```
|
129
|
+
|
130
|
+
(In this case I copied and modified the tests for open ranges from 2.6)
|
131
|
+
and run it to get my first error:
|
132
|
+
|
133
|
+
```
|
134
|
+
% rake N=/beginless/
|
135
|
+
|
136
|
+
...
|
137
|
+
|
138
|
+
E
|
139
|
+
|
140
|
+
Finished in 0.021814s, 45.8421 runs/s, 0.0000 assertions/s.
|
141
|
+
|
142
|
+
1) Error:
|
143
|
+
TestRubyParserV27#test_whatevs:
|
144
|
+
Racc::ParseError: (string):1 :: parse error on value ".." (tDOT2)
|
145
|
+
GEMS/2.7.0/gems/racc-1.5.0/lib/racc/parser.rb:538:in `on_error'
|
146
|
+
WORK/ruby_parser/dev/lib/ruby_parser_extras.rb:1304:in `on_error'
|
147
|
+
(eval):3:in `_racc_do_parse_c'
|
148
|
+
(eval):3:in `do_parse'
|
149
|
+
WORK/ruby_parser/dev/lib/ruby_parser_extras.rb:1329:in `block in process'
|
150
|
+
RUBY/lib/ruby/2.7.0/timeout.rb:95:in `block in timeout'
|
151
|
+
RUBY/lib/ruby/2.7.0/timeout.rb:33:in `block in catch'
|
152
|
+
RUBY/lib/ruby/2.7.0/timeout.rb:33:in `catch'
|
153
|
+
RUBY/lib/ruby/2.7.0/timeout.rb:33:in `catch'
|
154
|
+
RUBY/lib/ruby/2.7.0/timeout.rb:110:in `timeout'
|
155
|
+
WORK/ruby_parser/dev/lib/ruby_parser_extras.rb:1317:in `process'
|
156
|
+
WORK/ruby_parser/dev/test/test_ruby_parser.rb:4198:in `assert_parse'
|
157
|
+
WORK/ruby_parser/dev/test/test_ruby_parser.rb:4221:in `assert_parse_line'
|
158
|
+
WORK/ruby_parser/dev/test/test_ruby_parser.rb:4451:in `test_whatevs'
|
159
|
+
```
|
160
|
+
|
161
|
+
For starters, we know the missing production is for `tBDOT2 arg`. It
|
162
|
+
is currently blowing up because it is getting `tDOT2` and simply
|
163
|
+
doesn't know what to do with it, so it raises the error. As the diff
|
164
|
+
suggests, that's the wrong token to begin with, so it is probably time
|
165
|
+
to also create a lexer test:
|
166
|
+
|
167
|
+
```
|
168
|
+
def test_yylex_bdot2
|
169
|
+
assert_lex3("..42",
|
170
|
+
s(:dot2, nil, s(:lit, 42)),
|
171
|
+
|
172
|
+
:tBDOT2, "..", EXPR_BEG,
|
173
|
+
:tINTEGER, "42", EXPR_NUM)
|
174
|
+
|
175
|
+
flunk "not done yet"
|
176
|
+
end
|
177
|
+
```
|
178
|
+
|
179
|
+
This one is mostly speculative at this point. It says "if we're lexing
|
180
|
+
this string, we should get this sexp if we fully parse it, and the
|
181
|
+
lexical stream should look like this"... That last bit is mostly made
|
182
|
+
up at this point. Sometimes I don't know exactly what expression state
|
183
|
+
things should be in until I start really digging in.
|
184
|
+
|
185
|
+
At this point, I have 2 failing tests that are directing me in the
|
186
|
+
right direction. It's now a matter of digging through
|
187
|
+
`compare/parse26.y` to see how the lexer differs and implementing
|
188
|
+
it...
|
189
|
+
|
190
|
+
But this is a good start to the doco for now. I'll add more later.
|
data/gauntlet.md
ADDED
@@ -0,0 +1,107 @@
|
|
1
|
+
# Running the Gauntlet
|
2
|
+
|
3
|
+
## Maintaining a Gem Mirror
|
4
|
+
|
5
|
+
I use rubygems-mirror to keep an archive of all the latest rubygems on
|
6
|
+
an external disk. Here is the config:
|
7
|
+
|
8
|
+
```
|
9
|
+
---
|
10
|
+
- from: https://rubygems.org
|
11
|
+
to: /Volumes/StuffA/gauntlet/mirror
|
12
|
+
parallelism: 10
|
13
|
+
retries: 3
|
14
|
+
delete: true
|
15
|
+
skiperror: true
|
16
|
+
hashdir: true
|
17
|
+
```
|
18
|
+
|
19
|
+
And I update using rake:
|
20
|
+
|
21
|
+
```
|
22
|
+
% cd GIT/rubygems/rubygems-mirror
|
23
|
+
% git down
|
24
|
+
% rake mirror:latest
|
25
|
+
% /Volumes/StuffA/gauntlet/bin/cleanup.rb -y -v
|
26
|
+
```
|
27
|
+
|
28
|
+
This rather quickly updates my mirror to the latest versions of
|
29
|
+
everything and then deletes all old versions. I then run a cleanup
|
30
|
+
script that fixes the file dates to their publication date and deletes
|
31
|
+
any gems that have invalid specs. This can argue with the mirror a
|
32
|
+
bit, but it is pretty minimal (currently ~20 bad gems).
|
33
|
+
|
34
|
+
## Curating an Archive of Ruby Files
|
35
|
+
|
36
|
+
Next, I process the gem mirror into a much more digestible structure
|
37
|
+
using `unpack_gems.rb`.
|
38
|
+
|
39
|
+
```
|
40
|
+
% cd RP/gauntlet
|
41
|
+
% time caffeinate /Volumes/StuffA/gauntlet/bin/unpack_gems.rb -v [-a] ; say done
|
42
|
+
... waaaait ...
|
43
|
+
% DIR=gauntlet.$(today).(all|new).noindex
|
44
|
+
% mv hashed.noindex $DIR
|
45
|
+
% tar vc -T <(fd -tf . $DIR | sort) | zstd -5 -T0 --long > archives/$DIR.tar.zst ; say done
|
46
|
+
% ./bin/sync.sh
|
47
|
+
```
|
48
|
+
|
49
|
+
This script filters all the newer (< 1 year old) gems (unless `-a` is
|
50
|
+
used), unpacks them, finds all the files that look like they're valid
|
51
|
+
ruby, ensures they're valid ruby (using the current version of ruby to
|
52
|
+
compile them), and then moves them into a SHA dir structure that looks
|
53
|
+
something like this:
|
54
|
+
|
55
|
+
```
|
56
|
+
hashed.noindex/a/b/c/<full_file_sha>.rb
|
57
|
+
```
|
58
|
+
|
59
|
+
This removes all duplicates and puts everything in a fairly even,
|
60
|
+
wide, flat directory layout.
|
61
|
+
|
62
|
+
This process takes a very long time, even with a lot of
|
63
|
+
parallelization. There are currently about 160k gems in the mirror.
|
64
|
+
Unpacking, validating, SHA'ing everything is disk and CPU intensive.
|
65
|
+
The `.noindex` extension stops Spotlight from indexing the continuous
|
66
|
+
churn of files being unpacked and moved and saves time.
|
67
|
+
|
68
|
+
Finally, I rename and archive it all up (currently using zstd to
|
69
|
+
compress).
|
70
|
+
|
71
|
+
### Stats
|
72
|
+
|
73
|
+
```
|
74
|
+
9696 % find gauntlet.$(today).noindex -type f | lc
|
75
|
+
561270
|
76
|
+
3.5G gauntlet.2021-08-06.noindex
|
77
|
+
239M gauntlet.2021-08-06.noindex.tar.zst
|
78
|
+
```
|
79
|
+
|
80
|
+
So I wind up with a little over half a million unique ruby files to
|
81
|
+
parse. It's about 3.5G but compresses very nicely down to 240M.
|
82
|
+
|
83
|
+
## Running the Gauntlet
|
84
|
+
|
85
|
+
Assuming you're starting from scratch, unpack the archive once:
|
86
|
+
|
87
|
+
```
|
88
|
+
% zstdcat gauntlet.$(today).noindex.tar.zst | tar x
|
89
|
+
```
|
90
|
+
|
91
|
+
Then, either run a single process (easier to read):
|
92
|
+
|
93
|
+
```
|
94
|
+
% ./gauntlet/bin/gauntlet.rb gauntlet/*.noindex/?
|
95
|
+
```
|
96
|
+
|
97
|
+
Or max out your machine using xargs (note the `-P 16` and choose accordingly):
|
98
|
+
|
99
|
+
```
|
100
|
+
% ls -d gauntlet/*.noindex/?/? | time xargs -n 1 -P 16 ./gauntlet/bin/gauntlet.rb
|
101
|
+
```
|
102
|
+
|
103
|
+
In another terminal I usually monitor the progress like so:
|
104
|
+
|
105
|
+
```
|
106
|
+
% while true ; do clear; fd . -t d -t e gauntlet/*.noindex -X rmdir -p 2> /dev/null ; for D in gauntlet/*.noindex/? ; do echo -n "$D: "; fd .rb $D | wc -l ; done ; echo ; sleep 30 ; done
|
107
|
+
```
|
data/lib/.document
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
*.rb
|
@@ -0,0 +1,53 @@
|
|
1
|
+
# :stopdoc:
|
2
|
+
# WHY do I have to do this?!?
|
3
|
+
class Regexp
|
4
|
+
ONCE = 0 unless defined? ONCE # FIX: remove this - it makes no sense
|
5
|
+
|
6
|
+
unless defined? ENC_NONE then
|
7
|
+
ENC_NONE = /x/n.options
|
8
|
+
ENC_EUC = /x/e.options
|
9
|
+
ENC_SJIS = /x/s.options
|
10
|
+
ENC_UTF8 = /x/u.options
|
11
|
+
end
|
12
|
+
end
|
13
|
+
# :startdoc:
|
14
|
+
|
15
|
+
class Array
|
16
|
+
def prepend *vals
|
17
|
+
self[0,0] = vals
|
18
|
+
end
|
19
|
+
end unless [].respond_to?(:prepend)
|
20
|
+
|
21
|
+
# :stopdoc:
|
22
|
+
class Symbol
|
23
|
+
def end_with? o
|
24
|
+
self.to_s.end_with? o
|
25
|
+
end
|
26
|
+
end unless :woot.respond_to?(:end_with?)
|
27
|
+
# :startdoc:
|
28
|
+
|
29
|
+
############################################################
|
30
|
+
# HACK HACK HACK HACK HACK HACK HACK HACK HACK HACK HACK HACK
|
31
|
+
|
32
|
+
class String
|
33
|
+
def clean_caller
|
34
|
+
self.sub(File.dirname(__FILE__), "./lib").sub(/:in.*/, "")
|
35
|
+
end if $DEBUG
|
36
|
+
end
|
37
|
+
|
38
|
+
require "sexp"
|
39
|
+
|
40
|
+
class Sexp
|
41
|
+
attr_writer :paren # TODO: retire
|
42
|
+
|
43
|
+
def paren
|
44
|
+
@paren ||= false
|
45
|
+
end
|
46
|
+
|
47
|
+
def block_pass?
|
48
|
+
any? { |s| Sexp === s && s.sexp_type == :block_pass }
|
49
|
+
end
|
50
|
+
end
|
51
|
+
|
52
|
+
# END HACK
|
53
|
+
############################################################
|