ruby_parser 3.0.0 → 3.19.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (58) hide show
  1. checksums.yaml +7 -0
  2. checksums.yaml.gz.sig +0 -0
  3. data/.autotest +36 -19
  4. data/History.rdoc +1297 -0
  5. data/Manifest.txt +35 -7
  6. data/{README.txt → README.rdoc} +44 -14
  7. data/Rakefile +308 -110
  8. data/bin/ruby_parse +3 -1
  9. data/bin/ruby_parse_extract_error +36 -16
  10. data/compare/normalize.rb +218 -0
  11. data/debugging.md +190 -0
  12. data/gauntlet.md +107 -0
  13. data/lib/.document +1 -0
  14. data/lib/rp_extensions.rb +53 -0
  15. data/lib/rp_stringscanner.rb +33 -0
  16. data/lib/ruby20_parser.rb +10973 -0
  17. data/lib/ruby20_parser.y +2683 -0
  18. data/lib/ruby21_parser.rb +10980 -0
  19. data/lib/ruby21_parser.y +2700 -0
  20. data/lib/ruby22_parser.rb +11123 -0
  21. data/lib/ruby22_parser.y +2711 -0
  22. data/lib/ruby23_parser.rb +11132 -0
  23. data/lib/ruby23_parser.y +2713 -0
  24. data/lib/ruby24_parser.rb +11231 -0
  25. data/lib/ruby24_parser.y +2721 -0
  26. data/lib/ruby25_parser.rb +11231 -0
  27. data/lib/ruby25_parser.y +2721 -0
  28. data/lib/ruby26_parser.rb +11253 -0
  29. data/lib/ruby26_parser.y +2736 -0
  30. data/lib/ruby27_parser.rb +12980 -0
  31. data/lib/ruby27_parser.y +3324 -0
  32. data/lib/ruby30_parser.rb +13242 -0
  33. data/lib/ruby30_parser.y +3447 -0
  34. data/lib/ruby31_parser.rb +13622 -0
  35. data/lib/ruby31_parser.y +3481 -0
  36. data/lib/ruby3_parser.yy +3536 -0
  37. data/lib/ruby_lexer.rb +933 -1232
  38. data/lib/ruby_lexer.rex +185 -0
  39. data/lib/ruby_lexer.rex.rb +399 -0
  40. data/lib/ruby_lexer_strings.rb +638 -0
  41. data/lib/ruby_parser.rb +97 -3
  42. data/lib/ruby_parser.yy +3465 -0
  43. data/lib/ruby_parser_extras.rb +1216 -687
  44. data/test/test_ruby_lexer.rb +2249 -1092
  45. data/test/test_ruby_parser.rb +5156 -975
  46. data/test/test_ruby_parser_extras.rb +47 -77
  47. data/tools/munge.rb +250 -0
  48. data/tools/ripper.rb +44 -0
  49. data.tar.gz.sig +1 -1
  50. metadata +200 -155
  51. metadata.gz.sig +0 -0
  52. data/.gemtest +0 -0
  53. data/History.txt +0 -482
  54. data/lib/gauntlet_rubyparser.rb +0 -120
  55. data/lib/ruby18_parser.rb +0 -5747
  56. data/lib/ruby18_parser.y +0 -1873
  57. data/lib/ruby19_parser.rb +0 -6110
  58. data/lib/ruby19_parser.y +0 -2078
@@ -1,27 +1,27 @@
1
1
  #!/usr/bin/ruby -ws
2
2
 
3
- $d ||= false
4
- $d ||= ENV["DELETE"]
5
- $t ||= false
6
- $t ||= ENV["DELETE_TIMEOUT"]
7
- $q ||= false
8
- $q ||= ENV["QUIET"]
3
+ $d ||= ENV["DELETE"] || false
4
+ $t ||= ENV["DELETE_TIMEOUT"] || false
5
+ $m ||= ENV["MOVE_TIMEOUT"] || false
6
+ $q ||= ENV["QUIET"] || false
7
+ $s ||= ENV["SPEED"] || false
9
8
 
10
9
  require 'rubygems'
11
10
  require 'ruby_parser'
11
+ require 'fileutils'
12
12
 
13
13
  ARGV.push "-" if ARGV.empty?
14
14
 
15
- class Racc::Parser
15
+ class RubyParser
16
16
  def extract_defs
17
- ss = lexer.src
17
+ ss = current.lexer.ss
18
18
 
19
19
  raise "can't access source. possible encoding issue" unless ss
20
20
 
21
21
  src = ss.string
22
22
  pre_error = src[0...ss.pos]
23
23
 
24
- defs = pre_error.grep(/^ *def/)
24
+ defs = pre_error.lines.grep(/^ *(?:def|it)/)
25
25
 
26
26
  raise "can't figure out where the bad code starts" unless defs.last
27
27
 
@@ -34,7 +34,7 @@ class Racc::Parser
34
34
 
35
35
  src = pre_error + post_error[0..idx+$&.length]
36
36
 
37
- src.scan(/^(( *)def .*?^\2end)/m)
37
+ src.scan(/^(( *)(?:def|it) .*?^\2end)/m)
38
38
  end
39
39
 
40
40
  def retest_for_errors defs
@@ -56,7 +56,7 @@ def expand path
56
56
  files << f if File.file? f
57
57
  end
58
58
 
59
- files
59
+ files.sort
60
60
  else
61
61
  Dir.glob path
62
62
  end
@@ -82,24 +82,44 @@ def process_error parser
82
82
  end
83
83
  rescue RuntimeError, Racc::ParseError => e
84
84
  warn "# process error: #{e.message.strip}"
85
+ warn "# #{e.backtrace.first}"
85
86
  end
86
87
 
87
88
  def process file
88
- ruby = file == "-" ? $stdin.read : File.read(file)
89
+ ruby = file == "-" ? $stdin.binread : File.binread(file)
90
+ time = (ENV["RP_TIMEOUT"] || 10).to_i
89
91
 
90
92
  $stderr.print "# Validating #{file}: "
91
- parser = Ruby19Parser.new
92
- parser.process(ruby, file)
93
- warn "good"
93
+ parser = RubyParser.new
94
+ t0 = Time.now if $s
95
+ parser.process(ruby, file, time)
96
+ if $s then
97
+ warn "good: #{Time.now - t0}"
98
+ else
99
+ warn "good"
100
+ end
94
101
  File.unlink file if $d
95
102
  rescue Timeout::Error
96
103
  $exit = 1
97
104
  warn "TIMEOUT parsing #{file}. Skipping."
98
- File.unlink file if $t
105
+
106
+ if $m then
107
+ base_dir, *rest = file.split("/")
108
+ base_dir.sub!(/\.slow\.?.*/, "")
109
+ base_dir += ".slow.#{time}"
110
+
111
+ new_file = File.join(base_dir, *rest)
112
+
113
+ FileUtils.mkdir_p File.dirname(new_file)
114
+ FileUtils.move file, new_file, verbose:true
115
+ elsif $t then
116
+ File.unlink file
117
+ end
99
118
  rescue StandardError, SyntaxError, Racc::ParseError => e
100
119
  $exit = 1
101
120
  warn ""
102
121
  warn "# error: #{e.message.strip}" unless $q
122
+ warn "# #{e.backtrace.first}"
103
123
  warn ""
104
124
  return if $q
105
125
 
@@ -0,0 +1,218 @@
1
+ #!/usr/bin/env ruby -w
2
+
3
+ good = false
4
+
5
+ rules = Hash.new { |h,k| h[k] = [] }
6
+ rule = nil
7
+ order = []
8
+
9
+ def munge s
10
+ renames = [
11
+ # unquote... wtf?
12
+ /`(.+?)'/, proc { $1 },
13
+ /"'(.+?)'"/, proc { "\"#{$1}\"" },
14
+
15
+ "'='", "tEQL",
16
+ "'!'", "tBANG",
17
+ "'%'", "tPERCENT",
18
+ "'&'", "tAMPER2",
19
+ "'('", "tLPAREN2",
20
+ "')'", "tRPAREN",
21
+ "'*'", "tSTAR2",
22
+ "'+'", "tPLUS",
23
+ "','", "tCOMMA",
24
+ "'-'", "tMINUS",
25
+ "'.'", "tDOT",
26
+ "'/'", "tDIVIDE",
27
+ "';'", "tSEMI",
28
+ "':'", "tCOLON",
29
+ "'<'", "tLT",
30
+ "'>'", "tGT",
31
+ "'?'", "tEH",
32
+ "'['", "tLBRACK",
33
+ "'\\n'", "tNL",
34
+ "']'", "tRBRACK",
35
+ "'^'", "tCARET",
36
+ "'`'", "tBACK_REF2",
37
+ "'{'", "tLCURLY",
38
+ "'|'", "tPIPE",
39
+ "'}'", "tRCURLY",
40
+ "'~'", "tTILDE",
41
+ '"["', "tLBRACK",
42
+
43
+ # 2.0 changes?
44
+ '"<=>"', "tCMP",
45
+ '"=="', "tEQ",
46
+ '"==="', "tEQQ",
47
+ '"!~"', "tNMATCH",
48
+ '"=~"', "tMATCH",
49
+ '">="', "tGEQ",
50
+ '"<="', "tLEQ",
51
+ '"!="', "tNEQ",
52
+ '"<<"', "tLSHFT",
53
+ '">>"', "tRSHFT",
54
+ '"*"', "tSTAR",
55
+
56
+ '".."', "tDOT2",
57
+
58
+ '"&"', "tAMPER",
59
+ '"&&"', "tANDOP",
60
+ '"&."', "tLONELY",
61
+ '"||"', "tOROP",
62
+
63
+ '"..."', "tDOT3",
64
+ '"**"', "tPOW",
65
+ '"unary+"', "tUPLUS",
66
+ '"unary-"', "tUMINUS",
67
+ '"[]"', "tAREF",
68
+ '"[]="', "tASET",
69
+ '"::"', "tCOLON2",
70
+ '"{ arg"', "tLBRACE_ARG",
71
+ '"( arg"', "tLPAREN_ARG",
72
+ '"("', "tLPAREN",
73
+ 'rparen', "tRPAREN",
74
+ '"{"', "tLBRACE",
75
+ '"=>"', "tASSOC",
76
+ '"->"', "tLAMBDA",
77
+ '":: at EXPR_BEG"', "tCOLON3",
78
+ '"**arg"', "tDSTAR",
79
+ '","', "tCOMMA",
80
+
81
+ # other
82
+
83
+ 'tLBRACK2', "tLBRACK", # HACK
84
+
85
+ "' '", "tSPACE", # needs to be later to avoid bad hits
86
+
87
+ "%empty", "none", # newer bison
88
+ "/* empty */", "none",
89
+ /^\s*$/, "none",
90
+
91
+ "keyword_BEGIN", "klBEGIN",
92
+ "keyword_END", "klEND",
93
+ /keyword_(\w+)/, proc { "k#{$1.upcase}" },
94
+ /\bk_([a-z_]+)/, proc { "k#{$1.upcase}" },
95
+ /modifier_(\w+)/, proc { "k#{$1.upcase}_MOD" },
96
+ "kVARIABLE", "keyword_variable", # ugh
97
+
98
+ # 2.6 collapses klBEGIN to kBEGIN
99
+ "klBEGIN", "kBEGIN",
100
+ "klEND", "kEND",
101
+
102
+ /keyword_(\w+)/, proc { "k#{$1.upcase}" },
103
+ /\bk_([^_][a-z_]+)/, proc { "k#{$1.upcase}" },
104
+ /modifier_(\w+)/, proc { "k#{$1.upcase}_MOD" },
105
+
106
+ "kVARIABLE", "keyword_variable", # ugh: this is a rule name
107
+
108
+ # 2.7 changes:
109
+
110
+ '"global variable"', "tGVAR",
111
+ '"operator-assignment"', "tOP_ASGN",
112
+ '"back reference"', "tBACK_REF",
113
+ '"numbered reference"', "tNTH_REF",
114
+ '"local variable or method"', "tIDENTIFIER",
115
+ '"constant"', "tCONSTANT",
116
+
117
+ '"(.."', "tBDOT2",
118
+ '"(..."', "tBDOT3",
119
+ '"char literal"', "tCHAR",
120
+ '"literal content"', "tSTRING_CONTENT",
121
+ '"string literal"', "tSTRING_BEG",
122
+ '"symbol literal"', "tSYMBEG",
123
+ '"backtick literal"', "tXSTRING_BEG",
124
+ '"regexp literal"', "tREGEXP_BEG",
125
+ '"word list"', "tWORDS_BEG",
126
+ '"verbatim word list"', "tQWORDS_BEG",
127
+ '"symbol list"', "tSYMBOLS_BEG",
128
+ '"verbatim symbol list"', "tQSYMBOLS_BEG",
129
+
130
+ '"float literal"', "tFLOAT",
131
+ '"imaginary literal"', "tIMAGINARY",
132
+ '"integer literal"', "tINTEGER",
133
+ '"rational literal"', "tRATIONAL",
134
+
135
+ '"instance variable"', "tIVAR",
136
+ '"class variable"', "tCVAR",
137
+ '"terminator"', "tSTRING_END", # TODO: switch this?
138
+ '"method"', "tFID",
139
+ '"}"', "tSTRING_DEND",
140
+
141
+ '"do for block"', "kDO_BLOCK",
142
+ '"do for condition"', "kDO_COND",
143
+ '"do for lambda"', "kDO_LAMBDA",
144
+ "tLABEL", "kLABEL",
145
+
146
+ # UGH
147
+ "k_LINE__", "k__LINE__",
148
+ "k_FILE__", "k__FILE__",
149
+ "k_ENCODING__", "k__ENCODING__",
150
+
151
+ '"defined?"', "kDEFINED",
152
+
153
+ '"do (for condition)"', "kDO_COND",
154
+ '"do (for lambda)"', "kDO_LAMBDA",
155
+ '"do (for block)"', "kDO_BLOCK",
156
+
157
+ /\"(\w+) \(?modifier\)?\"/, proc { |x| "k#{$1.upcase}_MOD" },
158
+ /\"(\w+)\"/, proc { |x| "k#{$1.upcase}" },
159
+
160
+ /\$?@(\d+)(\s+|$)/, "", # newer bison
161
+
162
+ # TODO: remove for 3.0 work:
163
+ "lex_ctxt ", "" # 3.0 production that's mostly noise right now
164
+ ]
165
+
166
+ renames.each_slice(2) do |(a, b)|
167
+ if Proc === b then
168
+ s.gsub!(a, &b)
169
+ else
170
+ s.gsub!(a, b)
171
+ end
172
+ end
173
+
174
+ s.strip
175
+ end
176
+
177
+ ARGF.each_line do |line|
178
+ next unless good or line =~ /^-* ?Grammar|\$accept : /
179
+
180
+ case line.strip
181
+ when /^$/ then
182
+ when /^(\d+) (\$?[@\w]+): (.*)/ then # yacc
183
+ rule = $2
184
+ order << rule unless rules.has_key? rule
185
+ rules[rule] << munge($3)
186
+ when /^(\d+) \s+\| (.*)/ then # yacc
187
+ rules[rule] << munge($2)
188
+ when /^(\d+) (@\d+): (.*)/ then # yacc
189
+ rule = $2
190
+ order << rule unless rules.has_key? rule
191
+ rules[rule] << munge($3)
192
+ when /^rule (\d+) (@?\w+):(.*)/ then # racc
193
+ rule = $2
194
+ order << rule unless rules.has_key? rule
195
+ rules[rule] << munge($3)
196
+ when /\$accept/ then # byacc?
197
+ good = true
198
+ when /Grammar/ then # both
199
+ good = true
200
+ when /^-+ Symbols/ then # racc
201
+ break
202
+ when /^Terminals/ then # yacc
203
+ break
204
+ when /^\cL/ then # byacc
205
+ break
206
+ else
207
+ warn "unparsed: #{$.}: #{line.strip.inspect}"
208
+ end
209
+ end
210
+
211
+ require 'yaml'
212
+
213
+ order.each do |k|
214
+ next if k =~ /@/
215
+ puts
216
+ puts "#{k}:"
217
+ puts rules[k].map { |r| " #{r}" }.join "\n"
218
+ end
data/debugging.md ADDED
@@ -0,0 +1,190 @@
1
+ # Quick Notes to Help with Debugging
2
+
3
+ ## Reducing
4
+
5
+ One of the most important steps is reducing the code sample to a
6
+ minimal reproduction. For example, one thing I'm debugging right now
7
+ was reported as:
8
+
9
+ ```ruby
10
+ a, b, c, d, e, f, g, h, i, j = 1, *[p1, p2, p3], *[p1, p2, p3], *[p4, p5, p6]
11
+ ```
12
+
13
+ This original sample has 10 items on the left-hand-side (LHS) and 1 +
14
+ 3 groups of 3 (calls) on the RHS + 3 arrays + 3 splats. That's a lot.
15
+
16
+ It's already been reported (perhaps incorrectly) that this has to do
17
+ with multiple splats on the RHS, so let's focus on that. At a minimum
18
+ the code can be reduced to 2 splats on the RHS and some
19
+ experimentation shows that it needs a non-splat item to fail:
20
+
21
+ ```
22
+ _, _, _ = 1, *[2], *[3]
23
+ ```
24
+
25
+ and some intuition further removed the arrays:
26
+
27
+ ```
28
+ _, _, _ = 1, *2, *3
29
+ ```
30
+
31
+ the difference is huge and will make a ton of difference when
32
+ debugging.
33
+
34
+ ## Getting something to compare
35
+
36
+ ```
37
+ % rake debug3 F=file.rb
38
+ ```
39
+
40
+ TODO
41
+
42
+ ## Comparing against ruby / ripper:
43
+
44
+ ```
45
+ % rake cmp3 F=file.rb
46
+ ```
47
+
48
+ This compiles the parser & lexer and then parses file.rb using
49
+ ruby, ripper, and ruby_parser in debug modes. The output is munged to
50
+ be as uniform as possible and diffable. I'm using emacs'
51
+ `ediff-files3` to compare these files (via `rake cmp3`) all at once,
52
+ but regular `diff -u tmp/{ruby,rp}` will suffice for most tasks.
53
+
54
+ From there? Good luck. I'm currently trying to backtrack from rule
55
+ reductions to state change differences. I'd like to figure out a way
56
+ to go from this sort of diff to a reasonable test that checks state
57
+ changes but I don't have that set up at this point.
58
+
59
+ ## Adding New Grammar Productions
60
+
61
+ Ruby adds stuff to the parser ALL THE TIME. It's actually hard to keep
62
+ up with, but I've added some tools and shown what a typical workflow
63
+ looks like. Let's say you want to add ruby 2.7's "beginless range" (eg
64
+ `..42`).
65
+
66
+ Whenever there's a language feature missing, I start with comparing
67
+ the parse trees between MRI and RP:
68
+
69
+ ### Structural Comparing
70
+
71
+ There's a bunch of rake tasks `compare27`, `compare26`, etc that try
72
+ to normalize and diff MRI's parse.y parse tree (just the structure of
73
+ the tree in yacc) to ruby\_parser's parse tree (racc). It's the first
74
+ thing I do when I'm adding a new version. Stub out all the version
75
+ differences, and then start to diff the structure and move
76
+ ruby\_parser towards the new changes.
77
+
78
+ Some differences are just gonna be there... but here's an example of a
79
+ real diff between MRI 2.7 and ruby_parser as of today:
80
+
81
+ ```diff
82
+ arg tDOT3 arg
83
+ arg tDOT2
84
+ arg tDOT3
85
+ - tBDOT2 arg
86
+ - tBDOT3 arg
87
+ arg tPLUS arg
88
+ arg tMINUS arg
89
+ arg tSTAR2 arg
90
+ ```
91
+
92
+ This is a new language feature that ruby_parser doesn't handle yet.
93
+ It's in MRI (the left hand side of the diff) but not ruby\_parser (the
94
+ right hand side) so it is a `-` or missing line.
95
+
96
+ Some other diffs will have both `+` and `-` lines. That usually
97
+ happens when MRI has been refactoring the grammar. Sometimes I choose
98
+ to adapt those refactorings and sometimes it starts to get too
99
+ difficult to maintain multiple versions of ruby parsing in a single
100
+ file.
101
+
102
+ But! This structural comparing is always a place you should look when
103
+ ruby_parser is failing to parse something. Maybe it just hasn't been
104
+ implemented yet and the easiest place to look is the diff.
105
+
106
+ ### Starting Test First
107
+
108
+ The next thing I do is to add a parser test to cover that feature. I
109
+ usually start with the parser and work backwards towards the lexer as
110
+ needed, as I find it structures things properly and keeps things goal
111
+ oriented.
112
+
113
+ So, make a new parser test, usually in the versioned section of the
114
+ parser tests.
115
+
116
+ ```
117
+ def test_beginless2
118
+ rb = "..10\n; ..a\n; c"
119
+ pt = s(:block,
120
+ s(:dot2, nil, s(:lit, 0).line(1)).line(1),
121
+ s(:dot2, nil, s(:call, nil, :a).line(2)).line(2),
122
+ s(:call, nil, :c).line(3)).line(1)
123
+
124
+ assert_parse_line rb, pt, 1
125
+
126
+ flunk "not done yet"
127
+ end
128
+ ```
129
+
130
+ (In this case I copied and modified the tests for open ranges from 2.6)
131
+ and run it to get my first error:
132
+
133
+ ```
134
+ % rake N=/beginless/
135
+
136
+ ...
137
+
138
+ E
139
+
140
+ Finished in 0.021814s, 45.8421 runs/s, 0.0000 assertions/s.
141
+
142
+ 1) Error:
143
+ TestRubyParserV27#test_whatevs:
144
+ Racc::ParseError: (string):1 :: parse error on value ".." (tDOT2)
145
+ GEMS/2.7.0/gems/racc-1.5.0/lib/racc/parser.rb:538:in `on_error'
146
+ WORK/ruby_parser/dev/lib/ruby_parser_extras.rb:1304:in `on_error'
147
+ (eval):3:in `_racc_do_parse_c'
148
+ (eval):3:in `do_parse'
149
+ WORK/ruby_parser/dev/lib/ruby_parser_extras.rb:1329:in `block in process'
150
+ RUBY/lib/ruby/2.7.0/timeout.rb:95:in `block in timeout'
151
+ RUBY/lib/ruby/2.7.0/timeout.rb:33:in `block in catch'
152
+ RUBY/lib/ruby/2.7.0/timeout.rb:33:in `catch'
153
+ RUBY/lib/ruby/2.7.0/timeout.rb:33:in `catch'
154
+ RUBY/lib/ruby/2.7.0/timeout.rb:110:in `timeout'
155
+ WORK/ruby_parser/dev/lib/ruby_parser_extras.rb:1317:in `process'
156
+ WORK/ruby_parser/dev/test/test_ruby_parser.rb:4198:in `assert_parse'
157
+ WORK/ruby_parser/dev/test/test_ruby_parser.rb:4221:in `assert_parse_line'
158
+ WORK/ruby_parser/dev/test/test_ruby_parser.rb:4451:in `test_whatevs'
159
+ ```
160
+
161
+ For starters, we know the missing production is for `tBDOT2 arg`. It
162
+ is currently blowing up because it is getting `tDOT2` and simply
163
+ doesn't know what to do with it, so it raises the error. As the diff
164
+ suggests, that's the wrong token to begin with, so it is probably time
165
+ to also create a lexer test:
166
+
167
+ ```
168
+ def test_yylex_bdot2
169
+ assert_lex3("..42",
170
+ s(:dot2, nil, s(:lit, 42)),
171
+
172
+ :tBDOT2, "..", EXPR_BEG,
173
+ :tINTEGER, "42", EXPR_NUM)
174
+
175
+ flunk "not done yet"
176
+ end
177
+ ```
178
+
179
+ This one is mostly speculative at this point. It says "if we're lexing
180
+ this string, we should get this sexp if we fully parse it, and the
181
+ lexical stream should look like this"... That last bit is mostly made
182
+ up at this point. Sometimes I don't know exactly what expression state
183
+ things should be in until I start really digging in.
184
+
185
+ At this point, I have 2 failing tests that are pointing me in the
186
+ right direction. It's now a matter of digging through
187
+ `compare/parse26.y` to see how the lexer differs and implementing
188
+ it...
189
+
190
+ But this is a good start to the doco for now. I'll add more later.
data/gauntlet.md ADDED
@@ -0,0 +1,107 @@
1
+ # Running the Gauntlet
2
+
3
+ ## Maintaining a Gem Mirror
4
+
5
+ I use rubygems-mirror to keep an archive of all the latest rubygems on
6
+ an external disk. Here is the config:
7
+
8
+ ```
9
+ ---
10
+ - from: https://rubygems.org
11
+ to: /Volumes/StuffA/gauntlet/mirror
12
+ parallelism: 10
13
+ retries: 3
14
+ delete: true
15
+ skiperror: true
16
+ hashdir: true
17
+ ```
18
+
19
+ And I update using rake:
20
+
21
+ ```
22
+ % cd GIT/rubygems/rubygems-mirror
23
+ % git down
24
+ % rake mirror:latest
25
+ % /Volumes/StuffA/gauntlet/bin/cleanup.rb -y -v
26
+ ```
27
+
28
+ This rather quickly updates my mirror to the latest versions of
29
+ everything and then deletes all old versions. I then run a cleanup
30
+ script that fixes the file dates to their publication date and deletes
31
+ any gems that have invalid specs. This can argue with the mirror a
32
+ bit, but it is pretty minimal (currently ~20 bad gems).
33
+
34
+ ## Curating an Archive of Ruby Files
35
+
36
+ Next, I process the gem mirror into a much more digestible structure
37
+ using `unpack_gems.rb`.
38
+
39
+ ```
40
+ % cd RP/gauntlet
41
+ % time caffeinate /Volumes/StuffA/gauntlet/bin/unpack_gems.rb -v [-a] ; say done
42
+ ... waaaait ...
43
+ % DIR=gauntlet.$(today).(all|new).noindex
44
+ % mv hashed.noindex $DIR
45
+ % tar vc -T <(fd -tf . $DIR | sort) | zstd -5 -T0 --long > archives/$DIR.tar.zst ; say done
46
+ % ./bin/sync.sh
47
+ ```
48
+
49
+ This script filters all the newer (< 1 year old) gems (unless `-a` is
50
+ used), unpacks them, finds all the files that look like they're valid
51
+ ruby, ensures they're valid ruby (using the current version of ruby to
52
+ compile them), and then moves them into a SHA dir structure that looks
53
+ something like this:
54
+
55
+ ```
56
+ hashed.noindex/a/b/c/<full_file_sha>.rb
57
+ ```
58
+
59
+ This removes all duplicates and puts everything in a fairly even,
60
+ wide, flat directory layout.
61
+
62
+ This process takes a very long time, even with a lot of
63
+ parallelization. There are currently about 160k gems in the mirror.
64
+ Unpacking, validating, SHA'ing everything is disk and CPU intensive.
65
+ The `.noindex` extension stops Spotlight from indexing the continuous
66
+ churn of files being unpacked and moved and saves time.
67
+
68
+ Finally, I rename and archive it all up (currently using zstd to
69
+ compress).
70
+
71
+ ### Stats
72
+
73
+ ```
74
+ 9696 % find gauntlet.$(today).noindex -type f | lc
75
+ 561270
76
+ 3.5G gauntlet.2021-08-06.noindex
77
+ 239M gauntlet.2021-08-06.noindex.tar.zst
78
+ ```
79
+
80
+ So I wind up with a little over half a million unique ruby files to
81
+ parse. It's about 3.5G but compresses very nicely down to 240M.
82
+
83
+ ## Running the Gauntlet
84
+
85
+ Assuming you're starting from scratch, unpack the archive once:
86
+
87
+ ```
88
+ % zstdcat gauntlet.$(today).noindex.tar.zst | tar x
89
+ ```
90
+
91
+ Then, either run a single process (easier to read):
92
+
93
+ ```
94
+ % ./gauntlet/bin/gauntlet.rb gauntlet/*.noindex/?
95
+ ```
96
+
97
+ Or max out your machine using xargs (note the `-P 16` and choose accordingly):
98
+
99
+ ```
100
+ % ls -d gauntlet/*.noindex/?/? | time xargs -n 1 -P 16 ./gauntlet/bin/gauntlet.rb
101
+ ```
102
+
103
+ In another terminal I usually monitor the progress like so:
104
+
105
+ ```
106
+ % while true ; do clear; fd . -t d -t e gauntlet/*.noindex -X rmdir -p 2> /dev/null ; for D in gauntlet/*.noindex/? ; do echo -n "$D: "; fd .rb $D | wc -l ; done ; echo ; sleep 30 ; done
107
+ ```
data/lib/.document ADDED
@@ -0,0 +1 @@
1
+ *.rb
@@ -0,0 +1,53 @@
1
+ # :stopdoc:
2
+ # WHY do I have to do this?!?
3
+ class Regexp
4
+ ONCE = 0 unless defined? ONCE # FIX: remove this - it makes no sense
5
+
6
+ unless defined? ENC_NONE then
7
+ ENC_NONE = /x/n.options
8
+ ENC_EUC = /x/e.options
9
+ ENC_SJIS = /x/s.options
10
+ ENC_UTF8 = /x/u.options
11
+ end
12
+ end
13
+ # :startdoc:
14
+
15
+ class Array
16
+ def prepend *vals
17
+ self[0,0] = vals
18
+ end
19
+ end unless [].respond_to?(:prepend)
20
+
21
+ # :stopdoc:
22
+ class Symbol
23
+ def end_with? o
24
+ self.to_s.end_with? o
25
+ end
26
+ end unless :woot.respond_to?(:end_with?)
27
+ # :startdoc:
28
+
29
+ ############################################################
30
+ # HACK HACK HACK HACK HACK HACK HACK HACK HACK HACK HACK HACK
31
+
32
+ class String
33
+ def clean_caller
34
+ self.sub(File.dirname(__FILE__), "./lib").sub(/:in.*/, "")
35
+ end if $DEBUG
36
+ end
37
+
38
+ require "sexp"
39
+
40
+ class Sexp
41
+ attr_writer :paren # TODO: retire
42
+
43
+ def paren
44
+ @paren ||= false
45
+ end
46
+
47
+ def block_pass?
48
+ any? { |s| Sexp === s && s.sexp_type == :block_pass }
49
+ end
50
+ end
51
+
52
+ # END HACK
53
+ ############################################################