rabbit-slide-kou-rubykaigi-2019 2019.4.19.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: f4d26666f0ff8c0fbd0ea010f55853e41383a03bda2d9c8909d6eabd80d19c77
4
+ data.tar.gz: eaf69441d728a122d8f937610381d126919023e05c9fe3010f061e2d8bbbd851
5
+ SHA512:
6
+ metadata.gz: a8665f7ce21d4bd9bc6790a84f46505f0913ad919aba46a2a4651672216c56a855b9e486d99ddaeceea30646772e6f6e07ae276685d15d16a4936777c15bec51
7
+ data.tar.gz: f7f3bf3aaaee69d9857c4070575dfc6d0eb5cc96a52d42887a80aa0fcb8cb9e5e752984eddd9a8dadb8704e0c3243221fb70d5aba4a86f919950a974339d02f7
data/.rabbit ADDED
@@ -0,0 +1 @@
1
+ --size=800,450 csv.rab
data/README.rd ADDED
@@ -0,0 +1,52 @@
1
+ = Better CSV processing with Ruby 2.6
2
+
3
+ csv, one of the standard libraries, in Ruby 2.6 has many improvements:
4
+
5
+ * Default gemified
6
+ * Faster CSV parsing
7
+ * Faster CSV writing
8
+ * Clean new CSV parser implementation for further improvements
9
+ * Reconstructed test suites for further improvements
10
+ * Benchmark suites for further performance improvements
11
+
12
+ These improvements are done without breaking backward compatibility.
13
+
14
+ This talk describes details of these improvements by a new csv maintainer.
15
+
16
+ == License
17
+
18
+ === Slide
19
+
20
+ CC BY-SA 4.0
21
+
22
+ Use the following for notation of the author:
23
+
24
+ * Kouhei Sutou
25
+
26
+ ==== ClearCode Inc. logo
27
+
28
+ CC BY-SA 4.0
29
+
30
+ Author: ClearCode Inc.
31
+
32
+ It is used in the page header and on some pages of the slide.
33
+
34
+ == For author
35
+
36
+ === Show
37
+
38
+ rake
39
+
40
+ === Publish
41
+
42
+ rake publish
43
+
44
+ == For viewers
45
+
46
+ === Install
47
+
48
+ gem install rabbit-slide-kou-rubykaigi-2019
49
+
50
+ === Show
51
+
52
+ rabbit rabbit-slide-kou-rubykaigi-2019.gem
data/Rakefile ADDED
@@ -0,0 +1,32 @@
1
+ require "rabbit/task/slide"
2
+
3
+ # Edit ./config.yaml to customize meta data
4
+
5
+ spec = nil
6
+ Rabbit::Task::Slide.new do |task|
7
+ spec = task.spec
8
+ # spec.files += Dir.glob("doc/**/*.*")
9
+ # spec.files -= Dir.glob("private/**/*.*")
10
+ spec.add_runtime_dependency("rabbit-theme-clear-code")
11
+ end
12
+
13
+ desc "Tag #{spec.version}"
14
+ task :tag do
15
+ sh("git", "tag", "-a", spec.version.to_s, "-m", "Publish #{spec.version}")
16
+ sh("git", "push", "--tags")
17
+ end
18
+
19
+ desc "Run benchmark"
20
+ task :benchmark do
21
+ env = {
22
+ "RUBYLIB" => nil,
23
+ "BUNDLER_ORIG_RUBYLIB" => nil,
24
+ }
25
+ sh(env,
26
+ FileUtils::RUBY,
27
+ "-S",
28
+ "benchmark-driver",
29
+ "--repeat-count", "1",
30
+ "--runner", "time",
31
+ "benchmark.yaml")
32
+ end
data/config.yaml ADDED
@@ -0,0 +1,23 @@
1
+ ---
2
+ id: rubykaigi-2019
3
+ base_name: csv
4
+ tags:
5
+ - rabbit
6
+ - csv
7
+ - apachearrow
8
+ presentation_date: 2019-04-19
9
+ version: 2019.4.19.0
10
+ licenses:
11
+ - CC-BY-SA-4.0
12
+ slideshare_id:
13
+ speaker_deck_id:
14
+ ustream_id:
15
+ vimeo_id:
16
+ youtube_id:
17
+ author:
18
+ markup_language: :rd
19
+ name: Kouhei Sutou
20
+ email: kou@clear-code.com
21
+ rubygems_user: kou
22
+ slideshare_user: kou
23
+ speaker_deck_user:
data/csv.rab ADDED
@@ -0,0 +1,392 @@
1
+ = Better CSV processing\nwith Ruby 2.6
2
+
3
+ : author
4
+ Kouhei Sutou
5
+ : institution
6
+ ClearCode Inc.
7
+ : content-source
8
+ RubyKaigi 2019
9
+ : date
10
+ 2019-04-19
11
+ : start-time
12
+ 2019-04-19T11:20:00+09:00
13
+ : end-time
14
+ 2019-04-19T12:00:00+09:00
15
+ : theme
16
+ .
17
+
18
+ = Ad: Silver sponsor
19
+
20
+ # img
21
+ # src = images/clear-code-rubykaigi-2019-silver-sponsor.png
22
+ # relative_height = 100
23
+ # reflect_ratio = 0.1
24
+
25
+ == Slide properties
26
+
27
+ : enable-title-on-image
28
+ false
29
+
30
+ = Me(('note:(自己紹介)'))
31
+
32
+ * The president of ClearCode Inc.\n
33
+ (('note:クリアコードの社長'))
34
+ * A new maintainer of the (({csv})) library\n
35
+ (('note:(({csv}))ライブラリーの新メンテナー'))
36
+ * The founder of ((<Red Data Tools|URL:https://red-data-tools.github.io/>)) project:\n
37
+ (('note:Red Data Toolsプロジェクトの立ち上げ人'))
38
+ * Provides data processing tools for Ruby\n
39
+ (('note:Ruby用のデータ処理ツールを提供するプロジェクト'))
40
+
41
+ = Kazuma Furuhashi
42
+
43
+ * ...
44
+
45
+ = CSV in Ruby 2.6 (1)\n(('note:Ruby 2.6のCSV (1)'))
46
+
47
+ Faster CSV parsing\n
48
+ (('note:CSVパースの高速化'))
49
+
50
+ = Unquoted CSV\n(('note:クォートなしのCSV'))
51
+
52
+ AAAAA,AAAAA,AAAAA
53
+ ...
54
+
55
+ # RT
56
+
57
+ 2.5, 2.6, Faster?
58
+
59
+ 432.0i/s, 764.9i/s, 1.77x
60
+
61
+ = Quoted CSV\n(('note:クォートありのCSV'))
62
+
63
+ "AAAAA","AAAAA","AAAAA"
64
+ ...
65
+
66
+ # RT
67
+
68
+ 2.5, 2.6, Faster?
69
+
70
+ 274.1i/s, 534.5i/s, 1.95x
71
+
72
+ = Quoted separator CSV (1)\n(('note:区切り文字をクォートしているCSV (1)'))
73
+
74
+ ",AAAAA",",AAAAA",",AAAAA"
75
+ ...
76
+
77
+ # RT
78
+
79
+ 2.5, 2.6, Faster?
80
+
81
+ 211.0i/s, 330.0i/s, 1.56x
82
+
83
+ = Quoted separator CSV (2)\n(('note:区切り文字をクォートしているCSV (2)'))
84
+
85
+ "AAAAA\r\n","AAAAA\r\n","AAAAA\r\n"
86
+ ...
87
+
88
+ # RT
89
+
90
+ 2.5, 2.6, Faster?
91
+
92
+ 118.7i/s, 325.6i/s, 2.74x
93
+
94
+ = Quoted CSVs\n(('note:クォートありのCSV'))
95
+
96
+ # RT
97
+
98
+ , 2.5, 2.6
99
+
100
+ Just quoted, 274.1i/s, 534.5i/s
101
+ Include sep1, 211.0i/s, 330.0i/s
102
+ Include sep2, 118.7i/s, 325.6i/s
103
+ (Note), (Slow down), (Still fast)
104
+
105
+ = Multibyte CSV\n(('note:マルチバイトのCSV'))
106
+
107
+ あああああ,あああああ,あああああ
108
+ ...
109
+
110
+ # RT
111
+
112
+ 2.5, 2.6, Faster?
113
+
114
+ 371.2i/s, 626.6i/s, 1.69x
115
+
116
+ = CSV in Ruby 2.6 (2)\n(('note:Ruby 2.6のCSV (2)'))
117
+
118
+ Faster CSV writing\n
119
+ (('note:CSV書き出しの高速化'))
120
+
121
+ = (({CSV.generate_line}))
122
+
123
+ # rouge ruby
124
+ fields = ["AAAAA"] * n_columns
125
+ n_rows.times do
126
+ CSV.generate_line(fields)
127
+ end
128
+
129
+ # RT
130
+
131
+ 2.5, 2.6, Faster?
132
+
133
+ 284.4i/s, 684.2i/s, 2.41x
134
+
135
+ = (({CSV#<<}))
136
+
137
+ # rouge ruby
138
+
139
+ output = StringIO.new
140
+ csv = CSV.new(output)
141
+ n_rows.times {csv << fields}
142
+
143
+ # RT
144
+
145
+ 2.5, 2.6, Faster?
146
+
147
+ 2891.4i/s, 4824.1i/s, 1.67x
148
+
149
+ = (({CSV.generate_line})) vs. (({CSV#<<}))
150
+
151
+ # RT
152
+
153
+ , 2.5, 2.6
154
+
155
+ (({generate_\nline})), 284.4i/s, 684.2i/s
156
+ (({<<})), 2891.4i/s, 4824.1i/s
157
+
158
+ (('tag:center'))
159
+ ((*Use (({<<})) for multiple writes*))\n
160
+ (('note:((*複数行書き出すときは(({<<}))を使うこと*))'))
161
+
162
+ = CSV in Ruby 2.6 (3)\n(('note:Ruby 2.6のCSV (3)'))
163
+
164
+ New CSV parser\n
165
+ (('tag:small:for'))\n
166
+ further improvements\n
167
+ (('note:さらなる改良のための新しいCSVパーサー'))
168
+
169
+ = Benchmark with KEN_ALL.CSV\n(('note:KEN_ALL.CSVでのベンチマーク'))
170
+
171
+ 01101,"060 ","0600000","ホッカイドウ","サッポロシチュウオウク",...
172
+ ...(124257 lines)...
173
+ 47382,"90718","9071801","オキナワケン","ヤエヤマグンヨナグニチョウ",...
174
+
175
+ (('tag:center'))
176
+ Zip code data in Japan\n
177
+ (('note:日本の郵便番号データ'))
178
+
179
+ (('tag:center'))
180
+ (('tag:small'))
181
+ ((<URL:https://www.post.japanpost.jp/zipcode/download.html>))
182
+
183
+ = KEN_ALL.CSV statistics\n(('note:KEN_ALL.CSVの統計情報'))
184
+
185
+ # RT
186
+
187
+ Size(('note:(サイズ)')), 11.7MiB
188
+ (('#')) of columns(('note:(列数)')), 15
189
+ (('#')) of rows(('note:(行数)')), 124259
190
+ Encoding(('note:(エンコーディング)')), CP932
191
+
192
+ = Parsing KEN_ALL.CSV\n(('note:KEN_ALL.CSVのパース'))
193
+
194
+ # rouge ruby
195
+ CSV.foreach("KEN_ALL.CSV",
196
+ "r:cp932") do |row|
197
+ end
198
+
199
+ # RT
200
+
201
+ 2.5, 2.6, Faster?
202
+
203
+ 1.17s, 0.79s, 1.48x
204
+
205
+ = Fastest parsing in pure Ruby\n(('note:Ruby実装での最速のパース方法'))
206
+
207
+ # rouge ruby
208
+ input.each_line(chomp: true) do |line|
209
+ line.split(",", -1) do |row|
210
+ end
211
+ end
212
+
213
+ (('tag:center'))
214
+ Limitation: No quote\n
215
+ (('note:制限:クォートがないこと'))
216
+
217
+ = KEN_ALL.CSV without quote\n(('note:クォートなしのKEN_ALL.CSV'))
218
+
219
+ 01101,060 ,0600000,ホッカイドウ,サッポロシチュウオウク,...
220
+ ...(124257 lines)...
221
+ 47382,90718,9071801,オキナワケン,ヤエヤマグンヨナグニチョウ,...
222
+
223
+ = Optimized no quote CSV parsing\n(('note:最適化したクォートなしCSVのパース方法'))
224
+
225
+ # rouge ruby
226
+ CSV.foreach("KEN_ALL_NO_QUOTE.CSV",
227
+ "r:cp932",
228
+ quote_char: nil) {|row|}
229
+
230
+ # RT
231
+
232
+ split, 2.6, Faster?
233
+
234
+ 0.32s, 0.37s, 0.86x\n(('note:(almost the same/同等)'))
235
+
236
+ = Summary: Performance\n(('note:まとめ:性能'))
237
+
238
+ * Parsing: 1.5x-3x faster\n
239
+ (('note:パース:1.5x-3x高速'))
240
+ * Up to the "split" level by using an option\n
241
+ (('note:オプションを指定すると最大で「split」レベルまで高速化可能'))
242
+ * Writing: 1.5x-2.5x faster\n
243
+ (('note:書き出し:1.5x-2.5x高速'))
244
+ * Use (({CSV#<<})) rather than (({CSV.generate_line}))\n
245
+ (('note:(({CSV.generate_line}))よりも(({CSV#<<}))を使うこと'))
246
+
247
+ = How to improve performance (1)
248
+
249
+ Complex quote
250
+
251
+ = Complex quote
252
+
253
+ "AA""AAA"
254
+ "AA,AAA"
255
+ "AA\rAAA"
256
+ "AA\nAAA"
257
+
258
+ = Use (({StringScanner}))
259
+
260
+ * (({String#split})) is very fast
261
+ * But it's naive for complex quote
262
+
263
+ = 2.5 uses (({String#split}))
264
+
265
+ # rouge ruby
266
+
267
+ in_extended_column = false # "...\n..." case
268
+ @input.each_line do |line|
269
+ line.split(",", -1).each do |part|
270
+ if in_extended_column
271
+ # ...
272
+ elsif part.start_with?('"')
273
+ if part.end_with?('"')
274
+ row << part.gsub('""', '"') # "...""..." case
275
+ else
276
+ in_extended_column = true
277
+ end
278
+ # ...
279
+
280
+ = Parsing complex quote with (({split}))
281
+
282
+ # RT
283
+
284
+ Just quoted, 274.1i/s
285
+ Include sep1, 211.0i/s
286
+ Include sep2, 118.7i/s
287
+
288
+ (('tag:center'))
289
+ Slow down
290
+
291
+ = 2.6 uses (({StringScanner}))
292
+
293
+ # rouge ruby
294
+
295
+ row = []
296
+ until @scanner.eos?
297
+ value = parse_column_value
298
+ if @scanner.scan(/,/)
299
+ row << value
300
+ elsif @scanner.scan(/\n/)
301
+ row << value
302
+ yield(row)
303
+ row = []
304
+ end
305
+ end
306
+
307
+ = (({parse_column_value}))
308
+
309
+ # rouge ruby
310
+
311
+ def parse_column_value
312
+ parse_unquoted_column_value ||
313
+ parse_quoted_column_value
314
+ end
315
+
316
+ = (({parse_unquoted_column_value}))
317
+
318
+ # rouge ruby
319
+
320
+ def parse_unquoted_column_value
321
+ @scanner.scan(/[^,"\r\n]+/)
322
+ end
323
+
324
+ = (({parse_quoted_column_value}))
325
+
326
+ # rouge ruby
327
+
328
+ def parse_quoted_column_value
329
+ return nil unless @scanner.scan(/"/)
330
+ pos = @scanner.pos; value = ""
331
+ loop do
332
+ value << @scanner.scan(/[^"]+/)
333
+ if @scanner.scan(/""/) # "": Escaped quote
334
+ value << '"'
335
+ elsif @scanner.scan(/"/) # End
336
+ return value
337
+ else # No end quote
338
+ @scanner.pos = pos; return nil # Rewind and return
339
+ end; end; end
340
+
341
+ = Parse methods can be composed
342
+
343
+ # rouge ruby
344
+
345
+ def parse_column_value
346
+ parse_unquoted_column_value ||
347
+ parse_quoted_column_value
348
+ end
349
+
350
+ (('tag:center'))
351
+ Easy to maintain
352
+
353
+ = Parsing complex quote with (({StringScanner}))
354
+
355
+ # RT
356
+
357
+ Just quoted, 534.5i/s
358
+ Include sep1, 330.0i/s
359
+ Include sep2, 325.6i/s
360
+
361
+ (('tag:center'))
362
+ No slow down...?
363
+
364
+ = How to improve performance (2)
365
+
366
+ Simple case
367
+
368
+ = Simple case
369
+
370
+ AAAAA
371
+ "AAAAA"
372
+
373
+ = Use (({String#split}))
374
+
375
+ (({StringScanner})) is\n
376
+ slow for\n
377
+ simple case
378
+
379
+ = XXX
380
+
381
+ * Default gemified
382
+ * Reconstructed test suites for further improvements
383
+ * Benchmark suites for further performance improvements
384
+
385
+ = Faster
386
+
387
+ ...
388
+
389
+ = Further works
390
+
391
+ * Improve transcoding performance
392
+ * Implement CSV parser for simple case in C
Binary file
data/theme.rb ADDED
@@ -0,0 +1 @@
1
+ include_theme("clear-code")
metadata ADDED
@@ -0,0 +1,92 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: rabbit-slide-kou-rubykaigi-2019
3
+ version: !ruby/object:Gem::Version
4
+ version: 2019.4.19.0
5
+ platform: ruby
6
+ authors:
7
+ - Kouhei Sutou
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2019-04-08 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: rabbit
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - ">="
18
+ - !ruby/object:Gem::Version
19
+ version: 2.0.2
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - ">="
25
+ - !ruby/object:Gem::Version
26
+ version: 2.0.2
27
+ - !ruby/object:Gem::Dependency
28
+ name: rabbit-theme-clear-code
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ">="
32
+ - !ruby/object:Gem::Version
33
+ version: '0'
34
+ type: :runtime
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - ">="
39
+ - !ruby/object:Gem::Version
40
+ version: '0'
41
+ description: |-
42
+ csv, one of the standard libraries, in Ruby 2.6 has many improvements:
43
+
44
+ * Default gemified
45
+ * Faster CSV parsing
46
+ * Faster CSV writing
47
+ * Clean new CSV parser implementation for further improvements
48
+ * Reconstructed test suites for further improvements
49
+ * Benchmark suites for further performance improvements
50
+
51
+ These improvements are done without breaking backward compatibility.
52
+
53
+ This talk describes details of these improvements by a new csv maintainer.
54
+ email:
55
+ - kou@clear-code.com
56
+ executables: []
57
+ extensions: []
58
+ extra_rdoc_files: []
59
+ files:
60
+ - ".rabbit"
61
+ - README.rd
62
+ - Rakefile
63
+ - config.yaml
64
+ - csv.rab
65
+ - images/clear-code-rubykaigi-2019-silver-sponsor.png
66
+ - pdf/rubykaigi-2019-csv.pdf
67
+ - theme.rb
68
+ homepage: https://slide.rabbit-shocker.org/authors/kou/rubykaigi-2019/
69
+ licenses:
70
+ - CC-BY-SA-4.0
71
+ metadata: {}
72
+ post_install_message:
73
+ rdoc_options: []
74
+ require_paths:
75
+ - lib
76
+ required_ruby_version: !ruby/object:Gem::Requirement
77
+ requirements:
78
+ - - ">="
79
+ - !ruby/object:Gem::Version
80
+ version: '0'
81
+ required_rubygems_version: !ruby/object:Gem::Requirement
82
+ requirements:
83
+ - - ">="
84
+ - !ruby/object:Gem::Version
85
+ version: '0'
86
+ requirements: []
87
+ rubyforge_project:
88
+ rubygems_version: 2.7.6.2
89
+ signing_key:
90
+ specification_version: 4
91
+ summary: Better CSV processing with Ruby 2.6
92
+ test_files: []