rabbit-slide-kou-data-science-rb 2017.5.19.2 → 2017.5.19.3
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/config.yaml +1 -1
- data/pdf/data-science-rb-ruby-with-apache-arrow-joins-data-processing-languages.pdf +0 -0
- data/ruby-with-apache-arrow-joins-data-processing-languages.rab +20 -19
- data/sample/read-tensor-gsl.rb +11 -0
- data/sample/read-tensor-nmatrix.rb +11 -0
- data/sample/read-tensor-numo-narray.rb +11 -0
- metadata +5 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 43bab7a71fb5de0ab27f1b82377c5164aaa519ca
|
4
|
+
data.tar.gz: 5c7b4e2166efc143d8e43f2739957f6cfeea232d
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 69473a1db8461ad44afe52498c83590269a903d8144b33f3a0ab07ac939b04d22fcfc8cc523101d11e3d27a1a835716a187d02703060f27767acc540f989946f
|
7
|
+
data.tar.gz: 39eefdc301d3f32ec74f55e22a33f3970d7346e93baab62be512f349ecd593fe89818565611d3c6e615c0b586c5cd90e89441e9a78031f328e94521f32c3e07c
|
data/config.yaml
CHANGED
Binary file
|
@@ -117,13 +117,14 @@ Rubyでやりたい
|
|
117
117
|
|
118
118
|
# rouge python
|
119
119
|
# pandasでデータ生成→Arrow形式で書き込み
|
120
|
+
import pandas as pd
|
120
121
|
import pyarrow as pa
|
121
122
|
|
122
123
|
df = pd.DataFrame({"a": [1, 2, 3],
|
123
124
|
"b": ["hello", "world", "!"]})
|
124
125
|
record_batch = pa.RecordBatch.from_pandas(df)
|
125
126
|
|
126
|
-
with pa.OSFile("/
|
127
|
+
with pa.OSFile("/dev/shm/pandas.arrow", "wb") as sink:
|
127
128
|
schema = record_batch.schema
|
128
129
|
writer = pa.RecordBatchFileWriter(sink, schema)
|
129
130
|
writer.write_batch(record_batch)
|
@@ -136,7 +137,7 @@ Rubyでやりたい
|
|
136
137
|
require "arrow"
|
137
138
|
|
138
139
|
Input = Arrow::MemoryMappedInputStream
|
139
|
-
Input.open("/
|
140
|
+
Input.open("/dev/shm/pandas.arrow") do |input|
|
140
141
|
reader = Arrow::RecordBatchFileReader.new(input)
|
141
142
|
reader.each do |record_batch|
|
142
143
|
puts("=" * 40)
|
@@ -153,7 +154,7 @@ Rubyでやりたい
|
|
153
154
|
local Arrow = lgi.Arrow
|
154
155
|
|
155
156
|
local input_class = Arrow.MemoryMappedInputStream
|
156
|
-
local input = input_class.new("/
|
157
|
+
local input = input_class.new("/dev/shm/pandas.arrow")
|
157
158
|
local reader = Arrow.RecordBatchFileReader.new(input)
|
158
159
|
for i = 0, reader:get_n_record_batches() - 1 do
|
159
160
|
local record_batch = reader:get_record_batch(i)
|
@@ -171,7 +172,7 @@ Rubyでやりたい
|
|
171
172
|
|
172
173
|
df = data.frame(a=c(1, 2, 3),
|
173
174
|
b=c(1.1, 2.2, 3.3))
|
174
|
-
write_feather(df, "/
|
175
|
+
write_feather(df, "/dev/shm/dataframe.feather")
|
175
176
|
|
176
177
|
= Feather:Ruby
|
177
178
|
|
@@ -180,7 +181,7 @@ Rubyでやりたい
|
|
180
181
|
require "arrow"
|
181
182
|
|
182
183
|
Input = Arrow::MemoryMappedInputStream
|
183
|
-
Input.open("/
|
184
|
+
Input.open("/dev/shm/dataframe.feather") do |input|
|
184
185
|
reader = Arrow::FeatherFileReader.new(input)
|
185
186
|
reader.columns.each do |column|
|
186
187
|
puts("#{column.name}: #{column.to_a.inspect}")
|
@@ -198,7 +199,7 @@ Rubyでやりたい
|
|
198
199
|
df = pd.DataFrame({"a": [1, 2, 3],
|
199
200
|
"b": ["hello", "world", "!"]})
|
200
201
|
table = pa.Table.from_pandas(df)
|
201
|
-
pq.write_table(table, "/
|
202
|
+
pq.write_table(table, "/dev/shm/pandas.parquet")
|
202
203
|
|
203
204
|
= Parquet:Ruby
|
204
205
|
|
@@ -207,7 +208,7 @@ Rubyでやりたい
|
|
207
208
|
require "arrow"
|
208
209
|
require "parquet"
|
209
210
|
|
210
|
-
path = "/
|
211
|
+
path = "/dev/shm/pandas.parquet"
|
211
212
|
reader = Parquet::ArrowFileReader.new(path)
|
212
213
|
table = reader.read_table
|
213
214
|
table.each_column do |column|
|
@@ -237,14 +238,14 @@ Rubyでやりたい
|
|
237
238
|
# rouge ruby
|
238
239
|
# 空のテーブルにArrow形式のデータを読み込む
|
239
240
|
logs = Groonga::Array.create(name: "logs")
|
240
|
-
logs.load_arrow("/
|
241
|
+
logs.load_arrow("/dev/shm/pandas.arrow")
|
241
242
|
logs.each {|record| p record.attributes}
|
242
243
|
# フィルター
|
243
244
|
filtered_logs = logs.select do |record|
|
244
245
|
record.b =~ "hello" # "hello"で全文検索
|
245
246
|
end
|
246
247
|
# フィルター結果をArrow形式で書き込み
|
247
|
-
filtered_logs.dump_arrow("/
|
248
|
+
filtered_logs.dump_arrow("/dev/shm/filtered.arrow",
|
248
249
|
column_names: ["a", "b"])
|
249
250
|
|
250
251
|
= Groonga:Python
|
@@ -253,7 +254,7 @@ Rubyでやりたい
|
|
253
254
|
# Arrow形式のGroongaでのフィルター結果を読み込む
|
254
255
|
import pyarrow as pa
|
255
256
|
|
256
|
-
with pa.OSFile("/
|
257
|
+
with pa.OSFile("/dev/shm/filtered.arrow") as source:
|
257
258
|
writer = pa.RecordBatchFileReader(source)
|
258
259
|
print(writer.get_record_batch(0).to_pandas())
|
259
260
|
|
@@ -294,7 +295,7 @@ Rubyでやりたい
|
|
294
295
|
ndarray = np.random.randn(10, 6) # 10x6
|
295
296
|
print(ndarray)
|
296
297
|
tensor = pa.Tensor.from_numpy(ndarray)
|
297
|
-
with pa.OSFile("/
|
298
|
+
with pa.OSFile("/dev/shm/tensor.arrow", "wb") as sink:
|
298
299
|
pa.write_tensor(tensor, sink)
|
299
300
|
|
300
301
|
= Tensor:Ruby
|
@@ -304,7 +305,7 @@ Rubyでやりたい
|
|
304
305
|
require "arrow"
|
305
306
|
|
306
307
|
Input = Arrow::MemoryMappedInputStream
|
307
|
-
Input.open("/
|
308
|
+
Input.open("/dev/shm/tensor.arrow") do |input|
|
308
309
|
tensor = input.read_tensor(0)
|
309
310
|
p tensor.shape # => [10, 6]
|
310
311
|
end
|
@@ -318,7 +319,7 @@ Rubyでやりたい
|
|
318
319
|
require "pp"
|
319
320
|
|
320
321
|
Input = Arrow::MemoryMappedInputStream
|
321
|
-
Input.open("/
|
322
|
+
Input.open("/dev/shm/tensor.arrow") do |input|
|
322
323
|
tensor = input.read_tensor(0)
|
323
324
|
pp tensor.to_gsl
|
324
325
|
# tensor.to_gsl.to_arrow == tensor
|
@@ -333,7 +334,7 @@ Rubyでやりたい
|
|
333
334
|
require "pp"
|
334
335
|
|
335
336
|
Input = Arrow::MemoryMappedInputStream
|
336
|
-
Input.open("/
|
337
|
+
Input.open("/dev/shm/tensor.arrow") do |input|
|
337
338
|
tensor = input.read_tensor(0)
|
338
339
|
pp tensor.to_nmatrix
|
339
340
|
# tensor.to_nmatrix.to_arrow == tensor
|
@@ -348,7 +349,7 @@ Rubyでやりたい
|
|
348
349
|
require "pp"
|
349
350
|
|
350
351
|
Input = Arrow::MemoryMappedInputStream
|
351
|
-
Input.open("/
|
352
|
+
Input.open("/dev/shm/tensor.arrow") do |input|
|
352
353
|
tensor = input.read_tensor(0)
|
353
354
|
pp tensor.to_narray
|
354
355
|
# tensor.to_narray.to_arrow == tensor
|
@@ -382,8 +383,8 @@ Rubyでやりたい
|
|
382
383
|
|
383
384
|
* Rubyでデータ処理したいなぁ!\n
|
384
385
|
の実現を目指すプロジェクト
|
385
|
-
*
|
386
|
-
*
|
386
|
+
* リソース:
|
387
|
+
* GitHub: red-data-tools
|
387
388
|
* https://red-data-tools.github.io
|
388
389
|
* https://gitter.im/red-data-tools
|
389
390
|
|
@@ -457,7 +458,7 @@ Because we use Ruby!
|
|
457
458
|
|
458
459
|
* Rubyでデータ処理したい人!
|
459
460
|
* ポリシーに同意できる人!
|
460
|
-
*
|
461
|
-
*
|
461
|
+
* リソース:
|
462
|
+
* GitHub: red-data-tools
|
462
463
|
* https://red-data-tools.github.io
|
463
464
|
* https://gitter.im/red-data-tools
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: rabbit-slide-kou-data-science-rb
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 2017.5.19.
|
4
|
+
version: 2017.5.19.3
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Kouhei Sutou
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2017-05-
|
11
|
+
date: 2017-05-20 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rabbit
|
@@ -59,6 +59,9 @@ files:
|
|
59
59
|
- sample/read-pandas.lua
|
60
60
|
- sample/read-pandas.rb
|
61
61
|
- sample/read-parquet.rb
|
62
|
+
- sample/read-tensor-gsl.rb
|
63
|
+
- sample/read-tensor-nmatrix.rb
|
64
|
+
- sample/read-tensor-numo-narray.rb
|
62
65
|
- sample/read-tensor.rb
|
63
66
|
- sample/write-feather.R
|
64
67
|
- sample/write-pandas.py
|