rabbit-slide-kou-data-science-rb 2017.5.19.2 → 2017.5.19.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/config.yaml +1 -1
- data/pdf/data-science-rb-ruby-with-apache-arrow-joins-data-processing-languages.pdf +0 -0
- data/ruby-with-apache-arrow-joins-data-processing-languages.rab +20 -19
- data/sample/read-tensor-gsl.rb +11 -0
- data/sample/read-tensor-nmatrix.rb +11 -0
- data/sample/read-tensor-numo-narray.rb +11 -0
- metadata +5 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 43bab7a71fb5de0ab27f1b82377c5164aaa519ca
|
4
|
+
data.tar.gz: 5c7b4e2166efc143d8e43f2739957f6cfeea232d
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 69473a1db8461ad44afe52498c83590269a903d8144b33f3a0ab07ac939b04d22fcfc8cc523101d11e3d27a1a835716a187d02703060f27767acc540f989946f
|
7
|
+
data.tar.gz: 39eefdc301d3f32ec74f55e22a33f3970d7346e93baab62be512f349ecd593fe89818565611d3c6e615c0b586c5cd90e89441e9a78031f328e94521f32c3e07c
|
data/config.yaml
CHANGED
Binary file
|
@@ -117,13 +117,14 @@ Rubyでやりたい
|
|
117
117
|
|
118
118
|
# rouge python
|
119
119
|
# pandasでデータ生成→Arrow形式で書き込み
|
120
|
+
import pandas as pd
|
120
121
|
import pyarrow as pa
|
121
122
|
|
122
123
|
df = pd.DataFrame({"a": [1, 2, 3],
|
123
124
|
"b": ["hello", "world", "!"]})
|
124
125
|
record_batch = pa.RecordBatch.from_pandas(df)
|
125
126
|
|
126
|
-
with pa.OSFile("/
|
127
|
+
with pa.OSFile("/dev/shm/pandas.arrow", "wb") as sink:
|
127
128
|
schema = record_batch.schema
|
128
129
|
writer = pa.RecordBatchFileWriter(sink, schema)
|
129
130
|
writer.write_batch(record_batch)
|
@@ -136,7 +137,7 @@ Rubyでやりたい
|
|
136
137
|
require "arrow"
|
137
138
|
|
138
139
|
Input = Arrow::MemoryMappedInputStream
|
139
|
-
Input.open("/
|
140
|
+
Input.open("/dev/shm/pandas.arrow") do |input|
|
140
141
|
reader = Arrow::RecordBatchFileReader.new(input)
|
141
142
|
reader.each do |record_batch|
|
142
143
|
puts("=" * 40)
|
@@ -153,7 +154,7 @@ Rubyでやりたい
|
|
153
154
|
local Arrow = lgi.Arrow
|
154
155
|
|
155
156
|
local input_class = Arrow.MemoryMappedInputStream
|
156
|
-
local input = input_class.new("/
|
157
|
+
local input = input_class.new("/dev/shm/pandas.arrow")
|
157
158
|
local reader = Arrow.RecordBatchFileReader.new(input)
|
158
159
|
for i = 0, reader:get_n_record_batches() - 1 do
|
159
160
|
local record_batch = reader:get_record_batch(i)
|
@@ -171,7 +172,7 @@ Rubyでやりたい
|
|
171
172
|
|
172
173
|
df = data.frame(a=c(1, 2, 3),
|
173
174
|
b=c(1.1, 2.2, 3.3))
|
174
|
-
write_feather(df, "/
|
175
|
+
write_feather(df, "/dev/shm/dataframe.feather")
|
175
176
|
|
176
177
|
= Feather:Ruby
|
177
178
|
|
@@ -180,7 +181,7 @@ Rubyでやりたい
|
|
180
181
|
require "arrow"
|
181
182
|
|
182
183
|
Input = Arrow::MemoryMappedInputStream
|
183
|
-
Input.open("/
|
184
|
+
Input.open("/dev/shm/dataframe.feather") do |input|
|
184
185
|
reader = Arrow::FeatherFileReader.new(input)
|
185
186
|
reader.columns.each do |column|
|
186
187
|
puts("#{column.name}: #{column.to_a.inspect}")
|
@@ -198,7 +199,7 @@ Rubyでやりたい
|
|
198
199
|
df = pd.DataFrame({"a": [1, 2, 3],
|
199
200
|
"b": ["hello", "world", "!"]})
|
200
201
|
table = pa.Table.from_pandas(df)
|
201
|
-
pq.write_table(table, "/
|
202
|
+
pq.write_table(table, "/dev/shm/pandas.parquet")
|
202
203
|
|
203
204
|
= Parquet:Ruby
|
204
205
|
|
@@ -207,7 +208,7 @@ Rubyでやりたい
|
|
207
208
|
require "arrow"
|
208
209
|
require "parquet"
|
209
210
|
|
210
|
-
path = "/
|
211
|
+
path = "/dev/shm/pandas.parquet"
|
211
212
|
reader = Parquet::ArrowFileReader.new(path)
|
212
213
|
table = reader.read_table
|
213
214
|
table.each_column do |column|
|
@@ -237,14 +238,14 @@ Rubyでやりたい
|
|
237
238
|
# rouge ruby
|
238
239
|
# 空のテーブルにArrow形式のデータを読み込む
|
239
240
|
logs = Groonga::Array.create(name: "logs")
|
240
|
-
logs.load_arrow("/
|
241
|
+
logs.load_arrow("/dev/shm/pandas.arrow")
|
241
242
|
logs.each {|record| p record.attributes}
|
242
243
|
# フィルター
|
243
244
|
filtered_logs = logs.select do |record|
|
244
245
|
record.b =~ "hello" # "hello"で全文検索
|
245
246
|
end
|
246
247
|
# フィルター結果をArrow形式で書き込み
|
247
|
-
filtered_logs.dump_arrow("/
|
248
|
+
filtered_logs.dump_arrow("/dev/shm/filtered.arrow",
|
248
249
|
column_names: ["a", "b"])
|
249
250
|
|
250
251
|
= Groonga:Python
|
@@ -253,7 +254,7 @@ Rubyでやりたい
|
|
253
254
|
# Arrow形式のGroongaでのフィルター結果を読み込む
|
254
255
|
import pyarrow as pa
|
255
256
|
|
256
|
-
with pa.OSFile("/
|
257
|
+
with pa.OSFile("/dev/shm/filtered.arrow") as source:
|
257
258
|
writer = pa.RecordBatchFileReader(source)
|
258
259
|
print(writer.get_record_batch(0).to_pandas())
|
259
260
|
|
@@ -294,7 +295,7 @@ Rubyでやりたい
|
|
294
295
|
ndarray = np.random.randn(10, 6) # 10x6
|
295
296
|
print(ndarray)
|
296
297
|
tensor = pa.Tensor.from_numpy(ndarray)
|
297
|
-
with pa.OSFile("/
|
298
|
+
with pa.OSFile("/dev/shm/tensor.arrow", "wb") as sink:
|
298
299
|
pa.write_tensor(tensor, sink)
|
299
300
|
|
300
301
|
= Tensor:Ruby
|
@@ -304,7 +305,7 @@ Rubyでやりたい
|
|
304
305
|
require "arrow"
|
305
306
|
|
306
307
|
Input = Arrow::MemoryMappedInputStream
|
307
|
-
Input.open("/
|
308
|
+
Input.open("/dev/shm/tensor.arrow") do |input|
|
308
309
|
tensor = input.read_tensor(0)
|
309
310
|
p tensor.shape # => [10, 6]
|
310
311
|
end
|
@@ -318,7 +319,7 @@ Rubyでやりたい
|
|
318
319
|
require "pp"
|
319
320
|
|
320
321
|
Input = Arrow::MemoryMappedInputStream
|
321
|
-
Input.open("/
|
322
|
+
Input.open("/dev/shm/tensor.arrow") do |input|
|
322
323
|
tensor = input.read_tensor(0)
|
323
324
|
pp tensor.to_gsl
|
324
325
|
# tensor.to_gsl.to_arrow == tensor
|
@@ -333,7 +334,7 @@ Rubyでやりたい
|
|
333
334
|
require "pp"
|
334
335
|
|
335
336
|
Input = Arrow::MemoryMappedInputStream
|
336
|
-
Input.open("/
|
337
|
+
Input.open("/dev/shm/tensor.arrow") do |input|
|
337
338
|
tensor = input.read_tensor(0)
|
338
339
|
pp tensor.to_nmatrix
|
339
340
|
# tensor.to_nmatrix.to_arrow == tensor
|
@@ -348,7 +349,7 @@ Rubyでやりたい
|
|
348
349
|
require "pp"
|
349
350
|
|
350
351
|
Input = Arrow::MemoryMappedInputStream
|
351
|
-
Input.open("/
|
352
|
+
Input.open("/dev/shm/tensor.arrow") do |input|
|
352
353
|
tensor = input.read_tensor(0)
|
353
354
|
pp tensor.to_narray
|
354
355
|
# tensor.to_narray.to_arrow == tensor
|
@@ -382,8 +383,8 @@ Rubyでやりたい
|
|
382
383
|
|
383
384
|
* Rubyでデータ処理したいなぁ!\n
|
384
385
|
の実現を目指すプロジェクト
|
385
|
-
*
|
386
|
-
*
|
386
|
+
* リソース:
|
387
|
+
* GitHub: red-data-tools
|
387
388
|
* https://red-data-tools.github.io
|
388
389
|
* https://gitter.im/red-data-tools
|
389
390
|
|
@@ -457,7 +458,7 @@ Because we use Ruby!
|
|
457
458
|
|
458
459
|
* Rubyでデータ処理したい人!
|
459
460
|
* ポリシーに同意できる人!
|
460
|
-
*
|
461
|
-
*
|
461
|
+
* リソース:
|
462
|
+
* GitHub: red-data-tools
|
462
463
|
* https://red-data-tools.github.io
|
463
464
|
* https://gitter.im/red-data-tools
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: rabbit-slide-kou-data-science-rb
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 2017.5.19.
|
4
|
+
version: 2017.5.19.3
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Kouhei Sutou
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2017-05-
|
11
|
+
date: 2017-05-20 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rabbit
|
@@ -59,6 +59,9 @@ files:
|
|
59
59
|
- sample/read-pandas.lua
|
60
60
|
- sample/read-pandas.rb
|
61
61
|
- sample/read-parquet.rb
|
62
|
+
- sample/read-tensor-gsl.rb
|
63
|
+
- sample/read-tensor-nmatrix.rb
|
64
|
+
- sample/read-tensor-numo-narray.rb
|
62
65
|
- sample/read-tensor.rb
|
63
66
|
- sample/write-feather.R
|
64
67
|
- sample/write-pandas.py
|