RubyGems - rabbit-slide-kou-data-science-rb - Versions diffs - 2017.5.19.0 - Mend

rabbit-slide-kou-data-science-rb 2017.5.19.0

Files changed (18) hide show

checksums.yaml +7 -0
data/.rabbit +1 -0
data/README.rd +38 -0
data/Rakefile +17 -0
data/config.yaml +24 -0
data/pdf/data-science-rb-ruby-with-apache-arrow-joins-data-processing-languages.pdf +0 -0
data/ruby-with-apache-arrow-joins-data-processing-languages.rab +444 -0
data/sample/filter-groonga.rb +15 -0
data/sample/read-feather.rb +10 -0
data/sample/read-groonga.py +7 -0
data/sample/read-pandas.rb +11 -0
data/sample/read-parquet.rb +10 -0
data/sample/read-tensor.rb +15 -0
data/sample/write-feather.R +5 -0
data/sample/write-pandas.py +14 -0
data/sample/write-parquet.py +10 -0
data/sample/write-tensor.py +10 -0
metadata +90 -0

checksums.yaml ADDED

@@ -0,0 +1,7 @@
+---
+SHA1:
+  metadata.gz: 6adc80216160df3aa2fcc4e681adb8adef33c9ff
+  data.tar.gz: 1c8032d96c9c619f40bf1789a937408da47bb597
+SHA512:
+  metadata.gz: 677dc10e63a5bf2384757c4f679b4843fb82577c44fc2b6270df8e21b43f76a80fa8e6ca3e3c1eaf305ff0d69160cf1358701dac8e745e1f24d7dc09f32b9ff6
+  data.tar.gz: 431d971ef33ec8e19e5ffcb730d1fc6404e4cb1f8e09cd1cbea5d8dd0d1db09a3c0c7beef1dec1a6bf82258609bdfe46703b7d2a77f4f4036f41acd0eda63549

data/.rabbit ADDED

@@ -0,0 +1 @@
+ruby-with-apache-arrow-joins-data-processing-languages.rab

data/README.rd ADDED

@@ -0,0 +1,38 @@
+= RubyもApache Arrowでデータ処理言語の仲間入り
+Apache Arrowはデータ分析システム間でのデータ交換を効率化することを目的としたデータフォーマットです。pandasやApache SparkやRなど主要なデータ分析システムはApache Arrowの対応を進めています。近い将来、データ分析システム間でのデータ交換はApache Arrowを使う状態になるでしょう。RubyもApache Arrowに対応することで既存のデータ分析システムとデータ交換できるようになります。これで、Rubyもデータ分析システムの一部でデータ処理をできるようになります！
+== ライセンス
+=== スライド
+CC BY-SA 4.0
+原著作者名は以下の通りです。
+  * 須藤功平（またはKouhei Sutou）
+=== プログラム
+CC0（パブリックドメイン）
+== 作者向け
+=== 表示
+  rake
+=== 公開
+  rake publish
+== 閲覧者向け
+=== インストール
+  gem install rabbit-slide-kou-data-science-rb
+=== 表示
+  rabbit rabbit-slide-kou-data-science-rb.gem

data/Rakefile ADDED

@@ -0,0 +1,17 @@
+require "rabbit/task/slide"
+# Edit ./config.yaml to customize meta data
+spec = nil
+Rabbit::Task::Slide.new do |task|
+  spec = task.spec
+  spec.files += Dir.glob("sample/**/*.*")
+  # spec.files -= Dir.glob("private/**/*.*")
+  spec.add_runtime_dependency("rabbit-theme-clear-code")
+end
+desc "Tag #{spec.version}"
+task :tag do
+  sh("git", "tag", "-a", spec.version.to_s, "-m", "Publish #{spec.version}")
+  sh("git", "push", "--tags")
+end

data/config.yaml ADDED

@@ -0,0 +1,24 @@
+---
+id: data-science-rb
+base_name: ruby-with-apache-arrow-joins-data-processing-languages
+tags:
+- rabbit
+- ruby
+- arrow
+- data
+presentation_date: 2017-05-19
+version: 2017.5.19.0
+licenses:
+- CC-BY-SA-4.0
+slideshare_id: datasciencerb
+speaker_deck_id:
+ustream_id:
+vimeo_id:
+youtube_id:
+author:
+  markup_language: :rd
+  name: Kouhei Sutou
+  email: kou@clear-code.com
+  rubygems_user: kou
+  slideshare_user:
+  speaker_deck_user:

data/pdf/data-science-rb-ruby-with-apache-arrow-joins-data-processing-languages.pdf ADDED

Binary file

data/ruby-with-apache-arrow-joins-data-processing-languages.rab ADDED

@@ -0,0 +1,444 @@
+= RubyもApache Arrowで\nデータ処理言語の\n仲間入り
+: author
+   須藤功平
+: institution
+   クリアコード
+: content-source
+   DataScience.rbワークショップ
+: date
+   2017-05-19
+: allotted-time
+   20m
+: theme
+   clear-code
+= はじめに
+(('tag:center'))
+(('tag:large'))
+私はRubyが好きだ
+(('tag:center'))
+(('tag:large'))
+だからデータ分析だって\n
+Rubyでやりたい
+(('tag:center'))
+(('note:Rubyよりも向いている言語があるのはわかっているけどさー'))
+= Apache Arrow
+(('tag:center'))
+データフォーマットの仕様
+(('tag:center'))
+と
+(('tag:center'))
+その仕様を処理する実装
+= Arrow：解決したい問題
+  * 高いデータ交換コスト
+    * →低くしたい
+  * 重複した最適化実装
+    * →実装を共有したい
+= Arrow：文脈
+ビッグデータの分析
+= ビッグデータの分析
+  * いろんなシステムが連携
+    * Java実装のもろもろとPythonとR
+  * システム間でデータ交換が必要
+    * 交換する度にシリアライズ・パース
+    * (('wait'))↑に結構CPUと時間を使われる…
+    * (('wait'))そんなのより分析処理に使いたい！
+= Arrow：解決方針
+コストゼロの\n
+シリアライズ・\n
+パース
+= Arrow：コストゼロの実現
+  * そのまま使えるフォーマット
+    * 例:int8の配列→int8の値を連続配置
+    * 1バイトずつずらせば高速アクセス可
+  * Arrowのトレードオフ
+    * サイズ圧縮よりシリアライズゼロ
+    * 参考：Parquetはサイズ圧縮優先
+= Arrowがある世界
+  * 各システムがサクサク連携
+    * 例：PySparkが高速化
+    * 理由:Py🡘Javaのデータ交換コスト減
+  * Java・Python・R以外も活躍
+    * 例：Ruby・Go・Rust・Juliaとか
+    * 理由：低コストでデータ交換可能
+= ArrowとRuby
+チャンス！
+= ArrowとRubyとデータ分析
+  * RubyがArrowに対応
+    * Rubyにデータが回ってくる！
+    * →Rubyにもデータ分析の機会が！\n
+      (('note:（今はできることは少ないだろうけど…）'))
+  * (('wait'))次のステップ
+    * できることを増やしていく！
+    * →Rubyでもいろいろデータ分析！
+= ArrowとRubyの今
+  * RubyでArrowを使える！
+    * 私が使えるようにしているから！\n
+      (('note:コミッターにもなった'))
+    * 公式リポジトリーにも入っている\n
+      (('note:厳密に言うと違うんだけど公式サポートだと思ってよい'))
+  * Rubyでデータを読み書きできる
+    * いくらかデータ処理もできる
+= 今できること
+  * Python・Rとのデータ交換
+  * データ処理をいくらか
+  * Rubyの各種ライブラリー間での\n
+    データ交換
+= Arrow：Python
+  # coderay python
+  # pandasでデータ生成→Arrow形式で書き込み
+  import pyarrow as pa
+  df = pd.DataFrame({"a": [1, 2, 3],
+                     "b": ["hello", "world", "!"]})
+  record_batch = pa.RecordBatch.from_pandas(df)
+  with pa.OSFile("/tmp/pandas.arrow", "wb") as sink:
+      schema = record_batch.schema
+      writer = pa.RecordBatchFileWriter(sink, schema)
+      writer.write_batch(record_batch)
+      writer.close()
+= Arrow：Ruby
+  # coderay ruby
+  # RubyでArrow形式のpandasのデータを読み込み
+  require "arrow"
+  Input = Arrow::MemoryMappedInputStream
+  Input.open("/tmp/pandas.arrow") do |input|
+    reader = Arrow::RecordBatchFileReader.new(input)
+    reader.each do |record_batch|
+      puts("=" * 40)
+      puts(record_batch)
+    end
+  end
+= Feather：R
+  # coderay R
+  # Rでデータ生成→Feather形式で書き込み
+  library("feather")
+  df = data.frame(a=c(1, 2, 3),
+                  b=c(1.1, 2.2, 3.3))
+  write_feather(df, "/tmp/dataframe.feather")
+= Feather：Ruby
+  # coderay ruby
+  # RubyでFeather形式のRのデータを読み込み
+  require "arrow"
+  Input = Arrow::MemoryMappedInputStream
+  Input.open("/tmp/dataframe.feather") do |input|
+    reader = Arrow::FeatherFileReader.new(input)
+    reader.columns.each do |column|
+      puts("#{column.name}: #{column.to_a.inspect}")
+    end
+  end
+= Parquet：Python
+  # coderay python
+  # Pythonでデータ生成→Parquet形式で書き込み
+  import pandas as pd
+  import pyarrow as pa
+  import pyarrow.parquet as pq
+  df = pd.DataFrame({"a": [1, 2, 3],
+                     "b": ["hello", "world", "!"]})
+  table = pa.Table.from_pandas(df)
+  pq.write_table(table, "/tmp/pandas.parquet")
+= Parquet：Ruby
+  # coderay ruby
+  # RubyでParquet形式のデータを読み込み
+  require "arrow"
+  require "parquet"
+  path = "/tmp/pandas.parquet"
+  reader = Parquet::ArrowFileReader.new(path)
+  table = reader.read_table
+  table.each_column do |column|
+    puts("#{column.name}: #{column.to_a.inspect}")
+  end
+= 対応データ形式まとめ
+  * Arrow形式
+    * 各種言語(('note:（これから広く使われているはず）'))
+  * Feather形式
+    * Python・R専用
+  * Parquet形式
+    * 各種言語(('note:（Hadoop界隈ですでに広く使われている）'))
+= データ処理例
+  * Groongaでフィルター
+  * Groonga
+    * 全文検索エンジン
+    * カラムストアなので集計処理も得意
+    * Apache Arrow対応
+    * よくできたRubyバインディングあり
+= Groonga：Ruby
+  # coderay ruby
+  # 空のテーブルにArrow形式のデータを読み込む
+  logs = Groonga::Array.create(name: "logs")
+  logs.load_arrow("/tmp/pandas.arrow")
+  logs.each {|record| p record.attributes}
+  # フィルター
+  filtered_logs = logs.select do |record|
+    record.b =~ "hello" # "hello"で全文検索
+  end
+  # フィルター結果をArrow形式で書き込み
+  filtered_logs.dump_arrow("/tmp/filtered.arrow",
+                           column_names: ["a", "b"])
+= Groonga：Python
+  # coderay python
+  # Arrow形式のGroongaでのフィルター結果を読み込む
+  import pyarrow as pa
+  with pa.OSFile("/tmp/filtered.arrow") as source:
+      writer = pa.RecordBatchFileReader(source)
+      print(writer.get_record_batch(0).to_pandas())
+= Rubyでデータ処理（現状）
+  * 既存のCライブラリーを活用
+    * 速度がでるし機能もある
+  * CライブラリーをArrowに対応
+    * Arrow→Ruby→Cライブラリー\n
+      ↑から↓で高速化(('note:（オブジェクト生成は遅い）'))
+    * Arrow→Cライブラリー
+= Rubyでデータ処理（案）
+  * Fluentdとか速くなりそう
+    * 途中でメッセージを参照しないなら
+  * MessagePackからArrowに変える
+    * Arrowのまま出力先へ送る
+    * 途中でRubyオブジェクトができない\n
+      (('note:シリアライズ・パースがなくなって速い！'))
+= 多次元配列
+  * Arrowではオプション機能
+    * テンソルと呼んでいる\n
+      (('note:（traditional multidimensional array objectと説明）'))
+  * C++実装ではサポート
+    * バインディングでは使える
+    * Python・Ruby…では使える
+= Tensor：Python
+  # coderay python
+  # NumPyでデータ生成→書き込み
+  import pyarrow as pa
+  import numpy as np
+  ndarray = np.random.randn(10, 6) # 10x6
+  print(ndarray)
+  tensor = pa.Tensor.from_numpy(ndarray)
+  with pa.OSFile("/tmp/tensor.arrow", "wb") as sink:
+      pa.write_tensor(tensor, sink)
+= Tensor：Ruby
+  # coderay ruby
+  # Rubyで読み込み
+  require "arrow"
+  Input = Arrow::MemoryMappedInputStream
+  Input.open("/tmp/tensor.arrow") do |input|
+    tensor = input.read_tensor(0)
+    p tensor.shape # => [10, 6]
+  end
+= Ruby：GSL
+  # coderay ruby
+  # GSLオブジェクトに変換
+  require "arrow"
+  require "arrow-gsl"
+  require "pp"
+  Input = Arrow::MemoryMappedInputStream
+  Input.open("/tmp/tensor.arrow") do |input|
+    tensor = input.read_tensor(0)
+    pp tensor.to_gsl
+    # tensor.to_gsl.to_arrow == tensor
+  end
+= Ruby：NMatrix
+  # coderay ruby
+  # NMatrixオブジェクトに変換
+  require "arrow"
+  require "arrow-nmatrix"
+  require "pp"
+  Input = Arrow::MemoryMappedInputStream
+  Input.open("/tmp/tensor.arrow") do |input|
+    tensor = input.read_tensor(0)
+    pp tensor.to_nmatrix
+    # tensor.to_nmatrix.to_arrow == tensor
+  end
+= Ruby：Numo::NArray
+  # coderay ruby
+  # Numo::NArrayオブジェクトに変換
+  require "arrow"
+  require "arrow-numo-narray"
+  require "pp"
+  Input = Arrow::MemoryMappedInputStream
+  Input.open("/tmp/tensor.arrow") do |input|
+    tensor = input.read_tensor(0)
+    pp tensor.to_narray
+    # tensor.to_narray.to_arrow == tensor
+  end
+= ここまでのまとめ1
+  * Arrowが実現したい世界
+    * データ交換コストが低い世界
+    * 最適化実装を共有している世界
+= ここまでのまとめ2
+  * RubyとArrowの今
+    * ArrowはRubyを公式サポート！
+    * Rubyの外の世界とデータ交換可能\n
+      (('note:（Arrow・Feather・Parquetをサポート）'))
+    * Rubyの各種ライブラリーとの\n
+      相互変換が可能\n
+      (('note:（メモリーコピーぐらいのコストで）'))
+= ArrowとRubyとこれから
+  * Arrow
+    * データフレーム処理の最適化実装
+    * マルチコア・GPU対応
+  * Ruby
+    * Red Data Toolsプロジェクト
+= Red Data Tools
+  * Rubyでデータ処理したいなぁ！\n
+    の実現を目指すプロジェクト
+  * URL：
+    * https://github.com/red-data-tools
+    * https://red-data-tools.github.io
+    * https://gitter.im/red-data-tools
+= 既存プロダクト
+  * Red Arrow(('note:（ArrowのRubyバインディング）'))
+    * Red Arrow XXX(('note:（ArrowとXXXの相互変換）'))
+  * Parquet GLib(('note:（ParquetのGLibバインディング）'))
+  * Red Parquet(('note:（ParquetのRubyバインディング）'))
+  * Jekyll Jupyter Notebook plugin(('note:（JekyllでJupyter Notebookを表示）'))
+= ポリシー1
+(('tag:center'))
+Collaborate\n
+over Ruby communities
+(('tag:center'))
+(('note:Ruby以外の人たちとも言語を超えて協力する'))\n
+(('note:Apache Arrowがやっていることはまさにそう'))\n
+(('note:もちろんRubyの人たちとも協力する'))
+= ポリシー2
+(('tag:center'))
+Acting rather than blaming
+(('tag:center'))
+(('note:時間は嘆き・非難より手を動かすことに使う'))
+= ポリシー3
+(('tag:center'))
+Continuous small works than\n
+a temporary big work
+(('tag:center'))
+(('note:一時的にガッとやって終わりより'))\n
+(('note:小さくても継続して活動する'))
+= ポリシー4
+(('tag:center'))
+The current\n
+lack of knowledge\n
+doesn't matter
+(('tag:center'))
+(('note:現時点で数学や統計学などの知識が足りなくても問題ない'))\n
+(('note:既存の実装を使ったりそこから学んだりできるから'))
+= ポリシー5
+(('tag:center'))
+Ignore blames from outsiders
+(('tag:center'))
+(('note:部外者の非難は気にしない'))\n
+(('note:結果がでるまでグチグチ言われるはず :p'))
+= ポリシー6
+(('tag:center'))
+Fun!\n
+Because we use Ruby!
+(('tag:center'))
+(('note:Rubyを使うんだし楽しくやろう！'))
+= Join us!
+  * Rubyでデータ処理したい人！
+  * ポリシーに同意できる人！
+  * URL：
+    * https://github.com/red-data-tools
+    * https://red-data-tools.github.io
+    * https://gitter.im/red-data-tools

data/sample/filter-groonga.rb ADDED

@@ -0,0 +1,15 @@
+#!/usr/bin/env ruby
+# Loads /tmp/pandas.arrow into a Groonga table, filters the records with a
+# full text search and writes the matches back out in the Arrow format.
+require "rroonga"
+# Create the database at /tmp/db.
+Groonga::Database.create(path: "/tmp/db")
+# Load the Arrow data into a newly created "logs" table.
+logs = Groonga::Array.create(name: "logs")
+logs.load_arrow("/tmp/pandas.arrow")
+# Print the attributes of every loaded record.
+logs.each do |record|
+  p record.attributes
+end
+# Filter: full text search for "hello" in column "b".
+filtered_logs = logs.select do |record|
+  record.b =~ "hello"
+end
+# Write columns "a" and "b" of the filtered records to /tmp/filtered.arrow.
+filtered_logs.dump_arrow("/tmp/filtered.arrow",
+                         column_names: ["a", "b"])

data/sample/read-feather.rb ADDED

@@ -0,0 +1,10 @@
+#!/usr/bin/env ruby
+require "arrow"
+Arrow::MemoryMappedInputStream.open("/tmp/dataframe.feather") do |input|
+  reader = Arrow::FeatherFileReader.new(input)
+  reader.columns.each do |column|
+    puts("#{column.name}: #{column.to_a.inspect}")
+  end
+end

data/sample/read-groonga.py ADDED

@@ -0,0 +1,7 @@
+#!/usr/bin/env python
+import pyarrow as pa
+with pa.OSFile("/tmp/filtered.arrow") as source:
+    writer = pa.RecordBatchFileReader(source)
+    print(writer.get_record_batch(0).to_pandas())

data/sample/read-pandas.rb ADDED

@@ -0,0 +1,11 @@
+#!/usr/bin/env ruby
+require "arrow"
+Arrow::MemoryMappedInputStream.open("/tmp/pandas.arrow") do |input|
+  reader = Arrow::FileReader.new(input)
+  reader.each do |record_batch|
+    puts("=" * 40)
+    puts(record_batch)
+  end
+end

data/sample/read-parquet.rb ADDED

@@ -0,0 +1,10 @@
+#!/usr/bin/env ruby
+require "arrow"
+require "parquet"
+reader = Parquet::ArrowFileReader.new("/tmp/pandas.parquet")
+table = reader.read_table
+table.each_column do |column|
+  puts("#{column.name}: #{column.to_a.inspect}")
+end

data/sample/read-tensor.rb ADDED

@@ -0,0 +1,15 @@
+#!/usr/bin/env ruby
+require "arrow"
+require "arrow-numo-narray"
+require "arrow-nmatrix"
+require "arrow-gsl"
+require "pp"
+Arrow::MemoryMappedInputStream.open("/tmp/tensor.arrow") do |input|
+  tensor = input.read_tensor(0)
+  pp tensor.to_narray
+  pp tensor.to_nmatrix
+  pp tensor.to_gsl
+end

data/sample/write-feather.R ADDED

@@ -0,0 +1,5 @@
+library("feather")
+# Build a data frame with two numeric columns and save it as a Feather file.
+a_values <- c(1, 2, 3)
+b_values <- c(1.1, 2.2, 3.3)
+df <- data.frame(a = a_values, b = b_values)
+write_feather(df, "/tmp/dataframe.feather")

data/sample/write-pandas.py ADDED

@@ -0,0 +1,14 @@
+#!/usr/bin/env python
+import pandas as pd
+import pyarrow as pa
+df = pd.DataFrame({"a": [1, 2, 3],
+                   "b": ["hello", "world", "!"]})
+record_batch = pa.RecordBatch.from_pandas(df)
+with pa.OSFile("/tmp/pandas.arrow", "wb") as sink:
+    schema = record_batch.schema
+    writer = pa.RecordBatchFileWriter(sink, schema)
+    writer.write_batch(record_batch)
+    writer.close()

data/sample/write-parquet.py ADDED

@@ -0,0 +1,10 @@
+#!/usr/bin/env python
+import pandas as pd
+import pyarrow as pa
+import pyarrow.parquet as pq
+df = pd.DataFrame({"a": [1, 2, 3],
+                   "b": ["hello", "world", "!"]})
+table = pa.Table.from_pandas(df)
+pq.write_table(table, "/tmp/pandas.parquet")

data/sample/write-tensor.py ADDED

@@ -0,0 +1,10 @@
+#!/usr/bin/env python
+import pyarrow as pa
+import numpy as np
+ndarray = np.random.randn(10, 6)
+print(ndarray)
+tensor = pa.Tensor.from_numpy(ndarray)
+with pa.OSFile("/tmp/tensor.arrow", "wb") as sink:
+    pa.write_tensor(tensor, sink)

metadata ADDED

@@ -0,0 +1,90 @@
+--- !ruby/object:Gem::Specification
+name: rabbit-slide-kou-data-science-rb
+version: !ruby/object:Gem::Version
+  version: 2017.5.19.0
+platform: ruby
+authors:
+- Kouhei Sutou
+autorequire:
+bindir: bin
+cert_chain: []
+date: 2017-05-17 00:00:00.000000000 Z
+dependencies:
+- !ruby/object:Gem::Dependency
+  name: rabbit
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: 2.0.2
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: 2.0.2
+- !ruby/object:Gem::Dependency
+  name: rabbit-theme-clear-code
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
+description: Apache Arrowはデータ分析システム間でのデータ交換を効率化することを目的としたデータフォーマットです。pandasやApache
+  SparkやRなど主要なデータ分析システムはApache Arrowの対応を進めています。近い将来、データ分析システム間でのデータ交換はApache Arrowを使う状態になるでしょう。RubyもApache
+  Arrowに対応することで既存のデータ分析システムとデータ交換できるようになります。これで、Rubyもデータ分析システムの一部でデータ処理をできるようになります！
+email:
+- kou@clear-code.com
+executables: []
+extensions: []
+extra_rdoc_files: []
+files:
+- ".rabbit"
+- README.rd
+- Rakefile
+- config.yaml
+- pdf/data-science-rb-ruby-with-apache-arrow-joins-data-processing-languages.pdf
+- ruby-with-apache-arrow-joins-data-processing-languages.rab
+- sample/filter-groonga.rb
+- sample/read-feather.rb
+- sample/read-groonga.py
+- sample/read-pandas.rb
+- sample/read-parquet.rb
+- sample/read-tensor.rb
+- sample/write-feather.R
+- sample/write-pandas.py
+- sample/write-parquet.py
+- sample/write-tensor.py
+homepage: http://slide.rabbit-shocker.org/authors/kou/data-science-rb/
+licenses:
+- CC-BY-SA-4.0
+metadata: {}
+post_install_message:
+rdoc_options: []
+require_paths:
+- lib
+required_ruby_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      version: '0'
+required_rubygems_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      version: '0'
+requirements: []
+rubyforge_project:
+rubygems_version: 2.5.2
+signing_key:
+specification_version: 4
+summary: RubyもApache Arrowでデータ処理言語の仲間入り
+test_files: []