RubyGems - rabbit-slide-kou-data-science-rb - Versions diffs - 2017.5.19.0 - Mend

rabbit-slide-kou-data-science-rb 2017.5.19.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (18) hide show

checksums.yaml +7 -0
data/.rabbit +1 -0
data/README.rd +38 -0
data/Rakefile +17 -0
data/config.yaml +24 -0
data/pdf/data-science-rb-ruby-with-apache-arrow-joins-data-processing-languages.pdf +0 -0
data/ruby-with-apache-arrow-joins-data-processing-languages.rab +444 -0
data/sample/filter-groonga.rb +15 -0
data/sample/read-feather.rb +10 -0
data/sample/read-groonga.py +7 -0
data/sample/read-pandas.rb +11 -0
data/sample/read-parquet.rb +10 -0
data/sample/read-tensor.rb +15 -0
data/sample/write-feather.R +5 -0
data/sample/write-pandas.py +14 -0
data/sample/write-parquet.py +10 -0
data/sample/write-tensor.py +10 -0
metadata +90 -0

checksums.yaml ADDED

@@ -0,0 +1,7 @@
+---
+SHA1:
+  metadata.gz: 6adc80216160df3aa2fcc4e681adb8adef33c9ff
+  data.tar.gz: 1c8032d96c9c619f40bf1789a937408da47bb597
+SHA512:
+  metadata.gz: 677dc10e63a5bf2384757c4f679b4843fb82577c44fc2b6270df8e21b43f76a80fa8e6ca3e3c1eaf305ff0d69160cf1358701dac8e745e1f24d7dc09f32b9ff6
+  data.tar.gz: 431d971ef33ec8e19e5ffcb730d1fc6404e4cb1f8e09cd1cbea5d8dd0d1db09a3c0c7beef1dec1a6bf82258609bdfe46703b7d2a77f4f4036f41acd0eda63549

data/.rabbit ADDED

@@ -0,0 +1 @@
+ruby-with-apache-arrow-joins-data-processing-languages.rab

data/README.rd ADDED

@@ -0,0 +1,38 @@
+= RubyもApache Arrowでデータ処理言語の仲間入り
+Apache Arrowはデータ分析システム間でのデータ交換を効率化することを目的としたデータフォーマットです。pandasやApache SparkやRなど主要なデータ分析システムはApache Arrowの対応を進めています。近い将来、データ分析システム間でのデータ交換はApache Arrowを使う状態になるでしょう。RubyもApache Arrowに対応することで既存のデータ分析システムとデータ交換できるようになります。これで、Rubyもデータ分析システムの一部でデータ処理をできるようになります！
+== ライセンス
+=== スライド
+CC BY-SA 4.0
+原著作者名は以下の通りです。
+  * 須藤功平（またはKouhei Sutou）
+=== プログラム
+CC0（パブリックドメイン）
+== 作者向け
+=== 表示
+  rake
+=== 公開
+  rake publish
+== 閲覧者向け
+=== インストール
+  gem install rabbit-slide-kou-data-science-rb
+=== 表示
+  rabbit rabbit-slide-kou-data-science-rb.gem

data/Rakefile ADDED

@@ -0,0 +1,17 @@
+require "rabbit/task/slide"
+# Edit ./config.yaml to customize meta data
+spec = nil
+Rabbit::Task::Slide.new do |task|
+  spec = task.spec
+  spec.files += Dir.glob("sample/**/*.*")
+  # spec.files -= Dir.glob("private/**/*.*")
+  spec.add_runtime_dependency("rabbit-theme-clear-code")
+end
+desc "Tag #{spec.version}"
+task :tag do
+  sh("git", "tag", "-a", spec.version.to_s, "-m", "Publish #{spec.version}")
+  sh("git", "push", "--tags")
+end

data/config.yaml ADDED

@@ -0,0 +1,24 @@
+---
+id: data-science-rb
+base_name: ruby-with-apache-arrow-joins-data-processing-languages
+tags:
+- rabbit
+- ruby
+- arrow
+- data
+presentation_date: 2017-05-19
+version: 2017.5.19.0
+licenses:
+- CC-BY-SA-4.0
+slideshare_id: datasciencerb
+speaker_deck_id:
+ustream_id:
+vimeo_id:
+youtube_id:
+author:
+  markup_language: :rd
+  name: Kouhei Sutou
+  email: kou@clear-code.com
+  rubygems_user: kou
+  slideshare_user:
+  speaker_deck_user:

data/pdf/data-science-rb-ruby-with-apache-arrow-joins-data-processing-languages.pdf ADDED

Binary file

data/ruby-with-apache-arrow-joins-data-processing-languages.rab ADDED

@@ -0,0 +1,444 @@
+= RubyもApache Arrowで\nデータ処理言語の\n仲間入り
+: author
+   須藤功平
+: institution
+   クリアコード
+: content-source
+   DataScience.rbワークショップ
+: date
+   2017-05-19
+: allotted-time
+   20m
+: theme
+   clear-code
+= はじめに
+(('tag:center'))
+(('tag:large'))
+私はRubyが好きだ
+(('tag:center'))
+(('tag:large'))
+だからデータ分析だって\n
+Rubyでやりたい
+(('tag:center'))
+(('note:Rubyよりも向いている言語があるのはわかっているけどさー'))
+= Apache Arrow
+(('tag:center'))
+データフォーマットの仕様
+(('tag:center'))
+と
+(('tag:center'))
+その仕様を処理する実装
+= Arrow：解決したい問題
+  * 高いデータ交換コスト
+    * →低くしたい
+  * 重複した最適化実装
+    * →実装を共有したい
+= Arrow：文脈
+ビッグデータの分析
+= ビッグデータの分析
+  * いろんなシステムが連携
+    * Java実装のもろもろとPythonとR
+  * システム間でデータ交換が必要
+    * 交換する度にシリアライズ・パース
+    * (('wait'))↑に結構CPUと時間を使われる…
+    * (('wait'))そんなのより分析処理に使いたい！
+= Arrow：解決方針
+コストゼロの\n
+シリアライズ・\n
+パース
+= Arrow：コストゼロの実現
+  * そのまま使えるフォーマット
+    * 例:int8の配列→int8の値を連続配置
+    * 1バイトずつずらせば高速アクセス可
+  * Arrowのトレードオフ
+    * サイズ圧縮よりシリアライズゼロ
+    * 参考：Parquetはサイズ圧縮優先
+= Arrowがある世界
+  * 各システムがサクサク連携
+    * 例：PySparkが高速化
+    * 理由:Py🡘Javaのデータ交換コスト減
+  * Java・Python・R以外も活躍
+    * 例：Ruby・Go・Rust・Juliaとか
+    * 理由：低コストでデータ交換可能
+= ArrowとRuby
+チャンス！
+= ArrowとRubyとデータ分析
+  * RubyがArrowに対応
+    * Rubyにデータが回ってくる！
+    * →Rubyにもデータ分析の機会が！\n
+      (('note:（今はできることは少ないだろうけど…）'))
+  * (('wait'))次のステップ
+    * できることを増やしていく！
+    * →Rubyでもいろいろデータ分析！
+= ArrowとRubyの今
+  * RubyでArrowを使える！
+    * 私が使えるようにしているから！\n
+      (('note:コミッターにもなった'))
+    * 公式リポジトリーにも入っている\n
+      (('note:厳密に言うと違うんだけど公式サポートだと思ってよい'))
+  * Rubyでデータを読み書きできる
+    * いくらかデータ処理もできる
+= 今できること
+  * Python・Rとのデータ交換
+  * データ処理をいくらか
+  * Rubyの各種ライブラリー間での\n
+    データ交換
+= Arrow：Python
+  # coderay python
+  # pandasでデータ生成→Arrow形式で書き込み
+  import pyarrow as pa
+  df = pd.DataFrame({"a": [1, 2, 3],
+                     "b": ["hello", "world", "!"]})
+  record_batch = pa.RecordBatch.from_pandas(df)
+  with pa.OSFile("/tmp/pandas.arrow", "wb") as sink:
+      schema = record_batch.schema
+      writer = pa.RecordBatchFileWriter(sink, schema)
+      writer.write_batch(record_batch)
+      writer.close()
+= Arrow：Ruby
+  # coderay ruby
+  # RubyでArrow形式のpandasのデータを読み込み
+  require "arrow"
+  Input = Arrow::MemoryMappedInputStream
+  Input.open("/tmp/pandas.arrow") do |input|
+    reader = Arrow::RecordBatchFileReader.new(input)
+    reader.each do |record_batch|
+      puts("=" * 40)
+      puts(record_batch)
+    end
+  end
+= Feather：R
+  # coderay R
+  # Rでデータ生成→Feather形式で書き込み
+  library("feather")
+  df = data.frame(a=c(1, 2, 3),
+                  b=c(1.1, 2.2, 3.3))
+  write_feather(df, "/tmp/dataframe.feather")
+= Feather：Ruby
+  # coderay ruby
+  # RubyでFeather形式のRのデータを読み込み
+  require "arrow"
+  Input = Arrow::MemoryMappedInputStream
+  Input.open("/tmp/dataframe.feather") do |input|
+    reader = Arrow::FeatherFileReader.new(input)
+    reader.columns.each do |column|
+      puts("#{column.name}: #{column.to_a.inspect}")
+    end
+  end
+= Parquet：Python
+  # coderay python
+  # Pythonでデータ生成→Parquet形式で書き込み
+  import pandas as pd
+  import pyarrow as pa
+  import pyarrow.parquet as pq
+  df = pd.DataFrame({"a": [1, 2, 3],
+                     "b": ["hello", "world", "!"]})
+  table = pa.Table.from_pandas(df)
+  pq.write_table(table, "/tmp/pandas.parquet")
+= Parquet：Ruby
+  # coderay ruby
+  # RubyでParquet形式のデータを読み込み
+  require "arrow"
+  require "parquet"
+  path = "/tmp/pandas.parquet"
+  reader = Parquet::ArrowFileReader.new(path)
+  table = reader.read_table
+  table.each_column do |column|
+    puts("#{column.name}: #{column.to_a.inspect}")
+  end
+= 対応データ形式まとめ
+  * Arrow形式
+    * 各種言語(('note:（これから広く使われているはず）'))
+  * Feather形式
+    * Python・R専用
+  * Parquet形式
+    * 各種言語(('note:（Hadoop界隈ですでに広く使われている）'))
+= データ処理例
+  * Groongaでフィルター
+  * Groonga
+    * 全文検索エンジン
+    * カラムストアなので集計処理も得意
+    * Apache Arrow対応
+    * よくできたRubyバインディングあり
+= Groonga：Ruby
+  # coderay ruby
+  # 空のテーブルにArrow形式のデータを読み込む
+  logs = Groonga::Array.create(name: "logs")
+  logs.load_arrow("/tmp/pandas.arrow")
+  logs.each {|record| p record.attributes}
+  # フィルター
+  filtered_logs = logs.select do |record|
+    record.b =~ "hello" # "hello"で全文検索
+  end
+  # フィルター結果をArrow形式で書き込み
+  filtered_logs.dump_arrow("/tmp/filtered.arrow",
+                           column_names: ["a", "b"])
+= Groonga：Python
+  # coderay python
+  # Arrow形式のGroongaでのフィルター結果を読み込む
+  import pyarrow as pa
+  with pa.OSFile("/tmp/filtered.arrow") as source:
+      writer = pa.RecordBatchFileReader(source)
+      print(writer.get_record_batch(0).to_pandas())
+= Rubyでデータ処理（現状）
+  * 既存のCライブラリーを活用
+    * 速度がでるし機能もある
+  * CライブラリーをArrowに対応
+    * Arrow→Ruby→Cライブラリー\n
+      ↑から↓で高速化(('note:（オブジェクト生成は遅い）'))
+    * Arrow→Cライブラリー
+= Rubyでデータ処理（案）
+  * Fluentdとか速くなりそう
+    * 途中でメッセージを参照しないなら
+  * MessagePackからArrowに変える
+    * Arrowのまま出力先へ送る
+    * 途中でRubyオブジェクトができない\n
+      (('note:シリアライズ・パースがなくなって速い！'))
+= 多次元配列
+  * Arrowではオプション機能
+    * テンソルと呼んでいる\n
+      (('note:（traditional multidimensional array objectと説明）'))
+  * C++実装ではサポート
+    * バインディングでは使える
+    * Python・Ruby…では使える
+= Tensor：Python
+  # coderay python
+  # NumPyでデータ生成→書き込み
+  import pyarrow as pa
+  import numpy as np
+  ndarray = np.random.randn(10, 6) # 10x6
+  print(ndarray)
+  tensor = pa.Tensor.from_numpy(ndarray)
+  with pa.OSFile("/tmp/tensor.arrow", "wb") as sink:
+      pa.write_tensor(tensor, sink)
+= Tensor：Ruby
+  # coderay ruby
+  # Rubyで読み込み
+  require "arrow"
+  Input = Arrow::MemoryMappedInputStream
+  Input.open("/tmp/tensor.arrow") do |input|
+    tensor = input.read_tensor(0)
+    p tensor.shape # => [10, 6]
+  end
+= Ruby：GSL
+  # coderay ruby
+  # GSLオブジェクトに変換
+  require "arrow"
+  require "arrow-gsl"
+  require "pp"
+  Input = Arrow::MemoryMappedInputStream
+  Input.open("/tmp/tensor.arrow") do |input|
+    tensor = input.read_tensor(0)
+    pp tensor.to_gsl
+    # tensor.to_gsl.to_arrow == tensor
+  end
+= Ruby：NMatrix
+  # coderay ruby
+  # NMatrixオブジェクトに変換
+  require "arrow"
+  require "arrow-nmatrix"
+  require "pp"
+  Input = Arrow::MemoryMappedInputStream
+  Input.open("/tmp/tensor.arrow") do |input|
+    tensor = input.read_tensor(0)
+    pp tensor.to_nmatrix
+    # tensor.to_nmatrix.to_arrow == tensor
+  end
+= Ruby：Numo::NArray
+  # coderay ruby
+  # Numo::NArrayオブジェクトに変換
+  require "arrow"
+  require "arrow-numo-narray"
+  require "pp"
+  Input = Arrow::MemoryMappedInputStream
+  Input.open("/tmp/tensor.arrow") do |input|
+    tensor = input.read_tensor(0)
+    pp tensor.to_narray
+    # tensor.to_narray.to_arrow == tensor
+  end
+= ここまでのまとめ1
+  * Arrowが実現したい世界
+    * データ交換コストが低い世界
+    * 最適化実装を共有している世界
+= ここまでのまとめ2
+  * RubyとArrowの今
+    * ArrowはRubyを公式サポート！
+    * Rubyの外の世界とデータ交換可能\n
+      (('note:（Arrow・Feather・Parquetをサポート）'))
+    * Rubyの各種ライブラリーとの\n
+      相互変換が可能\n
+      (('note:（メモリーコピーぐらいのコストで）'))
+= ArrowとRubyとこれから
+  * Arrow
+    * データフレーム処理の最適化実装
+    * マルチコア・GPU対応
+  * Ruby
+    * Red Data Toolsプロジェクト
+= Red Data Tools
+  * Rubyでデータ処理したいなぁ！\n
+    の実現を目指すプロジェクト
+  * URL：
+    * https://github.com/red-data-tools
+    * https://red-data-tools.github.io
+    * https://gitter.im/red-data-tools
+= 既存プロダクト
+  * Red Arrow(('note:（ArrowのRubyバインディング）'))
+    * Red Arrow XXX(('note:（ArrowとXXXの相互変換）'))
+  * Parquet GLib(('note:（ParquetのGLibバインディング）'))
+  * Red Parquet(('note:（ParquetのRubyバインディング）'))
+  * Jekyll Jupyter Notebook plugin(('note:（JekyllでJupyter Notebookを表示）'))
+= ポリシー1
+(('tag:center'))
+Collaborate\n
+over Ruby communities
+(('tag:center'))
+(('note:Ruby以外の人たちとも言語を超えて協力する'))\n
+(('note:Apache Arrowがやっていることはまさにそう'))\n
+(('note:もちろんRubyの人たちとも協力する'))
+= ポリシー2
+(('tag:center'))
+Acting rather than blaming
+(('tag:center'))
+(('note:時間は嘆き・非難より手を動かすことに使う'))
+= ポリシー3
+(('tag:center'))
+Continuous small works rather than\n
+a temporary big work
+(('tag:center'))
+(('note:一時的にガッとやって終わりより'))\n
+(('note:小さくても継続して活動する'))
+= ポリシー4
+(('tag:center'))
+The current\n
+lack of knowledge\n
+doesn't matter
+(('tag:center'))
+(('note:現時点で数学や統計学などの知識が足りなくても問題ない'))\n
+(('note:既存の実装を使ったりそこから学んだりできるから'))
+= ポリシー5
+(('tag:center'))
+Ignore blames from outsiders
+(('tag:center'))
+(('note:部外者の非難は気にしない'))\n
+(('note:結果がでるまでグチグチ言われるはず :p'))
+= ポリシー6
+(('tag:center'))
+Fun!\n
+Because we use Ruby!
+(('tag:center'))
+(('note:Rubyを使うんだし楽しくやろう！'))
+= Join us!
+  * Rubyでデータ処理したい人！
+  * ポリシーに同意できる人！
+  * URL：
+    * https://github.com/red-data-tools
+    * https://red-data-tools.github.io
+    * https://gitter.im/red-data-tools

data/sample/filter-groonga.rb ADDED

@@ -0,0 +1,15 @@
+#!/usr/bin/env ruby
+require "rroonga"
+Groonga::Database.create(path: "/tmp/db")
+logs = Groonga::Array.create(name: "logs")
+logs.load_arrow("/tmp/pandas.arrow")
+logs.each do |record|
+  p record.attributes
+end
+filtered_logs = logs.select do |record|
+  record.b =~ "hello"
+end
+filtered_logs.dump_arrow("/tmp/filtered.arrow",
+                         column_names: ["a", "b"])

data/sample/read-feather.rb ADDED

@@ -0,0 +1,10 @@
+#!/usr/bin/env ruby
+require "arrow"
+Arrow::MemoryMappedInputStream.open("/tmp/dataframe.feather") do |input|
+  reader = Arrow::FeatherFileReader.new(input)
+  reader.columns.each do |column|
+    puts("#{column.name}: #{column.to_a.inspect}")
+  end
+end

data/sample/read-groonga.py ADDED

@@ -0,0 +1,7 @@
+#!/usr/bin/env python
+import pyarrow as pa
+with pa.OSFile("/tmp/filtered.arrow") as source:
+    writer = pa.RecordBatchFileReader(source)
+    print(writer.get_record_batch(0).to_pandas())

data/sample/read-pandas.rb ADDED

@@ -0,0 +1,11 @@
+#!/usr/bin/env ruby
+require "arrow"
+Arrow::MemoryMappedInputStream.open("/tmp/pandas.arrow") do |input|
+  reader = Arrow::FileReader.new(input)
+  reader.each do |record_batch|
+    puts("=" * 40)
+    puts(record_batch)
+  end
+end

data/sample/read-parquet.rb ADDED

@@ -0,0 +1,10 @@
+#!/usr/bin/env ruby
+require "arrow"
+require "parquet"
+reader = Parquet::ArrowFileReader.new("/tmp/pandas.parquet")
+table = reader.read_table
+table.each_column do |column|
+  puts("#{column.name}: #{column.to_a.inspect}")
+end

data/sample/read-tensor.rb ADDED

@@ -0,0 +1,15 @@
+#!/usr/bin/env ruby
+require "arrow"
+require "arrow-numo-narray"
+require "arrow-nmatrix"
+require "arrow-gsl"
+require "pp"
+Arrow::MemoryMappedInputStream.open("/tmp/tensor.arrow") do |input|
+  tensor = input.read_tensor(0)
+  pp tensor.to_narray
+  pp tensor.to_nmatrix
+  pp tensor.to_gsl
+end

data/sample/write-feather.R ADDED

@@ -0,0 +1,5 @@
+library("feather")
+df = data.frame(a=c(1, 2, 3),
+                b=c(1.1, 2.2, 3.3))
+write_feather(df, "/tmp/dataframe.feather")

data/sample/write-pandas.py ADDED

@@ -0,0 +1,14 @@
+#!/usr/bin/env python
+import pandas as pd
+import pyarrow as pa
+df = pd.DataFrame({"a": [1, 2, 3],
+                   "b": ["hello", "world", "!"]})
+record_batch = pa.RecordBatch.from_pandas(df)
+with pa.OSFile("/tmp/pandas.arrow", "wb") as sink:
+    schema = record_batch.schema
+    writer = pa.RecordBatchFileWriter(sink, schema)
+    writer.write_batch(record_batch)
+    writer.close()

data/sample/write-parquet.py ADDED

@@ -0,0 +1,10 @@
+#!/usr/bin/env python
+import pandas as pd
+import pyarrow as pa
+import pyarrow.parquet as pq
+df = pd.DataFrame({"a": [1, 2, 3],
+                   "b": ["hello", "world", "!"]})
+table = pa.Table.from_pandas(df)
+pq.write_table(table, "/tmp/pandas.parquet")

data/sample/write-tensor.py ADDED

@@ -0,0 +1,10 @@
+#!/usr/bin/env python
+import pyarrow as pa
+import numpy as np
+ndarray = np.random.randn(10, 6)
+print(ndarray)
+tensor = pa.Tensor.from_numpy(ndarray)
+with pa.OSFile("/tmp/tensor.arrow", "wb") as sink:
+    pa.write_tensor(tensor, sink)

metadata ADDED

@@ -0,0 +1,90 @@
+--- !ruby/object:Gem::Specification
+name: rabbit-slide-kou-data-science-rb
+version: !ruby/object:Gem::Version
+  version: 2017.5.19.0
+platform: ruby
+authors:
+- Kouhei Sutou
+autorequire:
+bindir: bin
+cert_chain: []
+date: 2017-05-17 00:00:00.000000000 Z
+dependencies:
+- !ruby/object:Gem::Dependency
+  name: rabbit
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: 2.0.2
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: 2.0.2
+- !ruby/object:Gem::Dependency
+  name: rabbit-theme-clear-code
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
+description: Apache Arrowはデータ分析システム間でのデータ交換を効率化することを目的としたデータフォーマットです。pandasやApache
+  SparkやRなど主要なデータ分析システムはApache Arrowの対応を進めています。近い将来、データ分析システム間でのデータ交換はApache Arrowを使う状態になるでしょう。RubyもApache
+  Arrowに対応することで既存のデータ分析システムとデータ交換できるようになります。これで、Rubyもデータ分析システムの一部でデータ処理をできるようになります！
+email:
+- kou@clear-code.com
+executables: []
+extensions: []
+extra_rdoc_files: []
+files:
+- ".rabbit"
+- README.rd
+- Rakefile
+- config.yaml
+- pdf/data-science-rb-ruby-with-apache-arrow-joins-data-processing-languages.pdf
+- ruby-with-apache-arrow-joins-data-processing-languages.rab
+- sample/filter-groonga.rb
+- sample/read-feather.rb
+- sample/read-groonga.py
+- sample/read-pandas.rb
+- sample/read-parquet.rb
+- sample/read-tensor.rb
+- sample/write-feather.R
+- sample/write-pandas.py
+- sample/write-parquet.py
+- sample/write-tensor.py
+homepage: http://slide.rabbit-shocker.org/authors/kou/data-science-rb/
+licenses:
+- CC-BY-SA-4.0
+metadata: {}
+post_install_message:
+rdoc_options: []
+require_paths:
+- lib
+required_ruby_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      version: '0'
+required_rubygems_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      version: '0'
+requirements: []
+rubyforge_project:
+rubygems_version: 2.5.2
+signing_key:
+specification_version: 4
+summary: RubyもApache Arrowでデータ処理言語の仲間入り
+test_files: []