red_amber 0.4.2 → 0.5.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.rubocop.yml +8 -2
- data/CHANGELOG.md +57 -0
- data/Gemfile +5 -6
- data/README.ja.md +252 -0
- data/README.md +19 -14
- data/benchmark/group.yml +12 -5
- data/docker/Gemfile +8 -3
- data/docker/Gemfile.lock +54 -16
- data/docker/example +29 -17
- data/lib/red_amber/data_frame.rb +78 -4
- data/lib/red_amber/data_frame_combinable.rb +147 -119
- data/lib/red_amber/data_frame_displayable.rb +6 -5
- data/lib/red_amber/data_frame_selectable.rb +49 -0
- data/lib/red_amber/group.rb +191 -90
- data/lib/red_amber/helper.rb +26 -0
- data/lib/red_amber/subframes.rb +12 -5
- data/lib/red_amber/vector.rb +43 -24
- data/lib/red_amber/vector_aggregation.rb +26 -0
- data/lib/red_amber/vector_selectable.rb +74 -23
- data/lib/red_amber/vector_string_function.rb +211 -0
- data/lib/red_amber/vector_unary_element_wise.rb +4 -0
- data/lib/red_amber/vector_updatable.rb +28 -0
- data/lib/red_amber/version.rb +1 -1
- data/lib/red_amber.rb +2 -1
- data/red_amber.gemspec +1 -1
- metadata +7 -5
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: b8fc1df498792b2b30d63a47a783cda67ccb8cea09e933aa8cba5d317277f500
|
4
|
+
data.tar.gz: 83e54f0fb6070a6b3c4301d0cd3e5356f1ca4e09bdae200f4fc7694a2e3e7daa
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 440dd984e88afd4bee7860a0f5b03c54094b8536de6acf70b770d3f473c1ee93608a5565d62b5bd60d5e8e3ba8c09675e136373a15daef77bfc455c7c5a4a7cc
|
7
|
+
data.tar.gz: '09f27ff2a0c3804b345c4b5c581013135544fed098265e9bca274e34de52791b9a36525df22b17fe80093f14249c195496366e16079c25e70cf660070dc66858'
|
data/.rubocop.yml
CHANGED
@@ -76,7 +76,8 @@ Metrics/AbcSize:
|
|
76
76
|
Max: 30
|
77
77
|
CountRepeatedAttributes: false
|
78
78
|
AllowedMethods: [
|
79
|
-
'
|
79
|
+
'join_merge_keys', # 54.18
|
80
|
+
'join', # 53.1
|
80
81
|
'dataframe_info', # 46.5
|
81
82
|
'format_table', # 84.62
|
82
83
|
'to_long', # 33.66
|
@@ -87,6 +88,9 @@ Metrics/AbcSize:
|
|
87
88
|
'[]', # 33.76
|
88
89
|
'split', # 37.35
|
89
90
|
'aggregate', # 38.13
|
91
|
+
'filters', # 33.91
|
92
|
+
'merge_keys', # 32.17
|
93
|
+
'rename_keys', # 31.64
|
90
94
|
]
|
91
95
|
|
92
96
|
# Max: 25
|
@@ -139,10 +143,12 @@ Metrics/MethodLength:
|
|
139
143
|
Max: 30
|
140
144
|
AllowedMethods: [
|
141
145
|
'join', # 47
|
142
|
-
'
|
146
|
+
'join_merge_keys', # 41
|
143
147
|
'format_table', # 53
|
144
148
|
'slice_by', # 38
|
145
149
|
'assign_update', # 35
|
150
|
+
'summarize', # 35
|
151
|
+
'dataframe_info', # 33
|
146
152
|
'drop', # 32
|
147
153
|
'aggregate', # 31
|
148
154
|
]
|
data/CHANGELOG.md
CHANGED
@@ -1,3 +1,60 @@
|
|
1
|
+
## [0.5.0] - 2023-05-24
|
2
|
+
|
3
|
+
- Breaking change
|
4
|
+
- Use non keyword argument in #sub_by_value (#219)
|
5
|
+
- Upgrade dependency to Arrow 12.0.0 (#238)
|
6
|
+
- right_join will output columns in the same order as Red Arrow.
|
7
|
+
- DataFrame#join will not force ordering of original column by default
|
8
|
+
- Join with type, such as full_join, sort after join by default
|
9
|
+
|
10
|
+
- Bug fixes
|
11
|
+
- Use truncate in Vector#sample(float) (#229)
|
12
|
+
- Support options in DataFrame#tdra (#231)
|
13
|
+
- Fix printing table with non-ascii strings (#233)
|
14
|
+
- Fix join for Arrow 12.0.0
|
15
|
+
|
16
|
+
- New features and improvements
|
17
|
+
- Add a singleton method Vector.[] (#218)
|
18
|
+
- Add an alias #sub_group (#219)
|
19
|
+
- Accept Group#summarize{Hash} to rename aggregated columns (#219)
|
20
|
+
- Add Group#group_frame (#219)
|
21
|
+
- Add Vector#cast (#224)
|
22
|
+
- Add Vector#fill_nil(value) (#226)
|
23
|
+
- Add Vector#one (#227)
|
24
|
+
- Add Vector#mode (#228)
|
25
|
+
- Add DataFrame#propagate (#235)
|
26
|
+
- Add DataFrame#sample (#237)
|
27
|
+
- Add DataFrame#shuffle (#237)
|
28
|
+
- Support RankOptions in Vector#rank (#239)
|
29
|
+
- Introduce MatchSubstringOptions family in Vector (#241)
|
30
|
+
- Introduce Vector#match_substring?
|
31
|
+
- Add Vector#end_with?, #start_with? method
|
32
|
+
- Add Vector#match_like?
|
33
|
+
- Add Vector#count_substring method
|
34
|
+
|
35
|
+
- Refactoring
|
36
|
+
- Refine Group and SubFrames function (#219)
|
37
|
+
- Refine Group#group_count
|
38
|
+
- Use Acero in Group#filters
|
39
|
+
- Refine Group#filters, not using Acero
|
40
|
+
- Refine Group#summarize(array)
|
41
|
+
- Use Acero for renaming columns in join (#238)
|
42
|
+
- Use index kernel with IndexOptions introduced in 12.0.0 (#240)
|
43
|
+
|
44
|
+
- Improve in tests/CI
|
45
|
+
- Use Fedora 39 Rawhide in CI (#238)
|
46
|
+
|
47
|
+
- Documentation and Example
|
48
|
+
- Add missing yard documents for SubFrames::Selectors (#219)
|
49
|
+
- Update docker/example (#219)
|
50
|
+
- Update Gemfile in docker (#219)
|
51
|
+
- Add README.ja.md (#242)
|
52
|
+
|
53
|
+
- GitHub site
|
54
|
+
- Update link of Red Data Tools Chat to matrix (#242)
|
55
|
+
|
56
|
+
- Thanks
|
57
|
+
|
1
58
|
## [0.4.2] - 2023-04-02
|
2
59
|
|
3
60
|
- Breaking change
|
data/Gemfile
CHANGED
@@ -7,7 +7,7 @@ gemspec
|
|
7
7
|
group :test do
|
8
8
|
gem 'rake'
|
9
9
|
|
10
|
-
gem 'red-parquet', '~>
|
10
|
+
gem 'red-parquet', '~> 12.0.0'
|
11
11
|
gem 'rover-df', '~> 0.3.0'
|
12
12
|
|
13
13
|
gem 'rubocop'
|
@@ -15,14 +15,13 @@ group :test do
|
|
15
15
|
gem 'rubocop-rake'
|
16
16
|
gem 'rubocop-rubycw', require: false
|
17
17
|
|
18
|
-
gem 'iruby'
|
19
|
-
gem 'test-unit'
|
20
|
-
gem 'webrick'
|
21
|
-
gem 'yard'
|
22
|
-
|
23
18
|
gem 'benchmark_driver'
|
19
|
+
gem 'iruby'
|
24
20
|
gem 'red-arrow-numo-narray'
|
25
21
|
gem 'red-datasets-arrow'
|
26
22
|
gem 'simplecov'
|
27
23
|
gem 'simplecov-json'
|
24
|
+
gem 'test-unit'
|
25
|
+
gem 'webrick'
|
26
|
+
gem 'yard'
|
28
27
|
end
|
data/README.ja.md
ADDED
@@ -0,0 +1,252 @@
|
|
1
|
+
# RedAmber
|
2
|
+
|
3
|
+
[![Gem Version](https://img.shields.io/gem/v/red_amber?color=brightgreen)](https://rubygems.org/gems/red_amber)
|
4
|
+
[![CI](https://github.com/heronshoes/red_amber/actions/workflows/ci.yml/badge.svg)](https://github.com/red-data-tools/red_amber/actions/workflows/ci.yml)
|
5
|
+
[![Maintainability](https://api.codeclimate.com/v1/badges/b8a745047045d2f49daa/maintainability)](https://codeclimate.com/github/heronshoes/red_amber/maintainability)
|
6
|
+
[![Test coverage](https://api.codeclimate.com/v1/badges/b8a745047045d2f49daa/test_coverage)](https://codeclimate.com/github/heronshoes/red_amber/test_coverage)
|
7
|
+
[![Doc](https://img.shields.io/badge/docs-latest-blue)](https://heronshoes.github.io/red_amber/)
|
8
|
+
[![Discussions](https://img.shields.io/github/discussions/heronshoes/red_amber)](https://github.com/red-data-tools/red_amber/discussions)
|
9
|
+
|
10
|
+
Rubyistのためのデータフレームライブラリ.
|
11
|
+
|
12
|
+
- Powered by [Red Arrow](https://github.com/apache/arrow/tree/master/ruby/red-arrow)
|
13
|
+
[![Red Data Tools Chat (ja)](https://badges.gitter.im/red-data-tools/en.svg)](https://app.element.io/#/room/#red-data-tools_ja:gitter.im) [![Gem Version](https://img.shields.io/gem/v/red-arrow?color=brightgreen)](https://rubygems.org/gems/red-arrow)
|
14
|
+
- Inspired by the dataframe library [Rover-df](https://github.com/ankane/rover)
|
15
|
+
|
16
|
+
[README in English](README.md)
|
17
|
+
|
18
|
+
![screenshot from jupyterlab](https://raw.githubusercontent.com/red-data-tools/red_amber/main/doc/image/screenshot.png)
|
19
|
+
|
20
|
+
## 必要な環境
|
21
|
+
### Ruby
|
22
|
+
- Ruby 3.0 以上.
|
23
|
+
|
24
|
+
### ライブラリ
|
25
|
+
```ruby
|
26
|
+
gem 'red-arrow', '~> 12.0.0' # お使いの環境に合わせた Apache Arrow が必要です(下記のインストールを参照してください)
|
27
|
+
gem 'red-parquet', '~> 12.0.0' # 必要に応じて。Parquetの入出力が必要な場合。
|
28
|
+
gem 'red-datasets-arrow' # 必要に応じて。Red Datasets またはランダムサンプリングが必要な場合。
|
29
|
+
gem 'red-arrow-activerecord' # 必要に応じて。Active Record とのデータ交換が必要な場合。
|
30
|
+
gem 'rover-df', '~> 0.3.0' # 必要に応じて。Rover::DataFrameに対する入出力が必要な場合。
|
31
|
+
```
|
32
|
+
|
33
|
+
## インストール
|
34
|
+
|
35
|
+
RedAmberをインストールする前に、下記のライブラリのインストールが必要です。
|
36
|
+
|
37
|
+
- Apache Arrow (~> 12.0.0)
|
38
|
+
- Apache Arrow GLib (~> 12.0.0)
|
39
|
+
- Apache Parquet GLib (~> 12.0.0) # Parquetの入出力が必要な場合。
|
40
|
+
|
41
|
+
環境ごとの詳しいインストール方法は、 [Apache Arrow install document](https://arrow.apache.org/install/) を参照してください。
|
42
|
+
|
43
|
+
- Ubuntuの場合の最低限必要なインストール例:
|
44
|
+
|
45
|
+
```
|
46
|
+
sudo apt update
|
47
|
+
sudo apt install -y -V ca-certificates lsb-release wget
|
48
|
+
wget https://apache.jfrog.io/artifactory/arrow/$(lsb_release --id --short | tr 'A-Z' 'a-z')/apache-arrow-apt-source-latest-$(lsb_release --codename --short).deb
|
49
|
+
sudo apt install -y -V ./apache-arrow-apt-source-latest-$(lsb_release --codename --short).deb
|
50
|
+
sudo apt update
|
51
|
+
sudo apt install -y -V libarrow-dev
|
52
|
+
sudo apt install -y -V libarrow-glib-dev
|
53
|
+
```
|
54
|
+
|
55
|
+
- Fedora 39 (Rawhide)の場合:
|
56
|
+
|
57
|
+
```
|
58
|
+
sudo dnf update
|
59
|
+
sudo dnf -y install gcc-c++ libarrow-devel libarrow-glib-devel ruby-devel
|
60
|
+
```
|
61
|
+
|
62
|
+
- macOS の場合は、Homebrewを使用する:
|
63
|
+
|
64
|
+
```
|
65
|
+
brew install apache-arrow
|
66
|
+
brew install apache-arrow-glib
|
67
|
+
```
|
68
|
+
|
69
|
+
Apache Arrowがインストールできたら、下記の行をGemfileに追加してください:
|
70
|
+
|
71
|
+
```ruby
|
72
|
+
gem 'red-arrow', '~> 12.0.0' # お使いの環境に合わせた Apache Arrow が必要です(下記のインストールを参照してください)
|
73
|
+
gem 'red_amber'
|
74
|
+
gem 'red-parquet', '~> 12.0.0' # 必要に応じて。Parquetの入出力が必要な場合。
|
75
|
+
gem 'red-datasets-arrow' # 必要に応じて。Red Datasets またはランダムサンプリングが必要な場合。
|
76
|
+
gem 'red-arrow-numo-narray' # 必要に応じて。Numo::NArrayとの連携が必要な場合
|
77
|
+
gem 'red-arrow-activerecord' # 必要に応じて。Active Record とのデータ交換が必要な場合。
|
78
|
+
gem 'rover-df', '~> 0.3.0' # 必要に応じて。Rover::DataFrameに対する入出力が必要な場合。
|
79
|
+
```
|
80
|
+
|
81
|
+
`bundle install`とするか、または `gem install red_amber`としてインストールしてください。
|
82
|
+
|
83
|
+
## Docker イメージと Jupyter Notebook
|
84
|
+
|
85
|
+
このリポジトリの`docker` フォルダーから Docker コンテナ環境を生成できます。リポジトリをクローンしてから、dockerフォルダーにある [readme](docker/readme.md) を参照してください。その環境では `docker/notebook` フォルダーにある Jupyter Notebookイメージを試用できます。
|
86
|
+
|
87
|
+
このREADMEの内容をネットワーク上のJupyter Notebookでインタラクティブに試用することも出来ます。 [Binder](https://mybinder.org/v2/gh/heronshoes/docker-stacks/RedAmber-binder?filepath=red-amber.ipynb).
|
88
|
+
[![Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/heronshoes/docker-stacks/RedAmber-binder?filepath=red-amber.ipynb)
|
89
|
+
|
90
|
+
Jupyter Notebookの環境を含めた他の多くのデータ処理用のライブラリーとともにRedAmberもパッケージングされたDocker Imageとして、[RubyData Docker Stacks](https://github.com/RubyData/docker-stacks) が利用できます(Thanks to Kenta Murata).
|
91
|
+
|
92
|
+
## 他のデータフレームライブラリとの比較表
|
93
|
+
|
94
|
+
RedAmberの基本的な機能をPython
|
95
|
+
[pandas](https://pandas.pydata.org/) や
|
96
|
+
R [Tidyverse](https://www.tidyverse.org/) や
|
97
|
+
Julia [Dataframes](https://dataframes.juliadata.org/stable/) と比較した表は [DataFrame_Comparison.md](doc/DataFrame_Comparison.md) にあります(Thanks to Benson Muite).
|
98
|
+
|
99
|
+
## `RedAmber`のデータフレーム
|
100
|
+
|
101
|
+
クラス `RedAmber::DataFrame` は2次元のデータの集まりを表現します。
|
102
|
+
その実体は Red Arrowの Tableオブジェクトです。
|
103
|
+
|
104
|
+
![dataframe model of RedAmber](https://raw.githubusercontent.com/red-data-tools/red_amber/main/doc/image/dataframe_model.png)
|
105
|
+
|
106
|
+
それではライブラリをロードしていくつかの例を試してみましょう。
|
107
|
+
|
108
|
+
```ruby
|
109
|
+
require 'red_amber' # require 'red-amber' でもOKです.
|
110
|
+
include RedAmber
|
111
|
+
```
|
112
|
+
|
113
|
+
### 例: diamonds データセット
|
114
|
+
|
115
|
+
もしまだであれば、Red DatasetsのArrow拡張を`
|
116
|
+
gem install red-datasets-arrow
|
117
|
+
`
|
118
|
+
としてインストールしてから次を実行してください。
|
119
|
+
|
120
|
+
```ruby
|
121
|
+
require 'datasets-arrow' # サンプルデータのロードのため
|
122
|
+
|
123
|
+
dataset = Datasets::Diamonds.new
|
124
|
+
diamonds = DataFrame.new(dataset) # v0.2.3以前では, `dataset.to_arrow`とする必要があります。
|
125
|
+
|
126
|
+
# =>
|
127
|
+
#<RedAmber::DataFrame : 53940 x 10 Vectors, 0x000000000000f668>
|
128
|
+
carat cut color clarity depth table price x ... z
|
129
|
+
<double> <string> <string> <string> <double> <double> <uint16> <double> ... <double>
|
130
|
+
0 0.23 Ideal E SI2 61.5 55.0 326 3.95 ... 2.43
|
131
|
+
1 0.21 Premium E SI1 59.8 61.0 326 3.89 ... 2.31
|
132
|
+
2 0.23 Good E VS1 56.9 65.0 327 4.05 ... 2.31
|
133
|
+
3 0.29 Premium I VS2 62.4 58.0 334 4.2 ... 2.63
|
134
|
+
4 0.31 Good J SI2 63.3 58.0 335 4.34 ... 2.75
|
135
|
+
: : : : : : : : : ... :
|
136
|
+
53937 0.7 Very Good D SI1 62.8 60.0 2757 5.66 ... 3.56
|
137
|
+
53938 0.86 Premium H SI2 61.0 58.0 2757 6.15 ... 3.74
|
138
|
+
53939 0.75 Ideal D SI2 62.2 55.0 2757 5.83 ... 3.64
|
139
|
+
```
|
140
|
+
|
141
|
+
例えば、1カラットより大きいレコードに対し、cut毎の平均のpriceを求めるには次のようにします。
|
142
|
+
|
143
|
+
```ruby
|
144
|
+
df = diamonds
|
145
|
+
.slice { carat > 1 } # #sliceの代わりに#filterでも可
|
146
|
+
.group(:cut)
|
147
|
+
.mean(:price) # ここで:priceを指定する場合はgroupの前のpickは不要
|
148
|
+
.sort('-mean(price)')
|
149
|
+
|
150
|
+
# =>
|
151
|
+
#<RedAmber::DataFrame : 5 x 2 Vectors, 0x000000000000f67c>
|
152
|
+
cut mean(price)
|
153
|
+
<string> <double>
|
154
|
+
0 Ideal 8674.23
|
155
|
+
1 Premium 8487.25
|
156
|
+
2 Very Good 8340.55
|
157
|
+
3 Good 7753.6
|
158
|
+
4 Fair 7177.86
|
159
|
+
```
|
160
|
+
|
161
|
+
Arrowのデータはイミュータブルなので、これらのメソッドは新しいオブジェクトを返します。
|
162
|
+
|
163
|
+
次の例は、列をリネームしてから新しい列に簡単な計算の結果を格納します。
|
164
|
+
|
165
|
+
```ruby
|
166
|
+
usdjpy = 110.0 # 今よりずっと円高の頃
|
167
|
+
|
168
|
+
df.rename('mean(price)': :mean_price_USD)
|
169
|
+
.assign(:mean_price_JPY) { mean_price_USD * usdjpy }
|
170
|
+
|
171
|
+
# =>
|
172
|
+
#<RedAmber::DataFrame : 5 x 3 Vectors, 0x000000000000f71c>
|
173
|
+
cut mean_price_USD mean_price_JPY
|
174
|
+
<string> <double> <double>
|
175
|
+
0 Ideal 8674.23 954164.93
|
176
|
+
1 Premium 8487.25 933597.34
|
177
|
+
2 Very Good 8340.55 917460.37
|
178
|
+
3 Good 7753.6 852896.11
|
179
|
+
4 Fair 7177.86 789564.12
|
180
|
+
```
|
181
|
+
|
182
|
+
### 例: starwars データセット
|
183
|
+
|
184
|
+
次の例は、CSVファイルをダウンロードして`starwars` データセットを読み込みます。その後簡単なデータのクリーニングを行います。
|
185
|
+
|
186
|
+
```ruby
|
187
|
+
uri = URI('https://vincentarelbundock.github.io/Rdatasets/csv/dplyr/starwars.csv')
|
188
|
+
|
189
|
+
starwars = DataFrame.load(uri)
|
190
|
+
|
191
|
+
starwars
|
192
|
+
.drop(0) # 不要な列を取り除く
|
193
|
+
.remove { species == "NA" } # 不要な行を取り除く
|
194
|
+
.group(:species) { [count(:species), mean(:height, :mass)] }
|
195
|
+
.slice { count > 1 } # #filterでも可
|
196
|
+
|
197
|
+
# =>
|
198
|
+
#<RedAmber::DataFrame : 8 x 4 Vectors, 0x000000000000f848>
|
199
|
+
species count mean(height) mean(mass)
|
200
|
+
<string> <int64> <double> <double>
|
201
|
+
0 Human 35 176.65 82.78
|
202
|
+
1 Droid 6 131.2 69.75
|
203
|
+
2 Wookiee 2 231.0 124.0
|
204
|
+
3 Gungan 3 208.67 74.0
|
205
|
+
4 Zabrak 2 173.0 80.0
|
206
|
+
5 Twi'lek 2 179.0 55.0
|
207
|
+
6 Mirialan 2 168.0 53.1
|
208
|
+
7 Kaminoan 2 221.0 88.0
|
209
|
+
```
|
210
|
+
|
211
|
+
より詳しいデータフレームの使用例については、[DataFrame.md](doc/DataFrame.md) をご参照ください。
|
212
|
+
|
213
|
+
|
214
|
+
### 1次元のデータを保持する `Vector`
|
215
|
+
|
216
|
+
クラス`RedAmber::Vector` はデータフレームの中の列方向に格納された1次元のデータ列を保持します.
|
217
|
+
|
218
|
+
より詳しい使用例については [Vector.md](doc/Vector.md) をご参照ください。
|
219
|
+
|
220
|
+
## Jupyter notebook
|
221
|
+
|
222
|
+
Jupyter Notebook形式の使用例として、[Examples of Red Amber](https://github.com/heronshoes/docker-stacks/blob/RedAmber-binder/binder/examples_of_red_amber.ipynb)
|
223
|
+
([raw file](https://raw.githubusercontent.com/heronshoes/docker-stacks/RedAmber-binder/binder/examples_of_red_amber.ipynb)) があります。データのロードから各種のデータ処理まで100以上の使用例を集めています。[Binder](https://mybinder.org/v2/gh/heronshoes/docker-stacks/RedAmber-binder?filepath=examples_of_red_amber.ipynb).
|
224
|
+
[![Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/heronshoes/docker-stacks/RedAmber-binder?filepath=examples_of_red_amber.ipynb)で試すこともできます。
|
225
|
+
|
226
|
+
|
227
|
+
## 開発
|
228
|
+
|
229
|
+
```shell
|
230
|
+
git clone https://github.com/red-data-tools/red_amber.git
|
231
|
+
cd red_amber
|
232
|
+
bundle install
|
233
|
+
bundle exec rake test
|
234
|
+
```
|
235
|
+
|
236
|
+
rake testは必須ですが、rake rubocopをパスすることはコントリビュートの際に必須ではありません。このプロジェクトではコードの書き方の好みを尊重します。ただしマージの際に書き方を統一することがあります。
|
237
|
+
|
238
|
+
## コミュニティ
|
239
|
+
|
240
|
+
このプロジェクトを支援して頂けると嬉しいです。支援の方法はいくつかあります。
|
241
|
+
|
242
|
+
- [discussions](https://github.com/heronshoes/red_amber/discussions)で話をする [![Discussions](https://img.shields.io/github/discussions/heronshoes/red_amber)](https://github.com/red-data-tools/red_amber/discussions)
|
243
|
+
- Q and Aや使用方法、豆知識などを見る。
|
244
|
+
- 疑問に思っていることを質問する。
|
245
|
+
- 新しいアイデアを共有する。アイデアはdiscussionからissueに昇格させて育てていくこともあります。漠然としたアイデアでもdiscussionから始めて大きくしていきましょう。
|
246
|
+
- [バグ報告や新しい機能の提案](https://github.com/red-data-tools/red_amber/issues)
|
247
|
+
- バグの修正や[プルリクエスト](https://github.com/red-data-tools/red_amber/pulls)
|
248
|
+
- ドキュメントを修正したり、不明確なところを直したり、新しく追加する
|
249
|
+
|
250
|
+
## License
|
251
|
+
|
252
|
+
The gem is available as open source under the terms of the [MIT License](https://opensource.org/licenses/MIT).
|
data/README.md
CHANGED
@@ -10,30 +10,34 @@
|
|
10
10
|
A simple dataframe library for Ruby.
|
11
11
|
|
12
12
|
- Powered by [Red Arrow](https://github.com/apache/arrow/tree/master/ruby/red-arrow)
|
13
|
-
[![
|
13
|
+
[![Red Data Tools Chat (en)](https://badges.gitter.im/red-data-tools/en.svg)](https://app.element.io/#/room/#red-data-tools_en:gitter.im) [![Gem Version](https://img.shields.io/gem/v/red-arrow?color=brightgreen)](https://rubygems.org/gems/red-arrow)
|
14
14
|
- Inspired by the dataframe library [Rover-df](https://github.com/ankane/rover)
|
15
15
|
|
16
|
+
[日本語のREADME](README.ja.md)
|
17
|
+
|
16
18
|
![screenshot from jupyterlab](https://raw.githubusercontent.com/red-data-tools/red_amber/main/doc/image/screenshot.png)
|
17
19
|
|
18
20
|
## Requirements
|
19
21
|
### Ruby
|
20
22
|
Supported Ruby version is >= 3.0 (since RedAmber 0.3.0).
|
21
|
-
- I decided to remove support for Ruby 2.7 without waiting for its EOL. See [Release note for v0.3.0](https://github.com/red-data-tools/red_amber/discussions/162) for details.
|
22
23
|
|
23
|
-
###
|
24
|
+
### Required libraries
|
24
25
|
```ruby
|
25
|
-
gem 'red-arrow', '~>
|
26
|
-
gem 'red-parquet', '~>
|
27
|
-
gem '
|
26
|
+
gem 'red-arrow', '~> 12.0.0' # Requires Apache Arrow (see installation below)
|
27
|
+
gem 'red-parquet', '~> 12.0.0' # Optional, if you use IO from/to parquet
|
28
|
+
gem 'red-datasets-arrow' # Optional, if you use Red Datasets or random sampling feature
|
29
|
+
gem 'red-arrow-numo-narray' # Optional, recommended if you use inputs from Numo::NArray
|
30
|
+
gem 'red-arrow-activerecord' # Optional, if you use Active Record
|
31
|
+
gem 'rover-df', '~> 0.3.0' # Optional, if you use IO from/to Rover::DataFrame
|
28
32
|
```
|
29
33
|
|
30
34
|
## Installation
|
31
35
|
|
32
36
|
Install requirements before you install RedAmber.
|
33
37
|
|
34
|
-
- Apache Arrow (~>
|
35
|
-
- Apache Arrow GLib (~>
|
36
|
-
- Apache Parquet GLib (~>
|
38
|
+
- Apache Arrow (~> 12.0.0)
|
39
|
+
- Apache Arrow GLib (~> 12.0.0)
|
40
|
+
- Apache Parquet GLib (~> 12.0.0) # If you use IO from/to parquet
|
37
41
|
|
38
42
|
See [Apache Arrow install document](https://arrow.apache.org/install/).
|
39
43
|
|
@@ -49,7 +53,7 @@ See [Apache Arrow install document](https://arrow.apache.org/install/).
|
|
49
53
|
sudo apt install -y -V libarrow-glib-dev
|
50
54
|
```
|
51
55
|
|
52
|
-
- On Fedora
|
56
|
+
- On Fedora 39 (Rawhide):
|
53
57
|
|
54
58
|
```
|
55
59
|
sudo dnf update
|
@@ -66,19 +70,20 @@ See [Apache Arrow install document](https://arrow.apache.org/install/).
|
|
66
70
|
If you prepared Apache Arrow, add these lines to your Gemfile:
|
67
71
|
|
68
72
|
```ruby
|
69
|
-
gem 'red-arrow', '~>
|
73
|
+
gem 'red-arrow', '~> 12.0.0'
|
70
74
|
gem 'red_amber'
|
71
|
-
gem 'red-parquet', '~>
|
72
|
-
gem 'rover-df', '~> 0.3.0' # Optional, if you use IO from/to Rover::DataFrame
|
75
|
+
gem 'red-parquet', '~> 12.0.0' # Optional, if you use IO from/to parquet
|
73
76
|
gem 'red-datasets-arrow' # Optional, recommended if you use Red Datasets
|
74
77
|
gem 'red-arrow-numo-narray' # Optional, recommended if you use inputs from Numo::NArray
|
78
|
+
gem 'red-arrow-activerecord' # Optional, if you use Active Record
|
79
|
+
gem 'rover-df', '~> 0.3.0' # Optional, if you use IO from/to Rover::DataFrame
|
75
80
|
```
|
76
81
|
|
77
82
|
And then execute `bundle install` or install them yourself such as `gem install red_amber`.
|
78
83
|
|
79
84
|
## Docker image and Jupyter Notebook
|
80
85
|
|
81
|
-
Docker image is available from docker folder. See [readme](docker/readme.md) for instruction. Integrated Jypyter notebook is in docker/notebook folder.
|
86
|
+
Docker image is available from the `docker` folder. See [readme](docker/readme.md) for instructions. An integrated Jupyter notebook is in the docker/notebook folder.
|
82
87
|
|
83
88
|
You can try the contents of this README interactively by [Binder](https://mybinder.org/v2/gh/heronshoes/docker-stacks/RedAmber-binder?filepath=red-amber.ipynb).
|
84
89
|
[![Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/heronshoes/docker-stacks/RedAmber-binder?filepath=red-amber.ipynb)
|
data/benchmark/group.yml
CHANGED
@@ -1,18 +1,19 @@
|
|
1
1
|
loop_count: 3
|
2
2
|
|
3
3
|
contexts:
|
4
|
-
- name: 0.2.2
|
5
|
-
gems:
|
6
|
-
red_amber: 0.2.2
|
7
4
|
- name: 0.3.0
|
8
5
|
gems:
|
9
6
|
red_amber: 0.3.0
|
7
|
+
- name: 0.4.2
|
8
|
+
gems:
|
9
|
+
red_amber: 0.4.2
|
10
10
|
- name: HEAD
|
11
11
|
prelude: |
|
12
12
|
$LOAD_PATH.unshift(File.expand_path('lib'))
|
13
13
|
|
14
14
|
prelude: |
|
15
15
|
require 'red_amber'
|
16
|
+
include RedAmber
|
16
17
|
require 'datasets-arrow'
|
17
18
|
|
18
19
|
ds = Datasets::Rdatasets.new('nycflights13', 'flights')
|
@@ -32,8 +33,14 @@ benchmark:
|
|
32
33
|
'G03: sum arr_delay, mean distance by flight': |
|
33
34
|
df.group(:flight) { [sum(:arr_delay), mean(:distance)] }
|
34
35
|
|
35
|
-
'G04:
|
36
|
+
'G04: mean air_time, distance by flight': |
|
36
37
|
df.group(:flight).mean(:air_time, :distance)
|
37
38
|
|
38
|
-
'
|
39
|
+
'G05: sum dep_delay, arr_delay by carrier': |
|
39
40
|
df.group(:carrier).sum(:dep_delay, :arr_delay)
|
41
|
+
|
42
|
+
'G06: filters': |
|
43
|
+
Group.new(df, :dest).filters
|
44
|
+
|
45
|
+
'G07: inspect': |
|
46
|
+
Group.new(df, :dest).inspect
|
data/docker/Gemfile
CHANGED
@@ -5,11 +5,11 @@ source 'https://rubygems.org'
|
|
5
5
|
gem 'irb'
|
6
6
|
|
7
7
|
gem 'numo-narray'
|
8
|
-
gem 'red-arrow', '~>
|
8
|
+
gem 'red-arrow', '~> 12.0.0'
|
9
9
|
gem 'red-arrow-numo-narray'
|
10
|
-
gem 'red-parquet', '~>
|
10
|
+
gem 'red-parquet', '~> 12.0.0'
|
11
11
|
|
12
|
-
gem 'red_amber', '
|
12
|
+
gem 'red_amber', path: '../'
|
13
13
|
gem 'red-amber-view'
|
14
14
|
gem 'rover-df'
|
15
15
|
|
@@ -18,4 +18,9 @@ gem 'red-datasets-arrow'
|
|
18
18
|
|
19
19
|
gem 'benchmark_driver'
|
20
20
|
gem 'benchmark-ips'
|
21
|
+
|
22
|
+
gem 'charty'
|
21
23
|
gem 'faker'
|
24
|
+
gem 'matplotlib'
|
25
|
+
gem 'pycall'
|
26
|
+
gem 'unicode_plot'
|
data/docker/Gemfile.lock
CHANGED
@@ -1,37 +1,66 @@
|
|
1
|
+
PATH
|
2
|
+
remote: ..
|
3
|
+
specs:
|
4
|
+
red_amber (0.5.0.pre.HEAD)
|
5
|
+
red-arrow (~> 12.0.0)
|
6
|
+
|
1
7
|
GEM
|
2
8
|
remote: https://rubygems.org/
|
3
9
|
specs:
|
4
10
|
benchmark-ips (2.12.0)
|
5
11
|
benchmark_driver (0.16.3)
|
6
12
|
bigdecimal (3.1.4)
|
13
|
+
charty (0.2.12)
|
14
|
+
matplotlib (>= 1.2.0)
|
15
|
+
pandas (>= 0.3.5)
|
16
|
+
playwright-ruby-client
|
17
|
+
red-colors (>= 0.3.0)
|
18
|
+
red-datasets (>= 0.1.2)
|
19
|
+
red-palette (>= 0.5.0)
|
7
20
|
concurrent-ruby (1.2.2)
|
8
21
|
csv (3.2.6)
|
22
|
+
enumerable-statistics (2.0.7)
|
9
23
|
extpp (0.1.1)
|
10
24
|
faker (3.1.1)
|
11
25
|
i18n (>= 1.8.11, < 2)
|
12
26
|
fiddle (1.1.1)
|
13
|
-
gio2 (4.1.
|
27
|
+
gio2 (4.1.4)
|
14
28
|
fiddle
|
15
|
-
gobject-introspection (= 4.1.
|
16
|
-
glib2 (4.1.
|
29
|
+
gobject-introspection (= 4.1.4)
|
30
|
+
glib2 (4.1.4)
|
17
31
|
native-package-installer (>= 1.0.3)
|
18
32
|
pkg-config (>= 1.3.5)
|
19
|
-
gobject-introspection (4.1.
|
20
|
-
glib2 (= 4.1.
|
33
|
+
gobject-introspection (4.1.4)
|
34
|
+
glib2 (= 4.1.4)
|
21
35
|
i18n (1.12.0)
|
22
36
|
concurrent-ruby (~> 1.0)
|
23
37
|
io-console (0.6.0)
|
24
|
-
irb (1.6.
|
38
|
+
irb (1.6.4)
|
25
39
|
reline (>= 0.3.0)
|
26
40
|
libui (0.0.15)
|
41
|
+
matplotlib (1.3.0)
|
42
|
+
pycall (>= 1.0.0)
|
43
|
+
matrix (0.4.2)
|
44
|
+
mime-types (3.4.1)
|
45
|
+
mime-types-data (~> 3.2015)
|
46
|
+
mime-types-data (3.2023.0218.1)
|
27
47
|
native-package-installer (1.1.5)
|
28
48
|
numo-narray (0.9.2.1)
|
49
|
+
numpy (0.4.0)
|
50
|
+
pycall (>= 1.2.0.beta1)
|
51
|
+
pandas (0.3.8)
|
52
|
+
numpy
|
53
|
+
pycall (>= 1.0.0)
|
29
54
|
pkg-config (1.5.1)
|
55
|
+
playwright-ruby-client (1.31.1)
|
56
|
+
concurrent-ruby (>= 1.1.6)
|
57
|
+
mime-types (>= 3.0)
|
58
|
+
pycall (1.4.2)
|
30
59
|
red-amber-view (0.0.1)
|
31
60
|
libui
|
32
61
|
red-arrow
|
33
62
|
red_amber
|
34
|
-
red-arrow (
|
63
|
+
red-arrow (12.0.0)
|
35
64
|
bigdecimal (>= 3.1.0)
|
36
65
|
extpp (>= 0.1.1)
|
37
66
|
gio2 (>= 3.5.0)
|
@@ -40,6 +69,8 @@ GEM
|
|
40
69
|
red-arrow-numo-narray (0.0.6)
|
41
70
|
numo-narray
|
42
71
|
red-arrow
|
72
|
+
red-colors (0.3.0)
|
73
|
+
matrix
|
43
74
|
red-datasets (0.1.5)
|
44
75
|
csv (>= 3.2.4)
|
45
76
|
rexml
|
@@ -47,34 +78,41 @@ GEM
|
|
47
78
|
red-datasets-arrow (0.0.3)
|
48
79
|
red-arrow
|
49
80
|
red-datasets (>= 0.0.3)
|
50
|
-
red-
|
51
|
-
red-
|
52
|
-
|
53
|
-
red-arrow (
|
54
|
-
reline (0.3.
|
81
|
+
red-palette (0.5.0)
|
82
|
+
red-colors (>= 0.3.0)
|
83
|
+
red-parquet (12.0.0)
|
84
|
+
red-arrow (= 12.0.0)
|
85
|
+
reline (0.3.3)
|
55
86
|
io-console (~> 0.5)
|
56
87
|
rexml (3.2.5)
|
57
88
|
rover-df (0.3.4)
|
58
89
|
numo-narray (>= 0.9.1.9)
|
59
90
|
rubyzip (2.3.2)
|
91
|
+
unicode_plot (0.0.5)
|
92
|
+
enumerable-statistics (>= 2.0.1)
|
60
93
|
|
61
94
|
PLATFORMS
|
95
|
+
x86_64-darwin-20
|
62
96
|
x86_64-linux
|
63
97
|
|
64
98
|
DEPENDENCIES
|
65
99
|
benchmark-ips
|
66
100
|
benchmark_driver
|
101
|
+
charty
|
67
102
|
faker
|
68
103
|
irb
|
104
|
+
matplotlib
|
69
105
|
numo-narray
|
106
|
+
pycall
|
70
107
|
red-amber-view
|
71
|
-
red-arrow (~>
|
108
|
+
red-arrow (~> 12.0.0)
|
72
109
|
red-arrow-numo-narray
|
73
110
|
red-datasets
|
74
111
|
red-datasets-arrow
|
75
|
-
red-parquet (~>
|
76
|
-
red_amber
|
112
|
+
red-parquet (~> 12.0.0)
|
113
|
+
red_amber!
|
77
114
|
rover-df
|
115
|
+
unicode_plot
|
78
116
|
|
79
117
|
BUNDLED WITH
|
80
|
-
2.4.
|
118
|
+
2.4.12
|