red_amber 0.4.2 → 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.rubocop.yml +8 -2
- data/CHANGELOG.md +57 -0
- data/Gemfile +5 -6
- data/README.ja.md +252 -0
- data/README.md +19 -14
- data/benchmark/group.yml +12 -5
- data/docker/Gemfile +8 -3
- data/docker/Gemfile.lock +54 -16
- data/docker/example +29 -17
- data/lib/red_amber/data_frame.rb +78 -4
- data/lib/red_amber/data_frame_combinable.rb +147 -119
- data/lib/red_amber/data_frame_displayable.rb +6 -5
- data/lib/red_amber/data_frame_selectable.rb +49 -0
- data/lib/red_amber/group.rb +191 -90
- data/lib/red_amber/helper.rb +26 -0
- data/lib/red_amber/subframes.rb +12 -5
- data/lib/red_amber/vector.rb +43 -24
- data/lib/red_amber/vector_aggregation.rb +26 -0
- data/lib/red_amber/vector_selectable.rb +74 -23
- data/lib/red_amber/vector_string_function.rb +211 -0
- data/lib/red_amber/vector_unary_element_wise.rb +4 -0
- data/lib/red_amber/vector_updatable.rb +28 -0
- data/lib/red_amber/version.rb +1 -1
- data/lib/red_amber.rb +2 -1
- data/red_amber.gemspec +1 -1
- metadata +7 -5
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: b8fc1df498792b2b30d63a47a783cda67ccb8cea09e933aa8cba5d317277f500
|
4
|
+
data.tar.gz: 83e54f0fb6070a6b3c4301d0cd3e5356f1ca4e09bdae200f4fc7694a2e3e7daa
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 440dd984e88afd4bee7860a0f5b03c54094b8536de6acf70b770d3f473c1ee93608a5565d62b5bd60d5e8e3ba8c09675e136373a15daef77bfc455c7c5a4a7cc
|
7
|
+
data.tar.gz: '09f27ff2a0c3804b345c4b5c581013135544fed098265e9bca274e34de52791b9a36525df22b17fe80093f14249c195496366e16079c25e70cf660070dc66858'
|
data/.rubocop.yml
CHANGED
@@ -76,7 +76,8 @@ Metrics/AbcSize:
|
|
76
76
|
Max: 30
|
77
77
|
CountRepeatedAttributes: false
|
78
78
|
AllowedMethods: [
|
79
|
-
'
|
79
|
+
'join_merge_keys', # 54.18
|
80
|
+
'join', # 53.1
|
80
81
|
'dataframe_info', # 46.5
|
81
82
|
'format_table', # 84.62
|
82
83
|
'to_long', # 33.66
|
@@ -87,6 +88,9 @@ Metrics/AbcSize:
|
|
87
88
|
'[]', # 33.76
|
88
89
|
'split', # 37.35
|
89
90
|
'aggregate', # 38.13
|
91
|
+
'filters', # 33.91
|
92
|
+
'merge_keys', # 32.17
|
93
|
+
'rename_keys', # 31.64
|
90
94
|
]
|
91
95
|
|
92
96
|
# Max: 25
|
@@ -139,10 +143,12 @@ Metrics/MethodLength:
|
|
139
143
|
Max: 30
|
140
144
|
AllowedMethods: [
|
141
145
|
'join', # 47
|
142
|
-
'
|
146
|
+
'join_merge_keys', # 41
|
143
147
|
'format_table', # 53
|
144
148
|
'slice_by', # 38
|
145
149
|
'assign_update', # 35
|
150
|
+
'summarize', # 35
|
151
|
+
'dataframe_info', # 33
|
146
152
|
'drop', # 32
|
147
153
|
'aggregate', # 31
|
148
154
|
]
|
data/CHANGELOG.md
CHANGED
@@ -1,3 +1,60 @@
|
|
1
|
+
## [0.5.0] - 2023-05-24
|
2
|
+
|
3
|
+
- Breaking change
|
4
|
+
- Use non keyword argument in #sub_by_value (#219)
|
5
|
+
- Upgrade dependency to Arrow 12.0.0 (#238)
|
6
|
+
- right_join will output columns in the same order as Red Arrow.
|
7
|
+
- DataFrame#join will not force ordering of original column by default
|
8
|
+
- Join with type, such as full_join, sort after join by default
|
9
|
+
|
10
|
+
- Bug fixes
|
11
|
+
- Use truncate in Vector#sample(float) (#229)
|
12
|
+
- Support options in DataFrame#tdra (#231)
|
13
|
+
- Fix printing table with non-ascii strings (#233)
|
14
|
+
- Fix join for Arrow 12.0.0
|
15
|
+
|
16
|
+
- New features and improvements
|
17
|
+
- Add a singleton method Vector.[] (#218)
|
18
|
+
- Add an alias #sub_group (#219)
|
19
|
+
- Accept Group#summarize{Hash} to rename aggregated columns (#219)
|
20
|
+
- Add Group#group_frame (#219)
|
21
|
+
- Add Vector#cast (#224)
|
22
|
+
- Add Vector#fill_nil(value) (#226)
|
23
|
+
- Add Vector#one (#227)
|
24
|
+
- Add Vector#mode (#228)
|
25
|
+
- Add DataFrame#propagate (#235)
|
26
|
+
- Add DataFrame#sample (#237)
|
27
|
+
- Add DataFrame#shuffle (#237)
|
28
|
+
- Support RankOptions in Vector#rank (#239)
|
29
|
+
- Introduce MatchSubstringOptions family in Vector (#241)
|
30
|
+
- Introduce Vector#match_substring?
|
31
|
+
- Add Vector#end_with?, #start_with? method
|
32
|
+
- Add Vector#match_like?
|
33
|
+
- Add Vector#count_substring method
|
34
|
+
|
35
|
+
- Refactoring
|
36
|
+
- Refine Group and SubFrames function (#219)
|
37
|
+
- Refine Group#group_count
|
38
|
+
- Use Acero in Group#filters
|
39
|
+
- Refine Group#filters, not using Acero
|
40
|
+
- Refine Group#summarize(array)
|
41
|
+
- Use Acero for renaming columns in join (#238)
|
42
|
+
- Use index kernel with IndexOptions introduced in 12.0.0 (#240)
|
43
|
+
|
44
|
+
- Improve in tests/CI
|
45
|
+
- Use Fedora 39 Rawhide in CI (#238)
|
46
|
+
|
47
|
+
- Documentation and Example
|
48
|
+
- Add missing yard documents for SubFrames::Selectors (#219)
|
49
|
+
- Update docker/example (#219)
|
50
|
+
- Update Gemfile in docker (#219)
|
51
|
+
- Add README.ja.md (#242)
|
52
|
+
|
53
|
+
- GitHub site
|
54
|
+
- Update link of Red Data Tools Chat to matrix (#242)
|
55
|
+
|
56
|
+
- Thanks
|
57
|
+
|
1
58
|
## [0.4.2] - 2023-04-02
|
2
59
|
|
3
60
|
- Breaking change
|
data/Gemfile
CHANGED
@@ -7,7 +7,7 @@ gemspec
|
|
7
7
|
group :test do
|
8
8
|
gem 'rake'
|
9
9
|
|
10
|
-
gem 'red-parquet', '~>
|
10
|
+
gem 'red-parquet', '~> 12.0.0'
|
11
11
|
gem 'rover-df', '~> 0.3.0'
|
12
12
|
|
13
13
|
gem 'rubocop'
|
@@ -15,14 +15,13 @@ group :test do
|
|
15
15
|
gem 'rubocop-rake'
|
16
16
|
gem 'rubocop-rubycw', require: false
|
17
17
|
|
18
|
-
gem 'iruby'
|
19
|
-
gem 'test-unit'
|
20
|
-
gem 'webrick'
|
21
|
-
gem 'yard'
|
22
|
-
|
23
18
|
gem 'benchmark_driver'
|
19
|
+
gem 'iruby'
|
24
20
|
gem 'red-arrow-numo-narray'
|
25
21
|
gem 'red-datasets-arrow'
|
26
22
|
gem 'simplecov'
|
27
23
|
gem 'simplecov-json'
|
24
|
+
gem 'test-unit'
|
25
|
+
gem 'webrick'
|
26
|
+
gem 'yard'
|
28
27
|
end
|
data/README.ja.md
ADDED
@@ -0,0 +1,252 @@
|
|
1
|
+
# RedAmber
|
2
|
+
|
3
|
+
[](https://rubygems.org/gems/red_amber)
|
4
|
+
[](https://github.com/red-data-tools/red_amber/actions/workflows/ci.yml)
|
5
|
+
[](https://codeclimate.com/github/heronshoes/red_amber/maintainability)
|
6
|
+
[](https://codeclimate.com/github/heronshoes/red_amber/test_coverage)
|
7
|
+
[](https://heronshoes.github.io/red_amber/)
|
8
|
+
[](https://github.com/red-data-tools/red_amber/discussions)
|
9
|
+
|
10
|
+
Rubyistのためのデータフレームライブラリ.
|
11
|
+
|
12
|
+
- Powered by [Red Arrow](https://github.com/apache/arrow/tree/master/ruby/red-arrow)
|
13
|
+
[](https://app.element.io/#/room/#red-data-tools_ja:gitter.im) [](https://rubygems.org/gems/red-arrow)
|
14
|
+
- Inspired by the dataframe library [Rover-df](https://github.com/ankane/rover)
|
15
|
+
|
16
|
+
[README in English](README.md)
|
17
|
+
|
18
|
+

|
19
|
+
|
20
|
+
## 必要な環境
|
21
|
+
### Ruby
|
22
|
+
- Ruby 3.0 以上.
|
23
|
+
|
24
|
+
### ライブラリ
|
25
|
+
```ruby
|
26
|
+
gem 'red-arrow', '~> 12.0.0' # お使いの環境に合わせた Apache Arrow が必要です(下記のインストールを参照してください)
|
27
|
+
gem 'red-parquet', '~> 12.0.0' # 必要に応じて。Parquetの入出力が必要な場合。
|
28
|
+
gem 'red-datasets-arrow' # 必要に応じて。Red Datasets またはランダムサンプリングが必要な場合。
|
29
|
+
gem 'red-arrow-activerecord' # 必要に応じて。Active Record とのデータ交換が必要な場合。
|
30
|
+
gem 'rover-df', '~> 0.3.0' # 必要に応じて。Rover::DataFrameに対する入出力が必要な場合。
|
31
|
+
```
|
32
|
+
|
33
|
+
## インストール
|
34
|
+
|
35
|
+
RedAmberをインストールする前に、下記のライブラリのインストールが必要です。
|
36
|
+
|
37
|
+
- Apache Arrow (~> 12.0.0)
|
38
|
+
- Apache Arrow GLib (~> 12.0.0)
|
39
|
+
- Apache Parquet GLib (~> 12.0.0) # Parquetの入出力が必要な場合。
|
40
|
+
|
41
|
+
環境ごとの詳しいインストール方法は、 [Apache Arrow install document](https://arrow.apache.org/install/) を参照してください。
|
42
|
+
|
43
|
+
- Ubuntuの場合の最低限必要なインストール例:
|
44
|
+
|
45
|
+
```
|
46
|
+
sudo apt update
|
47
|
+
sudo apt install -y -V ca-certificates lsb-release wget
|
48
|
+
wget https://apache.jfrog.io/artifactory/arrow/$(lsb_release --id --short | tr 'A-Z' 'a-z')/apache-arrow-apt-source-latest-$(lsb_release --codename --short).deb
|
49
|
+
sudo apt install -y -V ./apache-arrow-apt-source-latest-$(lsb_release --codename --short).deb
|
50
|
+
sudo apt update
|
51
|
+
sudo apt install -y -V libarrow-dev
|
52
|
+
sudo apt install -y -V libarrow-glib-dev
|
53
|
+
```
|
54
|
+
|
55
|
+
- Fedora 39 (Rawhide)の場合:
|
56
|
+
|
57
|
+
```
|
58
|
+
sudo dnf update
|
59
|
+
sudo dnf -y install gcc-c++ libarrow-devel libarrow-glib-devel ruby-devel
|
60
|
+
```
|
61
|
+
|
62
|
+
- macOS の場合は、Homebrewを使用する:
|
63
|
+
|
64
|
+
```
|
65
|
+
brew install apache-arrow
|
66
|
+
brew install apache-arrow-glib
|
67
|
+
```
|
68
|
+
|
69
|
+
Apache Arrowがインストールできたら、下記の行をGemfileに追加してください:
|
70
|
+
|
71
|
+
```ruby
|
72
|
+
gem 'red-arrow', '~> 12.0.0' # お使いの環境に合わせた Apache Arrow が必要です(下記のインストールを参照してください)
|
73
|
+
gem 'red_amber'
|
74
|
+
gem 'red-parquet', '~> 12.0.0' # 必要に応じて。Parquetの入出力が必要な場合。
|
75
|
+
gem 'red-datasets-arrow' # 必要に応じて。Red Datasets またはランダムサンプリングが必要な場合。
|
76
|
+
gem 'red-arrow-numo-narray' # 必要に応じて。Numo::NArrayとの連携が必要な場合
|
77
|
+
gem 'red-arrow-activerecord' # 必要に応じて。Active Record とのデータ交換が必要な場合。
|
78
|
+
gem 'rover-df', '~> 0.3.0' # 必要に応じて。Rover::DataFrameに対する入出力が必要な場合。
|
79
|
+
```
|
80
|
+
|
81
|
+
`bundle install`とするか、または `gem install red_amber`としてインストールしてください。
|
82
|
+
|
83
|
+
## Docker イメージと Jupyter Notebook
|
84
|
+
|
85
|
+
このリポジトリの`docker` フォルダーから Docker コンテナ環境を生成できます。リポジトリをクローンしてから、dockerフォルダーにある [readme](docker/readme.md) を参照してください。その環境では `docker/notebook` フォルダーにある Jupyter Notebookイメージを試用できます。
|
86
|
+
|
87
|
+
このREADMEの内容をネットワーク上のJupyter Notebookでインタラクティブに試用することも出来ます。 [Binder](https://mybinder.org/v2/gh/heronshoes/docker-stacks/RedAmber-binder?filepath=red-amber.ipynb).
|
88
|
+
[](https://mybinder.org/v2/gh/heronshoes/docker-stacks/RedAmber-binder?filepath=red-amber.ipynb)
|
89
|
+
|
90
|
+
Jupyter Notebookの環境を含めた他の多くのデータ処理用のライブラリーとともにRedAmberもパッケージングされたDocker Imageとして、[RubyData Docker Stacks](https://github.com/RubyData/docker-stacks) が利用できます(Thanks to Kenta Murata).
|
91
|
+
|
92
|
+
## 他のデータフレームライブラリとの比較表
|
93
|
+
|
94
|
+
RedAmberの基本的な機能をPython
|
95
|
+
[pandas](https://pandas.pydata.org/) や
|
96
|
+
R [Tidyverse](https://www.tidyverse.org/) や
|
97
|
+
Julia [Dataframes](https://dataframes.juliadata.org/stable/) と比較した表は [DataFrame_Comparison.md](doc/DataFrame_Comparison.md) にあります(Thanks to Benson Muite).
|
98
|
+
|
99
|
+
## `RedAmber`のデータフレーム
|
100
|
+
|
101
|
+
クラス `RedAmber::DataFrame` は2次元のデータの集まりを表現します。
|
102
|
+
その実体は Red Arrowの Tableオブジェクトです。
|
103
|
+
|
104
|
+

|
105
|
+
|
106
|
+
それではライブラリをロードしていくつかの例を試してみましょう。
|
107
|
+
|
108
|
+
```ruby
|
109
|
+
require 'red_amber' # require 'red-amber' でもOKです.
|
110
|
+
include RedAmber
|
111
|
+
```
|
112
|
+
|
113
|
+
### 例: diamonds データセット
|
114
|
+
|
115
|
+
もしまだであれば、Red DatasetsのArrow拡張を`
|
116
|
+
gem install red-datasets-arrow
|
117
|
+
`
|
118
|
+
としてインストールしてから次を実行してください。
|
119
|
+
|
120
|
+
```ruby
|
121
|
+
require 'datasets-arrow' # サンプルデータのロードのため
|
122
|
+
|
123
|
+
dataset = Datasets::Diamonds.new
|
124
|
+
diamonds = DataFrame.new(dataset) # v0.2.3以前では, `dataset.to_arrow`とする必要があります。
|
125
|
+
|
126
|
+
# =>
|
127
|
+
#<RedAmber::DataFrame : 53940 x 10 Vectors, 0x000000000000f668>
|
128
|
+
carat cut color clarity depth table price x ... z
|
129
|
+
<double> <string> <string> <string> <double> <double> <uint16> <double> ... <double>
|
130
|
+
0 0.23 Ideal E SI2 61.5 55.0 326 3.95 ... 2.43
|
131
|
+
1 0.21 Premium E SI1 59.8 61.0 326 3.89 ... 2.31
|
132
|
+
2 0.23 Good E VS1 56.9 65.0 327 4.05 ... 2.31
|
133
|
+
3 0.29 Premium I VS2 62.4 58.0 334 4.2 ... 2.63
|
134
|
+
4 0.31 Good J SI2 63.3 58.0 335 4.34 ... 2.75
|
135
|
+
: : : : : : : : : ... :
|
136
|
+
53937 0.7 Very Good D SI1 62.8 60.0 2757 5.66 ... 3.56
|
137
|
+
53938 0.86 Premium H SI2 61.0 58.0 2757 6.15 ... 3.74
|
138
|
+
53939 0.75 Ideal D SI2 62.2 55.0 2757 5.83 ... 3.64
|
139
|
+
```
|
140
|
+
|
141
|
+
例えば、1カラットより大きいレコードに対し、cut毎の平均のpriceを求めるには次のようにします。
|
142
|
+
|
143
|
+
```ruby
|
144
|
+
df = diamonds
|
145
|
+
.slice { carat > 1 } # #sliceの代わりに#filterでも可
|
146
|
+
.group(:cut)
|
147
|
+
.mean(:price) # ここで:priceを指定する場合はgroupの前のpickは不要
|
148
|
+
.sort('-mean(price)')
|
149
|
+
|
150
|
+
# =>
|
151
|
+
#<RedAmber::DataFrame : 5 x 2 Vectors, 0x000000000000f67c>
|
152
|
+
cut mean(price)
|
153
|
+
<string> <double>
|
154
|
+
0 Ideal 8674.23
|
155
|
+
1 Premium 8487.25
|
156
|
+
2 Very Good 8340.55
|
157
|
+
3 Good 7753.6
|
158
|
+
4 Fair 7177.86
|
159
|
+
```
|
160
|
+
|
161
|
+
Arrowのデータはイミュータブルなので、これらのメソッドは新しいオブジェクトを返します。
|
162
|
+
|
163
|
+
次の例は、列をリネームしてから新しい列に簡単な計算の結果を格納します。
|
164
|
+
|
165
|
+
```ruby
|
166
|
+
usdjpy = 110.0 # 今よりずっと円高の頃
|
167
|
+
|
168
|
+
df.rename('mean(price)': :mean_price_USD)
|
169
|
+
.assign(:mean_price_JPY) { mean_price_USD * usdjpy }
|
170
|
+
|
171
|
+
# =>
|
172
|
+
#<RedAmber::DataFrame : 5 x 3 Vectors, 0x000000000000f71c>
|
173
|
+
cut mean_price_USD mean_price_JPY
|
174
|
+
<string> <double> <double>
|
175
|
+
0 Ideal 8674.23 954164.93
|
176
|
+
1 Premium 8487.25 933597.34
|
177
|
+
2 Very Good 8340.55 917460.37
|
178
|
+
3 Good 7753.6 852896.11
|
179
|
+
4 Fair 7177.86 789564.12
|
180
|
+
```
|
181
|
+
|
182
|
+
### 例: starwars データセット
|
183
|
+
|
184
|
+
次の例は、CSVファイルをダウンロードして`starwars` データセットを読み込みます。その後簡単なデータのクリーニングを行います。
|
185
|
+
|
186
|
+
```ruby
|
187
|
+
uri = URI('https://vincentarelbundock.github.io/Rdatasets/csv/dplyr/starwars.csv')
|
188
|
+
|
189
|
+
starwars = DataFrame.load(uri)
|
190
|
+
|
191
|
+
starwars
|
192
|
+
.drop(0) # 不要な列を取り除く
|
193
|
+
.remove { species == "NA" } # 不要な行を取り除く
|
194
|
+
.group(:species) { [count(:species), mean(:height, :mass)] }
|
195
|
+
.slice { count > 1 } # #filterでも可
|
196
|
+
|
197
|
+
# =>
|
198
|
+
#<RedAmber::DataFrame : 8 x 4 Vectors, 0x000000000000f848>
|
199
|
+
species count mean(height) mean(mass)
|
200
|
+
<string> <int64> <double> <double>
|
201
|
+
0 Human 35 176.65 82.78
|
202
|
+
1 Droid 6 131.2 69.75
|
203
|
+
2 Wookiee 2 231.0 124.0
|
204
|
+
3 Gungan 3 208.67 74.0
|
205
|
+
4 Zabrak 2 173.0 80.0
|
206
|
+
5 Twi'lek 2 179.0 55.0
|
207
|
+
6 Mirialan 2 168.0 53.1
|
208
|
+
7 Kaminoan 2 221.0 88.0
|
209
|
+
```
|
210
|
+
|
211
|
+
より詳しいデータフレームの使用例については、[DataFrame.md](doc/DataFrame.md) をご参照ください。
|
212
|
+
|
213
|
+
|
214
|
+
### 1次元のデータを保持する `Vector`
|
215
|
+
|
216
|
+
クラス`RedAmber::Vector` はデータフレームの中の列方向に格納された1次元のデータ列を保持します.
|
217
|
+
|
218
|
+
より詳しい使用例については [Vector.md](doc/Vector.md) をご参照ください。
|
219
|
+
|
220
|
+
## Jupyter notebook
|
221
|
+
|
222
|
+
Jupyter Notebook形式の使用例として、[Examples of Red Amber](https://github.com/heronshoes/docker-stacks/blob/RedAmber-binder/binder/examples_of_red_amber.ipynb)
|
223
|
+
([raw file](https://raw.githubusercontent.com/heronshoes/docker-stacks/RedAmber-binder/binder/examples_of_red_amber.ipynb)) があります。データのロードから各種のデータ処理まで100以上の使用例を集めています。[Binder](https://mybinder.org/v2/gh/heronshoes/docker-stacks/RedAmber-binder?filepath=examples_of_red_amber.ipynb).
|
224
|
+
[](https://mybinder.org/v2/gh/heronshoes/docker-stacks/RedAmber-binder?filepath=examples_of_red_amber.ipynb)で試すこともできます。
|
225
|
+
|
226
|
+
|
227
|
+
## 開発
|
228
|
+
|
229
|
+
```shell
|
230
|
+
git clone https://github.com/red-data-tools/red_amber.git
|
231
|
+
cd red_amber
|
232
|
+
bundle install
|
233
|
+
bundle exec rake test
|
234
|
+
```
|
235
|
+
|
236
|
+
rake testは必須ですが、rake rubocopをパスすることはコントリビュートの際に必須ではありません。このプロジェクトではコードの書き方の好みを尊重します。ただしマージの際に書き方を統一することがあります。
|
237
|
+
|
238
|
+
## コミュニティ
|
239
|
+
|
240
|
+
このプロジェクトを支援して頂けると嬉しいです。支援の方法はいくつかあります。
|
241
|
+
|
242
|
+
- [discussions](https://github.com/heronshoes/red_amber/discussions)で話をする [](https://github.com/red-data-tools/red_amber/discussions)
|
243
|
+
- Q and Aや使用方法、豆知識などを見る。
|
244
|
+
- 疑問に思っていることを質問する。
|
245
|
+
- 新しいアイデアを共有する。アイデアはdiscussionからissueに昇格させて育てていくこともあります。漠然としたアイデアでもdiscussionから始めて大きくしていきましょう。
|
246
|
+
- [バグ報告や新しい機能の提案](https://github.com/red-data-tools/red_amber/issues)
|
247
|
+
- バグの修正や[プルリクエスト](https://github.com/red-data-tools/red_amber/pulls)
|
248
|
+
- ドキュメントを修正したり、不明確なところを直したり、新しく追加する
|
249
|
+
|
250
|
+
## License
|
251
|
+
|
252
|
+
The gem is available as open source under the terms of the [MIT License](https://opensource.org/licenses/MIT).
|
data/README.md
CHANGED
@@ -10,30 +10,34 @@
|
|
10
10
|
A simple dataframe library for Ruby.
|
11
11
|
|
12
12
|
- Powered by [Red Arrow](https://github.com/apache/arrow/tree/master/ruby/red-arrow)
|
13
|
-
[](https://app.element.io/#/room/#red-data-tools_en:gitter.im) [](https://rubygems.org/gems/red-arrow)
|
14
14
|
- Inspired by the dataframe library [Rover-df](https://github.com/ankane/rover)
|
15
15
|
|
16
|
+
[日本語のREADME](README.ja.md)
|
17
|
+
|
16
18
|

|
17
19
|
|
18
20
|
## Requirements
|
19
21
|
### Ruby
|
20
22
|
Supported Ruby version is >= 3.0 (since RedAmber 0.3.0).
|
21
|
-
- I decided to remove support for Ruby 2.7 without waiting for its EOL. See [Release note for v0.3.0](https://github.com/red-data-tools/red_amber/discussions/162) for details.
|
22
23
|
|
23
|
-
###
|
24
|
+
### Required libraries
|
24
25
|
```ruby
|
25
|
-
gem 'red-arrow', '~>
|
26
|
-
gem 'red-parquet', '~>
|
27
|
-
gem '
|
26
|
+
gem 'red-arrow', '~> 12.0.0' # Requires Apache Arrow (see installation below)
|
27
|
+
gem 'red-parquet', '~> 12.0.0' # Optional, if you use IO from/to parquet
|
28
|
+
gem 'red-datasets-arrow' # Optional, if you use Red Datasets or random sampling feature
|
29
|
+
gem 'red-arrow-numo-narray' # Optional, recommended if you use inputs from Numo::NArray
|
30
|
+
gem 'red-arrow-activerecord' # Optional, if you use Active Record
|
31
|
+
gem 'rover-df', '~> 0.3.0' # Optional, if you use IO from/to Rover::DataFrame
|
28
32
|
```
|
29
33
|
|
30
34
|
## Installation
|
31
35
|
|
32
36
|
Install requirements before you install RedAmber.
|
33
37
|
|
34
|
-
- Apache Arrow (~>
|
35
|
-
- Apache Arrow GLib (~>
|
36
|
-
- Apache Parquet GLib (~>
|
38
|
+
- Apache Arrow (~> 12.0.0)
|
39
|
+
- Apache Arrow GLib (~> 12.0.0)
|
40
|
+
- Apache Parquet GLib (~> 12.0.0) # If you use IO from/to parquet
|
37
41
|
|
38
42
|
See [Apache Arrow install document](https://arrow.apache.org/install/).
|
39
43
|
|
@@ -49,7 +53,7 @@ See [Apache Arrow install document](https://arrow.apache.org/install/).
|
|
49
53
|
sudo apt install -y -V libarrow-glib-dev
|
50
54
|
```
|
51
55
|
|
52
|
-
- On Fedora
|
56
|
+
- On Fedora 39 (Rawhide):
|
53
57
|
|
54
58
|
```
|
55
59
|
sudo dnf update
|
@@ -66,19 +70,20 @@ See [Apache Arrow install document](https://arrow.apache.org/install/).
|
|
66
70
|
If you prepared Apache Arrow, add these lines to your Gemfile:
|
67
71
|
|
68
72
|
```ruby
|
69
|
-
gem 'red-arrow', '~>
|
73
|
+
gem 'red-arrow', '~> 12.0.0'
|
70
74
|
gem 'red_amber'
|
71
|
-
gem 'red-parquet', '~>
|
72
|
-
gem 'rover-df', '~> 0.3.0' # Optional, if you use IO from/to Rover::DataFrame
|
75
|
+
gem 'red-parquet', '~> 12.0.0' # Optional, if you use IO from/to parquet
|
73
76
|
gem 'red-datasets-arrow' # Optional, recommended if you use Red Datasets
|
74
77
|
gem 'red-arrow-numo-narray' # Optional, recommended if you use inputs from Numo::NArray
|
78
|
+
gem 'red-arrow-activerecord' # Optional, if you use Active Record
|
79
|
+
gem 'rover-df', '~> 0.3.0' # Optional, if you use IO from/to Rover::DataFrame
|
75
80
|
```
|
76
81
|
|
77
82
|
And then execute `bundle install` or install them yourself such as `gem install red_amber`.
|
78
83
|
|
79
84
|
## Docker image and Jupyter Notebook
|
80
85
|
|
81
|
-
Docker image is available from docker folder. See [readme](docker/readme.md) for instruction. Integrated Jypyter notebook is in docker/notebook folder.
|
86
|
+
Docker image is available from `docker` folder. See [readme](docker/readme.md) for instruction. Integrated Jupyter notebook is in docker/notebook folder.
|
82
87
|
|
83
88
|
You can try the contents of this README interactively by [Binder](https://mybinder.org/v2/gh/heronshoes/docker-stacks/RedAmber-binder?filepath=red-amber.ipynb).
|
84
89
|
[](https://mybinder.org/v2/gh/heronshoes/docker-stacks/RedAmber-binder?filepath=red-amber.ipynb)
|
data/benchmark/group.yml
CHANGED
@@ -1,18 +1,19 @@
|
|
1
1
|
loop_count: 3
|
2
2
|
|
3
3
|
contexts:
|
4
|
-
- name: 0.2.2
|
5
|
-
gems:
|
6
|
-
red_amber: 0.2.2
|
7
4
|
- name: 0.3.0
|
8
5
|
gems:
|
9
6
|
red_amber: 0.3.0
|
7
|
+
- name: 0.4.2
|
8
|
+
gems:
|
9
|
+
red_amber: 0.4.2
|
10
10
|
- name: HEAD
|
11
11
|
prelude: |
|
12
12
|
$LOAD_PATH.unshift(File.expand_path('lib'))
|
13
13
|
|
14
14
|
prelude: |
|
15
15
|
require 'red_amber'
|
16
|
+
include RedAmber
|
16
17
|
require 'datasets-arrow'
|
17
18
|
|
18
19
|
ds = Datasets::Rdatasets.new('nycflights13', 'flights')
|
@@ -32,8 +33,14 @@ benchmark:
|
|
32
33
|
'G03: sum arr_delay, mean distance by flight': |
|
33
34
|
df.group(:flight) { [sum(:arr_delay), mean(:distance)] }
|
34
35
|
|
35
|
-
'G04:
|
36
|
+
'G04: mean air_time, distance by flight': |
|
36
37
|
df.group(:flight).mean(:air_time, :distance)
|
37
38
|
|
38
|
-
'
|
39
|
+
'G05: sum dep_delay, arr_delay by carrier': |
|
39
40
|
df.group(:carrier).sum(:dep_delay, :arr_delay)
|
41
|
+
|
42
|
+
'G06: filters': |
|
43
|
+
Group.new(df, :dest).filters
|
44
|
+
|
45
|
+
'G07: inspect': |
|
46
|
+
Group.new(df, :dest).inspect
|
data/docker/Gemfile
CHANGED
@@ -5,11 +5,11 @@ source 'https://rubygems.org'
|
|
5
5
|
gem 'irb'
|
6
6
|
|
7
7
|
gem 'numo-narray'
|
8
|
-
gem 'red-arrow', '~>
|
8
|
+
gem 'red-arrow', '~> 12.0.0'
|
9
9
|
gem 'red-arrow-numo-narray'
|
10
|
-
gem 'red-parquet', '~>
|
10
|
+
gem 'red-parquet', '~> 12.0.0'
|
11
11
|
|
12
|
-
gem 'red_amber', '
|
12
|
+
gem 'red_amber', path: '../'
|
13
13
|
gem 'red-amber-view'
|
14
14
|
gem 'rover-df'
|
15
15
|
|
@@ -18,4 +18,9 @@ gem 'red-datasets-arrow'
|
|
18
18
|
|
19
19
|
gem 'benchmark_driver'
|
20
20
|
gem 'benchmark-ips'
|
21
|
+
|
22
|
+
gem 'charty'
|
21
23
|
gem 'faker'
|
24
|
+
gem 'matplotlib'
|
25
|
+
gem 'pycall'
|
26
|
+
gem 'unicode_plot'
|
data/docker/Gemfile.lock
CHANGED
@@ -1,37 +1,66 @@
|
|
1
|
+
PATH
|
2
|
+
remote: ..
|
3
|
+
specs:
|
4
|
+
red_amber (0.5.0.pre.HEAD)
|
5
|
+
red-arrow (~> 12.0.0)
|
6
|
+
|
1
7
|
GEM
|
2
8
|
remote: https://rubygems.org/
|
3
9
|
specs:
|
4
10
|
benchmark-ips (2.12.0)
|
5
11
|
benchmark_driver (0.16.3)
|
6
12
|
bigdecimal (3.1.4)
|
13
|
+
charty (0.2.12)
|
14
|
+
matplotlib (>= 1.2.0)
|
15
|
+
pandas (>= 0.3.5)
|
16
|
+
playwright-ruby-client
|
17
|
+
red-colors (>= 0.3.0)
|
18
|
+
red-datasets (>= 0.1.2)
|
19
|
+
red-palette (>= 0.5.0)
|
7
20
|
concurrent-ruby (1.2.2)
|
8
21
|
csv (3.2.6)
|
22
|
+
enumerable-statistics (2.0.7)
|
9
23
|
extpp (0.1.1)
|
10
24
|
faker (3.1.1)
|
11
25
|
i18n (>= 1.8.11, < 2)
|
12
26
|
fiddle (1.1.1)
|
13
|
-
gio2 (4.1.
|
27
|
+
gio2 (4.1.4)
|
14
28
|
fiddle
|
15
|
-
gobject-introspection (= 4.1.
|
16
|
-
glib2 (4.1.
|
29
|
+
gobject-introspection (= 4.1.4)
|
30
|
+
glib2 (4.1.4)
|
17
31
|
native-package-installer (>= 1.0.3)
|
18
32
|
pkg-config (>= 1.3.5)
|
19
|
-
gobject-introspection (4.1.
|
20
|
-
glib2 (= 4.1.
|
33
|
+
gobject-introspection (4.1.4)
|
34
|
+
glib2 (= 4.1.4)
|
21
35
|
i18n (1.12.0)
|
22
36
|
concurrent-ruby (~> 1.0)
|
23
37
|
io-console (0.6.0)
|
24
|
-
irb (1.6.
|
38
|
+
irb (1.6.4)
|
25
39
|
reline (>= 0.3.0)
|
26
40
|
libui (0.0.15)
|
41
|
+
matplotlib (1.3.0)
|
42
|
+
pycall (>= 1.0.0)
|
43
|
+
matrix (0.4.2)
|
44
|
+
mime-types (3.4.1)
|
45
|
+
mime-types-data (~> 3.2015)
|
46
|
+
mime-types-data (3.2023.0218.1)
|
27
47
|
native-package-installer (1.1.5)
|
28
48
|
numo-narray (0.9.2.1)
|
49
|
+
numpy (0.4.0)
|
50
|
+
pycall (>= 1.2.0.beta1)
|
51
|
+
pandas (0.3.8)
|
52
|
+
numpy
|
53
|
+
pycall (>= 1.0.0)
|
29
54
|
pkg-config (1.5.1)
|
55
|
+
playwright-ruby-client (1.31.1)
|
56
|
+
concurrent-ruby (>= 1.1.6)
|
57
|
+
mime-types (>= 3.0)
|
58
|
+
pycall (1.4.2)
|
30
59
|
red-amber-view (0.0.1)
|
31
60
|
libui
|
32
61
|
red-arrow
|
33
62
|
red_amber
|
34
|
-
red-arrow (
|
63
|
+
red-arrow (12.0.0)
|
35
64
|
bigdecimal (>= 3.1.0)
|
36
65
|
extpp (>= 0.1.1)
|
37
66
|
gio2 (>= 3.5.0)
|
@@ -40,6 +69,8 @@ GEM
|
|
40
69
|
red-arrow-numo-narray (0.0.6)
|
41
70
|
numo-narray
|
42
71
|
red-arrow
|
72
|
+
red-colors (0.3.0)
|
73
|
+
matrix
|
43
74
|
red-datasets (0.1.5)
|
44
75
|
csv (>= 3.2.4)
|
45
76
|
rexml
|
@@ -47,34 +78,41 @@ GEM
|
|
47
78
|
red-datasets-arrow (0.0.3)
|
48
79
|
red-arrow
|
49
80
|
red-datasets (>= 0.0.3)
|
50
|
-
red-
|
51
|
-
red-
|
52
|
-
|
53
|
-
red-arrow (
|
54
|
-
reline (0.3.
|
81
|
+
red-palette (0.5.0)
|
82
|
+
red-colors (>= 0.3.0)
|
83
|
+
red-parquet (12.0.0)
|
84
|
+
red-arrow (= 12.0.0)
|
85
|
+
reline (0.3.3)
|
55
86
|
io-console (~> 0.5)
|
56
87
|
rexml (3.2.5)
|
57
88
|
rover-df (0.3.4)
|
58
89
|
numo-narray (>= 0.9.1.9)
|
59
90
|
rubyzip (2.3.2)
|
91
|
+
unicode_plot (0.0.5)
|
92
|
+
enumerable-statistics (>= 2.0.1)
|
60
93
|
|
61
94
|
PLATFORMS
|
95
|
+
x86_64-darwin-20
|
62
96
|
x86_64-linux
|
63
97
|
|
64
98
|
DEPENDENCIES
|
65
99
|
benchmark-ips
|
66
100
|
benchmark_driver
|
101
|
+
charty
|
67
102
|
faker
|
68
103
|
irb
|
104
|
+
matplotlib
|
69
105
|
numo-narray
|
106
|
+
pycall
|
70
107
|
red-amber-view
|
71
|
-
red-arrow (~>
|
108
|
+
red-arrow (~> 12.0.0)
|
72
109
|
red-arrow-numo-narray
|
73
110
|
red-datasets
|
74
111
|
red-datasets-arrow
|
75
|
-
red-parquet (~>
|
76
|
-
red_amber
|
112
|
+
red-parquet (~> 12.0.0)
|
113
|
+
red_amber!
|
77
114
|
rover-df
|
115
|
+
unicode_plot
|
78
116
|
|
79
117
|
BUNDLED WITH
|
80
|
-
2.4.
|
118
|
+
2.4.12
|