red_amber 0.2.1 → 0.2.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (58) hide show
  1. checksums.yaml +4 -4
  2. data/.rubocop.yml +15 -0
  3. data/CHANGELOG.md +170 -20
  4. data/Gemfile +4 -2
  5. data/README.md +121 -302
  6. data/benchmark/basic.yml +79 -0
  7. data/benchmark/combine.yml +63 -0
  8. data/benchmark/drop_nil.yml +15 -3
  9. data/benchmark/group.yml +33 -0
  10. data/benchmark/reshape.yml +27 -0
  11. data/benchmark/{csv_load_penguins.yml → rover/csv_load_penguins.yml} +3 -3
  12. data/benchmark/rover/flights.yml +23 -0
  13. data/benchmark/rover/penguins.yml +23 -0
  14. data/benchmark/rover/planes.yml +23 -0
  15. data/benchmark/rover/weather.yml +23 -0
  16. data/doc/DataFrame.md +611 -318
  17. data/doc/Vector.md +31 -36
  18. data/doc/image/basic_verbs.png +0 -0
  19. data/doc/image/dataframe/assign.png +0 -0
  20. data/doc/image/dataframe/assign_operation.png +0 -0
  21. data/doc/image/dataframe/drop.png +0 -0
  22. data/doc/image/dataframe/join.png +0 -0
  23. data/doc/image/dataframe/pick.png +0 -0
  24. data/doc/image/dataframe/pick_operation.png +0 -0
  25. data/doc/image/dataframe/remove.png +0 -0
  26. data/doc/image/dataframe/rename.png +0 -0
  27. data/doc/image/dataframe/rename_operation.png +0 -0
  28. data/doc/image/dataframe/reshaping_DataFrames.png +0 -0
  29. data/doc/image/dataframe/set_and_bind.png +0 -0
  30. data/doc/image/dataframe/slice.png +0 -0
  31. data/doc/image/dataframe/slice_operation.png +0 -0
  32. data/doc/image/dataframe_model.png +0 -0
  33. data/doc/image/group_operation.png +0 -0
  34. data/doc/image/replace-if_then.png +0 -0
  35. data/doc/image/reshaping_dataframe.png +0 -0
  36. data/doc/image/screenshot.png +0 -0
  37. data/doc/image/vector/binary_element_wise.png +0 -0
  38. data/doc/image/vector/unary_aggregation.png +0 -0
  39. data/doc/image/vector/unary_aggregation_w_option.png +0 -0
  40. data/doc/image/vector/unary_element_wise.png +0 -0
  41. data/lib/red_amber/data_frame.rb +16 -42
  42. data/lib/red_amber/data_frame_combinable.rb +283 -0
  43. data/lib/red_amber/data_frame_displayable.rb +58 -3
  44. data/lib/red_amber/data_frame_loadsave.rb +36 -0
  45. data/lib/red_amber/data_frame_reshaping.rb +8 -6
  46. data/lib/red_amber/data_frame_selectable.rb +9 -9
  47. data/lib/red_amber/data_frame_variable_operation.rb +27 -21
  48. data/lib/red_amber/group.rb +100 -17
  49. data/lib/red_amber/helper.rb +20 -30
  50. data/lib/red_amber/vector.rb +56 -30
  51. data/lib/red_amber/vector_functions.rb +0 -8
  52. data/lib/red_amber/vector_selectable.rb +9 -1
  53. data/lib/red_amber/vector_updatable.rb +61 -63
  54. data/lib/red_amber/version.rb +1 -1
  55. data/lib/red_amber.rb +2 -0
  56. data/red_amber.gemspec +1 -1
  57. metadata +32 -11
  58. data/doc/examples_of_red_amber.ipynb +0 -8979
data/README.md CHANGED
@@ -1,13 +1,16 @@
1
1
  # RedAmber
2
2
 
3
3
  [![Gem Version](https://badge.fury.io/rb/red_amber.svg)](https://badge.fury.io/rb/red_amber)
4
- [![Ruby](https://github.com/heronshoes/red_amber/actions/workflows/test.yml/badge.svg)](https://github.com/heronshoes/red_amber/actions/workflows/test.yml)
4
+ [![Ruby](https://github.com/heronshoes/red_amber/actions/workflows/ci.yml/badge.svg)](https://github.com/heronshoes/red_amber/actions/workflows/ci.yml)
5
+ [![Discussions](https://img.shields.io/github/discussions/heronshoes/red_amber)](https://github.com/heronshoes/red_amber/discussions)
5
6
 
6
7
  A simple dataframe library for Ruby.
7
8
 
8
9
  - Powered by [Red Arrow](https://github.com/apache/arrow/tree/master/ruby/red-arrow) [![Gitter Chat](https://badges.gitter.im/red-data-tools/en.svg)](https://gitter.im/red-data-tools/en)
9
10
  - Inspired by the dataframe library [Rover-df](https://github.com/ankane/rover)
10
11
 
12
+ ![screenshot from jupyterlab](doc/image/screenshot.png)
13
+
11
14
  ## Requirements
12
15
 
13
16
  Supported Ruby version is >= 2.7.
@@ -17,9 +20,9 @@ I recommend Ruby 3 for performance.
17
20
 
18
21
  ```ruby
19
22
  # Libraries required
20
- gem 'red-arrow', '>= 9.0.0'
23
+ gem 'red-arrow', '~> 10.0.0' # Requires Apache Arrow (see installation below)
21
24
 
22
- gem 'red-parquet', '>= 9.0.0' # Optional, if you use IO from/to parquet
25
+ gem 'red-parquet', '~> 10.0.0' # Optional, if you use IO from/to parquet
23
26
  gem 'rover-df', '~> 0.3.0' # Optional, if you use IO from/to Rover::DataFrame
24
27
  ```
25
28
 
@@ -27,368 +30,178 @@ gem 'rover-df', '~> 0.3.0' # Optional, if you use IO from/to Rover::DataFrame
27
30
 
28
31
  Install requirements before you install Red Amber.
29
32
 
30
- - Apache Arrow GLib (>= 9.0.0)
31
-
32
- - Apache Parquet GLib (>= 9.0.0) # If you use IO from/to parquet
33
+ - Apache Arrow (~> 10.0.0)
34
+ - Apache Arrow GLib (~> 10.0.0)
35
+ - Apache Parquet GLib (~> 10.0.0) # If you use IO from/to parquet
33
36
 
34
37
  See [Apache Arrow install document](https://arrow.apache.org/install/).
35
38
 
36
- Minimum installation example for the latest Ubuntu is in the ['Prepare the Apache Arrow' section in ci test](https://github.com/heronshoes/red_amber/blob/master/.github/workflows/test.yml) of Red Amber.
37
-
38
- Add this line to your Gemfile:
39
-
40
- ```ruby
39
+ - Minimum installation example for the latest Ubuntu:
40
+ ```
41
+ sudo apt update
42
+ sudo apt install -y -V ca-certificates lsb-release wget
43
+ wget https://apache.jfrog.io/artifactory/arrow/$(lsb_release --id --short | tr 'A-Z' 'a-z')/apache-arrow-apt-source-latest-$(lsb_release --codename --short).deb
44
+ sudo apt install -y -V ./apache-arrow-apt-source-latest-$(lsb_release --codename --short).deb
45
+ sudo apt update
46
+ sudo apt install -y -V libarrow-dev
47
+ sudo apt install -y -V libarrow-glib-dev
48
+ ```
49
+ - On macOS, you can install Apache Arrow C++ library using Homebrew:
50
+
51
+ ```
52
+ brew install apache-arrow
53
+ ```
54
+
55
+ and GLib (C) package with:
56
+
57
+ ```
58
+ brew install apache-arrow-glib
59
+ ```
60
+
61
+ Once Apache Arrow is installed, add these lines to your Gemfile:
62
+
63
+ ```ruby
64
+ gem 'red-arrow', '~> 10.0.0'
41
65
  gem 'red_amber'
66
+ gem 'red-parquet', '~> 10.0.0' # Optional, if you use IO from/to parquet
67
+ gem 'rover-df', '~> 0.3.0' # Optional, if you use IO from/to Rover::DataFrame
68
+ gem 'red-datasets-arrow' # Optional, recommended if you use Red Datasets
69
+ gem 'red-arrow-numo-narray' # Optional, recommended if you use inputs from Numo::NArray
42
70
  ```
43
71
 
44
- And then execute:
45
-
46
- ```shell
47
- bundle install
48
- ```
49
-
50
- Or install it yourself as:
51
-
52
- ```shell
53
- gem install red_amber
54
- ```
72
+ And then execute `bundle install` or install it yourself as `gem install red_amber`.
55
73
 
56
74
  ## Docker image and Jupyter Notebook
57
75
 
58
76
  [RubyData Docker Stacks](https://github.com/RubyData/docker-stacks) is available as a ready-to-run Docker image containing Jupyter and useful data tools as well as RedAmber (Thanks to @mrkn).
59
77
 
60
- Also you can try the contents of this README interactively by [Binder](https://mybinder.org/v2/gh/RubyData/docker-stacks/master?filepath=red-amber.ipynb).
61
- [![Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/RubyData/docker-stacks/master?filepath=red-amber.ipynb)
62
-
78
+ Also you can try the contents of this README interactively by [Binder](https://mybinder.org/v2/gh/heronshoes/docker-stacks/RedAmber-binder?filepath=red-amber.ipynb).
79
+ [![Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/heronshoes/docker-stacks/RedAmber-binder?filepath=red-amber.ipynb)
63
80
 
64
81
 
65
- ## `RedAmber::DataFrame`
82
+ ## Data frame in `RedAmber`
66
83
 
67
- It represents a set of data in 2D-shape. The entity is a Red Arrow's Table object.
84
+ Class `RedAmber::DataFrame` represents a set of data in 2D-shape.
85
+ The entity is a Red Arrow's Table object.
68
86
 
69
87
  ![dataframe model of RedAmber](doc/image/dataframe_model.png)
70
88
 
71
- ```ruby
72
- require 'red_amber' # require 'red-amber' is also OK.
73
- require 'datasets-arrow'
74
-
75
- arrow = Datasets::Penguins.new.to_arrow
76
- penguins = RedAmber::DataFrame.new(arrow)
77
-
78
- # =>
79
- #<RedAmber::DataFrame : 344 x 8 Vectors, 0x0000000000013790>
80
- species island bill_length_mm bill_depth_mm flipper_length_mm ... year
81
- <string> <string> <double> <double> <uint8> ... <uint16>
82
- 1 Adelie Torgersen 39.1 18.7 181 ... 2007
83
- 2 Adelie Torgersen 39.5 17.4 186 ... 2007
84
- 3 Adelie Torgersen 40.3 18.0 195 ... 2007
85
- 4 Adelie Torgersen (nil) (nil) (nil) ... 2007
86
- 5 Adelie Torgersen 36.7 19.3 193 ... 2007
87
- : : : : : : ... :
88
- 342 Gentoo Biscoe 50.4 15.7 222 ... 2009
89
- 343 Gentoo Biscoe 45.2 14.8 212 ... 2009
90
- 344 Gentoo Biscoe 49.9 16.1 213 ... 2009
91
- ```
92
-
93
- For example, `DataFrame#pick` accepts keys as arguments and returns a sub DataFrame.
94
-
95
- ![pick method image](doc/image/dataframe/pick.png)
89
+ Let's load the library and try some examples.
96
90
 
97
91
  ```ruby
98
- penguins.keys
99
- # =>
100
- [:species,
101
- :island,
102
- :bill_length_mm,
103
- :bill_depth_mm,
104
- :flipper_length_mm,
105
- :body_mass_g,
106
- :sex,
107
- :year]
108
-
109
- df = penguins.pick(:species, :island, :body_mass_g)
110
- df
111
-
112
- # =>
113
- #<RedAmber::DataFrame : 344 x 3 Vectors, 0x000000000003cc1c>
114
- species island body_mass_g
115
- <string> <string> <uint16>
116
- 1 Adelie Torgersen 3750
117
- 2 Adelie Torgersen 3800
118
- 3 Adelie Torgersen 3250
119
- 4 Adelie Torgersen (nil)
120
- 5 Adelie Torgersen 3450
121
- : : : :
122
- 342 Gentoo Biscoe 5750
123
- 343 Gentoo Biscoe 5200
124
- 344 Gentoo Biscoe 5400
125
- ```
126
-
127
- `DataFrame#drop` drops some columns to create a remainer DataFrame.
128
-
129
- ![drop method image](doc/image/dataframe/drop.png)
130
-
131
- You can specify by keys or a boolean array of same size as n_keys.
132
-
133
- ```ruby
134
- # Same as df.drop(:species, :island)
135
- df = df.drop(true, true, false)
136
-
137
- # =>
138
- #<RedAmber::DataFrame : 344 x 1 Vector, 0x0000000000048760>
139
- body_mass_g
140
- <uint16>
141
- 1 3750
142
- 2 3800
143
- 3 3250
144
- 4 (nil)
145
- 5 3450
146
- : :
147
- 342 5750
148
- 343 5200
149
- 344 5400
92
+ require 'red_amber' # require 'red-amber' is also OK.
93
+ include RedAmber
150
94
  ```
151
95
 
152
- Arrow data is immutable, so these methods always return an new object.
153
-
154
- `DataFrame#assign` creates new columns or update existing columns.
155
-
156
- ![assign method image](doc/image/dataframe/assign.png)
96
+ ### Example: diamonds dataset
157
97
 
158
98
  ```ruby
159
- # New column is created because ':body_mass_kg' is a new key.
160
- df.assign(:body_mass_kg => df[:body_mass_g] / 1000.0)
161
-
162
- # =>
163
- #<RedAmber::DataFrame : 344 x 2 Vectors, 0x00000000000212f0>
164
- body_mass_g body_mass_kg
165
- <uint16> <double>
166
- 1 3750 3.8
167
- 2 3800 3.8
168
- 3 3250 3.3
169
- 4 (nil) (nil)
170
- 5 3450 3.5
171
- : : :
172
- 342 5750 5.8
173
- 343 5200 5.2
174
- 344 5400 5.4
175
- ```
176
-
177
- `DataFrame#slice` selects rows (observations) to create a sub DataFrame.
99
+ require 'datasets-arrow' # to load sample data
178
100
 
179
- ![slice method image](doc/image/dataframe/slice.png)
180
-
181
- ```ruby
182
- # returns 5 rows at the start and 5 rows from the end
183
- penguins.slice(0...5, -5..-1)
101
+ dataset = Datasets::Diamonds.new
102
+ diamonds = DataFrame.new(dataset) # from v0.2.2, should be `dataset.to_arrow` if older.
184
103
 
185
104
  # =>
186
- #<RedAmber::DataFrame : 10 x 8 Vectors, 0x0000000000042be4>
187
- species island bill_length_mm bill_depth_mm flipper_length_mm ... year
188
- <string> <string> <double> <double> <uint8> ... <uint16>
189
- 1 Adelie Torgersen 39.1 18.7 181 ... 2007
190
- 2 Adelie Torgersen 39.5 17.4 186 ... 2007
191
- 3 Adelie Torgersen 40.3 18.0 195 ... 2007
192
- 4 Adelie Torgersen (nil) (nil) (nil) ... 2007
193
- 5 Adelie Torgersen 36.7 19.3 193 ... 2007
194
- : : : : : : ... :
195
- 8 Gentoo Biscoe 50.4 15.7 222 ... 2009
196
- 9 Gentoo Biscoe 45.2 14.8 212 ... 2009
197
- 10 Gentoo Biscoe 49.9 16.1 213 ... 2009
105
+ #<RedAmber::DataFrame : 53940 x 10 Vectors, 0x000000000000f668>
106
+ carat cut color clarity depth table price x ... z
107
+ <double> <string> <string> <string> <double> <double> <uint16> <double> ... <double>
108
+ 0 0.23 Ideal E SI2 61.5 55.0 326 3.95 ... 2.43
109
+ 1 0.21 Premium E SI1 59.8 61.0 326 3.89 ... 2.31
110
+ 2 0.23 Good E VS1 56.9 65.0 327 4.05 ... 2.31
111
+ 3 0.29 Premium I VS2 62.4 58.0 334 4.2 ... 2.63
112
+ 4 0.31 Good J SI2 63.3 58.0 335 4.34 ... 2.75
113
+ : : : : : : : : : ... :
114
+ 53937 0.7 Very Good D SI1 62.8 60.0 2757 5.66 ... 3.56
115
+ 53938 0.86 Premium H SI2 61.0 58.0 2757 6.15 ... 3.74
116
+ 53939 0.75 Ideal D SI2 62.2 55.0 2757 5.83 ... 3.64
198
117
  ```
199
118
 
200
- `DataFrame#remove` rejects rows (observations) to create a remainer DataFrame.
201
-
202
- ![remove method image](doc/image/dataframe/remove.png)
119
+ For example, we can compute the mean price per cut for diamonds larger than 1 carat.
203
120
 
204
121
  ```ruby
205
- # penguins[:bill_length_mm] < 40 returns a boolean Vector
206
- penguins.remove(penguins[:bill_length_mm] < 40)
122
+ df = diamonds
123
+ .slice { carat > 1 }
124
+ .group(:cut)
125
+ .mean(:price) # `pick` prior to `group` is not required if `:price` is specified here.
126
+ .sort('-mean(price)')
207
127
 
208
128
  # =>
209
- #<RedAmber::DataFrame : 244 x 8 Vectors, 0x000000000007d6f4>
210
- species island bill_length_mm bill_depth_mm flipper_length_mm ... year
211
- <string> <string> <double> <double> <uint8> ... <uint16>
212
- 1 Adelie Torgersen 40.3 18.0 195 ... 2007
213
- 2 Adelie Torgersen (nil) (nil) (nil) ... 2007
214
- 3 Adelie Torgersen 42.0 20.2 190 ... 2007
215
- 4 Adelie Torgersen 41.1 17.6 182 ... 2007
216
- 5 Adelie Torgersen 42.5 20.7 197 ... 2007
217
- : : : : : : ... :
218
- 242 Gentoo Biscoe 50.4 15.7 222 ... 2009
219
- 243 Gentoo Biscoe 45.2 14.8 212 ... 2009
220
- 244 Gentoo Biscoe 49.9 16.1 213 ... 2009
129
+ #<RedAmber::DataFrame : 5 x 2 Vectors, 0x000000000000f67c>
130
+ cut mean(price)
131
+ <string> <double>
132
+ 0 Ideal 8674.23
133
+ 1 Premium 8487.25
134
+ 2 Very Good 8340.55
135
+ 3 Good 7753.6
136
+ 4 Fair 7177.86
221
137
  ```
222
138
 
223
- DataFrame manipulating methods like `pick`, `drop`, `slice`, `remove`, `rename` and `assign` accept a block.
224
-
225
- Previous example is also OK with a block.
226
-
227
- ```ruby
228
- penguins.remove { bill_length_mm < 40 }
229
- ```
230
-
231
- Next example is an usage of block to update a column.
139
+ Arrow data is immutable, so these methods always return new objects.
140
+ The next example renames a column and creates a new column by a simple calculation.
232
141
 
233
142
  ```ruby
234
- df = RedAmber::DataFrame.new(
235
- integer: [0, 1, 2, 3, nil],
236
- float: [0.0, 1.1, 2.2, Float::NAN, nil],
237
- string: ['A', 'B', 'C', 'D', nil],
238
- boolean: [true, false, true, false, nil])
239
- df
240
-
241
- # =>
242
- #<RedAmber::DataFrame : 5 x 4 Vectors, 0x000000000003131c>
243
- integer float string boolean
244
- <uint8> <double> <string> <boolean>
245
- 1 0 0.0 A true
246
- 2 1 1.1 B false
247
- 3 2 2.2 C true
248
- 4 3 NaN D false
249
- 5 (nil) (nil) (nil) (nil)
250
-
251
- df.assign do
252
- vectors.select(&:float?).map { |v| [v.key, -v] }
253
- # => returns [[:float], [-0.0, -1.1, -2.2, NAN, nil]]
254
- end
255
-
256
- # =>
257
- #<RedAmber::DataFrame : 5 x 3 Vectors, 0x00000000000e270c>
258
- index float string
259
- <uint8> <double> <string>
260
- 1 0 -0.0 A
261
- 2 1 -1.1 B
262
- 3 2 -2.2 C
263
- 4 3 NaN D
264
- 5 (nil) (nil) (nil)
265
- ```
266
-
267
- Next example is to eliminate rows containing nil.
143
+ usdjpy = 110.0 # when the yen was stronger
268
144
 
269
- ```ruby
270
- # remove all observations containing nil
271
- nil_removed = penguins.remove { vectors.map(&:is_nil).reduce(&:|) }
272
- nil_removed.tdr
145
+ df.rename('mean(price)': :mean_price_USD)
146
+ .assign(:mean_price_JPY) { mean_price_USD * usdjpy }
273
147
 
274
148
  # =>
275
- RedAmber::DataFrame : 342 x 8 Vectors
276
- Vectors : 5 numeric, 3 strings
277
- # key type level data_preview
278
- 1 :species string 3 {"Adelie"=>151, "Chinstrap"=>68, "Gentoo"=>123}
279
- 2 :island string 3 {"Torgersen"=>51, "Biscoe"=>167, "Dream"=>124}
280
- 3 :bill_length_mm double 164 [39.1, 39.5, 40.3, 36.7, 39.3, ... ]
281
- 4 :bill_depth_mm double 80 [18.7, 17.4, 18.0, 19.3, 20.6, ... ]
282
- 5 :flipper_length_mm int64 55 [181, 186, 195, 193, 190, ... ]
283
- 6 :body_mass_g int64 94 [3750, 3800, 3250, 3450, 3650, ... ]
284
- 7 :sex string 3 {"male"=>168, "female"=>165, ""=>9}
285
- 8 :year int64 3 {2007=>109, 2008=>114, 2009=>119}
149
+ #<RedAmber::DataFrame : 5 x 3 Vectors, 0x000000000000f71c>
150
+ cut mean_price_USD mean_price_JPY
151
+ <string> <double> <double>
152
+ 0 Ideal 8674.23 954164.93
153
+ 1 Premium 8487.25 933597.34
154
+ 2 Very Good 8340.55 917460.37
155
+ 3 Good 7753.6 852896.11
156
+ 4 Fair 7177.86 789564.12
286
157
  ```
287
158
 
288
- For this frequently needed task, we can do it much simpler.
289
-
290
- ```ruby
291
- penguins.remove_nil # => same result as above
292
- ```
159
+ ### Example: starwars dataset
293
160
 
294
- `DataFrame#summary` shows summary statistics in a DataFrame.
161
+ The next example loads the `starwars` dataset from a downloaded CSV file, followed by minimal data cleansing.
295
162
 
296
163
  ```ruby
297
- puts penguins.summary.to_s(width: 82)
164
+ uri = URI('https://vincentarelbundock.github.io/Rdatasets/csv/dplyr/starwars.csv')
298
165
 
299
- # =>
300
- variables count mean std min 25% median 75% max
301
- <dictionary> <uint16> <double> <double> <double> <double> <double> <double> <double>
302
- 1 bill_length_mm 342 43.92 5.46 32.1 39.23 44.38 48.5 59.6
303
- 2 bill_depth_mm 342 17.15 1.97 13.1 15.6 17.32 18.7 21.5
304
- 3 flipper_length_mm 342 200.92 14.06 172.0 190.0 197.0 213.0 231.0
305
- 4 body_mass_g 342 4201.75 801.95 2700.0 3550.0 4031.5 4750.0 6300.0
306
- 5 year 344 2008.03 0.82 2007.0 2007.0 2008.0 2009.0 2009.0
307
- ```
166
+ starwars = DataFrame.load(uri)
308
167
 
309
- `DataFrame#group` method can be used for the grouping tasks.
310
-
311
- ```ruby
312
- starwars = RedAmber::DataFrame.load(URI("https://vincentarelbundock.github.io/Rdatasets/csv/dplyr/starwars.csv"))
313
168
  starwars
169
+ .drop(0) # delete unnecessary index column
170
+ .remove { species == "NA" } # delete unnecessary rows
171
+ .group(:species) { [count(:species), mean(:height, :mass)] }
172
+ .slice { count > 1 }
314
173
 
315
174
  # =>
316
- #<RedAmber::DataFrame : 87 x 12 Vectors, 0x000000000000607c>
317
- unnamed1 name height mass hair_color skin_color eye_color ... species
318
- <int64> <string> <int64> <double> <string> <string> <string> ... <string>
319
- 1 1 Luke Skywalker 172 77.0 blond fair blue ... Human
320
- 2 2 C-3PO 167 75.0 NA gold yellow ... Droid
321
- 3 3 R2-D2 96 32.0 NA white, blue red ... Droid
322
- 4 4 Darth Vader 202 136.0 none white yellow ... Human
323
- 5 5 Leia Organa 150 49.0 brown light brown ... Human
324
- : : : : : : : : ... :
325
- 85 85 BB8 (nil) (nil) none none black ... Droid
326
- 86 86 Captain Phasma (nil) (nil) unknown unknown unknown ... NA
327
- 87 87 Padmé Amidala 165 45.0 brown light brown ... Human
328
-
329
- starwars.group(:species) { [count(:species), mean(:height, :mass)] }
330
- .slice { count > 1 }
331
-
332
- # =>
333
- #<RedAmber::DataFrame : 9 x 4 Vectors, 0x000000000006e848>
175
+ #<RedAmber::DataFrame : 8 x 4 Vectors, 0x000000000000f848>
334
176
  species count mean(height) mean(mass)
335
177
  <string> <int64> <double> <double>
336
- 1 Human 35 176.6 82.8
337
- 2 Droid 6 131.2 69.8
338
- 3 Wookiee 2 231.0 124.0
339
- 4 Gungan 3 208.7 74.0
340
- 5 NA 4 181.3 48.0
341
- 6 Zabrak 2 173.0 80.0
342
- 7 Twi'lek 2 179.0 55.0
343
- 8 Mirialan 2 168.0 53.1
344
- 9 Kaminoan 2 221.0 88.0
178
+ 0 Human 35 176.65 82.78
179
+ 1 Droid 6 131.2 69.75
180
+ 2 Wookiee 2 231.0 124.0
181
+ 3 Gungan 3 208.67 74.0
182
+ 4 Zabrak 2 173.0 80.0
183
+ 5 Twi'lek 2 179.0 55.0
184
+ 6 Mirialan 2 168.0 53.1
185
+ 7 Kaminoan 2 221.0 88.0
345
186
  ```
346
187
 
347
188
  See [DataFrame.md](doc/DataFrame.md) for other examples and details.
348
189
 
349
190
 
350
- ## `RedAmber::Vector`
191
+ ### `Vector` for 1D data object in column
351
192
 
352
193
  Class `RedAmber::Vector` represents a series of data in the DataFrame.
353
- Method `RedAmber::DataFrame#[key]` returns a Vector with the key `key`.
354
-
355
- ```ruby
356
- penguins[:bill_length_mm]
357
- # =>
358
- #<RedAmber::Vector(:double, size=344):0x000000000000f8fc>
359
- [39.1, 39.5, 40.3, nil, 36.7, 39.3, 38.9, 39.2, 34.1, 42.0, 37.8, 37.8, 41.1, ... ]
360
- ```
361
-
362
- Vectors accepts some [functional methods from Arrow](https://arrow.apache.org/docs/cpp/compute.html).
363
-
364
- This is an element-wise comparison and returns a boolean Vector of same size.
365
-
366
- ![unary element-wise](doc/image/vector/unary_element_wise.png)
367
-
368
- ```ruby
369
- penguins[:bill_length_mm] < 40
370
-
371
- # =>
372
- #<RedAmber::Vector(:boolean, size=344):0x000000000007e7ac>
373
- [true, true, false, nil, true, true, true, true, true, false, true, true, false, ... ]
374
- ```
375
-
376
- Next example returns aggregated result.
377
-
378
- ![unary aggregation](doc/image/vector/unary_aggregation.png)
379
-
380
- ```ruby
381
- penguins[:bill_length_mm].mean
382
- 43.92192982456141
383
- # =>
384
-
385
- ```
386
194
 
387
195
  See [Vector.md](doc/Vector.md) for details.
388
196
 
389
197
  ## Jupyter notebook
390
198
 
391
- [71 Examples of Red Amber](doc/examples_of_red_amber.ipynb) shows more examples in jupyter notebook.
199
+ [83 Examples of Red Amber](https://github.com/heronshoes/docker-stacks/blob/RedAmber-binder/binder/examples_of_red_amber.ipynb)
200
+ ([raw file](https://raw.githubusercontent.com/heronshoes/docker-stacks/RedAmber-binder/binder/examples_of_red_amber.ipynb)) shows more examples in jupyter notebook.
201
+
202
+ You can try this notebook on [Binder](https://mybinder.org/v2/gh/heronshoes/docker-stacks/RedAmber-binder?filepath=examples_of_red_amber.ipynb).
203
+ [![Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/heronshoes/docker-stacks/RedAmber-binder?filepath=examples_of_red_amber.ipynb)
204
+
392
205
 
393
206
  ## Development
394
207
 
@@ -399,8 +212,14 @@ bundle install
399
212
  bundle exec rake test
400
213
  ```
401
214
 
215
+ ## Community
216
+
402
217
  I would appreciate it if you could help improve this project. Here are a few ways you can help:
403
218
 
219
+ - Let's talk in the [discussions](https://github.com/heronshoes/red_amber/discussions). [![Discussions](https://img.shields.io/github/discussions/heronshoes/red_amber)](https://github.com/heronshoes/red_amber/discussions)
220
+ - Browse Q and A, how to use, tips, etc.
221
+ - Ask questions you’re wondering about.
222
+ - Share ideas. The idea may be promoted to issues or pull requests.
404
223
  - [Report bugs or suggest new features](https://github.com/heronshoes/red_amber/issues)
405
224
  - Fix bugs and [submit pull requests](https://github.com/heronshoes/red_amber/pulls)
406
225
  - Write, clarify, or fix documentation
@@ -0,0 +1,79 @@
1
+ contexts:
2
+ - name: HEAD
3
+ prelude: |
4
+ $LOAD_PATH.unshift(File.expand_path('lib'))
5
+ - gems:
6
+ red_amber: 0.2.0
7
+ - gems:
8
+ red_amber: 0.1.5
9
+
10
+ prelude: |
11
+ require 'red_amber'
12
+ require 'datasets-arrow'
13
+
14
+ ds = Datasets::Rdatasets.new('nycflights13', 'flights')
15
+ df = RedAmber::DataFrame.new(ds.to_arrow)
16
+
17
+ slicer = df[:distance] > 1000
18
+ distance_km = df[:distance] * 1.852
19
+
20
+ benchmark:
21
+ 'B01: Pick([]) by a key name': |
22
+ df[:flight]
23
+
24
+ 'B02: Pick by index': |
25
+ df[df.keys[9]]
26
+
27
+ 'B03: Pick by key names': |
28
+ df.pick(:carrier, :flight)
29
+
30
+ 'B04: Drop by key names': |
31
+ df.drop(:year, :month, :day)
32
+
33
+ 'B05: Pick by booleans': |
34
+ df.pick(df.vectors.map(&:string?))
35
+
36
+ 'B06: Pick by a block': |
37
+ df.pick { keys.map { |key| key.end_with?('time') } }
38
+
39
+ 'B07: Slice([]) by an index': |
40
+ df[877]
41
+
42
+ 'B08: Slice by indices': |
43
+ df.slice(0...5, -5..-1)
44
+
45
+ 'B09: Slice([]) by booleans': |
46
+ df[slicer]
47
+
48
+ 'B10: Slice by booleans': |
49
+ df.slice(slicer)
50
+
51
+ 'B11: Remove by booleans': |
52
+ df.remove(slicer)
53
+
54
+ 'B12: Slice by a block': |
55
+ df.slice { slicer }
56
+
57
+ 'B13: Rename by Hash': |
58
+ df.rename(distance: :distance_mile)
59
+
60
+ 'B14: Assign an existing variable': |
61
+ df.assign(distance: distance_km)
62
+
63
+ 'B15: Assign a new variable': |
64
+ df.assign(distance_km: distance_km)
65
+
66
+ 'B16: Sort by a key': |
67
+ df.sort(:distance)
68
+
69
+ 'B17: Sort by keys': |
70
+ df.sort(:origin, '-distance')
71
+
72
+ 'B18: Convert to a Hash': |
73
+ df.to_h
74
+
75
+ 'B19: Output in TDR style': |
76
+ df.tdr
77
+
78
+ 'B20: Inspect': |
79
+ df.inspect
@@ -0,0 +1,63 @@
1
+ # --repeat-count 3
2
+
3
+ loop_count: 3
4
+
5
+ contexts:
6
+ - name: HEAD
7
+ prelude: |
8
+ $LOAD_PATH.unshift(File.expand_path('lib'))
9
+ # - gems:
10
+ # red_amber: 0.2.3
11
+
12
+ prelude: |
13
+ require 'red_amber'
14
+ include RedAmber
15
+ require 'datasets-arrow'
16
+
17
+ package = 'nycflights13'
18
+ airlines = DataFrame.new(Datasets::Rdatasets.new(package, 'airlines'))
19
+ airports = DataFrame.new(Datasets::Rdatasets.new(package, 'airports'))
20
+ flights = DataFrame.new(Datasets::Rdatasets.new(package, 'flights'))
21
+ .pick(%i[month day carrier flight tailnum origin dest air_time distance])
22
+ planes = DataFrame.new(Datasets::Rdatasets.new(package, 'planes'))
23
+ weather = DataFrame.new(Datasets::Rdatasets.new(package, 'weather'))
24
+
25
+ flights_Q1 = flights.slice { month <= 3 }
26
+ flights_Q2 = flights.slice { month > 3 }
27
+
28
+ flights_1_2 = flights_Q1.slice { month.is_in(1, 2) }
29
+ flights_1_3 = flights_Q1.slice { month.is_in(1, 3) }
30
+
31
+ flights_left = flights_Q1.pick(...5)
32
+ flights_right = flights_Q1.pick(5..)
33
+
34
+ benchmark:
35
+ 'C01: Inner join on flights_Q1 by carrier': |
36
+ flights_Q1.inner_join(airlines, :carrier)
37
+
38
+ 'C02: Full join on flights_Q1 by planes': |
39
+ flights_Q1.full_join(planes, :tailnum)
40
+
41
+ 'C03: Left join on flights_Q1 by planes': |
42
+ flights_Q1.left_join(planes, :tailnum)
43
+
44
+ 'C04: Semi join on flights_Q1 by planes': |
45
+ flights_Q1.semi_join(planes, :tailnum)
46
+
47
+ 'C05: Anti join on flights_Q1 by planes': |
48
+ flights_Q1.anti_join(planes, :tailnum)
49
+
50
+ 'C06: Intersection of flights_1_2 and flights_1_3': |
51
+ flights_1_2.intersect(flights_1_3)
52
+
53
+ 'C07: Union of flights_1_2 and flights_1_3': |
54
+ flights_1_2.union(flights_1_3)
55
+
56
+ 'C08: Difference between flights_1_2 and flights_1_3': |
57
+ flights_1_2.difference(flights_1_3)
58
+
59
+ 'C09: Concatenate flights_Q1 on flights_Q2': |
60
+ flights_Q1.concatenate(flights_Q2)
61
+
62
+ 'C10: Merge flights_Q1_right on flights_Q1_left': |
63
+ flights_left.merge(flights_right)