red_amber 0.5.0 → 0.5.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,90 @@
1
+ ---
2
+ title: RedAmber Examples
3
+ date: 2023-08-06
4
+ author: heronshoes
5
+ jupyter: ruby
6
+ format:
7
+ pdf:
8
+ toc: true
9
+ ---
10
+
11
+ This notebook walks through the [README of RedAmber](https://github.com/heronshoes/red_amber#readme).
12
+
13
+ ## `RedAmber::DataFrame`
14
+
15
+ ```{ruby}
16
+ #| tags: []
17
+ require 'red_amber'
18
+ include RedAmber
19
+ require 'datasets-arrow'
20
+
21
+ {RedAmber: VERSION, Datasets: Datasets::VERSION}
22
+ ```
23
+
24
+ ## Example: diamonds dataset
25
+
26
+ The first time Datasets::Diamonds is loaded, downloading the data will take some time.
27
+
28
+ ```{ruby}
29
+ #| tags: []
30
+ dataset = Datasets::Diamonds.new
31
+ diamonds = DataFrame.new(dataset)
32
+ ```
33
+
34
+ ```{ruby}
35
+ #| tags: []
36
+ df = diamonds
37
+ .slice { carat > 1 } # or use #filter instead of #slice
38
+ .group(:cut)
39
+ .mean(:price) # `pick` prior to `group` is not required if `:price` is specified here.
40
+ .sort('-mean(price)')
41
+ ```
42
+
43
+ ```{ruby}
44
+ #| tags: []
45
+ usdjpy = 110.0 # when the yen was stronger
46
+
47
+ df.rename('mean(price)': :mean_price_USD)
48
+ .assign(:mean_price_JPY) { mean_price_USD * usdjpy }
49
+ ```
50
+
51
+ ## Example: starwars dataset
52
+
53
+ ```{ruby}
54
+ #| tags: []
55
+ uri = URI('https://vincentarelbundock.github.io/Rdatasets/csv/dplyr/starwars.csv')
56
+
57
+ starwars = DataFrame.load(uri)
58
+ ```
59
+
60
+ ```{ruby}
61
+ #| tags: []
62
+ starwars
63
+ .drop(0) # delete unnecessary index column
64
+ .remove { species == "NA" } # delete unnecessary rows
65
+ .group(:species) { [count(:species), mean(:height, :mass)] }
66
+ .slice { count > 1 } # or use #filter instead of slice
67
+ ```
68
+
69
+ ## `RedAmber::Vector`
70
+
71
+ ```{ruby}
72
+ #| tags: []
73
+ penguins = DataFrame.new(Datasets::Penguins.new)
74
+ ```
75
+
76
+ ```{ruby}
77
+ #| tags: []
78
+ penguins[:bill_length_mm]
79
+ ```
80
+
81
+ ```{ruby}
82
+ #| tags: []
83
+ penguins[:bill_length_mm] < 40
84
+ ```
85
+
86
+ ```{ruby}
87
+ #| tags: []
88
+ penguins[:bill_length_mm].mean
89
+ ```
90
+
data/docker/Dockerfile CHANGED
@@ -1,5 +1,5 @@
1
- # x86-64/Ubuntu-22.04/python-3.10.9/lab-3.6.1/notebook-6.5.3/2023-03-13
2
- ARG BASE_IMAGE_TAG=295612d3ade4
1
+ # x86-64/Ubuntu-22.04/python-3.10.11/lab-3.6.3/notebook-6.5.4/2023-05-15
2
+ ARG BASE_IMAGE_TAG=513d0cb8a67c
3
3
 
4
4
  FROM jupyter/minimal-notebook:$BASE_IMAGE_TAG
5
5
 
data/docker/Gemfile CHANGED
@@ -9,7 +9,7 @@ gem 'red-arrow', '~> 12.0.0'
9
9
  gem 'red-arrow-numo-narray'
10
10
  gem 'red-parquet', '~> 12.0.0'
11
11
 
12
- gem 'red_amber', path: '../'
12
+ gem 'red_amber'
13
13
  gem 'red-amber-view'
14
14
  gem 'rover-df'
15
15
 
@@ -17,5 +17,5 @@ services:
17
17
  ports:
18
18
  - '8888:8888'
19
19
  volumes:
20
- - ./notebook:/home/$NB_USER/work
20
+ - ../doc/notebook:/home/$NB_USER/work
21
21
  command: start-notebook.sh --NotebookApp.token=$TOKEN
data/docker/readme.md CHANGED
@@ -6,12 +6,12 @@ This is a docker image containing RedAmber created from
6
6
  ## Contents
7
7
 
8
8
  - From jupyter/minimal-notebook:
9
- - Based on 2023-03-13 (295612d3ade4)
9
+ - Based on 2023-05-15 (513d0cb8a67c)
10
10
  - x86-64
11
11
  - Ubuntu-22.04
12
- - python-3.10.9
13
- - lab-3.6.1
14
- - notebook-6.5.3
12
+ - python-3.10.11
13
+ - lab-3.6.3
14
+ - notebook-6.5.4
15
15
  - System ruby-dev:
16
16
  - Ruby 3.0.2
17
17
  - Arrow 11.0.0 for Ubuntu:
@@ -22,7 +22,7 @@ This is a docker image containing RedAmber created from
22
22
  - Locally installed iruby:
23
23
  - Using Ruby 3.0.2
24
24
  - Locally installed bundler and Gemfile:
25
- - RedAmber 0.4.1
25
+ - RedAmber 0.5.0
26
26
  - Others (see Gemfile)
27
27
 
28
28
  ## Install
@@ -540,7 +540,7 @@ module RedAmber
540
540
  in ''
541
541
  '""'
542
542
  in String
543
- element.sub(/^(\s+)$/, '"\1"') # blank spaces
543
+ element.sub(/\A(\s+)$/, '"\1"') # blank spaces
544
544
  in Float
545
545
  format('%g', element)
546
546
  in Integer
@@ -44,7 +44,7 @@ module RedAmber
44
44
  # BUFFER
45
45
  #
46
46
  # @example Load from a Buffer skipping comment line
47
- # DataFrame.load(Arrow::Buffer.new(<<~BUFFER), format: :csv, skip_lines: /^#/)
47
+ # DataFrame.load(Arrow::Buffer.new(<<~BUFFER), format: :csv, skip_lines: /\A#/)
48
48
  # # comment
49
49
  # name,age
50
50
  # Yasuko,68
@@ -39,7 +39,7 @@ module RedAmber
39
39
  # penguins[:bill_length_mm]
40
40
  #
41
41
  # # =>
42
- # #<RedAmber::Vector(:double, size=344):0x00000000000104dc>
42
+ # #<RedAmber::Vector(:double, size=344, chunked):0x0000000000008f0c>
43
43
  # [39.1, 39.5, 40.3, nil, 36.7, 39.3, 38.9, 39.2, 34.1, 42.0, 37.8, 37.8, 41.1, ... ]
44
44
  #
45
45
  # @overload [](keys)
@@ -173,7 +173,7 @@ module RedAmber
173
173
  # penguins.v(:bill_length_mm)
174
174
  #
175
175
  # # =>
176
- # #<RedAmber::Vector(:double, size=344):0x000000000000f140>
176
+ # #<RedAmber::Vector(:double, size=344, chunked):0x0000000000008f0c>
177
177
  # [39.1, 39.5, 40.3, nil, 36.7, 39.3, 38.9, 39.2, 34.1, 42.0, 37.8, 37.8, 41.1, ... ]
178
178
  #
179
179
  def v(key)
@@ -44,7 +44,7 @@ module RedAmber
44
44
  # languages[:Language]
45
45
  #
46
46
  # # =>
47
- # #<RedAmber::Vector(:string, size=4):0x000000000010359c>
47
+ # #<RedAmber::Vector(:string, size=4, chunked):0x000000000010359c>
48
48
  # ["Ruby", "Python", "R", "Rust"]
49
49
  #
50
50
  # @overload pick(booleans)
@@ -512,8 +512,8 @@ module RedAmber
512
512
  # 1 Rui 49 78 (nil)
513
513
  # 2 Hinata 28 57 Momotaro
514
514
  #
515
- def assign(*assigner, &block)
516
- assign_update(*assigner, append_to_left: false, &block)
515
+ def assign(...)
516
+ assign_update(false, ...)
517
517
  end
518
518
 
519
519
  # Assign new or updated variables (columns) and create an updated DataFrame.
@@ -583,13 +583,13 @@ module RedAmber
583
583
  # @return [DataFrame]
584
584
  # assigned DataFrame.
585
585
  #
586
- def assign_left(*assigner, &block)
587
- assign_update(*assigner, append_to_left: true, &block)
586
+ def assign_left(...)
587
+ assign_update(true, ...)
588
588
  end
589
589
 
590
590
  private
591
591
 
592
- def assign_update(*assigner, append_to_left: false, &block)
592
+ def assign_update(append_to_left, *assigner, &block)
593
593
  if block
594
594
  assigner_from_block = instance_eval(&block)
595
595
  assigner =
@@ -26,12 +26,7 @@ module RedAmber
26
26
  private
27
27
 
28
28
  # @!macro [attach] define_group_aggregation
29
- # @!method $1(*summary_keys)
30
- # Group aggregation function `$1`.
31
- # @param summary_keys [Array<Symbol, String>]
32
- # summary keys.
33
- # @return [DataFrame]
34
- # aggregated DataFrame
29
+ # Returns aggregated DataFrame.
35
30
  #
36
31
  def define_group_aggregation(function)
37
32
  define_method(function) do |*summary_keys|
@@ -55,7 +50,7 @@ module RedAmber
55
50
  # @param group_keys [Array<Symbol, String>]
56
51
  # keys for grouping.
57
52
  # @return [Group]
58
- # Group object.
53
+ # Group object. Inspecting it shows the grouped columns and their counts.
59
54
  # @example
60
55
  # Group.new(penguins, :species)
61
56
  #
@@ -79,13 +74,93 @@ module RedAmber
79
74
  @group = @dataframe.table.group(*@group_keys)
80
75
  end
81
76
 
82
- define_group_aggregation(:count)
77
+ # @!macro group_aggregation
78
+ # @param group_keys [Array<Symbol, String>]
79
+ # keys of the columns to aggregate (not the grouping keys).
80
+ # @return [DataFrame]
81
+ # aggregated DataFrame
82
+
83
+ # Whether all elements in each group evaluate to true.
84
+ #
85
+ # @!method all(*group_keys)
86
+ # @macro group_aggregation
87
+ # @example For boolean columns by default.
88
+ # dataframe
89
+ #
90
+ # # =>
91
+ # #<RedAmber::DataFrame : 6 x 3 Vectors, 0x00000000000230dc>
92
+ # x y z
93
+ # <uint8> <string> <boolean>
94
+ # 0 1 A false
95
+ # 1 2 A true
96
+ # 2 3 B false
97
+ # 3 4 B (nil)
98
+ # 4 5 B true
99
+ # 5 6 C false
100
+ #
101
+ # dataframe.group(:y).all
102
+ #
103
+ # # =>
104
+ # #<RedAmber::DataFrame : 3 x 2 Vectors, 0x000000000000fc08>
105
+ # y all(z)
106
+ # <string> <boolean>
107
+ # 0 A false
108
+ # 1 B false
109
+ # 2 C false
110
+ #
111
+ define_group_aggregation :all
112
+
113
+ # Whether any elements in each group evaluate to true.
114
+ #
115
+ # @!method any(*group_keys)
116
+ # @macro group_aggregation
117
+ # @example For boolean columns by default.
118
+ # dataframe.group(:y).any
119
+ #
120
+ # # =>
121
+ # #<RedAmber::DataFrame : 3 x 2 Vectors, 0x00000000000117ec>
122
+ # y any(z)
123
+ # <string> <boolean>
124
+ # 0 A true
125
+ # 1 B true
126
+ # 2 C false
127
+ #
128
+ define_group_aggregation :any
129
+
130
+ # Count the number of non-nil values in each group.
131
+ # If counts are the same (and do not include NaN or nil),
132
+ # columns for counts are unified.
133
+ #
134
+ # @!method count(*group_keys)
135
+ # @macro group_aggregation
136
+ # @example Show counts for each group.
137
+ # dataframe.group(:y).count
138
+ #
139
+ # # =>
140
+ # #<RedAmber::DataFrame : 3 x 3 Vectors, 0x000000000011ea04>
141
+ # y count(x) count(z)
142
+ # <string> <int64> <int64>
143
+ # 0 A 2 2
144
+ # 1 B 3 2
145
+ # 2 C 1 1
146
+ #
147
+ # dataframe.group(:z).count
148
+ # # same as dataframe.group(:z).count(:x, :y)
149
+ #
150
+ # # =>
151
+ # #<RedAmber::DataFrame : 3 x 2 Vectors, 0x0000000000122834>
152
+ # z count
153
+ # <boolean> <int64>
154
+ # 0 false 3
155
+ # 1 true 2
156
+ # 2 (nil) 1
157
+ #
158
+ define_group_aggregation :count
83
159
  alias_method :__count, :count
84
160
  private :__count
85
161
 
86
- def count(*summary_keys)
87
- df = __count(summary_keys)
88
- # if counts are the same (and do not include NaN or nil), aggregate count columns.
162
+ def count(*group_keys)
163
+ df = __count(group_keys)
89
164
  if df.pick(@group_keys.size..).to_h.values.uniq.size == 1
90
165
  df.pick(0..@group_keys.size).rename { [keys[-1], :count] }
91
166
  else
@@ -93,19 +168,213 @@ module RedAmber
93
168
  end
94
169
  end
95
170
 
96
- define_group_aggregation(:sum)
171
+ # Returns each record group size as a DataFrame.
172
+ #
173
+ # @return [DataFrame]
174
+ # DataFrame consists of:
175
+ # - Group key columns.
176
+ # - Result columns by group aggregation.
177
+ # @example
178
+ # penguins.group(:species).group_count
179
+ #
180
+ # # =>
181
+ # #<RedAmber::DataFrame : 3 x 2 Vectors, 0x0000000000003a70>
182
+ # species group_count
183
+ # <string> <uint8>
184
+ # 0 Adelie 152
185
+ # 1 Chinstrap 68
186
+ # 2 Gentoo 124
187
+ #
188
+ def group_count
189
+ DataFrame.create(group_table)
190
+ end
191
+ alias_method :count_all, :group_count
192
+
193
+ # Count the unique values in each group.
194
+ #
195
+ # @!method count_uniq(*group_keys)
196
+ # @macro group_aggregation
197
+ # @example Show counts for each group.
198
+ # dataframe.group(:y).count_uniq
199
+ #
200
+ # # =>
201
+ # #<RedAmber::DataFrame : 3 x 3 Vectors, 0x000000000011ea04>
202
+ # y count_uniq(x)
203
+ # <string> <int64>
204
+ # 0 A 2
205
+ # 1 B 3
206
+ # 2 C 1
207
+ #
208
+ define_group_aggregation :count_distinct
209
+ def count_uniq(*group_keys)
210
+ df = count_distinct(*group_keys)
211
+ df.rename do
212
+ keys_org = keys.select { _1.start_with?('count_distinct') }
213
+ keys_renamed = keys_org.map { _1.to_s.gsub('distinct', 'uniq') }
214
+ keys_org.zip keys_renamed
215
+ end
216
+ end
217
+
218
+ # Compute maximum of values in each group for numeric columns.
219
+ #
220
+ # @!method max(*group_keys)
221
+ # @macro group_aggregation
222
+ # @example
223
+ # dataframe.group(:y).max
224
+ #
225
+ # # =>
226
+ # #<RedAmber::DataFrame : 3 x 2 Vectors, 0x000000000014ae74>
227
+ # y max(x)
228
+ # <string> <uint8>
229
+ # 0 A 2
230
+ # 1 B 5
231
+ # 2 C 6
232
+ #
233
+ define_group_aggregation :max
234
+
235
+ # Compute mean of values in each group for numeric columns.
236
+ #
237
+ # @!method mean(*group_keys)
238
+ # @macro group_aggregation
239
+ # @example
240
+ # dataframe.group(:y).mean
241
+ #
242
+ # # =>
243
+ # #<RedAmber::DataFrame : 3 x 2 Vectors, 0x0000000000138a8>
244
+ # y mean(x)
245
+ # <string> <double>
246
+ # 0 A 1.5
247
+ # 1 B 4.0
248
+ # 2 C 6.0
249
+ #
250
+ define_group_aggregation :mean
251
+
252
+ # Compute median of values in each group for numeric columns.
253
+ #
254
+ # @!method median(*group_keys)
255
+ # @macro group_aggregation
256
+ # @example
257
+ # dataframe.group(:y).median
258
+ #
259
+ # # =>
260
+ # #<RedAmber::DataFrame : 3 x 2 Vectors, 0x0000000000138a8>
261
+ # y median(x)
262
+ # <string> <double>
263
+ # 0 A 1.5
264
+ # 1 B 4.0
265
+ # 2 C 6.0
266
+ #
267
+ define_group_aggregation :approximate_median
268
+ def median(*group_keys)
269
+ df = approximate_median(*group_keys)
270
+ df.rename do
271
+ keys_org = keys.select { _1.start_with?('approximate_') }
272
+ keys_renamed = keys_org.map { _1.to_s.delete_prefix('approximate_') }
273
+ keys_org.zip keys_renamed
274
+ end
275
+ end
97
276
 
98
- define_group_aggregation(:product)
277
+ # Compute minimum of values in each group for numeric columns.
278
+ #
279
+ # @!method min(*group_keys)
280
+ # @macro group_aggregation
281
+ # @example
282
+ # dataframe.group(:y).min
283
+ #
284
+ # # =>
285
+ # #<RedAmber::DataFrame : 3 x 2 Vectors, 0x000000000018f38>
286
+ # y min(x)
287
+ # <string> <uint8>
288
+ # 0 A 1
289
+ # 1 B 3
290
+ # 2 C 6
291
+ #
292
+ define_group_aggregation :min
99
293
 
100
- define_group_aggregation(:mean)
294
+ # Get one value from each group.
295
+ #
296
+ # @!method one(*group_keys)
297
+ # @macro group_aggregation
298
+ # @example
299
+ # dataframe.group(:y).one
300
+ #
301
+ # # =>
302
+ # #<RedAmber::DataFrame : 3 x 2 Vectors, 0x000000000002885c>
303
+ # y one(x)
304
+ # <string> <uint8>
305
+ # 0 A 1
306
+ # 1 B 3
307
+ # 2 C 6
308
+ #
309
+ define_group_aggregation :one
101
310
 
102
- define_group_aggregation(:min)
311
+ # Compute product of values in each group for numeric columns.
312
+ #
313
+ # @!method product(*group_keys)
314
+ # @macro group_aggregation
315
+ # @example
316
+ # dataframe.group(:y).product
317
+ #
318
+ # # =>
319
+ # #<RedAmber::DataFrame : 3 x 2 Vectors, 0x000000000021a84>
320
+ # y product(x)
321
+ # <string> <uint64>
322
+ # 0 A 2
323
+ # 1 B 60
324
+ # 2 C 6
325
+ #
326
+ define_group_aggregation :product
103
327
 
104
- define_group_aggregation(:max)
328
+ # Compute standard deviation of values in each group for numeric columns.
329
+ #
330
+ # @!method stddev(*group_keys)
331
+ # @macro group_aggregation
332
+ # @example
333
+ # dataframe.group(:y).stddev
334
+ #
335
+ # # =>
336
+ # #<RedAmber::DataFrame : 3 x 2 Vectors, 0x00000000002be6c>
337
+ # y stddev(x)
338
+ # <string> <double>
339
+ # 0 A 0.5
340
+ # 1 B 0.816
341
+ # 2 C 0.0
342
+ #
343
+ define_group_aggregation :stddev
105
344
 
106
- define_group_aggregation(:stddev)
345
+ # Compute sum of values in each group for numeric columns.
346
+ #
347
+ # @!method sum(*group_keys)
348
+ # @macro group_aggregation
349
+ # @example
350
+ # dataframe.group(:y).sum
351
+ #
352
+ # # =>
353
+ # #<RedAmber::DataFrame : 3 x 2 Vectors, 0x000000000032a14>
354
+ # y sum(x)
355
+ # <string> <uint64>
356
+ # 0 A 3
357
+ # 1 B 12
358
+ # 2 C 6
359
+ #
360
+ define_group_aggregation :sum
107
361
 
108
- define_group_aggregation(:variance)
362
+ # Compute variance of values in each group for numeric columns.
363
+ #
364
+ # @!method variance(*group_keys)
365
+ # @macro group_aggregation
366
+ # @example
367
+ # dataframe.group(:y).variance
368
+ #
369
+ # # =>
370
+ # #<RedAmber::DataFrame : 3 x 2 Vectors, 0x00000000003b1dc>
371
+ # y variance(x)
372
+ # <string> <double>
373
+ # 0 A 0.25
374
+ # 1 B 0.667
375
+ # 2 C 0.0
376
+ #
377
+ define_group_aggregation :variance
109
378
 
110
379
  # Returns Array of boolean filters to select each records in the Group.
111
380
  #
@@ -168,27 +437,6 @@ module RedAmber
168
437
  @filters.size
169
438
  end
170
439
 
171
- # Returns each record group size as a DataFrame.
172
- #
173
- # @return [DataFrame]
174
- # DataFrame consists of:
175
- # - Group key columns.
176
- # - Result columns by group aggregation.
177
- # @example
178
- # penguins.group(:species).group_count
179
- #
180
- # # =>
181
- # #<RedAmber::DataFrame : 3 x 2 Vectors, 0x0000000000003a70>
182
- # species group_count
183
- # <string> <uint8>
184
- # 0 Adelie 152
185
- # 1 Chinstrap 68
186
- # 2 Gentoo 124
187
- #
188
- def group_count
189
- DataFrame.create(group_table)
190
- end
191
-
192
440
  # String representation of self.
193
441
  #
194
442
  # @return [String]
@@ -1,7 +1,7 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module RedAmber
4
- # class SubFrames treats a set of subsets of a DataFrame
4
+ # class SubFrames treats subsets of a DataFrame
5
5
  # [Experimental feature] Class SubFrames may be removed or be changed in the future.
6
6
  class SubFrames
7
7
  include Enumerable # may change to use Forwardable.
@@ -434,7 +434,7 @@ module RedAmber
434
434
  # @return [DataFrame]
435
435
  # created DataFrame.
436
436
  # @example Aggregate by key labels in arguments and values from block.
437
- # subframes.aggregate(:y, :sum_x) { [y.first, x.sum] }
437
+ # subframes.aggregate(:y, :sum_x) { [y.one, x.sum] }
438
438
  #
439
439
  # # =>
440
440
  # #<RedAmber::DataFrame : 3 x 2 Vectors, 0x0000000000003b24>
@@ -445,7 +445,7 @@ module RedAmber
445
445
  # 2 C 6
446
446
  #
447
447
  # @example Aggregate by key labels in an Array and values from block.
448
- # subframes.aggregate([:y, :sum_x]) { [y.first, x.sum] }
448
+ # subframes.aggregate([:y, :sum_x]) { [y.one, x.sum] }
449
449
  #
450
450
  # # =>
451
451
  # #<RedAmber::DataFrame : 3 x 2 Vectors, 0x0000000000003b24>
@@ -457,7 +457,7 @@ module RedAmber
457
457
  #
458
458
  # @overload aggregate
459
459
  #
460
- # Aggregate SubFrames creating DataFrame with pairs of key and aggregated value
460
+ # Aggregate SubFrames creating DataFrame with pairs of key and aggregated values
461
461
  # in Hash from the block.
462
462
  #
463
463
  # @yieldparam dataframe [DataFrame]
@@ -470,7 +470,7 @@ module RedAmber
470
470
  # created DataFrame.
471
471
  # @example Aggregate by key and value pairs from block.
472
472
  # subframes.aggregate do
473
- # { y: y.first, sum_x: x.sum }
473
+ # { y: y.one, sum_x: x.sum }
474
474
  # end
475
475
  #
476
476
  # # =>
@@ -712,7 +712,7 @@ module RedAmber
712
712
  # @example
713
713
  # subframes.assign(:sum_x, :frac_x) do
714
714
  # group_sum = x.sum
715
- # [[group_sum] * size, x / s.to_f]
715
+ # [[group_sum] * size, x / group_sum.to_f]
716
716
  # end
717
717
  #
718
718
  # # =>
@@ -180,7 +180,8 @@ module RedAmber
180
180
  end
181
181
  sio << ']'
182
182
 
183
- format "#<#{self.class}(:#{type}, size=#{size}):0x%016x>\n%s\n",
183
+ chunked = chunked? ? ', chunked' : ''
184
+ format "#<#{self.class}(:#{type}, size=#{size}#{chunked}):0x%016x>\n%s\n",
184
185
  object_id, sio.string
185
186
  end
186
187
  end