red_amber 0.2.1 → 0.2.3
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.rubocop.yml +15 -0
- data/CHANGELOG.md +170 -20
- data/Gemfile +4 -2
- data/README.md +121 -302
- data/benchmark/basic.yml +79 -0
- data/benchmark/combine.yml +63 -0
- data/benchmark/drop_nil.yml +15 -3
- data/benchmark/group.yml +33 -0
- data/benchmark/reshape.yml +27 -0
- data/benchmark/{csv_load_penguins.yml → rover/csv_load_penguins.yml} +3 -3
- data/benchmark/rover/flights.yml +23 -0
- data/benchmark/rover/penguins.yml +23 -0
- data/benchmark/rover/planes.yml +23 -0
- data/benchmark/rover/weather.yml +23 -0
- data/doc/DataFrame.md +611 -318
- data/doc/Vector.md +31 -36
- data/doc/image/basic_verbs.png +0 -0
- data/doc/image/dataframe/assign.png +0 -0
- data/doc/image/dataframe/assign_operation.png +0 -0
- data/doc/image/dataframe/drop.png +0 -0
- data/doc/image/dataframe/join.png +0 -0
- data/doc/image/dataframe/pick.png +0 -0
- data/doc/image/dataframe/pick_operation.png +0 -0
- data/doc/image/dataframe/remove.png +0 -0
- data/doc/image/dataframe/rename.png +0 -0
- data/doc/image/dataframe/rename_operation.png +0 -0
- data/doc/image/dataframe/reshaping_DataFrames.png +0 -0
- data/doc/image/dataframe/set_and_bind.png +0 -0
- data/doc/image/dataframe/slice.png +0 -0
- data/doc/image/dataframe/slice_operation.png +0 -0
- data/doc/image/dataframe_model.png +0 -0
- data/doc/image/group_operation.png +0 -0
- data/doc/image/replace-if_then.png +0 -0
- data/doc/image/reshaping_dataframe.png +0 -0
- data/doc/image/screenshot.png +0 -0
- data/doc/image/vector/binary_element_wise.png +0 -0
- data/doc/image/vector/unary_aggregation.png +0 -0
- data/doc/image/vector/unary_aggregation_w_option.png +0 -0
- data/doc/image/vector/unary_element_wise.png +0 -0
- data/lib/red_amber/data_frame.rb +16 -42
- data/lib/red_amber/data_frame_combinable.rb +283 -0
- data/lib/red_amber/data_frame_displayable.rb +58 -3
- data/lib/red_amber/data_frame_loadsave.rb +36 -0
- data/lib/red_amber/data_frame_reshaping.rb +8 -6
- data/lib/red_amber/data_frame_selectable.rb +9 -9
- data/lib/red_amber/data_frame_variable_operation.rb +27 -21
- data/lib/red_amber/group.rb +100 -17
- data/lib/red_amber/helper.rb +20 -30
- data/lib/red_amber/vector.rb +56 -30
- data/lib/red_amber/vector_functions.rb +0 -8
- data/lib/red_amber/vector_selectable.rb +9 -1
- data/lib/red_amber/vector_updatable.rb +61 -63
- data/lib/red_amber/version.rb +1 -1
- data/lib/red_amber.rb +2 -0
- data/red_amber.gemspec +1 -1
- metadata +32 -11
- data/doc/examples_of_red_amber.ipynb +0 -8979
data/doc/DataFrame.md
CHANGED
@@ -5,7 +5,8 @@ Class `RedAmber::DataFrame` represents 2D-data. A `DataFrame` consists with:
|
|
5
5
|
- A label is attached to `Vector`. We call it `key`.
|
6
6
|
- A `Vector` and associated `key` is grouped as a `variable`.
|
7
7
|
- `variable`s with same vector length are aligned and arranged to be a `DataFrame`.
|
8
|
-
- Each `
|
8
|
+
- Each `key` in a `DataFrame` must be unique.
|
9
|
+
- The data at the same position in each `Vector` of a `DataFrame` form a set of related data. We call it a `record` or `observation`.
|
9
10
|
|
10
11
|
![dataframe model image](doc/../image/dataframe_model.png)
|
11
12
|
|
@@ -14,30 +15,38 @@ Class `RedAmber::DataFrame` represents 2D-data. A `DataFrame` consists with:
|
|
14
15
|
### `new` from a Hash
|
15
16
|
|
16
17
|
```ruby
|
17
|
-
RedAmber::DataFrame.new(x: [1, 2, 3])
|
18
|
+
df = RedAmber::DataFrame.new(x: [1, 2, 3], y: %w[A B C])
|
18
19
|
```
|
19
20
|
|
20
21
|
### `new` from a schema (by Hash) and data (by Array)
|
21
22
|
|
22
23
|
```ruby
|
23
|
-
RedAmber::DataFrame.new({:
|
24
|
+
RedAmber::DataFrame.new({x: :uint8, y: :string}, [[1, "A"], [2, "B"], [3, "C"]])
|
24
25
|
```
|
25
26
|
|
26
27
|
### `new` from an Arrow::Table
|
27
28
|
|
28
29
|
|
29
30
|
```ruby
|
30
|
-
table = Arrow::Table.new(x: [1, 2, 3])
|
31
|
+
table = Arrow::Table.new(x: [1, 2, 3], y: %w[A B C])
|
31
32
|
RedAmber::DataFrame.new(table)
|
32
33
|
```
|
33
34
|
|
35
|
+
### `new` from an Object which responds to `to_arrow`
|
36
|
+
|
37
|
+
```ruby
|
38
|
+
require "datasets-arrow"
|
39
|
+
dataset = Datasets::Penguins.new
|
40
|
+
RedAmber::DataFrame.new(dataset)
|
41
|
+
```
|
42
|
+
|
34
43
|
### `new` from a Rover::DataFrame
|
35
44
|
|
36
45
|
|
37
46
|
```ruby
|
38
47
|
require 'rover'
|
39
48
|
|
40
|
-
rover = Rover::DataFrame.new(x: [1, 2, 3])
|
49
|
+
rover = Rover::DataFrame.new(x: [1, 2, 3], y: %w[A B C])
|
41
50
|
RedAmber::DataFrame.new(rover)
|
42
51
|
```
|
43
52
|
|
@@ -63,7 +72,7 @@ Class `RedAmber::DataFrame` represents 2D-data. A `DataFrame` consists with:
|
|
63
72
|
```ruby
|
64
73
|
require 'parquet'
|
65
74
|
|
66
|
-
|
75
|
+
df = RedAmber::DataFrame.load("file.parquet")
|
67
76
|
```
|
68
77
|
|
69
78
|
### `save` (instance method)
|
@@ -79,20 +88,20 @@ Class `RedAmber::DataFrame` represents 2D-data. A `DataFrame` consists with:
|
|
79
88
|
```ruby
|
80
89
|
require 'parquet'
|
81
90
|
|
82
|
-
|
91
|
+
df.save("file.parquet")
|
83
92
|
```
|
84
93
|
|
85
94
|
## Properties
|
86
95
|
|
87
96
|
### `table`, `to_arrow`
|
88
97
|
|
89
|
-
-
|
98
|
+
- Returns Arrow::Table object in the DataFrame.
|
90
99
|
|
91
|
-
### `size`, `n_obs`, `n_rows`
|
100
|
+
### `size`, `n_records`, `n_obs`, `n_rows`
|
92
101
|
|
93
|
-
- Returns size of Vector (num of
|
94
|
-
|
95
|
-
### `n_keys`, `n_vars`, `n_cols`,
|
102
|
+
- Returns size of Vector (num of records).
|
103
|
+
|
104
|
+
### `n_keys`, `n_variables`, `n_vars`, `n_cols`,
|
96
105
|
|
97
106
|
- Returns num of keys (num of variables).
|
98
107
|
|
@@ -130,16 +139,7 @@ Class `RedAmber::DataFrame` represents 2D-data. A `DataFrame` consists with:
|
|
130
139
|
|
131
140
|
- Returns key names in an Array.
|
132
141
|
|
133
|
-
|
134
|
-
|
135
|
-
```ruby
|
136
|
-
# update numeric variables, another solution
|
137
|
-
df.assign do
|
138
|
-
vectors.each_with_object({}) do |vector, assigner|
|
139
|
-
assigner[vector.key] = vector * -1 if vector.numeric?
|
140
|
-
end
|
141
|
-
end
|
142
|
-
```
|
142
|
+
Each key must be unique in the DataFrame.
|
143
143
|
|
144
144
|
### `types`
|
145
145
|
|
@@ -153,9 +153,20 @@ Class `RedAmber::DataFrame` represents 2D-data. A `DataFrame` consists with:
|
|
153
153
|
|
154
154
|
- Returns an Array of Vectors.
|
155
155
|
|
156
|
+
When we use it, Vector#key is useful to get the key in the DataFrame.
|
157
|
+
|
158
|
+
```ruby
|
159
|
+
# update numeric variables, another solution
|
160
|
+
df.assign do
|
161
|
+
vectors.each_with_object({}) do |vector, assigner|
|
162
|
+
assigner[vector.key] = vector * -1 if vector.numeric?
|
163
|
+
end
|
164
|
+
end
|
165
|
+
```
|
166
|
+
|
156
167
|
### `indices`, `indexes`
|
157
168
|
|
158
|
-
- Returns indexes in
|
169
|
+
- Returns indexes in a Vector.
|
159
170
|
Accepts an option `start` as the first of indexes.
|
160
171
|
|
161
172
|
```ruby
|
@@ -163,15 +174,19 @@ Class `RedAmber::DataFrame` represents 2D-data. A `DataFrame` consists with:
|
|
163
174
|
df.indices
|
164
175
|
|
165
176
|
# =>
|
177
|
+
#<RedAmber::Vector(:uint8, size=5):0x0000000000013ed4>
|
166
178
|
[0, 1, 2, 3, 4]
|
167
179
|
|
168
180
|
df.indices(1)
|
169
181
|
|
170
182
|
# =>
|
183
|
+
#<RedAmber::Vector(:uint8, size=5):0x0000000000018fd8>
|
171
184
|
[1, 2, 3, 4, 5]
|
172
185
|
|
173
186
|
df.indices(:a)
|
187
|
+
|
174
188
|
# =>
|
189
|
+
#<RedAmber::Vector(:dictionary, size=5):0x000000000001bd50>
|
175
190
|
[:a, :b, :c, :d, :e]
|
176
191
|
```
|
177
192
|
|
@@ -210,15 +225,15 @@ puts penguins.to_s
|
|
210
225
|
# =>
|
211
226
|
species island bill_length_mm bill_depth_mm flipper_length_mm ... year
|
212
227
|
<string> <string> <double> <double> <uint8> ... <uint16>
|
213
|
-
|
214
|
-
|
215
|
-
|
216
|
-
|
217
|
-
|
228
|
+
0 Adelie Torgersen 39.1 18.7 181 ... 2007
|
229
|
+
1 Adelie Torgersen 39.5 17.4 186 ... 2007
|
230
|
+
2 Adelie Torgersen 40.3 18.0 195 ... 2007
|
231
|
+
3 Adelie Torgersen (nil) (nil) (nil) ... 2007
|
232
|
+
4 Adelie Torgersen 36.7 19.3 193 ... 2007
|
218
233
|
: : : : : : ... :
|
219
|
-
|
220
|
-
|
221
|
-
|
234
|
+
341 Gentoo Biscoe 50.4 15.7 222 ... 2009
|
235
|
+
342 Gentoo Biscoe 45.2 14.8 212 ... 2009
|
236
|
+
343 Gentoo Biscoe 49.9 16.1 213 ... 2009
|
222
237
|
```
|
223
238
|
### `inspect`
|
224
239
|
|
@@ -235,11 +250,11 @@ puts penguins.summary.to_s(width: 82) # needs more width to show all stats in th
|
|
235
250
|
# =>
|
236
251
|
variables count mean std min 25% median 75% max
|
237
252
|
<dictionary> <uint16> <double> <double> <double> <double> <double> <double> <double>
|
238
|
-
|
239
|
-
|
240
|
-
|
241
|
-
|
242
|
-
|
253
|
+
0 bill_length_mm 342 43.92 5.46 32.1 39.23 44.38 48.5 59.6
|
254
|
+
1 bill_depth_mm 342 17.15 1.97 13.1 15.6 17.32 18.7 21.5
|
255
|
+
2 flipper_length_mm 342 200.92 14.06 172.0 190.0 197.0 213.0 231.0
|
256
|
+
3 body_mass_g 342 4201.75 801.95 2700.0 3550.0 4031.5 4750.0 6300.0
|
257
|
+
4 year 344 2008.03 0.82 2007.0 2007.0 2008.0 2009.0 2009.0
|
243
258
|
```
|
244
259
|
|
245
260
|
### `to_rover`
|
@@ -265,26 +280,29 @@ penguins.to_rover
|
|
265
280
|
require 'red_amber'
|
266
281
|
require 'datasets-arrow'
|
267
282
|
|
268
|
-
|
269
|
-
|
283
|
+
dataset = Datasets::Penguins.new
|
284
|
+
# (From 0.2.2) `new` accepts any object which has a `to_arrow` method.
|
285
|
+
# If older, it should be `dataset.to_arrow` in the parentheses.
|
286
|
+
RedAmber::DataFrame.new(dataset).tdr
|
270
287
|
|
271
288
|
# =>
|
272
289
|
RedAmber::DataFrame : 344 x 8 Vectors
|
273
290
|
Vectors : 5 numeric, 3 strings
|
274
291
|
# key type level data_preview
|
275
|
-
|
276
|
-
|
277
|
-
|
278
|
-
|
279
|
-
|
280
|
-
|
281
|
-
|
282
|
-
|
292
|
+
0 :species string 3 {"Adelie"=>152, "Chinstrap"=>68, "Gentoo"=>124}
|
293
|
+
1 :island string 3 {"Torgersen"=>52, "Biscoe"=>168, "Dream"=>124}
|
294
|
+
2 :bill_length_mm double 165 [39.1, 39.5, 40.3, nil, 36.7, ... ], 2 nils
|
295
|
+
3 :bill_depth_mm double 81 [18.7, 17.4, 18.0, nil, 19.3, ... ], 2 nils
|
296
|
+
4 :flipper_length_mm uint8 56 [181, 186, 195, nil, 193, ... ], 2 nils
|
297
|
+
5 :body_mass_g uint16 95 [3750, 3800, 3250, nil, 3450, ... ], 2 nils
|
298
|
+
6 :sex string 3 {"male"=>168, "female"=>165, nil=>11}
|
299
|
+
7 :year uint16 3 {2007=>110, 2008=>114, 2009=>120}
|
283
300
|
```
|
284
|
-
|
301
|
+
|
302
|
+
Options:
|
285
303
|
- limit: limit of variables to show. Default value is 10.
|
286
|
-
- tally: max level to use tally mode.
|
287
|
-
- elements: max num of element to show values in each
|
304
|
+
- tally: max level to use tally mode. Default value is 5.
|
305
|
+
- elements: max num of element to show values in each records. Default value is 5.
|
288
306
|
|
289
307
|
## Selecting
|
290
308
|
|
@@ -294,13 +312,13 @@ penguins.to_rover
|
|
294
312
|
- Keys in an Array: `df[:symbol1, "string", :symbol2]`
|
295
313
|
- Keys by indices: `df[df.keys[0]]`, `df[df.keys[1,2]]`, `df[df.keys[1..]]`
|
296
314
|
|
297
|
-
Key indeces
|
315
|
+
Key indices should be used via `keys[i]` because numbers are used to select records (rows). See the next section.
|
298
316
|
|
299
317
|
- Keys by a Range:
|
300
318
|
|
301
|
-
If keys are able to represent by Range, it can be included in the arguments. See a example below.
|
319
|
+
If keys can be represented by a Range, the Range can be included in the arguments. See the example below.
|
302
320
|
|
303
|
-
- You can exchange the order of variables (columns).
|
321
|
+
- You can also exchange the order of variables (columns).
|
304
322
|
|
305
323
|
```ruby
|
306
324
|
hash = {a: [1, 2, 3], b: %w[A B C], c: [1.0, 2, 3]}
|
@@ -311,12 +329,12 @@ penguins.to_rover
|
|
311
329
|
#<RedAmber::DataFrame : 3 x 3 Vectors, 0x00000000000328fc>
|
312
330
|
b c a
|
313
331
|
<string> <double> <uint8>
|
314
|
-
|
315
|
-
|
316
|
-
|
332
|
+
0 A 1.0 1
|
333
|
+
1 B 2.0 2
|
334
|
+
2 C 3.0 3
|
317
335
|
```
|
318
336
|
|
319
|
-
If `#[]` represents single variable (column), it returns a Vector object.
|
337
|
+
If `#[]` represents a single variable (column), it returns a Vector object.
|
320
338
|
|
321
339
|
```ruby
|
322
340
|
df[:a]
|
@@ -325,6 +343,7 @@ penguins.to_rover
|
|
325
343
|
#<RedAmber::Vector(:uint8, size=3):0x000000000000f140>
|
326
344
|
[1, 2, 3]
|
327
345
|
```
|
346
|
+
|
328
347
|
Or `#v` method also returns a Vector for a key.
|
329
348
|
|
330
349
|
```ruby
|
@@ -335,18 +354,19 @@ penguins.to_rover
|
|
335
354
|
[1, 2, 3]
|
336
355
|
```
|
337
356
|
|
338
|
-
This may be useful to use in a block of DataFrame manipulation verbs. We can write `v(:a)` rather than `self[:a]` or `df[:a]`
|
357
|
+
This method may be useful to use in a block of DataFrame manipulation verbs. We can write `v(:a)` rather than `self[:a]` or `df[:a]`
|
339
358
|
|
340
|
-
### Select
|
359
|
+
### Select records (rows in a table) by `[]` as `[index]`, `[range]`, `[array]`
|
341
360
|
|
342
|
-
- Select a
|
343
|
-
- Select obs. by indeces in a Range: `df[1..2]`
|
361
|
+
- Select a record by index: `df[0]`
|
344
362
|
|
345
|
-
|
363
|
+
- Select records by indices in an Array: `df[1, 2]`
|
346
364
|
|
347
|
-
- Select
|
365
|
+
- Select records by indices in a Range: `df[1..2]`
|
348
366
|
|
349
|
-
-
|
367
|
+
An endless or a beginless Range can be used to represent indices.
|
368
|
+
|
369
|
+
- You can use indices in Float.
|
350
370
|
|
351
371
|
- Mixed case: `df[2, 0..]`
|
352
372
|
|
@@ -359,15 +379,15 @@ penguins.to_rover
|
|
359
379
|
#<RedAmber::DataFrame : 4 x 3 Vectors, 0x0000000000033270>
|
360
380
|
a b c
|
361
381
|
<uint8> <string> <double>
|
362
|
-
|
363
|
-
|
364
|
-
|
365
|
-
|
382
|
+
0 3 C 3.0
|
383
|
+
1 1 A 1.0
|
384
|
+
2 2 B 2.0
|
385
|
+
3 3 C 3.0
|
366
386
|
```
|
367
387
|
|
368
|
-
- Select
|
388
|
+
- Select records by a boolean Array or a boolean RedAmber::Vector of the same size as self.
|
369
389
|
|
370
|
-
It returns a sub dataframe with
|
390
|
+
It returns a sub dataframe with the records where the boolean is true.
|
371
391
|
|
372
392
|
```ruby
|
373
393
|
# with the same dataframe `df` above
|
@@ -382,15 +402,15 @@ penguins.to_rover
|
|
382
402
|
1 1 A 1.0
|
383
403
|
```
|
384
404
|
|
385
|
-
### Select rows from top or from bottom
|
405
|
+
### Select records (rows) from top or from bottom
|
386
406
|
|
387
407
|
`head(n=5)`, `tail(n=5)`, `first(n=1)`, `last(n=1)`
|
388
408
|
|
389
409
|
## Sub DataFrame manipulations
|
390
410
|
|
391
|
-
### `pick ` - pick up variables
|
411
|
+
### `pick ` - pick up variables -
|
392
412
|
|
393
|
-
Pick up some
|
413
|
+
Pick up some variables (columns) to create a sub DataFrame.
|
394
414
|
|
395
415
|
![pick method image](doc/../image/dataframe/pick.png)
|
396
416
|
|
@@ -405,15 +425,15 @@ penguins.to_rover
|
|
405
425
|
#<RedAmber::DataFrame : 344 x 2 Vectors, 0x0000000000035ebc>
|
406
426
|
species bill_length_mm
|
407
427
|
<string> <double>
|
408
|
-
|
409
|
-
|
410
|
-
|
411
|
-
|
412
|
-
|
428
|
+
0 Adelie 39.1
|
429
|
+
1 Adelie 39.5
|
430
|
+
2 Adelie 40.3
|
431
|
+
3 Adelie (nil)
|
432
|
+
4 Adelie 36.7
|
413
433
|
: : :
|
414
|
-
|
415
|
-
|
416
|
-
|
434
|
+
341 Gentoo 50.4
|
435
|
+
342 Gentoo 45.2
|
436
|
+
343 Gentoo 49.9
|
417
437
|
```
|
418
438
|
|
419
439
|
- Indices as arguments
|
@@ -427,15 +447,15 @@ penguins.to_rover
|
|
427
447
|
#<RedAmber::DataFrame : 344 x 4 Vectors, 0x0000000000055ce4>
|
428
448
|
species island bill_length_mm year
|
429
449
|
<string> <string> <double> <uint16>
|
430
|
-
|
431
|
-
|
432
|
-
|
433
|
-
|
434
|
-
|
450
|
+
0 Adelie Torgersen 39.1 2007
|
451
|
+
1 Adelie Torgersen 39.5 2007
|
452
|
+
2 Adelie Torgersen 40.3 2007
|
453
|
+
3 Adelie Torgersen (nil) 2007
|
454
|
+
4 Adelie Torgersen 36.7 2007
|
435
455
|
: : : : :
|
436
|
-
|
437
|
-
|
438
|
-
|
456
|
+
341 Gentoo Biscoe 50.4 2009
|
457
|
+
342 Gentoo Biscoe 45.2 2009
|
458
|
+
343 Gentoo Biscoe 49.9 2009
|
439
459
|
```
|
440
460
|
|
441
461
|
- Booleans as arguments
|
@@ -443,21 +463,21 @@ penguins.to_rover
|
|
443
463
|
`pick(booleans)` accepts booleans as arguments in an Array. Booleans must be same length as `n_keys`.
|
444
464
|
|
445
465
|
```ruby
|
446
|
-
penguins.pick(penguins.
|
466
|
+
penguins.pick(penguins.vectors.map(&:string?))
|
447
467
|
|
448
468
|
# =>
|
449
469
|
#<RedAmber::DataFrame : 344 x 3 Vectors, 0x00000000000387ac>
|
450
470
|
species island sex
|
451
471
|
<string> <string> <string>
|
452
|
-
|
472
|
+
0 Adelie Torgersen male
|
473
|
+
1 Adelie Torgersen female
|
453
474
|
2 Adelie Torgersen female
|
454
|
-
3 Adelie Torgersen
|
455
|
-
4 Adelie Torgersen
|
456
|
-
5 Adelie Torgersen female
|
475
|
+
3 Adelie Torgersen (nil)
|
476
|
+
4 Adelie Torgersen female
|
457
477
|
: : : :
|
458
|
-
|
459
|
-
|
460
|
-
|
478
|
+
341 Gentoo Biscoe male
|
479
|
+
342 Gentoo Biscoe female
|
480
|
+
343 Gentoo Biscoe male
|
461
481
|
```
|
462
482
|
|
463
483
|
- Keys or booleans by a block
|
@@ -471,20 +491,20 @@ penguins.to_rover
|
|
471
491
|
#<RedAmber::DataFrame : 344 x 3 Vectors, 0x000000000003dd4c>
|
472
492
|
bill_length_mm bill_depth_mm flipper_length_mm
|
473
493
|
<double> <double> <uint8>
|
474
|
-
|
475
|
-
|
476
|
-
|
477
|
-
|
478
|
-
|
494
|
+
0 39.1 18.7 181
|
495
|
+
1 39.5 17.4 186
|
496
|
+
2 40.3 18.0 195
|
497
|
+
3 (nil) (nil) (nil)
|
498
|
+
4 36.7 19.3 193
|
479
499
|
: : : :
|
480
|
-
|
481
|
-
|
482
|
-
|
500
|
+
341 50.4 15.7 222
|
501
|
+
342 45.2 14.8 212
|
502
|
+
343 49.9 16.1 213
|
483
503
|
```
|
484
504
|
|
485
|
-
### `drop ` -
|
505
|
+
### `drop ` - counterpart of pick -
|
486
506
|
|
487
|
-
Drop some
|
507
|
+
Drop some variables (columns) to create a remainder DataFrame.
|
488
508
|
|
489
509
|
![drop method image](doc/../image/dataframe/drop.png)
|
490
510
|
|
@@ -526,9 +546,9 @@ penguins.to_rover
|
|
526
546
|
#<RedAmber::DataFrame : 3 x 1 Vector, 0x000000000003f4bc>
|
527
547
|
a
|
528
548
|
<uint8>
|
529
|
-
|
530
|
-
|
531
|
-
|
549
|
+
0 1
|
550
|
+
1 2
|
551
|
+
2 3
|
532
552
|
|
533
553
|
df[:a]
|
534
554
|
|
@@ -548,9 +568,9 @@ penguins.to_rover
|
|
548
568
|
[1, 2, 3]
|
549
569
|
```
|
550
570
|
|
551
|
-
### `slice ` -
|
571
|
+
### `slice ` - slice and select records -
|
552
572
|
|
553
|
-
Slice and select
|
573
|
+
Slice and select records (rows) to create a sub DataFrame.
|
554
574
|
|
555
575
|
![slice method image](doc/../image/dataframe/slice.png)
|
556
576
|
|
@@ -561,22 +581,22 @@ penguins.to_rover
|
|
561
581
|
Negative index from the tail like Ruby's Array is also acceptable.
|
562
582
|
|
563
583
|
```ruby
|
564
|
-
# returns 5
|
584
|
+
# returns 5 records at start and 5 records from end
|
565
585
|
penguins.slice(0...5, -5..-1)
|
566
586
|
|
567
587
|
# =>
|
568
588
|
#<RedAmber::DataFrame : 10 x 8 Vectors, 0x0000000000042be4>
|
569
|
-
|
570
|
-
|
571
|
-
|
572
|
-
|
573
|
-
|
574
|
-
|
575
|
-
|
576
|
-
|
577
|
-
|
578
|
-
|
579
|
-
|
589
|
+
species island bill_length_mm bill_depth_mm flipper_length_mm ... year
|
590
|
+
<string> <string> <double> <double> <uint8> ... <uint16>
|
591
|
+
0 Adelie Torgersen 39.1 18.7 181 ... 2007
|
592
|
+
1 Adelie Torgersen 39.5 17.4 186 ... 2007
|
593
|
+
2 Adelie Torgersen 40.3 18.0 195 ... 2007
|
594
|
+
3 Adelie Torgersen (nil) (nil) (nil) ... 2007
|
595
|
+
4 Adelie Torgersen 36.7 19.3 193 ... 2007
|
596
|
+
: : : : : : ... :
|
597
|
+
7 Gentoo Biscoe 50.4 15.7 222 ... 2009
|
598
|
+
8 Gentoo Biscoe 45.2 14.8 212 ... 2009
|
599
|
+
9 Gentoo Biscoe 49.9 16.1 213 ... 2009
|
580
600
|
```
|
581
601
|
|
582
602
|
- Booleans as an argument
|
@@ -591,15 +611,15 @@ penguins.to_rover
|
|
591
611
|
#<RedAmber::DataFrame : 242 x 8 Vectors, 0x0000000000043d3c>
|
592
612
|
species island bill_length_mm bill_depth_mm flipper_length_mm ... year
|
593
613
|
<string> <string> <double> <double> <uint8> ... <uint16>
|
594
|
-
|
595
|
-
|
596
|
-
|
597
|
-
|
598
|
-
|
614
|
+
0 Adelie Torgersen 40.3 18.0 195 ... 2007
|
615
|
+
1 Adelie Torgersen 42.0 20.2 190 ... 2007
|
616
|
+
2 Adelie Torgersen 41.1 17.6 182 ... 2007
|
617
|
+
3 Adelie Torgersen 42.5 20.7 197 ... 2007
|
618
|
+
4 Adelie Torgersen 46.0 21.5 194 ... 2007
|
599
619
|
: : : : : : ... :
|
600
|
-
|
601
|
-
|
602
|
-
|
620
|
+
239 Gentoo Biscoe 50.4 15.7 222 ... 2009
|
621
|
+
240 Gentoo Biscoe 45.2 14.8 212 ... 2009
|
622
|
+
241 Gentoo Biscoe 49.9 16.1 213 ... 2009
|
603
623
|
```
|
604
624
|
|
605
625
|
- Indices or booleans by a block
|
@@ -619,15 +639,15 @@ penguins.to_rover
|
|
619
639
|
#<RedAmber::DataFrame : 204 x 8 Vectors, 0x0000000000047a40>
|
620
640
|
species island bill_length_mm bill_depth_mm flipper_length_mm ... year
|
621
641
|
<string> <string> <double> <double> <uint8> ... <uint16>
|
622
|
-
|
623
|
-
|
624
|
-
|
625
|
-
|
626
|
-
|
642
|
+
0 Adelie Torgersen 39.1 18.7 181 ... 2007
|
643
|
+
1 Adelie Torgersen 39.5 17.4 186 ... 2007
|
644
|
+
2 Adelie Torgersen 40.3 18.0 195 ... 2007
|
645
|
+
3 Adelie Torgersen 39.3 20.6 190 ... 2007
|
646
|
+
4 Adelie Torgersen 38.9 17.8 181 ... 2007
|
627
647
|
: : : : : : ... :
|
628
|
-
|
629
|
-
|
630
|
-
|
648
|
+
201 Gentoo Biscoe 47.2 13.7 214 ... 2009
|
649
|
+
202 Gentoo Biscoe 46.8 14.3 215 ... 2009
|
650
|
+
203 Gentoo Biscoe 45.2 14.8 212 ... 2009
|
631
651
|
```
|
632
652
|
|
633
653
|
- Notice: nil option
|
@@ -656,9 +676,9 @@ penguins.to_rover
|
|
656
676
|
0 1 A 1.000000
|
657
677
|
```
|
658
678
|
|
659
|
-
### `remove`
|
679
|
+
### `remove` - counterpart of slice -
|
660
680
|
|
661
|
-
Slice and reject
|
681
|
+
Slice and reject records (rows) to create a remainder DataFrame.
|
662
682
|
|
663
683
|
![remove method image](doc/../image/dataframe/remove.png)
|
664
684
|
|
@@ -667,22 +687,22 @@ penguins.to_rover
|
|
667
687
|
`remove(indices)` accepts indices as arguments. Indices should be an Integer or a Range of Integer.
|
668
688
|
|
669
689
|
```ruby
|
670
|
-
# returns 6th to 339th
|
690
|
+
# returns 6th to 339th records
|
671
691
|
penguins.remove(0...5, -5..-1)
|
672
692
|
|
673
693
|
# =>
|
674
694
|
#<RedAmber::DataFrame : 334 x 8 Vectors, 0x00000000000487c4>
|
675
695
|
species island bill_length_mm bill_depth_mm flipper_length_mm ... year
|
676
696
|
<string> <string> <double> <double> <uint8> ... <uint16>
|
677
|
-
|
678
|
-
|
679
|
-
|
680
|
-
|
681
|
-
|
697
|
+
0 Adelie Torgersen 39.3 20.6 190 ... 2007
|
698
|
+
1 Adelie Torgersen 38.9 17.8 181 ... 2007
|
699
|
+
2 Adelie Torgersen 39.2 19.6 195 ... 2007
|
700
|
+
3 Adelie Torgersen 34.1 18.1 193 ... 2007
|
701
|
+
4 Adelie Torgersen 42.0 20.2 190 ... 2007
|
682
702
|
: : : : : : ... :
|
683
|
-
|
684
|
-
|
685
|
-
|
703
|
+
331 Gentoo Biscoe 44.5 15.7 217 ... 2009
|
704
|
+
332 Gentoo Biscoe 48.8 16.2 222 ... 2009
|
705
|
+
333 Gentoo Biscoe 47.2 13.7 214 ... 2009
|
686
706
|
```
|
687
707
|
|
688
708
|
- Booleans as an argument
|
@@ -690,7 +710,7 @@ penguins.to_rover
|
|
690
710
|
`remove(booleans)` accepts booleans as an argument in an Array, a Vector or an Arrow::BooleanArray . Booleans must be same length as `size`.
|
691
711
|
|
692
712
|
```ruby
|
693
|
-
# remove all
|
713
|
+
# remove all records containing nil
|
694
714
|
removed = penguins.remove { vectors.map(&:is_nil).reduce(&:|) }
|
695
715
|
removed
|
696
716
|
|
@@ -698,15 +718,15 @@ penguins.to_rover
|
|
698
718
|
#<RedAmber::DataFrame : 333 x 8 Vectors, 0x0000000000049fac>
|
699
719
|
species island bill_length_mm bill_depth_mm flipper_length_mm ... year
|
700
720
|
<string> <string> <double> <double> <uint8> ... <uint16>
|
701
|
-
|
702
|
-
|
703
|
-
|
704
|
-
|
705
|
-
|
721
|
+
0 Adelie Torgersen 39.1 18.7 181 ... 2007
|
722
|
+
1 Adelie Torgersen 39.5 17.4 186 ... 2007
|
723
|
+
2 Adelie Torgersen 40.3 18.0 195 ... 2007
|
724
|
+
3 Adelie Torgersen 36.7 19.3 193 ... 2007
|
725
|
+
4 Adelie Torgersen 39.3 20.6 190 ... 2007
|
706
726
|
: : : : : : ... :
|
707
|
-
|
708
|
-
|
709
|
-
|
727
|
+
330 Gentoo Biscoe 50.4 15.7 222 ... 2009
|
728
|
+
331 Gentoo Biscoe 45.2 14.8 212 ... 2009
|
729
|
+
332 Gentoo Biscoe 49.9 16.1 213 ... 2009
|
710
730
|
```
|
711
731
|
|
712
732
|
- Indices or booleans by a block
|
@@ -727,15 +747,15 @@ penguins.to_rover
|
|
727
747
|
#<RedAmber::DataFrame : 140 x 8 Vectors, 0x000000000004de40>
|
728
748
|
species island bill_length_mm bill_depth_mm flipper_length_mm ... year
|
729
749
|
<string> <string> <double> <double> <uint8> ... <uint16>
|
730
|
-
|
731
|
-
|
732
|
-
|
733
|
-
|
734
|
-
|
750
|
+
0 Adelie Torgersen (nil) (nil) (nil) ... 2007
|
751
|
+
1 Adelie Torgersen 36.7 19.3 193 ... 2007
|
752
|
+
2 Adelie Torgersen 34.1 18.1 193 ... 2007
|
753
|
+
3 Adelie Torgersen 37.8 17.1 186 ... 2007
|
754
|
+
4 Adelie Torgersen 37.8 17.3 180 ... 2007
|
735
755
|
: : : : : : ... :
|
736
|
-
|
737
|
-
|
738
|
-
|
756
|
+
137 Gentoo Biscoe (nil) (nil) (nil) ... 2009
|
757
|
+
138 Gentoo Biscoe 50.4 15.7 222 ... 2009
|
758
|
+
139 Gentoo Biscoe 49.9 16.1 213 ... 2009
|
739
759
|
```
|
740
760
|
|
741
761
|
- Notice for nil
|
@@ -770,13 +790,13 @@ penguins.to_rover
|
|
770
790
|
#<RedAmber::DataFrame : 2 x 3 Vectors, 0x000000000005df98>
|
771
791
|
a b c
|
772
792
|
<uint8> <string> <double>
|
773
|
-
|
774
|
-
|
793
|
+
0 1 A 1.0
|
794
|
+
1 (nil) C 3.0
|
775
795
|
```
|
776
796
|
|
777
797
|
### `rename`
|
778
798
|
|
779
|
-
Rename keys (column names) to create a updated DataFrame.
|
799
|
+
Rename keys (variable/column names) to create an updated DataFrame.
|
780
800
|
|
781
801
|
![rename method image](doc/../image/dataframe/rename.png)
|
782
802
|
|
@@ -792,9 +812,9 @@ penguins.to_rover
|
|
792
812
|
#<RedAmber::DataFrame : 3 x 2 Vectors, 0x0000000000060838>
|
793
813
|
name age_in_1993
|
794
814
|
<string> <uint8>
|
795
|
-
|
796
|
-
|
797
|
-
|
815
|
+
0 Yasuko 68
|
816
|
+
1 Rui 49
|
817
|
+
2 Hinata 28
|
798
818
|
```
|
799
819
|
|
800
820
|
- Key pairs by a block
|
@@ -811,7 +831,7 @@ penguins.to_rover
|
|
811
831
|
|
812
832
|
### `assign`
|
813
833
|
|
814
|
-
Assign new or updated
|
834
|
+
Assign new or updated variables (columns) and create an updated DataFrame.
|
815
835
|
|
816
836
|
- Variables with new keys will append new columns from the right.
|
817
837
|
- Variables with existing keys will update corresponding vectors.
|
@@ -832,9 +852,9 @@ penguins.to_rover
|
|
832
852
|
#<RedAmber::DataFrame : 3 x 2 Vectors, 0x0000000000062804>
|
833
853
|
name age
|
834
854
|
<string> <uint8>
|
835
|
-
|
836
|
-
|
837
|
-
|
855
|
+
0 Yasuko 68
|
856
|
+
1 Rui 49
|
857
|
+
2 Hinata 28
|
838
858
|
|
839
859
|
# update :age and add :brother
|
840
860
|
df.assign do
|
@@ -848,9 +868,9 @@ penguins.to_rover
|
|
848
868
|
#<RedAmber::DataFrame : 3 x 3 Vectors, 0x00000000000658b0>
|
849
869
|
name age brother
|
850
870
|
<string> <uint8> <string>
|
851
|
-
|
852
|
-
|
853
|
-
|
871
|
+
0 Yasuko 97 Santa
|
872
|
+
1 Rui 78 (nil)
|
873
|
+
2 Hinata 57 Momotaro
|
854
874
|
```
|
855
875
|
|
856
876
|
- Key pairs by a block
|
@@ -869,11 +889,11 @@ penguins.to_rover
|
|
869
889
|
#<RedAmber::DataFrame : 5 x 3 Vectors, 0x0000000000069e60>
|
870
890
|
index float string
|
871
891
|
<uint8> <double> <string>
|
872
|
-
|
873
|
-
|
874
|
-
|
875
|
-
|
876
|
-
|
892
|
+
0 0 0.0 A
|
893
|
+
1 1 1.1 B
|
894
|
+
2 2 2.2 C
|
895
|
+
3 3 NaN D
|
896
|
+
4 (nil) (nil) (nil)
|
877
897
|
|
878
898
|
# update :float
|
879
899
|
# assigner by an Array
|
@@ -886,11 +906,11 @@ penguins.to_rover
|
|
886
906
|
#<RedAmber::DataFrame : 5 x 3 Vectors, 0x00000000000dfffc>
|
887
907
|
index float string
|
888
908
|
<uint8> <double> <string>
|
889
|
-
|
890
|
-
|
891
|
-
|
892
|
-
|
893
|
-
|
909
|
+
0 0 -0.0 A
|
910
|
+
1 1 -1.1 B
|
911
|
+
2 2 -2.2 C
|
912
|
+
3 3 NaN D
|
913
|
+
4 (nil) (nil) (nil)
|
894
914
|
|
895
915
|
# Or we can use assigner by a Hash
|
896
916
|
df.assign do
|
@@ -921,11 +941,11 @@ penguins.to_rover
|
|
921
941
|
#<RedAmber::DataFrame : 5 x 4 Vectors, 0x000000000001787c>
|
922
942
|
new_index index float string
|
923
943
|
<uint8> <uint8> <double> <string>
|
924
|
-
|
925
|
-
|
926
|
-
|
927
|
-
|
928
|
-
|
944
|
+
0 1 0 0.0 A
|
945
|
+
1 2 1 1.1 B
|
946
|
+
2 3 2 2.2 C
|
947
|
+
3 4 3 NaN D
|
948
|
+
4 5 (nil) (nil) (nil)
|
929
949
|
```
|
930
950
|
|
931
951
|
### `slice_by(key, keep_key: false) { block }`
|
@@ -946,11 +966,11 @@ penguins.to_rover
|
|
946
966
|
#<RedAmber::DataFrame : 5 x 3 Vectors, 0x0000000000069e60>
|
947
967
|
index float string
|
948
968
|
<uint8> <double> <string>
|
949
|
-
|
950
|
-
|
951
|
-
|
952
|
-
|
953
|
-
|
969
|
+
0 0 0.0 A
|
970
|
+
1 1 1.1 B
|
971
|
+
2 2 2.2 C
|
972
|
+
3 3 NaN D
|
973
|
+
4 (nil) (nil) (nil)
|
954
974
|
|
955
975
|
df.slice_by(:string) { ["A", "C"] }
|
956
976
|
|
@@ -958,8 +978,8 @@ penguins.to_rover
|
|
958
978
|
#<RedAmber::DataFrame : 2 x 2 Vectors, 0x000000000001b1ac>
|
959
979
|
index float
|
960
980
|
<uint8> <double>
|
961
|
-
|
962
|
-
|
981
|
+
0 0 0.0
|
982
|
+
1 2 2.2
|
963
983
|
```
|
964
984
|
|
965
985
|
It is the same behavior as;
|
@@ -977,9 +997,9 @@ It is the same behavior as;
|
|
977
997
|
#<RedAmber::DataFrame : 3 x 2 Vectors, 0x0000000000069668>
|
978
998
|
index float
|
979
999
|
<uint8> <double>
|
980
|
-
|
981
|
-
|
982
|
-
|
1000
|
+
0 0 0.0
|
1001
|
+
1 1 1.1
|
1002
|
+
2 2 2.2
|
983
1003
|
```
|
984
1004
|
|
985
1005
|
When the option `keep_key: true` used, the column `key` will be preserved.
|
@@ -991,16 +1011,16 @@ When the option `keep_key: true` used, the column `key` will be preserved.
|
|
991
1011
|
#<RedAmber::DataFrame : 3 x 3 Vectors, 0x0000000000073c44>
|
992
1012
|
index float string
|
993
1013
|
<uint8> <double> <string>
|
994
|
-
|
995
|
-
|
996
|
-
|
1014
|
+
0 0 0.0 A
|
1015
|
+
1 1 1.1 B
|
1016
|
+
2 2 2.2 C
|
997
1017
|
```
|
998
1018
|
|
999
1019
|
## Updating
|
1000
1020
|
|
1001
1021
|
### `sort`
|
1002
1022
|
|
1003
|
-
`sort` accepts parameters as sort_keys thanks to the
|
1023
|
+
`sort` accepts parameters as sort_keys thanks to Red Arrow's feature.
|
1004
1024
|
- :key, "key" or "+key" denotes ascending order
|
1005
1025
|
- "-key" denotes descending order
|
1006
1026
|
|
@@ -1016,11 +1036,11 @@ When the option `keep_key: true` used, the column `key` will be preserved.
|
|
1016
1036
|
#<RedAmber::DataFrame : 5 x 3 Vectors, 0x000000000009b03c>
|
1017
1037
|
index string bool
|
1018
1038
|
<uint8> <string> <boolean>
|
1019
|
-
|
1020
|
-
|
1021
|
-
|
1022
|
-
|
1023
|
-
|
1039
|
+
0 0 (nil) false
|
1040
|
+
1 0 B false
|
1041
|
+
2 1 B true
|
1042
|
+
3 1 C (nil)
|
1043
|
+
4 (nil) A true
|
1024
1044
|
```
|
1025
1045
|
|
1026
1046
|
- [ ] Clamp
|
@@ -1031,13 +1051,13 @@ When the option `keep_key: true` used, the column `key` will be preserved.
|
|
1031
1051
|
|
1032
1052
|
### `remove_nil`
|
1033
1053
|
|
1034
|
-
Remove any
|
1054
|
+
Remove any records containing nil.
|
1035
1055
|
|
1036
1056
|
## Grouping
|
1037
1057
|
|
1038
1058
|
### `group(group_keys)`
|
1039
1059
|
|
1040
|
-
`group` creates a class `Group
|
1060
|
+
`group` creates an instance of class `Group`. `Group` accepts the functions below as methods.
|
1041
1061
|
Method accepts options as `group_keys`.
|
1042
1062
|
|
1043
1063
|
Available functions are:
|
@@ -1064,23 +1084,22 @@ When the option `keep_key: true` used, the column `key` will be preserved.
|
|
1064
1084
|
This is an example of grouping the famous STARWARS dataset.
|
1065
1085
|
|
1066
1086
|
```ruby
|
1067
|
-
|
1068
|
-
|
1069
|
-
starwars
|
1087
|
+
uri = URI("https://vincentarelbundock.github.io/Rdatasets/csv/dplyr/starwars.csv")
|
1088
|
+
starwars = RedAmber::DataFrame.load(uri)
|
1070
1089
|
|
1071
1090
|
# =>
|
1072
1091
|
#<RedAmber::DataFrame : 87 x 12 Vectors, 0x0000000000005a50>
|
1073
1092
|
unnamed1 name height mass hair_color skin_color eye_color ... species
|
1074
1093
|
<int64> <string> <int64> <double> <string> <string> <string> ... <string>
|
1075
|
-
|
1076
|
-
|
1077
|
-
|
1078
|
-
|
1079
|
-
|
1094
|
+
0 1 Luke Skywalker 172 77.0 blond fair blue ... Human
|
1095
|
+
1 2 C-3PO 167 75.0 NA gold yellow ... Droid
|
1096
|
+
2 3 R2-D2 96 32.0 NA white, blue red ... Droid
|
1097
|
+
3 4 Darth Vader 202 136.0 none white yellow ... Human
|
1098
|
+
4 5 Leia Organa 150 49.0 brown light brown ... Human
|
1080
1099
|
: : : : : : : : ... :
|
1081
|
-
|
1082
|
-
|
1083
|
-
|
1100
|
+
84 85 BB8 (nil) (nil) none none black ... Droid
|
1101
|
+
85 86 Captain Phasma (nil) (nil) unknown unknown unknown ... NA
|
1102
|
+
86 87 Padmé Amidala 165 45.0 brown light brown ... Human
|
1084
1103
|
|
1085
1104
|
starwars.tdr(12)
|
1086
1105
|
|
@@ -1088,58 +1107,60 @@ When the option `keep_key: true` used, the column `key` will be preserved.
|
|
1088
1107
|
RedAmber::DataFrame : 87 x 12 Vectors
|
1089
1108
|
Vectors : 4 numeric, 8 strings
|
1090
1109
|
# key type level data_preview
|
1091
|
-
|
1092
|
-
|
1093
|
-
|
1094
|
-
|
1095
|
-
|
1096
|
-
|
1097
|
-
|
1098
|
-
|
1099
|
-
|
1100
|
-
|
1101
|
-
|
1102
|
-
|
1110
|
+
0 :unnamed1 int64 87 [1, 2, 3, 4, 5, ... ]
|
1111
|
+
1 :name string 87 ["Luke Skywalker", "C-3PO", "R2-D2", "Darth Vader", "Leia Organa", ... ]
|
1112
|
+
2 :height int64 46 [172, 167, 96, 202, 150, ... ], 6 nils
|
1113
|
+
3 :mass double 39 [77.0, 75.0, 32.0, 136.0, 49.0, ... ], 28 nils
|
1114
|
+
4 :hair_color string 13 ["blond", "NA", "NA", "none", "brown", ... ]
|
1115
|
+
5 :skin_color string 31 ["fair", "gold", "white, blue", "white", "light", ... ]
|
1116
|
+
6 :eye_color string 15 ["blue", "yellow", "red", "yellow", "brown", ... ]
|
1117
|
+
7 :birth_year double 37 [19.0, 112.0, 33.0, 41.9, 19.0, ... ], 44 nils
|
1118
|
+
8 :sex string 5 {"male"=>60, "none"=>6, "female"=>16, "hermaphroditic"=>1, "NA"=>4}
|
1119
|
+
9 :gender string 3 {"masculine"=>66, "feminine"=>17, "NA"=>4}
|
1120
|
+
10 :homeworld string 49 ["Tatooine", "Tatooine", "Naboo", "Tatooine", "Alderaan", ... ]
|
1121
|
+
11 :species string 38 ["Human", "Droid", "Droid", "Human", "Human", ... ]
|
1103
1122
|
```
|
1104
1123
|
|
1105
1124
|
We can group by `:species` and calculate the count.
|
1106
1125
|
|
1107
1126
|
```ruby
|
1108
|
-
starwars.
|
1127
|
+
starwars.remove { species == "NA" }
|
1128
|
+
.group(:species).count(:species)
|
1109
1129
|
|
1110
1130
|
# =>
|
1111
|
-
#<RedAmber::DataFrame :
|
1131
|
+
#<RedAmber::DataFrame : 37 x 2 Vectors, 0x000000000000ffa0>
|
1112
1132
|
species count
|
1113
1133
|
<string> <int64>
|
1114
|
-
|
1115
|
-
|
1116
|
-
|
1117
|
-
|
1118
|
-
|
1134
|
+
0 Human 35
|
1135
|
+
1 Droid 6
|
1136
|
+
2 Wookiee 2
|
1137
|
+
3 Rodian 1
|
1138
|
+
4 Hutt 1
|
1119
1139
|
: : :
|
1120
|
-
|
1121
|
-
|
1122
|
-
|
1140
|
+
34 Kaleesh 1
|
1141
|
+
35 Pau'an 1
|
1142
|
+
36 Kel Dor 1
|
1123
1143
|
```
|
1124
1144
|
|
1125
1145
|
We can also calculate the mean of `:mass` and `:height` together.
|
1126
1146
|
|
1127
1147
|
```ruby
|
1128
|
-
grouped = starwars.
|
1148
|
+
grouped = starwars.remove { species == "NA" }
|
1149
|
+
.group(:species) { [count(:species), mean(:height, :mass)] }
|
1129
1150
|
|
1130
1151
|
# =>
|
1131
|
-
#<RedAmber::DataFrame :
|
1132
|
-
|
1133
|
-
<
|
1134
|
-
|
1135
|
-
|
1136
|
-
|
1137
|
-
|
1138
|
-
|
1139
|
-
: :
|
1140
|
-
|
1141
|
-
|
1142
|
-
|
1152
|
+
#<RedAmber::DataFrame : 37 x 4 Vectors, 0x000000000000fff0>
|
1153
|
+
species count mean(height) mean(mass)
|
1154
|
+
<string> <int64> <double> <double>
|
1155
|
+
0 Human 35 176.65 82.78
|
1156
|
+
1 Droid 6 131.2 69.75
|
1157
|
+
2 Wookiee 2 231.0 124.0
|
1158
|
+
3 Rodian 1 173.0 74.0
|
1159
|
+
4 Hutt 1 175.0 1358.0
|
1160
|
+
: : : : :
|
1161
|
+
34 Kaleesh 1 216.0 159.0
|
1162
|
+
35 Pau'an 1 206.0 80.0
|
1163
|
+
36 Kel Dor 1 188.0 80.0
|
1143
1164
|
```
|
1144
1165
|
|
1145
1166
|
Select rows for count > 1.
|
@@ -1148,22 +1169,23 @@ When the option `keep_key: true` used, the column `key` will be preserved.
|
|
1148
1169
|
grouped.slice(grouped[:count] > 1)
|
1149
1170
|
|
1150
1171
|
# =>
|
1151
|
-
#<RedAmber::DataFrame :
|
1172
|
+
#<RedAmber::DataFrame : 8 x 4 Vectors, 0x000000000001002c>
|
1152
1173
|
species count mean(height) mean(mass)
|
1153
1174
|
<string> <int64> <double> <double>
|
1154
|
-
|
1155
|
-
|
1156
|
-
|
1157
|
-
|
1158
|
-
|
1159
|
-
|
1160
|
-
|
1161
|
-
|
1162
|
-
9 Kaminoan 2 221.0 88.0
|
1175
|
+
0 Human 35 176.65 82.78
|
1176
|
+
1 Droid 6 131.2 69.75
|
1177
|
+
2 Wookiee 2 231.0 124.0
|
1178
|
+
3 Gungan 3 208.67 74.0
|
1179
|
+
4 Zabrak 2 173.0 80.0
|
1180
|
+
5 Twi'lek 2 179.0 55.0
|
1181
|
+
6 Mirialan 2 168.0 53.1
|
1182
|
+
7 Kaminoan 2 221.0 88.0
|
1163
1183
|
```
|
1164
1184
|
|
1165
1185
|
## Reshape
|
1166
1186
|
|
1187
|
+
![dataframe reshaping image](doc/../image/reshaping_dataframe.png)
|
1188
|
+
|
1167
1189
|
### `transpose`
|
1168
1190
|
|
1169
1191
|
Creates a transposed DataFrame from a wide (messy) DataFrame.
|
@@ -1175,30 +1197,31 @@ When the option `keep_key: true` used, the column `key` will be preserved.
|
|
1175
1197
|
#<RedAmber::DataFrame : 5 x 6 Vectors, 0x000000000000d520>
|
1176
1198
|
Year Audi BMW BMW_MINI Mercedes-Benz VW
|
1177
1199
|
<int64> <int64> <int64> <int64> <int64> <int64>
|
1178
|
-
|
1179
|
-
|
1180
|
-
|
1181
|
-
|
1182
|
-
|
1183
|
-
|
1200
|
+
0 2017 28336 52527 25427 68221 49040
|
1201
|
+
1 2018 26473 50982 25984 67554 51961
|
1202
|
+
2 2019 24222 46814 23813 66553 46794
|
1203
|
+
3 2020 22304 35712 20196 57041 36576
|
1204
|
+
4 2021 22535 35905 18211 51722 35215
|
1205
|
+
|
1206
|
+
import_cars.transpose(name: :Manufacturer)
|
1184
1207
|
|
1185
1208
|
# =>
|
1186
|
-
#<RedAmber::DataFrame : 5 x 6 Vectors,
|
1209
|
+
#<RedAmber::DataFrame : 5 x 6 Vectors, 0x0000000000010a2c>
|
1187
1210
|
Manufacturer 2017 2018 2019 2020 2021
|
1188
|
-
<
|
1189
|
-
|
1190
|
-
|
1191
|
-
|
1192
|
-
|
1193
|
-
|
1211
|
+
<string> <uint32> <uint32> <uint32> <uint16> <uint16>
|
1212
|
+
0 Audi 28336 26473 24222 22304 22535
|
1213
|
+
1 BMW 52527 50982 46814 35712 35905
|
1214
|
+
2 BMW_MINI 25427 25984 23813 20196 18211
|
1215
|
+
3 Mercedes-Benz 68221 67554 66553 57041 51722
|
1216
|
+
4 VW 49040 51961 46794 36576 35215
|
1194
1217
|
```
|
1195
1218
|
|
1196
1219
|
The leftmost column is created by original keys. Key name of the column is
|
1197
|
-
named by parameter `:name`. If `:name` is not specified, `:
|
1220
|
+
named by parameter `:name`. If `:name` is not specified, `:NAME` is used for the key.
|
1198
1221
|
|
1199
1222
|
### `to_long(*keep_keys)`
|
1200
1223
|
|
1201
|
-
Creates a 'long' (tidy) DataFrame from a 'wide' DataFrame.
|
1224
|
+
Creates a 'long' (may be tidy) DataFrame from a 'wide' DataFrame.
|
1202
1225
|
|
1203
1226
|
- Parameter `keep_keys` specifies the key names to keep.
|
1204
1227
|
|
@@ -1206,47 +1229,51 @@ When the option `keep_key: true` used, the column `key` will be preserved.
|
|
1206
1229
|
import_cars.to_long(:Year)
|
1207
1230
|
|
1208
1231
|
# =>
|
1209
|
-
#<RedAmber::DataFrame : 25 x 3 Vectors,
|
1210
|
-
Year
|
1211
|
-
<uint16> <
|
1212
|
-
|
1213
|
-
|
1214
|
-
|
1215
|
-
|
1216
|
-
|
1232
|
+
#<RedAmber::DataFrame : 25 x 3 Vectors, 0x0000000000011864>
|
1233
|
+
Year NAME VALUE
|
1234
|
+
<uint16> <string> <uint32>
|
1235
|
+
0 2017 Audi 28336
|
1236
|
+
1 2017 BMW 52527
|
1237
|
+
2 2017 BMW_MINI 25427
|
1238
|
+
3 2017 Mercedes-Benz 68221
|
1239
|
+
4 2017 VW 49040
|
1217
1240
|
: : : :
|
1218
|
-
|
1219
|
-
|
1220
|
-
|
1241
|
+
22 2021 BMW_MINI 18211
|
1242
|
+
23 2021 Mercedes-Benz 51722
|
1243
|
+
24 2021 VW 35215
|
1221
1244
|
```
|
1222
1245
|
|
1223
1246
|
- Option `:name` is the key of the column which came **from key names**.
|
1247
|
+
The default value is `:NAME` if it is not specified.
|
1224
1248
|
- Option `:value` is the key of the column which came **from values**.
|
1249
|
+
The default value is `:VALUE` if it is not specified.
|
1225
1250
|
|
1226
1251
|
```ruby
|
1227
1252
|
import_cars.to_long(:Year, name: :Manufacturer, value: :Num_of_imported)
|
1228
1253
|
|
1229
1254
|
# =>
|
1230
|
-
#<RedAmber::DataFrame : 25 x 3 Vectors,
|
1255
|
+
#<RedAmber::DataFrame : 25 x 3 Vectors, 0x000000000001359c>
|
1231
1256
|
Year Manufacturer Num_of_imported
|
1232
|
-
<uint16> <
|
1233
|
-
|
1234
|
-
|
1235
|
-
|
1236
|
-
|
1237
|
-
|
1257
|
+
<uint16> <string> <uint32>
|
1258
|
+
0 2017 Audi 28336
|
1259
|
+
1 2017 BMW 52527
|
1260
|
+
2 2017 BMW_MINI 25427
|
1261
|
+
3 2017 Mercedes-Benz 68221
|
1262
|
+
4 2017 VW 49040
|
1238
1263
|
: : : :
|
1239
|
-
|
1240
|
-
|
1241
|
-
|
1264
|
+
22 2021 BMW_MINI 18211
|
1265
|
+
23 2021 Mercedes-Benz 51722
|
1266
|
+
24 2021 VW 35215
|
1242
1267
|
```
|
1243
1268
|
|
1244
1269
|
### `to_wide`
|
1245
1270
|
|
1246
|
-
Creates a 'wide' (messy) DataFrame from a 'long' DataFrame.
|
1271
|
+
Creates a 'wide' (may be messy) DataFrame from a 'long' DataFrame.
|
1247
1272
|
|
1248
1273
|
- Option `:name` is the key of the column which will be expanded **to key names**.
|
1274
|
+
The default value is `:NAME` if it is not specified.
|
1249
1275
|
- Option `:value` is the key of the column which will be expanded **to values**.
|
1276
|
+
The default value is `:VALUE` if it is not specified.
|
1250
1277
|
|
1251
1278
|
```ruby
|
1252
1279
|
import_cars.to_long(:Year).to_wide
|
@@ -1257,20 +1284,286 @@ When the option `keep_key: true` used, the column `key` will be preserved.
|
|
1257
1284
|
#<RedAmber::DataFrame : 5 x 6 Vectors, 0x000000000000f0f0>
|
1258
1285
|
Year Audi BMW BMW_MINI Mercedes-Benz VW
|
1259
1286
|
<uint16> <uint16> <uint16> <uint16> <uint32> <uint16>
|
1260
|
-
|
1261
|
-
|
1262
|
-
|
1263
|
-
|
1264
|
-
|
1265
|
-
|
1266
|
-
# == import_cars
|
1287
|
+
0 2017 28336 52527 25427 68221 49040
|
1288
|
+
1 2018 26473 50982 25984 67554 51961
|
1289
|
+
2 2019 24222 46814 23813 66553 46794
|
1290
|
+
3 2020 22304 35712 20196 57041 36576
|
1291
|
+
4 2021 22535 35905 18211 51722 35215
|
1267
1292
|
```
|
1268
1293
|
|
1269
1294
|
## Combine
|
1270
1295
|
|
1271
|
-
|
1296
|
+
### `join`
|
1297
|
+
![dataframe joining image](doc/../image/dataframe/join.png)
|
1298
|
+
|
1299
|
+
You should use specific `*_join` methods below.
|
1300
|
+
|
1301
|
+
- `other` is a DataFrame or an Arrow::Table.
|
1302
|
+
- `join_keys` are the keys shared by self and other that are used to match records.
|
1303
|
+
- If `join_keys` are empty, common keys in self and other are chosen (natural join).
|
1304
|
+
- If self and other have common keys other than `join_keys`, the duplicated keys are renamed with `suffix`.
|
1305
|
+
|
1306
|
+
```ruby
|
1307
|
+
df = DataFrame.new(
|
1308
|
+
KEY: %w[A B C],
|
1309
|
+
X1: [1, 2, 3]
|
1310
|
+
)
|
1311
|
+
#=>
|
1312
|
+
#<RedAmber::DataFrame : 3 x 2 Vectors, 0x0000000000012a70>
|
1313
|
+
KEY X1
|
1314
|
+
<string> <uint8>
|
1315
|
+
0 A 1
|
1316
|
+
1 B 2
|
1317
|
+
2 C 3
|
1318
|
+
|
1319
|
+
other = DataFrame.new(
|
1320
|
+
KEY: %w[A B D],
|
1321
|
+
X2: [true, false, nil]
|
1322
|
+
)
|
1323
|
+
#=>
|
1324
|
+
#<RedAmber::DataFrame : 3 x 2 Vectors, 0x0000000000017034>
|
1325
|
+
KEY X2
|
1326
|
+
<string> <boolean>
|
1327
|
+
0 A true
|
1328
|
+
1 B false
|
1329
|
+
2 D (nil)
|
1330
|
+
```
|
1331
|
+
|
1332
|
+
#### Mutating joins
|
1333
|
+
|
1334
|
+
##### `inner_join(other, join_keys = nil, suffix: '.1')`
|
1335
|
+
|
1336
|
+
Join data, leaving only the matching records.
|
1337
|
+
|
1338
|
+
```ruby
|
1339
|
+
df.inner_join(other, :KEY)
|
1340
|
+
#=>
|
1341
|
+
#<RedAmber::DataFrame : 2 x 3 Vectors, 0x000000000001e2bc>
|
1342
|
+
KEY X1 X2
|
1343
|
+
<string> <uint8> <boolean>
|
1344
|
+
0 A 1 true
|
1345
|
+
1 B 2 false
|
1346
|
+
```
|
1347
|
+
|
1348
|
+
##### `full_join(other, join_keys = nil, suffix: '.1')`
|
1349
|
+
|
1350
|
+
Join data, leaving all records.
|
1351
|
+
|
1352
|
+
```ruby
|
1353
|
+
df.full_join(other, :KEY)
|
1354
|
+
#=>
|
1355
|
+
#<RedAmber::DataFrame : 4 x 3 Vectors, 0x0000000000029fcc>
|
1356
|
+
KEY X1 X2
|
1357
|
+
<string> <uint8> <boolean>
|
1358
|
+
0 A 1 true
|
1359
|
+
1 B 2 false
|
1360
|
+
2 C 3 (nil)
|
1361
|
+
3 D (nil) (nil)
|
1362
|
+
```
|
1272
1363
|
|
1273
|
-
|
1364
|
+
##### `left_join(other, join_keys = nil, suffix: '.1')`
|
1365
|
+
|
1366
|
+
Join matching values to self from other.
|
1367
|
+
|
1368
|
+
```ruby
|
1369
|
+
df.left_join(other, :KEY)
|
1370
|
+
#=>
|
1371
|
+
#<RedAmber::DataFrame : 3 x 3 Vectors, 0x0000000000029fcc>
|
1372
|
+
KEY X1 X2
|
1373
|
+
<string> <uint8> <boolean>
|
1374
|
+
0 A 1 true
|
1375
|
+
1 B 2 false
|
1376
|
+
2 C 3 (nil)
|
1377
|
+
```
|
1378
|
+
|
1379
|
+
##### `right_join(other, join_keys = nil, suffix: '.1')`
|
1380
|
+
|
1381
|
+
Join matching values from self to other.
|
1382
|
+
|
1383
|
+
```ruby
|
1384
|
+
df.right_join(other, :KEY)
|
1385
|
+
#=>
|
1386
|
+
#<RedAmber::DataFrame : 3 x 3 Vectors, 0x0000000000029fcc>
|
1387
|
+
KEY X1 X2
|
1388
|
+
<string> <uint8> <boolean>
|
1389
|
+
0 A 1 true
|
1390
|
+
1 B 2 false
|
1391
|
+
2 D (nil) (nil)
|
1392
|
+
```
|
1393
|
+
|
1394
|
+
#### Filtering joins
|
1395
|
+
|
1396
|
+
##### `semi_join(other, join_keys = nil, suffix: '.1')`
|
1397
|
+
|
1398
|
+
Return records of self that have a match in other.
|
1399
|
+
|
1400
|
+
```ruby
|
1401
|
+
df.semi_join(other, :KEY)
|
1402
|
+
#=>
|
1403
|
+
#<RedAmber::DataFrame : 2 x 2 Vectors, 0x0000000000029fcc>
|
1404
|
+
KEY X1
|
1405
|
+
<string> <uint8>
|
1406
|
+
0 A 1
|
1407
|
+
1 B 2
|
1408
|
+
```
|
1409
|
+
|
1410
|
+
##### `anti_join(other, join_keys = nil, suffix: '.1')`
|
1411
|
+
|
1412
|
+
Return records of self that do not have a match in other.
|
1413
|
+
|
1414
|
+
```ruby
|
1415
|
+
df.anti_join(other, :KEY)
|
1416
|
+
#=>
|
1417
|
+
#<RedAmber::DataFrame : 1 x 2 Vectors, 0x0000000000029fcc>
|
1418
|
+
KEY X1
|
1419
|
+
<string> <uint8>
|
1420
|
+
0 C 3
|
1421
|
+
```
|
1422
|
+
|
1423
|
+
## Set operations
|
1424
|
+
![dataframe set and binding image](doc/../image/dataframe/set_and_bind.png)
|
1425
|
+
|
1426
|
+
Keys in self and other must be the same in set operations.
|
1427
|
+
|
1428
|
+
```ruby
|
1429
|
+
df = DataFrame.new(
|
1430
|
+
KEY1: %w[A B C],
|
1431
|
+
KEY2: [1, 2, 3]
|
1432
|
+
)
|
1433
|
+
#=>
|
1434
|
+
#<RedAmber::DataFrame : 3 x 2 Vectors, 0x0000000000012a70>
|
1435
|
+
KEY1 KEY2
|
1436
|
+
<string> <uint8>
|
1437
|
+
0 A 1
|
1438
|
+
1 B 2
|
1439
|
+
2 C 3
|
1440
|
+
|
1441
|
+
other = DataFrame.new(
|
1442
|
+
KEY1: %w[A B D],
|
1443
|
+
KEY2: [1, 4, 5]
|
1444
|
+
)
|
1445
|
+
#=>
|
1446
|
+
#<RedAmber::DataFrame : 3 x 2 Vectors, 0x0000000000017034>
|
1447
|
+
KEY1 KEY2
|
1448
|
+
<string> <uint8>
|
1449
|
+
0 A 1
|
1450
|
+
1 B 4
|
1451
|
+
2 D 5
|
1452
|
+
```
|
1453
|
+
|
1454
|
+
##### `intersect(other)`
|
1455
|
+
|
1456
|
+
Select records appearing in both self and other.
|
1457
|
+
|
1458
|
+
```ruby
|
1459
|
+
df.intersect(other)
|
1460
|
+
#=>
|
1461
|
+
#<RedAmber::DataFrame : 1 x 2 Vectors, 0x0000000000029fcc>
|
1462
|
+
KEY1 KEY2
|
1463
|
+
<string> <uint8>
|
1464
|
+
0 A 1
|
1465
|
+
```
|
1466
|
+
|
1467
|
+
##### `union(other)`
|
1468
|
+
|
1469
|
+
Select records appearing in self or other.
|
1470
|
+
|
1471
|
+
```ruby
|
1472
|
+
df.union(other)
|
1473
|
+
#=>
|
1474
|
+
#<RedAmber::DataFrame : 5 x 2 Vectors, 0x0000000000029fcc>
|
1475
|
+
KEY1 KEY2
|
1476
|
+
<string> <uint8>
|
1477
|
+
0 A 1
|
1478
|
+
1 B 2
|
1479
|
+
2 C 3
|
1480
|
+
3 B 4
|
1481
|
+
4 D 5
|
1482
|
+
```
|
1483
|
+
|
1484
|
+
##### `difference(other)`
|
1485
|
+
|
1486
|
+
Select records appearing in self but not in other.
|
1487
|
+
|
1488
|
+
It has an alias `setdiff`.
|
1489
|
+
|
1490
|
+
```ruby
|
1491
|
+
df.difference(other)
|
1492
|
+
#=>
|
1493
|
+
#<RedAmber::DataFrame : 2 x 2 Vectors, 0x0000000000029fcc>
|
1494
|
+
KEY1 KEY2
|
1495
|
+
<string> <uint8>
|
1496
|
+
0 B 2
|
1497
|
+
1 C 3
|
1498
|
+
```
|
1499
|
+
|
1500
|
+
## Binding
|
1501
|
+
|
1502
|
+
### `concatenate(other)`
|
1503
|
+
|
1504
|
+
Concatenate another DataFrame or Table onto the bottom of self. The shape and data type of other must be the same as self.
|
1505
|
+
|
1506
|
+
The alias is `concat`.
|
1507
|
+
|
1508
|
+
An array of DataFrames or Tables is also acceptable as other.
|
1509
|
+
|
1510
|
+
```ruby
|
1511
|
+
df
|
1512
|
+
#=>
|
1513
|
+
#<RedAmber::DataFrame : 2 x 2 Vectors, 0x0000000000022cb8>
|
1514
|
+
x y
|
1515
|
+
<uint8> <string>
|
1516
|
+
0 1 A
|
1517
|
+
1 2 B
|
1518
|
+
|
1519
|
+
other
|
1520
|
+
#=>
|
1521
|
+
#<RedAmber::DataFrame : 2 x 2 Vectors, 0x000000000001f6d0>
|
1522
|
+
x y
|
1523
|
+
<uint8> <string>
|
1524
|
+
0 3 C
|
1525
|
+
1 4 D
|
1526
|
+
|
1527
|
+
df.concatenate(other)
|
1528
|
+
#=>
|
1529
|
+
#<RedAmber::DataFrame : 4 x 2 Vectors, 0x0000000000022574>
|
1530
|
+
x y
|
1531
|
+
<uint8> <string>
|
1532
|
+
0 1 A
|
1533
|
+
1 2 B
|
1534
|
+
2 3 C
|
1535
|
+
3 4 D
|
1536
|
+
```
|
1537
|
+
|
1538
|
+
### `merge(other)`
|
1539
|
+
|
1540
|
+
Merge the columns of another DataFrame or Table onto the right side of self. The number of records in other must be the same as in self.
|
1541
|
+
|
1542
|
+
```ruby
|
1543
|
+
df
|
1544
|
+
#=>
|
1545
|
+
#<RedAmber::DataFrame : 2 x 2 Vectors, 0x0000000000009150>
|
1546
|
+
x y
|
1547
|
+
<uint8> <uint8>
|
1548
|
+
0 1 3
|
1549
|
+
1 2 4
|
1550
|
+
|
1551
|
+
other
|
1552
|
+
#=>
|
1553
|
+
#<RedAmber::DataFrame : 2 x 2 Vectors, 0x0000000000008a0c>
|
1554
|
+
a b
|
1555
|
+
<string> <string>
|
1556
|
+
0 A C
|
1557
|
+
1 B D
|
1558
|
+
|
1559
|
+
df.merge(other)
|
1560
|
+
#=>
|
1561
|
+
#<RedAmber::DataFrame : 2 x 4 Vectors, 0x000000000000cb70>
|
1562
|
+
x y a b
|
1563
|
+
<uint8> <uint8> <string> <string>
|
1564
|
+
0 1 3 A C
|
1565
|
+
1 2 4 B D
|
1566
|
+
```
|
1274
1567
|
|
1275
1568
|
## Encoding
|
1276
1569
|
|