red_amber 0.2.2 → 0.2.3
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.rubocop.yml +12 -0
- data/CHANGELOG.md +114 -31
- data/Gemfile +4 -2
- data/README.md +41 -25
- data/benchmark/basic.yml +79 -0
- data/benchmark/combine.yml +63 -0
- data/benchmark/drop_nil.yml +15 -3
- data/benchmark/group.yml +33 -0
- data/benchmark/reshape.yml +27 -0
- data/benchmark/{csv_load_penguins.yml → rover/csv_load_penguins.yml} +3 -3
- data/benchmark/rover/flights.yml +23 -0
- data/benchmark/rover/penguins.yml +23 -0
- data/benchmark/rover/planes.yml +23 -0
- data/benchmark/rover/weather.yml +23 -0
- data/doc/DataFrame.md +332 -53
- data/doc/Vector.md +3 -0
- data/doc/image/dataframe/join.png +0 -0
- data/doc/image/dataframe/set_and_bind.png +0 -0
- data/doc/image/dataframe_model.png +0 -0
- data/lib/red_amber/data_frame.rb +6 -5
- data/lib/red_amber/data_frame_combinable.rb +283 -0
- data/lib/red_amber/data_frame_displayable.rb +2 -0
- data/lib/red_amber/data_frame_selectable.rb +9 -9
- data/lib/red_amber/data_frame_variable_operation.rb +4 -4
- data/lib/red_amber/group.rb +99 -18
- data/lib/red_amber/helper.rb +1 -13
- data/lib/red_amber/vector.rb +7 -0
- data/lib/red_amber/vector_functions.rb +0 -8
- data/lib/red_amber/vector_updatable.rb +60 -65
- data/lib/red_amber/version.rb +1 -1
- data/lib/red_amber.rb +1 -0
- data/red_amber.gemspec +1 -1
- metadata +21 -10
@@ -0,0 +1,23 @@
|
|
1
|
+
contexts:
|
2
|
+
- gems:
|
3
|
+
red_amber: 0.2.2
|
4
|
+
- name: HEAD
|
5
|
+
prelude: |
|
6
|
+
$LOAD_PATH.unshift(File.expand_path('lib'))
|
7
|
+
require 'red_amber'
|
8
|
+
|
9
|
+
prelude: |
|
10
|
+
require 'rover'
|
11
|
+
require 'datasets-arrow'
|
12
|
+
ds = Datasets::Rdatasets.new('nycflights13', 'flights')
|
13
|
+
df = RedAmber::DataFrame.new(ds)
|
14
|
+
rover = Rover::DataFrame.new(df.to_h)
|
15
|
+
group_keys = [:month, :origin]
|
16
|
+
summary_key = :air_time
|
17
|
+
|
18
|
+
benchmark:
|
19
|
+
'flights Group by Rover': |
|
20
|
+
rover.group(group_keys).count
|
21
|
+
|
22
|
+
'flights Group by RedAmber': |
|
23
|
+
df.group(group_keys).count
|
@@ -0,0 +1,23 @@
|
|
1
|
+
contexts:
|
2
|
+
- gems:
|
3
|
+
red_amber: 0.2.2
|
4
|
+
- name: HEAD
|
5
|
+
prelude: |
|
6
|
+
$LOAD_PATH.unshift(File.expand_path('lib'))
|
7
|
+
require 'red_amber'
|
8
|
+
|
9
|
+
prelude: |
|
10
|
+
require 'rover'
|
11
|
+
require 'datasets-arrow'
|
12
|
+
ds = Datasets::Penguins.new
|
13
|
+
df = RedAmber::DataFrame.new(ds)
|
14
|
+
rover = Rover::DataFrame.new(df.to_h)
|
15
|
+
group_keys = [:species, :island]
|
16
|
+
summary_key = :body_mass_g
|
17
|
+
|
18
|
+
benchmark:
|
19
|
+
'penguins Group by Rover': |
|
20
|
+
rover.group(group_keys).mean(summary_key)
|
21
|
+
|
22
|
+
'penguins Group by RedAmber': |
|
23
|
+
df.group(group_keys).mean(summary_key)
|
@@ -0,0 +1,23 @@
|
|
1
|
+
contexts:
|
2
|
+
- gems:
|
3
|
+
red_amber: 0.2.2
|
4
|
+
- name: HEAD
|
5
|
+
prelude: |
|
6
|
+
$LOAD_PATH.unshift(File.expand_path('lib'))
|
7
|
+
require 'red_amber'
|
8
|
+
|
9
|
+
prelude: |
|
10
|
+
require 'rover'
|
11
|
+
require 'datasets-arrow'
|
12
|
+
ds = Datasets::Rdatasets.new('nycflights13', 'planes')
|
13
|
+
df = RedAmber::DataFrame.new(ds)
|
14
|
+
rover = Rover::DataFrame.new(df.to_h)
|
15
|
+
group_keys = [:engines, :engine]
|
16
|
+
summary_key = :seats
|
17
|
+
|
18
|
+
benchmark:
|
19
|
+
'planes Group by Rover': |
|
20
|
+
rover.group(group_keys).mean(summary_key)
|
21
|
+
|
22
|
+
'planes Group by RedAmber': |
|
23
|
+
df.group(group_keys).mean(summary_key)
|
@@ -0,0 +1,23 @@
|
|
1
|
+
contexts:
|
2
|
+
- gems:
|
3
|
+
red_amber: 0.2.2
|
4
|
+
- name: HEAD
|
5
|
+
prelude: |
|
6
|
+
$LOAD_PATH.unshift(File.expand_path('lib'))
|
7
|
+
require 'red_amber'
|
8
|
+
|
9
|
+
prelude: |
|
10
|
+
require 'rover'
|
11
|
+
require 'datasets-arrow'
|
12
|
+
ds = Datasets::Rdatasets.new('nycflights13', 'weather')
|
13
|
+
df = RedAmber::DataFrame.new(ds)
|
14
|
+
rover = Rover::DataFrame.new(df.to_h)
|
15
|
+
group_keys = [:month, :origin]
|
16
|
+
summary_key = :temp
|
17
|
+
|
18
|
+
benchmark:
|
19
|
+
'weather Group by Rover': |
|
20
|
+
rover.group(group_keys).mean(summary_key)
|
21
|
+
|
22
|
+
'weather Group by RedAmber': |
|
23
|
+
df.group(group_keys).mean(summary_key)
|
data/doc/DataFrame.md
CHANGED
@@ -5,7 +5,8 @@ Class `RedAmber::DataFrame` represents 2D-data. A `DataFrame` consists with:
|
|
5
5
|
- A label is attached to `Vector`. We call it `key`.
|
6
6
|
- A `Vector` and associated `key` is grouped as a `variable`.
|
7
7
|
- `variable`s with same vector length are aligned and arranged to be a `DataFrame`.
|
8
|
-
- Each `
|
8
|
+
- Each `key` in a `DataFrame` must be unique.
|
9
|
+
- The values at the same position in each `Vector` of a `DataFrame` form a set of related data. We call it a `record` or an `observation`.
|
9
10
|
|
10
11
|
![dataframe model image](doc/../image/dataframe_model.png)
|
11
12
|
|
@@ -94,13 +95,13 @@ Class `RedAmber::DataFrame` represents 2D-data. A `DataFrame` consists with:
|
|
94
95
|
|
95
96
|
### `table`, `to_arrow`
|
96
97
|
|
97
|
-
-
|
98
|
+
- Returns Arrow::Table object in the DataFrame.
|
98
99
|
|
99
|
-
### `size`, `n_obs`, `n_rows`
|
100
|
+
### `size`, `n_records`, `n_obs`, `n_rows`
|
100
101
|
|
101
|
-
- Returns size of Vector (num of
|
102
|
-
|
103
|
-
### `n_keys`, `n_vars`, `n_cols`,
|
102
|
+
- Returns size of Vector (num of records).
|
103
|
+
|
104
|
+
### `n_keys`, `n_variables`, `n_vars`, `n_cols`,
|
104
105
|
|
105
106
|
- Returns num of keys (num of variables).
|
106
107
|
|
@@ -138,16 +139,7 @@ Class `RedAmber::DataFrame` represents 2D-data. A `DataFrame` consists with:
|
|
138
139
|
|
139
140
|
- Returns key names in an Array.
|
140
141
|
|
141
|
-
|
142
|
-
|
143
|
-
```ruby
|
144
|
-
# update numeric variables, another solution
|
145
|
-
df.assign do
|
146
|
-
vectors.each_with_object({}) do |vector, assigner|
|
147
|
-
assigner[vector.key] = vector * -1 if vector.numeric?
|
148
|
-
end
|
149
|
-
end
|
150
|
-
```
|
142
|
+
Each key must be unique in the DataFrame.
|
151
143
|
|
152
144
|
### `types`
|
153
145
|
|
@@ -161,9 +153,20 @@ Class `RedAmber::DataFrame` represents 2D-data. A `DataFrame` consists with:
|
|
161
153
|
|
162
154
|
- Returns an Array of Vectors.
|
163
155
|
|
156
|
+
When we use it, Vector#key is useful to get the key in the DataFrame.
|
157
|
+
|
158
|
+
```ruby
|
159
|
+
# update numeric variables, another solution
|
160
|
+
df.assign do
|
161
|
+
vectors.each_with_object({}) do |vector, assigner|
|
162
|
+
assigner[vector.key] = vector * -1 if vector.numeric?
|
163
|
+
end
|
164
|
+
end
|
165
|
+
```
|
166
|
+
|
164
167
|
### `indices`, `indexes`
|
165
168
|
|
166
|
-
- Returns indexes in
|
169
|
+
- Returns indexes in a Vector.
|
167
170
|
Accepts an option `start` as the first of indexes.
|
168
171
|
|
169
172
|
```ruby
|
@@ -171,15 +174,19 @@ Class `RedAmber::DataFrame` represents 2D-data. A `DataFrame` consists with:
|
|
171
174
|
df.indices
|
172
175
|
|
173
176
|
# =>
|
177
|
+
#<RedAmber::Vector(:uint8, size=5):0x0000000000013ed4>
|
174
178
|
[0, 1, 2, 3, 4]
|
175
179
|
|
176
180
|
df.indices(1)
|
177
181
|
|
178
182
|
# =>
|
183
|
+
#<RedAmber::Vector(:uint8, size=5):0x0000000000018fd8>
|
179
184
|
[1, 2, 3, 4, 5]
|
180
185
|
|
181
186
|
df.indices(:a)
|
187
|
+
|
182
188
|
# =>
|
189
|
+
#<RedAmber::Vector(:dictionary, size=5):0x000000000001bd50>
|
183
190
|
[:a, :b, :c, :d, :e]
|
184
191
|
```
|
185
192
|
|
@@ -275,6 +282,7 @@ penguins.to_rover
|
|
275
282
|
|
276
283
|
dataset = Datasets::Penguins.new
|
277
284
|
# (From 0.2.2) accepts any object that responds to `to_arrow`.
|
285
|
+
# If older, it should be `dataset.to_arrow` in the parentheses.
|
278
286
|
RedAmber::DataFrame.new(dataset).tdr
|
279
287
|
|
280
288
|
# =>
|
@@ -290,10 +298,11 @@ penguins.to_rover
|
|
290
298
|
6 :sex string 3 {"male"=>168, "female"=>165, nil=>11}
|
291
299
|
7 :year uint16 3 {2007=>110, 2008=>114, 2009=>120}
|
292
300
|
```
|
293
|
-
|
301
|
+
|
302
|
+
Options:
|
294
303
|
- limit: limit of variables to show. Default value is 10.
|
295
|
-
- tally: max level to use tally mode.
|
296
|
-
- elements: max num of element to show values in each
|
304
|
+
- tally: max level to use tally mode. Default value is 5.
|
305
|
+
- elements: max number of elements to show in each record. Default value is 5.
|
297
306
|
|
298
307
|
## Selecting
|
299
308
|
|
@@ -303,13 +312,13 @@ penguins.to_rover
|
|
303
312
|
- Keys in an Array: `df[:symbol1, "string", :symbol2]`
|
304
313
|
- Keys by indices: `df[df.keys[0]]`, `df[df.keys[1,2]]`, `df[df.keys[1..]]`
|
305
314
|
|
306
|
-
Key indeces
|
315
|
+
Key indices should be used via `keys[i]` because numbers are used to select records (rows). See next section.
|
307
316
|
|
308
317
|
- Keys by a Range:
|
309
318
|
|
310
|
-
If keys are able to represent by Range, it can be included in the arguments. See a example below.
|
319
|
+
If keys can be represented by a Range, the Range can be included in the arguments. See the example below.
|
311
320
|
|
312
|
-
- You can exchange the order of variables (columns).
|
321
|
+
- You can also exchange the order of variables (columns).
|
313
322
|
|
314
323
|
```ruby
|
315
324
|
hash = {a: [1, 2, 3], b: %w[A B C], c: [1.0, 2, 3]}
|
@@ -325,7 +334,7 @@ penguins.to_rover
|
|
325
334
|
2 C 3.0 3
|
326
335
|
```
|
327
336
|
|
328
|
-
If `#[]` represents single variable (column), it returns a Vector object.
|
337
|
+
If `#[]` represents a single variable (column), it returns a Vector object.
|
329
338
|
|
330
339
|
```ruby
|
331
340
|
df[:a]
|
@@ -334,6 +343,7 @@ penguins.to_rover
|
|
334
343
|
#<RedAmber::Vector(:uint8, size=3):0x000000000000f140>
|
335
344
|
[1, 2, 3]
|
336
345
|
```
|
346
|
+
|
337
347
|
Or `#v` method also returns a Vector for a key.
|
338
348
|
|
339
349
|
```ruby
|
@@ -344,18 +354,19 @@ penguins.to_rover
|
|
344
354
|
[1, 2, 3]
|
345
355
|
```
|
346
356
|
|
347
|
-
This may be useful to use in a block of DataFrame manipulation verbs. We can write `v(:a)` rather than `self[:a]` or `df[:a]`
|
357
|
+
This method may be useful to use in a block of DataFrame manipulation verbs. We can write `v(:a)` rather than `self[:a]` or `df[:a]`
|
348
358
|
|
349
|
-
### Select
|
359
|
+
### Select records (rows in a table) by `[]` as `[index]`, `[range]`, `[array]`
|
350
360
|
|
351
|
-
- Select a
|
352
|
-
- Select obs. by indeces in a Range: `df[1..2]`
|
361
|
+
- Select a record by index: `df[0]`
|
353
362
|
|
354
|
-
|
363
|
+
- Select records by indices in an Array: `df[1, 2]`
|
355
364
|
|
356
|
-
- Select
|
365
|
+
- Select records by indices in a Range: `df[1..2]`
|
357
366
|
|
358
|
-
-
|
367
|
+
An end-less or a begin-less Range can be used to represent indices.
|
368
|
+
|
369
|
+
- You can use indices in Float.
|
359
370
|
|
360
371
|
- Mixed case: `df[2, 0..]`
|
361
372
|
|
@@ -374,9 +385,9 @@ penguins.to_rover
|
|
374
385
|
3 3 C 3.0
|
375
386
|
```
|
376
387
|
|
377
|
-
- Select
|
388
|
+
- Select records by a boolean Array or a boolean RedAmber::Vector of the same size as self.
|
378
389
|
|
379
|
-
It returns a sub dataframe with
|
390
|
+
It returns a sub DataFrame containing the records where the boolean is true.
|
380
391
|
|
381
392
|
```ruby
|
382
393
|
# with the same dataframe `df` above
|
@@ -391,15 +402,15 @@ penguins.to_rover
|
|
391
402
|
1 1 A 1.0
|
392
403
|
```
|
393
404
|
|
394
|
-
### Select rows from top or from bottom
|
405
|
+
### Select records (rows) from top or from bottom
|
395
406
|
|
396
407
|
`head(n=5)`, `tail(n=5)`, `first(n=1)`, `last(n=1)`
|
397
408
|
|
398
409
|
## Sub DataFrame manipulations
|
399
410
|
|
400
|
-
### `pick ` - pick up variables
|
411
|
+
### `pick ` - pick up variables -
|
401
412
|
|
402
|
-
Pick up some
|
413
|
+
Pick up some variables (columns) to create a sub DataFrame.
|
403
414
|
|
404
415
|
![pick method image](doc/../image/dataframe/pick.png)
|
405
416
|
|
@@ -491,9 +502,9 @@ penguins.to_rover
|
|
491
502
|
343 49.9 16.1 213
|
492
503
|
```
|
493
504
|
|
494
|
-
### `drop ` -
|
505
|
+
### `drop ` - counterpart of pick -
|
495
506
|
|
496
|
-
Drop some
|
507
|
+
Drop some variables (columns) to create a remainder DataFrame.
|
497
508
|
|
498
509
|
![drop method image](doc/../image/dataframe/drop.png)
|
499
510
|
|
@@ -557,9 +568,9 @@ penguins.to_rover
|
|
557
568
|
[1, 2, 3]
|
558
569
|
```
|
559
570
|
|
560
|
-
### `slice ` -
|
571
|
+
### `slice ` - slice and select records -
|
561
572
|
|
562
|
-
Slice and select
|
573
|
+
Slice and select records (rows) to create a sub DataFrame.
|
563
574
|
|
564
575
|
![slice method image](doc/../image/dataframe/slice.png)
|
565
576
|
|
@@ -570,7 +581,7 @@ penguins.to_rover
|
|
570
581
|
Negative index from the tail like Ruby's Array is also acceptable.
|
571
582
|
|
572
583
|
```ruby
|
573
|
-
# returns 5
|
584
|
+
# returns 5 records at start and 5 records from end
|
574
585
|
penguins.slice(0...5, -5..-1)
|
575
586
|
|
576
587
|
# =>
|
@@ -665,9 +676,9 @@ penguins.to_rover
|
|
665
676
|
0 1 A 1.000000
|
666
677
|
```
|
667
678
|
|
668
|
-
### `remove`
|
679
|
+
### `remove` - counterpart of slice -
|
669
680
|
|
670
|
-
Slice and reject
|
681
|
+
Slice and reject records (rows) to create a remainder DataFrame.
|
671
682
|
|
672
683
|
![remove method image](doc/../image/dataframe/remove.png)
|
673
684
|
|
@@ -676,7 +687,7 @@ penguins.to_rover
|
|
676
687
|
`remove(indices)` accepts indices as arguments. Indices should be an Integer or a Range of Integer.
|
677
688
|
|
678
689
|
```ruby
|
679
|
-
# returns 6th to 339th
|
690
|
+
# returns 6th to 339th records
|
680
691
|
penguins.remove(0...5, -5..-1)
|
681
692
|
|
682
693
|
# =>
|
@@ -699,7 +710,7 @@ penguins.to_rover
|
|
699
710
|
`remove(booleans)` accepts booleans as an argument in an Array, a Vector or an Arrow::BooleanArray . Booleans must be same length as `size`.
|
700
711
|
|
701
712
|
```ruby
|
702
|
-
# remove all
|
713
|
+
# remove all records containing nil
|
703
714
|
removed = penguins.remove { vectors.map(&:is_nil).reduce(&:|) }
|
704
715
|
removed
|
705
716
|
|
@@ -785,7 +796,7 @@ penguins.to_rover
|
|
785
796
|
|
786
797
|
### `rename`
|
787
798
|
|
788
|
-
Rename keys (column names) to create a updated DataFrame.
|
799
|
+
Rename keys (variable/column names) to create an updated DataFrame.
|
789
800
|
|
790
801
|
![rename method image](doc/../image/dataframe/rename.png)
|
791
802
|
|
@@ -820,7 +831,7 @@ penguins.to_rover
|
|
820
831
|
|
821
832
|
### `assign`
|
822
833
|
|
823
|
-
Assign new or updated
|
834
|
+
Assign new or updated variables (columns) and create an updated DataFrame.
|
824
835
|
|
825
836
|
- Variables with new keys will append new columns from the right.
|
826
837
|
- Variables with existing keys will update corresponding vectors.
|
@@ -1009,7 +1020,7 @@ When the option `keep_key: true` used, the column `key` will be preserved.
|
|
1009
1020
|
|
1010
1021
|
### `sort`
|
1011
1022
|
|
1012
|
-
`sort` accepts parameters as sort_keys thanks to the
|
1023
|
+
`sort` accepts parameters as sort_keys thanks to the Red Arrow's feature.
|
1013
1024
|
- :key, "key" or "+key" denotes ascending order
|
1014
1025
|
- "-key" denotes descending order
|
1015
1026
|
|
@@ -1040,7 +1051,7 @@ When the option `keep_key: true` used, the column `key` will be preserved.
|
|
1040
1051
|
|
1041
1052
|
### `remove_nil`
|
1042
1053
|
|
1043
|
-
Remove any
|
1054
|
+
Remove any records containing nil.
|
1044
1055
|
|
1045
1056
|
## Grouping
|
1046
1057
|
|
@@ -1210,7 +1221,7 @@ When the option `keep_key: true` used, the column `key` will be preserved.
|
|
1210
1221
|
|
1211
1222
|
### `to_long(*keep_keys)`
|
1212
1223
|
|
1213
|
-
Creates a 'long' (tidy) DataFrame from a 'wide' DataFrame.
|
1224
|
+
Creates a 'long' (may be tidy) DataFrame from a 'wide' DataFrame.
|
1214
1225
|
|
1215
1226
|
- Parameter `keep_keys` specifies the key names to keep.
|
1216
1227
|
|
@@ -1257,7 +1268,7 @@ When the option `keep_key: true` used, the column `key` will be preserved.
|
|
1257
1268
|
|
1258
1269
|
### `to_wide`
|
1259
1270
|
|
1260
|
-
Creates a 'wide' (messy) DataFrame from a 'long' DataFrame.
|
1271
|
+
Creates a 'wide' (may be messy) DataFrame from a 'long' DataFrame.
|
1261
1272
|
|
1262
1273
|
- Option `:name` is the key of the column which will be expanded **to key names**.
|
1263
1274
|
The default value is `:NAME` if it is not specified.
|
@@ -1282,9 +1293,277 @@ When the option `keep_key: true` used, the column `key` will be preserved.
|
|
1282
1293
|
|
1283
1294
|
## Combine
|
1284
1295
|
|
1285
|
-
|
1296
|
+
### `join`
|
1297
|
+
![dataframe joining image](doc/../image/dataframe/join.png)
|
1298
|
+
|
1299
|
+
You should use specific `*_join` methods below.
|
1300
|
+
|
1301
|
+
- `other` is a DataFrame or an Arrow::Table.
|
1302
|
+
- `join_keys` are keys shared by self and other to match with them.
|
1303
|
+
- If `join_keys` are empty, common keys in self and other are chosen (natural join).
|
1304
|
+
- If there are common keys other than `join_keys`, the duplicated keys are renamed by `suffix`.
|
1305
|
+
|
1306
|
+
```ruby
|
1307
|
+
df = DataFrame.new(
|
1308
|
+
KEY: %w[A B C],
|
1309
|
+
X1: [1, 2, 3]
|
1310
|
+
)
|
1311
|
+
#=>
|
1312
|
+
#<RedAmber::DataFrame : 3 x 2 Vectors, 0x0000000000012a70>
|
1313
|
+
KEY X1
|
1314
|
+
<string> <uint8>
|
1315
|
+
0 A 1
|
1316
|
+
1 B 2
|
1317
|
+
2 C 3
|
1318
|
+
|
1319
|
+
other = DataFrame.new(
|
1320
|
+
KEY: %w[A B D],
|
1321
|
+
X2: [true, false, nil]
|
1322
|
+
)
|
1323
|
+
#=>
|
1324
|
+
#<RedAmber::DataFrame : 3 x 2 Vectors, 0x0000000000017034>
|
1325
|
+
KEY X2
|
1326
|
+
<string> <boolean>
|
1327
|
+
0 A true
|
1328
|
+
1 B false
|
1329
|
+
2 D (nil)
|
1330
|
+
```
|
1331
|
+
|
1332
|
+
#### Mutating joins
|
1333
|
+
|
1334
|
+
##### `inner_join(other, join_keys = nil, suffix: '.1')`
|
1335
|
+
|
1336
|
+
Join data, leaving only the matching records.
|
1337
|
+
|
1338
|
+
```ruby
|
1339
|
+
df.inner_join(other, :KEY)
|
1340
|
+
#=>
|
1341
|
+
#<RedAmber::DataFrame : 2 x 3 Vectors, 0x000000000001e2bc>
|
1342
|
+
KEY X1 X2
|
1343
|
+
<string> <uint8> <boolean>
|
1344
|
+
0 A 1 true
|
1345
|
+
1 B 2 false
|
1346
|
+
```
|
1347
|
+
|
1348
|
+
##### `full_join(other, join_keys = nil, suffix: '.1')`
|
1349
|
+
|
1350
|
+
Join data, leaving all records.
|
1351
|
+
|
1352
|
+
```ruby
|
1353
|
+
df.full_join(other, :KEY)
|
1354
|
+
#=>
|
1355
|
+
#<RedAmber::DataFrame : 4 x 3 Vectors, 0x0000000000029fcc>
|
1356
|
+
KEY X1 X2
|
1357
|
+
<string> <uint8> <boolean>
|
1358
|
+
0 A 1 true
|
1359
|
+
1 B 2 false
|
1360
|
+
2 C 3 (nil)
|
1361
|
+
3 D (nil) (nil)
|
1362
|
+
```
|
1363
|
+
|
1364
|
+
##### `left_join(other, join_keys = nil, suffix: '.1')`
|
1365
|
+
|
1366
|
+
Join matching values to self from other.
|
1367
|
+
|
1368
|
+
```ruby
|
1369
|
+
df.left_join(other, :KEY)
|
1370
|
+
#=>
|
1371
|
+
#<RedAmber::DataFrame : 3 x 3 Vectors, 0x0000000000029fcc>
|
1372
|
+
KEY X1 X2
|
1373
|
+
<string> <uint8> <boolean>
|
1374
|
+
0 A 1 true
|
1375
|
+
1 B 2 false
|
1376
|
+
2 C 3 (nil)
|
1377
|
+
```
|
1378
|
+
|
1379
|
+
##### `right_join(other, join_keys = nil, suffix: '.1')`
|
1380
|
+
|
1381
|
+
Join matching values from self to other.
|
1286
1382
|
|
1287
|
-
|
1383
|
+
```ruby
|
1384
|
+
df.right_join(other, :KEY)
|
1385
|
+
#=>
|
1386
|
+
#<RedAmber::DataFrame : 2 x 3 Vectors, 0x0000000000029fcc>
|
1387
|
+
KEY X1 X2
|
1388
|
+
<string> <uint8> <boolean>
|
1389
|
+
0 A 1 true
|
1390
|
+
1 B 2 false
|
1391
|
+
2 D (nil) (nil)
|
1392
|
+
```
|
1393
|
+
|
1394
|
+
#### Filtering join
|
1395
|
+
|
1396
|
+
##### `semi_join(other, join_keys = nil, suffix: '.1')`
|
1397
|
+
|
1398
|
+
Return records of self that have a match in other.
|
1399
|
+
|
1400
|
+
```ruby
|
1401
|
+
df.semi_join(other, :KEY)
|
1402
|
+
#=>
|
1403
|
+
#<RedAmber::DataFrame : 2 x 2 Vectors, 0x0000000000029fcc>
|
1404
|
+
KEY X1
|
1405
|
+
<string> <uint8>
|
1406
|
+
0 A 1
|
1407
|
+
1 B 2
|
1408
|
+
```
|
1409
|
+
|
1410
|
+
##### `anti_join(other, join_keys = nil, suffix: '.1')`
|
1411
|
+
|
1412
|
+
Return records of self that do not have a match in other.
|
1413
|
+
|
1414
|
+
```ruby
|
1415
|
+
df.anti_join(other, :KEY)
|
1416
|
+
#=>
|
1417
|
+
#<RedAmber::DataFrame : 1 x 2 Vectors, 0x0000000000029fcc>
|
1418
|
+
KEY X1
|
1419
|
+
<string> <uint8>
|
1420
|
+
0 C 3
|
1421
|
+
```
|
1422
|
+
|
1423
|
+
## Set operations
|
1424
|
+
![dataframe set and binding image](doc/../image/dataframe/set_and_bind.png)
|
1425
|
+
|
1426
|
+
Keys in self and other must be the same in set operations.
|
1427
|
+
|
1428
|
+
```ruby
|
1429
|
+
df = DataFrame.new(
|
1430
|
+
KEY1: %w[A B C],
|
1431
|
+
KEY2: [1, 2, 3]
|
1432
|
+
)
|
1433
|
+
#=>
|
1434
|
+
#<RedAmber::DataFrame : 3 x 2 Vectors, 0x0000000000012a70>
|
1435
|
+
KEY1 KEY2
|
1436
|
+
<string> <uint8>
|
1437
|
+
0 A 1
|
1438
|
+
1 B 2
|
1439
|
+
2 C 3
|
1440
|
+
|
1441
|
+
other = DataFrame.new(
|
1442
|
+
KEY1: %w[A B D],
|
1443
|
+
KEY2: [1, 4, 5]
|
1444
|
+
)
|
1445
|
+
#=>
|
1446
|
+
#<RedAmber::DataFrame : 3 x 2 Vectors, 0x0000000000017034>
|
1447
|
+
KEY1 KEY2
|
1448
|
+
<string> <uint8>
|
1449
|
+
0 A 1
|
1450
|
+
1 B 4
|
1451
|
+
2 D 5
|
1452
|
+
```
|
1453
|
+
|
1454
|
+
##### `intersect(other)`
|
1455
|
+
|
1456
|
+
Select records appearing in both self and other.
|
1457
|
+
|
1458
|
+
```ruby
|
1459
|
+
df.intersect(other)
|
1460
|
+
#=>
|
1461
|
+
#<RedAmber::DataFrame : 1 x 2 Vectors, 0x0000000000029fcc>
|
1462
|
+
KEY1 KEY2
|
1463
|
+
<string> <uint8>
|
1464
|
+
0 A 1
|
1465
|
+
```
|
1466
|
+
|
1467
|
+
##### `union(other)`
|
1468
|
+
|
1469
|
+
Select records appearing in self or other.
|
1470
|
+
|
1471
|
+
```ruby
|
1472
|
+
df.union(other)
|
1473
|
+
#=>
|
1474
|
+
#<RedAmber::DataFrame : 5 x 2 Vectors, 0x0000000000029fcc>
|
1475
|
+
KEY1 KEY2
|
1476
|
+
<string> <uint8>
|
1477
|
+
0 A 1
|
1478
|
+
1 B 2
|
1479
|
+
2 C 3
|
1480
|
+
3 B 4
|
1481
|
+
4 D 5
|
1482
|
+
```
|
1483
|
+
|
1484
|
+
##### `difference(other)`
|
1485
|
+
|
1486
|
+
Select records appearing in self but not in other.
|
1487
|
+
|
1488
|
+
It has an alias `setdiff`.
|
1489
|
+
|
1490
|
+
```ruby
|
1491
|
+
df.difference(other)
|
1492
|
+
#=>
|
1493
|
+
#<RedAmber::DataFrame : 2 x 2 Vectors, 0x0000000000029fcc>
|
1494
|
+
KEY1 KEY2
|
1495
|
+
<string> <uint8>
|
1496
|
+
1 B 2
|
1497
|
+
2 C 3
|
1498
|
+
```
|
1499
|
+
|
1500
|
+
## Binding
|
1501
|
+
|
1502
|
+
### `concatenate(other)`
|
1503
|
+
|
1504
|
+
Concatenate another DataFrame or Table onto the bottom of self. The shape and data type of other must be the same as self.
|
1505
|
+
|
1506
|
+
The alias is `concat`.
|
1507
|
+
|
1508
|
+
An array of DataFrames or Tables is also acceptable as other.
|
1509
|
+
|
1510
|
+
```ruby
|
1511
|
+
df
|
1512
|
+
#=>
|
1513
|
+
#<RedAmber::DataFrame : 2 x 2 Vectors, 0x0000000000022cb8>
|
1514
|
+
x y
|
1515
|
+
<uint8> <string>
|
1516
|
+
0 1 A
|
1517
|
+
1 2 B
|
1518
|
+
|
1519
|
+
other
|
1520
|
+
#=>
|
1521
|
+
#<RedAmber::DataFrame : 2 x 2 Vectors, 0x000000000001f6d0>
|
1522
|
+
x y
|
1523
|
+
<uint8> <string>
|
1524
|
+
0 3 C
|
1525
|
+
1 4 D
|
1526
|
+
|
1527
|
+
df.concatenate(other)
|
1528
|
+
#=>
|
1529
|
+
#<RedAmber::DataFrame : 4 x 2 Vectors, 0x0000000000022574>
|
1530
|
+
x y
|
1531
|
+
<uint8> <string>
|
1532
|
+
0 1 A
|
1533
|
+
1 2 B
|
1534
|
+
2 3 C
|
1535
|
+
3 4 D
|
1536
|
+
```
|
1537
|
+
|
1538
|
+
### `merge(other)`
|
1539
|
+
|
1540
|
+
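Merge another DataFrame or Table onto the right side of self, appending its variables (columns). The number of records in other must be the same as self, and the keys of self and other must not overlap because each key in a DataFrame must be unique.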
|
1541
|
+
|
1542
|
+
```ruby
|
1543
|
+
df
|
1544
|
+
#=>
|
1545
|
+
#<RedAmber::DataFrame : 2 x 2 Vectors, 0x0000000000009150>
|
1546
|
+
x y
|
1547
|
+
<uint8> <uint8>
|
1548
|
+
0 1 3
|
1549
|
+
1 2 4
|
1550
|
+
|
1551
|
+
other
|
1552
|
+
#=>
|
1553
|
+
#<RedAmber::DataFrame : 2 x 2 Vectors, 0x0000000000008a0c>
|
1554
|
+
a b
|
1555
|
+
<string> <string>
|
1556
|
+
0 A C
|
1557
|
+
1 B D
|
1558
|
+
|
1559
|
+
df.merge(other)
|
1560
|
+
#=>
|
1561
|
+
#<RedAmber::DataFrame : 2 x 4 Vectors, 0x000000000000cb70>
|
1562
|
+
x y a b
|
1563
|
+
<uint8> <uint8> <string> <string>
|
1564
|
+
0 1 3 A C
|
1565
|
+
1 2 4 B D
|
1566
|
+
```
|
1288
1567
|
|
1289
1568
|
## Encoding
|
1290
1569
|
|
data/doc/Vector.md
CHANGED
@@ -24,6 +24,9 @@ Class `RedAmber::Vector` represents a series of data in the DataFrame.
|
|
24
24
|
vector = Vector.new(1..3)
|
25
25
|
# or
|
26
26
|
vector = Vector.new(Arrow::Array.new([1, 2, 3]))
|
27
|
+
# or
|
28
|
+
require 'arrow-numo-narray'
|
29
|
+
vector = Vector.new(Numo::Int8[1, 2, 3])
|
27
30
|
|
28
31
|
# =>
|
29
32
|
#<RedAmber::Vector(:uint8, size=3):0x000000000000f514>
|
Binary file
|
Binary file
|
Binary file
|