red_amber 0.1.1 → 0.1.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.rubocop.yml +26 -10
- data/.rubocop_todo.yml +1 -7
- data/CHANGELOG.md +109 -8
- data/README.md +66 -279
- data/doc/DataFrame.md +690 -0
- data/doc/Vector.md +195 -0
- data/doc/image/TDR_operations.pdf +0 -0
- data/doc/image/arrow_table_new.png +0 -0
- data/doc/image/dataframe/assign.png +0 -0
- data/doc/image/dataframe/drop.png +0 -0
- data/doc/image/dataframe/pick.png +0 -0
- data/doc/image/dataframe/remove.png +0 -0
- data/doc/image/dataframe/rename.png +0 -0
- data/doc/image/dataframe/slice.png +0 -0
- data/doc/image/dataframe_model.png +0 -0
- data/doc/image/example_in_red_arrow.png +0 -0
- data/doc/image/tdr.png +0 -0
- data/doc/image/tdr_and_table.png +0 -0
- data/doc/image/tidy_data_in_TDR.png +0 -0
- data/doc/image/vector/binary_element_wise.png +0 -0
- data/doc/image/vector/unary_aggregation.png +0 -0
- data/doc/image/vector/unary_aggregation_w_option.png +0 -0
- data/doc/image/vector/unary_element_wise.png +0 -0
- data/doc/tdr.md +53 -0
- data/doc/tdr_ja.md +53 -0
- data/lib/red_amber/data_frame.rb +42 -21
- data/lib/red_amber/data_frame_displayable.rb +131 -0
- data/lib/red_amber/data_frame_helper.rb +64 -0
- data/lib/red_amber/data_frame_observation_operation.rb +72 -0
- data/lib/red_amber/data_frame_selectable.rb +29 -35
- data/lib/red_amber/data_frame_variable_operation.rb +133 -0
- data/lib/red_amber/vector.rb +35 -2
- data/lib/red_amber/vector_functions.rb +134 -58
- data/lib/red_amber/version.rb +1 -1
- data/lib/red_amber.rb +4 -1
- data/red_amber.gemspec +5 -5
- metadata +35 -10
- data/lib/red_amber/data_frame_output.rb +0 -116
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 6ceace9db54b82c03ccf00fcd1b7bf2af57d94ea4e54183dc6af1da47e21ef00
|
|
4
|
+
data.tar.gz: f30578dcec45fd5efec9219c6438fd0108a0690b1cd69b1c398dffacd38aeba1
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: ee26fd212d0cb0758bc4611c5b43b302fe5c1b958239b5a9ac81ee09e936bdded733a719507e24e5434c33fc5d7ece43c973dd66d51413f23cc435ea0bd7570c
|
|
7
|
+
data.tar.gz: 674f56a11ddf906f608ecf7d7c852bec654a749e9052092553d19be967072d5acec95a096fbecc60ffd4b33fad3f4322354d93fade67230078fff15b6b7398dd
|
data/.rubocop.yml
CHANGED
|
@@ -45,7 +45,7 @@ Lint/BinaryOperatorWithIdenticalOperands:
|
|
|
45
45
|
|
|
46
46
|
# Max: 120
|
|
47
47
|
Layout/LineLength:
|
|
48
|
-
Max:
|
|
48
|
+
Max: 118
|
|
49
49
|
Exclude:
|
|
50
50
|
- 'test/**/*'
|
|
51
51
|
|
|
@@ -53,16 +53,18 @@ Layout/LineLength:
|
|
|
53
53
|
# 18..30 unsatisfactory
|
|
54
54
|
# > 30 dangerous
|
|
55
55
|
Metrics/AbcSize:
|
|
56
|
-
Max:
|
|
56
|
+
Max: 23
|
|
57
57
|
Exclude:
|
|
58
|
-
- 'lib/red_amber/
|
|
58
|
+
- 'lib/red_amber/data_frame_displayable.rb' # Max: 55
|
|
59
|
+
- 'lib/red_amber/data_frame_selectable.rb' # Max: 27
|
|
60
|
+
- 'lib/red_amber/data_frame_observation_operation.rb' # Max: 29
|
|
61
|
+
- 'lib/red_amber/data_frame_variable_operation.rb' # Max: 26
|
|
59
62
|
|
|
60
63
|
# Max: 25
|
|
61
64
|
Metrics/BlockLength:
|
|
62
65
|
Max: 25
|
|
63
66
|
Exclude:
|
|
64
67
|
- 'test/**/*'
|
|
65
|
-
- '*.gemspec'
|
|
66
68
|
|
|
67
69
|
# Max: 100
|
|
68
70
|
Metrics/ClassLength:
|
|
@@ -72,18 +74,32 @@ Metrics/ClassLength:
|
|
|
72
74
|
|
|
73
75
|
# Max: 7
|
|
74
76
|
Metrics/CyclomaticComplexity:
|
|
75
|
-
Max:
|
|
76
|
-
Exclude:
|
|
77
|
-
- 'lib/red_amber/data_frame_output.rb' # Max: 11
|
|
77
|
+
Max: 12
|
|
78
78
|
|
|
79
79
|
# Max: 10
|
|
80
80
|
Metrics/MethodLength:
|
|
81
81
|
Max: 18
|
|
82
82
|
Exclude:
|
|
83
|
-
- 'lib/red_amber/
|
|
83
|
+
- 'lib/red_amber/data_frame_displayable.rb' # Max: 33
|
|
84
|
+
- 'lib/red_amber/data_frame_observation_operation.rb' # Max: 21
|
|
85
|
+
- 'lib/red_amber/data_frame_variable_operation.rb' # Max: 20
|
|
86
|
+
|
|
87
|
+
# Max: 100
|
|
88
|
+
Metrics/ModuleLength:
|
|
89
|
+
Max: 100
|
|
90
|
+
Exclude:
|
|
91
|
+
- 'lib/red_amber/vector_functions.rb' # Max: 114
|
|
84
92
|
|
|
85
93
|
# Max: 8
|
|
86
94
|
Metrics/PerceivedComplexity:
|
|
87
|
-
Max:
|
|
95
|
+
Max: 13
|
|
96
|
+
|
|
97
|
+
# Necessary to define is_na
|
|
98
|
+
Naming/PredicateName:
|
|
99
|
+
Exclude:
|
|
100
|
+
- 'lib/red_amber/vector_functions.rb'
|
|
101
|
+
|
|
102
|
+
# Necessary to test when range.end == -1
|
|
103
|
+
Style/SlicingWithRange:
|
|
88
104
|
Exclude:
|
|
89
|
-
- '
|
|
105
|
+
- 'test/test_data_frame_selectable.rb'
|
data/.rubocop_todo.yml
CHANGED
|
@@ -1,17 +1,11 @@
|
|
|
1
1
|
# This configuration was generated by
|
|
2
2
|
# `rubocop --auto-gen-config`
|
|
3
|
-
# on 2022-
|
|
3
|
+
# on 2022-05-08 02:37:36 UTC using RuboCop version 1.27.0.
|
|
4
4
|
# The point is for the user to remove these configuration records
|
|
5
5
|
# one by one as the offenses are removed from the code base.
|
|
6
6
|
# Note that changes in the inspected code, or installation of new
|
|
7
7
|
# versions of RuboCop, may require this file to be generated again.
|
|
8
8
|
|
|
9
|
-
# Offense count: 1
|
|
10
|
-
# This cop supports unsafe auto-correction (--auto-correct-all).
|
|
11
|
-
Style/SlicingWithRange:
|
|
12
|
-
Exclude:
|
|
13
|
-
- 'lib/red_amber/data_frame_selectable.rb'
|
|
14
|
-
|
|
15
9
|
# Offense count: 1
|
|
16
10
|
# This cop supports unsafe auto-correction (--auto-correct-all).
|
|
17
11
|
# Configuration parameters: EnforcedStyle.
|
data/CHANGELOG.md
CHANGED
|
@@ -1,17 +1,118 @@
|
|
|
1
|
-
##
|
|
1
|
+
## - Unreleased
|
|
2
|
+
|
|
3
|
+
- Feedback something to Red Arrow
|
|
2
4
|
|
|
3
|
-
- Add support for Arrow 8.0.0
|
|
4
5
|
- `DataFrame`
|
|
5
|
-
- Introduce
|
|
6
|
-
- Introduce
|
|
7
|
-
-
|
|
6
|
+
- Introduce `group_by`
|
|
7
|
+
- Introduce `summarize`
|
|
8
|
+
- Introduce `summary` or `describe`
|
|
9
|
+
- Improve dataframe obs. manipulation methods to accept float as an index (#10)
|
|
10
|
+
- More performant
|
|
11
|
+
|
|
8
12
|
- `Vector`
|
|
9
|
-
-
|
|
10
|
-
|
|
13
|
+
- Support more functions
|
|
14
|
+
|
|
15
|
+
- Document
|
|
16
|
+
- YARD support
|
|
17
|
+
|
|
18
|
+
## [0.1.4] - 2022-05-29 (experimental)
|
|
19
|
+
|
|
20
|
+
- Bug fixes
|
|
21
|
+
- Fix missing support for scalar argument (#1)
|
|
22
|
+
- Fix type name of boolean in DF#types to be same as Vector#type (#6, #7)
|
|
23
|
+
- Fix zero picking to return empty DataFrame (#8)
|
|
24
|
+
- Fix behavior when both args and a block are given (#8)
|
|
25
|
+
|
|
26
|
+
- New features and improvements
|
|
27
|
+
- `DataFrame`
|
|
28
|
+
- Refine module name `Displayable`
|
|
29
|
+
- Rename nrow/ncol methods to `size`/`n_keys` to align with TDR concept (#4)
|
|
30
|
+
- Keep `n_row`/`n_col` for compatibility
|
|
31
|
+
- Rename `ls` method to `tdr` (#4)
|
|
32
|
+
- Add limit option to `tdr`
|
|
33
|
+
- Shorten option name (#11)
|
|
34
|
+
- Introduce `pick` method to create sub DataFrame (#8)
|
|
35
|
+
- Add boolean support (#8)
|
|
36
|
+
- Refactor `pick` (#9)
|
|
37
|
+
- Introduce `drop` method to create sub DataFrame (#8)
|
|
38
|
+
- Add boolean support (#8)
|
|
39
|
+
- Refactor `drop` (#9)
|
|
40
|
+
- Add boolean array support for `[]` (#9)
|
|
41
|
+
- Add `indexes`/`indices` to use with selecting observations (#9)
|
|
42
|
+
- Introduce `slice` method to create sub DataFrame (#8)
|
|
43
|
+
- Refactor `slice` (#9)
|
|
44
|
+
- Introduce `remove` method to create sub DataFrame (#9)
|
|
45
|
+
- Introduce `rename` method to create sub DataFrame (#14)
|
|
46
|
+
- Introduce `assign` method to create sub DataFrame (#14)
|
|
47
|
+
- Improve to call block by instance_eval (#13)
|
|
48
|
+
|
|
49
|
+
- `Vector`
|
|
50
|
+
- Refine `find(function)`
|
|
51
|
+
- Add `min_max` method (#2)
|
|
52
|
+
- Add `std`/`sd` method (ddof=0 version: `stddev`) (#2)
|
|
53
|
+
- Add `var` method (ddof=0 version: `variance`) (#2)
|
|
54
|
+
- Add `VectorFunctions.arrow_doc(func_name)` (temporarily)
|
|
55
|
+
|
|
56
|
+
- Documentation
|
|
57
|
+
- Show code in README
|
|
58
|
+
- Change row/column names for **TDR** concept (#4)
|
|
59
|
+
- Add documents about **TDR** concept (#4)
|
|
60
|
+
- Add example about TDR (#4)
|
|
61
|
+
- Separate README to create DataFrame and Vector documents (#12)
|
|
62
|
+
- Add DataFrame model concept image to README (#12)
|
|
63
|
+
|
|
64
|
+
- GitHub site
|
|
65
|
+
- Switched to use merge on GitHub (not to push merged master) (#1)
|
|
66
|
+
- Create lifetime issue #3 to show the goal of this project (#3)
|
|
67
|
+
|
|
68
|
+
## [0.1.3] - 2022-05-15 (experimental)
|
|
69
|
+
|
|
70
|
+
- Bug fixes
|
|
71
|
+
- Fix boolean functions in `Vector` to align with Ruby's behavior
|
|
72
|
+
- `&` == `and_kleene`
|
|
73
|
+
- `|` == `or_kleene`
|
|
74
|
+
- Quote strings of data-preview in `DataFrame#inspect`
|
|
75
|
+
- Quote empty and blank keys in `DataFrame#inspect`
|
|
76
|
+
- Respond to error for a wrong key in `DataFrame#[]`
|
|
77
|
+
|
|
78
|
+
- New features and improvements
|
|
79
|
+
- `DataFrame`
|
|
80
|
+
- Display nil elements in `inspect`
|
|
81
|
+
- Show NaN and nil counts in `inspect`
|
|
82
|
+
- Refactor `inspect`
|
|
83
|
+
- Add method `key` and `key_index`
|
|
84
|
+
- Add how to load/save Parquet to README
|
|
85
|
+
|
|
86
|
+
- `Vector`
|
|
87
|
+
- Add categorization functions
|
|
88
|
+
|
|
89
|
+
This is an important step to support `slice` method and NA treatment features.
|
|
90
|
+
- `is_finite`
|
|
91
|
+
- `is_inf`
|
|
92
|
+
- `is_na` (RedAmber original)
|
|
93
|
+
- `is_nan`
|
|
94
|
+
- `is_nil`, `is_null`
|
|
95
|
+
- `is_valid`
|
|
96
|
+
- Show in a reduced representation for long array in `inspect`
|
|
97
|
+
- Support options in aggregation functions
|
|
98
|
+
- Return values in non-arrow object for scalar aggregation functions
|
|
99
|
+
|
|
100
|
+
## [0.1.2] - 2022-05-08 (experimental)
|
|
101
|
+
|
|
102
|
+
- Bug fixes:
|
|
103
|
+
- `DataFrame`
|
|
104
|
+
- Fix bug in `#[]` with end-less Range
|
|
105
|
+
- New features and improvements
|
|
106
|
+
- Add support for Arrow 8.0.0
|
|
107
|
+
- `DataFrame`
|
|
108
|
+
- `types` and `data_types`
|
|
109
|
+
- Range is usable to specify columns in `#[]`
|
|
110
|
+
- `Vector`
|
|
111
|
+
- `type` and `data_type`
|
|
11
112
|
|
|
12
113
|
## [0.1.1] - 2022-05-06 (experimental)
|
|
13
114
|
|
|
14
|
-
- Release on
|
|
115
|
+
- Release on rubygems.org
|
|
15
116
|
- Introduce class `DataFrame`
|
|
16
117
|
- New from Hash, schema/rows, `Arrow::Table`, `Rover::DataFrame`
|
|
17
118
|
- Load from file, string, URI
|
data/README.md
CHANGED
|
@@ -8,8 +8,8 @@ A simple dataframe library for Ruby (experimental)
|
|
|
8
8
|
## Requirements
|
|
9
9
|
|
|
10
10
|
```ruby
|
|
11
|
-
gem 'red-arrow', '
|
|
12
|
-
gem 'red-parquet', '
|
|
11
|
+
gem 'red-arrow', '>= 7.0.0'
|
|
12
|
+
gem 'red-parquet', '>= 7.0.0' # if you use IO from/to parquet
|
|
13
13
|
gem 'rover-df', '~> 0.3.0' # if you use IO from/to Rover::DataFrame
|
|
14
14
|
```
|
|
15
15
|
|
|
@@ -23,308 +23,95 @@ gem 'red_amber'
|
|
|
23
23
|
|
|
24
24
|
And then execute:
|
|
25
25
|
|
|
26
|
-
|
|
26
|
+
```shell
|
|
27
|
+
bundle install
|
|
28
|
+
```
|
|
27
29
|
|
|
28
30
|
Or install it yourself as:
|
|
29
31
|
|
|
30
|
-
|
|
32
|
+
```shell
|
|
33
|
+
gem install red_amber
|
|
34
|
+
```
|
|
31
35
|
|
|
32
36
|
## `RedAmber::DataFrame`
|
|
33
37
|
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
- [x] `new` from a columnar Hash
|
|
37
|
-
- `RedAmber::DataFrame.new(x: [1, 2, 3])`
|
|
38
|
-
|
|
39
|
-
- [x] `new` from a schema (by Hash) and rows (by Array)
|
|
40
|
-
- `RedAmber::DataFrame.new({:x=>:uint8}, [[1], [2], [3]])`
|
|
41
|
-
|
|
42
|
-
- [x] `new` from an Arrow::Table
|
|
43
|
-
- `RedAmber::DataFrame.new(Arrow::Table.new(x: [1, 2, 3]))`
|
|
44
|
-
|
|
45
|
-
- [x] `new` from a Rover::DataFrame
|
|
46
|
-
- `RedAmber::DataFrame.new(Rover::DataFrame.new(x: [1, 2, 3]))`
|
|
47
|
-
|
|
48
|
-
- [ ] `load` (class method)
|
|
49
|
-
|
|
50
|
-
- [x] from a [`.arrow`, `.arrows`, `.csv`, `.csv.gz`, `.tsv`] file
|
|
51
|
-
- `RedAmber::DataFrame.load("test/entity/with_header.csv")`
|
|
52
|
-
|
|
53
|
-
- [x] from a string buffer
|
|
54
|
-
|
|
55
|
-
- [x] from a URI
|
|
56
|
-
- `RedAmber::DataFrame.load(URI("https://github.com/heronshoes/red_amber/blob/master/test/entity/with_header.csv"))`
|
|
57
|
-
|
|
58
|
-
- [ ] from a parquet file
|
|
59
|
-
|
|
60
|
-
- [ ] `save` (instance method)
|
|
61
|
-
|
|
62
|
-
- [x] to a [`.arrow`, `.arrows`, `.csv`, `.csv.gz`, `.tsv`] file
|
|
63
|
-
|
|
64
|
-
- [x] to a string buffer
|
|
65
|
-
|
|
66
|
-
- [x] to a URI
|
|
67
|
-
|
|
68
|
-
- [ ] to a parquet file
|
|
69
|
-
|
|
70
|
-
### Properties
|
|
71
|
-
|
|
72
|
-
- [x] `table`
|
|
73
|
-
|
|
74
|
-
Reader of Arrow::Table object inside.
|
|
75
|
-
|
|
76
|
-
- [x] `n_rows`, `nrow`, `size`, `length`
|
|
77
|
-
|
|
78
|
-
Returns num of rows (data size).
|
|
79
|
-
|
|
80
|
-
- [x] `n_columns`, `ncol`, `width`
|
|
81
|
-
|
|
82
|
-
Returns num of columns (num of vectors).
|
|
83
|
-
|
|
84
|
-
- [x] `shape`
|
|
85
|
-
|
|
86
|
-
Returns shape in an Array[n_rows, n_cols].
|
|
87
|
-
|
|
88
|
-
- [x] `column_names`, `keys`
|
|
89
|
-
|
|
90
|
-
Returns num of column names by an Array.
|
|
91
|
-
|
|
92
|
-
- [x] `types(class_name: false)`
|
|
93
|
-
|
|
94
|
-
Returns types of columns by an Array.
|
|
95
|
-
If `class_name: true` returns an Array of `Arrow::DataType`.
|
|
96
|
-
|
|
97
|
-
- [x] `vectors`
|
|
98
|
-
|
|
99
|
-
Returns an Array of Vectors.
|
|
100
|
-
|
|
101
|
-
- [x] `to_h`
|
|
102
|
-
|
|
103
|
-
Returns column-oriented data in a Hash.
|
|
104
|
-
|
|
105
|
-
- [x] `to_a`, `raw_records`
|
|
106
|
-
|
|
107
|
-
Returns an array of row-oriented data without header. If you need a column-oriented full array, use `.to_h.to_a`
|
|
108
|
-
|
|
109
|
-
- [x] `schema`
|
|
110
|
-
|
|
111
|
-
Returns column name and data type in a Hash.
|
|
112
|
-
|
|
113
|
-
- [x] `==`
|
|
114
|
-
|
|
115
|
-
- [x] `empty?`
|
|
116
|
-
|
|
117
|
-
### Output
|
|
118
|
-
|
|
119
|
-
- [x] `to_s`
|
|
120
|
-
|
|
121
|
-
- [ ] summary, describe
|
|
122
|
-
|
|
123
|
-
- [x] `to_rover`
|
|
38
|
+
Represents a set of data in 2D-shape.
|
|
124
39
|
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
-
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
- [x] Selecting rows from top or bottom
|
|
145
|
-
|
|
146
|
-
`head(n=5)`, `tail(n=5)`, `first(n=1)`, `last(n=1)`
|
|
147
|
-
|
|
148
|
-
- [ ] slice
|
|
149
|
-
|
|
150
|
-
### Updating
|
|
151
|
-
|
|
152
|
-
- [ ] Add a new column
|
|
153
|
-
|
|
154
|
-
- [ ] Update a single element
|
|
155
|
-
|
|
156
|
-
- [ ] Update multiple elements
|
|
157
|
-
|
|
158
|
-
- [ ] Update all elements
|
|
159
|
-
|
|
160
|
-
- [ ] Update elements matching a condition
|
|
161
|
-
|
|
162
|
-
- [ ] Clamp
|
|
163
|
-
|
|
164
|
-
- [ ] Delete columns
|
|
165
|
-
|
|
166
|
-
- [ ] Rename a column
|
|
167
|
-
|
|
168
|
-
- [ ] Sort rows
|
|
169
|
-
|
|
170
|
-
- [ ] Clear data
|
|
171
|
-
|
|
172
|
-
### Treat na data
|
|
40
|
+
```ruby
|
|
41
|
+
require 'red_amber'
|
|
42
|
+
require 'datasets-arrow'
|
|
43
|
+
|
|
44
|
+
penguins = Datasets::Penguins.new.to_arrow
|
|
45
|
+
puts RedAmber::DataFrame.new(penguins).tdr
|
|
46
|
+
# =>
|
|
47
|
+
RedAmber::DataFrame : 344 x 8 Vectors
|
|
48
|
+
Vectors : 5 numeric, 3 strings
|
|
49
|
+
# key type level data_preview
|
|
50
|
+
1 :species string 3 {"Adelie"=>152, "Chinstrap"=>68, "Gentoo"=>124}
|
|
51
|
+
2 :island string 3 {"Torgersen"=>52, "Biscoe"=>168, "Dream"=>124}
|
|
52
|
+
3 :bill_length_mm double 165 [39.1, 39.5, 40.3, nil, 36.7, ... ], 2 nils
|
|
53
|
+
4 :bill_depth_mm double 81 [18.7, 17.4, 18.0, nil, 19.3, ... ], 2 nils
|
|
54
|
+
5 :flipper_length_mm uint8 56 [181, 186, 195, nil, 193, ... ], 2 nils
|
|
55
|
+
6 :body_mass_g uint16 95 [3750, 3800, 3250, nil, 3450, ... ], 2 nils
|
|
56
|
+
7 :sex string 3 {"male"=>168, "female"=>165, nil=>11}
|
|
57
|
+
8 :year uint16 3 {2007=>110, 2008=>114, 2009=>120}
|
|
58
|
+
```
|
|
173
59
|
|
|
174
|
-
|
|
60
|
+
### DataFrame model
|
|
61
|
+

|
|
175
62
|
|
|
176
|
-
|
|
63
|
+
For example, `DataFrame#pick` accepts keys as an argument and returns a sub DataFrame.
|
|
177
64
|
|
|
178
|
-
|
|
65
|
+
```ruby
|
|
66
|
+
df = penguins.pick(:body_mass_g)
|
|
67
|
+
# =>
|
|
68
|
+
#<RedAmber::DataFrame : 344 x 1 Vector, 0x000000000000fa14>
|
|
69
|
+
Vector : 1 numeric
|
|
70
|
+
# key type level data_preview
|
|
71
|
+
1 :body_mass_g int64 95 [3750, 3800, 3250, nil, 3450, ... ], 2 nils
|
|
72
|
+
```
|
|
179
73
|
|
|
180
|
-
|
|
74
|
+
`DataFrame#assign` can accept a block and create new variables.
|
|
181
75
|
|
|
182
|
-
|
|
76
|
+
```ruby
|
|
77
|
+
df.assign do
|
|
78
|
+
{:body_mass_kg => penguins[:body_mass_g] / 1000.0}
|
|
79
|
+
end
|
|
80
|
+
# =>
|
|
81
|
+
#<RedAmber::DataFrame : 344 x 2 Vectors, 0x000000000000fa28>
|
|
82
|
+
Vectors : 2 numeric
|
|
83
|
+
# key type level data_preview
|
|
84
|
+
1 :body_mass_g int64 95 [3750, 3800, 3250, nil, 3450, ... ], 2 nils
|
|
85
|
+
2 :body_mass_kg double 95 [3.75, 3.8, 3.25, nil, 3.45, ... ], 2 nils
|
|
86
|
+
```
|
|
183
87
|
|
|
184
|
-
|
|
88
|
+
Other DataFrame manipulating methods like `pick`, `drop`, `slice`, `remove` and `rename` also accept a block.
|
|
185
89
|
|
|
186
|
-
|
|
90
|
+
See [DataFrame.md](doc/DataFrame.md) for details.
|
|
187
91
|
|
|
188
|
-
- [ ] Left join
|
|
189
92
|
|
|
190
|
-
|
|
93
|
+
## `RedAmber::Vector`
|
|
191
94
|
|
|
192
|
-
|
|
95
|
+
Class `RedAmber::Vector` represents a series of data in the DataFrame.
|
|
193
96
|
|
|
194
|
-
|
|
97
|
+
```ruby
|
|
98
|
+
penguins[:species]
|
|
99
|
+
# =>
|
|
100
|
+
#<RedAmber::Vector(:string, size=344):0x000000000000f8e8>
|
|
101
|
+
["Adelie", "Adelie", "Adelie", "Adelie", "Adelie", "Adelie", "Adelie", "Adelie", ... ]
|
|
102
|
+
```
|
|
195
103
|
|
|
196
|
-
|
|
104
|
+
Vectors accept some [functional methods from Arrow](https://arrow.apache.org/docs/cpp/compute.html).
|
|
197
105
|
|
|
106
|
+
See [Vector.md](doc/Vector.md) for details.
|
|
198
107
|
|
|
199
|
-
##
|
|
200
|
-
### Constructor
|
|
201
|
-
|
|
202
|
-
- [x] Create from a column in a DataFrame
|
|
203
|
-
|
|
204
|
-
- [x] New from an Array
|
|
205
|
-
|
|
206
|
-
### Properties
|
|
207
|
-
|
|
208
|
-
- [x] `to_s`
|
|
209
|
-
|
|
210
|
-
- [x] `values`, `to_a`, `entries`
|
|
211
|
-
|
|
212
|
-
- [x] `size`, `length`, `n_rows`, `nrow`
|
|
213
|
-
|
|
214
|
-
- [x] `type`
|
|
215
|
-
|
|
216
|
-
- [ ] `each`
|
|
217
|
-
|
|
218
|
-
- [ ] `chunked?`
|
|
219
|
-
|
|
220
|
-
- [ ] `n_chunks`
|
|
221
|
-
|
|
222
|
-
- [ ] `each_chunk`
|
|
223
|
-
|
|
224
|
-
- [x] `tally`
|
|
225
|
-
|
|
226
|
-
- [ ] `n_nulls`
|
|
227
|
-
|
|
228
|
-
### Functions
|
|
229
|
-
#### Unary aggregations: vector.func => Scalar
|
|
230
|
-
|
|
231
|
-
| Method |Boolean|Numeric|String|Remarks|
|
|
232
|
-
| ------------ | --- | --- | --- | ----- |
|
|
233
|
-
|[x] `all` | [x] | | | |
|
|
234
|
-
|[x] `any` | [x] | | | |
|
|
235
|
-
|[x] `approximate_median`| | [x] | | |
|
|
236
|
-
|[x] `count` | [x] | [x] | [x] | |
|
|
237
|
-
|[x] `count_distinct`| [x] | [x] | [x] | |
|
|
238
|
-
|[x] `count_uniq` | [x] | [x] | [x] |an alias of `count_distinct`|
|
|
239
|
-
|[ ] `index` | | | | |
|
|
240
|
-
|[x] `max` | [x] | [x] | [x] | |
|
|
241
|
-
|[x] `mean` | [x] | [x] | | |
|
|
242
|
-
|[x] `min` | [x] | [x] | [x] | |
|
|
243
|
-
|[ ] `min_max` | | | | |
|
|
244
|
-
|[ ] `mode` | | | | |
|
|
245
|
-
|[x] `product` | [x] | [x] | | |
|
|
246
|
-
|[ ] `quantile`| | | | |
|
|
247
|
-
|[x] `stddev` | | [x] | | |
|
|
248
|
-
|[x] `sum` | [x] | [x] | | |
|
|
249
|
-
|[ ] `tdigest` | | | | |
|
|
250
|
-
|[x] `variance`| | [x] | | |
|
|
251
|
-
|
|
252
|
-
#### Unary element-wise: vector.func => Vector
|
|
253
|
-
|
|
254
|
-
| Method |Boolean|Numeric|String|Remarks|
|
|
255
|
-
| ------------ | --- | --- | --- | ----- |
|
|
256
|
-
|[x] `-@` | | [x] | |as `-vector`|
|
|
257
|
-
|[x] `negate` | | [x] | |`-@` |
|
|
258
|
-
|[x] `abs` | | [x] | | |
|
|
259
|
-
|[ ] `acos` | | [ ] | | |
|
|
260
|
-
|[ ] `asin` | | [ ] | | |
|
|
261
|
-
|[x] `atan` | | [x] | | |
|
|
262
|
-
|[ ] `ceil` | | [x] | | |
|
|
263
|
-
|[x] `cos` | | [x] | | |
|
|
264
|
-
|[ ] `floor` | | [x] | | |
|
|
265
|
-
|[ ] `ln` | | [ ] | | |
|
|
266
|
-
|[ ] `log10` | | [ ] | | |
|
|
267
|
-
|[ ] `log1p` | | [ ] | | |
|
|
268
|
-
|[ ] `log2` | | [ ] | | |
|
|
269
|
-
|[x] `sign` | | [x] | | |
|
|
270
|
-
|[x] `sin` | | [x] | | |
|
|
271
|
-
|[x] `tan` | | [x] | | |
|
|
272
|
-
|[ ] `trunc` | | [x] | | |
|
|
273
|
-
|
|
274
|
-
#### Binary element-wise: vector.func(vector) => Vector
|
|
275
|
-
|
|
276
|
-
| Method |Boolean|Numeric|String|Remarks|
|
|
277
|
-
| ------------------ | --- | --- | --- | ----- |
|
|
278
|
-
|[x] `add` | | [x] | | `+` |
|
|
279
|
-
|[x] `atan2` | | [x] | | |
|
|
280
|
-
|[x] `and` | [x] | | | |
|
|
281
|
-
|[x] `and_kleene` | [x] | | | |
|
|
282
|
-
|[x] `and_not` | [x] | | | |
|
|
283
|
-
|[x] `and_not_kleene`| [x] | | | |
|
|
284
|
-
|[x] `bit_wise_and` | |([x])| |`&`, integer only|
|
|
285
|
-
|[ ] `bit_wise_not` | |([x])| |`!`, integer only|
|
|
286
|
-
|[x] `bit_wise_or` | |([x])| |`|`, integer only|
|
|
287
|
-
|[x] `bit_wise_xor` | |([x])| |`^`, integer only|
|
|
288
|
-
|[x] `divide` | | [x] | | `/` |
|
|
289
|
-
|[x] `equal` | [x] | [x] | [x] |`==`, alias `eq`|
|
|
290
|
-
|[x] `greater` | [x] | [x] | [x] |`>`, alias `gt`|
|
|
291
|
-
|[x] `greater_equal` | [x] | [x] | [x] |`>=`, alias `ge`|
|
|
292
|
-
|[x] `less` | [x] | [x] | [x] |`<`, alias `lt`|
|
|
293
|
-
|[x] `less_equal` | [x] | [x] | [x] |`<=`, alias `le`|
|
|
294
|
-
|[ ] `logb` | | [ ] | | |
|
|
295
|
-
|[ ] `mod` | | [ ] | | |
|
|
296
|
-
|[x] `multiply` | | [x] | | `*` |
|
|
297
|
-
|[x] `not_equal` | [x] | [x] | [x] |`!=`, alias `ne`|
|
|
298
|
-
|[x] `or` | [x] | | | |
|
|
299
|
-
|[x] `or_kleene` | [x] | | | |
|
|
300
|
-
|[x] `power` | | [x] | | `**` |
|
|
301
|
-
|[x] `subtract` | | [x] | | `-` |
|
|
302
|
-
|[x] `shift_left` | |([x])| |`<<`, integer only|
|
|
303
|
-
|[x] `shift_right` | |([x])| |`>>`, integer only|
|
|
304
|
-
|[x] `xor` | [x] | | | |
|
|
305
|
-
|
|
306
|
-
##### (Not impremented)
|
|
307
|
-
- [ ] invert, round, round_to_multiple
|
|
308
|
-
- [ ] sort, sort_index
|
|
309
|
-
- [ ] minmax, var, median, quantile
|
|
310
|
-
- [ ] argmin, argmax
|
|
311
|
-
- [ ] (array functions)
|
|
312
|
-
- [ ] (strings functions)
|
|
313
|
-
- [ ] (temporal functions)
|
|
314
|
-
- [ ] (conditional functions)
|
|
315
|
-
- [ ] (index functions)
|
|
316
|
-
- [ ] (other functions)
|
|
317
|
-
|
|
318
|
-
### Coerce (not impremented)
|
|
319
|
-
|
|
320
|
-
### Updating (not impremented)
|
|
321
|
-
|
|
322
|
-
### DSL in a block for faster calculation ?
|
|
108
|
+
## TDR concept
|
|
323
109
|
|
|
110
|
+
I named the data frame representation style in the model above as TDR (Transposed DataFrame Representation). See [TDR.md](doc/tdr.md) for details.
|
|
324
111
|
|
|
325
112
|
## Development
|
|
326
113
|
|
|
327
|
-
```
|
|
114
|
+
```shell
|
|
328
115
|
git clone https://github.com/heronshoes/red_amber.git
|
|
329
116
|
cd red_amber
|
|
330
117
|
bundle install
|