red_amber 0.1.6 → 0.1.7
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.rubocop.yml +3 -0
- data/CHANGELOG.md +44 -18
- data/Gemfile +4 -1
- data/README.md +51 -76
- data/Rakefile +1 -0
- data/benchmark/csv_load_penguins.yml +1 -1
- data/doc/47_examples_of_red_amber.ipynb +4872 -0
- data/doc/DataFrame.md +370 -210
- data/doc/Vector.md +68 -15
- data/doc/image/dataframe/assign.png +0 -0
- data/doc/image/dataframe/drop.png +0 -0
- data/doc/image/dataframe/pick.png +0 -0
- data/doc/image/dataframe/remove.png +0 -0
- data/doc/image/dataframe/rename.png +0 -0
- data/doc/image/dataframe/slice.png +0 -0
- data/doc/image/dataframe_model.png +0 -0
- data/doc/image/vector/binary_element_wise.png +0 -0
- data/doc/image/vector/unary_aggregation.png +0 -0
- data/doc/image/vector/unary_aggregation_w_option.png +0 -0
- data/doc/image/vector/unary_element_wise.png +0 -0
- data/lib/red-amber.rb +1 -25
- data/lib/red_amber/data_frame.rb +9 -7
- data/lib/red_amber/data_frame_displayable.rb +79 -4
- data/lib/red_amber/group.rb +61 -0
- data/lib/red_amber/vector.rb +17 -3
- data/lib/red_amber/vector_functions.rb +22 -20
- data/lib/red_amber/version.rb +1 -1
- data/lib/red_amber.rb +27 -1
- data/red_amber.gemspec +0 -2
- metadata +4 -31
- data/lib/red_amber/data_frame_observation_operation.rb +0 -11
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 88bdd603d8daec1a95c0277ef68857f84346ad7cf95d0ba23a306e6b70567c29
|
4
|
+
data.tar.gz: 40add80cbaa5183ca0e93eadcdcd1fead37015cac1cb2360660002c0b1878255
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: d043eea51117ecc48bdc52fa951e24d2618f273eb289a30f5bbb182e1a891763cdd35f6a7c6764f6e0061bddeaaa86b2374de1dc2b48f25a5b6b05c9af83a0e3
|
7
|
+
data.tar.gz: cdbba19750bf71fe99e55bf6c46cb4522018f43563d7a93fdc375987f9388234e4f7e833297fdb6b8dd5a41b5a1bfdbf287ea47663f5f8a90facb56a4c63daef
|
data/.rubocop.yml
CHANGED
@@ -80,6 +80,7 @@ Metrics/CyclomaticComplexity:
|
|
80
80
|
Exclude:
|
81
81
|
- 'lib/red_amber/data_frame_selectable.rb' # Max: 14
|
82
82
|
- 'lib/red_amber/vector_updatable.rb' # Max: 14
|
83
|
+
- 'lib/red_amber/data_frame_displayable.rb' # Max: 18
|
83
84
|
|
84
85
|
# Max: 10
|
85
86
|
Metrics/MethodLength:
|
@@ -93,6 +94,7 @@ Metrics/ModuleLength:
|
|
93
94
|
Exclude:
|
94
95
|
- 'lib/red_amber/data_frame_selectable.rb' # Max: 141
|
95
96
|
- 'lib/red_amber/vector_functions.rb' # Max: 114
|
97
|
+
- 'lib/red_amber/data_frame_displayable.rb' # Max: 132
|
96
98
|
|
97
99
|
# Max: 8
|
98
100
|
Metrics/PerceivedComplexity:
|
@@ -100,6 +102,7 @@ Metrics/PerceivedComplexity:
|
|
100
102
|
Exclude:
|
101
103
|
- 'lib/red_amber/data_frame_selectable.rb' # Max: 14
|
102
104
|
- 'lib/red_amber/vector_updatable.rb' # Max: 15
|
105
|
+
- 'lib/red_amber/data_frame_displayable.rb' # Max: 19
|
103
106
|
|
104
107
|
Naming/FileName:
|
105
108
|
Exclude:
|
data/CHANGELOG.md
CHANGED
@@ -1,29 +1,55 @@
|
|
1
|
-
## -
|
1
|
+
## [0.1.9] - Unreleased
|
2
2
|
|
3
|
-
-
|
4
|
-
- YARD support
|
3
|
+
- Supports Arrow 9.0.0
|
5
4
|
|
6
|
-
-
|
7
|
-
- `red-amber` gem
|
5
|
+
## [0.1.7] - 2022-07-15 (experimental)
|
8
6
|
|
9
|
-
-
|
10
|
-
|
7
|
+
- Bug fixes
|
8
|
+
|
9
|
+
- Remove development dependency for red-datasets-arrow (#47)
|
10
|
+
- To avoid irregular fails in CI test
|
11
|
+
- Add red-datasets to development dependency instead (#49)
|
11
12
|
|
12
|
-
|
13
|
+
- Suppress useless log in tests (#46)
|
14
|
+
Suppress log of Webrick and iruby.
|
13
15
|
|
14
|
-
-
|
15
|
-
|
16
|
-
|
16
|
+
- New features and improvements
|
17
|
+
|
18
|
+
- Use Table mode as default preview mode in `inspect`/`to_s` (#40)
|
19
|
+
- Show examples in documents in Table
|
20
|
+
- Use the word rows/columns
|
21
|
+
- Update images of data processing in Table style
|
22
|
+
|
23
|
+
- Introduce a new Table formatter (#47)
|
24
|
+
- Migrate from the Arrow's formatter
|
25
|
+
- Do not use TAB, format by spaces only.
|
26
|
+
- Align column width with head rows and tail rows.
|
27
|
+
- Show nils.
|
28
|
+
- Show data types.
|
29
|
+
- Refine documents to use new formatter output
|
30
|
+
|
31
|
+
- Simplify options of Vector functions (#46)
|
32
|
+
Vector functions with options use optional argument opt in previous code.
|
33
|
+
|
34
|
+
- Add `#float?`, `#integer?` to Vector (#46)
|
35
|
+
- Add `#each` to Vector (#47)
|
36
|
+
|
37
|
+
- Introduce class `Group` (#48)
|
38
|
+
- Refine `DataFrame#group` to use class Group
|
39
|
+
- Add methods to Group
|
40
|
+
|
41
|
+
- Move parquet and rover to development dependency (#49)
|
42
|
+
|
43
|
+
- Refine text in `DataFrame#to_iruby` (#40)
|
17
44
|
|
18
|
-
|
45
|
+
- Add badges in Github site
|
46
|
+
- Gitter badge for Red Data Tools (#42)
|
47
|
+
- Gem version and CI status badge (#45)
|
19
48
|
|
20
|
-
-
|
21
|
-
-
|
22
|
-
- Improve as more performant
|
23
|
-
- More examples of frequently needed tasks
|
49
|
+
- Exchange containers in red-amber.rb and red_amber.rb (#47)
|
50
|
+
- Mainly use red_amber by consistency with the folder name
|
24
51
|
|
25
|
-
-
|
26
|
-
- `DataFrame#join` features
|
52
|
+
- Add Jupyter notebook '47 Examples of Red Amber' (#49)
|
27
53
|
|
28
54
|
## [0.1.6] - 2022-06-26 (experimental)
|
29
55
|
|
data/Gemfile
CHANGED
@@ -7,6 +7,9 @@ gemspec
|
|
7
7
|
group :test do
|
8
8
|
gem 'rake'
|
9
9
|
|
10
|
+
gem 'red-parquet', '>= 8.0.0'
|
11
|
+
gem 'rover-df', '~> 0.3.0'
|
12
|
+
|
10
13
|
gem 'rubocop'
|
11
14
|
gem 'rubocop-performance', require: false
|
12
15
|
gem 'rubocop-rake'
|
@@ -17,5 +20,5 @@ group :test do
|
|
17
20
|
gem 'webrick'
|
18
21
|
|
19
22
|
gem 'benchmark_driver'
|
20
|
-
gem 'red-datasets-arrow'
|
23
|
+
gem 'red-datasets'
|
21
24
|
end
|
data/README.md
CHANGED
@@ -1,16 +1,20 @@
|
|
1
1
|
# RedAmber
|
2
2
|
|
3
|
+
[![Gem Version](https://badge.fury.io/rb/red_amber.svg)](https://badge.fury.io/rb/red_amber)
|
4
|
+
[![Ruby](https://github.com/heronshoes/red_amber/actions/workflows/test.yml/badge.svg)](https://github.com/heronshoes/red_amber/actions/workflows/test.yml)
|
5
|
+
|
3
6
|
A simple dataframe library for Ruby (experimental).
|
4
7
|
|
5
|
-
- Powered by [Red Arrow](https://github.com/apache/arrow/tree/master/ruby/red-arrow)
|
8
|
+
- Powered by [Red Arrow](https://github.com/apache/arrow/tree/master/ruby/red-arrow) [![Gitter Chat](https://badges.gitter.im/red-data-tools/en.svg)](https://gitter.im/red-data-tools/en)
|
6
9
|
- Inspired by the dataframe library [Rover-df](https://github.com/ankane/rover)
|
7
10
|
|
8
11
|
## Requirements
|
9
12
|
|
10
13
|
```ruby
|
11
14
|
gem 'red-arrow', '>= 8.0.0'
|
12
|
-
|
13
|
-
gem 'rover-df', '~> 0.3.0' # if you use IO from/to Rover::DataFrame
|
15
|
+
|
16
|
+
gem 'red-parquet', '>= 8.0.0' # Optional, if you use IO from/to parquet
|
17
|
+
gem 'rover-df', '~> 0.3.0' # Optional, if you use IO from/to Rover::DataFrame
|
14
18
|
```
|
15
19
|
|
16
20
|
## Installation
|
@@ -18,7 +22,8 @@ gem 'rover-df', '~> 0.3.0' # if you use IO from/to Rover::DataFrame
|
|
18
22
|
Install requirements before you install Red Amber.
|
19
23
|
|
20
24
|
- Apache Arrow GLib (>= 8.0.0)
|
21
|
-
|
25
|
+
|
26
|
+
- Apache Parquet GLib (>= 8.0.0) # If you use IO from/to parquet
|
22
27
|
|
23
28
|
See [Apache Arrow install document](https://arrow.apache.org/install/).
|
24
29
|
|
@@ -42,11 +47,6 @@ Or install it yourself as:
|
|
42
47
|
gem install red_amber
|
43
48
|
```
|
44
49
|
|
45
|
-
(From v0.1.6)
|
46
|
-
|
47
|
-
RedAmber uses TDR mode for `#inspect` and `#to_iruby` by default. If you prefer Table mode, please set the environment variable
|
48
|
-
`RED_AMBER_OUTPUT_MODE` to `"table"`. See [TDR section](#TDR) for detail.
|
49
|
-
|
50
50
|
## `RedAmber::DataFrame`
|
51
51
|
|
52
52
|
Represents a set of data in 2D-shape. The entity is a Red Arrow's Table object.
|
@@ -56,54 +56,21 @@ require 'red_amber' # require 'red-amber' is also OK.
|
|
56
56
|
require 'datasets-arrow'
|
57
57
|
|
58
58
|
arrow = Datasets::Penguins.new.to_arrow
|
59
|
-
|
60
|
-
penguins.table
|
61
|
-
|
62
|
-
# =>
|
63
|
-
#<Arrow::Table:0x111271098 ptr=0x7f9118b3e0b0>
|
64
|
-
species island bill_length_mm bill_depth_mm flipper_length_mm body_mass_g sex year
|
65
|
-
0 Adelie Torgersen 39.100000 18.700000 181 3750 male 2007
|
66
|
-
1 Adelie Torgersen 39.500000 17.400000 186 3800 female 2007
|
67
|
-
2 Adelie Torgersen 40.300000 18.000000 195 3250 female 2007
|
68
|
-
3 Adelie Torgersen (null) (null) (null) (null) (null) 2007
|
69
|
-
4 Adelie Torgersen 36.700000 19.300000 193 3450 female 2007
|
70
|
-
5 Adelie Torgersen 39.300000 20.600000 190 3650 male 2007
|
71
|
-
6 Adelie Torgersen 38.900000 17.800000 181 3625 female 2007
|
72
|
-
7 Adelie Torgersen 39.200000 19.600000 195 4675 male 2007
|
73
|
-
8 Adelie Torgersen 34.100000 18.100000 193 3475 (null) 2007
|
74
|
-
9 Adelie Torgersen 42.000000 20.200000 190 4250 (null) 2007
|
75
|
-
...
|
76
|
-
334 Gentoo Biscoe 46.200000 14.100000 217 4375 female 2009
|
77
|
-
335 Gentoo Biscoe 55.100000 16.000000 230 5850 male 2009
|
78
|
-
336 Gentoo Biscoe 44.500000 15.700000 217 4875 (null) 2009
|
79
|
-
337 Gentoo Biscoe 48.800000 16.200000 222 6000 male 2009
|
80
|
-
338 Gentoo Biscoe 47.200000 13.700000 214 4925 female 2009
|
81
|
-
339 Gentoo Biscoe (null) (null) (null) (null) (null) 2009
|
82
|
-
340 Gentoo Biscoe 46.800000 14.300000 215 4850 female 2009
|
83
|
-
341 Gentoo Biscoe 50.400000 15.700000 222 5750 male 2009
|
84
|
-
342 Gentoo Biscoe 45.200000 14.800000 212 5200 female 2009
|
85
|
-
343 Gentoo Biscoe 49.900000 16.100000 213 5400 male 2009
|
86
|
-
```
|
87
|
-
|
88
|
-
By default, RedAmber shows self by compact transposed style. This unfamiliar style (TDR) is designed for
|
89
|
-
the exploratory data processing. It keeps Vectors as row vectors, shows keys and types at a glance, shows levels
|
90
|
-
for the 'factor-like' variables and shows the number of abnormal values like NaN and nil.
|
91
|
-
|
92
|
-
```ruby
|
93
|
-
penguins
|
59
|
+
RedAmber::DataFrame.new(arrow)
|
94
60
|
|
95
61
|
# =>
|
96
|
-
RedAmber::DataFrame : 344 x 8 Vectors
|
97
|
-
|
98
|
-
|
99
|
-
1
|
100
|
-
2
|
101
|
-
3
|
102
|
-
4
|
103
|
-
5
|
104
|
-
|
105
|
-
7
|
106
|
-
8
|
62
|
+
#<RedAmber::DataFrame : 344 x 8 Vectors, 0x0000000000013790>
|
63
|
+
species island bill_length_mm bill_depth_mm flipper_length_mm ... year
|
64
|
+
<string> <string> <double> <double> <uint8> ... <uint16>
|
65
|
+
1 Adelie Torgersen 39.1 18.7 181 ... 2007
|
66
|
+
2 Adelie Torgersen 39.5 17.4 186 ... 2007
|
67
|
+
3 Adelie Torgersen 40.3 18.0 195 ... 2007
|
68
|
+
4 Adelie Torgersen (nil) (nil) (nil) ... 2007
|
69
|
+
5 Adelie Torgersen 36.7 19.3 193 ... 2007
|
70
|
+
: : : : : : ... :
|
71
|
+
342 Gentoo Biscoe 50.4 15.7 222 ... 2009
|
72
|
+
343 Gentoo Biscoe 45.2 14.8 212 ... 2009
|
73
|
+
344 Gentoo Biscoe 49.9 16.1 213 ... 2009
|
107
74
|
```
|
108
75
|
|
109
76
|
### DataFrame model
|
@@ -113,23 +80,41 @@ For example, `DataFrame#pick` accepts keys as an argument and returns a sub Data
|
|
113
80
|
|
114
81
|
```ruby
|
115
82
|
df = penguins.pick(:body_mass_g)
|
83
|
+
df
|
84
|
+
|
116
85
|
# =>
|
117
|
-
#<RedAmber::DataFrame : 344 x 1 Vector,
|
118
|
-
|
119
|
-
|
120
|
-
1
|
86
|
+
#<RedAmber::DataFrame : 344 x 1 Vector, 0x0000000000015cc0>
|
87
|
+
body_mass_g
|
88
|
+
<uint16>
|
89
|
+
1 3750
|
90
|
+
2 3800
|
91
|
+
3 3250
|
92
|
+
4 (nil)
|
93
|
+
5 3450
|
94
|
+
: :
|
95
|
+
342 5750
|
96
|
+
343 5200
|
97
|
+
344 5400
|
121
98
|
```
|
122
99
|
|
123
100
|
`DataFrame#assign` creates new variables (column in the table).
|
124
101
|
|
125
102
|
```ruby
|
126
103
|
df.assign(:body_mass_kg => df[:body_mass_g] / 1000.0)
|
104
|
+
|
127
105
|
# =>
|
128
|
-
#<RedAmber::DataFrame : 344 x 2 Vectors,
|
129
|
-
|
130
|
-
|
131
|
-
1
|
132
|
-
2
|
106
|
+
#<RedAmber::DataFrame : 344 x 2 Vectors, 0x00000000000212f0>
|
107
|
+
body_mass_g body_mass_kg
|
108
|
+
<uint16> <double>
|
109
|
+
1 3750 3.8
|
110
|
+
2 3800 3.8
|
111
|
+
3 3250 3.3
|
112
|
+
4 (nil) (nil)
|
113
|
+
5 3450 3.5
|
114
|
+
: : :
|
115
|
+
342 5750 5.8
|
116
|
+
343 5200 5.2
|
117
|
+
344 5400 5.4
|
133
118
|
```
|
134
119
|
|
135
120
|
DataFrame manipulating methods like `pick`, `drop`, `slice`, `remove`, `rename` and `assign` accept a block.
|
@@ -178,19 +163,9 @@ Vectors accepts some [functional methods from Arrow](https://arrow.apache.org/do
|
|
178
163
|
|
179
164
|
See [Vector.md](doc/Vector.md) for details.
|
180
165
|
|
181
|
-
##
|
182
|
-
|
183
|
-
I named the data frame representation style in the model above as TDR (Transposed DataFrame Representation).
|
184
|
-
|
185
|
-
This library can be used with both TDR mode and usual Table mode.
|
186
|
-
If you set the environment variable `RED_AMBER_OUTPUT_MODE` to `"table"`, output style by `inspect` and `to_iruby` is the Table mode. Other value including nil will output TDR style.
|
187
|
-
|
188
|
-
You can switch the mode in Ruby like this.
|
189
|
-
```ruby
|
190
|
-
ENV['RED_AMBER_OUTPUT_STYLE'] = 'table' # => Table mode
|
191
|
-
```
|
166
|
+
## Jupyter notebook
|
192
167
|
|
193
|
-
|
168
|
+
[47 Examples of Red Amber](doc/47_examples_of_red_amber.ipynb)
|
194
169
|
|
195
170
|
## Development
|
196
171
|
|
data/Rakefile
CHANGED
@@ -1,11 +1,11 @@
|
|
1
1
|
prelude: |
|
2
|
-
require 'datasets-arrow'
|
3
2
|
require 'rover'
|
4
3
|
require 'red_amber'
|
5
4
|
|
6
5
|
penguins_csv = 'benchmark/cache/penguins.csv'
|
7
6
|
|
8
7
|
unless File.exist?(penguins_csv)
|
8
|
+
require 'datasets-arrow'
|
9
9
|
arrow = Datasets::Penguins.new.to_arrow
|
10
10
|
RedAmber::DataFrame.new(arrow).save(penguins_csv)
|
11
11
|
end
|