red_amber 0.4.1 → 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.rubocop.yml +11 -5
- data/CHANGELOG.md +93 -1
- data/Gemfile +5 -6
- data/README.ja.md +252 -0
- data/README.md +30 -23
- data/benchmark/basic.yml +1 -1
- data/benchmark/group.yml +12 -5
- data/doc/CODE_OF_CONDUCT.md +1 -1
- data/docker/.env +4 -0
- data/docker/Dockerfile +66 -0
- data/docker/Gemfile +26 -0
- data/docker/Gemfile.lock +118 -0
- data/docker/docker-compose.yml +21 -0
- data/docker/example +86 -0
- data/docker/notebook/examples_of_red_amber.ipynb +8562 -0
- data/docker/notebook/red-amber.ipynb +188 -0
- data/docker/readme.md +118 -0
- data/lib/red_amber/data_frame.rb +78 -4
- data/lib/red_amber/data_frame_combinable.rb +147 -119
- data/lib/red_amber/data_frame_displayable.rb +6 -5
- data/lib/red_amber/data_frame_selectable.rb +49 -0
- data/lib/red_amber/group.rb +190 -89
- data/lib/red_amber/helper.rb +26 -0
- data/lib/red_amber/subframes.rb +166 -66
- data/lib/red_amber/vector.rb +43 -24
- data/lib/red_amber/vector_aggregation.rb +26 -0
- data/lib/red_amber/vector_binary_element_wise.rb +54 -25
- data/lib/red_amber/vector_selectable.rb +74 -23
- data/lib/red_amber/vector_string_function.rb +211 -0
- data/lib/red_amber/vector_unary_element_wise.rb +4 -0
- data/lib/red_amber/vector_updatable.rb +28 -0
- data/lib/red_amber/version.rb +1 -1
- data/lib/red_amber.rb +2 -1
- data/red_amber.gemspec +4 -4
- metadata +20 -9
@@ -0,0 +1,188 @@
|
|
1
|
+
{
|
2
|
+
"cells": [
|
3
|
+
{
|
4
|
+
"cell_type": "markdown",
|
5
|
+
"metadata": {},
|
6
|
+
"source": [
|
7
|
+
"# RedAmber Examples\n",
|
8
|
+
"\n",
|
9
|
+
"This notebook walks through the [README of RedAmber](https://github.com/heronshoes/red_amber#readme)."
|
10
|
+
]
|
11
|
+
},
|
12
|
+
{
|
13
|
+
"cell_type": "markdown",
|
14
|
+
"metadata": {},
|
15
|
+
"source": [
|
16
|
+
"## `RedAmber::DataFrame`"
|
17
|
+
]
|
18
|
+
},
|
19
|
+
{
|
20
|
+
"cell_type": "code",
|
21
|
+
"execution_count": null,
|
22
|
+
"metadata": {
|
23
|
+
"tags": []
|
24
|
+
},
|
25
|
+
"outputs": [],
|
26
|
+
"source": [
|
27
|
+
"require 'red_amber'\n",
|
28
|
+
"include RedAmber\n",
|
29
|
+
"require 'datasets-arrow'\n",
|
30
|
+
"\n",
|
31
|
+
"{RedAmber: VERSION, Datasets: Datasets::VERSION}"
|
32
|
+
]
|
33
|
+
},
|
34
|
+
{
|
35
|
+
"cell_type": "markdown",
|
36
|
+
"metadata": {},
|
37
|
+
"source": [
|
38
|
+
"## Example: diamonds dataset\n",
|
39
|
+
"\n",
|
40
|
+
"The first load of Datasets::Diamonds will take some time because the data has to be downloaded."
|
41
|
+
]
|
42
|
+
},
|
43
|
+
{
|
44
|
+
"cell_type": "code",
|
45
|
+
"execution_count": null,
|
46
|
+
"metadata": {
|
47
|
+
"tags": []
|
48
|
+
},
|
49
|
+
"outputs": [],
|
50
|
+
"source": [
|
51
|
+
"dataset = Datasets::Diamonds.new\n",
|
52
|
+
"diamonds = DataFrame.new(dataset)"
|
53
|
+
]
|
54
|
+
},
|
55
|
+
{
|
56
|
+
"cell_type": "code",
|
57
|
+
"execution_count": null,
|
58
|
+
"metadata": {
|
59
|
+
"tags": []
|
60
|
+
},
|
61
|
+
"outputs": [],
|
62
|
+
"source": [
|
63
|
+
"df = diamonds\n",
|
64
|
+
" .slice { carat > 1 } # or use #filter instead of #slice\n",
|
65
|
+
" .group(:cut)\n",
|
66
|
+
" .mean(:price) # `pick` prior to `group` is not required if `:price` is specified here.\n",
|
67
|
+
" .sort('-mean(price)')"
|
68
|
+
]
|
69
|
+
},
|
70
|
+
{
|
71
|
+
"cell_type": "code",
|
72
|
+
"execution_count": null,
|
73
|
+
"metadata": {
|
74
|
+
"tags": []
|
75
|
+
},
|
76
|
+
"outputs": [],
|
77
|
+
"source": [
|
78
|
+
"usdjpy = 110.0 # when the yen was stronger\n",
|
79
|
+
"\n",
|
80
|
+
"df.rename('mean(price)': :mean_price_USD)\n",
|
81
|
+
" .assign(:mean_price_JPY) { mean_price_USD * usdjpy }"
|
82
|
+
]
|
83
|
+
},
|
84
|
+
{
|
85
|
+
"cell_type": "markdown",
|
86
|
+
"metadata": {
|
87
|
+
"tags": []
|
88
|
+
},
|
89
|
+
"source": [
|
90
|
+
"## Example: starwars dataset"
|
91
|
+
]
|
92
|
+
},
|
93
|
+
{
|
94
|
+
"cell_type": "code",
|
95
|
+
"execution_count": null,
|
96
|
+
"metadata": {
|
97
|
+
"tags": []
|
98
|
+
},
|
99
|
+
"outputs": [],
|
100
|
+
"source": [
|
101
|
+
"uri = URI('https://vincentarelbundock.github.io/Rdatasets/csv/dplyr/starwars.csv')\n",
|
102
|
+
"\n",
|
103
|
+
"starwars = DataFrame.load(uri)"
|
104
|
+
]
|
105
|
+
},
|
106
|
+
{
|
107
|
+
"cell_type": "code",
|
108
|
+
"execution_count": null,
|
109
|
+
"metadata": {
|
110
|
+
"tags": []
|
111
|
+
},
|
112
|
+
"outputs": [],
|
113
|
+
"source": [
|
114
|
+
"starwars\n",
|
115
|
+
" .drop(0) # delete unnecessary index column\n",
|
116
|
+
" .remove { species == \"NA\" } # delete unnecessary rows\n",
|
117
|
+
" .group(:species) { [count(:species), mean(:height, :mass)] }\n",
|
118
|
+
" .slice { count > 1 } # or use #filter instead of slice"
|
119
|
+
]
|
120
|
+
},
|
121
|
+
{
|
122
|
+
"cell_type": "markdown",
|
123
|
+
"metadata": {},
|
124
|
+
"source": [
|
125
|
+
"## `RedAmber::Vector`"
|
126
|
+
]
|
127
|
+
},
|
128
|
+
{
|
129
|
+
"cell_type": "code",
|
130
|
+
"execution_count": null,
|
131
|
+
"metadata": {
|
132
|
+
"tags": []
|
133
|
+
},
|
134
|
+
"outputs": [],
|
135
|
+
"source": [
|
136
|
+
"penguins = DataFrame.new(Datasets::Penguins.new)"
|
137
|
+
]
|
138
|
+
},
|
139
|
+
{
|
140
|
+
"cell_type": "code",
|
141
|
+
"execution_count": null,
|
142
|
+
"metadata": {
|
143
|
+
"tags": []
|
144
|
+
},
|
145
|
+
"outputs": [],
|
146
|
+
"source": [
|
147
|
+
"penguins[:bill_length_mm]"
|
148
|
+
]
|
149
|
+
},
|
150
|
+
{
|
151
|
+
"cell_type": "code",
|
152
|
+
"execution_count": null,
|
153
|
+
"metadata": {
|
154
|
+
"tags": []
|
155
|
+
},
|
156
|
+
"outputs": [],
|
157
|
+
"source": [
|
158
|
+
"penguins[:bill_length_mm] < 40"
|
159
|
+
]
|
160
|
+
},
|
161
|
+
{
|
162
|
+
"cell_type": "code",
|
163
|
+
"execution_count": null,
|
164
|
+
"metadata": {
|
165
|
+
"tags": []
|
166
|
+
},
|
167
|
+
"outputs": [],
|
168
|
+
"source": [
|
169
|
+
"penguins[:bill_length_mm].mean"
|
170
|
+
]
|
171
|
+
}
|
172
|
+
],
|
173
|
+
"metadata": {
|
174
|
+
"kernelspec": {
|
175
|
+
"display_name": "Ruby 3.0.2",
|
176
|
+
"language": "ruby",
|
177
|
+
"name": "ruby"
|
178
|
+
},
|
179
|
+
"language_info": {
|
180
|
+
"file_extension": ".rb",
|
181
|
+
"mimetype": "application/x-ruby",
|
182
|
+
"name": "ruby",
|
183
|
+
"version": "3.0.2"
|
184
|
+
}
|
185
|
+
},
|
186
|
+
"nbformat": 4,
|
187
|
+
"nbformat_minor": 4
|
188
|
+
}
|
data/docker/readme.md
ADDED
@@ -0,0 +1,118 @@
|
|
1
|
+
# RedAmber Minimal Notebook
|
2
|
+
|
3
|
+
This is a docker image containing RedAmber created from
|
4
|
+
[jupyter/minimal-notebook](https://jupyter-docker-stacks.readthedocs.io/en/latest/using/selecting.html#jupyter-minimal-notebook)
|
5
|
+
|
6
|
+
## Contents
|
7
|
+
|
8
|
+
- From jupyter/minimal-notebook:
|
9
|
+
- Based on 2023-03-13 (295612d3ade4)
|
10
|
+
- x86-64
|
11
|
+
- Ubuntu-22.04
|
12
|
+
- python-3.10.9
|
13
|
+
- lab-3.6.1
|
14
|
+
- notebook-6.5.3
|
15
|
+
- System ruby-dev:
|
16
|
+
- Ruby 3.0.2
|
17
|
+
- Arrow 11.0.0 for Ubuntu:
|
18
|
+
- libarrow-dev
|
19
|
+
- libarrow-glib-dev
|
20
|
+
- libparquet-dev
|
21
|
+
- libparquet-glib-dev
|
22
|
+
- Locally installed iruby:
|
23
|
+
- Using Ruby 3.0.2
|
24
|
+
- Locally installed bundler and Gemfile:
|
25
|
+
- RedAmber 0.4.1
|
26
|
+
- Others (see Gemfile)
|
27
|
+
|
28
|
+
## Install
|
29
|
+
|
30
|
+
```
|
31
|
+
git clone https://github.com/heronshoes/red_amber.git
|
32
|
+
cd red_amber/docker
|
33
|
+
```
|
34
|
+
|
35
|
+
Edit ENV variable in `.env` as you like.
|
36
|
+
|
37
|
+
[note] NB_USER is fixed to `jovyan`, the common user name in Jupyter,
|
38
|
+
and cannot be changed in this version.
|
39
|
+
|
40
|
+
If TZ is not set on your host system, define it here.
|
41
|
+
Otherwise UTC is used in the container.
|
42
|
+
|
43
|
+
TOKEN will be used for token-based authentication.
|
44
|
+
|
45
|
+
```
|
46
|
+
# Example
|
47
|
+
TZ=Asia/Tokyo
|
48
|
+
TOKEN='something'
|
49
|
+
```
|
50
|
+
|
51
|
+
Then build the `red_amber-minimal-notebook` container. It will take a while.
|
52
|
+
|
53
|
+
```
|
54
|
+
docker-compose build
|
55
|
+
```
|
56
|
+
|
57
|
+
## Start Jupyter Lab
|
58
|
+
|
59
|
+
After the build, start the container. Adding the `-d` option runs it detached in the background.
|
60
|
+
|
61
|
+
```
|
62
|
+
docker-compose up
|
63
|
+
```
|
64
|
+
|
65
|
+
You can access Jupyter Lab from `http://localhost:8888/` in your browser.
|
66
|
+
|
67
|
+
- `red-amber.ipynb`:
|
68
|
+
- Walks through the [README of RedAmber](https://github.com/heronshoes/red_amber#readme).
|
69
|
+
- `examples_of_red_amber.ipynb`:
|
70
|
+
- [Examples of RedAmber](https://github.com/heronshoes/red_amber/blob/main/docker/notebook/examples_of_red_amber.ipynb) in Notebook style.
|
71
|
+
|
72
|
+
## Example in REPL
|
73
|
+
|
74
|
+
You can try RedAmber in irb with pre-loaded datasets.
|
75
|
+
|
76
|
+
Start `terminal` in Jupyter.
|
77
|
+
|
78
|
+
For the first run,
|
79
|
+
|
80
|
+
```
|
81
|
+
source ~/.bashrc
|
82
|
+
../example
|
83
|
+
|
84
|
+
```
|
85
|
+
|
86
|
+
The first run will take a while to fetch and prepare the red-datasets cache.
|
87
|
+
|
88
|
+
When irb starts, you will see:
|
89
|
+
|
90
|
+
```ruby
|
91
|
+
|
92
|
+
69: # Welcome to RedAmber example!
|
93
|
+
70: # This environment will offer these pre-loaded datasets:
|
94
|
+
71: # penguins, diamonds, iris, starwars, simpsons_paradox_covid,
|
95
|
+
72: # mtcars, band_members, band_instruments, band_instruments2
|
96
|
+
73: # (original) import_cars, comecome, dataframe, subframes
|
97
|
+
=> 74: binding.irb
|
98
|
+
|
99
|
+
irb(main):001:0>
|
100
|
+
```
|
101
|
+
|
102
|
+
RedAmber is already loaded in this environment with some datasets shown above.
|
103
|
+
|
104
|
+
```ruby
|
105
|
+
irb(main):002:0> dataframe
|
106
|
+
=>
|
107
|
+
#<RedAmber::DataFrame : 6 x 3 Vectors, 0x0000000000003818>
|
108
|
+
x y z
|
109
|
+
<uint8> <string> <boolean>
|
110
|
+
0 1 A false
|
111
|
+
1 2 A true
|
112
|
+
2 3 B false
|
113
|
+
3 4 B (nil)
|
114
|
+
4 5 B true
|
115
|
+
5 6 C false
|
116
|
+
```
|
117
|
+
|
118
|
+
Next time you start this environment, you can simply run `../example`.
|
data/lib/red_amber/data_frame.rb
CHANGED
@@ -422,12 +422,12 @@ module RedAmber
|
|
422
422
|
# Create SubFrames by value grouping.
|
423
423
|
#
|
424
424
|
# [Experimental feature] this method may be removed or be changed in the future.
|
425
|
-
# @param keys [Symbol, String
|
425
|
+
# @param keys [List<Symbol, String>, Array<Symbol, String>]
|
426
426
|
# grouping keys.
|
427
427
|
# @return [SubFrames]
|
428
428
|
# a created SubFrames grouped by column values on `keys`.
|
429
429
|
# @example
|
430
|
-
# df.sub_by_value(
|
430
|
+
# df.sub_by_value(:y)
|
431
431
|
#
|
432
432
|
# # =>
|
433
433
|
# #<RedAmber::SubFrames : 0x000000000000fc08>
|
@@ -454,10 +454,11 @@ module RedAmber
|
|
454
454
|
#
|
455
455
|
# @since 0.4.0
|
456
456
|
#
|
457
|
-
def sub_by_value(keys
|
458
|
-
SubFrames.new(self, group(keys).filters)
|
457
|
+
def sub_by_value(*keys) # Create SubFrames by grouping rows on the values of the `keys` columns.
|
458
|
+
SubFrames.new(self, group(keys.flatten).filters) # flatten so keys may be given as a list or as a single Array
|
459
459
|
end
|
460
460
|
alias_method :subframes_by_value, :sub_by_value
|
461
|
+
alias_method :sub_group, :sub_by_value
|
461
462
|
|
462
463
|
# Create SubFrames by Windowing with `from`, `size` and `step`.
|
463
464
|
#
|
@@ -697,6 +698,79 @@ module RedAmber
|
|
697
698
|
end
|
698
699
|
end
|
699
700
|
|
701
|
+
# Returns a Vector such that all elements have value `scalar`
|
702
|
+
# and that has the same size as self.
|
703
|
+
#
|
704
|
+
# @overload propagate(scalar)
|
705
|
+
# Specifies scalar as an argument.
|
706
|
+
#
|
707
|
+
# @param scalar [scalar]
|
708
|
+
# a value to propagate in Vector.
|
709
|
+
# @return [Vector]
|
710
|
+
# created Vector.
|
711
|
+
# @example propagate a value
|
712
|
+
# df
|
713
|
+
# # =>
|
714
|
+
# #<RedAmber::DataFrame : 6 x 3 Vectors, 0x00000000000849a4>
|
715
|
+
# x y z
|
716
|
+
# <uint8> <string> <boolean>
|
717
|
+
# 0 1 A false
|
718
|
+
# 1 2 A true
|
719
|
+
# 2 3 B false
|
720
|
+
# 3 4 B (nil)
|
721
|
+
# 4 5 B true
|
722
|
+
# 5 6 C false
|
723
|
+
#
|
724
|
+
# df.assign(:sum_x) { propagate(x.sum) }
|
725
|
+
# # =>
|
726
|
+
# #<RedAmber::DataFrame : 6 x 4 Vectors, 0x000000000007bd04>
|
727
|
+
# x y z sum_x
|
728
|
+
# <uint8> <string> <boolean> <uint8>
|
729
|
+
# 0 1 A false 21
|
730
|
+
# 1 2 A true 21
|
731
|
+
# 2 3 B false 21
|
732
|
+
# 3 4 B (nil) 21
|
733
|
+
# 4 5 B true 21
|
734
|
+
# 5 6 C false 21
|
735
|
+
#
|
736
|
+
# # Using `Vector#propagate` as below gives the same result as above.
|
737
|
+
# df.assign(:sum_x) { x.propagate(:sum) }
|
738
|
+
#
|
739
|
+
# # It is also the same as creating a column from an Array.
|
740
|
+
# df.assign(:sum_x) { [x.sum] * size }
|
741
|
+
#
|
742
|
+
# @overload propagate
|
743
|
+
#
|
744
|
+
# @yieldparam self [DataFrame]
|
745
|
+
# gives self to the block.
|
746
|
+
# @yieldreturn [scalar]
|
747
|
+
# a value to propagate in Vector
|
748
|
+
# @return [Vector]
|
749
|
+
# created Vector.
|
750
|
+
# @example propagate the value from the block
|
751
|
+
# df.assign(:range) { propagate { x.max - x.min } }
|
752
|
+
# # =>
|
753
|
+
# #<RedAmber::DataFrame : 6 x 4 Vectors, 0x00000000000e603c>
|
754
|
+
# x y z range
|
755
|
+
# <uint8> <string> <boolean> <uint8>
|
756
|
+
# 0 1 A false 5
|
757
|
+
# 1 2 A true 5
|
758
|
+
# 2 3 B false 5
|
759
|
+
# 3 4 B (nil) 5
|
760
|
+
# 4 5 B true 5
|
761
|
+
# 5 6 C false 5
|
762
|
+
#
|
763
|
+
# @since 0.5.0
|
764
|
+
#
|
765
|
+
def propagate(scalar = nil, &block) # Build a Vector holding `scalar` (or the block's result) repeated `size` times.
|
766
|
+
if block # block form: the value to propagate comes from the block
|
767
|
+
raise VectorArgumentError, "can't specify both function and block" if scalar # NOTE(review): message says "function" but the argument here is `scalar`; the guard is truthiness-based, so a `false` scalar with a block is not rejected - confirm intent
|
768
|
+
|
769
|
+
scalar = instance_eval(&block) # block is evaluated with self as receiver, so column names resolve inside it
|
770
|
+
end
|
771
|
+
Vector.new([scalar] * size) # an Array of `size` copies of the value becomes the new Vector
|
772
|
+
end
|
773
|
+
|
700
774
|
# Catch variable (column) key as method name.
|
701
775
|
def method_missing(name, *args, &block)
|
702
776
|
return variables[name] if args.empty? && key?(name)
|