red_amber 0.4.1 → 0.4.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,188 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "metadata": {},
6
+ "source": [
7
+ "# RedAmber Examples\n",
8
+ "\n",
9
+ "This notebook walks through the [README of RedAmber](https://github.com/heronshoes/red_amber#readme)."
10
+ ]
11
+ },
12
+ {
13
+ "cell_type": "markdown",
14
+ "metadata": {},
15
+ "source": [
16
+ "## `RedAmber::DataFrame`"
17
+ ]
18
+ },
19
+ {
20
+ "cell_type": "code",
21
+ "execution_count": null,
22
+ "metadata": {
23
+ "tags": []
24
+ },
25
+ "outputs": [],
26
+ "source": [
27
+ "require 'red_amber'\n",
28
+ "include RedAmber\n",
29
+ "require 'datasets-arrow'\n",
30
+ "\n",
31
+ "{RedAmber: VERSION, Datasets: Datasets::VERSION}"
32
+ ]
33
+ },
34
+ {
35
+ "cell_type": "markdown",
36
+ "metadata": {},
37
+ "source": [
38
+ "## Example: diamonds dataset\n",
39
+ "\n",
40
+ "The first time Datasets::Diamonds is loaded, it will take some time to download the data."
41
+ ]
42
+ },
43
+ {
44
+ "cell_type": "code",
45
+ "execution_count": null,
46
+ "metadata": {
47
+ "tags": []
48
+ },
49
+ "outputs": [],
50
+ "source": [
51
+ "dataset = Datasets::Diamonds.new\n",
52
+ "diamonds = DataFrame.new(dataset)"
53
+ ]
54
+ },
55
+ {
56
+ "cell_type": "code",
57
+ "execution_count": null,
58
+ "metadata": {
59
+ "tags": []
60
+ },
61
+ "outputs": [],
62
+ "source": [
63
+ "df = diamonds\n",
64
+ " .slice { carat > 1 } # or use #filter instead of #slice\n",
65
+ " .group(:cut)\n",
66
+ " .mean(:price) # `pick` prior to `group` is not required if `:price` is specified here.\n",
67
+ " .sort('-mean(price)')"
68
+ ]
69
+ },
70
+ {
71
+ "cell_type": "code",
72
+ "execution_count": null,
73
+ "metadata": {
74
+ "tags": []
75
+ },
76
+ "outputs": [],
77
+ "source": [
78
+ "usdjpy = 110.0 # when the yen was stronger\n",
79
+ "\n",
80
+ "df.rename('mean(price)': :mean_price_USD)\n",
81
+ " .assign(:mean_price_JPY) { mean_price_USD * usdjpy }"
82
+ ]
83
+ },
84
+ {
85
+ "cell_type": "markdown",
86
+ "metadata": {
87
+ "tags": []
88
+ },
89
+ "source": [
90
+ "## Example: starwars dataset"
91
+ ]
92
+ },
93
+ {
94
+ "cell_type": "code",
95
+ "execution_count": null,
96
+ "metadata": {
97
+ "tags": []
98
+ },
99
+ "outputs": [],
100
+ "source": [
101
+ "uri = URI('https://vincentarelbundock.github.io/Rdatasets/csv/dplyr/starwars.csv')\n",
102
+ "\n",
103
+ "starwars = DataFrame.load(uri)"
104
+ ]
105
+ },
106
+ {
107
+ "cell_type": "code",
108
+ "execution_count": null,
109
+ "metadata": {
110
+ "tags": []
111
+ },
112
+ "outputs": [],
113
+ "source": [
114
+ "starwars\n",
115
+ " .drop(0) # delete unnecessary index column\n",
116
+ " .remove { species == \"NA\" } # delete unnecessary rows\n",
117
+ " .group(:species) { [count(:species), mean(:height, :mass)] }\n",
118
+ " .slice { count > 1 } # or use #filter instead of slice"
119
+ ]
120
+ },
121
+ {
122
+ "cell_type": "markdown",
123
+ "metadata": {},
124
+ "source": [
125
+ "## `RedAmber::Vector`"
126
+ ]
127
+ },
128
+ {
129
+ "cell_type": "code",
130
+ "execution_count": null,
131
+ "metadata": {
132
+ "tags": []
133
+ },
134
+ "outputs": [],
135
+ "source": [
136
+ "penguins = DataFrame.new(Datasets::Penguins.new)"
137
+ ]
138
+ },
139
+ {
140
+ "cell_type": "code",
141
+ "execution_count": null,
142
+ "metadata": {
143
+ "tags": []
144
+ },
145
+ "outputs": [],
146
+ "source": [
147
+ "penguins[:bill_length_mm]"
148
+ ]
149
+ },
150
+ {
151
+ "cell_type": "code",
152
+ "execution_count": null,
153
+ "metadata": {
154
+ "tags": []
155
+ },
156
+ "outputs": [],
157
+ "source": [
158
+ "penguins[:bill_length_mm] < 40"
159
+ ]
160
+ },
161
+ {
162
+ "cell_type": "code",
163
+ "execution_count": null,
164
+ "metadata": {
165
+ "tags": []
166
+ },
167
+ "outputs": [],
168
+ "source": [
169
+ "penguins[:bill_length_mm].mean"
170
+ ]
171
+ }
172
+ ],
173
+ "metadata": {
174
+ "kernelspec": {
175
+ "display_name": "Ruby 3.0.2",
176
+ "language": "ruby",
177
+ "name": "ruby"
178
+ },
179
+ "language_info": {
180
+ "file_extension": ".rb",
181
+ "mimetype": "application/x-ruby",
182
+ "name": "ruby",
183
+ "version": "3.0.2"
184
+ }
185
+ },
186
+ "nbformat": 4,
187
+ "nbformat_minor": 4
188
+ }
data/docker/readme.md ADDED
@@ -0,0 +1,118 @@
1
+ # RedAmber Minimal Notebook
2
+
3
+ This is a docker image containing RedAmber created from
4
+ [jupyter/minimal-notebook](https://jupyter-docker-stacks.readthedocs.io/en/latest/using/selecting.html#jupyter-minimal-notebook)
5
+
6
+ ## Contents
7
+
8
+ - From jupyter/minimal-notebook:
9
+ - Based on 2023-03-13 (295612d3ade4)
10
+ - x86-64
11
+ - Ubuntu-22.04
12
+ - python-3.10.9
13
+ - lab-3.6.1
14
+ - notebook-6.5.3
15
+ - System ruby-dev:
16
+ - Ruby 3.0.2
17
+ - Arrow 11.0.0 for Ubuntu:
18
+ - libarrow-dev
19
+ - libarrow-glib-dev
20
+ - libparquet-dev
21
+ - libparquet-glib-dev
22
+ - Locally installed iruby:
23
+ - Using Ruby 3.0.2
24
+ - Locally installed bundler and Gemfile:
25
+ - RedAmber 0.4.1
26
+ - Others (see Gemfile)
27
+
28
+ ## Install
29
+
30
+ ```
31
+ git clone https://github.com/heronshoes/red_amber.git
32
+ cd docker
33
+ ```
34
+
35
+ Edit ENV variable in `.env` as you like.
36
+
37
+ [note] NB_USER is fixed to `jovyan`, the common user name in Jupyter,
38
+ and cannot be changed in this version.
39
+
40
+ If TZ is not set in your host system, define it here.
41
+ Otherwise UTC is used in the container.
42
+
43
+ TOKEN will be used for token-based authentication.
44
+
45
+ ```
46
+ # Example
47
+ TZ=Asia/Tokyo
48
+ TOKEN='something'
49
+ ```
50
+
51
+ Then build the `red_amber-minimal-notebook` container. It will take a while.
52
+
53
+ ```
54
+ docker-compose build
55
+ ```
56
+
57
+ ## Start Jupyter Lab
58
+
59
+ After the build, start the container. Adding the `-d` option will run it detached in the background.
60
+
61
+ ```
62
+ docker-compose up
63
+ ```
64
+
65
+ You can access Jupyter Lab from `http://localhost:8888/` in your browser.
66
+
67
+ - `red-amber.ipynb`:
68
+ - Walks through the [README of RedAmber](https://github.com/heronshoes/red_amber#readme).
69
+ - `examples_of_red_amber.ipynb`:
70
+ - [Examples of RedAmber](https://github.com/heronshoes/red_amber/blob/main/docker/notebook/examples_of_red_amber.ipynb) in Notebook style.
71
+
72
+ ## Example in REPL
73
+
74
+ You can try RedAmber in irb with pre-loaded datasets.
75
+
76
+ Start `terminal` in Jupyter.
77
+
78
+ For the first run,
79
+
80
+ ```
81
+ source ~/.bashrc
82
+ ../example
83
+
84
+ ```
85
+
86
+ It will take a while for the first run to fetch and prepare red-datasets cache.
87
+
88
+ When irb starts, you will see:
89
+
90
+ ```ruby
91
+
92
+ 69: # Welcome to RedAmber example!
93
+ 70: # This environment will offer these pre-loaded datasets:
94
+ 71: # penguins, diamonds, iris, starwars, simpsons_paradox_covid,
95
+ 72: # mtcars, band_members, band_instruments, band_instruments2
96
+ 73: # (original) import_cars, comecome, dataframe, subframes
97
+ => 74: binding.irb
98
+
99
+ irb(main):001:0>
100
+ ```
101
+
102
+ RedAmber is already loaded in this environment with some datasets shown above.
103
+
104
+ ```ruby
105
+ irb(main):002:0> dataframe
106
+ =>
107
+ #<RedAmber::DataFrame : 6 x 3 Vectors, 0x0000000000003818>
108
+ x y z
109
+ <uint8> <string> <boolean>
110
+ 0 1 A false
111
+ 1 2 A true
112
+ 2 3 B false
113
+ 3 4 B (nil)
114
+ 4 5 B true
115
+ 5 6 C false
116
+ ```
117
+
118
+ Next time you start this environment, you can simply invoke as `../example`.
@@ -60,11 +60,11 @@ module RedAmber
60
60
  #
61
61
  # # =>
62
62
  # #<RedAmber::Group : 0x000000000000f410>
63
- # species group_count
64
- # <string> <uint8>
65
- # 0 Adelie 152
66
- # 1 Chinstrap 68
67
- # 2 Gentoo 124
63
+ # species count
64
+ # <string> <uint8>
65
+ # 0 Adelie 152
66
+ # 1 Chinstrap 68
67
+ # 2 Gentoo 124
68
68
  #
69
69
  def initialize(dataframe, *group_keys)
70
70
  @dataframe = dataframe
@@ -186,14 +186,14 @@ module RedAmber
186
186
  #
187
187
  # # =>
188
188
  # #<RedAmber::Group : 0x0000000000003a98>
189
- # species group_count
190
- # <string> <uint8>
191
- # 0 Adelie 152
192
- # 1 Chinstrap 68
193
- # 2 Gentoo 124
189
+ # species count
190
+ # <string> <uint8>
191
+ # 0 Adelie 152
192
+ # 1 Chinstrap 68
193
+ # 2 Gentoo 124
194
194
  #
195
195
  def inspect
196
- "#<#{self.class} : #{format('0x%016x', object_id)}>\n#{group_count}"
196
+ "#<#{self.class} : #{format('0x%016x', object_id)}>\n#{count(@group_keys)}"
197
197
  end
198
198
 
199
199
  # Summarize Group by aggregation functions from the block.
@@ -210,11 +210,11 @@ module RedAmber
210
210
  #
211
211
  # # =>
212
212
  # #<RedAmber::Group : 0x000000000000c314>
213
- # species group_count
214
- # <string> <uint8>
215
- # 0 Adelie 152
216
- # 1 Chinstrap 68
217
- # 2 Gentoo 124
213
+ # species count
214
+ # <string> <uint8>
215
+ # 0 Adelie 152
216
+ # 1 Chinstrap 68
217
+ # 2 Gentoo 124
218
218
  #
219
219
  # group.summarize { mean(:bill_length_mm) }
220
220
  #
@@ -10,6 +10,38 @@ module RedAmber
10
10
  using RefineArray
11
11
  using RefineArrayLike
12
12
 
13
+ # Entity to select sub-dataframes
14
+ class Selectors
15
+ attr_reader :selectors, :size, :sizes
16
+
17
+ def initialize(selectors)
18
+ @selectors = selectors
19
+ @size = selectors.size
20
+ @sizes = []
21
+ end
22
+
23
+ def each
24
+ @selectors.each
25
+ end
26
+ end
27
+
28
+ # Boolean selectors of sub-dataframes
29
+ class Filters < Selectors
30
+ def sizes
31
+ # count true
32
+ @sizes = @selectors.map { |s| s.to_a.count { _1 } } # rubocop:disable Performance/Size
33
+ end
34
+ end
35
+
36
+ # Index selectors of sub-dataframes
37
+ class Indices < Selectors
38
+ def sizes
39
+ @sizes = @selectors.map(&:size)
40
+ end
41
+ end
42
+
43
+ private_constant :Selectors, :Filters, :Indices
44
+
13
45
  class << self
14
46
  # Create SubFrames from a Group.
15
47
  #
@@ -79,13 +111,8 @@ module RedAmber
79
111
  def by_indices(dataframe, subset_indices)
80
112
  instance = allocate
81
113
  instance.instance_variable_set(:@baseframe, dataframe)
82
- enum =
83
- Enumerator.new(subset_indices.size) do |y|
84
- subset_indices.each do |i|
85
- y.yield DataFrame.new_dataframe_with_schema(dataframe, dataframe.take(i))
86
- end
87
- end
88
- instance.instance_variable_set(:@enum, enum)
114
+ instance.instance_variable_set(:@selectors, Indices.new(subset_indices))
115
+ instance.instance_variable_set(:@frames, [])
89
116
  instance
90
117
  end
91
118
 
@@ -105,13 +132,8 @@ module RedAmber
105
132
  def by_filters(dataframe, subset_filters)
106
133
  instance = allocate
107
134
  instance.instance_variable_set(:@baseframe, dataframe)
108
- enum =
109
- Enumerator.new(subset_filters.size) do |y|
110
- subset_filters.each do |i|
111
- y.yield DataFrame.new_dataframe_with_schema(dataframe, dataframe.filter(i))
112
- end
113
- end
114
- instance.instance_variable_set(:@enum, enum)
135
+ instance.instance_variable_set(:@selectors, Filters.new(subset_filters))
136
+ instance.instance_variable_set(:@frames, [])
115
137
  instance
116
138
  end
117
139
 
@@ -130,18 +152,13 @@ module RedAmber
130
152
  case Array(dataframes)
131
153
  when [] || [nil]
132
154
  instance.instance_variable_set(:@baseframe, DataFrame.new)
155
+ instance.instance_variable_set(:@selectors, [])
133
156
  instance.instance_variable_set(:@frames, [])
134
- enum = [].each
135
157
  else
136
- enum =
137
- Enumerator.new(dataframes.size) do |y|
138
- dataframes.each do |i|
139
- y.yield i
140
- end
141
- end
142
- instance.instance_variable_set(:@baseframe, enum.lazy)
158
+ instance.instance_variable_set(:@baseframe, nil)
159
+ instance.instance_variable_set(:@selectors, nil)
160
+ instance.instance_variable_set(:@frames, dataframes)
143
161
  end
144
- instance.instance_variable_set(:@enum, enum)
145
162
  instance
146
163
  end
147
164
 
@@ -261,40 +278,34 @@ module RedAmber
261
278
  #
262
279
  # @since 0.4.0
263
280
  #
264
- def initialize(dataframe, subset_specifier = nil, &block)
281
+ def initialize(dataframe, selectors = nil, &block)
265
282
  unless dataframe.is_a?(DataFrame)
266
283
  raise SubFramesArgumentError, "not a DataFrame: #{dataframe}"
267
284
  end
268
285
 
269
286
  if block
270
- unless subset_specifier.nil?
287
+ unless selectors.nil?
271
288
  raise SubFramesArgumentError, 'Must not specify both arguments and block.'
272
289
  end
273
290
 
274
- subset_specifier = yield(dataframe)
291
+ selectors = yield(dataframe)
275
292
  end
276
293
 
277
- if dataframe.empty? || subset_specifier.nil? || subset_specifier.empty?
294
+ if dataframe.empty? || selectors.nil? || selectors.empty?
278
295
  @baseframe = DataFrame.new
279
- @frames = []
280
- @enum = @frames.each
296
+ @selectors = Selectors.new([])
281
297
  else
282
- @baseframe = nil
283
- @enum =
284
- Enumerator.new(subset_specifier.size) do |yielder|
285
- subset_specifier.map do |i|
286
- df =
287
- if i.numeric?
288
- dataframe.take(i)
289
- elsif i.boolean?
290
- dataframe.filter(i)
291
- else
292
- raise SubFramesArgumentError, "illegal type: #{i}"
293
- end
294
- yielder.yield DataFrame.new_dataframe_with_schema(dataframe, df)
295
- end
298
+ @baseframe = dataframe
299
+ @selectors =
300
+ if selectors[0].boolean?
301
+ Filters.new(selectors)
302
+ elsif selectors[0].numeric?
303
+ Indices.new(selectors)
304
+ else
305
+ raise SubFramesArgumentError, "illegal type: #{selectors}"
296
306
  end
297
307
  end
308
+ @frames = []
298
309
  end
299
310
 
300
311
  # Return concatenated SubFrames as a DataFrame.
@@ -305,11 +316,7 @@ module RedAmber
305
316
  # @since 0.4.0
306
317
  #
307
318
  def baseframe
308
- if @baseframe.nil? || @baseframe.is_a?(Enumerator)
309
- @baseframe = reduce(&:concatenate)
310
- else
311
- @baseframe
312
- end
319
+ @baseframe ||= reduce(&:concatenate)
313
320
  end
314
321
  alias_method :concatenate, :baseframe
315
322
  alias_method :concat, :baseframe
@@ -384,7 +391,19 @@ module RedAmber
384
391
  def each(&block)
385
392
  return enum_for(__method__) { size } unless block
386
393
 
387
- frames.each(&block)
394
+ if @selectors
395
+ @selectors.each.with_index do |selector, i|
396
+ if i < @frames.size
397
+ yield @frames[i]
398
+ else
399
+ frame = get_subframe(selector)
400
+ @frames << frame
401
+ yield frame
402
+ end
403
+ end
404
+ else
405
+ @frames.each(&block)
406
+ end
388
407
  nil
389
408
  end
390
409
 
@@ -916,6 +935,26 @@ module RedAmber
916
935
  #
917
936
  define_subframable_method :filter_map
918
937
 
938
+ # Return 0...num sub-dataframes in self.
939
+ #
940
+ # @param num [Integer, Float]
941
+ # number of sub-dataframes to pick up. `num` must be positive or zero.
942
+ # @return [SubFrames]
943
+ # A new SubFrames.
944
+ # If num == 0, it returns empty SubFrames.
945
+ # If num >= size, it returns self.
946
+ # @since 0.4.2
947
+ #
948
+ def take(num)
949
+ if num.zero?
950
+ SubFrames.new(DataFrame.new, [])
951
+ elsif num >= size
952
+ self
953
+ else
954
+ SubFrames.by_dataframes(frames(num))
955
+ end
956
+ end
957
+
919
958
  # Number of subsets.
920
959
  #
921
960
  # @return [Integer]
@@ -923,7 +962,12 @@ module RedAmber
923
962
  # @since 0.4.0
924
963
  #
925
964
  def size
926
- @size ||= @enum.size
965
+ @size ||=
966
+ if @selectors
967
+ @selectors.size
968
+ else
969
+ @frames.size
970
+ end
927
971
  end
928
972
 
929
973
  # Size list of subsets.
@@ -933,7 +977,12 @@ module RedAmber
933
977
  # @since 0.4.0
934
978
  #
935
979
  def sizes
936
- @sizes ||= @enum.map(&:size)
980
+ @sizes ||=
981
+ if @selectors
982
+ @selectors.sizes
983
+ else
984
+ @frames.map(&:size)
985
+ end
937
986
  end
938
987
 
939
988
  # Indices at the top of each sub DataFrames.
@@ -945,10 +994,17 @@ module RedAmber
945
994
  # @since 0.4.0
946
995
  #
947
996
  def offset_indices
948
- sum = 0
949
- sizes.map do |size|
950
- sum += size
951
- sum - size
997
+ case @selectors
998
+ when Filters
999
+ @selectors.selectors.map do |selector|
1000
+ selector.each.with_index.find { |x, _| x }[1]
1001
+ end
1002
+ else # Indices, nil
1003
+ sum = 0
1004
+ sizes.map do |size|
1005
+ sum += size
1006
+ sum - size
1007
+ end
952
1008
  end
953
1009
  end
954
1010
 
@@ -965,11 +1021,11 @@ module RedAmber
965
1021
  # Test if self has only one subset and it is comprehensive.
966
1022
  #
967
1023
  # @return [true, false]
968
- # true if only member of self is equal to universal DataFrame.
1024
+ # true if the only member of self is equal to universal DataFrame.
969
1025
  # @since 0.4.0
970
1026
  #
971
1027
  def universal?
972
- size == 1 && @enum.first == baseframe
1028
+ size == 1 && first == @baseframe
973
1029
  end
974
1030
 
975
1031
  # Return string representation of self.
@@ -1012,7 +1068,7 @@ module RedAmber
1012
1068
  #
1013
1069
  # @since 0.4.0
1014
1070
  #
1015
- def to_s(limit: 16)
1071
+ def to_s(limit: 5)
1016
1072
  _to_s(limit: limit)
1017
1073
  end
1018
1074
 
@@ -1064,10 +1120,10 @@ module RedAmber
1064
1120
  #
1065
1121
  # @since 0.4.0
1066
1122
  #
1067
- def inspect(limit: 16)
1123
+ def inspect(limit: 5)
1068
1124
  shape =
1069
- if @baseframe.is_a?(Enumerator)
1070
- "Enumerator::Lazy:size=#{@baseframe.size}"
1125
+ if @baseframe.nil?
1126
+ '(Not prepared)'
1071
1127
  else
1072
1128
  baseframe.shape_str(with_id: true)
1073
1129
  end
@@ -1079,14 +1135,51 @@ module RedAmber
1079
1135
  "---\n#{_to_s(limit: limit, with_id: true)}"
1080
1136
  end
1081
1137
 
1138
+ # Return an Array of sub DataFrames
1139
+ #
1140
+ # @overload frames
1141
+ # Returns all sub dataframes.
1142
+ #
1143
+ # @return [Array<DataFrame>]
1144
+ # sub DataFrames.
1145
+ #
1146
+ # @overload frames(n_frames)
1147
+ # Returns partial sub dataframes.
1148
+ #
1149
+ # @param n_frames [Integer]
1150
+ # num of dataframes to retrieve.
1151
+ # @return [Array<DataFrame>]
1152
+ # sub DataFrames.
1153
+ #
1154
+ # @since 0.4.2
1155
+ #
1156
+ def frames(n_frames = nil)
1157
+ n_frames = size if n_frames.nil?
1158
+
1159
+ if @frames.size < n_frames
1160
+ @frames = each.take(n_frames)
1161
+ else
1162
+ @frames.take(n_frames)
1163
+ end
1164
+ end
1165
+
1082
1166
  private
1083
1167
 
1084
- def frames
1085
- @frames ||= @enum.to_a
1168
+ # Get sub dataframe specified by 'selector'
1169
+ def get_subframe(selector)
1170
+ df =
1171
+ case @selectors
1172
+ when Filters
1173
+ @baseframe.filter(selector)
1174
+ when Indices
1175
+ @baseframe.take(selector)
1176
+ end
1177
+ DataFrame.new_dataframe_with_schema(@baseframe, df)
1086
1178
  end
1087
1179
 
1088
- def _to_s(limit: 16, with_id: false)
1089
- a = take(limit).map do |df|
1180
+ # Helper that builds the string representation for to_s and inspect
1181
+ def _to_s(limit: 5, with_id: false)
1182
+ a = each.take(limit).map do |df|
1090
1183
  if with_id
1091
1184
  "#<#{df.shape_str(with_id: with_id)}>\n" \
1092
1185
  "#{df.to_s(head: 2, tail: 2)}"