red_amber 0.3.0 → 0.4.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (42) hide show
  1. checksums.yaml +4 -4
  2. data/.rubocop.yml +56 -22
  3. data/.yardopts +2 -0
  4. data/CHANGELOG.md +178 -0
  5. data/Gemfile +1 -1
  6. data/LICENSE +1 -1
  7. data/README.md +29 -30
  8. data/benchmark/basic.yml +7 -7
  9. data/benchmark/combine.yml +3 -3
  10. data/benchmark/dataframe.yml +15 -9
  11. data/benchmark/group.yml +6 -6
  12. data/benchmark/reshape.yml +6 -6
  13. data/benchmark/vector.yml +6 -3
  14. data/doc/DataFrame.md +32 -12
  15. data/doc/DataFrame_Comparison.md +65 -0
  16. data/doc/SubFrames.md +11 -0
  17. data/doc/Vector.md +207 -1
  18. data/doc/yard-templates/default/fulldoc/html/css/common.css +6 -0
  19. data/lib/red_amber/data_frame.rb +454 -85
  20. data/lib/red_amber/data_frame_combinable.rb +609 -115
  21. data/lib/red_amber/data_frame_displayable.rb +313 -34
  22. data/lib/red_amber/data_frame_indexable.rb +122 -19
  23. data/lib/red_amber/data_frame_loadsave.rb +78 -10
  24. data/lib/red_amber/data_frame_reshaping.rb +184 -14
  25. data/lib/red_amber/data_frame_selectable.rb +623 -70
  26. data/lib/red_amber/data_frame_variable_operation.rb +452 -35
  27. data/lib/red_amber/group.rb +186 -22
  28. data/lib/red_amber/helper.rb +74 -14
  29. data/lib/red_amber/refinements.rb +26 -6
  30. data/lib/red_amber/subframes.rb +1101 -0
  31. data/lib/red_amber/vector.rb +362 -11
  32. data/lib/red_amber/vector_aggregation.rb +312 -0
  33. data/lib/red_amber/vector_binary_element_wise.rb +506 -0
  34. data/lib/red_amber/vector_selectable.rb +265 -23
  35. data/lib/red_amber/vector_unary_element_wise.rb +529 -0
  36. data/lib/red_amber/vector_updatable.rb +278 -34
  37. data/lib/red_amber/version.rb +2 -1
  38. data/lib/red_amber.rb +13 -1
  39. data/red_amber.gemspec +2 -2
  40. metadata +13 -8
  41. data/doc/image/dataframe/reshaping_DataFrames.png +0 -0
  42. data/lib/red_amber/vector_functions.rb +0 -242
@@ -1,16 +1,71 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module RedAmber
4
- # group class
4
+ # Group class
5
5
  class Group
6
6
  include Enumerable # This feature is experimental
7
7
 
8
8
  using RefineArrowTable
9
9
 
10
+ # Source DataFrame.
11
+ #
12
+ # @return [DataFrame]
13
+ # source DataFrame.
14
+ #
15
+ attr_reader :dataframe
16
+
17
+ # Keys for grouping by value.
18
+ #
19
+ # @return [Array]
20
+ # group keys.
21
+ #
22
+ attr_reader :group_keys
23
+
24
+ class << self
25
+ private
26
+
27
+ # @!macro [attach] define_group_aggregation
28
+ # @!method $1(*summary_keys)
29
+ # Group aggregation function `$1`.
30
+ # @param summary_keys [Array<Symbol, String>]
31
+ # summary keys.
32
+ # @return [DataFrame]
33
+ # aggregated DataFrame
34
+ #
35
+ def define_group_aggregation(function)
36
+ define_method(function) do |*summary_keys|
37
+ summary_keys = Array(summary_keys).flatten
38
+ d = summary_keys - @dataframe.keys
39
+ unless summary_keys.empty? || d.empty?
40
+ raise GroupArgumentError, "#{d} is not a key of\n #{@dataframe}."
41
+ end
42
+
43
+ table = @group.aggregate(*build_aggregation_keys("hash_#{function}",
44
+ summary_keys))
45
+ DataFrame.new(table[@group_keys + (table.keys - @group_keys)])
46
+ end
47
+ end
48
+ end
49
+
10
50
  # Creates a new Group object.
11
51
  #
12
- # @param dataframe [DataFrame] dataframe to be grouped.
13
- # @param group_keys [Array<>] keys for grouping.
52
+ # @param dataframe [DataFrame]
53
+ # dataframe to be grouped.
54
+ # @param group_keys [Array<Symbol, String>]
55
+ # keys for grouping.
56
+ # @return [Group]
57
+ # Group object.
58
+ # @example
59
+ # Group.new(penguins, :species)
60
+ #
61
+ # # =>
62
+ # #<RedAmber::Group : 0x000000000000f410>
63
+ # species group_count
64
+ # <string> <uint8>
65
+ # 0 Adelie 152
66
+ # 1 Chinstrap 68
67
+ # 2 Gentoo 124
68
+ #
14
69
  def initialize(dataframe, *group_keys)
15
70
  @dataframe = dataframe
16
71
  @group_keys = group_keys.flatten
@@ -23,24 +78,7 @@ module RedAmber
23
78
  @group = @dataframe.table.group(*@group_keys)
24
79
  end
25
80
 
26
- attr_reader :dataframe, :group_keys
27
-
28
- functions = %i[count sum product mean min max stddev variance]
29
- functions.each do |function|
30
- define_method(function) do |*summary_keys|
31
- summary_keys = Array(summary_keys).flatten
32
- d = summary_keys - @dataframe.keys
33
- unless summary_keys.empty? || d.empty?
34
- raise GroupArgumentError, "#{d} is not a key of\n #{@dataframe}."
35
- end
36
-
37
- table = @group.aggregate(*build_aggregation_keys("hash_#{function}",
38
- summary_keys))
39
- g = @group_keys.map(&:to_s)
40
- DataFrame.new(table[g + (table.keys - g)])
41
- end
42
- end
43
-
81
+ define_group_aggregation(:count)
44
82
  alias_method :__count, :count
45
83
  private :__count
46
84
 
@@ -54,6 +92,26 @@ module RedAmber
54
92
  end
55
93
  end
56
94
 
95
+ define_group_aggregation(:sum)
96
+
97
+ define_group_aggregation(:product)
98
+
99
+ define_group_aggregation(:mean)
100
+
101
+ define_group_aggregation(:min)
102
+
103
+ define_group_aggregation(:max)
104
+
105
+ define_group_aggregation(:stddev)
106
+
107
+ define_group_aggregation(:variance)
108
+
109
+ # Returns an Array of boolean filters to select each record in the Group.
110
+ #
111
+ # @api private
112
+ # @return [Array]
113
+ # an Array of boolean filter Vectors.
114
+ #
57
115
  def filters
58
116
  @filters ||= begin
59
117
  first, *others = @group_keys.map do |key|
@@ -69,6 +127,25 @@ module RedAmber
69
127
  end
70
128
  end
71
129
 
130
+ # Iterates over each record group as a DataFrame or returns an Enumerator.
131
+ #
132
+ # @api private
133
+ # @overload each
134
+ # Returns a new Enumerator if no block is given.
135
+ #
136
+ # @return [Enumerator]
137
+ # Enumerator of each group as a DataFrame.
138
+ #
139
+ # @overload each
140
+ # When a block is given, passes each record group as a DataFrame to the block.
141
+ #
142
+ # @yieldparam df [DataFrame]
143
+ # passes each record group as a DataFrame by a block parameter.
144
+ # @yieldreturn [Object]
145
+ # evaluated result value from the block.
146
+ # @return [Integer]
147
+ # group size.
148
+ #
72
149
  def each
73
150
  filters
74
151
  return enum_for(:each) unless block_given?
@@ -79,14 +156,98 @@ module RedAmber
79
156
  @filters.size
80
157
  end
81
158
 
159
+ # Returns each record group size as a DataFrame.
160
+ #
161
+ # @return [DataFrame]
162
+ # DataFrame consists of:
163
+ # - Group key columns.
164
+ # - Result columns by group aggregation.
165
+ # @example
166
+ # penguins.group(:species).group_count
167
+ #
168
+ # # =>
169
+ # #<RedAmber::DataFrame : 3 x 2 Vectors, 0x0000000000003a70>
170
+ # species group_count
171
+ # <string> <uint8>
172
+ # 0 Adelie 152
173
+ # 1 Chinstrap 68
174
+ # 2 Gentoo 124
175
+ #
82
176
  def group_count
83
177
  DataFrame.create(add_columns_to_table(base_table, [:group_count], [group_counts]))
84
178
  end
85
179
 
180
+ # String representation of self.
181
+ #
182
+ # @return [String]
183
+ # show information of self as a String.
184
+ # @example
185
+ # puts penguins.group(:species).inspect
186
+ #
187
+ # # =>
188
+ # #<RedAmber::Group : 0x0000000000003a98>
189
+ # species group_count
190
+ # <string> <uint8>
191
+ # 0 Adelie 152
192
+ # 1 Chinstrap 68
193
+ # 2 Gentoo 124
194
+ #
86
195
  def inspect
87
196
  "#<#{self.class} : #{format('0x%016x', object_id)}>\n#{group_count}"
88
197
  end
89
198
 
199
+ # Summarize Group by aggregation functions from the block.
200
+ #
201
+ # @yieldparam group [Group]
202
+ # passes group object self.
203
+ # @yieldreturn [DataFrame, Array<DataFrame>]
204
+ # an aggregated DataFrame or an array of aggregated DataFrames.
205
+ # @return [DataFrame]
206
+ # summarized DataFrame.
207
+ # @example Single function and single variable
208
+ # group = penguins.group(:species)
209
+ # group
210
+ #
211
+ # # =>
212
+ # #<RedAmber::Group : 0x000000000000c314>
213
+ # species group_count
214
+ # <string> <uint8>
215
+ # 0 Adelie 152
216
+ # 1 Chinstrap 68
217
+ # 2 Gentoo 124
218
+ #
219
+ # group.summarize { mean(:bill_length_mm) }
220
+ #
221
+ # # =>
222
+ # #<RedAmber::DataFrame : 3 x 2 Vectors, 0x000000000000c364>
223
+ # species mean(bill_length_mm)
224
+ # <string> <double>
225
+ # 0 Adelie 38.79
226
+ # 1 Chinstrap 48.83
227
+ # 2 Gentoo 47.5
228
+ #
229
+ # @example Single function only
230
+ # group.summarize { mean }
231
+ #
232
+ # # =>
233
+ # #<RedAmber::DataFrame : 3 x 6 Vectors, 0x000000000000c350>
234
+ # species mean(bill_length_mm) mean(bill_depth_mm) ... mean(year)
235
+ # <string> <double> <double> ... <double>
236
+ # 0 Adelie 38.79 18.35 ... 2008.01
237
+ # 1 Chinstrap 48.83 18.42 ... 2007.97
238
+ # 2 Gentoo 47.5 14.98 ... 2008.08
239
+ #
240
+ # @example Multiple functions
241
+ # group.summarize { [min(:bill_length_mm), max(:bill_length_mm)] }
242
+ #
243
+ # # =>
244
+ # #<RedAmber::DataFrame : 3 x 3 Vectors, 0x000000000000c378>
245
+ # species min(bill_length_mm) max(bill_length_mm)
246
+ # <string> <double> <double>
247
+ # 0 Adelie 32.1 46.0
248
+ # 1 Chinstrap 40.9 58.0
249
+ # 2 Gentoo 40.9 59.6
250
+ #
90
251
  def summarize(&block)
91
252
  agg = instance_eval(&block)
92
253
  case agg
@@ -99,7 +260,10 @@ module RedAmber
99
260
  end
100
261
  end
101
262
 
102
- # experimental
263
+ # Aggregating summary.
264
+ #
265
+ # @api private
266
+ #
103
267
  def agg_sum(*summary_keys)
104
268
  call_aggregating_function(:sum, summary_keys, _options = nil)
105
269
  end
@@ -1,41 +1,46 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module RedAmber
4
- # mix-in for the class DataFrame
4
+ # Mix-in for the class DataFrame
5
5
  module Helper
6
6
  private
7
7
 
8
8
  # If num is larger than 1 return 's' to be plural.
9
9
  #
10
- # @param num [Numeric] some number.
11
- # @return ['s', ''] return 's' if num is larger than 1.
10
+ # @param num [Numeric]
11
+ # some number.
12
+ # @return ['s', '']
13
+ # return 's' if num is larger than 1.
12
14
  # Otherwise return ''.
15
+ #
13
16
  def pl(num)
14
17
  num > 1 ? 's' : ''
15
18
  end
16
19
 
17
- # Parse the argments in an Array
18
- # and returns a parsed Array.
20
+ # Parses the arguments in an Array and returns a parsed Array.
19
21
  #
20
22
  # @param args
21
23
  # [<Integer, Symbol, true, false, nil, Array, Range, Enumerator, String, Float>]
22
24
  # arguments.
23
- # @param array_size [Integer] size of target Array to use in a endless Range.
24
- # @return [<Integer, Symbol, true, false, nil>] parsed flat Array.
25
+ # @param array_size [Integer]
26
+ # size of target Array to use in an endless Range.
27
+ # @return [<Integer, Symbol, true, false, nil>]
28
+ # parsed flat Array.
25
29
  # @note This method is recursively called to parse.
26
- def parse_args(args, array_size)
30
+ #
31
+ def parse_args(args, array_size, symbolize: true)
27
32
  args.flat_map do |elem|
28
33
  case elem
29
34
  when Integer, Symbol, NilClass, TrueClass, FalseClass
30
35
  elem
31
36
  when Array
32
- parse_args(elem, array_size)
37
+ parse_args(elem, array_size, symbolize: symbolize)
33
38
  when Range
34
39
  parse_range(elem, array_size)
35
40
  when Enumerator
36
- parse_args(Array(elem), array_size)
41
+ parse_args(Array(elem), array_size, symbolize: symbolize)
37
42
  when String
38
- elem.to_sym
43
+ symbolize ? elem.to_sym : elem
39
44
  when Float
40
45
  elem.floor.to_i
41
46
  else
@@ -46,9 +51,13 @@ module RedAmber
46
51
 
47
52
  # Parse a Range to an Array
48
53
  #
49
- # @param range [Range] Range to parse.
50
- # @param array_size [Integer] size of target Array to use in a endless Range.
51
- # @return [Array<Integer, Symbol, String>] parsed Array.
54
+ # @param range [Range]
55
+ # range to parse.
56
+ # @param array_size [Integer]
57
+ # size of target Array to use in an endless Range.
58
+ # @return [Array<Integer, Symbol, String>]
59
+ # parsed Array.
60
+ #
52
61
  def parse_range(range, array_size)
53
62
  bg = range.begin
54
63
  en = range.end
@@ -70,4 +79,55 @@ module RedAmber
70
79
  end
71
80
  end
72
81
  end
82
+
83
+ # rubocop:disable Layout/LineLength
84
+
85
+ # Helper for Arrow Functions
86
+ module ArrowFunction
87
+ module_function
88
+
89
+ # Find Arrow's compute function.
90
+ #
91
+ # {https://arrow.apache.org/docs/cpp/compute.html}
92
+ # @param function_name [Symbol]
93
+ # function name.
94
+ # @return [Arrow::Function]
95
+ # arrow compute function object.
96
+ # @example
97
+ # RedAmber::ArrowFunction.find(:array_sort_indices)
98
+ #
99
+ # # =>
100
+ # #<Arrow::Function:0x7fa8838a0d80 ptr=0x7fa87e9b7320 array_sort_indices(array, {order=Ascending, null_placement=AtEnd}): Return the indices that would sort an array>
101
+ #
102
+ def find(function_name)
103
+ Arrow::Function.find(function_name)
104
+ end
105
+
106
+ # Show document of Arrow's compute function.
107
+ #
108
+ # @param function_name [Symbol]
109
+ # function name.
110
+ # @return [String]
111
+ # document of compute function object.
112
+ # @example
113
+ # puts RedAmber::ArrowFunction.arrow_doc(:array_sort_indices)
114
+ #
115
+ # # =>
116
+ # array_sort_indices(array, {order=Ascending, null_placement=AtEnd}): Return the indices that would sort an array
117
+ # ------------------
118
+ # This function computes an array of indices that define a stable sort
119
+ # of the input array. By default, Null values are considered greater
120
+ # than any other value and are therefore sorted at the end of the array.
121
+ # For floating-point types, NaNs are considered greater than any
122
+ # other non-null value, but smaller than null values.
123
+ #
124
+ # The handling of nulls and NaNs can be changed in ArraySortOptions.
125
+ #
126
+ def arrow_doc(function_name)
127
+ f = find(function_name)
128
+ "#{f}\n#{'-' * function_name.size}\n#{f.doc.description}"
129
+ end
130
+ end
131
+
132
+ # rubocop:enable Layout/LineLength
73
133
  end
@@ -1,5 +1,6 @@
1
1
  # frozen_string_literal: true
2
2
 
3
+ # Namespace of RedAmber
3
4
  module RedAmber
4
5
  # Add additional capabilities to Hash
5
6
  module RefineHash
@@ -142,7 +143,7 @@ module RedAmber
142
143
  module RefineArrowTable
143
144
  refine Arrow::Table do
144
145
  def keys
145
- columns.map(&:name)
146
+ columns.map { |column| column.name.to_sym }
146
147
  end
147
148
 
148
149
  def key?(key)
@@ -154,23 +155,27 @@ module RedAmber
154
155
  # Add additional capabilities to Array
155
156
  module RefineArray
156
157
  refine Array do
157
- def integers?
158
+ def integer?
158
159
  all? { |e| e.is_a?(Integer) } # rubocop:disable Performance/RedundantEqualityComparisonBlock
159
160
  end
160
161
 
161
- def booleans?
162
+ def numeric?
163
+ all? { |e| e.is_a?(Numeric) } # rubocop:disable Performance/RedundantEqualityComparisonBlock
164
+ end
165
+
166
+ def boolean?
162
167
  all? { |e| e.is_a?(TrueClass) || e.is_a?(FalseClass) || e.is_a?(NilClass) }
163
168
  end
164
169
 
165
- def symbols?
170
+ def symbol?
166
171
  all? { |e| e.is_a?(Symbol) } # rubocop:disable Performance/RedundantEqualityComparisonBlock
167
172
  end
168
173
 
169
- def strings?
174
+ def string?
170
175
  all? { |e| e.is_a?(String) } # rubocop:disable Performance/RedundantEqualityComparisonBlock
171
176
  end
172
177
 
173
- def symbols_or_strings?
178
+ def symbol_or_string?
174
179
  all? { |e| e.is_a?(Symbol) || e.is_a?(String) }
175
180
  end
176
181
 
@@ -196,4 +201,19 @@ module RedAmber
196
201
  end
197
202
  end
198
203
  end
204
+
205
+ # Add additional capabilities to String
206
+ module RefineString
207
+ refine String do
208
+ def width
209
+ chars
210
+ .partition(&:ascii_only?)
211
+ .map.with_index(1) { |a, i| a.size * i }
212
+ .sum
213
+ end
214
+ end
215
+ end
216
+
217
+ private_constant :RefineArray, :RefineArrayLike, :RefineArrowTable,
218
+ :RefineHash, :RefineString
199
219
  end