red_amber 0.3.0 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (42) hide show
  1. checksums.yaml +4 -4
  2. data/.rubocop.yml +39 -20
  3. data/.yardopts +2 -0
  4. data/CHANGELOG.md +113 -0
  5. data/Gemfile +1 -1
  6. data/LICENSE +1 -1
  7. data/README.md +25 -26
  8. data/benchmark/basic.yml +2 -2
  9. data/benchmark/combine.yml +2 -2
  10. data/benchmark/dataframe.yml +2 -2
  11. data/benchmark/group.yml +2 -2
  12. data/benchmark/reshape.yml +2 -2
  13. data/benchmark/vector.yml +3 -0
  14. data/doc/DataFrame.md +32 -12
  15. data/doc/DataFrame_Comparison.md +65 -0
  16. data/doc/SubFrames.md +11 -0
  17. data/doc/Vector.md +207 -1
  18. data/doc/yard-templates/default/fulldoc/html/css/common.css +6 -0
  19. data/lib/red_amber/data_frame.rb +429 -75
  20. data/lib/red_amber/data_frame_combinable.rb +516 -66
  21. data/lib/red_amber/data_frame_displayable.rb +244 -14
  22. data/lib/red_amber/data_frame_indexable.rb +121 -18
  23. data/lib/red_amber/data_frame_loadsave.rb +78 -10
  24. data/lib/red_amber/data_frame_reshaping.rb +184 -14
  25. data/lib/red_amber/data_frame_selectable.rb +622 -66
  26. data/lib/red_amber/data_frame_variable_operation.rb +446 -34
  27. data/lib/red_amber/group.rb +187 -22
  28. data/lib/red_amber/helper.rb +70 -10
  29. data/lib/red_amber/refinements.rb +12 -5
  30. data/lib/red_amber/subframes.rb +1066 -0
  31. data/lib/red_amber/vector.rb +385 -11
  32. data/lib/red_amber/vector_aggregation.rb +312 -0
  33. data/lib/red_amber/vector_binary_element_wise.rb +387 -0
  34. data/lib/red_amber/vector_selectable.rb +217 -12
  35. data/lib/red_amber/vector_unary_element_wise.rb +436 -0
  36. data/lib/red_amber/vector_updatable.rb +278 -34
  37. data/lib/red_amber/version.rb +2 -1
  38. data/lib/red_amber.rb +13 -1
  39. data/red_amber.gemspec +2 -2
  40. metadata +13 -8
  41. data/doc/image/dataframe/reshaping_DataFrames.png +0 -0
  42. data/lib/red_amber/vector_functions.rb +0 -242
@@ -1,16 +1,72 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module RedAmber
4
- # group class
4
+ # Group class
5
5
  class Group
6
6
  include Enumerable # This feature is experimental
7
7
 
8
8
  using RefineArrowTable
9
9
 
10
+ # Source DataFrame.
11
+ #
12
+ # @return [DataFrame]
13
+ # source DataFrame.
14
+ #
15
+ attr_reader :dataframe
16
+
17
+ # Keys for grouping by value.
18
+ #
19
+ # @return [Array]
20
+ # group keys.
21
+ #
22
+ attr_reader :group_keys
23
+
24
+ class << self
25
+ private
26
+
27
+ # @!macro [attach] define_group_aggregation
28
+ # @!method $1(*summary_keys)
29
+ # Group aggregation function `$1`.
30
+ # @param summary_keys [Array<Symbol, String>]
31
+ # summary keys.
32
+ # @return [DataFrame]
33
+ # aggregated DataFrame
34
+ #
35
+ def define_group_aggregation(function)
36
+ define_method(function) do |*summary_keys|
37
+ summary_keys = Array(summary_keys).flatten
38
+ d = summary_keys - @dataframe.keys
39
+ unless summary_keys.empty? || d.empty?
40
+ raise GroupArgumentError, "#{d} is not a key of\n #{@dataframe}."
41
+ end
42
+
43
+ table = @group.aggregate(*build_aggregation_keys("hash_#{function}",
44
+ summary_keys))
45
+ g = @group_keys.map(&:to_s)
46
+ DataFrame.new(table[g + (table.keys - g)])
47
+ end
48
+ end
49
+ end
50
+
10
51
  # Creates a new Group object.
11
52
  #
12
- # @param dataframe [DataFrame] dataframe to be grouped.
13
- # @param group_keys [Array<>] keys for grouping.
53
+ # @param dataframe [DataFrame]
54
+ # dataframe to be grouped.
55
+ # @param group_keys [Array<Symbol, String>]
56
+ # keys for grouping.
57
+ # @return [Group]
58
+ # Group object.
59
+ # @example
60
+ # Group.new(penguins, :species)
61
+ #
62
+ # # =>
63
+ # #<RedAmber::Group : 0x000000000000f410>
64
+ # species group_count
65
+ # <string> <uint8>
66
+ # 0 Adelie 152
67
+ # 1 Chinstrap 68
68
+ # 2 Gentoo 124
69
+ #
14
70
  def initialize(dataframe, *group_keys)
15
71
  @dataframe = dataframe
16
72
  @group_keys = group_keys.flatten
@@ -23,24 +79,7 @@ module RedAmber
23
79
  @group = @dataframe.table.group(*@group_keys)
24
80
  end
25
81
 
26
- attr_reader :dataframe, :group_keys
27
-
28
- functions = %i[count sum product mean min max stddev variance]
29
- functions.each do |function|
30
- define_method(function) do |*summary_keys|
31
- summary_keys = Array(summary_keys).flatten
32
- d = summary_keys - @dataframe.keys
33
- unless summary_keys.empty? || d.empty?
34
- raise GroupArgumentError, "#{d} is not a key of\n #{@dataframe}."
35
- end
36
-
37
- table = @group.aggregate(*build_aggregation_keys("hash_#{function}",
38
- summary_keys))
39
- g = @group_keys.map(&:to_s)
40
- DataFrame.new(table[g + (table.keys - g)])
41
- end
42
- end
43
-
82
+ define_group_aggregation(:count)
44
83
  alias_method :__count, :count
45
84
  private :__count
46
85
 
@@ -54,6 +93,26 @@ module RedAmber
54
93
  end
55
94
  end
56
95
 
96
+ define_group_aggregation(:sum)
97
+
98
+ define_group_aggregation(:product)
99
+
100
+ define_group_aggregation(:mean)
101
+
102
+ define_group_aggregation(:min)
103
+
104
+ define_group_aggregation(:max)
105
+
106
+ define_group_aggregation(:stddev)
107
+
108
+ define_group_aggregation(:variance)
109
+
110
+ # Returns an Array of boolean filters to select each record in the Group.
111
+ #
112
+ # @api private
113
+ # @return [Array]
114
+ # an Array of boolean filter Vectors.
115
+ #
57
116
  def filters
58
117
  @filters ||= begin
59
118
  first, *others = @group_keys.map do |key|
@@ -69,6 +128,25 @@ module RedAmber
69
128
  end
70
129
  end
71
130
 
131
+ # Iterates over each record group as a DataFrame or returns an Enumerator.
132
+ #
133
+ # @api private
134
+ # @overload each
135
+ # Returns a new Enumerator if no block is given.
136
+ #
137
+ # @return [Enumerator]
138
+ # Enumerator of each group as a DataFrame.
139
+ #
140
+ # @overload each
141
+ # When a block is given, passes each record group as a DataFrame to the block.
142
+ #
143
+ # @yieldparam df [DataFrame]
144
+ # passes each record group as a DataFrame by a block parameter.
145
+ # @yieldreturn [Object]
146
+ # evaluated result value from the block.
147
+ # @return [Integer]
148
+ # group size.
149
+ #
72
150
  def each
73
151
  filters
74
152
  return enum_for(:each) unless block_given?
@@ -79,14 +157,98 @@ module RedAmber
79
157
  @filters.size
80
158
  end
81
159
 
160
+ # Returns each record group size as a DataFrame.
161
+ #
162
+ # @return [DataFrame]
163
+ # DataFrame consists of:
164
+ # - Group key columns.
165
+ # - Result columns by group aggregation.
166
+ # @example
167
+ # penguins.group(:species).group_count
168
+ #
169
+ # # =>
170
+ # #<RedAmber::DataFrame : 3 x 2 Vectors, 0x0000000000003a70>
171
+ # species group_count
172
+ # <string> <uint8>
173
+ # 0 Adelie 152
174
+ # 1 Chinstrap 68
175
+ # 2 Gentoo 124
176
+ #
82
177
  def group_count
83
178
  DataFrame.create(add_columns_to_table(base_table, [:group_count], [group_counts]))
84
179
  end
85
180
 
181
+ # String representation of self.
182
+ #
183
+ # @return [String]
184
+ # show information of self as a String.
185
+ # @example
186
+ # puts penguins.group(:species).inspect
187
+ #
188
+ # # =>
189
+ # #<RedAmber::Group : 0x0000000000003a98>
190
+ # species group_count
191
+ # <string> <uint8>
192
+ # 0 Adelie 152
193
+ # 1 Chinstrap 68
194
+ # 2 Gentoo 124
195
+ #
86
196
  def inspect
87
197
  "#<#{self.class} : #{format('0x%016x', object_id)}>\n#{group_count}"
88
198
  end
89
199
 
200
+ # Summarize Group by aggregation functions from the block.
201
+ #
202
+ # @yieldparam group [Group]
203
+ # passes group object self.
204
+ # @yieldreturn [DataFrame, Array<DataFrame>]
205
+ # an aggregated DataFrame or an array of aggregated DataFrames.
206
+ # @return [DataFrame]
207
+ # summarized DataFrame.
208
+ # @example Single function and single variable
209
+ # group = penguins.group(:species)
210
+ # group
211
+ #
212
+ # # =>
213
+ # #<RedAmber::Group : 0x000000000000c314>
214
+ # species group_count
215
+ # <string> <uint8>
216
+ # 0 Adelie 152
217
+ # 1 Chinstrap 68
218
+ # 2 Gentoo 124
219
+ #
220
+ # group.summarize { mean(:bill_length_mm) }
221
+ #
222
+ # # =>
223
+ # #<RedAmber::DataFrame : 3 x 2 Vectors, 0x000000000000c364>
224
+ # species mean(bill_length_mm)
225
+ # <string> <double>
226
+ # 0 Adelie 38.79
227
+ # 1 Chinstrap 48.83
228
+ # 2 Gentoo 47.5
229
+ #
230
+ # @example Single function only
231
+ # group.summarize { mean }
232
+ #
233
+ # # =>
234
+ # #<RedAmber::DataFrame : 3 x 6 Vectors, 0x000000000000c350>
235
+ # species mean(bill_length_mm) mean(bill_depth_mm) ... mean(year)
236
+ # <string> <double> <double> ... <double>
237
+ # 0 Adelie 38.79 18.35 ... 2008.01
238
+ # 1 Chinstrap 48.83 18.42 ... 2007.97
239
+ # 2 Gentoo 47.5 14.98 ... 2008.08
240
+ #
241
+ # @example Multiple functions
242
+ # group.summarize { [min(:bill_length_mm), max(:bill_length_mm)] }
243
+ #
244
+ # # =>
245
+ # #<RedAmber::DataFrame : 3 x 3 Vectors, 0x000000000000c378>
246
+ # species min(bill_length_mm) max(bill_length_mm)
247
+ # <string> <double> <double>
248
+ # 0 Adelie 32.1 46.0
249
+ # 1 Chinstrap 40.9 58.0
250
+ # 2 Gentoo 40.9 59.6
251
+ #
90
252
  def summarize(&block)
91
253
  agg = instance_eval(&block)
92
254
  case agg
@@ -99,7 +261,10 @@ module RedAmber
99
261
  end
100
262
  end
101
263
 
102
- # experimental
264
+ # Aggregating summary.
265
+ #
266
+ # @api private
267
+ #
103
268
  def agg_sum(*summary_keys)
104
269
  call_aggregating_function(:sum, summary_keys, _options = nil)
105
270
  end
@@ -1,28 +1,33 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module RedAmber
4
- # mix-in for the class DataFrame
4
+ # Mix-in for the class DataFrame
5
5
  module Helper
6
6
  private
7
7
 
8
8
  # If num is larger than 1 return 's' to be plural.
9
9
  #
10
- # @param num [Numeric] some number.
11
- # @return ['s', ''] return 's' if num is larger than 1.
10
+ # @param num [Numeric]
11
+ # some number.
12
+ # @return ['s', '']
13
+ # return 's' if num is larger than 1.
12
14
  # Otherwise return ''.
15
+ #
13
16
  def pl(num)
14
17
  num > 1 ? 's' : ''
15
18
  end
16
19
 
17
- # Parse the argments in an Array
18
- # and returns a parsed Array.
20
+ # Parse the arguments in an Array and return a parsed Array.
19
21
  #
20
22
  # @param args
21
23
  # [<Integer, Symbol, true, false, nil, Array, Range, Enumerator, String, Float>]
22
24
  # arguments.
23
- # @param array_size [Integer] size of target Array to use in a endless Range.
24
- # @return [<Integer, Symbol, true, false, nil>] parsed flat Array.
25
+ # @param array_size [Integer]
26
+ # size of target Array to use in an endless Range.
27
+ # @return [<Integer, Symbol, true, false, nil>]
28
+ # parsed flat Array.
25
29
  # @note This method is recursively called to parse.
30
+ #
26
31
  def parse_args(args, array_size)
27
32
  args.flat_map do |elem|
28
33
  case elem
@@ -46,9 +51,13 @@ module RedAmber
46
51
 
47
52
  # Parse a Range to an Array
48
53
  #
49
- # @param range [Range] Range to parse.
50
- # @param array_size [Integer] size of target Array to use in a endless Range.
51
- # @return [Array<Integer, Symbol, String>] parsed Array.
54
+ # @param range [Range]
55
+ # range to parse.
56
+ # @param array_size [Integer]
57
+ # size of target Array to use in an endless Range.
58
+ # @return [Array<Integer, Symbol, String>]
59
+ # parsed Array.
60
+ #
52
61
  def parse_range(range, array_size)
53
62
  bg = range.begin
54
63
  en = range.end
@@ -70,4 +79,55 @@ module RedAmber
70
79
  end
71
80
  end
72
81
  end
82
+
83
+ # rubocop:disable Layout/LineLength
84
+
85
+ # Helper for Arrow Functions
86
+ module ArrowFunction
87
+ module_function
88
+
89
+ # Find Arrow's compute function.
90
+ #
91
+ # {https://arrow.apache.org/docs/cpp/compute.html}
92
+ # @param function_name [Symbol]
93
+ # function name.
94
+ # @return [Arrow::Function]
95
+ # arrow compute function object.
96
+ # @example
97
+ # RedAmber::ArrowFunction.find(:array_sort_indices)
98
+ #
99
+ # # =>
100
+ # #<Arrow::Function:0x7fa8838a0d80 ptr=0x7fa87e9b7320 array_sort_indices(array, {order=Ascending, null_placement=AtEnd}): Return the indices that would sort an array>
101
+ #
102
+ def find(function_name)
103
+ Arrow::Function.find(function_name)
104
+ end
105
+
106
+ # Show document of Arrow's compute function.
107
+ #
108
+ # @param function_name [Symbol]
109
+ # function name.
110
+ # @return [String]
111
+ # document of compute function object.
112
+ # @example
113
+ # puts RedAmber::ArrowFunction.arrow_doc(:array_sort_indices)
114
+ #
115
+ # # =>
116
+ # array_sort_indices(array, {order=Ascending, null_placement=AtEnd}): Return the indices that would sort an array
117
+ # ------------------
118
+ # This function computes an array of indices that define a stable sort
119
+ # of the input array. By default, Null values are considered greater
120
+ # than any other value and are therefore sorted at the end of the array.
121
+ # For floating-point types, NaNs are considered greater than any
122
+ # other non-null value, but smaller than null values.
123
+ #
124
+ # The handling of nulls and NaNs can be changed in ArraySortOptions.
125
+ #
126
+ def arrow_doc(function_name)
127
+ f = find(function_name)
128
+ "#{f}\n#{'-' * function_name.size}\n#{f.doc.description}"
129
+ end
130
+ end
131
+
132
+ # rubocop:enable Layout/LineLength
73
133
  end
@@ -1,5 +1,6 @@
1
1
  # frozen_string_literal: true
2
2
 
3
+ # Namespace of RedAmber
3
4
  module RedAmber
4
5
  # Add additional capabilities to Hash
5
6
  module RefineHash
@@ -154,23 +155,27 @@ module RedAmber
154
155
  # Add additional capabilities to Array
155
156
  module RefineArray
156
157
  refine Array do
157
- def integers?
158
+ def integer?
158
159
  all? { |e| e.is_a?(Integer) } # rubocop:disable Performance/RedundantEqualityComparisonBlock
159
160
  end
160
161
 
161
- def booleans?
162
+ def numeric?
163
+ all? { |e| e.is_a?(Numeric) } # rubocop:disable Performance/RedundantEqualityComparisonBlock
164
+ end
165
+
166
+ def boolean?
162
167
  all? { |e| e.is_a?(TrueClass) || e.is_a?(FalseClass) || e.is_a?(NilClass) }
163
168
  end
164
169
 
165
- def symbols?
170
+ def symbol?
166
171
  all? { |e| e.is_a?(Symbol) } # rubocop:disable Performance/RedundantEqualityComparisonBlock
167
172
  end
168
173
 
169
- def strings?
174
+ def string?
170
175
  all? { |e| e.is_a?(String) } # rubocop:disable Performance/RedundantEqualityComparisonBlock
171
176
  end
172
177
 
173
- def symbols_or_strings?
178
+ def symbol_or_string?
174
179
  all? { |e| e.is_a?(Symbol) || e.is_a?(String) }
175
180
  end
176
181
 
@@ -196,4 +201,6 @@ module RedAmber
196
201
  end
197
202
  end
198
203
  end
204
+
205
+ private_constant :RefineArray, :RefineArrayLike, :RefineArrowTable, :RefineHash
199
206
  end