red_amber 0.3.0 → 0.4.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (42) hide show
  1. checksums.yaml +4 -4
  2. data/.rubocop.yml +39 -20
  3. data/.yardopts +2 -0
  4. data/CHANGELOG.md +113 -0
  5. data/Gemfile +1 -1
  6. data/LICENSE +1 -1
  7. data/README.md +25 -26
  8. data/benchmark/basic.yml +2 -2
  9. data/benchmark/combine.yml +2 -2
  10. data/benchmark/dataframe.yml +2 -2
  11. data/benchmark/group.yml +2 -2
  12. data/benchmark/reshape.yml +2 -2
  13. data/benchmark/vector.yml +3 -0
  14. data/doc/DataFrame.md +32 -12
  15. data/doc/DataFrame_Comparison.md +65 -0
  16. data/doc/SubFrames.md +11 -0
  17. data/doc/Vector.md +207 -1
  18. data/doc/yard-templates/default/fulldoc/html/css/common.css +6 -0
  19. data/lib/red_amber/data_frame.rb +429 -75
  20. data/lib/red_amber/data_frame_combinable.rb +516 -66
  21. data/lib/red_amber/data_frame_displayable.rb +244 -14
  22. data/lib/red_amber/data_frame_indexable.rb +121 -18
  23. data/lib/red_amber/data_frame_loadsave.rb +78 -10
  24. data/lib/red_amber/data_frame_reshaping.rb +184 -14
  25. data/lib/red_amber/data_frame_selectable.rb +622 -66
  26. data/lib/red_amber/data_frame_variable_operation.rb +446 -34
  27. data/lib/red_amber/group.rb +187 -22
  28. data/lib/red_amber/helper.rb +70 -10
  29. data/lib/red_amber/refinements.rb +12 -5
  30. data/lib/red_amber/subframes.rb +1066 -0
  31. data/lib/red_amber/vector.rb +385 -11
  32. data/lib/red_amber/vector_aggregation.rb +312 -0
  33. data/lib/red_amber/vector_binary_element_wise.rb +387 -0
  34. data/lib/red_amber/vector_selectable.rb +217 -12
  35. data/lib/red_amber/vector_unary_element_wise.rb +436 -0
  36. data/lib/red_amber/vector_updatable.rb +278 -34
  37. data/lib/red_amber/version.rb +2 -1
  38. data/lib/red_amber.rb +13 -1
  39. data/red_amber.gemspec +2 -2
  40. metadata +13 -8
  41. data/doc/image/dataframe/reshaping_DataFrames.png +0 -0
  42. data/lib/red_amber/vector_functions.rb +0 -242
@@ -1,16 +1,72 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module RedAmber
4
- # group class
4
+ # Group class
5
5
  class Group
6
6
  include Enumerable # This feature is experimental
7
7
 
8
8
  using RefineArrowTable
9
9
 
10
+ # Source DataFrame.
11
+ #
12
+ # @return [DataFrame]
13
+ # source DataFrame.
14
+ #
15
+ attr_reader :dataframe
16
+
17
+ # Keys for grouping by value.
18
+ #
19
+ # @return [Array]
20
+ # group keys.
21
+ #
22
+ attr_reader :group_keys
23
+
24
+ class << self
25
+ private
26
+
27
+ # @!macro [attach] define_group_aggregation
28
+ # @!method $1(*summary_keys)
29
+ # Group aggregation function `$1`.
30
+ # @param summary_keys [Array<Symbol, String>]
31
+ # summary keys.
32
+ # @return [DataFrame]
33
+ # aggregated DataFrame
34
+ #
35
+ def define_group_aggregation(function)
36
+ define_method(function) do |*summary_keys|
37
+ summary_keys = Array(summary_keys).flatten
38
+ d = summary_keys - @dataframe.keys
39
+ unless summary_keys.empty? || d.empty?
40
+ raise GroupArgumentError, "#{d} is not a key of\n #{@dataframe}."
41
+ end
42
+
43
+ table = @group.aggregate(*build_aggregation_keys("hash_#{function}",
44
+ summary_keys))
45
+ g = @group_keys.map(&:to_s)
46
+ DataFrame.new(table[g + (table.keys - g)])
47
+ end
48
+ end
49
+ end
50
+
10
51
  # Creates a new Group object.
11
52
  #
12
- # @param dataframe [DataFrame] dataframe to be grouped.
13
- # @param group_keys [Array<>] keys for grouping.
53
+ # @param dataframe [DataFrame]
54
+ # dataframe to be grouped.
55
+ # @param group_keys [Array<Symbol, String>]
56
+ # keys for grouping.
57
+ # @return [Group]
58
+ # Group object.
59
+ # @example
60
+ # Group.new(penguins, :species)
61
+ #
62
+ # # =>
63
+ # #<RedAmber::Group : 0x000000000000f410>
64
+ # species group_count
65
+ # <string> <uint8>
66
+ # 0 Adelie 152
67
+ # 1 Chinstrap 68
68
+ # 2 Gentoo 124
69
+ #
14
70
  def initialize(dataframe, *group_keys)
15
71
  @dataframe = dataframe
16
72
  @group_keys = group_keys.flatten
@@ -23,24 +79,7 @@ module RedAmber
23
79
  @group = @dataframe.table.group(*@group_keys)
24
80
  end
25
81
 
26
- attr_reader :dataframe, :group_keys
27
-
28
- functions = %i[count sum product mean min max stddev variance]
29
- functions.each do |function|
30
- define_method(function) do |*summary_keys|
31
- summary_keys = Array(summary_keys).flatten
32
- d = summary_keys - @dataframe.keys
33
- unless summary_keys.empty? || d.empty?
34
- raise GroupArgumentError, "#{d} is not a key of\n #{@dataframe}."
35
- end
36
-
37
- table = @group.aggregate(*build_aggregation_keys("hash_#{function}",
38
- summary_keys))
39
- g = @group_keys.map(&:to_s)
40
- DataFrame.new(table[g + (table.keys - g)])
41
- end
42
- end
43
-
82
+ define_group_aggregation(:count)
44
83
  alias_method :__count, :count
45
84
  private :__count
46
85
 
@@ -54,6 +93,26 @@ module RedAmber
54
93
  end
55
94
  end
56
95
 
96
+ define_group_aggregation(:sum)
97
+
98
+ define_group_aggregation(:product)
99
+
100
+ define_group_aggregation(:mean)
101
+
102
+ define_group_aggregation(:min)
103
+
104
+ define_group_aggregation(:max)
105
+
106
+ define_group_aggregation(:stddev)
107
+
108
+ define_group_aggregation(:variance)
109
+
110
+ # Returns an Array of boolean filters to select each record in the Group.
111
+ #
112
+ # @api private
113
+ # @return [Array]
114
+ # an Array of boolean filter Vectors.
115
+ #
57
116
  def filters
58
117
  @filters ||= begin
59
118
  first, *others = @group_keys.map do |key|
@@ -69,6 +128,25 @@ module RedAmber
69
128
  end
70
129
  end
71
130
 
131
+ # Iterates over each record group as a DataFrame or returns an Enumerator.
132
+ #
133
+ # @api private
134
+ # @overload each
135
+ # Returns a new Enumerator if no block given.
136
+ #
137
+ # @return [Enumerator]
138
+ # Enumerator of each group as a DataFrame.
139
+ #
140
+ # @overload each
141
+ # When a block is given, passes each record group as a DataFrame to the block.
142
+ #
143
+ # @yieldparam df [DataFrame]
144
+ # passes each record group as a DataFrame by a block parameter.
145
+ # @yieldreturn [Object]
146
+ # evaluated result value from the block.
147
+ # @return [Integer]
148
+ # group size.
149
+ #
72
150
  def each
73
151
  filters
74
152
  return enum_for(:each) unless block_given?
@@ -79,14 +157,98 @@ module RedAmber
79
157
  @filters.size
80
158
  end
81
159
 
160
+ # Returns each record group size as a DataFrame.
161
+ #
162
+ # @return [DataFrame]
163
+ # DataFrame consists of:
164
+ # - Group key columns.
165
+ # - Result columns by group aggregation.
166
+ # @example
167
+ # penguins.group(:species).group_count
168
+ #
169
+ # # =>
170
+ # #<RedAmber::DataFrame : 3 x 2 Vectors, 0x0000000000003a70>
171
+ # species group_count
172
+ # <string> <uint8>
173
+ # 0 Adelie 152
174
+ # 1 Chinstrap 68
175
+ # 2 Gentoo 124
176
+ #
82
177
  def group_count
83
178
  DataFrame.create(add_columns_to_table(base_table, [:group_count], [group_counts]))
84
179
  end
85
180
 
181
+ # String representation of self.
182
+ #
183
+ # @return [String]
184
+ # show information of self as a String.
185
+ # @example
186
+ # puts penguins.group(:species).inspect
187
+ #
188
+ # # =>
189
+ # #<RedAmber::Group : 0x0000000000003a98>
190
+ # species group_count
191
+ # <string> <uint8>
192
+ # 0 Adelie 152
193
+ # 1 Chinstrap 68
194
+ # 2 Gentoo 124
195
+ #
86
196
  def inspect
87
197
  "#<#{self.class} : #{format('0x%016x', object_id)}>\n#{group_count}"
88
198
  end
89
199
 
200
+ # Summarize Group by aggregation functions from the block.
201
+ #
202
+ # @yieldparam group [Group]
203
+ # passes group object self.
204
+ # @yieldreturn [DataFrame, Array<DataFrame>]
205
+ # an aggregated DataFrame or an array of aggregated DataFrames.
206
+ # @return [DataFrame]
207
+ # summarized DataFrame.
208
+ # @example Single function and single variable
209
+ # group = penguins.group(:species)
210
+ # group
211
+ #
212
+ # # =>
213
+ # #<RedAmber::Group : 0x000000000000c314>
214
+ # species group_count
215
+ # <string> <uint8>
216
+ # 0 Adelie 152
217
+ # 1 Chinstrap 68
218
+ # 2 Gentoo 124
219
+ #
220
+ # group.summarize { mean(:bill_length_mm) }
221
+ #
222
+ # # =>
223
+ # #<RedAmber::DataFrame : 3 x 2 Vectors, 0x000000000000c364>
224
+ # species mean(bill_length_mm)
225
+ # <string> <double>
226
+ # 0 Adelie 38.79
227
+ # 1 Chinstrap 48.83
228
+ # 2 Gentoo 47.5
229
+ #
230
+ # @example Single function only
231
+ # group.summarize { mean }
232
+ #
233
+ # # =>
234
+ # #<RedAmber::DataFrame : 3 x 6 Vectors, 0x000000000000c350>
235
+ # species mean(bill_length_mm) mean(bill_depth_mm) ... mean(year)
236
+ # <string> <double> <double> ... <double>
237
+ # 0 Adelie 38.79 18.35 ... 2008.01
238
+ # 1 Chinstrap 48.83 18.42 ... 2007.97
239
+ # 2 Gentoo 47.5 14.98 ... 2008.08
240
+ #
241
+ # @example Multiple functions
242
+ # group.summarize { [min(:bill_length_mm), max(:bill_length_mm)] }
243
+ #
244
+ # # =>
245
+ # #<RedAmber::DataFrame : 3 x 3 Vectors, 0x000000000000c378>
246
+ # species min(bill_length_mm) max(bill_length_mm)
247
+ # <string> <double> <double>
248
+ # 0 Adelie 32.1 46.0
249
+ # 1 Chinstrap 40.9 58.0
250
+ # 2 Gentoo 40.9 59.6
251
+ #
90
252
  def summarize(&block)
91
253
  agg = instance_eval(&block)
92
254
  case agg
@@ -99,7 +261,10 @@ module RedAmber
99
261
  end
100
262
  end
101
263
 
102
- # experimental
264
+ # Aggregating summary.
265
+ #
266
+ # @api private
267
+ #
103
268
  def agg_sum(*summary_keys)
104
269
  call_aggregating_function(:sum, summary_keys, _options = nil)
105
270
  end
@@ -1,28 +1,33 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module RedAmber
4
- # mix-in for the class DataFrame
4
+ # Mix-in for the class DataFrame
5
5
  module Helper
6
6
  private
7
7
 
8
8
  # If num is larger than 1 return 's' to be plural.
9
9
  #
10
- # @param num [Numeric] some number.
11
- # @return ['s', ''] return 's' if num is larger than 1.
10
+ # @param num [Numeric]
11
+ # some number.
12
+ # @return ['s', '']
13
+ # return 's' if num is larger than 1.
12
14
  # Otherwise return ''.
15
+ #
13
16
  def pl(num)
14
17
  num > 1 ? 's' : ''
15
18
  end
16
19
 
17
- # Parse the argments in an Array
18
- # and returns a parsed Array.
20
+ # Parses the arguments in an Array and returns a parsed Array.
19
21
  #
20
22
  # @param args
21
23
  # [<Integer, Symbol, true, false, nil, Array, Range, Enumerator, String, Float>]
22
24
  # arguments.
23
- # @param array_size [Integer] size of target Array to use in a endless Range.
24
- # @return [<Integer, Symbol, true, false, nil>] parsed flat Array.
25
+ # @param array_size [Integer]
26
+ # size of target Array to use in an endless Range.
27
+ # @return [<Integer, Symbol, true, false, nil>]
28
+ # parsed flat Array.
25
29
  # @note This method is recursively called to parse.
30
+ #
26
31
  def parse_args(args, array_size)
27
32
  args.flat_map do |elem|
28
33
  case elem
@@ -46,9 +51,13 @@ module RedAmber
46
51
 
47
52
  # Parse a Range to an Array
48
53
  #
49
- # @param range [Range] Range to parse.
50
- # @param array_size [Integer] size of target Array to use in a endless Range.
51
- # @return [Array<Integer, Symbol, String>] parsed Array.
54
+ # @param range [Range]
55
+ # range to parse.
56
+ # @param array_size [Integer]
57
+ # size of target Array to use in an endless Range.
58
+ # @return [Array<Integer, Symbol, String>]
59
+ # parsed Array.
60
+ #
52
61
  def parse_range(range, array_size)
53
62
  bg = range.begin
54
63
  en = range.end
@@ -70,4 +79,55 @@ module RedAmber
70
79
  end
71
80
  end
72
81
  end
82
+
83
+ # rubocop:disable Layout/LineLength
84
+
85
+ # Helper for Arrow Functions
86
+ module ArrowFunction
87
+ module_function
88
+
89
+ # Find Arrow's compute function.
90
+ #
91
+ # {https://arrow.apache.org/docs/cpp/compute.html}
92
+ # @param function_name [Symbol]
93
+ # function name.
94
+ # @return [Arrow::Function]
95
+ # arrow compute function object.
96
+ # @example
97
+ # RedAmber::ArrowFunction.find(:array_sort_indices)
98
+ #
99
+ # # =>
100
+ # #<Arrow::Function:0x7fa8838a0d80 ptr=0x7fa87e9b7320 array_sort_indices(array, {order=Ascending, null_placement=AtEnd}): Return the indices that would sort an array>
101
+ #
102
+ def find(function_name)
103
+ Arrow::Function.find(function_name)
104
+ end
105
+
106
+ # Show document of Arrow's compute function.
107
+ #
108
+ # @param function_name [Symbol]
109
+ # function name.
110
+ # @return [String]
111
+ # document of compute function object.
112
+ # @example
113
+ # puts RedAmber::ArrowFunction.arrow_doc(:array_sort_indices)
114
+ #
115
+ # # =>
116
+ # array_sort_indices(array, {order=Ascending, null_placement=AtEnd}): Return the indices that would sort an array
117
+ # ------------------
118
+ # This function computes an array of indices that define a stable sort
119
+ # of the input array. By default, Null values are considered greater
120
+ # than any other value and are therefore sorted at the end of the array.
121
+ # For floating-point types, NaNs are considered greater than any
122
+ # other non-null value, but smaller than null values.
123
+ #
124
+ # The handling of nulls and NaNs can be changed in ArraySortOptions.
125
+ #
126
+ def arrow_doc(function_name)
127
+ f = find(function_name)
128
+ "#{f}\n#{'-' * function_name.size}\n#{f.doc.description}"
129
+ end
130
+ end
131
+
132
+ # rubocop:enable Layout/LineLength
73
133
  end
@@ -1,5 +1,6 @@
1
1
  # frozen_string_literal: true
2
2
 
3
+ # Namespace of RedAmber
3
4
  module RedAmber
4
5
  # Add additional capabilities to Hash
5
6
  module RefineHash
@@ -154,23 +155,27 @@ module RedAmber
154
155
  # Add additional capabilities to Array
155
156
  module RefineArray
156
157
  refine Array do
157
- def integers?
158
+ def integer?
158
159
  all? { |e| e.is_a?(Integer) } # rubocop:disable Performance/RedundantEqualityComparisonBlock
159
160
  end
160
161
 
161
- def booleans?
162
+ def numeric?
163
+ all? { |e| e.is_a?(Numeric) } # rubocop:disable Performance/RedundantEqualityComparisonBlock
164
+ end
165
+
166
+ def boolean?
162
167
  all? { |e| e.is_a?(TrueClass) || e.is_a?(FalseClass) || e.is_a?(NilClass) }
163
168
  end
164
169
 
165
- def symbols?
170
+ def symbol?
166
171
  all? { |e| e.is_a?(Symbol) } # rubocop:disable Performance/RedundantEqualityComparisonBlock
167
172
  end
168
173
 
169
- def strings?
174
+ def string?
170
175
  all? { |e| e.is_a?(String) } # rubocop:disable Performance/RedundantEqualityComparisonBlock
171
176
  end
172
177
 
173
- def symbols_or_strings?
178
+ def symbol_or_string?
174
179
  all? { |e| e.is_a?(Symbol) || e.is_a?(String) }
175
180
  end
176
181
 
@@ -196,4 +201,6 @@ module RedAmber
196
201
  end
197
202
  end
198
203
  end
204
+
205
+ private_constant :RefineArray, :RefineArrayLike, :RefineArrowTable, :RefineHash
199
206
  end