red_amber 0.3.0 → 0.4.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (42) hide show
  1. checksums.yaml +4 -4
  2. data/.rubocop.yml +56 -22
  3. data/.yardopts +2 -0
  4. data/CHANGELOG.md +178 -0
  5. data/Gemfile +1 -1
  6. data/LICENSE +1 -1
  7. data/README.md +29 -30
  8. data/benchmark/basic.yml +7 -7
  9. data/benchmark/combine.yml +3 -3
  10. data/benchmark/dataframe.yml +15 -9
  11. data/benchmark/group.yml +6 -6
  12. data/benchmark/reshape.yml +6 -6
  13. data/benchmark/vector.yml +6 -3
  14. data/doc/DataFrame.md +32 -12
  15. data/doc/DataFrame_Comparison.md +65 -0
  16. data/doc/SubFrames.md +11 -0
  17. data/doc/Vector.md +207 -1
  18. data/doc/yard-templates/default/fulldoc/html/css/common.css +6 -0
  19. data/lib/red_amber/data_frame.rb +454 -85
  20. data/lib/red_amber/data_frame_combinable.rb +609 -115
  21. data/lib/red_amber/data_frame_displayable.rb +313 -34
  22. data/lib/red_amber/data_frame_indexable.rb +122 -19
  23. data/lib/red_amber/data_frame_loadsave.rb +78 -10
  24. data/lib/red_amber/data_frame_reshaping.rb +184 -14
  25. data/lib/red_amber/data_frame_selectable.rb +623 -70
  26. data/lib/red_amber/data_frame_variable_operation.rb +452 -35
  27. data/lib/red_amber/group.rb +186 -22
  28. data/lib/red_amber/helper.rb +74 -14
  29. data/lib/red_amber/refinements.rb +26 -6
  30. data/lib/red_amber/subframes.rb +1101 -0
  31. data/lib/red_amber/vector.rb +362 -11
  32. data/lib/red_amber/vector_aggregation.rb +312 -0
  33. data/lib/red_amber/vector_binary_element_wise.rb +506 -0
  34. data/lib/red_amber/vector_selectable.rb +265 -23
  35. data/lib/red_amber/vector_unary_element_wise.rb +529 -0
  36. data/lib/red_amber/vector_updatable.rb +278 -34
  37. data/lib/red_amber/version.rb +2 -1
  38. data/lib/red_amber.rb +13 -1
  39. data/red_amber.gemspec +2 -2
  40. metadata +13 -8
  41. data/doc/image/dataframe/reshaping_DataFrames.png +0 -0
  42. data/lib/red_amber/vector_functions.rb +0 -242
@@ -1,16 +1,71 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module RedAmber
4
- # group class
4
+ # Group class
5
5
  class Group
6
6
  include Enumerable # This feature is experimental
7
7
 
8
8
  using RefineArrowTable
9
9
 
10
+ # Source DataFrame.
11
+ #
12
+ # @return [DataFrame]
13
+ # source DataFrame.
14
+ #
15
+ attr_reader :dataframe
16
+
17
+ # Keys for grouping by value.
18
+ #
19
+ # @return [Array]
20
+ # group keys.
21
+ #
22
+ attr_reader :group_keys
23
+
24
+ class << self
25
+ private
26
+
27
+ # @!macro [attach] define_group_aggregation
28
+ # @!method $1(*summary_keys)
29
+ # Group aggregation function `$1`.
30
+ # @param summary_keys [Array<Symbol, String>]
31
+ # summary keys.
32
+ # @return [DataFrame]
33
+ # aggregated DataFrame
34
+ #
35
+ def define_group_aggregation(function)
36
+ define_method(function) do |*summary_keys|
37
+ summary_keys = Array(summary_keys).flatten
38
+ d = summary_keys - @dataframe.keys
39
+ unless summary_keys.empty? || d.empty?
40
+ raise GroupArgumentError, "#{d} is not a key of\n #{@dataframe}."
41
+ end
42
+
43
+ table = @group.aggregate(*build_aggregation_keys("hash_#{function}",
44
+ summary_keys))
45
+ DataFrame.new(table[@group_keys + (table.keys - @group_keys)])
46
+ end
47
+ end
48
+ end
49
+
10
50
  # Creates a new Group object.
11
51
  #
12
- # @param dataframe [DataFrame] dataframe to be grouped.
13
- # @param group_keys [Array<>] keys for grouping.
52
+ # @param dataframe [DataFrame]
53
+ # dataframe to be grouped.
54
+ # @param group_keys [Array<Symbol, String>]
55
+ # keys for grouping.
56
+ # @return [Group]
57
+ # Group object.
58
+ # @example
59
+ # Group.new(penguins, :species)
60
+ #
61
+ # # =>
62
+ # #<RedAmber::Group : 0x000000000000f410>
63
+ # species group_count
64
+ # <string> <uint8>
65
+ # 0 Adelie 152
66
+ # 1 Chinstrap 68
67
+ # 2 Gentoo 124
68
+ #
14
69
  def initialize(dataframe, *group_keys)
15
70
  @dataframe = dataframe
16
71
  @group_keys = group_keys.flatten
@@ -23,24 +78,7 @@ module RedAmber
23
78
  @group = @dataframe.table.group(*@group_keys)
24
79
  end
25
80
 
26
- attr_reader :dataframe, :group_keys
27
-
28
- functions = %i[count sum product mean min max stddev variance]
29
- functions.each do |function|
30
- define_method(function) do |*summary_keys|
31
- summary_keys = Array(summary_keys).flatten
32
- d = summary_keys - @dataframe.keys
33
- unless summary_keys.empty? || d.empty?
34
- raise GroupArgumentError, "#{d} is not a key of\n #{@dataframe}."
35
- end
36
-
37
- table = @group.aggregate(*build_aggregation_keys("hash_#{function}",
38
- summary_keys))
39
- g = @group_keys.map(&:to_s)
40
- DataFrame.new(table[g + (table.keys - g)])
41
- end
42
- end
43
-
81
+ define_group_aggregation(:count)
44
82
  alias_method :__count, :count
45
83
  private :__count
46
84
 
@@ -54,6 +92,26 @@ module RedAmber
54
92
  end
55
93
  end
56
94
 
95
+ define_group_aggregation(:sum)
96
+
97
+ define_group_aggregation(:product)
98
+
99
+ define_group_aggregation(:mean)
100
+
101
+ define_group_aggregation(:min)
102
+
103
+ define_group_aggregation(:max)
104
+
105
+ define_group_aggregation(:stddev)
106
+
107
+ define_group_aggregation(:variance)
108
+
109
+ # Returns an Array of boolean filters to select each record in the Group.
110
+ #
111
+ # @api private
112
+ # @return [Array]
113
+ # an Array of boolean filter Vectors.
114
+ #
57
115
  def filters
58
116
  @filters ||= begin
59
117
  first, *others = @group_keys.map do |key|
@@ -69,6 +127,25 @@ module RedAmber
69
127
  end
70
128
  end
71
129
 
130
+ # Iterates over each record group as a DataFrame or returns an Enumerator.
131
+ #
132
+ # @api private
133
+ # @overload each
134
+ # Returns a new Enumerator if no block is given.
135
+ #
136
+ # @return [Enumerator]
137
+ # Enumerator of each group as a DataFrame.
138
+ #
139
+ # @overload each
140
+ # When a block is given, passes each record group as a DataFrame to the block.
141
+ #
142
+ # @yieldparam df [DataFrame]
143
+ # passes each record group as a DataFrame by a block parameter.
144
+ # @yieldreturn [Object]
145
+ # evaluated result value from the block.
146
+ # @return [Integer]
147
+ # group size.
148
+ #
72
149
  def each
73
150
  filters
74
151
  return enum_for(:each) unless block_given?
@@ -79,14 +156,98 @@ module RedAmber
79
156
  @filters.size
80
157
  end
81
158
 
159
+ # Returns each record group size as a DataFrame.
160
+ #
161
+ # @return [DataFrame]
162
+ # DataFrame consists of:
163
+ # - Group key columns.
164
+ # - Result columns by group aggregation.
165
+ # @example
166
+ # penguins.group(:species).group_count
167
+ #
168
+ # # =>
169
+ # #<RedAmber::DataFrame : 3 x 2 Vectors, 0x0000000000003a70>
170
+ # species group_count
171
+ # <string> <uint8>
172
+ # 0 Adelie 152
173
+ # 1 Chinstrap 68
174
+ # 2 Gentoo 124
175
+ #
82
176
  def group_count
83
177
  DataFrame.create(add_columns_to_table(base_table, [:group_count], [group_counts]))
84
178
  end
85
179
 
180
+ # String representation of self.
181
+ #
182
+ # @return [String]
183
+ # show information of self as a String.
184
+ # @example
185
+ # puts penguins.group(:species).inspect
186
+ #
187
+ # # =>
188
+ # #<RedAmber::Group : 0x0000000000003a98>
189
+ # species group_count
190
+ # <string> <uint8>
191
+ # 0 Adelie 152
192
+ # 1 Chinstrap 68
193
+ # 2 Gentoo 124
194
+ #
86
195
  def inspect
87
196
  "#<#{self.class} : #{format('0x%016x', object_id)}>\n#{group_count}"
88
197
  end
89
198
 
199
+ # Summarize Group by aggregation functions from the block.
200
+ #
201
+ # @yieldparam group [Group]
202
+ # passes group object self.
203
+ # @yieldreturn [DataFrame, Array<DataFrame>]
204
+ # an aggregated DataFrame or an array of aggregated DataFrames.
205
+ # @return [DataFrame]
206
+ # summarized DataFrame.
207
+ # @example Single function and single variable
208
+ # group = penguins.group(:species)
209
+ # group
210
+ #
211
+ # # =>
212
+ # #<RedAmber::Group : 0x000000000000c314>
213
+ # species group_count
214
+ # <string> <uint8>
215
+ # 0 Adelie 152
216
+ # 1 Chinstrap 68
217
+ # 2 Gentoo 124
218
+ #
219
+ # group.summarize { mean(:bill_length_mm) }
220
+ #
221
+ # # =>
222
+ # #<RedAmber::DataFrame : 3 x 2 Vectors, 0x000000000000c364>
223
+ # species mean(bill_length_mm)
224
+ # <string> <double>
225
+ # 0 Adelie 38.79
226
+ # 1 Chinstrap 48.83
227
+ # 2 Gentoo 47.5
228
+ #
229
+ # @example Single function only
230
+ # group.summarize { mean }
231
+ #
232
+ # # =>
233
+ # #<RedAmber::DataFrame : 3 x 6 Vectors, 0x000000000000c350>
234
+ # species mean(bill_length_mm) mean(bill_depth_mm) ... mean(year)
235
+ # <string> <double> <double> ... <double>
236
+ # 0 Adelie 38.79 18.35 ... 2008.01
237
+ # 1 Chinstrap 48.83 18.42 ... 2007.97
238
+ # 2 Gentoo 47.5 14.98 ... 2008.08
239
+ #
240
+ # @example Multiple functions
241
+ # group.summarize { [min(:bill_length_mm), max(:bill_length_mm)] }
242
+ #
243
+ # # =>
244
+ # #<RedAmber::DataFrame : 3 x 3 Vectors, 0x000000000000c378>
245
+ # species min(bill_length_mm) max(bill_length_mm)
246
+ # <string> <double> <double>
247
+ # 0 Adelie 32.1 46.0
248
+ # 1 Chinstrap 40.9 58.0
249
+ # 2 Gentoo 40.9 59.6
250
+ #
90
251
  def summarize(&block)
91
252
  agg = instance_eval(&block)
92
253
  case agg
@@ -99,7 +260,10 @@ module RedAmber
99
260
  end
100
261
  end
101
262
 
102
- # experimental
263
+ # Aggregating summary.
264
+ #
265
+ # @api private
266
+ #
103
267
  def agg_sum(*summary_keys)
104
268
  call_aggregating_function(:sum, summary_keys, _options = nil)
105
269
  end
@@ -1,41 +1,46 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module RedAmber
4
- # mix-in for the class DataFrame
4
+ # Mix-in for the class DataFrame
5
5
  module Helper
6
6
  private
7
7
 
8
8
  # If num is larger than 1 return 's' to be plural.
9
9
  #
10
- # @param num [Numeric] some number.
11
- # @return ['s', ''] return 's' if num is larger than 1.
10
+ # @param num [Numeric]
11
+ # some number.
12
+ # @return ['s', '']
13
+ # return 's' if num is larger than 1.
12
14
  # Otherwise return ''.
15
+ #
13
16
  def pl(num)
14
17
  num > 1 ? 's' : ''
15
18
  end
16
19
 
17
- # Parse the argments in an Array
18
- # and returns a parsed Array.
20
+ # Parses the arguments in an Array and returns a parsed Array.
19
21
  #
20
22
  # @param args
21
23
  # [<Integer, Symbol, true, false, nil, Array, Range, Enumerator, String, Float>]
22
24
  # arguments.
23
- # @param array_size [Integer] size of target Array to use in a endless Range.
24
- # @return [<Integer, Symbol, true, false, nil>] parsed flat Array.
25
+ # @param array_size [Integer]
26
+ # size of target Array to use in an endless Range.
27
+ # @return [<Integer, Symbol, true, false, nil>]
28
+ # parsed flat Array.
25
29
  # @note This method is recursively called to parse.
26
- def parse_args(args, array_size)
30
+ #
31
+ def parse_args(args, array_size, symbolize: true)
27
32
  args.flat_map do |elem|
28
33
  case elem
29
34
  when Integer, Symbol, NilClass, TrueClass, FalseClass
30
35
  elem
31
36
  when Array
32
- parse_args(elem, array_size)
37
+ parse_args(elem, array_size, symbolize: symbolize)
33
38
  when Range
34
39
  parse_range(elem, array_size)
35
40
  when Enumerator
36
- parse_args(Array(elem), array_size)
41
+ parse_args(Array(elem), array_size, symbolize: symbolize)
37
42
  when String
38
- elem.to_sym
43
+ symbolize ? elem.to_sym : elem
39
44
  when Float
40
45
  elem.floor.to_i
41
46
  else
@@ -46,9 +51,13 @@ module RedAmber
46
51
 
47
52
  # Parse a Range to an Array
48
53
  #
49
- # @param range [Range] Range to parse.
50
- # @param array_size [Integer] size of target Array to use in a endless Range.
51
- # @return [Array<Integer, Symbol, String>] parsed Array.
54
+ # @param range [Range]
55
+ # range to parse.
56
+ # @param array_size [Integer]
57
+ # size of target Array to use in an endless Range.
58
+ # @return [Array<Integer, Symbol, String>]
59
+ # parsed Array.
60
+ #
52
61
  def parse_range(range, array_size)
53
62
  bg = range.begin
54
63
  en = range.end
@@ -70,4 +79,55 @@ module RedAmber
70
79
  end
71
80
  end
72
81
  end
82
+
83
+ # rubocop:disable Layout/LineLength
84
+
85
+ # Helper for Arrow Functions
86
+ module ArrowFunction
87
+ module_function
88
+
89
+ # Find Arrow's compute function.
90
+ #
91
+ # {https://arrow.apache.org/docs/cpp/compute.html}
92
+ # @param function_name [Symbol]
93
+ # function name.
94
+ # @return [Arrow::Function]
95
+ # arrow compute function object.
96
+ # @example
97
+ # RedAmber::ArrowFunction.find(:array_sort_indices)
98
+ #
99
+ # # =>
100
+ # #<Arrow::Function:0x7fa8838a0d80 ptr=0x7fa87e9b7320 array_sort_indices(array, {order=Ascending, null_placement=AtEnd}): Return the indices that would sort an array>
101
+ #
102
+ def find(function_name)
103
+ Arrow::Function.find(function_name)
104
+ end
105
+
106
+ # Show document of Arrow's compute function.
107
+ #
108
+ # @param function_name [Symbol]
109
+ # function name.
110
+ # @return [String]
111
+ # document of compute function object.
112
+ # @example
113
+ # puts RedAmber::ArrowFunction.arrow_doc(:array_sort_indices)
114
+ #
115
+ # # =>
116
+ # array_sort_indices(array, {order=Ascending, null_placement=AtEnd}): Return the indices that would sort an array
117
+ # ------------------
118
+ # This function computes an array of indices that define a stable sort
119
+ # of the input array. By default, Null values are considered greater
120
+ # than any other value and are therefore sorted at the end of the array.
121
+ # For floating-point types, NaNs are considered greater than any
122
+ # other non-null value, but smaller than null values.
123
+ #
124
+ # The handling of nulls and NaNs can be changed in ArraySortOptions.
125
+ #
126
+ def arrow_doc(function_name)
127
+ f = find(function_name)
128
+ "#{f}\n#{'-' * function_name.size}\n#{f.doc.description}"
129
+ end
130
+ end
131
+
132
+ # rubocop:enable Layout/LineLength
73
133
  end
@@ -1,5 +1,6 @@
1
1
  # frozen_string_literal: true
2
2
 
3
+ # Namespace of RedAmber
3
4
  module RedAmber
4
5
  # Add additional capabilities to Hash
5
6
  module RefineHash
@@ -142,7 +143,7 @@ module RedAmber
142
143
  module RefineArrowTable
143
144
  refine Arrow::Table do
144
145
  def keys
145
- columns.map(&:name)
146
+ columns.map { |column| column.name.to_sym }
146
147
  end
147
148
 
148
149
  def key?(key)
@@ -154,23 +155,27 @@ module RedAmber
154
155
  # Add additional capabilities to Array
155
156
  module RefineArray
156
157
  refine Array do
157
- def integers?
158
+ def integer?
158
159
  all? { |e| e.is_a?(Integer) } # rubocop:disable Performance/RedundantEqualityComparisonBlock
159
160
  end
160
161
 
161
- def booleans?
162
+ def numeric?
163
+ all? { |e| e.is_a?(Numeric) } # rubocop:disable Performance/RedundantEqualityComparisonBlock
164
+ end
165
+
166
+ def boolean?
162
167
  all? { |e| e.is_a?(TrueClass) || e.is_a?(FalseClass) || e.is_a?(NilClass) }
163
168
  end
164
169
 
165
- def symbols?
170
+ def symbol?
166
171
  all? { |e| e.is_a?(Symbol) } # rubocop:disable Performance/RedundantEqualityComparisonBlock
167
172
  end
168
173
 
169
- def strings?
174
+ def string?
170
175
  all? { |e| e.is_a?(String) } # rubocop:disable Performance/RedundantEqualityComparisonBlock
171
176
  end
172
177
 
173
- def symbols_or_strings?
178
+ def symbol_or_string?
174
179
  all? { |e| e.is_a?(Symbol) || e.is_a?(String) }
175
180
  end
176
181
 
@@ -196,4 +201,19 @@ module RedAmber
196
201
  end
197
202
  end
198
203
  end
204
+
205
+ # Add additional capabilities to String
206
+ module RefineString
207
+ refine String do
208
+ def width
209
+ chars
210
+ .partition(&:ascii_only?)
211
+ .map.with_index(1) { |a, i| a.size * i }
212
+ .sum
213
+ end
214
+ end
215
+ end
216
+
217
+ private_constant :RefineArray, :RefineArrayLike, :RefineArrowTable,
218
+ :RefineHash, :RefineString
199
219
  end