red_amber 0.1.3 → 0.1.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,72 @@
1
+ # frozen_string_literal: true
2
+
3
+ module RedAmber
4
+ # mix-ins for the class DataFrame
5
+ module DataFrameObservationOperation
6
+ # slice and select some observations to create sub DataFrame
7
+ def slice(*args, &block)
8
+ slicer = args
9
+ if block
10
+ raise DataFrameArgumentError, 'Must not specify both arguments and block.' unless args.empty?
11
+
12
+ slicer = instance_eval(&block)
13
+ end
14
+ slicer = [slicer].flatten
15
+ return remove_all_values if slicer.empty? || slicer[0].nil?
16
+
17
+ # filter by booleans of the same length as the DataFrame
18
+ booleans = nil
19
+ if slicer[0].is_a?(Vector) || slicer[0].is_a?(Arrow::BooleanArray)
20
+ booleans = slicer[0].to_a
21
+ elsif slicer.size == size && booleans?(slicer)
22
+ booleans = slicer
23
+ end
24
+ return select_obs_by_boolean(booleans) if booleans
25
+
26
+ # filter with indexes
27
+ slicer = expand_range(slicer)
28
+ return select_obs_by_indeces(slicer) if integers?(slicer)
29
+
30
+ raise DataFrameArgumentError, "Invalid argument #{args}"
31
+ end
32
+
33
+ # remove selected observations to create sub DataFrame
34
+ def remove(*args, &block)
35
+ remover = args
36
+ if block
37
+ raise DataFrameArgumentError, 'Must not specify both arguments and block.' unless args.empty?
38
+
39
+ remover = instance_eval(&block)
40
+ end
41
+ remover = [remover].flatten
42
+
43
+ return self if remover.empty?
44
+
45
+ # filter by booleans of the same length as the DataFrame
46
+ booleans = nil
47
+ if remover[0].is_a?(Vector) || remover[0].is_a?(Arrow::BooleanArray)
48
+ booleans = remover[0].to_a
49
+ elsif remover.size == size && booleans?(remover)
50
+ booleans = remover
51
+ end
52
+ if booleans
53
+ inverted = booleans.map(&:!)
54
+ return select_obs_by_boolean(inverted)
55
+ end
56
+
57
+ # filter with indexes
58
+ slicer = indexes.to_a - expand_range(remover)
59
+ return remove_all_values if slicer.empty?
60
+ return select_obs_by_indeces(slicer) if integers?(slicer)
61
+
62
+ raise DataFrameArgumentError, "Invalid argument #{args}"
63
+ end
64
+
65
+ private
66
+
67
+ # return a DataFrame with the same keys as self but without values
68
+ def remove_all_values
69
+ DataFrame.new(keys.each_with_object({}) { |key, h| h[key] = [] })
70
+ end
71
+ end
72
+ end
@@ -9,14 +9,26 @@ module RedAmber
9
9
  raise DataFrameArgumentError, 'Empty dataframe' if empty?
10
10
  raise DataFrameArgumentError, 'Empty argument' if args.empty?
11
11
 
12
- # expand Range like [1..3, 4] to [1, 2, 3, 4]
13
- expanded =
14
- args.each_with_object([]) do |e, a|
15
- e.is_a?(Range) ? a.concat(normalized_array(e)) : a.append(e)
12
+ if args.one?
13
+ case args[0]
14
+ when Vector
15
+ return select_obs_by_boolean(Arrow::BooleanArray.new(args[0].data))
16
+ when Arrow::BooleanArray
17
+ return select_obs_by_boolean(args[0])
18
+ when Array
19
+ return select_obs_by_boolean(Arrow::BooleanArray.new(args[0]))
20
+
21
+ # when Hash
22
+ # specify conditions to select by a Hash
16
23
  end
24
+ end
17
25
 
18
- return select_rows(expanded) if integers?(expanded)
19
- return select_columns(expanded.map(&:to_sym)) if sym_or_str?(expanded)
26
+ return select_obs_by_boolean(args) if booleans?(args)
27
+
28
+ # expand Range like [1..3, 4] to [1, 2, 3, 4]
29
+ expanded = expand_range(args)
30
+ return select_obs_by_indeces(expanded) if integers?(expanded)
31
+ return select_vars_by_keys(expanded.map(&:to_sym)) if sym_or_str?(expanded)
20
32
 
21
33
  raise DataFrameArgumentError, "Invalid argument #{args}"
22
34
  end
@@ -41,51 +53,17 @@ module RedAmber
41
53
  tail(n_rows)
42
54
  end
43
55
 
44
- private # =====
56
+ private
45
57
 
46
- def select_columns(keys)
58
+ def select_vars_by_keys(keys)
47
59
  if keys.one?
48
60
  t = @table[*keys]
49
- raise DataFrameArgumentError, "Key is not exists #{keys}" unless t
61
+ raise DataFrameArgumentError, "Key does not exist #{keys}" unless t
50
62
 
51
63
  Vector.new(t.data)
52
64
  else
53
65
  DataFrame.new(@table[keys])
54
66
  end
55
67
  end
56
-
57
- def select_rows(indeces)
58
- out_of_range?(indeces) && raise(DataFrameArgumentError, "Invalid index: #{indeces} for 0..#{size - 1}")
59
-
60
- a = indeces.map { |i| @table.slice(i).to_a }
61
- DataFrame.new(@table.schema, a)
62
- end
63
-
64
- def normalized_array(range)
65
- both_end = [range.begin, range.end]
66
- both_end[1] -= 1 if range.exclude_end? && range.end.is_a?(Integer)
67
-
68
- if both_end.any?(Integer) || both_end.all?(&:nil?)
69
- if both_end.any? { |e| e&.>=(size) || e&.<(-size) }
70
- raise DataFrameArgumentError, "Index out of range: #{range} for 0..#{size - 1}"
71
- end
72
-
73
- (0...size).to_a[range]
74
- else
75
- range.to_a
76
- end
77
- end
78
-
79
- def out_of_range?(indeces)
80
- indeces.max >= size || indeces.min < -size
81
- end
82
-
83
- def integers?(enum)
84
- enum.all?(Integer)
85
- end
86
-
87
- def sym_or_str?(enum)
88
- enum.all? { |e| e.is_a?(Symbol) || e.is_a?(String) }
89
- end
90
68
  end
91
69
  end
@@ -0,0 +1,133 @@
1
+ # frozen_string_literal: true
2
+
3
+ module RedAmber
4
+ # mix-ins for the class DataFrame
5
+ module DataFrameVariableOperation
6
+ # pick up some variables to create sub DataFrame
7
+ def pick(*args, &block)
8
+ picker = args
9
+ if block
10
+ raise DataFrameArgumentError, 'Must not specify both arguments and block.' unless args.empty?
11
+
12
+ picker = instance_eval(&block)
13
+ end
14
+ picker = [picker].flatten
15
+ return DataFrame.new if picker.empty? || picker == [nil]
16
+
17
+ picker = keys_by_booleans(picker) if booleans?(picker)
18
+
19
+ # DataFrame#[] creates a Vector when a single key is specified.
20
+ # DataFrame#pick creates a DataFrame even when a single key is specified.
21
+ return DataFrame.new(@table[picker]) if sym_or_str?(picker)
22
+
23
+ raise DataFrameArgumentError, "Invalid argument #{args}"
24
+ end
25
+
26
+ # drop some variables to create a sub DataFrame of the remaining variables
27
+ def drop(*args, &block)
28
+ dropper = args
29
+ if block
30
+ raise DataFrameArgumentError, 'Must not specify both arguments and block.' unless args.empty?
31
+
32
+ dropper = instance_eval(&block)
33
+ end
34
+ dropper = [dropper].flatten
35
+ dropper = keys_by_booleans(dropper) if booleans?(dropper)
36
+
37
+ picker = keys - dropper
38
+ return DataFrame.new if picker.empty?
39
+
40
+ # DataFrame#[] creates a Vector when a single key is specified.
41
+ # DataFrame#drop creates a DataFrame even when a single key remains.
42
+ return DataFrame.new(@table[picker]) if sym_or_str?(picker)
43
+
44
+ raise DataFrameArgumentError, "Invalid argument #{args}"
45
+ end
46
+
47
+ # rename variables to create new DataFrame
48
+ def rename(*args, &block)
49
+ renamer = args
50
+ if block
51
+ raise DataFrameArgumentError, 'Must not specify both arguments and a block' unless args.empty?
52
+
53
+ renamer = instance_eval(&block)
54
+ end
55
+ renamer = [renamer].flatten
56
+ return self if renamer.empty?
57
+
58
+ return rename_by_hash([renamer].to_h) if renamer.size == 2 && sym_or_str?(renamer) # rename(from, to)
59
+ return rename_by_hash(renamer[0]) if renamer.one? && renamer[0].is_a?(Hash) # rename({from => to})
60
+
61
+ raise DataFrameArgumentError, "Invalid argument #{args}"
62
+ end
63
+
64
+ # assign variables to create new DataFrame
65
+ def assign(*args, &block)
66
+ assigner = args
67
+ if block
68
+ raise DataFrameArgumentError, 'Must not specify both arguments and a block' unless args.empty?
69
+
70
+ assigner = instance_eval(&block)
71
+ end
72
+ assigner = [assigner].flatten
73
+ return self if assigner.empty? || assigner == [nil]
74
+
75
+ raise DataFrameArgumentError, "Invalid argument #{args}" unless assigner.one? && assigner[0].is_a?(Hash)
76
+
77
+ updater = {}
78
+ appender = {}
79
+ assigner[0].each do |key, value|
80
+ if keys.include? key
81
+ updater[key] = value
82
+ else
83
+ appender[key] = value
84
+ end
85
+ end
86
+ fields, arrays = update_fields_and_arrays(updater)
87
+ append_to_fields_and_arrays(appender, fields, arrays) unless appender.empty?
88
+
89
+ DataFrame.new(Arrow::Table.new(Arrow::Schema.new(fields), arrays))
90
+ end
91
+
92
+ private
93
+
94
+ def rename_by_hash(key_pairs)
95
+ fields = keys.map do |key|
96
+ new_key = key_pairs[key]
97
+ if new_key
98
+ Arrow::Field.new(new_key.to_sym, @table[key].data_type)
99
+ else
100
+ @table.schema[key]
101
+ end
102
+ end
103
+ schema = Arrow::Schema.new(fields)
104
+ DataFrame.new(Arrow::Table.new(schema, @table.columns))
105
+ end
106
+
107
+ def update_fields_and_arrays(updater)
108
+ fields = @table.columns.map(&:field)
109
+ arrays = @table.columns.map(&:data) # chunked_arrays
110
+ keys.each_with_index do |key, i|
111
+ data = updater[key]
112
+ next unless data
113
+
114
+ raise DataFrameArgumentError, "Data size mismatch (#{data.size} != #{size})" if data.size != size
115
+
116
+ a = Arrow::Array.new(data.is_a?(Vector) ? data.to_a : data)
117
+ fields[i] = Arrow::Field.new(key, a.value_data_type)
118
+ arrays[i] = Arrow::ChunkedArray.new([a])
119
+ end
120
+ [fields, arrays]
121
+ end
122
+
123
+ def append_to_fields_and_arrays(appender, fields, arrays)
124
+ appender.each do |key, data|
125
+ raise DataFrameArgumentError, "Data size mismatch (#{data.size} != #{size})" if data.size != size
126
+
127
+ a = Arrow::Array.new(data.is_a?(Vector) ? data.to_a : data)
128
+ fields << Arrow::Field.new(key.to_sym, a.value_data_type)
129
+ arrays << Arrow::ChunkedArray.new([a])
130
+ end
131
+ end
132
+ end
133
+ end
@@ -12,21 +12,30 @@ module RedAmber
12
12
  module VectorFunctions
13
13
  # [Unary aggregations]: vector.func => scalar
14
14
  unary_aggregations =
15
- %i[all any approximate_median count count_distinct max mean min product stddev sum variance]
15
+ %i[all any approximate_median count count_distinct max mean min min_max product stddev sum variance]
16
16
  unary_aggregations.each do |function|
17
17
  define_method(function) do |opts: nil|
18
- output = exec_func_unary(function, options: opts)
19
- take_out_scalar(output)
18
+ datum = exec_func_unary(function, options: opts)
19
+ take_out_scalar(datum)
20
20
  end
21
21
  end
22
22
  alias_method :median, :approximate_median
23
23
  alias_method :count_uniq, :count_distinct
24
24
 
25
+ def unbiased_variance
26
+ variance(opts: { ddof: 1 })
27
+ end
28
+ alias_method :var, :unbiased_variance
29
+
30
+ def sd
31
+ stddev(opts: { ddof: 1 })
32
+ end
33
+ alias_method :std, :sd
34
+
25
35
  # option(s) required
26
36
  # - index
27
37
 
28
38
  # Returns other than value
29
- # - min_max
30
39
  # - mode
31
40
  # - quantile
32
41
  # - tdigest
@@ -36,8 +45,8 @@ module RedAmber
36
45
  %i[abs atan bit_wise_not ceil cos floor is_finite is_inf is_nan is_null is_valid sign sin tan trunc]
37
46
  unary_element_wise.each do |function|
38
47
  define_method(function) do |opts: nil|
39
- output = exec_func_unary(function, options: opts)
40
- take_out_element_wise(output)
48
+ datum = exec_func_unary(function, options: opts)
49
+ take_out_element_wise(datum)
41
50
  end
42
51
  end
43
52
  alias_method :is_nil, :is_null
@@ -53,13 +62,13 @@ module RedAmber
53
62
  }
54
63
  unary_element_wise_op.each do |function, operator|
55
64
  define_method(function) do |opts: nil|
56
- output = exec_func_unary(function, options: opts)
57
- take_out_element_wise(output)
65
+ datum = exec_func_unary(function, options: opts)
66
+ take_out_element_wise(datum)
58
67
  end
59
68
 
60
69
  define_method(operator) do |opts: nil|
61
- output = exec_func_unary(function, options: opts)
62
- take_out_element_wise(output)
70
+ datum = exec_func_unary(function, options: opts)
71
+ take_out_element_wise(datum)
63
72
  end
64
73
  end
65
74
  alias_method :not, :invert
@@ -79,8 +88,8 @@ module RedAmber
79
88
  %i[atan2 and_not and_not_kleene bit_wise_and bit_wise_or bit_wise_xor]
80
89
  binary_element_wise.each do |function|
81
90
  define_method(function) do |other, opts: nil|
82
- output = exec_func_binary(function, other, options: opts)
83
- take_out_element_wise(output)
91
+ datum = exec_func_binary(function, other, options: opts)
92
+ take_out_element_wise(datum)
84
93
  end
85
94
  end
86
95
 
@@ -95,8 +104,8 @@ module RedAmber
95
104
  }
96
105
  logical_binary_element_wise.each do |method, function|
97
106
  define_method(method) do |other, opts: nil|
98
- output = exec_func_binary(function, other, options: opts)
99
- take_out_element_wise(output)
107
+ datum = exec_func_binary(function, other, options: opts)
108
+ take_out_element_wise(datum)
100
109
  end
101
110
  end
102
111
 
@@ -128,13 +137,13 @@ module RedAmber
128
137
  }
129
138
  binary_element_wise_op.each do |function, operator|
130
139
  define_method(function) do |other, opts: nil|
131
- output = exec_func_binary(function, other, options: opts)
132
- take_out_element_wise(output)
140
+ datum = exec_func_binary(function, other, options: opts)
141
+ take_out_element_wise(datum)
133
142
  end
134
143
 
135
144
  define_method(operator) do |other, opts: nil|
136
- output = exec_func_binary(function, other, options: opts)
137
- take_out_element_wise(output)
145
+ datum = exec_func_binary(function, other, options: opts)
146
+ take_out_element_wise(datum)
138
147
  end
139
148
  end
140
149
  alias_method :eq, :equal
@@ -195,29 +204,45 @@ module RedAmber
195
204
  private # =======
196
205
 
197
206
  def exec_func_unary(function, options: nil)
198
- func = Arrow::Function.find(function)
199
- func.execute([data], options)
207
+ find(function).execute([data], options)
200
208
  end
201
209
 
202
210
  def exec_func_binary(function, other, options: nil)
203
- func = Arrow::Function.find(function)
204
211
  case other
205
212
  when Vector
206
- func.execute([data, other.data], options)
207
- when Arrow::Array, Arrow::ChunkedArray, Arrow::Scalar, Array, Numeric
208
- func.execute([data, other], options)
213
+ find(function).execute([data, other.data], options)
214
+ when Arrow::Array, Arrow::ChunkedArray, Arrow::Scalar, Array, Numeric, String, TrueClass, FalseClass
215
+ find(function).execute([data, other], options)
209
216
  else
210
217
  raise ArgumentError, "Operand is not supported: #{other.class}"
211
218
  end
212
219
  end
213
220
 
214
- def take_out_scalar(output)
215
- output = output.value
216
- output.is_a?(Arrow::StringScalar) ? output.to_s : output.value
221
+ def take_out_scalar(datum)
222
+ output = datum.value
223
+ case output
224
+ when Arrow::StringScalar then output.to_s
225
+ when Arrow::StructScalar
226
+ output.value.map { |s| s.is_a?(Arrow::StringScalar) ? s.to_s : s.value }
227
+ else
228
+ output.value
229
+ end
230
+ end
231
+
232
+ def take_out_element_wise(datum)
233
+ Vector.new(datum.value)
234
+ end
235
+
236
+ module_function # ======
237
+
238
+ def find(function_name)
239
+ Arrow::Function.find(function_name)
217
240
  end
218
241
 
219
- def take_out_element_wise(output)
220
- Vector.new(output.value)
242
+ # temporary API until the RedAmber documentation is prepared.
243
+ def arrow_doc(function_name)
244
+ f = find(function_name)
245
+ "#{f}\n#{'-' * function_name.size}\n#{f.doc.description}"
221
246
  end
222
247
  end
223
248
  end
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module RedAmber
4
- VERSION = '0.1.3'
4
+ VERSION = '0.1.4'
5
5
  end
data/lib/red_amber.rb CHANGED
@@ -3,8 +3,11 @@
3
3
  require 'arrow'
4
4
  require 'rover-df'
5
5
 
6
- require_relative 'red_amber/data_frame_output'
6
+ require_relative 'red_amber/data_frame_displayable'
7
+ require_relative 'red_amber/data_frame_helper'
7
8
  require_relative 'red_amber/data_frame_selectable'
9
+ require_relative 'red_amber/data_frame_observation_operation'
10
+ require_relative 'red_amber/data_frame_variable_operation'
8
11
  require_relative 'red_amber/data_frame'
9
12
  require_relative 'red_amber/vector_functions'
10
13
  require_relative 'red_amber/vector'
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: red_amber
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.3
4
+ version: 0.1.4
5
5
  platform: ruby
6
6
  authors:
7
7
  - Hirokazu SUZUKI (heronshoes)
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2022-05-14 00:00:00.000000000 Z
11
+ date: 2022-05-29 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: red-arrow
@@ -68,10 +68,34 @@ files:
68
68
  - README.md
69
69
  - Rakefile
70
70
  - doc/CODE_OF_CONDUCT.md
71
+ - doc/DataFrame.md
72
+ - doc/Vector.md
73
+ - doc/image/TDR_operations.pdf
74
+ - doc/image/arrow_table_new.png
75
+ - doc/image/dataframe/assign.png
76
+ - doc/image/dataframe/drop.png
77
+ - doc/image/dataframe/pick.png
78
+ - doc/image/dataframe/remove.png
79
+ - doc/image/dataframe/rename.png
80
+ - doc/image/dataframe/slice.png
81
+ - doc/image/dataframe_model.png
82
+ - doc/image/example_in_red_arrow.png
83
+ - doc/image/tdr.png
84
+ - doc/image/tdr_and_table.png
85
+ - doc/image/tidy_data_in_TDR.png
86
+ - doc/image/vector/binary_element_wise.png
87
+ - doc/image/vector/unary_aggregation.png
88
+ - doc/image/vector/unary_aggregation_w_option.png
89
+ - doc/image/vector/unary_element_wise.png
90
+ - doc/tdr.md
91
+ - doc/tdr_ja.md
71
92
  - lib/red_amber.rb
72
93
  - lib/red_amber/data_frame.rb
73
- - lib/red_amber/data_frame_output.rb
94
+ - lib/red_amber/data_frame_displayable.rb
95
+ - lib/red_amber/data_frame_helper.rb
96
+ - lib/red_amber/data_frame_observation_operation.rb
74
97
  - lib/red_amber/data_frame_selectable.rb
98
+ - lib/red_amber/data_frame_variable_operation.rb
75
99
  - lib/red_amber/vector.rb
76
100
  - lib/red_amber/vector_functions.rb
77
101
  - lib/red_amber/version.rb