red_amber 0.1.2 → 0.1.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (42) hide show
  1. checksums.yaml +4 -4
  2. data/.rubocop.yml +21 -10
  3. data/CHANGELOG.md +162 -6
  4. data/Gemfile +3 -0
  5. data/README.md +89 -303
  6. data/benchmark/csv_load_penguins.yml +15 -0
  7. data/benchmark/drop_nil.yml +11 -0
  8. data/doc/DataFrame.md +840 -0
  9. data/doc/Vector.md +317 -0
  10. data/doc/image/arrow_table_new.png +0 -0
  11. data/doc/image/dataframe/assign.png +0 -0
  12. data/doc/image/dataframe/drop.png +0 -0
  13. data/doc/image/dataframe/pick.png +0 -0
  14. data/doc/image/dataframe/remove.png +0 -0
  15. data/doc/image/dataframe/rename.png +0 -0
  16. data/doc/image/dataframe/slice.png +0 -0
  17. data/doc/image/dataframe_model.png +0 -0
  18. data/doc/image/example_in_red_arrow.png +0 -0
  19. data/doc/image/tdr.png +0 -0
  20. data/doc/image/tdr_and_table.png +0 -0
  21. data/doc/image/tidy_data_in_TDR.png +0 -0
  22. data/doc/image/vector/binary_element_wise.png +0 -0
  23. data/doc/image/vector/unary_aggregation.png +0 -0
  24. data/doc/image/vector/unary_aggregation_w_option.png +0 -0
  25. data/doc/image/vector/unary_element_wise.png +0 -0
  26. data/doc/tdr.md +56 -0
  27. data/doc/tdr_ja.md +56 -0
  28. data/lib/red_amber/data_frame.rb +68 -35
  29. data/lib/red_amber/data_frame_displayable.rb +132 -0
  30. data/lib/red_amber/data_frame_helper.rb +64 -0
  31. data/lib/red_amber/data_frame_indexable.rb +38 -0
  32. data/lib/red_amber/data_frame_observation_operation.rb +83 -0
  33. data/lib/red_amber/data_frame_selectable.rb +34 -43
  34. data/lib/red_amber/data_frame_variable_operation.rb +133 -0
  35. data/lib/red_amber/vector.rb +58 -6
  36. data/lib/red_amber/vector_compensable.rb +68 -0
  37. data/lib/red_amber/vector_functions.rb +147 -68
  38. data/lib/red_amber/version.rb +1 -1
  39. data/lib/red_amber.rb +9 -1
  40. data/red_amber.gemspec +3 -6
  41. metadata +36 -9
  42. data/lib/red_amber/data_frame_output.rb +0 -116
@@ -0,0 +1,132 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'stringio'
4
+
5
module RedAmber
  # Mix-in for the class DataFrame that builds its text representations:
  # #to_s, #inspect and the per-Vector summary (#tdr / #tdr_str).
  #
  # The including object must provide @table, size, n_keys, empty?, keys,
  # types and vectors.
  module DataFrameDisplayable
    # @return [String] the string form of the underlying table
    def to_s
      @table.to_s
    end

    # def describe() end

    # def summary() end

    # @return [String] class name, shape and object id, followed by the
    #   summary of at most 3 Vectors
    def inspect
      "#<#{shape_str(with_id: true)}>\n#{dataframe_info(3)}"
    end

    # Print the summary built by #tdr_str to standard output.
    #
    # @param limit [Integer, Symbol] max num of Vectors to show (:all or -1 shows every Vector)
    # @param tally [Integer] max level to use tally mode
    # @param elements [Integer] max element to show values in each vector
    # @return [nil]
    def tdr(limit = 10, tally: 5, elements: 5)
      puts tdr_str(limit, tally: tally, elements: elements)
    end

    # Same as #tdr but returns the text instead of printing it.
    #
    # @return [String]
    def tdr_str(limit = 10, tally: 5, elements: 5)
      "#{shape_str}\n#{dataframe_info(limit, tally_level: tally, max_element: elements)}"
    end

    private # =====

    # Plural suffix: 's' when num is greater than 1, otherwise ''.
    def pl(num)
      num > 1 ? 's' : ''
    end

    # One-line description of the shape, optionally followed by the object id.
    def shape_str(with_id: false)
      shape_info = empty? ? '(empty)' : "#{size} x #{n_keys} Vector#{pl(n_keys)}"
      id = with_id ? format(', 0x%016x', object_id) : ''
      "#{self.class} : #{shape_info}#{id}"
    end

    # Build the summary table: a type-count line, a header line and one row
    # per Vector (index, key, type, number of levels, data preview).
    # Rows beyond +limit+ are replaced by a single "... n more ..." line.
    def dataframe_info(limit, tally_level: 5, max_element: 5)
      return '' if empty?

      limit = n_keys if [:all, -1].include? limit

      tallys = vectors.map(&:tally)
      type_groups = @table.columns.map { |column| type_group(column.data_type) }
      quoted_keys = keys.map(&:inspect)
      headers = { idx: '#', key: 'key', type: 'type', levels: 'level', data: 'data_preview' }
      header_format = make_header_format(tallys.map(&:size), headers, quoted_keys)

      sio = StringIO.new # output string buffer
      sio.puts "Vector#{pl(n_keys)} : #{var_type_count(type_groups).join(', ')}"
      sio.printf header_format, *headers.values

      vectors.each_with_index do |vector, i|
        if i >= limit
          rest = n_keys - i
          sio << " ... #{rest} more Vector#{pl(rest)} ...\n"
          break
        end
        preview = data_preview(vector, type_groups[i], tallys[i], tally_level, max_element)
        sio.printf header_format, i + 1, quoted_keys[i], types[i], tallys[i].size, preview
      end
      sio.string
    end

    # Text for the data_preview column of one Vector.
    # Numeric, string and boolean Vectors with at most +tally_level+ distinct
    # values (and not all values distinct) show their tally; every other
    # case shows the leading elements, followed by NaN/nil counts for the
    # three countable groups.
    def data_preview(vector, group, data_tally, tally_level, max_element)
      return shorthand(vector, size, max_element) unless %i[numeric string boolean].include?(group)
      return data_tally.to_s if data_tally.size <= tally_level && data_tally.size != size

      [shorthand(vector, size, max_element), *na_string(vector)].join(', ')
    end

    # printf format whose column widths fit the longest index, key, type and level.
    def make_header_format(levels, headers, quoted_keys)
      w_idx = n_keys.to_s.size
      w_key = [quoted_keys.map(&:size).max, headers[:key].size].max
      w_type = [types.map(&:size).max, headers[:type].size].max
      w_row = [levels.map { |l| l.to_s.size }.max, headers[:levels].size].max
      "%-#{w_idx}s %-#{w_key}s %-#{w_type}s %#{w_row}s %s\n"
    end

    # Classify an Arrow data type into :numeric, :string, :boolean, :temporal or :other.
    def type_group(data_type)
      case data_type
      when Arrow::NumericDataType then :numeric
      when Arrow::StringDataType then :string
      when Arrow::BooleanDataType then :boolean
      when Arrow::TemporalDataType then :temporal
      else
        :other
      end
    end

    # Phrases such as "2 numeric" for every type group that is present.
    # The :other group is reported as well, so Vectors of any other type
    # are not missing from the summary line.
    def var_type_count(type_groups)
      counts = type_groups.tally
      phrases = []
      phrases << "#{counts[:numeric]} numeric" if counts[:numeric]
      phrases << "#{counts[:string]} string#{pl(counts[:string])}" if counts[:string]
      phrases << "#{counts[:boolean]} boolean" if counts[:boolean]
      phrases << "#{counts[:temporal]} temporal" if counts[:temporal]
      phrases << "#{counts[:other]} other" if counts[:other]
      phrases
    end

    # Bracketed preview of the first elements of +vector+ (2 for temporal
    # Vectors, +max_element+ otherwise) with a trailing '... ' when truncated.
    def shorthand(vector, size, max_element)
      max = vector.temporal? ? 2 : max_element
      shown = vector.to_a.take(max).map(&:inspect) # nil.inspect is already 'nil'
      shown << '... ' if size > max
      "[#{shown.join(', ')}]"
    end

    # Phrases such as "1 NaN" / "2 nils" for the non-zero counts; [] when both are zero.
    def na_string(vector)
      n_nan = vector.n_nans
      n_nil = vector.n_nils
      phrases = []
      phrases << "#{n_nan} NaN#{pl(n_nan)}" unless n_nan.zero?
      phrases << "#{n_nil} nil#{pl(n_nil)}" unless n_nil.zero?
      phrases
    end
  end
end
@@ -0,0 +1,64 @@
1
+ # frozen_string_literal: true
2
+
3
module RedAmber
  # Mix-in for the class DataFrame: private helpers shared by the methods
  # that select observations and variables.
  #
  # The including object must provide size, keys and @table.
  module DataFrameHelper
    private

    # Replace every Range in +args+ by its elements (see #normalized_array)
    # and turn negative Integers into offsets from the end (e + size).
    # Any other element is passed through unchanged.
    #
    # @param args [Array]
    # @return [Array]
    def expand_range(args)
      expanded = args.flat_map { |e| e.is_a?(Range) ? normalized_array(e) : [e] }
      expanded.map { |e| e.is_a?(Integer) && e.negative? ? e + size : e }
    end

    # Convert a Range into an Array.
    # A Range with an Integer end point (or with both ends nil) is read as a
    # range of indexes into 0...size; any other Range is expanded by Range#to_a.
    #
    # @raise [DataFrameArgumentError] when an end point lies outside -size...size
    def normalized_array(range)
      first = range.begin
      last = range.end
      # compare the last index that is really included
      last -= 1 if range.exclude_end? && last.is_a?(Integer)
      bounds = [first, last]
      return range.to_a unless bounds.any?(Integer) || bounds.all?(&:nil?)

      if bounds.any? { |e| e && (e >= size || e < -size) }
        raise DataFrameArgumentError, "Index out of range: #{range} for 0..#{size - 1}"
      end

      (0...size).to_a[range]
    end

    # @param indices [Array<Integer>]
    # @return [Boolean] true when any index lies outside -size...size;
    #   false for an empty list (nothing can be out of range)
    def out_of_range?(indices)
      return false if indices.empty?

      lowest, highest = indices.minmax
      highest >= size || lowest < -size
    end

    # @return [Boolean] true when every element is an Integer
    def integers?(enum)
      enum.all?(Integer)
    end

    # @return [Boolean] true when every element is a Symbol or a String
    def sym_or_str?(enum)
      enum.all? do |e|
        case e
        when Symbol, String then true
        else false
        end
      end
    end

    # @return [Boolean] true when every element is true, false or nil
    def booleans?(enum)
      enum.all? { |e| [TrueClass, FalseClass, NilClass].any? { |klass| e.is_a?(klass) } }
    end

    # Build a DataFrame with the single variable +key+ from the data of +vector+.
    def create_dataframe_from_vector(key, vector)
      DataFrame.new(key => vector.data)
    end

    # Build a DataFrame from the result of filtering @table with +array+.
    def select_obs_by_boolean(array)
      DataFrame.new(@table.filter(array))
    end

    # Keys whose position holds a truthy value in +booleans+.
    def keys_by_booleans(booleans)
      keys.select.with_index { |_, i| booleans[i] }
    end
  end
end
@@ -0,0 +1,38 @@
1
+ # frozen_string_literal: true
2
+
3
module RedAmber
  # Mix-in for the class DataFrame: build DataFrames from lists of
  # observation indices, including sorted ones.
  module DataFrameIndexable
    # Common method.
    # Without arguments the receiver itself is returned. When the first
    # argument is a Vector its data is used as the index list and any
    # further arguments are not looked at.
    #
    # @return [RedAmber::DataFrame]
    def map_indices(*indices)
      return self if indices.empty?

      head = indices.first
      new_dataframe_by(head.is_a?(Vector) ? head.data : indices)
    end

    # @param sort_keys [Arrow::SortKey]
    #   :key, "key" or "+key" denotes ascending,
    #   "-key" denotes descending order
    # @return [RedAmber::Vector] Sorted indices in Vector
    def sort_indices(*sort_keys)
      Vector.new(@table.sort_indices(sort_keys.flatten))
    end

    # @param sort_keys [Arrow::SortKey] same notation as #sort_indices
    # @return [RedAmber::DataFrame] Sorted DataFrame
    def sort(*sort_keys)
      new_dataframe_by(@table.sort_indices(sort_keys.flatten))
    end

    private

    # Run the Arrow compute function :take on @table with +index_array+
    # and wrap the result in a new DataFrame.
    def new_dataframe_by(index_array)
      take = Arrow::Function.find(:take)
      RedAmber::DataFrame.new(take.execute([@table, index_array]).value)
    end
  end
end
@@ -0,0 +1,83 @@
1
+ # frozen_string_literal: true
2
+
3
module RedAmber
  # Mix-in for the class DataFrame: operations on observations.
  #
  # The including object must provide size, keys, indexes, table,
  # map_indices and the private helpers expand_range, integers?, booleans?
  # and select_obs_by_boolean.
  module DataFrameObservationOperation
    # slice and select some observations to create sub DataFrame
    #
    # Accepts indexes and Ranges, a filter (a Vector, an Arrow::BooleanArray
    # or as many true/false/nil values as there are observations), or a block
    # evaluated in the context of the receiver that returns any of these.
    #
    # @return [DataFrame] an empty selection gives a DataFrame with the same
    #   keys and no values
    # @raise [DataFrameArgumentError] for both arguments and a block, or for
    #   arguments that are neither a filter nor indexes
    def slice(*args, &block)
      slicer = resolve_observation_selector(args, block)
      return remove_all_values if slicer.empty? || slicer.first.nil?

      # filter with same length
      booleans = observation_booleans(slicer)
      return select_obs_by_boolean(booleans) if booleans

      # filter with indexes
      indices = expand_range(slicer)
      raise DataFrameArgumentError, "Invalid argument #{args}" unless integers?(indices)
      # An empty Range expands to no index at all. It must be answered here,
      # because map_indices without arguments returns the receiver itself.
      return remove_all_values if indices.empty?

      map_indices(*indices)
    end

    # remove selected observations to create sub DataFrame
    #
    # Accepts the same kinds of arguments as #slice. Observations whose
    # filter value is true are removed; false and nil keep the observation.
    # nil entries among indexes are ignored.
    #
    # @return [DataFrame] self when nothing is specified
    # @raise [DataFrameArgumentError] for both arguments and a block, or for
    #   arguments that are neither a filter nor indexes
    def remove(*args, &block)
      remover = resolve_observation_selector(args, block)
      return self if remover.empty?

      # filter with same length
      booleans = observation_booleans(remover)
      return select_obs_by_boolean(booleans.map(&:!)) if booleans

      # filter with indexes
      removal = expand_range(remover)
      # Validate before subtracting: the difference below contains only
      # Integers whatever was passed, so invalid arguments would go unnoticed.
      raise DataFrameArgumentError, "Invalid argument #{args}" unless integers?(removal.compact)

      remaining = indexes.to_a - removal
      return remove_all_values if remaining.empty?

      map_indices(*remaining)
    end

    # Apply the Arrow compute function :drop_null to the table.
    #
    # @return [DataFrame]
    def remove_nil
      func = Arrow::Function.find(:drop_null)
      DataFrame.new(func.execute([table]).value)
    end
    alias_method :drop_nil, :remove_nil

    # Group the table by +aggregating_keys+ and call the method named +func+
    # on the group with +target_keys+.
    # Only public methods of the group can be called.
    #
    # @return [RedAmber::DataFrame]
    def group(aggregating_keys, func, target_keys)
      t = table.group(*aggregating_keys)
      RedAmber::DataFrame.new(t.public_send(func, *target_keys))
    end

    private

    # Flattened list of selectors taken from +args+ or, when a block is
    # given, from the value of the block evaluated with instance_eval.
    def resolve_observation_selector(args, block)
      return args.flatten unless block
      raise DataFrameArgumentError, 'Must not specify both arguments and block.' unless args.empty?

      [instance_eval(&block)].flatten
    end

    # The filter expressed by +selector+ as an Array, or nil when +selector+
    # is not a filter (see #slice for what counts as one).
    def observation_booleans(selector)
      head = selector.first
      return head.to_a if head.is_a?(Vector) || head.is_a?(Arrow::BooleanArray)
      return selector if selector.size == size && booleans?(selector)

      nil
    end

    # return a DataFrame with same keys as self without values
    def remove_all_values
      DataFrame.new(keys.to_h { |key| [key, []] })
    end
  end
end
@@ -9,18 +9,40 @@ module RedAmber
9
9
  raise DataFrameArgumentError, 'Empty dataframe' if empty?
10
10
  raise DataFrameArgumentError, 'Empty argument' if args.empty?
11
11
 
12
- # expand Range like [1..3, 4] to [1, 2, 3, 4]
13
- expanded =
14
- args.each_with_object([]) do |e, a|
15
- e.is_a?(Range) ? a.concat(normalized_array(e)) : a.append(e)
12
+ if args.one?
13
+ case args[0]
14
+ when Vector
15
+ return select_obs_by_boolean(Arrow::BooleanArray.new(args[0].data))
16
+ when Arrow::BooleanArray
17
+ return select_obs_by_boolean(args[0])
18
+ when Array
19
+ return select_obs_by_boolean(Arrow::BooleanArray.new(args[0]))
20
+
21
+ # when Hash
22
+ # specify conditions to select by a Hash
16
23
  end
24
+ end
17
25
 
18
- return select_rows(expanded) if integers?(expanded)
19
- return select_columns(expanded.map(&:to_sym)) if sym_or_str?(expanded)
26
+ return select_obs_by_boolean(args) if booleans?(args)
27
+
28
+ # expand Range like [1..3, 4] to [1, 2, 3, 4]
29
+ expanded = expand_range(args)
30
+ return map_indices(*expanded) if integers?(expanded)
31
+ return select_vars_by_keys(expanded.map(&:to_sym)) if sym_or_str?(expanded)
20
32
 
21
33
  raise DataFrameArgumentError, "Invalid argument #{args}"
22
34
  end
23
35
 
36
+ # Select a variable by a key in String or Symbol
37
+ def v(key)
38
+ unless key.is_a?(Symbol) || key.is_a?(String)
39
+ raise DataFrameArgumentError, "Key is not a Symbol or String [#{key}]"
40
+ end
41
+ raise DataFrameArgumentError, "Key not exist [#{key}]" unless key?(key)
42
+
43
+ variables[key.to_sym]
44
+ end
45
+
24
46
  def head(n_rows = 5)
25
47
  raise DataFrameArgumentError, "Index is out of range #{n_rows}" if n_rows.negative?
26
48
 
@@ -41,48 +63,17 @@ module RedAmber
41
63
  tail(n_rows)
42
64
  end
43
65
 
44
- private # =====
66
+ private
45
67
 
46
- def select_columns(keys)
68
+ def select_vars_by_keys(keys)
47
69
  if keys.one?
48
- Vector.new(@table[*keys].data)
49
- else
50
- DataFrame.new(@table[keys])
51
- end
52
- end
53
-
54
- def select_rows(indeces)
55
- out_of_range?(indeces) && raise(DataFrameArgumentError, "Invalid index: #{indeces} for 0..#{size - 1}")
70
+ key = keys[0].to_sym
71
+ raise DataFrameArgumentError, "Key does not exist #{keys}" unless key? key
56
72
 
57
- a = indeces.map { |i| @table.slice(i).to_a }
58
- DataFrame.new(@table.schema, a)
59
- end
60
-
61
- def normalized_array(range)
62
- both_end = [range.begin, range.end]
63
- both_end[1] -= 1 if range.exclude_end? && range.end.is_a?(Integer)
64
-
65
- if both_end.any?(Integer) || both_end.all?(&:nil?)
66
- if both_end.any? { |e| e&.>=(size) || e&.<(-size) }
67
- raise DataFrameArgumentError, "Index out of range: #{range} for 0..#{size - 1}"
68
- end
69
-
70
- (0...size).to_a[range]
73
+ variables[key]
71
74
  else
72
- range.to_a
75
+ DataFrame.new(@table[keys])
73
76
  end
74
77
  end
75
-
76
- def out_of_range?(indeces)
77
- indeces.max >= size || indeces.min < -size
78
- end
79
-
80
- def integers?(enum)
81
- enum.all?(Integer)
82
- end
83
-
84
- def sym_or_str?(enum)
85
- enum.all? { |e| e.is_a?(Symbol) || e.is_a?(String) }
86
- end
87
78
  end
88
79
  end
@@ -0,0 +1,133 @@
1
+ # frozen_string_literal: true
2
+
3
module RedAmber
  # Mix-in for the class DataFrame: operations on variables.
  #
  # The including object must provide @table, keys, size and the private
  # helpers booleans?, sym_or_str? and keys_by_booleans.
  # Every method takes either arguments or a block evaluated in the context
  # of the receiver, never both.
  module DataFrameVariableOperation
    # pick up some variables to create sub DataFrame
    #
    # @param args [Array] keys (Symbol or String) or true/false/nil values
    #   marking the keys to pick
    # @return [DataFrame] an empty DataFrame when nothing is specified
    # @raise [DataFrameArgumentError]
    def pick(*args, &block)
      picker = args
      if block
        raise DataFrameArgumentError, 'Must not specify both arguments and block.' unless args.empty?

        picker = instance_eval(&block)
      end
      picker = [picker].flatten
      return DataFrame.new if picker.empty? || picker == [nil]

      picker = keys_by_booleans(picker) if booleans?(picker)
      raise DataFrameArgumentError, "Invalid argument #{args}" unless sym_or_str?(picker)

      # DataFrame#[] creates a Vector with single key is specified.
      # DataFrame#pick creates a DataFrame with single key.
      DataFrame.new(@table[picker])
    end

    # drop some variables to create remainder sub DataFrame
    #
    # @param args [Array] keys (Symbol or String) or true/false/nil values
    #   marking the keys to drop; keys that do not exist are ignored
    # @return [DataFrame] an empty DataFrame when every variable is dropped
    # @raise [DataFrameArgumentError]
    def drop(*args, &block)
      dropper = args
      if block
        raise DataFrameArgumentError, 'Must not specify both arguments and block.' unless args.empty?

        dropper = instance_eval(&block)
      end
      dropper = [dropper].flatten
      dropper = keys_by_booleans(dropper) if booleans?(dropper)
      # Validate before subtracting: the difference below is always a subset
      # of keys, so invalid arguments would go unnoticed.
      raise DataFrameArgumentError, "Invalid argument #{args}" unless sym_or_str?(dropper)

      # String keys are converted so that they are found among the Symbol keys.
      picker = keys - dropper.map(&:to_sym)
      return DataFrame.new if picker.empty?

      # DataFrame#[] creates a Vector with single key is specified.
      # DataFrame#drop creates a DataFrame with single key.
      DataFrame.new(@table[picker])
    end

    # rename variables to create new DataFrame
    #
    # @param args [Array] either (from, to) or a single Hash { from => to }
    # @return [DataFrame] self when nothing is specified
    # @raise [DataFrameArgumentError]
    def rename(*args, &block)
      renamer = args
      if block
        raise DataFrameArgumentError, 'Must not specify both arguments and a block' unless args.empty?

        renamer = instance_eval(&block)
      end
      renamer = [renamer].flatten
      return self if renamer.empty?

      return rename_by_hash([renamer].to_h) if renamer.size == 2 && sym_or_str?(renamer) # rename(from, to)
      return rename_by_hash(renamer[0]) if renamer.one? && renamer[0].is_a?(Hash) # rename({from => to})

      raise DataFrameArgumentError, "Invalid argument #{args}"
    end

    # assign variables to create new DataFrame
    #
    # @param args [Array] a single Hash { key => data }; data for an existing
    #   key replaces that variable, any other key is appended
    # @return [DataFrame] self when nothing is specified
    # @raise [DataFrameArgumentError] for invalid arguments or when the size
    #   of some data differs from the size of the DataFrame
    def assign(*args, &block)
      assigner = args
      if block
        raise DataFrameArgumentError, 'Must not specify both arguments and a block' unless args.empty?

        assigner = instance_eval(&block)
      end
      assigner = [assigner].flatten
      return self if assigner.empty? || assigner == [nil]

      raise DataFrameArgumentError, "Invalid argument #{args}" unless assigner.one? && assigner[0].is_a?(Hash)

      updater = {}
      appender = {}
      assigner[0].each do |key, value|
        # A String naming an existing variable must update it, not append a
        # second variable with the same name.
        key = key.to_sym if key.is_a?(String)
        (keys.include?(key) ? updater : appender)[key] = value
      end
      fields, arrays = update_fields_and_arrays(updater)
      append_to_fields_and_arrays(appender, fields, arrays) unless appender.empty?

      DataFrame.new(Arrow::Table.new(Arrow::Schema.new(fields), arrays))
    end

    private

    # Build a DataFrame whose fields are renamed according to +key_pairs+
    # ({ from => to }); the columns themselves are reused.
    def rename_by_hash(key_pairs)
      # String source keys are converted so that they match the Symbol keys.
      pairs = key_pairs.transform_keys { |from| from.is_a?(String) ? from.to_sym : from }
      fields = keys.map do |key|
        new_key = pairs[key]
        if new_key
          Arrow::Field.new(new_key.to_sym, @table[key].data_type)
        else
          @table.schema[key]
        end
      end
      schema = Arrow::Schema.new(fields)
      DataFrame.new(Arrow::Table.new(schema, @table.columns))
    end

    # Fields and chunked arrays of the table in which every variable that
    # has data in +updater+ is replaced.
    #
    # @return [Array(Array, Array)] fields and arrays
    def update_fields_and_arrays(updater)
      fields = @table.columns.map(&:field)
      arrays = @table.columns.map(&:data) # chunked_arrays
      keys.each_with_index do |key, i|
        data = updater[key]
        next unless data

        a = arrow_array_from(data)
        fields[i] = Arrow::Field.new(key, a.value_data_type)
        arrays[i] = Arrow::ChunkedArray.new([a])
      end
      [fields, arrays]
    end

    # Append a field and a chunked array for every pair in +appender+ to
    # +fields+ and +arrays+ (both are modified in place).
    def append_to_fields_and_arrays(appender, fields, arrays)
      appender.each do |key, data|
        a = arrow_array_from(data)
        fields << Arrow::Field.new(key.to_sym, a.value_data_type)
        arrays << Arrow::ChunkedArray.new([a])
      end
    end

    # Arrow::Array made from +data+ (a Vector is converted with to_a first).
    #
    # @raise [DataFrameArgumentError] when data.size differs from size
    def arrow_array_from(data)
      raise DataFrameArgumentError, "Data size mismatch (#{data.size} != #{size})" if data.size != size

      Arrow::Array.new(data.is_a?(Vector) ? data.to_a : data)
    end
  end
end
@@ -5,10 +5,12 @@ module RedAmber
5
5
  # @data : holds Arrow::ChunkedArray
6
6
  class Vector
7
7
  # mix-in
8
+ include VectorCompensable
8
9
  include VectorFunctions
9
10
 
10
11
  # chunked_array may come from column.data
11
12
  def initialize(array)
13
+ @key = nil # default is 'headless'
12
14
  case array
13
15
  when Vector
14
16
  @data = array.data
@@ -17,18 +19,31 @@ module RedAmber
17
19
  when Array
18
20
  @data = Arrow::Array.new(array)
19
21
  else
20
- raise ArgumentError, 'Unknown array in argument'
22
+ raise VectorArgumentError, 'Unknown array in argument'
21
23
  end
22
24
  end
23
25
 
24
26
  attr_reader :data
27
+ attr_accessor :key
25
28
 
26
29
  def to_s
27
30
  @data.to_a.inspect
28
31
  end
29
32
 
30
- def inspect
31
- format "#<#{self.class}(:#{type}, size=#{size}):0x%016x>\n#{self}", object_id
33
+ def inspect(limit: 80)
34
+ sio = StringIO.new << '['
35
+ to_a.each_with_object(sio).with_index do |(e, s), i|
36
+ next_str = "#{s.size > 1 ? ', ' : ''}#{e.inspect}"
37
+ if (s.size + next_str.size) < limit
38
+ s << next_str
39
+ else
40
+ s << ', ... ' if i < size
41
+ break
42
+ end
43
+ end
44
+ sio << ']'
45
+
46
+ format "#<#{self.class}(:#{type}, size=#{size}):0x%016x>\n%s\n", object_id, sio.string
32
47
  end
33
48
 
34
49
  def values
@@ -49,8 +64,24 @@ module RedAmber
49
64
  @data.value_type.nick.to_sym
50
65
  end
51
66
 
52
- def data_type
53
- @data.value_type
67
+ def boolean?
68
+ type == :boolean
69
+ end
70
+
71
+ def numeric?
72
+ type_class < Arrow::NumericDataType
73
+ end
74
+
75
+ def string?
76
+ type == :string
77
+ end
78
+
79
+ def temporal?
80
+ type_class < Arrow::TemporalDataType
81
+ end
82
+
83
+ def type_class
84
+ @data.value_data_type.class
54
85
  end
55
86
 
56
87
  # def each() end
@@ -66,11 +97,32 @@ module RedAmber
66
97
  # def each_chunk() end
67
98
 
68
99
  def tally
69
- values.tally
100
+ hash = values.tally
101
+ if (type_class < Arrow::FloatingPointDataType) && is_nan.any
102
+ a = 0
103
+ hash.each do |key, value|
104
+ if key.is_a?(Float) && key.nan?
105
+ hash.delete(key)
106
+ a += value
107
+ end
108
+ end
109
+ hash[Float::NAN] = a
110
+ end
111
+ hash
112
+ end
113
+
114
+ def value_counts
115
+ values, counts = Arrow::Function.find(:value_counts).execute([data]).value.fields
116
+ values.zip(counts).to_h
70
117
  end
71
118
 
72
119
  def n_nulls
73
120
  @data.n_nulls
74
121
  end
122
+ alias_method :n_nils, :n_nulls
123
+
124
+ def n_nans
125
+ numeric? ? is_nan.to_a.count(true) : 0
126
+ end
75
127
  end
76
128
  end