red_amber 0.1.3 → 0.1.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43)
  1. checksums.yaml +4 -4
  2. data/.rubocop.yml +31 -7
  3. data/CHANGELOG.md +214 -10
  4. data/Gemfile +4 -0
  5. data/README.md +117 -342
  6. data/benchmark/csv_load_penguins.yml +15 -0
  7. data/benchmark/drop_nil.yml +11 -0
  8. data/doc/DataFrame.md +854 -0
  9. data/doc/Vector.md +449 -0
  10. data/doc/image/arrow_table_new.png +0 -0
  11. data/doc/image/dataframe/assign.png +0 -0
  12. data/doc/image/dataframe/drop.png +0 -0
  13. data/doc/image/dataframe/pick.png +0 -0
  14. data/doc/image/dataframe/remove.png +0 -0
  15. data/doc/image/dataframe/rename.png +0 -0
  16. data/doc/image/dataframe/slice.png +0 -0
  17. data/doc/image/dataframe_model.png +0 -0
  18. data/doc/image/example_in_red_arrow.png +0 -0
  19. data/doc/image/tdr.png +0 -0
  20. data/doc/image/tdr_and_table.png +0 -0
  21. data/doc/image/tidy_data_in_TDR.png +0 -0
  22. data/doc/image/vector/binary_element_wise.png +0 -0
  23. data/doc/image/vector/unary_aggregation.png +0 -0
  24. data/doc/image/vector/unary_aggregation_w_option.png +0 -0
  25. data/doc/image/vector/unary_element_wise.png +0 -0
  26. data/doc/tdr.md +56 -0
  27. data/doc/tdr_ja.md +56 -0
  28. data/lib/red-amber.rb +27 -0
  29. data/lib/red_amber/data_frame.rb +91 -37
  30. data/lib/red_amber/{data_frame_output.rb → data_frame_displayable.rb} +49 -41
  31. data/lib/red_amber/data_frame_indexable.rb +38 -0
  32. data/lib/red_amber/data_frame_observation_operation.rb +11 -0
  33. data/lib/red_amber/data_frame_selectable.rb +155 -48
  34. data/lib/red_amber/data_frame_variable_operation.rb +137 -0
  35. data/lib/red_amber/helper.rb +61 -0
  36. data/lib/red_amber/vector.rb +69 -16
  37. data/lib/red_amber/vector_functions.rb +80 -45
  38. data/lib/red_amber/vector_selectable.rb +124 -0
  39. data/lib/red_amber/vector_updatable.rb +104 -0
  40. data/lib/red_amber/version.rb +1 -1
  41. data/lib/red_amber.rb +1 -16
  42. data/red_amber.gemspec +3 -6
  43. metadata +38 -9
@@ -5,19 +5,23 @@ module RedAmber
5
5
  # @table : holds Arrow::Table object
6
6
  class DataFrame
7
7
  # mix-in
8
+ include DataFrameDisplayable
9
+ include DataFrameIndexable
8
10
  include DataFrameSelectable
9
- include DataFrameOutput
11
+ include DataFrameObservationOperation
12
+ include DataFrameVariableOperation
13
+ include Helper
10
14
 
11
15
  def initialize(*args)
12
- # DataFrame.new, DataFrame.new([]), DataFrame.new({}), DataFrame.new(nil)
13
- # returns empty DataFrame
14
- @table = Arrow::Table.new({}, [])
16
+ @variables = @keys = @vectors = @types = @data_types = nil
15
17
  # bug in gobject-introspection: ruby-gnome/ruby-gnome#1472
16
18
  # [Arrow::Table] == [nil] shows ArgumentError
17
19
  # temporarily use a Yoda condition as a workaround
18
- return if args.empty? || args == [[]] || args == [{}] || [nil] == args
19
-
20
- if args.size > 1
20
+ if args.empty? || args == [[]] || args == [{}] || [nil] == args
21
+ # DataFrame.new, DataFrame.new([]), DataFrame.new({}), DataFrame.new(nil)
22
+ # returns empty DataFrame
23
+ @table = Arrow::Table.new({}, [])
24
+ elsif args.size > 1
21
25
  @table = Arrow::Table.new(*args)
22
26
  else
23
27
  arg = args[0]
@@ -39,67 +43,71 @@ module RedAmber
39
43
 
40
44
  attr_reader :table
41
45
 
46
+ def to_arrow
47
+ @table
48
+ end
49
+
42
50
  def save(output, options = {})
43
51
  @table.save(output, options)
44
52
  end
45
53
 
46
- # Properties ===
47
- def n_rows
54
+ def size
48
55
  @table.n_rows
49
56
  end
50
- alias_method :nrow, :n_rows
51
- alias_method :size, :n_rows
52
- alias_method :length, :n_rows
57
+ alias_method :n_rows, :size
58
+ alias_method :n_obs, :size
53
59
 
54
- def n_columns
60
+ def n_keys
55
61
  @table.n_columns
56
62
  end
57
- alias_method :ncol, :n_columns
58
- alias_method :width, :n_columns
63
+ alias_method :n_cols, :n_keys
64
+ alias_method :n_vars, :n_keys
59
65
 
60
66
  def shape
61
- [n_rows, n_columns]
67
+ [size, n_keys]
62
68
  end
63
69
 
64
- def column_names
65
- @table.columns.map { |column| column.name.to_sym }
70
+ def variables
71
+ @variables || @variables = init_instance_vars(:variables)
66
72
  end
67
- alias_method :keys, :column_names
68
- alias_method :header, :column_names
73
+ alias_method :vars, :variables
74
+
75
+ def keys
76
+ @keys || @keys = init_instance_vars(:keys)
77
+ end
78
+ alias_method :column_names, :keys
79
+ alias_method :var_names, :keys
69
80
 
70
81
  def key?(key)
71
- column_names.include?(key.to_sym)
82
+ @keys.include?(key.to_sym)
72
83
  end
73
84
  alias_method :has_key?, :key?
74
85
 
75
86
  def key_index(key)
76
- column_names.find_index(key.to_sym)
87
+ @keys.find_index(key.to_sym)
77
88
  end
78
89
  alias_method :find_index, :key_index
79
90
  alias_method :index, :key_index
80
91
 
81
92
  def types
82
- @table.columns.map do |column|
83
- column.data_type.to_s.to_sym
84
- end
93
+ @types || @types = @table.columns.map { |column| column.data.value_type.nick.to_sym }
85
94
  end
86
95
 
87
- def data_types
88
- @table.columns.map do |column|
89
- column.data_type.class
90
- end
96
+ def type_classes
97
+ @data_types || @data_types = @table.columns.map { |column| column.data_type.class }
91
98
  end
92
99
 
93
100
  def vectors
94
- @table.columns.map do |column|
95
- Vector.new(column.data)
96
- end
101
+ @vectors || @vectors = init_instance_vars(:vectors)
102
+ end
103
+
104
+ def indices
105
+ (0...size).to_a
97
106
  end
107
+ alias_method :indexes, :indices
98
108
 
99
109
  def to_h
100
- @table.columns.each_with_object({}) do |column, result|
101
- result[column.name.to_sym] = column.entries
102
- end
110
+ variables.transform_values(&:to_a)
103
111
  end
104
112
 
105
113
  def to_a
@@ -118,13 +126,59 @@ module RedAmber
118
126
  end
119
127
 
120
128
  def empty?
121
- @table.columns.empty?
129
+ variables.empty?
122
130
  end
123
131
 
124
132
  def to_rover
125
133
  Rover::DataFrame.new(to_h)
126
134
  end
127
135
 
128
- # def to_parquet() end
136
+ def to_iruby
137
+ require 'iruby'
138
+ return ['text/plain', '(empty DataFrame)'] if empty?
139
+
140
+ if ENV.fetch('RED_AMBER_OUTPUT_MODE', 'tdr') == 'table'
141
+ ['text/html', html_table]
142
+ elsif size <= 5
143
+ ['text/plain', tdr_str(tally: 0)]
144
+ else
145
+ ['text/plain', tdr_str]
146
+ end
147
+ end
148
+
149
+ private
150
+
151
+ # initialize @variables, @keys, @vectors and return one of them
152
+ def init_instance_vars(var)
153
+ ary = @table.columns.each_with_object([{}, [], []]) do |column, (variables, keys, vectors)|
154
+ v = Vector.new(column.data)
155
+ k = column.name.to_sym
156
+ v.key = k
157
+ variables[k] = v
158
+ keys << k
159
+ vectors << v
160
+ end
161
+ @variables, @keys, @vectors = ary
162
+ ary[%i[variables keys vectors].index(var)]
163
+ end
164
+
165
+ def html_table
166
+ reduced = size > 8 ? self[0..4, -4..-1] : self
167
+
168
+ converted = reduced.assign do
169
+ vectors.select.with_object({}) do |vector, assigner|
170
+ if vector.has_nil?
171
+ assigner[vector.key] = vector.to_a.map do |e|
172
+ e = e.nil? ? '<i>(nil)</i>' : e.to_s # nil
173
+ e = '""' if e.empty? # empty string
174
+ e.sub(/(\s+)/, '"\1"') # blank spaces
175
+ end
176
+ end
177
+ end
178
+ end
179
+
180
+ html = IRuby::HTML.table(converted.to_h, maxrows: 8, maxcols: 15)
181
+ "#{size} x #{n_keys} vector#{pl(n_keys)} ; #{html}"
182
+ end
129
183
  end
130
184
  end
@@ -4,7 +4,7 @@ require 'stringio'
4
4
 
5
5
  module RedAmber
6
6
  # mix-ins for the class DataFrame
7
- module DataFrameOutput
7
+ module DataFrameDisplayable
8
8
  def to_s
9
9
  @table.to_s
10
10
  end
@@ -13,19 +13,37 @@ module RedAmber
13
13
 
14
14
  # def summary() end
15
15
 
16
- def inspect_raw
17
- format "#<#{self.class}:0x%016x>\n#{self}", object_id
16
+ def inspect
17
+ if ENV.fetch('RED_AMBER_OUTPUT_MODE', 'tdr') == 'table'
18
+ "#<#{shape_str(with_id: true)}>\n#{self}"
19
+ else
20
+ "#<#{shape_str(with_id: true)}>\n#{dataframe_info(3)}"
21
+ end
18
22
  end
19
23
 
20
- # - tally_level: max level to use tally mode
21
- # - max_element: max element to show values in each row
22
- # - TODO: Is it better to change name other than `inspect` ?
23
- # - TODO: Fall back to inspect_raw when treating large dataset
24
- # - TODO: Refactor code to smaller methods
25
- def inspect(tally_level: 5, max_element: 5)
26
- return '#<RedAmber::DataFrame (empty)>' if empty?
24
+ # - limit: max num of Vectors to show
25
+ # - tally: max level to use tally mode
26
+ # - elements: max element to show values in each vector
27
+ def tdr(limit = 10, tally: 5, elements: 5)
28
+ puts tdr_str(limit, tally: tally, elements: elements)
29
+ end
30
+
31
+ def tdr_str(limit = 10, tally: 5, elements: 5)
32
+ "#{shape_str}\n#{dataframe_info(limit, tally_level: tally, max_element: elements)}"
33
+ end
34
+
35
+ private # =====
36
+
37
+ def shape_str(with_id: false)
38
+ shape_info = empty? ? '(empty)' : "#{size} x #{n_keys} Vector#{pl(n_keys)}"
39
+ id = with_id ? format(', 0x%016x', object_id) : ''
40
+ "#{self.class} : #{shape_info}#{id}"
41
+ end
27
42
 
28
- stringio = StringIO.new # output string buffer
43
+ def dataframe_info(limit, tally_level: 5, max_element: 5)
44
+ return '' if empty?
45
+
46
+ limit = n_keys if [:all, -1].include? limit
29
47
 
30
48
  tallys = vectors.map(&:tally)
31
49
  levels = tallys.map(&:size)
@@ -34,52 +52,41 @@ module RedAmber
34
52
  headers = { idx: '#', key: 'key', type: 'type', levels: 'level', data: 'data_preview' }
35
53
  header_format = make_header_format(levels, headers, quoted_keys)
36
54
 
37
- # 1st row: show shape of the dataframe
38
- vs = "Vector#{pl(ncol)}"
39
- stringio.puts \
40
- "#{self.class} : #{nrow} x #{ncol} #{vs}"
41
-
42
- # 2nd row: show var counts by type
43
- stringio.puts "#{vs} : #{var_type_count(type_groups).join(', ')}"
55
+ sio = StringIO.new # output string buffer
56
+ sio.puts "Vector#{pl(n_keys)} : #{var_type_count(type_groups).join(', ')}"
57
+ sio.printf header_format, *headers.values
44
58
 
45
- # 3rd row: print header of rows
46
- stringio.printf header_format, *headers.values
47
-
48
- # 4th row ~: show details for each column (vector)
49
59
  vectors.each.with_index do |vector, i|
60
+ if i >= limit
61
+ sio << " ... #{n_keys - i} more Vector#{pl(n_keys - i)} ...\n"
62
+ break
63
+ end
50
64
  key = quoted_keys[i]
51
65
  type = types[i]
52
66
  type_group = type_groups[i]
53
67
  data_tally = tallys[i]
54
-
55
68
  a = case type_group
56
69
  when :numeric, :string, :boolean
57
- if data_tally.size <= tally_level && data_tally.size != nrow
70
+ if data_tally.size <= tally_level && data_tally.size != size
58
71
  [data_tally.to_s]
59
72
  else
60
- [shorthand(vector, nrow, max_element)].concat na_string(vector)
73
+ [shorthand(vector, size, max_element)].concat na_string(vector)
61
74
  end
62
75
  else
63
- shorthand(vector, nrow, max_element)
76
+ [shorthand(vector, size, max_element)]
64
77
  end
65
- stringio.printf header_format, i + 1, key, type, data_tally.size, a.join(', ')
78
+ sio.printf header_format, i + 1, key, type, data_tally.size, a.join(', ')
66
79
  end
67
- stringio.string
68
- end
69
-
70
- private # =====
71
-
72
- def pl(num)
73
- num > 1 ? 's' : ''
80
+ sio.string
74
81
  end
75
82
 
76
83
  def make_header_format(levels, headers, quoted_keys)
77
- # find longest word to adjust column width
78
- w_idx = ncol.to_s.size
84
+ # find longest word to adjust width
85
+ w_idx = n_keys.to_s.size
79
86
  w_key = [quoted_keys.map(&:size).max, headers[:key].size].max
80
87
  w_type = [types.map(&:size).max, headers[:type].size].max
81
- w_row = [levels.map { |l| l.to_s.size }.max, headers[:levels].size].max
82
- "%-#{w_idx}s %-#{w_key}s %-#{w_type}s %#{w_row}s %s\n"
88
+ w_level = [levels.map { |l| l.to_s.size }.max, headers[:levels].size].max
89
+ "%-#{w_idx}s %-#{w_key}s %-#{w_type}s %#{w_level}s %s\n"
83
90
  end
84
91
 
85
92
  def type_group(data_type)
@@ -103,10 +110,11 @@ module RedAmber
103
110
  a
104
111
  end
105
112
 
106
- def shorthand(vector, nrow, max_element)
107
- a = vector.to_a.take(max_element)
113
+ def shorthand(vector, size, max_element)
114
+ max = vector.temporal? ? 2 : max_element
115
+ a = vector.to_a.take(max)
108
116
  a.map! { |e| e.nil? ? 'nil' : e.inspect }
109
- a << '... ' if nrow > max_element
117
+ a << '... ' if size > max
110
118
  "[#{a.join(', ')}]"
111
119
  end
112
120
 
@@ -0,0 +1,38 @@
1
+ # frozen_string_literal: true
2
+
3
+ module RedAmber
4
+ # mix-ins for the class DataFrame
5
+ module DataFrameIndexable
6
+ # Common method
7
+ def map_indices(*indices)
8
+ return self if indices.empty?
9
+
10
+ indices = indices[0].data if indices[0].is_a?(Vector)
11
+
12
+ new_dataframe_by(indices)
13
+ end
14
+
15
+ # @param sort_keys [Arrow::SortKey]
16
+ # :key, "key" or "+key" denotes ascending,
17
+ # "-key" denotes descending order
18
+ # @return [RedAmber::Vector] Sorted indices in Vector
19
+ def sort_indices(*sort_keys)
20
+ indices = @table.sort_indices(sort_keys.flatten)
21
+ Vector.new(indices)
22
+ end
23
+
24
+ # @return [RedAmber::DataFrame] Sorted DataFrame
25
+ def sort(*sort_keys)
26
+ indices = @table.sort_indices(sort_keys.flatten)
27
+
28
+ new_dataframe_by(indices)
29
+ end
30
+
31
+ private
32
+
33
+ def new_dataframe_by(index_array)
34
+ t = Arrow::Function.find(:take).execute([@table, index_array]).value
35
+ RedAmber::DataFrame.new(t)
36
+ end
37
+ end
38
+ end
@@ -0,0 +1,11 @@
1
+ # frozen_string_literal: true
2
+
3
+ module RedAmber
4
+ # mix-ins for the class DataFrame
5
+ module DataFrameObservationOperation
6
+ def group(aggregating_keys, func, target_keys)
7
+ t = table.group(*aggregating_keys)
8
+ RedAmber::DataFrame.new(t.send(func, *target_keys))
9
+ end
10
+ end
11
+ end
@@ -3,89 +3,196 @@
3
3
  module RedAmber
4
4
  # mix-in for the class DataFrame
5
5
  module DataFrameSelectable
6
- # select columns: [symbol] or [string]
7
- # select rows: [array of index], [range]
6
+ # select variables: [symbol] or [string]
7
+ # select observations: [array of index], [range]
8
8
  def [](*args)
9
+ args.flatten!
9
10
  raise DataFrameArgumentError, 'Empty dataframe' if empty?
10
- raise DataFrameArgumentError, 'Empty argument' if args.empty?
11
+ return remove_all_values if args.empty? || args[0].nil?
11
12
 
12
- # expand Range like [1..3, 4] to [1, 2, 3, 4]
13
- expanded =
14
- args.each_with_object([]) do |e, a|
15
- e.is_a?(Range) ? a.concat(normalized_array(e)) : a.append(e)
13
+ vector = parse_to_vector(args)
14
+ if vector.boolean?
15
+ return filter_by_vector(vector.data) if vector.size == size
16
+
17
+ raise DataFrameArgumentError, "Size is not match in booleans: #{args}"
18
+ end
19
+ return take_by_array(vector) if vector.numeric?
20
+ return select_vars_by_keys(vector.to_a.map(&:to_sym)) if vector.string? || vector.type == :dictionary
21
+
22
+ raise DataFrameArgumentError, "Invalid argument: #{args}"
23
+ end
24
+
25
+ # slice and select some observations to create sub DataFrame
26
+ def slice(*args, &block)
27
+ slicer = args
28
+ if block
29
+ raise DataFrameArgumentError, 'Must not specify both arguments and block.' unless args.empty?
30
+
31
+ slicer = instance_eval(&block)
32
+ end
33
+ slicer = [slicer].flatten
34
+
35
+ raise DataFrameArgumentError, 'Empty dataframe' if empty?
36
+ return remove_all_values if slicer.empty? || slicer[0].nil?
37
+
38
+ vector = parse_to_vector(slicer)
39
+ if vector.boolean?
40
+ return filter_by_vector(vector.data) if vector.size == size
41
+
42
+ raise DataFrameArgumentError, "Size is not match in booleans: #{slicer}"
43
+ end
44
+ return take_by_array(vector) if vector.numeric?
45
+
46
+ raise DataFrameArgumentError, "Invalid argument #{slicer}"
47
+ end
48
+
49
+ # remove selected observations to create sub DataFrame
50
+ def remove(*args, &block)
51
+ remover = args
52
+ if block
53
+ raise DataFrameArgumentError, 'Must not specify both arguments and block.' unless args.empty?
54
+
55
+ remover = instance_eval(&block)
56
+ end
57
+ remover = [remover].flatten
58
+
59
+ raise DataFrameArgumentError, 'Empty dataframe' if empty?
60
+ return self if remover.empty? || remover[0].nil?
61
+
62
+ vector = parse_to_vector(remover)
63
+ if vector.boolean?
64
+ return filter_by_vector(vector.primitive_invert.data) if vector.size == size
65
+
66
+ raise DataFrameArgumentError, "Size is not match in booleans: #{remover}"
67
+ end
68
+ if vector.numeric?
69
+ raise DataFrameArgumentError, "Index out of range: #{vector.min}" if vector.min <= -size - 1
70
+
71
+ normalized_indices = (vector < 0).if_else(vector + size, vector) # normalize index from tail
72
+ if normalized_indices.max >= size
73
+ raise DataFrameArgumentError, "Index out of range: #{normalized_indices.max}"
16
74
  end
17
75
 
18
- return select_rows(expanded) if integers?(expanded)
19
- return select_columns(expanded.map(&:to_sym)) if sym_or_str?(expanded)
76
+ normalized_indices = normalized_indices.floor.to_a.map(&:to_i) # round to integer array
77
+ return remove_all_values if normalized_indices == indices
78
+ return self if normalized_indices.empty?
79
+
80
+ index_array = indices - normalized_indices
20
81
 
21
- raise DataFrameArgumentError, "Invalid argument #{args}"
82
+ datum = Arrow::Function.find(:take).execute([table, index_array])
83
+ return DataFrame.new(datum.value)
84
+ end
85
+
86
+ raise DataFrameArgumentError, "Invalid argument #{remover}"
87
+ end
88
+
89
+ def remove_nil
90
+ func = Arrow::Function.find(:drop_null)
91
+ DataFrame.new(func.execute([table]).value)
92
+ end
93
+ alias_method :drop_nil, :remove_nil
94
+
95
+ # Select a variable by a key in String or Symbol
96
+ def v(key)
97
+ unless key.is_a?(Symbol) || key.is_a?(String)
98
+ raise DataFrameArgumentError, "Key is not a Symbol or String [#{key}]"
99
+ end
100
+ raise DataFrameArgumentError, "Key not exist [#{key}]" unless key?(key)
101
+
102
+ variables[key.to_sym]
22
103
  end
23
104
 
24
- def head(n_rows = 5)
25
- raise DataFrameArgumentError, "Index is out of range #{n_rows}" if n_rows.negative?
105
+ def head(n_obs = 5)
106
+ raise DataFrameArgumentError, "Index is out of range #{n_obs}" if n_obs.negative?
26
107
 
27
- self[0...[n_rows, size].min]
108
+ self[0...[n_obs, size].min]
28
109
  end
29
110
 
30
- def tail(n_rows = 5)
31
- raise DataFrameArgumentError, "Index is out of range #{n_rows}" if n_rows.negative?
111
+ def tail(n_obs = 5)
112
+ raise DataFrameArgumentError, "Index is out of range #{n_obs}" if n_obs.negative?
32
113
 
33
- self[-[n_rows, size].min..]
114
+ self[-[n_obs, size].min..]
34
115
  end
35
116
 
36
- def first(n_rows = 1)
37
- head(n_rows)
117
+ def first(n_obs = 1)
118
+ head(n_obs)
38
119
  end
39
120
 
40
- def last(n_rows = 1)
41
- tail(n_rows)
121
+ def last(n_obs = 1)
122
+ tail(n_obs)
42
123
  end
43
124
 
44
- private # =====
125
+ # Undocumented
126
+ # TODO: support for option {boundscheck: true}
127
+ def take(*indices)
128
+ indices.flatten!
129
+ return remove_all_values if indices.empty?
45
130
 
46
- def select_columns(keys)
47
- if keys.one?
48
- t = @table[*keys]
49
- raise DataFrameArgumentError, "Key is not exists #{keys}" unless t
131
+ indices = indices[0] if indices.one? && !indices[0].is_a?(Numeric)
132
+ indices = Vector.new(indices) unless indices.is_a?(Vector)
50
133
 
51
- Vector.new(t.data)
52
- else
53
- DataFrame.new(@table[keys])
54
- end
134
+ take_by_array(indices)
55
135
  end
56
136
 
57
- def select_rows(indeces)
58
- out_of_range?(indeces) && raise(DataFrameArgumentError, "Invalid index: #{indeces} for 0..#{size - 1}")
137
+ # Undocumented
138
+ # TODO: support for option {null_selection_behavior: :drop}
139
+ def filter(*booleans)
140
+ booleans.flatten!
141
+ return remove_all_values if booleans.empty?
59
142
 
60
- a = indeces.map { |i| @table.slice(i).to_a }
61
- DataFrame.new(@table.schema, a)
143
+ b = booleans[0]
144
+ case b
145
+ when Vector
146
+ raise DataFrameArgumentError, 'Argument is not a boolean.' unless b.boolean?
147
+
148
+ filter_by_vector(b.data)
149
+ when Arrow::BooleanArray
150
+ filter_by_vector(b)
151
+ else
152
+ raise DataFrameArgumentError, 'Argument is not a boolean.' unless booleans?(booleans)
153
+
154
+ filter_by_vector(Arrow::BooleanArray.new(booleans))
155
+ end
62
156
  end
63
157
 
64
- def normalized_array(range)
65
- both_end = [range.begin, range.end]
66
- both_end[1] -= 1 if range.exclude_end? && range.end.is_a?(Integer)
158
+ private
67
159
 
68
- if both_end.any?(Integer) || both_end.all?(&:nil?)
69
- if both_end.any? { |e| e&.>=(size) || e&.<(-size) }
70
- raise DataFrameArgumentError, "Index out of range: #{range} for 0..#{size - 1}"
71
- end
160
+ def select_vars_by_keys(keys)
161
+ if keys.one?
162
+ key = keys[0].to_sym
163
+ raise DataFrameArgumentError, "Key does not exist #{keys}" unless key? key
72
164
 
73
- (0...size).to_a[range]
165
+ variables[key]
74
166
  else
75
- range.to_a
167
+ DataFrame.new(@table[keys])
76
168
  end
77
169
  end
78
170
 
79
- def out_of_range?(indeces)
80
- indeces.max >= size || indeces.min < -size
171
+ # Accepts indices by numeric Vector
172
+ def take_by_array(indices)
173
+ raise DataFrameArgumentError, "Indices must be a numeric Vector: #{indices}" unless indices.numeric?
174
+ raise DataFrameArgumentError, "Index out of range: #{indices.min}" if indices.min <= -size - 1
175
+
176
+ normalized_indices = (indices < 0).if_else(indices + size, indices) # normalize index from tail
177
+ raise DataFrameArgumentError, "Index out of range: #{normalized_indices.max}" if normalized_indices.max >= size
178
+
179
+ index_array = Arrow::UInt64ArrayBuilder.build(normalized_indices.data) # round to integer array
180
+
181
+ datum = Arrow::Function.find(:take).execute([table, index_array])
182
+ DataFrame.new(datum.value)
81
183
  end
82
184
 
83
- def integers?(enum)
84
- enum.all?(Integer)
185
+ # Accepts booleans by Arrow::BooleanArray
186
+ def filter_by_vector(boolean_array)
187
+ raise DataFrameArgumentError, 'Booleans must be same size as self.' unless boolean_array.length == size
188
+
189
+ datum = Arrow::Function.find(:filter).execute([table, boolean_array])
190
+ DataFrame.new(datum.value)
85
191
  end
86
192
 
87
- def sym_or_str?(enum)
88
- enum.all? { |e| e.is_a?(Symbol) || e.is_a?(String) }
193
+ # return a DataFrame with same keys as self without values
194
+ def remove_all_values
195
+ filter_by_vector(Arrow::BooleanArray.new([false] * size))
89
196
  end
90
197
  end
91
198
  end