red_amber 0.1.3 → 0.1.6

Sign up to get free protection for your applications and to get access to all the features.
Files changed (43) hide show
  1. checksums.yaml +4 -4
  2. data/.rubocop.yml +31 -7
  3. data/CHANGELOG.md +214 -10
  4. data/Gemfile +4 -0
  5. data/README.md +117 -342
  6. data/benchmark/csv_load_penguins.yml +15 -0
  7. data/benchmark/drop_nil.yml +11 -0
  8. data/doc/DataFrame.md +854 -0
  9. data/doc/Vector.md +449 -0
  10. data/doc/image/arrow_table_new.png +0 -0
  11. data/doc/image/dataframe/assign.png +0 -0
  12. data/doc/image/dataframe/drop.png +0 -0
  13. data/doc/image/dataframe/pick.png +0 -0
  14. data/doc/image/dataframe/remove.png +0 -0
  15. data/doc/image/dataframe/rename.png +0 -0
  16. data/doc/image/dataframe/slice.png +0 -0
  17. data/doc/image/dataframe_model.png +0 -0
  18. data/doc/image/example_in_red_arrow.png +0 -0
  19. data/doc/image/tdr.png +0 -0
  20. data/doc/image/tdr_and_table.png +0 -0
  21. data/doc/image/tidy_data_in_TDR.png +0 -0
  22. data/doc/image/vector/binary_element_wise.png +0 -0
  23. data/doc/image/vector/unary_aggregation.png +0 -0
  24. data/doc/image/vector/unary_aggregation_w_option.png +0 -0
  25. data/doc/image/vector/unary_element_wise.png +0 -0
  26. data/doc/tdr.md +56 -0
  27. data/doc/tdr_ja.md +56 -0
  28. data/lib/red-amber.rb +27 -0
  29. data/lib/red_amber/data_frame.rb +91 -37
  30. data/lib/red_amber/{data_frame_output.rb → data_frame_displayable.rb} +49 -41
  31. data/lib/red_amber/data_frame_indexable.rb +38 -0
  32. data/lib/red_amber/data_frame_observation_operation.rb +11 -0
  33. data/lib/red_amber/data_frame_selectable.rb +155 -48
  34. data/lib/red_amber/data_frame_variable_operation.rb +137 -0
  35. data/lib/red_amber/helper.rb +61 -0
  36. data/lib/red_amber/vector.rb +69 -16
  37. data/lib/red_amber/vector_functions.rb +80 -45
  38. data/lib/red_amber/vector_selectable.rb +124 -0
  39. data/lib/red_amber/vector_updatable.rb +104 -0
  40. data/lib/red_amber/version.rb +1 -1
  41. data/lib/red_amber.rb +1 -16
  42. data/red_amber.gemspec +3 -6
  43. metadata +38 -9
@@ -5,19 +5,23 @@ module RedAmber
5
5
  # @table : holds Arrow::Table object
6
6
  class DataFrame
7
7
  # mix-in
8
+ include DataFrameDisplayable
9
+ include DataFrameIndexable
8
10
  include DataFrameSelectable
9
- include DataFrameOutput
11
+ include DataFrameObservationOperation
12
+ include DataFrameVariableOperation
13
+ include Helper
10
14
 
11
15
  def initialize(*args)
12
- # DataFrame.new, DataFrame.new([]), DataFrame.new({}), DataFrame.new(nil)
13
- # returns empty DataFrame
14
- @table = Arrow::Table.new({}, [])
16
+ @variables = @keys = @vectors = @types = @data_types = nil
15
17
  # bug in gobject-introspection: ruby-gnome/ruby-gnome#1472
16
18
  # [Arrow::Table] == [nil] shows ArgumentError
17
19
  # temporarily use a Yoda condition as a workaround
18
- return if args.empty? || args == [[]] || args == [{}] || [nil] == args
19
-
20
- if args.size > 1
20
+ if args.empty? || args == [[]] || args == [{}] || [nil] == args
21
+ # DataFrame.new, DataFrame.new([]), DataFrame.new({}), DataFrame.new(nil)
22
+ # returns empty DataFrame
23
+ @table = Arrow::Table.new({}, [])
24
+ elsif args.size > 1
21
25
  @table = Arrow::Table.new(*args)
22
26
  else
23
27
  arg = args[0]
@@ -39,67 +43,71 @@ module RedAmber
39
43
 
40
44
  attr_reader :table
41
45
 
46
+ def to_arrow
47
+ @table
48
+ end
49
+
42
50
  def save(output, options = {})
43
51
  @table.save(output, options)
44
52
  end
45
53
 
46
- # Properties ===
47
- def n_rows
54
+ def size
48
55
  @table.n_rows
49
56
  end
50
- alias_method :nrow, :n_rows
51
- alias_method :size, :n_rows
52
- alias_method :length, :n_rows
57
+ alias_method :n_rows, :size
58
+ alias_method :n_obs, :size
53
59
 
54
- def n_columns
60
+ def n_keys
55
61
  @table.n_columns
56
62
  end
57
- alias_method :ncol, :n_columns
58
- alias_method :width, :n_columns
63
+ alias_method :n_cols, :n_keys
64
+ alias_method :n_vars, :n_keys
59
65
 
60
66
  def shape
61
- [n_rows, n_columns]
67
+ [size, n_keys]
62
68
  end
63
69
 
64
- def column_names
65
- @table.columns.map { |column| column.name.to_sym }
70
+ def variables
71
+ @variables || @variables = init_instance_vars(:variables)
66
72
  end
67
- alias_method :keys, :column_names
68
- alias_method :header, :column_names
73
+ alias_method :vars, :variables
74
+
75
+ def keys
76
+ @keys || @keys = init_instance_vars(:keys)
77
+ end
78
+ alias_method :column_names, :keys
79
+ alias_method :var_names, :keys
69
80
 
70
81
  def key?(key)
71
- column_names.include?(key.to_sym)
82
+ @keys.include?(key.to_sym)
72
83
  end
73
84
  alias_method :has_key?, :key?
74
85
 
75
86
  def key_index(key)
76
- column_names.find_index(key.to_sym)
87
+ @keys.find_index(key.to_sym)
77
88
  end
78
89
  alias_method :find_index, :key_index
79
90
  alias_method :index, :key_index
80
91
 
81
92
  def types
82
- @table.columns.map do |column|
83
- column.data_type.to_s.to_sym
84
- end
93
+ @types || @types = @table.columns.map { |column| column.data.value_type.nick.to_sym }
85
94
  end
86
95
 
87
- def data_types
88
- @table.columns.map do |column|
89
- column.data_type.class
90
- end
96
+ def type_classes
97
+ @data_types || @data_types = @table.columns.map { |column| column.data_type.class }
91
98
  end
92
99
 
93
100
  def vectors
94
- @table.columns.map do |column|
95
- Vector.new(column.data)
96
- end
101
+ @vectors || @vectors = init_instance_vars(:vectors)
102
+ end
103
+
104
+ def indices
105
+ (0...size).to_a
97
106
  end
107
+ alias_method :indexes, :indices
98
108
 
99
109
  def to_h
100
- @table.columns.each_with_object({}) do |column, result|
101
- result[column.name.to_sym] = column.entries
102
- end
110
+ variables.transform_values(&:to_a)
103
111
  end
104
112
 
105
113
  def to_a
@@ -118,13 +126,59 @@ module RedAmber
118
126
  end
119
127
 
120
128
  def empty?
121
- @table.columns.empty?
129
+ variables.empty?
122
130
  end
123
131
 
124
132
  def to_rover
125
133
  Rover::DataFrame.new(to_h)
126
134
  end
127
135
 
128
- # def to_parquet() end
136
+ def to_iruby
137
+ require 'iruby'
138
+ return ['text/plain', '(empty DataFrame)'] if empty?
139
+
140
+ if ENV.fetch('RED_AMBER_OUTPUT_MODE', 'tdr') == 'table'
141
+ ['text/html', html_table]
142
+ elsif size <= 5
143
+ ['text/plain', tdr_str(tally: 0)]
144
+ else
145
+ ['text/plain', tdr_str]
146
+ end
147
+ end
148
+
149
+ private
150
+
151
+ # initialize @variables, @keys, @vectors and return one of them
152
+ def init_instance_vars(var)
153
+ ary = @table.columns.each_with_object([{}, [], []]) do |column, (variables, keys, vectors)|
154
+ v = Vector.new(column.data)
155
+ k = column.name.to_sym
156
+ v.key = k
157
+ variables[k] = v
158
+ keys << k
159
+ vectors << v
160
+ end
161
+ @variables, @keys, @vectors = ary
162
+ ary[%i[variables keys vectors].index(var)]
163
+ end
164
+
165
+ def html_table
166
+ reduced = size > 8 ? self[0..4, -4..-1] : self
167
+
168
+ converted = reduced.assign do
169
+ vectors.select.with_object({}) do |vector, assigner|
170
+ if vector.has_nil?
171
+ assigner[vector.key] = vector.to_a.map do |e|
172
+ e = e.nil? ? '<i>(nil)</i>' : e.to_s # nil
173
+ e = '""' if e.empty? # empty string
174
+ e.sub(/(\s+)/, '"\1"') # blank spaces
175
+ end
176
+ end
177
+ end
178
+ end
179
+
180
+ html = IRuby::HTML.table(converted.to_h, maxrows: 8, maxcols: 15)
181
+ "#{size} x #{n_keys} vector#{pl(n_keys)} ; #{html}"
182
+ end
129
183
  end
130
184
  end
@@ -4,7 +4,7 @@ require 'stringio'
4
4
 
5
5
  module RedAmber
6
6
  # mix-ins for the class DataFrame
7
- module DataFrameOutput
7
+ module DataFrameDisplayable
8
8
  def to_s
9
9
  @table.to_s
10
10
  end
@@ -13,19 +13,37 @@ module RedAmber
13
13
 
14
14
  # def summary() end
15
15
 
16
- def inspect_raw
17
- format "#<#{self.class}:0x%016x>\n#{self}", object_id
16
+ def inspect
17
+ if ENV.fetch('RED_AMBER_OUTPUT_MODE', 'tdr') == 'table'
18
+ "#<#{shape_str(with_id: true)}>\n#{self}"
19
+ else
20
+ "#<#{shape_str(with_id: true)}>\n#{dataframe_info(3)}"
21
+ end
18
22
  end
19
23
 
20
- # - tally_level: max level to use tally mode
21
- # - max_element: max element to show values in each row
22
- # - TODO: Is it better to change name other than `inspect` ?
23
- # - TODO: Fall back to inspect_raw when treating large dataset
24
- # - TODO: Refactor code to smaller methods
25
- def inspect(tally_level: 5, max_element: 5)
26
- return '#<RedAmber::DataFrame (empty)>' if empty?
24
+ # - limit: max num of Vectors to show
25
+ # - tally: max level to use tally mode
26
+ # - elements: max element to show values in each vector
27
+ def tdr(limit = 10, tally: 5, elements: 5)
28
+ puts tdr_str(limit, tally: tally, elements: elements)
29
+ end
30
+
31
+ def tdr_str(limit = 10, tally: 5, elements: 5)
32
+ "#{shape_str}\n#{dataframe_info(limit, tally_level: tally, max_element: elements)}"
33
+ end
34
+
35
+ private # =====
36
+
37
+ def shape_str(with_id: false)
38
+ shape_info = empty? ? '(empty)' : "#{size} x #{n_keys} Vector#{pl(n_keys)}"
39
+ id = with_id ? format(', 0x%016x', object_id) : ''
40
+ "#{self.class} : #{shape_info}#{id}"
41
+ end
27
42
 
28
- stringio = StringIO.new # output string buffer
43
+ def dataframe_info(limit, tally_level: 5, max_element: 5)
44
+ return '' if empty?
45
+
46
+ limit = n_keys if [:all, -1].include? limit
29
47
 
30
48
  tallys = vectors.map(&:tally)
31
49
  levels = tallys.map(&:size)
@@ -34,52 +52,41 @@ module RedAmber
34
52
  headers = { idx: '#', key: 'key', type: 'type', levels: 'level', data: 'data_preview' }
35
53
  header_format = make_header_format(levels, headers, quoted_keys)
36
54
 
37
- # 1st row: show shape of the dataframe
38
- vs = "Vector#{pl(ncol)}"
39
- stringio.puts \
40
- "#{self.class} : #{nrow} x #{ncol} #{vs}"
41
-
42
- # 2nd row: show var counts by type
43
- stringio.puts "#{vs} : #{var_type_count(type_groups).join(', ')}"
55
+ sio = StringIO.new # output string buffer
56
+ sio.puts "Vector#{pl(n_keys)} : #{var_type_count(type_groups).join(', ')}"
57
+ sio.printf header_format, *headers.values
44
58
 
45
- # 3rd row: print header of rows
46
- stringio.printf header_format, *headers.values
47
-
48
- # 4th row ~: show details for each column (vector)
49
59
  vectors.each.with_index do |vector, i|
60
+ if i >= limit
61
+ sio << " ... #{n_keys - i} more Vector#{pl(n_keys - i)} ...\n"
62
+ break
63
+ end
50
64
  key = quoted_keys[i]
51
65
  type = types[i]
52
66
  type_group = type_groups[i]
53
67
  data_tally = tallys[i]
54
-
55
68
  a = case type_group
56
69
  when :numeric, :string, :boolean
57
- if data_tally.size <= tally_level && data_tally.size != nrow
70
+ if data_tally.size <= tally_level && data_tally.size != size
58
71
  [data_tally.to_s]
59
72
  else
60
- [shorthand(vector, nrow, max_element)].concat na_string(vector)
73
+ [shorthand(vector, size, max_element)].concat na_string(vector)
61
74
  end
62
75
  else
63
- shorthand(vector, nrow, max_element)
76
+ [shorthand(vector, size, max_element)]
64
77
  end
65
- stringio.printf header_format, i + 1, key, type, data_tally.size, a.join(', ')
78
+ sio.printf header_format, i + 1, key, type, data_tally.size, a.join(', ')
66
79
  end
67
- stringio.string
68
- end
69
-
70
- private # =====
71
-
72
- def pl(num)
73
- num > 1 ? 's' : ''
80
+ sio.string
74
81
  end
75
82
 
76
83
  def make_header_format(levels, headers, quoted_keys)
77
- # find longest word to adjust column width
78
- w_idx = ncol.to_s.size
84
+ # find longest word to adjust width
85
+ w_idx = n_keys.to_s.size
79
86
  w_key = [quoted_keys.map(&:size).max, headers[:key].size].max
80
87
  w_type = [types.map(&:size).max, headers[:type].size].max
81
- w_row = [levels.map { |l| l.to_s.size }.max, headers[:levels].size].max
82
- "%-#{w_idx}s %-#{w_key}s %-#{w_type}s %#{w_row}s %s\n"
88
+ w_level = [levels.map { |l| l.to_s.size }.max, headers[:levels].size].max
89
+ "%-#{w_idx}s %-#{w_key}s %-#{w_type}s %#{w_level}s %s\n"
83
90
  end
84
91
 
85
92
  def type_group(data_type)
@@ -103,10 +110,11 @@ module RedAmber
103
110
  a
104
111
  end
105
112
 
106
- def shorthand(vector, nrow, max_element)
107
- a = vector.to_a.take(max_element)
113
+ def shorthand(vector, size, max_element)
114
+ max = vector.temporal? ? 2 : max_element
115
+ a = vector.to_a.take(max)
108
116
  a.map! { |e| e.nil? ? 'nil' : e.inspect }
109
- a << '... ' if nrow > max_element
117
+ a << '... ' if size > max
110
118
  "[#{a.join(', ')}]"
111
119
  end
112
120
 
@@ -0,0 +1,38 @@
1
+ # frozen_string_literal: true
2
+
3
+ module RedAmber
4
+ # mix-ins for the class DataFrame
5
+ module DataFrameIndexable
6
+ # Common method
7
+ def map_indices(*indices)
8
+ return self if indices.empty?
9
+
10
+ indices = indices[0].data if indices[0].is_a?(Vector)
11
+
12
+ new_dataframe_by(indices)
13
+ end
14
+
15
+ # @param sort_keys [Arrow::SortKey]
16
+ # :key, "key" or "+key" denotes ascending,
17
+ # "-key" denotes descending order
18
+ # @return [RedAmber::Vector] Sorted indices in Vector
19
+ def sort_indices(*sort_keys)
20
+ indices = @table.sort_indices(sort_keys.flatten)
21
+ Vector.new(indices)
22
+ end
23
+
24
+ # @return [RedAmber::DataFrame] Sorted DataFrame
25
+ def sort(*sort_keys)
26
+ indices = @table.sort_indices(sort_keys.flatten)
27
+
28
+ new_dataframe_by(indices)
29
+ end
30
+
31
+ private
32
+
33
+ def new_dataframe_by(index_array)
34
+ t = Arrow::Function.find(:take).execute([@table, index_array]).value
35
+ RedAmber::DataFrame.new(t)
36
+ end
37
+ end
38
+ end
@@ -0,0 +1,11 @@
1
+ # frozen_string_literal: true
2
+
3
+ module RedAmber
4
+ # mix-ins for the class DataFrame
5
+ module DataFrameObservationOperation
6
+ def group(aggregating_keys, func, target_keys)
7
+ t = table.group(*aggregating_keys)
8
+ RedAmber::DataFrame.new(t.send(func, *target_keys))
9
+ end
10
+ end
11
+ end
@@ -3,89 +3,196 @@
3
3
  module RedAmber
4
4
  # mix-in for the class DataFrame
5
5
  module DataFrameSelectable
6
- # select columns: [symbol] or [string]
7
- # select rows: [array of index], [range]
6
+ # select variables: [symbol] or [string]
7
+ # select observations: [array of index], [range]
8
8
  def [](*args)
9
+ args.flatten!
9
10
  raise DataFrameArgumentError, 'Empty dataframe' if empty?
10
- raise DataFrameArgumentError, 'Empty argument' if args.empty?
11
+ return remove_all_values if args.empty? || args[0].nil?
11
12
 
12
- # expand Range like [1..3, 4] to [1, 2, 3, 4]
13
- expanded =
14
- args.each_with_object([]) do |e, a|
15
- e.is_a?(Range) ? a.concat(normalized_array(e)) : a.append(e)
13
+ vector = parse_to_vector(args)
14
+ if vector.boolean?
15
+ return filter_by_vector(vector.data) if vector.size == size
16
+
17
+ raise DataFrameArgumentError, "Size is not match in booleans: #{args}"
18
+ end
19
+ return take_by_array(vector) if vector.numeric?
20
+ return select_vars_by_keys(vector.to_a.map(&:to_sym)) if vector.string? || vector.type == :dictionary
21
+
22
+ raise DataFrameArgumentError, "Invalid argument: #{args}"
23
+ end
24
+
25
+ # slice and select some observations to create sub DataFrame
26
+ def slice(*args, &block)
27
+ slicer = args
28
+ if block
29
+ raise DataFrameArgumentError, 'Must not specify both arguments and block.' unless args.empty?
30
+
31
+ slicer = instance_eval(&block)
32
+ end
33
+ slicer = [slicer].flatten
34
+
35
+ raise DataFrameArgumentError, 'Empty dataframe' if empty?
36
+ return remove_all_values if slicer.empty? || slicer[0].nil?
37
+
38
+ vector = parse_to_vector(slicer)
39
+ if vector.boolean?
40
+ return filter_by_vector(vector.data) if vector.size == size
41
+
42
+ raise DataFrameArgumentError, "Size is not match in booleans: #{slicer}"
43
+ end
44
+ return take_by_array(vector) if vector.numeric?
45
+
46
+ raise DataFrameArgumentError, "Invalid argument #{slicer}"
47
+ end
48
+
49
+ # remove selected observations to create sub DataFrame
50
+ def remove(*args, &block)
51
+ remover = args
52
+ if block
53
+ raise DataFrameArgumentError, 'Must not specify both arguments and block.' unless args.empty?
54
+
55
+ remover = instance_eval(&block)
56
+ end
57
+ remover = [remover].flatten
58
+
59
+ raise DataFrameArgumentError, 'Empty dataframe' if empty?
60
+ return self if remover.empty? || remover[0].nil?
61
+
62
+ vector = parse_to_vector(remover)
63
+ if vector.boolean?
64
+ return filter_by_vector(vector.primitive_invert.data) if vector.size == size
65
+
66
+ raise DataFrameArgumentError, "Size is not match in booleans: #{remover}"
67
+ end
68
+ if vector.numeric?
69
+ raise DataFrameArgumentError, "Index out of range: #{vector.min}" if vector.min <= -size - 1
70
+
71
+ normalized_indices = (vector < 0).if_else(vector + size, vector) # normalize index from tail
72
+ if normalized_indices.max >= size
73
+ raise DataFrameArgumentError, "Index out of range: #{normalized_indices.max}"
16
74
  end
17
75
 
18
- return select_rows(expanded) if integers?(expanded)
19
- return select_columns(expanded.map(&:to_sym)) if sym_or_str?(expanded)
76
+ normalized_indices = normalized_indices.floor.to_a.map(&:to_i) # round to integer array
77
+ return remove_all_values if normalized_indices == indices
78
+ return self if normalized_indices.empty?
79
+
80
+ index_array = indices - normalized_indices
20
81
 
21
- raise DataFrameArgumentError, "Invalid argument #{args}"
82
+ datum = Arrow::Function.find(:take).execute([table, index_array])
83
+ return DataFrame.new(datum.value)
84
+ end
85
+
86
+ raise DataFrameArgumentError, "Invalid argument #{remover}"
87
+ end
88
+
89
+ def remove_nil
90
+ func = Arrow::Function.find(:drop_null)
91
+ DataFrame.new(func.execute([table]).value)
92
+ end
93
+ alias_method :drop_nil, :remove_nil
94
+
95
+ # Select a variable by a key in String or Symbol
96
+ def v(key)
97
+ unless key.is_a?(Symbol) || key.is_a?(String)
98
+ raise DataFrameArgumentError, "Key is not a Symbol or String [#{key}]"
99
+ end
100
+ raise DataFrameArgumentError, "Key not exist [#{key}]" unless key?(key)
101
+
102
+ variables[key.to_sym]
22
103
  end
23
104
 
24
- def head(n_rows = 5)
25
- raise DataFrameArgumentError, "Index is out of range #{n_rows}" if n_rows.negative?
105
+ def head(n_obs = 5)
106
+ raise DataFrameArgumentError, "Index is out of range #{n_obs}" if n_obs.negative?
26
107
 
27
- self[0...[n_rows, size].min]
108
+ self[0...[n_obs, size].min]
28
109
  end
29
110
 
30
- def tail(n_rows = 5)
31
- raise DataFrameArgumentError, "Index is out of range #{n_rows}" if n_rows.negative?
111
+ def tail(n_obs = 5)
112
+ raise DataFrameArgumentError, "Index is out of range #{n_obs}" if n_obs.negative?
32
113
 
33
- self[-[n_rows, size].min..]
114
+ self[-[n_obs, size].min..]
34
115
  end
35
116
 
36
- def first(n_rows = 1)
37
- head(n_rows)
117
+ def first(n_obs = 1)
118
+ head(n_obs)
38
119
  end
39
120
 
40
- def last(n_rows = 1)
41
- tail(n_rows)
121
+ def last(n_obs = 1)
122
+ tail(n_obs)
42
123
  end
43
124
 
44
- private # =====
125
+ # Undocumented
126
+ # TODO: support for option {boundscheck: true}
127
+ def take(*indices)
128
+ indices.flatten!
129
+ return remove_all_values if indices.empty?
45
130
 
46
- def select_columns(keys)
47
- if keys.one?
48
- t = @table[*keys]
49
- raise DataFrameArgumentError, "Key is not exists #{keys}" unless t
131
+ indices = indices[0] if indices.one? && !indices[0].is_a?(Numeric)
132
+ indices = Vector.new(indices) unless indices.is_a?(Vector)
50
133
 
51
- Vector.new(t.data)
52
- else
53
- DataFrame.new(@table[keys])
54
- end
134
+ take_by_array(indices)
55
135
  end
56
136
 
57
- def select_rows(indeces)
58
- out_of_range?(indeces) && raise(DataFrameArgumentError, "Invalid index: #{indeces} for 0..#{size - 1}")
137
+ # Undocumented
138
+ # TODO: support for option {null_selection_behavior: :drop}
139
+ def filter(*booleans)
140
+ booleans.flatten!
141
+ return remove_all_values if booleans.empty?
59
142
 
60
- a = indeces.map { |i| @table.slice(i).to_a }
61
- DataFrame.new(@table.schema, a)
143
+ b = booleans[0]
144
+ case b
145
+ when Vector
146
+ raise DataFrameArgumentError, 'Argument is not a boolean.' unless b.boolean?
147
+
148
+ filter_by_vector(b.data)
149
+ when Arrow::BooleanArray
150
+ filter_by_vector(b)
151
+ else
152
+ raise DataFrameArgumentError, 'Argument is not a boolean.' unless booleans?(booleans)
153
+
154
+ filter_by_vector(Arrow::BooleanArray.new(booleans))
155
+ end
62
156
  end
63
157
 
64
- def normalized_array(range)
65
- both_end = [range.begin, range.end]
66
- both_end[1] -= 1 if range.exclude_end? && range.end.is_a?(Integer)
158
+ private
67
159
 
68
- if both_end.any?(Integer) || both_end.all?(&:nil?)
69
- if both_end.any? { |e| e&.>=(size) || e&.<(-size) }
70
- raise DataFrameArgumentError, "Index out of range: #{range} for 0..#{size - 1}"
71
- end
160
+ def select_vars_by_keys(keys)
161
+ if keys.one?
162
+ key = keys[0].to_sym
163
+ raise DataFrameArgumentError, "Key does not exist #{keys}" unless key? key
72
164
 
73
- (0...size).to_a[range]
165
+ variables[key]
74
166
  else
75
- range.to_a
167
+ DataFrame.new(@table[keys])
76
168
  end
77
169
  end
78
170
 
79
- def out_of_range?(indeces)
80
- indeces.max >= size || indeces.min < -size
171
+ # Accepts indices by numeric Vector
172
+ def take_by_array(indices)
173
+ raise DataFrameArgumentError, "Indices must be a numeric Vector: #{indices}" unless indices.numeric?
174
+ raise DataFrameArgumentError, "Index out of range: #{indices.min}" if indices.min <= -size - 1
175
+
176
+ normalized_indices = (indices < 0).if_else(indices + size, indices) # normalize index from tail
177
+ raise DataFrameArgumentError, "Index out of range: #{normalized_indices.max}" if normalized_indices.max >= size
178
+
179
+ index_array = Arrow::UInt64ArrayBuilder.build(normalized_indices.data) # round to integer array
180
+
181
+ datum = Arrow::Function.find(:take).execute([table, index_array])
182
+ DataFrame.new(datum.value)
81
183
  end
82
184
 
83
- def integers?(enum)
84
- enum.all?(Integer)
185
+ # Accepts booleans by Arrow::BooleanArray
186
+ def filter_by_vector(boolean_array)
187
+ raise DataFrameArgumentError, 'Booleans must be same size as self.' unless boolean_array.length == size
188
+
189
+ datum = Arrow::Function.find(:filter).execute([table, boolean_array])
190
+ DataFrame.new(datum.value)
85
191
  end
86
192
 
87
- def sym_or_str?(enum)
88
- enum.all? { |e| e.is_a?(Symbol) || e.is_a?(String) }
193
+ # return a DataFrame with same keys as self without values
194
+ def remove_all_values
195
+ filter_by_vector(Arrow::BooleanArray.new([false] * size))
89
196
  end
90
197
  end
91
198
  end