red_amber 0.1.3 → 0.1.6
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.rubocop.yml +31 -7
- data/CHANGELOG.md +214 -10
- data/Gemfile +4 -0
- data/README.md +117 -342
- data/benchmark/csv_load_penguins.yml +15 -0
- data/benchmark/drop_nil.yml +11 -0
- data/doc/DataFrame.md +854 -0
- data/doc/Vector.md +449 -0
- data/doc/image/arrow_table_new.png +0 -0
- data/doc/image/dataframe/assign.png +0 -0
- data/doc/image/dataframe/drop.png +0 -0
- data/doc/image/dataframe/pick.png +0 -0
- data/doc/image/dataframe/remove.png +0 -0
- data/doc/image/dataframe/rename.png +0 -0
- data/doc/image/dataframe/slice.png +0 -0
- data/doc/image/dataframe_model.png +0 -0
- data/doc/image/example_in_red_arrow.png +0 -0
- data/doc/image/tdr.png +0 -0
- data/doc/image/tdr_and_table.png +0 -0
- data/doc/image/tidy_data_in_TDR.png +0 -0
- data/doc/image/vector/binary_element_wise.png +0 -0
- data/doc/image/vector/unary_aggregation.png +0 -0
- data/doc/image/vector/unary_aggregation_w_option.png +0 -0
- data/doc/image/vector/unary_element_wise.png +0 -0
- data/doc/tdr.md +56 -0
- data/doc/tdr_ja.md +56 -0
- data/lib/red-amber.rb +27 -0
- data/lib/red_amber/data_frame.rb +91 -37
- data/lib/red_amber/{data_frame_output.rb → data_frame_displayable.rb} +49 -41
- data/lib/red_amber/data_frame_indexable.rb +38 -0
- data/lib/red_amber/data_frame_observation_operation.rb +11 -0
- data/lib/red_amber/data_frame_selectable.rb +155 -48
- data/lib/red_amber/data_frame_variable_operation.rb +137 -0
- data/lib/red_amber/helper.rb +61 -0
- data/lib/red_amber/vector.rb +69 -16
- data/lib/red_amber/vector_functions.rb +80 -45
- data/lib/red_amber/vector_selectable.rb +124 -0
- data/lib/red_amber/vector_updatable.rb +104 -0
- data/lib/red_amber/version.rb +1 -1
- data/lib/red_amber.rb +1 -16
- data/red_amber.gemspec +3 -6
- metadata +38 -9
data/lib/red_amber/data_frame.rb
CHANGED
@@ -5,19 +5,23 @@ module RedAmber
|
|
5
5
|
# @table : holds Arrow::Table object
|
6
6
|
class DataFrame
|
7
7
|
# mix-in
|
8
|
+
include DataFrameDisplayable
|
9
|
+
include DataFrameIndexable
|
8
10
|
include DataFrameSelectable
|
9
|
-
include
|
11
|
+
include DataFrameObservationOperation
|
12
|
+
include DataFrameVariableOperation
|
13
|
+
include Helper
|
10
14
|
|
11
15
|
def initialize(*args)
|
12
|
-
|
13
|
-
# returns empty DataFrame
|
14
|
-
@table = Arrow::Table.new({}, [])
|
16
|
+
@variables = @keys = @vectors = @types = @data_types = nil
|
15
17
|
# bug in gobject-introspection: ruby-gnome/ruby-gnome#1472
|
16
18
|
# [Arrow::Table] == [nil] shows ArgumentError
|
17
19
|
# temporary use yoda condition to workaround
|
18
|
-
|
19
|
-
|
20
|
-
|
20
|
+
if args.empty? || args == [[]] || args == [{}] || [nil] == args
|
21
|
+
# DataFrame.new, DataFrame.new([]), DataFrame.new({}), DataFrame.new(nil)
|
22
|
+
# returns empty DataFrame
|
23
|
+
@table = Arrow::Table.new({}, [])
|
24
|
+
elsif args.size > 1
|
21
25
|
@table = Arrow::Table.new(*args)
|
22
26
|
else
|
23
27
|
arg = args[0]
|
@@ -39,67 +43,71 @@ module RedAmber
|
|
39
43
|
|
40
44
|
attr_reader :table
|
41
45
|
|
46
|
+
# Returns the wrapped table object (the same value the +table+ reader
# exposes).
#
# @return [Arrow::Table] the object held in @table
def to_arrow
  @table
end
|
49
|
+
|
42
50
|
# Saves the frame by delegating to @table.save.
#
# @param output destination, passed through to the table unchanged
# @param options [Hash] passed through to the table unchanged
# @return whatever @table.save returns
def save(output, options = {})
  @table.save(output, options)
end
|
45
53
|
|
46
|
-
|
47
|
-
def n_rows
|
54
|
+
# Number of observations (rows), as reported by the wrapped table.
#
# @return the value of @table.n_rows
def size
  @table.n_rows
end
|
50
|
-
alias_method :
|
51
|
-
alias_method :
|
52
|
-
alias_method :length, :n_rows
|
57
|
+
alias_method :n_rows, :size
|
58
|
+
alias_method :n_obs, :size
|
53
59
|
|
54
|
-
def
|
60
|
+
# Number of variables (columns), as reported by the wrapped table.
#
# @return the value of @table.n_columns
def n_keys
  @table.n_columns
end
|
57
|
-
alias_method :
|
58
|
-
alias_method :
|
63
|
+
alias_method :n_cols, :n_keys
|
64
|
+
alias_method :n_vars, :n_keys
|
59
65
|
|
60
66
|
# Dimensions of the frame.
#
# @return [Array] two elements: [size, n_keys]
def shape
  n_observations = size
  [n_observations, n_keys]
end
|
63
69
|
|
64
|
-
def
|
65
|
-
@
|
70
|
+
# Hash of the frame's variables, built on first use by
# init_instance_vars(:variables) and cached in @variables afterwards.
#
# @return [Hash] column key (Symbol) => Vector
def variables
  @variables ||= init_instance_vars(:variables)
end
|
67
|
-
alias_method :
|
68
|
-
|
73
|
+
alias_method :vars, :variables
|
74
|
+
|
75
|
+
# Column keys of the frame, built on first use by
# init_instance_vars(:keys) and cached in @keys afterwards.
#
# @return [Array<Symbol>]
def keys
  @keys ||= init_instance_vars(:keys)
end
|
78
|
+
alias_method :column_names, :keys
|
79
|
+
alias_method :var_names, :keys
|
69
80
|
|
70
81
|
# Tells whether the frame has a variable named +key+.
#
# Goes through the #keys reader rather than @keys: the visible part of
# #initialize resets @keys to nil and it is only filled lazily, so reading
# the instance variable directly fails with NoMethodError when this is the
# first accessor called on a fresh frame.
#
# @param key [Symbol, String] anything responding to #to_sym
# @return [Boolean]
def key?(key)
  keys.include?(key.to_sym)
end
|
73
84
|
alias_method :has_key?, :key?
|
74
85
|
|
75
86
|
# Position of the variable named +key+ among the frame's keys.
#
# Uses the #keys reader instead of @keys so the lookup also works before
# the key list has been memoized (@keys starts out as nil).
#
# @param key [Symbol, String] anything responding to #to_sym
# @return [Integer, nil] zero-based position, or nil when there is no such key
def key_index(key)
  keys.find_index(key.to_sym)
end
|
78
89
|
alias_method :find_index, :key_index
|
79
90
|
alias_method :index, :key_index
|
80
91
|
|
81
92
|
# Type nickname of every column, in column order.
#
# Each entry is column.data.value_type.nick converted to a Symbol. The list
# is computed once and cached in @types.
#
# @return [Array<Symbol>]
def types
  return @types if @types

  @types = @table.columns.map { |col| col.data.value_type.nick.to_sym }
end
|
86
95
|
|
87
|
-
def
|
88
|
-
@table.columns.map
|
89
|
-
column.data_type.class
|
90
|
-
end
|
96
|
+
# Class of each column's data_type object, in column order.
#
# Computed once and cached. Note that the cache lives in @data_types, not
# in an instance variable named after this method.
#
# @return [Array<Class>]
def type_classes
  return @data_types if @data_types

  @data_types = @table.columns.map { |col| col.data_type.class }
end
|
92
99
|
|
93
100
|
# The frame's columns as Vector objects, built on first use by
# init_instance_vars(:vectors) and cached in @vectors afterwards.
#
# @return [Array] one Vector per column, in column order
def vectors
  @vectors ||= init_instance_vars(:vectors)
end
|
103
|
+
|
104
|
+
# Zero-based positions of all observations.
#
# @return [Array<Integer>] 0, 1, ... size - 1 (empty when size is 0)
def indices
  size.times.to_a
end
|
107
|
+
alias_method :indexes, :indices
|
98
108
|
|
99
109
|
# Converts the frame into a plain Hash.
#
# @return [Hash] same keys as #variables, each value converted with #to_a
def to_h
  variables.to_h { |key, vector| [key, vector.to_a] }
end
|
104
112
|
|
105
113
|
def to_a
|
@@ -118,13 +126,59 @@ module RedAmber
|
|
118
126
|
end
|
119
127
|
|
120
128
|
# A frame counts as empty when it has no variables at all; the number of
# observations is not consulted.
#
# @return [Boolean]
def empty?
  variables.empty?
end
|
123
131
|
|
124
132
|
# Converts the frame to a Rover::DataFrame built from #to_h.
#
# @return [Rover::DataFrame]
def to_rover
  Rover::DataFrame.new(to_h)
end
|
127
135
|
|
128
|
-
|
136
|
+
# Rich-display hook returning a [mime_type, payload] pair.
#
# - an empty frame yields a plain-text placeholder;
# - when ENV['RED_AMBER_OUTPUT_MODE'] (default 'tdr') is 'table', the payload
#   is the HTML from #html_table;
# - otherwise the payload is #tdr_str, with tally: 0 for frames of at most
#   5 observations.
#
# 'iruby' is required lazily here, before anything else is evaluated.
#
# @return [Array<String>] [mime_type, payload]
def to_iruby
  require 'iruby'
  return ['text/plain', '(empty DataFrame)'] if empty?
  return ['text/html', html_table] if ENV.fetch('RED_AMBER_OUTPUT_MODE', 'tdr') == 'table'

  summary = size <= 5 ? tdr_str(tally: 0) : tdr_str
  ['text/plain', summary]
end
|
148
|
+
|
149
|
+
private
|
150
|
+
|
151
|
+
# Builds @variables, @keys and @vectors from the table's columns in one pass
# and returns the one requested.
#
# Every column becomes a Vector created from column.data; its key is the
# column name as a Symbol.
#
# @param var [Symbol] one of :variables, :keys or :vectors
# @return [Hash, Array] the freshly built collection named by +var+
def init_instance_vars(var)
  by_key = {}
  names = []
  column_vectors = []

  @table.columns.each do |column|
    vector = Vector.new(column.data)
    name = column.name.to_sym
    vector.key = name
    by_key[name] = vector
    names << name
    column_vectors << vector
  end

  built = [by_key, names, column_vectors]
  @variables, @keys, @vectors = built
  built[%i[variables keys vectors].index(var)]
end
|
164
|
+
|
165
|
+
# Builds the HTML used by #to_iruby in 'table' output mode.
#
# Frames with more than 8 observations are reduced via self[0..4, -4..-1]
# before rendering. For every vector reporting has_nil?, the values are
# replaced with display strings: nil becomes '<i>(nil)</i>', an empty string
# becomes '""', and the first run of whitespace in a value is wrapped in
# double quotes. The caption uses the size and key count of the full frame,
# not of the reduced one.
#
# NOTE(review): the block handed to #assign calls `vectors` without a
# receiver; whether that resolves to `reduced` or to self depends on how
# #assign evaluates its block, which is not visible here - confirm.
#
# @return [String] "<size> x <n_keys> vector(s) ; <html>"
def html_table
  reduced = size > 8 ? self[0..4, -4..-1] : self

  converted = reduced.assign do
    vectors.each_with_object({}) do |vector, assigner|
      next unless vector.has_nil?

      assigner[vector.key] = vector.to_a.map do |element|
        text = element.nil? ? '<i>(nil)</i>' : element.to_s # nil
        text = '""' if text.empty? # empty string
        text.sub(/(\s+)/, '"\1"') # blank spaces
      end
    end
  end

  html = IRuby::HTML.table(converted.to_h, maxrows: 8, maxcols: 15)
  "#{size} x #{n_keys} vector#{pl(n_keys)} ; #{html}"
end
|
129
183
|
end
|
130
184
|
end
|
@@ -4,7 +4,7 @@ require 'stringio'
|
|
4
4
|
|
5
5
|
module RedAmber
|
6
6
|
# mix-ins for the class DataFrame
|
7
|
-
module
|
7
|
+
module DataFrameDisplayable
|
8
8
|
# String form of the frame, delegated to the wrapped table.
#
# @return the result of @table.to_s
def to_s
  @table.to_s
end
|
@@ -13,19 +13,37 @@ module RedAmber
|
|
13
13
|
|
14
14
|
# def summary() end
|
15
15
|
|
16
|
-
def
|
17
|
-
|
16
|
+
# Inspection string: a "#<...>" header from shape_str(with_id: true),
# followed on the next line by either the table text (when
# ENV['RED_AMBER_OUTPUT_MODE'], default 'tdr', is 'table') or
# dataframe_info(3).
#
# @return [String]
def inspect
  table_mode = ENV.fetch('RED_AMBER_OUTPUT_MODE', 'tdr') == 'table'
  header = "#<#{shape_str(with_id: true)}>"
  body = table_mode ? to_s : dataframe_info(3)
  "#{header}\n#{body}"
end
|
19
23
|
|
20
|
-
# -
|
21
|
-
# -
|
22
|
-
# -
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
24
|
+
# Prints the summary produced by #tdr_str to standard output.
#
# @param limit maximum number of Vectors to show
# @param tally maximum level count for which tally mode is used
# @param elements maximum number of values previewed per vector
# @return [nil] the return value of puts
def tdr(limit = 10, tally: 5, elements: 5)
  puts tdr_str(limit, tally: tally, elements: elements)
end
|
30
|
+
|
31
|
+
# Summary text: the shape line from #shape_str, a newline, then the detail
# listing from #dataframe_info.
#
# @param limit forwarded to dataframe_info as its positional argument
# @param tally forwarded as tally_level:
# @param elements forwarded as max_element:
# @return [String]
def tdr_str(limit = 10, tally: 5, elements: 5)
  header = shape_str
  details = dataframe_info(limit, tally_level: tally, max_element: elements)
  "#{header}\n#{details}"
end
|
34
|
+
|
35
|
+
private # =====
|
36
|
+
|
37
|
+
# One-line description: "<class> : <size> x <n_keys> Vector(s)", or
# "<class> : (empty)" for an empty frame.
#
# @param with_id [Boolean] when true, appends ", 0x" plus the object_id as
#   16 zero-padded hexadecimal digits
# @return [String]
def shape_str(with_id: false)
  shape_info =
    if empty?
      '(empty)'
    else
      "#{size} x #{n_keys} Vector#{pl(n_keys)}"
    end
  id = with_id ? format(', 0x%016x', object_id) : ''
  "#{self.class} : #{shape_info}#{id}"
end
|
27
42
|
|
28
|
-
|
43
|
+
def dataframe_info(limit, tally_level: 5, max_element: 5)
|
44
|
+
return '' if empty?
|
45
|
+
|
46
|
+
limit = n_keys if [:all, -1].include? limit
|
29
47
|
|
30
48
|
tallys = vectors.map(&:tally)
|
31
49
|
levels = tallys.map(&:size)
|
@@ -34,52 +52,41 @@ module RedAmber
|
|
34
52
|
headers = { idx: '#', key: 'key', type: 'type', levels: 'level', data: 'data_preview' }
|
35
53
|
header_format = make_header_format(levels, headers, quoted_keys)
|
36
54
|
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
"#{self.class} : #{nrow} x #{ncol} #{vs}"
|
41
|
-
|
42
|
-
# 2nd row: show var counts by type
|
43
|
-
stringio.puts "#{vs} : #{var_type_count(type_groups).join(', ')}"
|
55
|
+
sio = StringIO.new # output string buffer
|
56
|
+
sio.puts "Vector#{pl(n_keys)} : #{var_type_count(type_groups).join(', ')}"
|
57
|
+
sio.printf header_format, *headers.values
|
44
58
|
|
45
|
-
# 3rd row: print header of rows
|
46
|
-
stringio.printf header_format, *headers.values
|
47
|
-
|
48
|
-
# 4th row ~: show details for each column (vector)
|
49
59
|
vectors.each.with_index do |vector, i|
|
60
|
+
if i >= limit
|
61
|
+
sio << " ... #{n_keys - i} more Vector#{pl(n_keys - i)} ...\n"
|
62
|
+
break
|
63
|
+
end
|
50
64
|
key = quoted_keys[i]
|
51
65
|
type = types[i]
|
52
66
|
type_group = type_groups[i]
|
53
67
|
data_tally = tallys[i]
|
54
|
-
|
55
68
|
a = case type_group
|
56
69
|
when :numeric, :string, :boolean
|
57
|
-
if data_tally.size <= tally_level && data_tally.size !=
|
70
|
+
if data_tally.size <= tally_level && data_tally.size != size
|
58
71
|
[data_tally.to_s]
|
59
72
|
else
|
60
|
-
[shorthand(vector,
|
73
|
+
[shorthand(vector, size, max_element)].concat na_string(vector)
|
61
74
|
end
|
62
75
|
else
|
63
|
-
shorthand(vector,
|
76
|
+
[shorthand(vector, size, max_element)]
|
64
77
|
end
|
65
|
-
|
78
|
+
sio.printf header_format, i + 1, key, type, data_tally.size, a.join(', ')
|
66
79
|
end
|
67
|
-
|
68
|
-
end
|
69
|
-
|
70
|
-
private # =====
|
71
|
-
|
72
|
-
def pl(num)
|
73
|
-
num > 1 ? 's' : ''
|
80
|
+
sio.string
|
74
81
|
end
|
75
82
|
|
76
83
|
# Builds the printf format shared by the header row and every vector row.
#
# Each column is made as wide as its longest entry or its header label,
# whichever is longer; the index column is as wide as n_keys printed in
# decimal. Index, key and type are left-aligned, the level count is
# right-aligned, and the data preview takes the rest of the line.
#
# @param levels [Array] level count per vector
# @param headers [Hash] header labels; :key, :type and :levels are read
# @param quoted_keys [Array<String>] keys as they will be printed
# @return [String] format string ending in a newline
def make_header_format(levels, headers, quoted_keys)
  # width of the wider of: longest entry, header label
  widest = ->(sizes, label) { [sizes.max, label.size].max }

  w_idx = n_keys.to_s.size
  w_key = widest.call(quoted_keys.map(&:size), headers[:key])
  w_type = widest.call(types.map(&:size), headers[:type])
  w_level = widest.call(levels.map { |level| level.to_s.size }, headers[:levels])
  "%-#{w_idx}s %-#{w_key}s %-#{w_type}s %#{w_level}s %s\n"
end
|
84
91
|
|
85
92
|
def type_group(data_type)
|
@@ -103,10 +110,11 @@ module RedAmber
|
|
103
110
|
a
|
104
111
|
end
|
105
112
|
|
106
|
-
def shorthand(vector,
|
107
|
-
|
113
|
+
# Bracketed preview of the first values of a vector.
#
# At most +max_element+ values are shown (at most 2 when vector.temporal?
# is true), each rendered with #inspect, so nil prints as "nil". When +size+
# exceeds that limit a trailing '... ' marker is added.
#
# @param vector object responding to #temporal? and #to_a
# @param size [Integer] total number of values, compared against the limit
# @param max_element [Integer] preview limit for non-temporal vectors
# @return [String] e.g. '[1, 2, ... ]'
def shorthand(vector, size, max_element)
  limit = vector.temporal? ? 2 : max_element
  shown = vector.to_a.first(limit).map(&:inspect)
  shown << '... ' if size > limit
  "[#{shown.join(', ')}]"
end
|
112
120
|
|
@@ -0,0 +1,38 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module RedAmber
  # mix-ins for the class DataFrame
  #
  # Index-based reordering: picking observations by position and sorting.
  # Relies on the host's @table.
  module DataFrameIndexable
    # Builds a new DataFrame from the observations at the given positions.
    #
    # @param indices positions to take; when the first argument is a Vector,
    #   its underlying data is used in place of the argument list
    # @return [RedAmber::DataFrame] self when called without arguments,
    #   otherwise a new frame
    def map_indices(*indices)
      return self if indices.empty?

      head = indices[0]
      new_dataframe_by(head.is_a?(Vector) ? head.data : indices)
    end

    # @param sort_keys [Arrow::SortKey]
    #   :key, "key" or "+key" denotes ascending,
    #   "-key" denotes descending order
    # @return [RedAmber::Vector] Sorted indices in Vector
    def sort_indices(*sort_keys)
      Vector.new(@table.sort_indices(sort_keys.flatten))
    end

    # @param sort_keys same notation as for #sort_indices
    # @return [RedAmber::DataFrame] Sorted DataFrame
    def sort(*sort_keys)
      order = @table.sort_indices(sort_keys.flatten)
      new_dataframe_by(order)
    end

    private

    # Runs the Arrow :take function over @table with +index_array+ and wraps
    # the resulting value in a new DataFrame.
    def new_dataframe_by(index_array)
      taken = Arrow::Function.find(:take).execute([@table, index_array])
      RedAmber::DataFrame.new(taken.value)
    end
  end
end
|
@@ -0,0 +1,11 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module RedAmber
  # mix-ins for the class DataFrame
  module DataFrameObservationOperation
    # Groups the table by +aggregating_keys+ and applies the aggregation
    # named +func+ to +target_keys+.
    #
    # The aggregation is dispatched with public_send, so only public methods
    # of the grouped object can be reached through a caller-supplied name.
    #
    # @param aggregating_keys key or array of keys, splatted into table.group
    # @param func [Symbol, String] name of the aggregation method to call on
    #   the grouped object
    # @param target_keys key or array of keys, splatted into the aggregation
    # @return [RedAmber::DataFrame] new frame wrapping the aggregation result
    # @raise [NoMethodError] when +func+ is not a public method of the
    #   grouped object
    def group(aggregating_keys, func, target_keys)
      grouped = table.group(*aggregating_keys)
      RedAmber::DataFrame.new(grouped.public_send(func, *target_keys))
    end
  end
end
|
@@ -3,89 +3,196 @@
|
|
3
3
|
module RedAmber
  # mix-in for the class DataFrame
  #
  # Selection of variables (columns) and observations (rows). The host is
  # expected to provide #size, #indices, #empty?, #variables, #key?, #table
  # and the helpers #parse_to_vector and #booleans?; none of them is defined
  # in this module.
  module DataFrameSelectable
    # select variables: [symbol] or [string]
    # select observations: [array of index], [range]
    #
    # The (flattened) arguments are converted by parse_to_vector and then
    # dispatched on the resulting vector's type:
    # - boolean: filter observations (length must equal #size)
    # - numeric: take observations by index
    # - string / dictionary: select variables by key
    #
    # @return [DataFrame, Vector] a single key returns that variable itself,
    #   everything else returns a DataFrame
    # @raise [DataFrameArgumentError] on an empty frame, a boolean selector
    #   of the wrong length, or an unsupported argument
    def [](*args)
      args.flatten!
      raise DataFrameArgumentError, 'Empty dataframe' if empty?
      return remove_all_values if args.empty? || args[0].nil?

      vector = parse_to_vector(args)
      if vector.boolean?
        return filter_by_vector(vector.data) if vector.size == size

        raise DataFrameArgumentError, "Size is not match in booleans: #{args}"
      end
      return take_by_array(vector) if vector.numeric?
      return select_vars_by_keys(vector.to_a.map(&:to_sym)) if vector.string? || vector.type == :dictionary

      raise DataFrameArgumentError, "Invalid argument: #{args}"
    end

    # slice and select some observations to create sub DataFrame
    #
    # The selector comes either from the arguments or from the block, which
    # is evaluated with instance_eval; giving both is an error. Booleans
    # filter, numbers are taken as indices.
    #
    # @return [DataFrame]
    # @raise [DataFrameArgumentError] when both arguments and a block are
    #   given, on an empty frame, or on an unsupported selector
    def slice(*args, &block)
      slicer = args
      if block
        raise DataFrameArgumentError, 'Must not specify both arguments and block.' unless args.empty?

        slicer = instance_eval(&block)
      end
      slicer = [slicer].flatten

      raise DataFrameArgumentError, 'Empty dataframe' if empty?
      return remove_all_values if slicer.empty? || slicer[0].nil?

      vector = parse_to_vector(slicer)
      if vector.boolean?
        return filter_by_vector(vector.data) if vector.size == size

        raise DataFrameArgumentError, "Size is not match in booleans: #{slicer}"
      end
      return take_by_array(vector) if vector.numeric?

      raise DataFrameArgumentError, "Invalid argument #{slicer}"
    end

    # remove selected observations to create sub DataFrame
    #
    # The selector comes either from the arguments or from the block
    # (instance_eval); giving both is an error. Booleans mark the rows to
    # drop; numbers are indices to drop, where negative values count from
    # the tail.
    #
    # @return [DataFrame] self when nothing is selected for removal
    # @raise [DataFrameArgumentError] when both arguments and a block are
    #   given, on an empty frame, an out-of-range index, or an unsupported
    #   selector
    def remove(*args, &block)
      remover = args
      if block
        raise DataFrameArgumentError, 'Must not specify both arguments and block.' unless args.empty?

        remover = instance_eval(&block)
      end
      remover = [remover].flatten

      raise DataFrameArgumentError, 'Empty dataframe' if empty?
      return self if remover.empty? || remover[0].nil?

      vector = parse_to_vector(remover)
      if vector.boolean?
        return filter_by_vector(vector.primitive_invert.data) if vector.size == size

        raise DataFrameArgumentError, "Size is not match in booleans: #{remover}"
      end
      if vector.numeric?
        raise DataFrameArgumentError, "Index out of range: #{vector.min}" if vector.min <= -size - 1

        normalized_indices = (vector < 0).if_else(vector + size, vector) # normalize index from tail
        if normalized_indices.max >= size
          raise DataFrameArgumentError, "Index out of range: #{normalized_indices.max}"
        end

        normalized_indices = normalized_indices.floor.to_a.map(&:to_i) # round to integer array

        # Decide on what is left, not on the request: the indices may name
        # every row in any order or with repeats, and an empty index array
        # must not be handed to the take function.
        index_array = indices - normalized_indices
        return remove_all_values if index_array.empty?
        return self if normalized_indices.empty?

        datum = Arrow::Function.find(:take).execute([table, index_array])
        return DataFrame.new(datum.value)
      end

      raise DataFrameArgumentError, "Invalid argument #{remover}"
    end

    # Drops observations via the Arrow :drop_null function applied to the
    # table.
    #
    # @return [DataFrame]
    def remove_nil
      func = Arrow::Function.find(:drop_null)
      DataFrame.new(func.execute([table]).value)
    end
    alias_method :drop_nil, :remove_nil

    # Select a variable by a key in String or Symbol
    #
    # @return the entry of #variables stored under the key
    # @raise [DataFrameArgumentError] when key is neither Symbol nor String,
    #   or when no such key exists
    def v(key)
      unless key.is_a?(Symbol) || key.is_a?(String)
        raise DataFrameArgumentError, "Key is not a Symbol or String [#{key}]"
      end
      raise DataFrameArgumentError, "Key not exist [#{key}]" unless key?(key)

      variables[key.to_sym]
    end

    # First +n_obs+ observations (all of them when the frame is shorter).
    #
    # @param n_obs [Integer] non-negative number of observations
    # @return [DataFrame]
    # @raise [DataFrameArgumentError] when n_obs is negative
    def head(n_obs = 5)
      raise DataFrameArgumentError, "Index is out of range #{n_obs}" if n_obs.negative?

      count = [n_obs, size].min
      # asking for nothing is routed through the explicit empty selection
      return self[] if count.zero?

      self[0...count]
    end

    # Last +n_obs+ observations (all of them when the frame is shorter).
    #
    # @param n_obs [Integer] non-negative number of observations
    # @return [DataFrame]
    # @raise [DataFrameArgumentError] when n_obs is negative
    def tail(n_obs = 5)
      raise DataFrameArgumentError, "Index is out of range #{n_obs}" if n_obs.negative?

      count = [n_obs, size].min
      # -0 == 0, so `self[-0..]` would be `self[0..]`: every row, not none
      return self[] if count.zero?

      self[-count..]
    end

    # Same as #head, defaulting to a single observation.
    def first(n_obs = 1)
      head(n_obs)
    end

    # Same as #tail, defaulting to a single observation.
    def last(n_obs = 1)
      tail(n_obs)
    end

    # Undocumented
    # TODO: support for option {boundscheck: true}
    #
    # Takes observations by index. A single non-Numeric argument (for
    # example a Vector) is used as the index collection itself; anything
    # that is not already a Vector is wrapped in one.
    #
    # @return [DataFrame]
    def take(*indices)
      indices.flatten!
      return remove_all_values if indices.empty?

      indices = indices[0] if indices.one? && !indices[0].is_a?(Numeric)
      indices = Vector.new(indices) unless indices.is_a?(Vector)

      take_by_array(indices)
    end

    # Undocumented
    # TODO: support for option {null_selection_behavior: :drop}
    #
    # Keeps the observations flagged by the booleans. Accepts a boolean
    # Vector, an Arrow::BooleanArray, or plain values accepted by the
    # booleans? helper.
    #
    # @return [DataFrame]
    # @raise [DataFrameArgumentError] when the argument is not boolean
    def filter(*booleans)
      booleans.flatten!
      return remove_all_values if booleans.empty?

      b = booleans[0]
      case b
      when Vector
        raise DataFrameArgumentError, 'Argument is not a boolean.' unless b.boolean?

        filter_by_vector(b.data)
      when Arrow::BooleanArray
        filter_by_vector(b)
      else
        raise DataFrameArgumentError, 'Argument is not a boolean.' unless booleans?(booleans)

        filter_by_vector(Arrow::BooleanArray.new(booleans))
      end
    end

    private

    # A single key returns that variable; several keys return a DataFrame
    # built from @table[keys].
    def select_vars_by_keys(keys)
      if keys.one?
        key = keys[0].to_sym
        raise DataFrameArgumentError, "Key does not exist #{keys}" unless key? key

        variables[key]
      else
        DataFrame.new(@table[keys])
      end
    end

    # Accepts indices by numeric Vector
    #
    # Negative indices count from the tail; anything outside -size...size
    # raises DataFrameArgumentError.
    def take_by_array(indices)
      raise DataFrameArgumentError, "Indices must be a numeric Vector: #{indices}" unless indices.numeric?
      raise DataFrameArgumentError, "Index out of range: #{indices.min}" if indices.min <= -size - 1

      normalized_indices = (indices < 0).if_else(indices + size, indices) # normalize index from tail
      raise DataFrameArgumentError, "Index out of range: #{normalized_indices.max}" if normalized_indices.max >= size

      index_array = Arrow::UInt64ArrayBuilder.build(normalized_indices.data) # round to integer array

      datum = Arrow::Function.find(:take).execute([table, index_array])
      DataFrame.new(datum.value)
    end

    # Accepts booleans by Arrow::BooleanArray
    #
    # @raise [DataFrameArgumentError] when the array length differs from size
    def filter_by_vector(boolean_array)
      raise DataFrameArgumentError, 'Booleans must be same size as self.' unless boolean_array.length == size

      datum = Arrow::Function.find(:filter).execute([table, boolean_array])
      DataFrame.new(datum.value)
    end

    # return a DataFrame with same keys as self without values
    def remove_all_values
      filter_by_vector(Arrow::BooleanArray.new([false] * size))
    end
  end
end
|