red_amber 0.1.3 → 0.1.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.rubocop.yml +31 -7
- data/CHANGELOG.md +214 -10
- data/Gemfile +4 -0
- data/README.md +117 -342
- data/benchmark/csv_load_penguins.yml +15 -0
- data/benchmark/drop_nil.yml +11 -0
- data/doc/DataFrame.md +854 -0
- data/doc/Vector.md +449 -0
- data/doc/image/arrow_table_new.png +0 -0
- data/doc/image/dataframe/assign.png +0 -0
- data/doc/image/dataframe/drop.png +0 -0
- data/doc/image/dataframe/pick.png +0 -0
- data/doc/image/dataframe/remove.png +0 -0
- data/doc/image/dataframe/rename.png +0 -0
- data/doc/image/dataframe/slice.png +0 -0
- data/doc/image/dataframe_model.png +0 -0
- data/doc/image/example_in_red_arrow.png +0 -0
- data/doc/image/tdr.png +0 -0
- data/doc/image/tdr_and_table.png +0 -0
- data/doc/image/tidy_data_in_TDR.png +0 -0
- data/doc/image/vector/binary_element_wise.png +0 -0
- data/doc/image/vector/unary_aggregation.png +0 -0
- data/doc/image/vector/unary_aggregation_w_option.png +0 -0
- data/doc/image/vector/unary_element_wise.png +0 -0
- data/doc/tdr.md +56 -0
- data/doc/tdr_ja.md +56 -0
- data/lib/red-amber.rb +27 -0
- data/lib/red_amber/data_frame.rb +91 -37
- data/lib/red_amber/{data_frame_output.rb → data_frame_displayable.rb} +49 -41
- data/lib/red_amber/data_frame_indexable.rb +38 -0
- data/lib/red_amber/data_frame_observation_operation.rb +11 -0
- data/lib/red_amber/data_frame_selectable.rb +155 -48
- data/lib/red_amber/data_frame_variable_operation.rb +137 -0
- data/lib/red_amber/helper.rb +61 -0
- data/lib/red_amber/vector.rb +69 -16
- data/lib/red_amber/vector_functions.rb +80 -45
- data/lib/red_amber/vector_selectable.rb +124 -0
- data/lib/red_amber/vector_updatable.rb +104 -0
- data/lib/red_amber/version.rb +1 -1
- data/lib/red_amber.rb +1 -16
- data/red_amber.gemspec +3 -6
- metadata +38 -9
data/lib/red_amber/data_frame.rb
CHANGED
@@ -5,19 +5,23 @@ module RedAmber
|
|
5
5
|
# @table : holds Arrow::Table object
|
6
6
|
class DataFrame
|
7
7
|
# mix-in
|
8
|
+
include DataFrameDisplayable
|
9
|
+
include DataFrameIndexable
|
8
10
|
include DataFrameSelectable
|
9
|
-
include
|
11
|
+
include DataFrameObservationOperation
|
12
|
+
include DataFrameVariableOperation
|
13
|
+
include Helper
|
10
14
|
|
11
15
|
def initialize(*args)
|
12
|
-
|
13
|
-
# returns empty DataFrame
|
14
|
-
@table = Arrow::Table.new({}, [])
|
16
|
+
@variables = @keys = @vectors = @types = @data_types = nil
|
15
17
|
# bug in gobject-introspection: ruby-gnome/ruby-gnome#1472
|
16
18
|
# [Arrow::Table] == [nil] shows ArgumentError
|
17
19
|
# temporary use yoda condition to workaround
|
18
|
-
|
19
|
-
|
20
|
-
|
20
|
+
if args.empty? || args == [[]] || args == [{}] || [nil] == args
|
21
|
+
# DataFrame.new, DataFrame.new([]), DataFrame.new({}), DataFrame.new(nil)
|
22
|
+
# returns empty DataFrame
|
23
|
+
@table = Arrow::Table.new({}, [])
|
24
|
+
elsif args.size > 1
|
21
25
|
@table = Arrow::Table.new(*args)
|
22
26
|
else
|
23
27
|
arg = args[0]
|
@@ -39,67 +43,71 @@ module RedAmber
|
|
39
43
|
|
40
44
|
attr_reader :table
|
41
45
|
|
46
|
+
# Return the table object wrapped by this DataFrame.
#
# @return [Arrow::Table] the object held in @table (the same object, not a copy)
def to_arrow
  table
end
|
49
|
+
|
42
50
|
# Delegate saving to the wrapped table.
#
# @param output destination, handed unchanged to the table's own #save
# @param options [Hash] handed unchanged to the table's own #save
# @return whatever the wrapped table's #save returns
def save(output, options = {})
  table.save(output, options)
end
|
45
53
|
|
46
|
-
|
47
|
-
def n_rows
|
54
|
+
# Number of rows, as reported by the wrapped table's #n_rows.
# Also available as #n_rows and #n_obs.
#
# @return [Integer]
def size
  table.n_rows
end
alias_method :n_rows, :size
alias_method :n_obs, :size
|
53
59
|
|
54
|
-
def
|
60
|
+
# Number of columns, as reported by the wrapped table's #n_columns.
# Also available as #n_cols and #n_vars.
#
# @return [Integer]
def n_keys
  table.n_columns
end
alias_method :n_cols, :n_keys
alias_method :n_vars, :n_keys
|
59
65
|
|
60
66
|
# Dimensions of the DataFrame.
#
# @return [Array] two elements: [#size, #n_keys]
def shape
  rows = size
  columns = n_keys
  [rows, columns]
end
|
63
69
|
|
64
|
-
def
|
65
|
-
@
|
70
|
+
# Hash of column key (Symbol) => Vector.
# Built on first use by #init_instance_vars and cached in @variables.
# Also available as #vars.
#
# @return [Hash]
def variables
  @variables ||= init_instance_vars(:variables)
end
alias_method :vars, :variables
|
74
|
+
|
75
|
+
# Column keys as Symbols, in column order.
# Built on first use by #init_instance_vars and cached in @keys.
# Also available as #column_names and #var_names.
#
# @return [Array<Symbol>]
def keys
  @keys ||= init_instance_vars(:keys)
end
alias_method :column_names, :keys
alias_method :var_names, :keys
|
69
80
|
|
70
81
|
# Tell whether a column with the given key exists.
#
# Goes through the #keys reader rather than @keys: the instance variable is
# nil until one of the lazy readers has run, so reading it directly raised
# NoMethodError on a freshly created DataFrame.
#
# @param key [Symbol, String] anything responding to #to_sym
# @return [Boolean]
def key?(key)
  keys.include?(key.to_sym)
end
|
73
84
|
alias_method :has_key?, :key?
|
74
85
|
|
75
86
|
# Position of the column with the given key.
#
# Goes through the #keys reader rather than @keys: the instance variable is
# nil until one of the lazy readers has run, so reading it directly raised
# NoMethodError on a freshly created DataFrame.
#
# @param key [Symbol, String] anything responding to #to_sym
# @return [Integer, nil] zero-based position in #keys, or nil when absent
def key_index(key)
  keys.find_index(key.to_sym)
end
|
78
89
|
alias_method :find_index, :key_index
|
79
90
|
alias_method :index, :key_index
|
80
91
|
|
81
92
|
# Per-column type names, in column order.
# Each entry is the symbolized `nick` of the column data's value type.
# Computed once and cached in @types.
#
# @return [Array<Symbol>]
def types
  @types ||= @table.columns.map do |column|
    column.data.value_type.nick.to_sym
  end
end
|
86
95
|
|
87
|
-
def
|
88
|
-
@table.columns.map
|
89
|
-
column.data_type.class
|
90
|
-
end
|
96
|
+
# Per-column data type classes, in column order.
# Computed once and cached (in @data_types).
#
# @return [Array<Class>] the class of each column's #data_type
def type_classes
  @data_types ||= @table.columns.map do |column|
    column.data_type.class
  end
end
|
92
99
|
|
93
100
|
# Vector objects for every column, in column order.
# Built on first use by #init_instance_vars and cached in @vectors.
#
# @return [Array<Vector>]
def vectors
  @vectors ||= init_instance_vars(:vectors)
end
|
103
|
+
|
104
|
+
# Zero-based row positions of this DataFrame: 0, 1, ..., size - 1.
# Also available as #indexes.
#
# @return [Array<Integer>]
def indices
  size.times.to_a
end
alias_method :indexes, :indices
|
98
108
|
|
99
109
|
# Convert to a plain Hash of column key => Array of values.
#
# @return [Hash] same keys as #variables, each Vector replaced by its #to_a
def to_h
  variables.to_h { |key, vector| [key, vector.to_a] }
end
|
104
112
|
|
105
113
|
def to_a
|
@@ -118,13 +126,59 @@ module RedAmber
|
|
118
126
|
end
|
119
127
|
|
120
128
|
# True when the DataFrame has no variables (columns).
#
# @return [Boolean]
def empty?
  variables.size.zero?
end
|
123
131
|
|
124
132
|
# Build a Rover::DataFrame from the Hash form of this DataFrame (#to_h).
#
# @return [Rover::DataFrame]
def to_rover
  columns = to_h
  Rover::DataFrame.new(columns)
end
|
127
135
|
|
128
|
-
|
136
|
+
# Produce a [mime_type, content] pair describing this DataFrame.
#
# - empty DataFrame: plain-text placeholder
# - ENV['RED_AMBER_OUTPUT_MODE'] == 'table': HTML table from #html_table
# - otherwise: plain-text TDR output; tally is set to 0 when size <= 5
#
# @return [Array<String>] two elements: mime type and rendered content
def to_iruby
  require 'iruby' # required here rather than at file load
  return ['text/plain', '(empty DataFrame)'] if empty?
  return ['text/html', html_table] if ENV.fetch('RED_AMBER_OUTPUT_MODE', 'tdr') == 'table'

  text = size <= 5 ? tdr_str(tally: 0) : tdr_str
  ['text/plain', text]
end
|
148
|
+
|
149
|
+
private
|
150
|
+
|
151
|
+
# Populate @variables, @keys and @vectors from the table's columns in a
# single pass, then return the one that was asked for.
#
# Every column becomes a Vector whose #key is set to the symbolized column name.
#
# @param var [Symbol] one of :variables, :keys, :vectors
# @return [Hash, Array] the freshly built collection selected by +var+
def init_instance_vars(var)
  by_key = {}
  names = []
  columns = []

  @table.columns.each do |column|
    name = column.name.to_sym
    vector = Vector.new(column.data)
    vector.key = name

    by_key[name] = vector
    names << name
    columns << vector
  end

  @variables = by_key
  @keys = names
  @vectors = columns
  [by_key, names, columns][%i[variables keys vectors].index(var)]
end
|
164
|
+
|
165
|
+
# Render the DataFrame as an HTML table string, prefixed with a shape summary.
#
# Frames with more than 8 rows are first reduced with self[0..4, -4..-1].
# For every vector that contains nil, values are replaced by display strings:
# nil becomes '<i>(nil)</i>', an empty string becomes '""', and the first run
# of whitespace in a value is wrapped in double quotes.
#
# @return [String] "<size> x <n_keys> vector(s) ; <html>"
def html_table
  reduced = size > 8 ? self[0..4, -4..-1] : self

  converted = reduced.assign do
    vectors.each_with_object({}) do |vector, assigner|
      next unless vector.has_nil?

      assigner[vector.key] = vector.to_a.map do |element|
        text = element.nil? ? '<i>(nil)</i>' : element.to_s # nil
        text = '""' if text.empty? # empty string
        text.sub(/(\s+)/, '"\1"') # blank spaces
      end
    end
  end

  html = IRuby::HTML.table(converted.to_h, maxrows: 8, maxcols: 15)
  "#{size} x #{n_keys} vector#{pl(n_keys)} ; #{html}"
end
|
129
183
|
end
|
130
184
|
end
|
@@ -4,7 +4,7 @@ require 'stringio'
|
|
4
4
|
|
5
5
|
module RedAmber
|
6
6
|
# mix-ins for the class DataFrame
|
7
|
-
module
|
7
|
+
module DataFrameDisplayable
|
8
8
|
# String form of the DataFrame: the wrapped table's own #to_s.
#
# @return [String]
def to_s
  table_text = @table.to_s
  table_text
end
|
@@ -13,19 +13,37 @@ module RedAmber
|
|
13
13
|
|
14
14
|
# def summary() end
|
15
15
|
|
16
|
-
def
|
17
|
-
|
16
|
+
# Inspection string: a "#<shape, object id>" header line followed by a body.
#
# The body is the table's string form when ENV['RED_AMBER_OUTPUT_MODE'] is
# 'table'; otherwise it is #dataframe_info limited to 3 vectors.
#
# @return [String]
def inspect
  header = "#<#{shape_str(with_id: true)}>"
  table_mode = ENV.fetch('RED_AMBER_OUTPUT_MODE', 'tdr') == 'table'
  body = table_mode ? to_s : dataframe_info(3)
  "#{header}\n#{body}"
end
|
19
23
|
|
20
|
-
# -
|
21
|
-
# -
|
22
|
-
# -
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
24
|
+
# Print the string built by #tdr_str to standard output.
#
# - limit: max num of Vectors to show
# - tally: max level to use tally mode
# - elements: max element to show values in each vector
#
# @return [nil] the result of puts
def tdr(limit = 10, tally: 5, elements: 5)
  report = tdr_str(limit, tally: tally, elements: elements)
  puts report
end
|
30
|
+
|
31
|
+
# Build the text that #tdr prints: the shape line, a newline, then the
# per-vector details from #dataframe_info.
#
# @param limit passed to #dataframe_info as its limit
# @param tally passed to #dataframe_info as tally_level
# @param elements passed to #dataframe_info as max_element
# @return [String]
def tdr_str(limit = 10, tally: 5, elements: 5)
  heading = shape_str
  details = dataframe_info(limit, tally_level: tally, max_element: elements)
  "#{heading}\n#{details}"
end
|
34
|
+
|
35
|
+
private # =====
|
36
|
+
|
37
|
+
# One-line summary: "<class> : <rows> x <cols> Vector(s)", or "(empty)" in
# place of the dimensions for an empty DataFrame.
#
# @param with_id [Boolean] when true, append ", 0x" and the object_id as
#   16 zero-padded hex digits
# @return [String]
def shape_str(with_id: false)
  summary = "#{self.class} : "
  summary += empty? ? '(empty)' : "#{size} x #{n_keys} Vector#{pl(n_keys)}"
  summary += format(', 0x%016x', object_id) if with_id
  summary
end
|
27
42
|
|
28
|
-
|
43
|
+
def dataframe_info(limit, tally_level: 5, max_element: 5)
|
44
|
+
return '' if empty?
|
45
|
+
|
46
|
+
limit = n_keys if [:all, -1].include? limit
|
29
47
|
|
30
48
|
tallys = vectors.map(&:tally)
|
31
49
|
levels = tallys.map(&:size)
|
@@ -34,52 +52,41 @@ module RedAmber
|
|
34
52
|
headers = { idx: '#', key: 'key', type: 'type', levels: 'level', data: 'data_preview' }
|
35
53
|
header_format = make_header_format(levels, headers, quoted_keys)
|
36
54
|
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
"#{self.class} : #{nrow} x #{ncol} #{vs}"
|
41
|
-
|
42
|
-
# 2nd row: show var counts by type
|
43
|
-
stringio.puts "#{vs} : #{var_type_count(type_groups).join(', ')}"
|
55
|
+
sio = StringIO.new # output string buffer
|
56
|
+
sio.puts "Vector#{pl(n_keys)} : #{var_type_count(type_groups).join(', ')}"
|
57
|
+
sio.printf header_format, *headers.values
|
44
58
|
|
45
|
-
# 3rd row: print header of rows
|
46
|
-
stringio.printf header_format, *headers.values
|
47
|
-
|
48
|
-
# 4th row ~: show details for each column (vector)
|
49
59
|
vectors.each.with_index do |vector, i|
|
60
|
+
if i >= limit
|
61
|
+
sio << " ... #{n_keys - i} more Vector#{pl(n_keys - i)} ...\n"
|
62
|
+
break
|
63
|
+
end
|
50
64
|
key = quoted_keys[i]
|
51
65
|
type = types[i]
|
52
66
|
type_group = type_groups[i]
|
53
67
|
data_tally = tallys[i]
|
54
|
-
|
55
68
|
a = case type_group
|
56
69
|
when :numeric, :string, :boolean
|
57
|
-
if data_tally.size <= tally_level && data_tally.size !=
|
70
|
+
if data_tally.size <= tally_level && data_tally.size != size
|
58
71
|
[data_tally.to_s]
|
59
72
|
else
|
60
|
-
[shorthand(vector,
|
73
|
+
[shorthand(vector, size, max_element)].concat na_string(vector)
|
61
74
|
end
|
62
75
|
else
|
63
|
-
shorthand(vector,
|
76
|
+
[shorthand(vector, size, max_element)]
|
64
77
|
end
|
65
|
-
|
78
|
+
sio.printf header_format, i + 1, key, type, data_tally.size, a.join(', ')
|
66
79
|
end
|
67
|
-
|
68
|
-
end
|
69
|
-
|
70
|
-
private # =====
|
71
|
-
|
72
|
-
def pl(num)
|
73
|
-
num > 1 ? 's' : ''
|
80
|
+
sio.string
|
74
81
|
end
|
75
82
|
|
76
83
|
# Build the printf format string shared by the header row and the data rows.
# Each column is as wide as its widest entry or its header label, whichever
# is longer; the index column is as wide as the digits of #n_keys.
#
# @param levels [Array] per-vector level counts (measured via #to_s)
# @param headers [Hash] header labels; :key, :type and :levels are read
# @param quoted_keys [Array<String>] keys as they will be displayed
# @return [String] format with five fields, ending in a newline
def make_header_format(levels, headers, quoted_keys)
  # find longest word to adjust width
  widest = ->(sizes, label) { [sizes.max, label.size].max }

  w_idx = n_keys.to_s.size
  w_key = widest.call(quoted_keys.map(&:size), headers[:key])
  w_type = widest.call(types.map(&:size), headers[:type])
  w_level = widest.call(levels.map { |level| level.to_s.size }, headers[:levels])
  "%-#{w_idx}s %-#{w_key}s %-#{w_type}s %#{w_level}s %s\n"
end
|
84
91
|
|
85
92
|
def type_group(data_type)
|
@@ -103,10 +110,11 @@ module RedAmber
|
|
103
110
|
a
|
104
111
|
end
|
105
112
|
|
106
|
-
def shorthand(vector,
|
107
|
-
|
113
|
+
# Bracketed preview of the first few elements of a vector.
#
# Temporal vectors show at most 2 elements, others at most +max_element+.
# Elements are rendered with #inspect (nil as 'nil'); when +size+ exceeds the
# number shown, a trailing '... ' entry is added.
#
# @param vector responds to #temporal? and #to_a
# @param size [Integer] total number of elements, used to decide on the ellipsis
# @param max_element [Integer] preview length for non-temporal vectors
# @return [String] e.g. "[1, 2, ... ]"
def shorthand(vector, size, max_element)
  shown = vector.temporal? ? 2 : max_element
  previews = vector.to_a.take(shown).map do |element|
    element.nil? ? 'nil' : element.inspect
  end
  previews << '... ' if size > shown
  "[#{previews.join(', ')}]"
end
|
112
120
|
|
@@ -0,0 +1,38 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module RedAmber
  # mix-ins for the class DataFrame
  module DataFrameIndexable
    # Common method: build a new DataFrame from the rows at the given indices.
    #
    # @param indices index values; when the first one is a Vector, its #data
    #   is used in place of the whole argument list
    # @return [RedAmber::DataFrame] self when no indices are given,
    #   otherwise a new DataFrame produced by the Arrow `take` function
    def map_indices(*indices)
      return self if indices.empty?

      head = indices[0]
      picked = head.is_a?(Vector) ? head.data : indices

      new_dataframe_by(picked)
    end

    # @param sort_keys [Arrow::SortKey]
    #   :key, "key" or "+key" denotes ascending,
    #   "-key" denotes descending order
    # @return [RedAmber::Vector] Sorted indices in Vector
    def sort_indices(*sort_keys)
      Vector.new(@table.sort_indices(sort_keys.flatten))
    end

    # @param sort_keys same forms as accepted by #sort_indices
    # @return [RedAmber::DataFrame] Sorted DataFrame
    def sort(*sort_keys)
      order = @table.sort_indices(sort_keys.flatten)

      new_dataframe_by(order)
    end

    private

    # Run the Arrow `take` function on the table with the given indices and
    # wrap the resulting value in a new DataFrame.
    def new_dataframe_by(index_array)
      take = Arrow::Function.find(:take)
      taken = take.execute([@table, index_array]).value
      RedAmber::DataFrame.new(taken)
    end
  end
end
|
@@ -0,0 +1,11 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module RedAmber
  # mix-ins for the class DataFrame
  module DataFrameObservationOperation
    # Group the table by some keys and apply one aggregation to the groups.
    #
    # @param aggregating_keys key or Array of keys, splatted into table.group
    # @param func [Symbol, String] name of the aggregation method to call on
    #   the grouped object
    # @param target_keys key or Array of keys, splatted into the aggregation call
    # @return [RedAmber::DataFrame] the aggregation result wrapped in a DataFrame
    # @raise [NoMethodError] when +func+ is not a public method of the grouped object
    def group(aggregating_keys, func, target_keys)
      grouped = table.group(*aggregating_keys)
      # public_send instead of send: +func+ comes from the caller and must not
      # be able to reach private methods of the grouped object.
      RedAmber::DataFrame.new(grouped.public_send(func, *target_keys))
    end
  end
end
|
@@ -3,89 +3,196 @@
|
|
3
3
|
module RedAmber
|
4
4
|
# mix-in for the class DataFrame
|
5
5
|
module DataFrameSelectable
|
6
|
-
# select
|
7
|
-
# select
|
6
|
+
# Select variables or observations, depending on what the (flattened)
# arguments parse to:
# - select variables: [symbol] or [string]
# - select observations: [array of index], [range], or booleans as long as self
#
# No argument, or a leading nil, returns #remove_all_values.
#
# @raise [DataFrameArgumentError] when self is empty, when booleans differ in
#   size from self, or when the arguments are of no supported kind
def [](*args)
  args.flatten!
  raise DataFrameArgumentError, 'Empty dataframe' if empty?
  return remove_all_values if args.empty? || args.first.nil?

  selector = parse_to_vector(args)
  if selector.boolean?
    raise DataFrameArgumentError, "Size is not match in booleans: #{args}" unless selector.size == size

    return filter_by_vector(selector.data)
  end
  return take_by_array(selector) if selector.numeric?

  by_key = selector.string? || selector.type == :dictionary
  raise DataFrameArgumentError, "Invalid argument: #{args}" unless by_key

  select_vars_by_keys(selector.to_a.map(&:to_sym))
end
|
24
|
+
|
25
|
+
# slice and select some observations to create sub DataFrame
#
# The selector comes either from the arguments or from the block, which is
# evaluated with instance_eval; giving both is an error. Booleans (same size
# as self) filter observations, numerics take them by index. An empty
# selector, or a leading nil, returns #remove_all_values.
#
# @raise [DataFrameArgumentError] on arguments plus block, empty self,
#   size-mismatched booleans, or an unsupported selector
def slice(*args, &block)
  if block
    raise DataFrameArgumentError, 'Must not specify both arguments and block.' unless args.empty?

    slicer = instance_eval(&block)
  else
    slicer = args
  end
  slicer = [slicer].flatten

  raise DataFrameArgumentError, 'Empty dataframe' if empty?
  return remove_all_values if slicer.empty? || slicer[0].nil?

  selector = parse_to_vector(slicer)
  if selector.boolean?
    raise DataFrameArgumentError, "Size is not match in booleans: #{slicer}" unless selector.size == size

    return filter_by_vector(selector.data)
  end
  raise DataFrameArgumentError, "Invalid argument #{slicer}" unless selector.numeric?

  take_by_array(selector)
end
|
48
|
+
|
49
|
+
# remove selected observations to create sub DataFrame
#
# The selector comes either from the arguments or from the block, which is
# evaluated with instance_eval; giving both is an error. An empty selector,
# or a leading nil, returns self.
# - booleans (same size as self): rows where the inverted boolean is true are kept
# - numerics: negative indices count from the tail; values are floored to
#   integers and the remaining row positions are taken
#
# @raise [DataFrameArgumentError] on arguments plus block, empty self,
#   size-mismatched booleans, out-of-range index, or an unsupported selector
def remove(*args, &block)
  if block
    raise DataFrameArgumentError, 'Must not specify both arguments and block.' unless args.empty?

    remover = instance_eval(&block)
  else
    remover = args
  end
  remover = [remover].flatten

  raise DataFrameArgumentError, 'Empty dataframe' if empty?
  return self if remover.empty? || remover[0].nil?

  vector = parse_to_vector(remover)
  if vector.boolean?
    raise DataFrameArgumentError, "Size is not match in booleans: #{remover}" unless vector.size == size

    return filter_by_vector(vector.primitive_invert.data)
  end
  raise DataFrameArgumentError, "Invalid argument #{remover}" unless vector.numeric?

  raise DataFrameArgumentError, "Index out of range: #{vector.min}" if vector.min <= -size - 1

  normalized = (vector < 0).if_else(vector + size, vector) # normalize index from tail
  raise DataFrameArgumentError, "Index out of range: #{normalized.max}" if normalized.max >= size

  targets = normalized.floor.to_a.map(&:to_i) # round to integer array
  return remove_all_values if targets == indices
  return self if targets.empty?

  kept = indices - targets
  datum = Arrow::Function.find(:take).execute([table, kept])
  DataFrame.new(datum.value)
end
|
88
|
+
|
89
|
+
# Apply the Arrow `drop_null` function to the table and wrap the result.
# Also available as #drop_nil.
#
# @return [DataFrame] a new DataFrame built from the function's result
def remove_nil
  dropped = Arrow::Function.find(:drop_null).execute([table]).value
  DataFrame.new(dropped)
end
alias_method :drop_nil, :remove_nil
|
94
|
+
|
95
|
+
# Select a variable by a key in String or Symbol
#
# @param key [Symbol, String]
# @return the entry of #variables stored under the symbolized key
# @raise [DataFrameArgumentError] when key is neither Symbol nor String,
#   or when no such key exists
def v(key)
  valid_type = key.is_a?(Symbol) || key.is_a?(String)
  raise DataFrameArgumentError, "Key is not a Symbol or String [#{key}]" unless valid_type
  raise DataFrameArgumentError, "Key not exist [#{key}]" unless key?(key)

  variables[key.to_sym]
end
|
23
104
|
|
24
|
-
def head(
|
25
|
-
raise DataFrameArgumentError, "Index is out of range #{
|
105
|
+
# Select observations from the start: self[0...n], with n capped at #size.
#
# @param n_obs [Integer] number of observations wanted (default 5)
# @raise [DataFrameArgumentError] when n_obs is negative
def head(n_obs = 5)
  raise DataFrameArgumentError, "Index is out of range #{n_obs}" if n_obs.negative?

  stop = [n_obs, size].min
  self[0...stop]
end
|
29
110
|
|
30
|
-
def tail(
|
31
|
-
raise DataFrameArgumentError, "Index is out of range #{
|
111
|
+
# Select observations from the end: self[-n..], with n capped at #size.
#
# @param n_obs [Integer] number of observations wanted (default 5)
# @return the result of #remove_all_values when n_obs is zero,
#   otherwise the result of self[-n..]
# @raise [DataFrameArgumentError] when n_obs is negative
def tail(n_obs = 5)
  raise DataFrameArgumentError, "Index is out of range #{n_obs}" if n_obs.negative?
  # `-0..` is the same range as `0..`, which would select every observation
  # instead of none, so zero is handled before building the range.
  return remove_all_values if n_obs.zero?

  self[-[n_obs, size].min..]
end
|
35
116
|
|
36
|
-
def first(
|
37
|
-
head(
|
117
|
+
# Same as #head, but defaulting to a single observation.
#
# @param n_obs [Integer] number of observations wanted (default 1)
def first(n_obs = 1)
  head n_obs
end
|
39
120
|
|
40
|
-
def last(
|
41
|
-
tail(
|
121
|
+
# Same as #tail, but defaulting to a single observation.
#
# @param n_obs [Integer] number of observations wanted (default 1)
def last(n_obs = 1)
  tail n_obs
end
|
43
124
|
|
44
|
-
|
125
|
+
# Undocumented
# TODO: support for option {boundscheck: true}
#
# Take observations by index. Arguments are flattened; with none left the
# result is #remove_all_values. A single non-Numeric argument is used as the
# index container itself; anything that is not already a Vector is wrapped
# in one before being handed to #take_by_array.
def take(*indices)
  indices.flatten!
  return remove_all_values if indices.empty?

  lone_container = indices.one? && !indices[0].is_a?(Numeric)
  picker = lone_container ? indices[0] : indices
  picker = Vector.new(picker) unless picker.is_a?(Vector)

  take_by_array(picker)
end
|
56
136
|
|
57
|
-
|
58
|
-
|
137
|
+
# Undocumented
# TODO: support for option {null_selection_behavior: :drop}
#
# Filter observations by booleans. Arguments are flattened; with none left
# the result is #remove_all_values. The first argument decides the handling:
# a boolean Vector contributes its #data, an Arrow::BooleanArray is used
# as-is, and anything else requires the whole list to pass #booleans? before
# it is turned into an Arrow::BooleanArray.
#
# @raise [DataFrameArgumentError] when the argument is not boolean
def filter(*booleans)
  booleans.flatten!
  return remove_all_values if booleans.empty?

  head = booleans[0]
  mask =
    case head
    when Vector
      raise DataFrameArgumentError, 'Argument is not a boolean.' unless head.boolean?

      head.data
    when Arrow::BooleanArray
      head
    else
      raise DataFrameArgumentError, 'Argument is not a boolean.' unless booleans?(booleans)

      Arrow::BooleanArray.new(booleans)
    end
  filter_by_vector(mask)
end
|
63
157
|
|
64
|
-
|
65
|
-
both_end = [range.begin, range.end]
|
66
|
-
both_end[1] -= 1 if range.exclude_end? && range.end.is_a?(Integer)
|
158
|
+
private
|
67
159
|
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
160
|
+
# Select variables by key.
#
# @param keys [Array] keys to select
# @return the matching entry of #variables when exactly one key is given,
#   otherwise a new DataFrame built from @table[keys]
# @raise [DataFrameArgumentError] when the single key does not exist
def select_vars_by_keys(keys)
  return DataFrame.new(@table[keys]) unless keys.one?

  key = keys[0].to_sym
  raise DataFrameArgumentError, "Key does not exist #{keys}" unless key? key

  variables[key]
end
|
78
170
|
|
79
|
-
|
80
|
-
|
171
|
+
# Accepts indices by numeric Vector
#
# Negative indices count from the tail. After normalization the indices are
# built into an Arrow::UInt64Array and passed to the Arrow `take` function.
#
# @param indices [Vector] numeric Vector of row positions
# @return [DataFrame] a new DataFrame built from the function's result
# @raise [DataFrameArgumentError] when indices is not numeric or out of range
def take_by_array(indices)
  raise DataFrameArgumentError, "Indices must be a numeric Vector: #{indices}" unless indices.numeric?
  raise DataFrameArgumentError, "Index out of range: #{indices.min}" if indices.min <= -size - 1

  from_head = (indices < 0).if_else(indices + size, indices) # normalize index from tail
  raise DataFrameArgumentError, "Index out of range: #{from_head.max}" if from_head.max >= size

  positions = Arrow::UInt64ArrayBuilder.build(from_head.data) # round to integer array

  taken = Arrow::Function.find(:take).execute([table, positions]).value
  DataFrame.new(taken)
end
|
82
184
|
|
83
|
-
|
84
|
-
|
185
|
+
# Accepts booleans by Arrow::BooleanArray
#
# Passes the table and the booleans to the Arrow `filter` function.
#
# @param boolean_array [Arrow::BooleanArray] must be as long as self
# @return [DataFrame] a new DataFrame built from the function's result
# @raise [DataFrameArgumentError] when the lengths differ
def filter_by_vector(boolean_array)
  raise DataFrameArgumentError, 'Booleans must be same size as self.' unless boolean_array.length == size

  filtered = Arrow::Function.find(:filter).execute([table, boolean_array]).value
  DataFrame.new(filtered)
end
|
86
192
|
|
87
|
-
|
88
|
-
|
193
|
+
# return a DataFrame with same keys as self without values
#
# Done by filtering with an all-false boolean array as long as self.
def remove_all_values
  all_false = Array.new(size, false)
  filter_by_vector(Arrow::BooleanArray.new(all_false))
end
|
90
197
|
end
|
91
198
|
end
|