red_amber 0.1.5 → 0.1.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.rubocop.yml +33 -5
- data/.rubocop_todo.yml +2 -15
- data/.yardopts +1 -0
- data/CHANGELOG.md +164 -18
- data/Gemfile +6 -1
- data/README.md +247 -33
- data/Rakefile +1 -0
- data/benchmark/csv_load_penguins.yml +1 -1
- data/doc/DataFrame.md +383 -219
- data/doc/Vector.md +247 -37
- data/doc/examples_of_red_amber.ipynb +5454 -0
- data/doc/image/dataframe/assign.png +0 -0
- data/doc/image/dataframe/drop.png +0 -0
- data/doc/image/dataframe/pick.png +0 -0
- data/doc/image/dataframe/remove.png +0 -0
- data/doc/image/dataframe/rename.png +0 -0
- data/doc/image/dataframe/slice.png +0 -0
- data/doc/image/dataframe_model.png +0 -0
- data/doc/image/vector/binary_element_wise.png +0 -0
- data/doc/image/vector/unary_aggregation.png +0 -0
- data/doc/image/vector/unary_aggregation_w_option.png +0 -0
- data/doc/image/vector/unary_element_wise.png +0 -0
- data/lib/red-amber.rb +3 -0
- data/lib/red_amber/data_frame.rb +62 -10
- data/lib/red_amber/data_frame_displayable.rb +86 -9
- data/lib/red_amber/data_frame_selectable.rb +151 -32
- data/lib/red_amber/data_frame_variable_operation.rb +4 -0
- data/lib/red_amber/group.rb +59 -0
- data/lib/red_amber/helper.rb +61 -0
- data/lib/red_amber/vector.rb +59 -15
- data/lib/red_amber/vector_functions.rb +47 -38
- data/lib/red_amber/vector_selectable.rb +126 -0
- data/lib/red_amber/vector_updatable.rb +125 -0
- data/lib/red_amber/version.rb +1 -1
- data/lib/red_amber.rb +6 -3
- data/red_amber.gemspec +0 -2
- metadata +9 -33
- data/lib/red_amber/data_frame_helper.rb +0 -64
- data/lib/red_amber/data_frame_observation_operation.rb +0 -83
- data/lib/red_amber/vector_compensable.rb +0 -68
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
data/lib/red-amber.rb
ADDED
data/lib/red_amber/data_frame.rb
CHANGED
@@ -6,18 +6,14 @@ module RedAmber
|
|
6
6
|
class DataFrame
|
7
7
|
# mix-in
|
8
8
|
include DataFrameDisplayable
|
9
|
-
include DataFrameHelper
|
10
9
|
include DataFrameIndexable
|
11
10
|
include DataFrameSelectable
|
12
|
-
include DataFrameObservationOperation
|
13
11
|
include DataFrameVariableOperation
|
12
|
+
include Helper
|
14
13
|
|
15
14
|
def initialize(*args)
|
16
15
|
@variables = @keys = @vectors = @types = @data_types = nil
|
17
|
-
|
18
|
-
# [Arrow::Table] == [nil] shows ArgumentError
|
19
|
-
# temporary use yoda condition to workaround
|
20
|
-
if args.empty? || args == [[]] || args == [{}] || [nil] == args
|
16
|
+
if args.empty? || args[0] == [] || args[0] == {} || args[0].nil?
|
21
17
|
# DataFrame.new, DataFrame.new([]), DataFrame.new({}), DataFrame.new(nil)
|
22
18
|
# returns empty DataFrame
|
23
19
|
@table = Arrow::Table.new({}, [])
|
@@ -35,6 +31,7 @@ module RedAmber
|
|
35
31
|
raise DataFrameTypeError, "invalid argument: #{arg}"
|
36
32
|
end
|
37
33
|
end
|
34
|
+
name_unnamed_keys
|
38
35
|
end
|
39
36
|
|
40
37
|
def self.load(path, options = {})
|
@@ -44,7 +41,7 @@ module RedAmber
|
|
44
41
|
attr_reader :table
|
45
42
|
|
46
43
|
def to_arrow
|
47
|
-
table
|
44
|
+
@table
|
48
45
|
end
|
49
46
|
|
50
47
|
def save(output, options = {})
|
@@ -101,10 +98,10 @@ module RedAmber
|
|
101
98
|
@vectors || @vectors = init_instance_vars(:vectors)
|
102
99
|
end
|
103
100
|
|
104
|
-
def
|
105
|
-
0...size
|
101
|
+
def indices
|
102
|
+
(0...size).to_a
|
106
103
|
end
|
107
|
-
alias_method :
|
104
|
+
alias_method :indexes, :indices
|
108
105
|
|
109
106
|
def to_h
|
110
107
|
variables.transform_values(&:to_a)
|
@@ -130,9 +127,27 @@ module RedAmber
|
|
130
127
|
end
|
131
128
|
|
132
129
|
def to_rover
|
130
|
+
require 'rover'
|
133
131
|
Rover::DataFrame.new(to_h)
|
134
132
|
end
|
135
133
|
|
134
|
+
def to_iruby
|
135
|
+
require 'iruby'
|
136
|
+
return ['text/plain', '(empty DataFrame)'] if empty?
|
137
|
+
|
138
|
+
if ENV.fetch('RED_AMBER_OUTPUT_MODE', 'Table') == 'TDR'
|
139
|
+
size <= 5 ? ['text/plain', tdr_str(tally: 0)] : ['text/plain', tdr_str]
|
140
|
+
else
|
141
|
+
['text/html', html_table]
|
142
|
+
end
|
143
|
+
end
|
144
|
+
|
145
|
+
def group(*group_keys, &block)
|
146
|
+
g = Group.new(self, group_keys)
|
147
|
+
g = g.summarize(&block) if block
|
148
|
+
g
|
149
|
+
end
|
150
|
+
|
136
151
|
private
|
137
152
|
|
138
153
|
# initialize @variables, @keys, @vectors and return one of them
|
@@ -148,5 +163,42 @@ module RedAmber
|
|
148
163
|
@variables, @keys, @vectors = ary
|
149
164
|
ary[%i[variables keys vectors].index(var)]
|
150
165
|
end
|
166
|
+
|
167
|
+
def html_table
|
168
|
+
reduced = size > 8 ? self[0..4, -4..-1] : self
|
169
|
+
|
170
|
+
converted = reduced.assign do
|
171
|
+
vectors.select.with_object({}) do |vector, assigner|
|
172
|
+
if vector.has_nil?
|
173
|
+
assigner[vector.key] = vector.to_a.map do |e|
|
174
|
+
e = e.nil? ? '<i>(nil)</i>' : e.to_s # nil
|
175
|
+
e = '""' if e.empty? # empty string
|
176
|
+
e.sub(/(\s+)/, '"\1"') # blank spaces
|
177
|
+
end
|
178
|
+
end
|
179
|
+
end
|
180
|
+
end
|
181
|
+
|
182
|
+
html = IRuby::HTML.table(converted.to_h, maxrows: 8, maxcols: 15)
|
183
|
+
"#{self.class} <#{size} x #{n_keys} vector#{pl(n_keys)}> #{html}"
|
184
|
+
end
|
185
|
+
|
186
|
+
def name_unnamed_keys
|
187
|
+
return unless @table[:'']
|
188
|
+
|
189
|
+
# We can't use #keys because it causes a mismatch between @table and @keys
|
190
|
+
keys = @table.schema.fields.map { |f| f.name.to_sym }
|
191
|
+
unnamed = (:unnamed1..).find { |e| !keys.include?(e) }
|
192
|
+
fields =
|
193
|
+
@table.schema.fields.map do |field|
|
194
|
+
if field.name.empty?
|
195
|
+
Arrow::Field.new(unnamed, field.data_type)
|
196
|
+
else
|
197
|
+
field
|
198
|
+
end
|
199
|
+
end
|
200
|
+
schema = Arrow::Schema.new(fields)
|
201
|
+
@table = Arrow::Table.new(schema, @table.columns)
|
202
|
+
end
|
151
203
|
end
|
152
204
|
end
|
@@ -5,8 +5,12 @@ require 'stringio'
|
|
5
5
|
module RedAmber
|
6
6
|
# mix-ins for the class DataFrame
|
7
7
|
module DataFrameDisplayable
|
8
|
+
INDEX_KEY = :index_key_for_format_table
|
9
|
+
|
8
10
|
def to_s
|
9
|
-
|
11
|
+
return '' if empty?
|
12
|
+
|
13
|
+
format_table(width: 80)
|
10
14
|
end
|
11
15
|
|
12
16
|
# def describe() end
|
@@ -14,7 +18,11 @@ module RedAmber
|
|
14
18
|
# def summary() end
|
15
19
|
|
16
20
|
def inspect
|
17
|
-
|
21
|
+
if ENV.fetch('RED_AMBER_OUTPUT_MODE', 'Table') == 'TDR'
|
22
|
+
"#<#{shape_str(with_id: true)}>\n#{dataframe_info(3)}"
|
23
|
+
else
|
24
|
+
"#<#{shape_str(with_id: true)}>\n#{self}"
|
25
|
+
end
|
18
26
|
end
|
19
27
|
|
20
28
|
# - limit: max num of Vectors to show
|
@@ -30,10 +38,6 @@ module RedAmber
|
|
30
38
|
|
31
39
|
private # =====
|
32
40
|
|
33
|
-
def pl(num)
|
34
|
-
num > 1 ? 's' : ''
|
35
|
-
end
|
36
|
-
|
37
41
|
def shape_str(with_id: false)
|
38
42
|
shape_info = empty? ? '(empty)' : "#{size} x #{n_keys} Vector#{pl(n_keys)}"
|
39
43
|
id = with_id ? format(', 0x%016x', object_id) : ''
|
@@ -81,12 +85,12 @@ module RedAmber
|
|
81
85
|
end
|
82
86
|
|
83
87
|
def make_header_format(levels, headers, quoted_keys)
|
84
|
-
# find longest word to adjust
|
88
|
+
# find longest word to adjust width
|
85
89
|
w_idx = n_keys.to_s.size
|
86
90
|
w_key = [quoted_keys.map(&:size).max, headers[:key].size].max
|
87
91
|
w_type = [types.map(&:size).max, headers[:type].size].max
|
88
|
-
|
89
|
-
"%-#{w_idx}s %-#{w_key}s %-#{w_type}s %#{
|
92
|
+
w_level = [levels.map { |l| l.to_s.size }.max, headers[:levels].size].max
|
93
|
+
"%-#{w_idx}s %-#{w_key}s %-#{w_type}s %#{w_level}s %s\n"
|
90
94
|
end
|
91
95
|
|
92
96
|
def type_group(data_type)
|
@@ -128,5 +132,78 @@ module RedAmber
|
|
128
132
|
a << "#{n_nil} nil#{pl(n_nil)}" unless n_nil.zero?
|
129
133
|
a
|
130
134
|
end
|
135
|
+
|
136
|
+
def format_table(width: 80)
|
137
|
+
head = 5
|
138
|
+
tail = 3
|
139
|
+
n_digit = 1
|
140
|
+
|
141
|
+
original = self
|
142
|
+
indices = size > head + tail ? [*0...head, *(size - tail)...size] : [*0...size]
|
143
|
+
df = slice(indices).assign do
|
144
|
+
assigner = { INDEX_KEY => indices.map { |i| (i + 1).to_s } }
|
145
|
+
vectors.each_with_object(assigner) do |v, a|
|
146
|
+
a[v.key] = v.to_a.map do |e|
|
147
|
+
if e.nil?
|
148
|
+
'(nil)'
|
149
|
+
elsif v.float?
|
150
|
+
e.round(n_digit).to_s
|
151
|
+
elsif v.string?
|
152
|
+
e
|
153
|
+
else
|
154
|
+
e.to_s
|
155
|
+
end
|
156
|
+
end
|
157
|
+
end
|
158
|
+
end
|
159
|
+
|
160
|
+
df = df.pick { [INDEX_KEY, keys - [INDEX_KEY]] }
|
161
|
+
df = size > head + tail ? df[0, 0, 0...head, 0, -tail..-1] : df[0, 0, 0..-1]
|
162
|
+
df = df.assign do
|
163
|
+
vectors.each_with_object({}) do |v, assigner|
|
164
|
+
vec = v.replace(0, v.key == INDEX_KEY ? '' : v.key.to_s)
|
165
|
+
.replace(1, v.key == INDEX_KEY ? '' : "<#{original[v.key].type}>")
|
166
|
+
assigner[v.key] = size > head + tail ? vec.replace(head + 2, ':') : vec
|
167
|
+
end
|
168
|
+
end
|
169
|
+
|
170
|
+
width_list = df.vectors.map { |v| v.to_a.map(&:length).max }
|
171
|
+
total_length = width_list[-1] # reserved for last column
|
172
|
+
|
173
|
+
formats = []
|
174
|
+
row_ellipsis = nil
|
175
|
+
df.vectors.each_with_index do |v, i|
|
176
|
+
w = width_list[i]
|
177
|
+
if total_length + w > width && i < df.n_keys - 1
|
178
|
+
row_ellipsis = i
|
179
|
+
formats << '%3s'
|
180
|
+
formats << format_for_column(df.vectors[-1], original, width_list[-1])
|
181
|
+
break
|
182
|
+
end
|
183
|
+
formats << format_for_column(v, original, w)
|
184
|
+
total_length += w
|
185
|
+
end
|
186
|
+
format_str = formats.join(' ')
|
187
|
+
|
188
|
+
str = StringIO.new
|
189
|
+
if row_ellipsis
|
190
|
+
df = df[df.keys[0..row_ellipsis], df.keys[-1]]
|
191
|
+
df = df.assign(df.keys[row_ellipsis] => ['...'] * df.size)
|
192
|
+
end
|
193
|
+
|
194
|
+
df.to_a.each do |row|
|
195
|
+
str.puts format(format_str, *row).rstrip
|
196
|
+
end
|
197
|
+
|
198
|
+
str.string
|
199
|
+
end
|
200
|
+
|
201
|
+
def format_for_column(vector, original, width)
|
202
|
+
if vector.key != INDEX_KEY && !original[vector.key].numeric?
|
203
|
+
"%-#{width}s"
|
204
|
+
else
|
205
|
+
"%#{width}s"
|
206
|
+
end
|
207
|
+
end
|
131
208
|
end
|
132
209
|
end
|
@@ -3,35 +3,94 @@
|
|
3
3
|
module RedAmber
|
4
4
|
# mix-in for the class DataFrame
|
5
5
|
module DataFrameSelectable
|
6
|
-
# select
|
7
|
-
# select
|
6
|
+
# select variables: [symbol] or [string]
|
7
|
+
# select observations: [array of index], [range]
|
8
8
|
def [](*args)
|
9
|
+
args.flatten!
|
9
10
|
raise DataFrameArgumentError, 'Empty dataframe' if empty?
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
11
|
+
return remove_all_values if args.empty? || args[0].nil?
|
12
|
+
|
13
|
+
vector = parse_to_vector(args)
|
14
|
+
if vector.boolean?
|
15
|
+
return filter_by_vector(vector.data) if vector.size == size
|
16
|
+
|
17
|
+
raise DataFrameArgumentError, "Size is not match in booleans: #{args}"
|
18
|
+
end
|
19
|
+
return take_by_array(vector) if vector.numeric?
|
20
|
+
return select_vars_by_keys(vector.to_a.map(&:to_sym)) if vector.string? || vector.type == :dictionary
|
21
|
+
|
22
|
+
raise DataFrameArgumentError, "Invalid argument: #{args}"
|
23
|
+
end
|
24
|
+
|
25
|
+
# slice and select some observations to create sub DataFrame
|
26
|
+
def slice(*args, &block)
|
27
|
+
slicer = args
|
28
|
+
if block
|
29
|
+
raise DataFrameArgumentError, 'Must not specify both arguments and block.' unless args.empty?
|
30
|
+
|
31
|
+
slicer = instance_eval(&block)
|
32
|
+
end
|
33
|
+
slicer = [slicer].flatten
|
34
|
+
|
35
|
+
raise DataFrameArgumentError, 'Empty dataframe' if empty?
|
36
|
+
return remove_all_values if slicer.empty? || slicer[0].nil?
|
37
|
+
|
38
|
+
vector = parse_to_vector(slicer)
|
39
|
+
if vector.boolean?
|
40
|
+
return filter_by_vector(vector.data) if vector.size == size
|
41
|
+
|
42
|
+
raise DataFrameArgumentError, "Size is not match in booleans: #{slicer}"
|
24
43
|
end
|
44
|
+
return take_by_array(vector) if vector.numeric?
|
45
|
+
|
46
|
+
raise DataFrameArgumentError, "Invalid argument #{slicer}"
|
47
|
+
end
|
48
|
+
|
49
|
+
# remove selected observations to create sub DataFrame
|
50
|
+
def remove(*args, &block)
|
51
|
+
remover = args
|
52
|
+
if block
|
53
|
+
raise DataFrameArgumentError, 'Must not specify both arguments and block.' unless args.empty?
|
54
|
+
|
55
|
+
remover = instance_eval(&block)
|
56
|
+
end
|
57
|
+
remover = [remover].flatten
|
58
|
+
|
59
|
+
raise DataFrameArgumentError, 'Empty dataframe' if empty?
|
60
|
+
return self if remover.empty? || remover[0].nil?
|
61
|
+
|
62
|
+
vector = parse_to_vector(remover)
|
63
|
+
if vector.boolean?
|
64
|
+
return filter_by_vector(vector.primitive_invert.data) if vector.size == size
|
65
|
+
|
66
|
+
raise DataFrameArgumentError, "Size is not match in booleans: #{remover}"
|
67
|
+
end
|
68
|
+
if vector.numeric?
|
69
|
+
raise DataFrameArgumentError, "Index out of range: #{vector.min}" if vector.min <= -size - 1
|
70
|
+
|
71
|
+
normalized_indices = (vector < 0).if_else(vector + size, vector) # normalize index from tail
|
72
|
+
if normalized_indices.max >= size
|
73
|
+
raise DataFrameArgumentError, "Index out of range: #{normalized_indices.max}"
|
74
|
+
end
|
25
75
|
|
26
|
-
|
76
|
+
normalized_indices = normalized_indices.floor.to_a.map(&:to_i) # round to integer array
|
77
|
+
return remove_all_values if normalized_indices == indices
|
78
|
+
return self if normalized_indices.empty?
|
27
79
|
|
28
|
-
|
29
|
-
expanded = expand_range(args)
|
30
|
-
return map_indices(*expanded) if integers?(expanded)
|
31
|
-
return select_vars_by_keys(expanded.map(&:to_sym)) if sym_or_str?(expanded)
|
80
|
+
index_array = indices - normalized_indices
|
32
81
|
|
33
|
-
|
82
|
+
datum = Arrow::Function.find(:take).execute([table, index_array])
|
83
|
+
return DataFrame.new(datum.value)
|
84
|
+
end
|
85
|
+
|
86
|
+
raise DataFrameArgumentError, "Invalid argument #{remover}"
|
87
|
+
end
|
88
|
+
|
89
|
+
def remove_nil
|
90
|
+
func = Arrow::Function.find(:drop_null)
|
91
|
+
DataFrame.new(func.execute([table]).value)
|
34
92
|
end
|
93
|
+
alias_method :drop_nil, :remove_nil
|
35
94
|
|
36
95
|
# Select a variable by a key in String or Symbol
|
37
96
|
def v(key)
|
@@ -43,24 +102,57 @@ module RedAmber
|
|
43
102
|
variables[key.to_sym]
|
44
103
|
end
|
45
104
|
|
46
|
-
def head(
|
47
|
-
raise DataFrameArgumentError, "Index is out of range #{
|
105
|
+
def head(n_obs = 5)
|
106
|
+
raise DataFrameArgumentError, "Index is out of range #{n_obs}" if n_obs.negative?
|
48
107
|
|
49
|
-
self[0...[
|
108
|
+
self[0...[n_obs, size].min]
|
50
109
|
end
|
51
110
|
|
52
|
-
def tail(
|
53
|
-
raise DataFrameArgumentError, "Index is out of range #{
|
111
|
+
def tail(n_obs = 5)
|
112
|
+
raise DataFrameArgumentError, "Index is out of range #{n_obs}" if n_obs.negative?
|
54
113
|
|
55
|
-
self[-[
|
114
|
+
self[-[n_obs, size].min..]
|
56
115
|
end
|
57
116
|
|
58
|
-
def first(
|
59
|
-
head(
|
117
|
+
def first(n_obs = 1)
|
118
|
+
head(n_obs)
|
60
119
|
end
|
61
120
|
|
62
|
-
def last(
|
63
|
-
tail(
|
121
|
+
def last(n_obs = 1)
|
122
|
+
tail(n_obs)
|
123
|
+
end
|
124
|
+
|
125
|
+
# Undocumented
|
126
|
+
# TODO: support for option {boundscheck: true}
|
127
|
+
def take(*indices)
|
128
|
+
indices.flatten!
|
129
|
+
return remove_all_values if indices.empty?
|
130
|
+
|
131
|
+
indices = indices[0] if indices.one? && !indices[0].is_a?(Numeric)
|
132
|
+
indices = Vector.new(indices) unless indices.is_a?(Vector)
|
133
|
+
|
134
|
+
take_by_array(indices)
|
135
|
+
end
|
136
|
+
|
137
|
+
# Undocumented
|
138
|
+
# TODO: support for option {null_selection_behavior: :drop}
|
139
|
+
def filter(*booleans)
|
140
|
+
booleans.flatten!
|
141
|
+
return remove_all_values if booleans.empty?
|
142
|
+
|
143
|
+
b = booleans[0]
|
144
|
+
case b
|
145
|
+
when Vector
|
146
|
+
raise DataFrameArgumentError, 'Argument is not a boolean.' unless b.boolean?
|
147
|
+
|
148
|
+
filter_by_vector(b.data)
|
149
|
+
when Arrow::BooleanArray
|
150
|
+
filter_by_vector(b)
|
151
|
+
else
|
152
|
+
raise DataFrameArgumentError, 'Argument is not a boolean.' unless booleans?(booleans)
|
153
|
+
|
154
|
+
filter_by_vector(Arrow::BooleanArray.new(booleans))
|
155
|
+
end
|
64
156
|
end
|
65
157
|
|
66
158
|
private
|
@@ -75,5 +167,32 @@ module RedAmber
|
|
75
167
|
DataFrame.new(@table[keys])
|
76
168
|
end
|
77
169
|
end
|
170
|
+
|
171
|
+
# Accepts indices as a numeric Vector
|
172
|
+
def take_by_array(indices)
|
173
|
+
raise DataFrameArgumentError, "Indices must be a numeric Vector: #{indices}" unless indices.numeric?
|
174
|
+
raise DataFrameArgumentError, "Index out of range: #{indices.min}" if indices.min <= -size - 1
|
175
|
+
|
176
|
+
normalized_indices = (indices < 0).if_else(indices + size, indices) # normalize index from tail
|
177
|
+
raise DataFrameArgumentError, "Index out of range: #{normalized_indices.max}" if normalized_indices.max >= size
|
178
|
+
|
179
|
+
index_array = Arrow::UInt64ArrayBuilder.build(normalized_indices.data) # round to integer array
|
180
|
+
|
181
|
+
datum = Arrow::Function.find(:take).execute([table, index_array])
|
182
|
+
DataFrame.new(datum.value)
|
183
|
+
end
|
184
|
+
|
185
|
+
# Accepts booleans as an Arrow::BooleanArray
|
186
|
+
def filter_by_vector(boolean_array)
|
187
|
+
raise DataFrameArgumentError, 'Booleans must be same size as self.' unless boolean_array.length == size
|
188
|
+
|
189
|
+
datum = Arrow::Function.find(:filter).execute([table, boolean_array])
|
190
|
+
DataFrame.new(datum.value)
|
191
|
+
end
|
192
|
+
|
193
|
+
# return a DataFrame with the same keys as self but without values
|
194
|
+
def remove_all_values
|
195
|
+
filter_by_vector(Arrow::BooleanArray.new([false] * size))
|
196
|
+
end
|
78
197
|
end
|
79
198
|
end
|
@@ -0,0 +1,59 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module RedAmber
|
4
|
+
# group class
|
5
|
+
class Group
|
6
|
+
def initialize(dataframe, *group_keys)
|
7
|
+
@dataframe = dataframe
|
8
|
+
@table = @dataframe.table
|
9
|
+
@group_keys = group_keys.flatten
|
10
|
+
|
11
|
+
raise GroupArgumentError, 'group_keys is empty.' if @group_keys.empty?
|
12
|
+
|
13
|
+
d = @group_keys - @dataframe.keys
|
14
|
+
raise GroupArgumentError, "#{d} is not a key of\n #{@dataframe}." unless d.empty?
|
15
|
+
|
16
|
+
@group = @table.group(*@group_keys)
|
17
|
+
end
|
18
|
+
|
19
|
+
functions = %i[count sum product mean min max stddev variance]
|
20
|
+
functions.each do |function|
|
21
|
+
define_method(function) do |*summary_keys|
|
22
|
+
by(function, summary_keys)
|
23
|
+
end
|
24
|
+
end
|
25
|
+
|
26
|
+
def inspect
|
27
|
+
tallys = @dataframe.pick(@group_keys).vectors.map.with_object({}) do |v, h|
|
28
|
+
h[v.key] = v.tally
|
29
|
+
end
|
30
|
+
"#<#{self.class}:#{format('0x%016x', object_id)}\n#{tallys}>"
|
31
|
+
end
|
32
|
+
|
33
|
+
def summarize(&block)
|
34
|
+
agg = instance_eval(&block)
|
35
|
+
case agg
|
36
|
+
when DataFrame
|
37
|
+
agg
|
38
|
+
when Array
|
39
|
+
agg.reduce { |aggregated, df| aggregated.assign(df.to_h) }
|
40
|
+
else
|
41
|
+
raise GroupArgumentError, "Unknown argument: #{agg}"
|
42
|
+
end
|
43
|
+
end
|
44
|
+
|
45
|
+
private
|
46
|
+
|
47
|
+
def by(func, summary_keys)
|
48
|
+
summary_keys = Array(summary_keys).flatten
|
49
|
+
d = summary_keys - @dataframe.keys
|
50
|
+
raise GroupArgumentError, "#{d} is not a key of\n #{@dataframe}." unless summary_keys.empty? || d.empty?
|
51
|
+
|
52
|
+
df = RedAmber::DataFrame.new(@group.send(func, *summary_keys))
|
53
|
+
df = df[df.keys[-1], df.keys[0...-1]]
|
54
|
+
# if counts are the same (no nil included), aggregate count columns.
|
55
|
+
df = df[df.keys[0..1]].rename(df.keys[1], :count) if func == :count && df.to_h.values[1..].uniq.size == 1
|
56
|
+
df
|
57
|
+
end
|
58
|
+
end
|
59
|
+
end
|
@@ -0,0 +1,61 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module RedAmber
|
4
|
+
# mix-in for the class DataFrame
|
5
|
+
module Helper
|
6
|
+
private
|
7
|
+
|
8
|
+
def pl(num)
|
9
|
+
num > 1 ? 's' : ''
|
10
|
+
end
|
11
|
+
|
12
|
+
def out_of_range?(indeces)
|
13
|
+
indeces.max >= size || indeces.min < -size
|
14
|
+
end
|
15
|
+
|
16
|
+
def integers?(enum)
|
17
|
+
enum.all?(Integer)
|
18
|
+
end
|
19
|
+
|
20
|
+
def sym_or_str?(enum)
|
21
|
+
enum.all? { |e| e.is_a?(Symbol) || e.is_a?(String) }
|
22
|
+
end
|
23
|
+
|
24
|
+
def booleans?(enum)
|
25
|
+
enum.all? { |e| e.is_a?(TrueClass) || e.is_a?(FalseClass) || e.is_a?(NilClass) }
|
26
|
+
end
|
27
|
+
|
28
|
+
def create_dataframe_from_vector(key, vector)
|
29
|
+
DataFrame.new(key => vector.data)
|
30
|
+
end
|
31
|
+
|
32
|
+
def parse_to_vector(args)
|
33
|
+
a = args.reduce([]) do |accum, elem|
|
34
|
+
accum.concat(normalize_element(elem))
|
35
|
+
end
|
36
|
+
Vector.new(a)
|
37
|
+
end
|
38
|
+
|
39
|
+
def normalize_element(elem)
|
40
|
+
case elem
|
41
|
+
when Numeric, String, Symbol, TrueClass, FalseClass, NilClass
|
42
|
+
[elem]
|
43
|
+
when Range
|
44
|
+
both_end = [elem.begin, elem.end]
|
45
|
+
both_end[1] -= 1 if elem.exclude_end? && elem.end.is_a?(Integer)
|
46
|
+
|
47
|
+
if both_end.any?(Integer) || both_end.all?(&:nil?)
|
48
|
+
if both_end.any? { |e| e&.>=(size) || e&.<(-size) }
|
49
|
+
raise DataFrameArgumentError, "Index out of range: #{elem} for 0..#{size - 1}"
|
50
|
+
end
|
51
|
+
|
52
|
+
(0...size).to_a[elem]
|
53
|
+
else
|
54
|
+
elem.to_a
|
55
|
+
end
|
56
|
+
else
|
57
|
+
Array(elem)
|
58
|
+
end
|
59
|
+
end
|
60
|
+
end
|
61
|
+
end
|