red_amber 0.1.8 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.rubocop.yml +3 -1
- data/CHANGELOG.md +71 -2
- data/Gemfile +1 -1
- data/README.md +58 -33
- data/doc/DataFrame.md +196 -55
- data/doc/Vector.md +5 -1
- data/doc/examples_of_red_amber.ipynb +1677 -348
- data/lib/red_amber/data_frame.rb +92 -15
- data/lib/red_amber/data_frame_displayable.rb +25 -10
- data/lib/red_amber/data_frame_reshaping.rb +85 -0
- data/lib/red_amber/data_frame_variable_operation.rb +89 -40
- data/lib/red_amber/group.rb +5 -1
- data/lib/red_amber/vector_functions.rb +46 -1
- data/lib/red_amber/vector_selectable.rb +1 -1
- data/lib/red_amber/version.rb +1 -1
- data/lib/red_amber.rb +1 -1
- data/red_amber.gemspec +1 -1
- metadata +5 -4
data/lib/red_amber/data_frame.rb
CHANGED
@@ -1,35 +1,55 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
2
|
|
3
3
|
module RedAmber
|
4
|
-
# data frame
|
5
|
-
#
|
4
|
+
# Class to represent a data frame.
|
5
|
+
# Variable @table holds an Arrow::Table object.
|
6
6
|
class DataFrame
|
7
7
|
# mix-in
|
8
8
|
include DataFrameDisplayable
|
9
9
|
include DataFrameIndexable
|
10
|
+
include DataFrameReshaping
|
10
11
|
include DataFrameSelectable
|
11
12
|
include DataFrameVariableOperation
|
12
13
|
include Helper
|
13
14
|
|
15
|
+
# Creates a new RedAmber::DataFrame.
|
16
|
+
#
|
17
|
+
# @overload initialize(hash)
|
18
|
+
#
|
19
|
+
# @param hash [Hash]
|
20
|
+
#
|
21
|
+
# @overload initialize(table)
|
22
|
+
#
|
23
|
+
# @param table [Arrow::Table]
|
24
|
+
#
|
25
|
+
# @overload initialize(dataframe)
|
26
|
+
#
|
27
|
+
# @param dataframe [RedAmber::DataFrame, Rover::DataFrame]
|
28
|
+
#
|
29
|
+
# @overload initialize(null)
|
30
|
+
#
|
31
|
+
# @param null [NilClass] No arguments.
|
32
|
+
#
|
14
33
|
def initialize(*args)
|
15
34
|
@variables = @keys = @vectors = @types = @data_types = nil
|
16
|
-
|
35
|
+
case args
|
36
|
+
in nil | [nil] | [] | {} | [[]] | [{}]
|
17
37
|
# DataFrame.new, DataFrame.new([]), DataFrame.new({}), DataFrame.new(nil)
|
18
38
|
# returns empty DataFrame
|
19
39
|
@table = Arrow::Table.new({}, [])
|
20
|
-
|
21
|
-
@table =
|
40
|
+
in [Arrow::Table => table]
|
41
|
+
@table = table
|
42
|
+
in [DataFrame => dataframe]
|
43
|
+
@table = dataframe.table
|
44
|
+
in [rover_or_hash]
|
45
|
+
begin
|
46
|
+
# Accepts Rover::DataFrame or Hash
|
47
|
+
@table = Arrow::Table.new(rover_or_hash.to_h)
|
48
|
+
rescue StandardError
|
49
|
+
raise DataFrameTypeError, "invalid argument: #{rover_or_hash}"
|
50
|
+
end
|
22
51
|
else
|
23
|
-
|
24
|
-
@table =
|
25
|
-
case arg
|
26
|
-
when Arrow::Table then arg
|
27
|
-
when DataFrame then arg.table
|
28
|
-
when Rover::DataFrame then Arrow::Table.new(arg.to_h)
|
29
|
-
when Hash then Arrow::Table.new(arg)
|
30
|
-
else
|
31
|
-
raise DataFrameTypeError, "invalid argument: #{arg}"
|
32
|
-
end
|
52
|
+
@table = Arrow::Table.new(*args)
|
33
53
|
end
|
34
54
|
name_unnamed_keys
|
35
55
|
end
|
@@ -48,56 +68,101 @@ module RedAmber
|
|
48
68
|
@table.save(output, options)
|
49
69
|
end
|
50
70
|
|
71
|
+
# Returns the number of rows.
|
72
|
+
#
|
73
|
+
# @return [Integer] Number of rows.
|
51
74
|
def size
|
52
75
|
@table.n_rows
|
53
76
|
end
|
54
77
|
alias_method :n_rows, :size
|
55
78
|
alias_method :n_obs, :size
|
56
79
|
|
80
|
+
# Returns the number of columns.
|
81
|
+
#
|
82
|
+
# @return [Integer] Number of columns.
|
57
83
|
def n_keys
|
58
84
|
@table.n_columns
|
59
85
|
end
|
60
86
|
alias_method :n_cols, :n_keys
|
61
87
|
alias_method :n_vars, :n_keys
|
62
88
|
|
89
|
+
# Returns the numbers of rows and columns.
|
90
|
+
#
|
91
|
+
# @return [Array]
|
92
|
+
# Number of rows and number of columns in an array.
|
93
|
+
# Same as [size, n_keys].
|
63
94
|
def shape
|
64
95
|
[size, n_keys]
|
65
96
|
end
|
66
97
|
|
98
|
+
# Returns a Hash of key and Vector pairs in the columns.
|
99
|
+
#
|
100
|
+
# @return [Hash]
|
101
|
+
# key => Vector pairs for each column.
|
67
102
|
def variables
|
68
103
|
@variables || @variables = init_instance_vars(:variables)
|
69
104
|
end
|
70
105
|
alias_method :vars, :variables
|
71
106
|
|
107
|
+
# Returns an Array of keys.
|
108
|
+
#
|
109
|
+
# @return [Array]
|
110
|
+
# Keys in an Array.
|
72
111
|
def keys
|
73
112
|
@keys || @keys = init_instance_vars(:keys)
|
74
113
|
end
|
75
114
|
alias_method :column_names, :keys
|
76
115
|
alias_method :var_names, :keys
|
77
116
|
|
117
|
+
# Returns true if self has a specified key in the argument.
|
118
|
+
#
|
119
|
+
# @param key [Symbol, String] Key to test.
|
120
|
+
# @return [Boolean]
|
121
|
+
# Returns true if self has key in Symbol.
|
78
122
|
def key?(key)
|
79
123
|
keys.include?(key.to_sym)
|
80
124
|
end
|
81
125
|
alias_method :has_key?, :key?
|
82
126
|
|
127
|
+
# Returns index of specified key in the Array keys.
|
128
|
+
#
|
129
|
+
# @param key [Symbol, String] key to know.
|
130
|
+
# @return [Integer]
|
131
|
+
# Index of key in the Array keys.
|
83
132
|
def key_index(key)
|
84
133
|
keys.find_index(key.to_sym)
|
85
134
|
end
|
86
135
|
alias_method :find_index, :key_index
|
87
136
|
alias_method :index, :key_index
|
88
137
|
|
138
|
+
# Returns abbreviated type names in an Array.
|
139
|
+
#
|
140
|
+
# @return [Array]
|
141
|
+
# Abbreviated Red Arrow data type names.
|
89
142
|
def types
|
90
143
|
@types || @types = @table.columns.map { |column| column.data.value_type.nick.to_sym }
|
91
144
|
end
|
92
145
|
|
146
|
+
# Returns an Array of Classes of data type.
|
147
|
+
#
|
148
|
+
# @return [Array]
|
149
|
+
# An Array of Red Arrow data type Classes.
|
93
150
|
def type_classes
|
94
151
|
@data_types || @data_types = @table.columns.map { |column| column.data_type.class }
|
95
152
|
end
|
96
153
|
|
154
|
+
# Returns Vectors in an Array.
|
155
|
+
#
|
156
|
+
# @return [Array]
|
157
|
+
# An Array of RedAmber::Vector objects.
|
97
158
|
def vectors
|
98
159
|
@vectors || @vectors = init_instance_vars(:vectors)
|
99
160
|
end
|
100
161
|
|
162
|
+
# Returns row indices (0...size) in an Array.
|
163
|
+
#
|
164
|
+
# @return [Array]
|
165
|
+
# An Array of all indices of rows.
|
101
166
|
def indices
|
102
167
|
(0...size).to_a
|
103
168
|
end
|
@@ -126,6 +191,18 @@ module RedAmber
|
|
126
191
|
variables.empty?
|
127
192
|
end
|
128
193
|
|
194
|
+
def each_row
|
195
|
+
return enum_for(:each_row) unless block_given?
|
196
|
+
|
197
|
+
size.times do |i|
|
198
|
+
key_row_pairs =
|
199
|
+
vectors.each_with_object({}) do |v, h|
|
200
|
+
h[v.key] = v.data[i]
|
201
|
+
end
|
202
|
+
yield key_row_pairs
|
203
|
+
end
|
204
|
+
end
|
205
|
+
|
129
206
|
def to_rover
|
130
207
|
require 'rover'
|
131
208
|
Rover::DataFrame.new(to_h)
|
@@ -7,15 +7,34 @@ module RedAmber
|
|
7
7
|
module DataFrameDisplayable
|
8
8
|
INDEX_KEY = :index_key_for_format_table
|
9
9
|
|
10
|
-
def to_s
|
10
|
+
def to_s(width: 80)
|
11
11
|
return '' if empty?
|
12
12
|
|
13
|
-
format_table(width:
|
13
|
+
format_table(width: width)
|
14
14
|
end
|
15
15
|
|
16
|
-
#
|
17
|
-
|
18
|
-
#
|
16
|
+
# Show a statistical summary as a new DataFrame.
|
17
|
+
# Make stats for numeric columns only.
|
18
|
+
# NaNs are ignored.
|
19
|
+
# Counts also show non-NaN counts.
|
20
|
+
#
|
21
|
+
# @return [DataFrame] a new dataframe.
|
22
|
+
def summary
|
23
|
+
num_keys = keys.select { |key| self[key].numeric? }
|
24
|
+
|
25
|
+
DataFrame.new(
|
26
|
+
variables: num_keys,
|
27
|
+
count: num_keys.map { |k| self[k].count },
|
28
|
+
mean: num_keys.map { |k| self[k].mean },
|
29
|
+
std: num_keys.map { |k| self[k].std },
|
30
|
+
min: num_keys.map { |k| self[k].min },
|
31
|
+
'25%': num_keys.map { |k| self[k].quantile(0.25) },
|
32
|
+
median: num_keys.map { |k| self[k].median },
|
33
|
+
'75%': num_keys.map { |k| self[k].quantile(0.75) },
|
34
|
+
max: num_keys.map { |k| self[k].max }
|
35
|
+
)
|
36
|
+
end
|
37
|
+
alias_method :describe, :summary
|
19
38
|
|
20
39
|
def inspect
|
21
40
|
if ENV.fetch('RED_AMBER_OUTPUT_MODE', 'Table') == 'TDR'
|
@@ -133,11 +152,7 @@ module RedAmber
|
|
133
152
|
a
|
134
153
|
end
|
135
154
|
|
136
|
-
def format_table(width: 80)
|
137
|
-
head = 5
|
138
|
-
tail = 3
|
139
|
-
n_digit = 1
|
140
|
-
|
155
|
+
def format_table(width: 80, head: 5, tail: 3, n_digit: 2)
|
141
156
|
original = self
|
142
157
|
indices = size > head + tail ? [*0...head, *(size - tail)...size] : [*0...size]
|
143
158
|
df = slice(indices).assign do
|
@@ -0,0 +1,85 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module RedAmber
|
4
|
+
# mix-ins for the class DataFrame
|
5
|
+
module DataFrameReshaping
|
6
|
+
# Transpose a wide DataFrame.
|
7
|
+
#
|
8
|
+
# @param key [Symbol, FalseClass] key of the index column
|
9
|
+
# to transpose into keys.
|
10
|
+
# If it is false, keys[0] is used.
|
11
|
+
# @param new_key [Symbol, FalseClass] key name of transposed index column.
|
12
|
+
# If it is false, :name is used. If new_key already exists among the transposed keys, the first unused key from :name1, :name2, ... is used instead.
|
13
|
+
# @return [DataFrame] transposed DataFrame
|
14
|
+
def transpose(key: keys.first, new_key: :name)
|
15
|
+
raise DataFrameArgumentError, "Not include: #{key}" unless keys.include?(key)
|
16
|
+
|
17
|
+
# Find unused name
|
18
|
+
new_keys = self[key].to_a.map { |e| e.to_s.to_sym }
|
19
|
+
new_key = (:name1..).find { |k| !new_keys.include?(k) } if new_keys.include?(new_key)
|
20
|
+
|
21
|
+
hash = { new_key => (keys - [key]) }
|
22
|
+
i = keys.index(key)
|
23
|
+
each_row do |h|
|
24
|
+
k = h.values[i]
|
25
|
+
hash[k] = h.values - [k]
|
26
|
+
end
|
27
|
+
DataFrame.new(hash)
|
28
|
+
end
|
29
|
+
|
30
|
+
# Reshape wide DataFrame to a longer DataFrame.
|
31
|
+
#
|
32
|
+
# @param keep_keys [Array] keys to keep.
|
33
|
+
# @param name [Symbol, String] key of the column which comes **from key names**.
|
34
|
+
# @param value [Symbol, String] key of the column which comes **from values**.
|
35
|
+
# @return [DataFrame] long DataFrame.
|
36
|
+
def to_long(*keep_keys, name: :name, value: :value)
|
37
|
+
not_included = keep_keys - keys
|
38
|
+
raise DataFrameArgumentError, "Not have keys #{not_included}" unless not_included.empty?
|
39
|
+
|
40
|
+
name = name.to_sym
|
41
|
+
raise DataFrameArgumentError, "Invalid key: #{name}" if keep_keys.include?(name)
|
42
|
+
|
43
|
+
value = value.to_sym
|
44
|
+
raise DataFrameArgumentError, "Invalid key: #{value}" if keep_keys.include?(value)
|
45
|
+
|
46
|
+
hash = Hash.new { |h, k| h[k] = [] }
|
47
|
+
l = keys.size - keep_keys.size
|
48
|
+
each_row do |row|
|
49
|
+
row.each do |k, v|
|
50
|
+
if keep_keys.include?(k)
|
51
|
+
hash[k].concat([v] * l)
|
52
|
+
else
|
53
|
+
hash[name] << k
|
54
|
+
hash[value] << v
|
55
|
+
end
|
56
|
+
end
|
57
|
+
end
|
58
|
+
DataFrame.new(hash)
|
59
|
+
end
|
60
|
+
|
61
|
+
# Reshape long DataFrame to a wide DataFrame.
|
62
|
+
#
|
63
|
+
# @param name [Symbol, String] key of the column which will be expanded **to key names**.
|
64
|
+
# @param value [Symbol, String] key of the column which will be expanded **to values**.
|
65
|
+
# @return [DataFrame] wide DataFrame.
|
66
|
+
def to_wide(name: :name, value: :value)
|
67
|
+
name = name.to_sym
|
68
|
+
raise DataFrameArgumentError, "Invalid key: #{name}" unless keys.include?(name)
|
69
|
+
|
70
|
+
value = value.to_sym
|
71
|
+
raise DataFrameArgumentError, "Invalid key: #{value}" unless keys.include?(value)
|
72
|
+
|
73
|
+
hash = Hash.new { |h, k| h[k] = {} }
|
74
|
+
keep_keys = keys - [name, value]
|
75
|
+
each_row do |row|
|
76
|
+
keeps, converts = row.partition { |k, _| keep_keys.include?(k) }
|
77
|
+
h = converts.to_h
|
78
|
+
hash[keeps.to_h][h[name].to_s.to_sym] = h[value]
|
79
|
+
end
|
80
|
+
ks = hash.first[0].keys + hash.first[1].keys
|
81
|
+
vs = hash.map { |k, v| k.values + v.values }.transpose
|
82
|
+
DataFrame.new(ks.zip(vs))
|
83
|
+
end
|
84
|
+
end
|
85
|
+
end
|
@@ -44,64 +44,106 @@ module RedAmber
|
|
44
44
|
raise DataFrameArgumentError, "Invalid argument #{args}"
|
45
45
|
end
|
46
46
|
|
47
|
-
# rename variables to create new DataFrame
|
48
|
-
def rename(*
|
49
|
-
renamer = args
|
47
|
+
# rename variables to create a new DataFrame
|
48
|
+
def rename(*renamer, &block)
|
50
49
|
if block
|
51
|
-
raise DataFrameArgumentError, 'Must not specify both arguments and a block' unless
|
50
|
+
raise DataFrameArgumentError, 'Must not specify both arguments and a block' unless renamer.empty?
|
52
51
|
|
53
|
-
renamer = instance_eval(&block)
|
52
|
+
renamer = [instance_eval(&block)]
|
54
53
|
end
|
55
|
-
|
56
|
-
|
54
|
+
case renamer
|
55
|
+
in [] | [nil] | [{}] | [[]]
|
56
|
+
return self
|
57
|
+
in [Hash => key_pairs]
|
58
|
+
# noop
|
59
|
+
in [ (Symbol | String) => from, (Symbol | String) => to]
|
60
|
+
key_pairs = { from => to }
|
61
|
+
in [Array => array_in_array]
|
62
|
+
key_pairs = try_convert_to_hash(array_in_array)
|
63
|
+
in [Array, *] => array_in_array1
|
64
|
+
key_pairs = try_convert_to_hash(array_in_array1)
|
65
|
+
else
|
66
|
+
raise DataFrameArgumentError, "Invalid argument #{renamer}"
|
67
|
+
end
|
68
|
+
rename_by_hash(key_pairs)
|
69
|
+
end
|
57
70
|
|
58
|
-
|
59
|
-
|
71
|
+
# assign variables to create a new DataFrame
|
72
|
+
def assign(*assigner, &block)
|
73
|
+
appender, fields, arrays = assign_update(*assigner, &block)
|
74
|
+
return self if appender.is_a?(DataFrame)
|
60
75
|
|
61
|
-
|
76
|
+
append_to_fields_and_arrays(appender, fields, arrays, append_to_left: false) unless appender.empty?
|
77
|
+
|
78
|
+
DataFrame.new(Arrow::Table.new(Arrow::Schema.new(fields), arrays))
|
79
|
+
end
|
80
|
+
|
81
|
+
def assign_left(*assigner, &block)
|
82
|
+
appender, fields, arrays = assign_update(*assigner, &block)
|
83
|
+
return self if appender.is_a?(DataFrame)
|
84
|
+
|
85
|
+
append_to_fields_and_arrays(appender, fields, arrays, append_to_left: true) unless appender.empty?
|
86
|
+
|
87
|
+
DataFrame.new(Arrow::Table.new(Arrow::Schema.new(fields), arrays))
|
62
88
|
end
|
63
89
|
|
64
|
-
|
65
|
-
|
66
|
-
|
90
|
+
private
|
91
|
+
|
92
|
+
def assign_update(*assigner, &block)
|
67
93
|
if block
|
68
|
-
raise DataFrameArgumentError, 'Must not specify both arguments and a block' unless
|
94
|
+
raise DataFrameArgumentError, 'Must not specify both arguments and a block' unless assigner.empty?
|
69
95
|
|
70
|
-
assigner = instance_eval(&block)
|
96
|
+
assigner = [instance_eval(&block)]
|
97
|
+
end
|
98
|
+
case assigner
|
99
|
+
in [] | [nil] | [{}] | [[]]
|
100
|
+
return self
|
101
|
+
in [Hash => key_array_pairs]
|
102
|
+
# noop
|
103
|
+
in [(Symbol | String) => key, (Vector | Array | Arrow::Array) => array]
|
104
|
+
key_array_pairs = { key => array }
|
105
|
+
in [Array => array_in_array]
|
106
|
+
key_array_pairs = try_convert_to_hash(array_in_array)
|
107
|
+
in [Array, *] => array_in_array1
|
108
|
+
key_array_pairs = try_convert_to_hash(array_in_array1)
|
109
|
+
else
|
110
|
+
raise DataFrameArgumentError, "Invalid argument #{assigner}"
|
71
111
|
end
|
72
|
-
assigner = [assigner].flatten
|
73
|
-
return self if assigner.empty? || assigner == [nil]
|
74
|
-
|
75
|
-
raise DataFrameArgumentError, "Invalid argument #{args}" unless assigner.one? && assigner[0].is_a?(Hash)
|
76
112
|
|
77
113
|
updater = {}
|
78
114
|
appender = {}
|
79
|
-
|
115
|
+
key_array_pairs.each do |key, array|
|
80
116
|
if keys.include? key
|
81
|
-
updater[key] =
|
117
|
+
updater[key] = array
|
82
118
|
else
|
83
|
-
appender[key] =
|
119
|
+
appender[key] = array
|
84
120
|
end
|
85
121
|
end
|
86
|
-
|
87
|
-
append_to_fields_and_arrays(appender, fields, arrays) unless appender.empty?
|
88
|
-
|
89
|
-
DataFrame.new(Arrow::Table.new(Arrow::Schema.new(fields), arrays))
|
122
|
+
[appender, *update_fields_and_arrays(updater)]
|
90
123
|
end
|
91
124
|
|
92
|
-
|
125
|
+
def try_convert_to_hash(array)
|
126
|
+
array.to_h
|
127
|
+
rescue TypeError
|
128
|
+
[array].to_h
|
129
|
+
rescue TypeError # rubocop:disable Lint/DuplicateRescueException
|
130
|
+
raise DataFrameArgumentError, "Invalid argument in Array #{array}"
|
131
|
+
end
|
93
132
|
|
94
133
|
def rename_by_hash(key_pairs)
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
134
|
+
not_existing_keys = key_pairs.keys - keys
|
135
|
+
raise DataFrameArgumentError, "Not existing: #{not_existing_keys}" unless not_existing_keys.empty?
|
136
|
+
|
137
|
+
fields =
|
138
|
+
keys.map do |key|
|
139
|
+
new_key = key_pairs[key]
|
140
|
+
if new_key
|
141
|
+
Arrow::Field.new(new_key.to_sym, @table[key].data_type)
|
142
|
+
else
|
143
|
+
@table.schema[key]
|
144
|
+
end
|
101
145
|
end
|
102
|
-
|
103
|
-
schema = Arrow::Schema.new(fields)
|
104
|
-
DataFrame.new(Arrow::Table.new(schema, @table.columns))
|
146
|
+
DataFrame.new(Arrow::Table.new(Arrow::Schema.new(fields), @table.columns))
|
105
147
|
end
|
106
148
|
|
107
149
|
def update_fields_and_arrays(updater)
|
@@ -120,13 +162,20 @@ module RedAmber
|
|
120
162
|
[fields, arrays]
|
121
163
|
end
|
122
164
|
|
123
|
-
def append_to_fields_and_arrays(appender, fields, arrays)
|
124
|
-
appender.
|
165
|
+
def append_to_fields_and_arrays(appender, fields, arrays, append_to_left: false)
|
166
|
+
enum = append_to_left ? appender.reverse_each : appender.each
|
167
|
+
enum.each do |key, data|
|
125
168
|
raise DataFrameArgumentError, "Data size mismatch (#{data.size} != #{size})" if data.size != size
|
126
169
|
|
127
170
|
a = Arrow::Array.new(data.is_a?(Vector) ? data.to_a : data)
|
128
|
-
|
129
|
-
|
171
|
+
|
172
|
+
if append_to_left
|
173
|
+
fields.unshift(Arrow::Field.new(key.to_sym, a.value_data_type))
|
174
|
+
arrays.unshift(Arrow::ChunkedArray.new([a]))
|
175
|
+
else
|
176
|
+
fields << Arrow::Field.new(key.to_sym, a.value_data_type)
|
177
|
+
arrays << Arrow::ChunkedArray.new([a])
|
178
|
+
end
|
130
179
|
end
|
131
180
|
end
|
132
181
|
|
data/lib/red_amber/group.rb
CHANGED
@@ -3,6 +3,10 @@
|
|
3
3
|
module RedAmber
|
4
4
|
# group class
|
5
5
|
class Group
|
6
|
+
# Creates a new Group object.
|
7
|
+
#
|
8
|
+
# @param dataframe [DataFrame] dataframe to be grouped.
|
9
|
+
# @param group_keys [Array<>] keys for grouping.
|
6
10
|
def initialize(dataframe, *group_keys)
|
7
11
|
@dataframe = dataframe
|
8
12
|
@table = @dataframe.table
|
@@ -50,7 +54,7 @@ module RedAmber
|
|
50
54
|
raise GroupArgumentError, "#{d} is not a key of\n #{@dataframe}." unless summary_keys.empty? || d.empty?
|
51
55
|
|
52
56
|
df = RedAmber::DataFrame.new(@group.send(func, *summary_keys))
|
53
|
-
df = df[df.keys
|
57
|
+
df = df[@group_keys, df.keys - @group_keys]
|
54
58
|
# if counts are the same (no nil included), aggregate count columns.
|
55
59
|
df = df[df.keys[0..1]].rename(df.keys[1], :count) if func == :count && df.to_h.values[1..].uniq.size == 1
|
56
60
|
df
|
@@ -39,9 +39,53 @@ module RedAmber
|
|
39
39
|
|
40
40
|
# Returns other than value
|
41
41
|
# - mode
|
42
|
-
# - quantile
|
43
42
|
# - tdigest
|
44
43
|
|
44
|
+
# Return quantile
|
45
|
+
# 0.5 quantile (median) is returned by default.
|
46
|
+
# Or return quantile for specified probability (prob).
|
47
|
+
# If quantile lies between two data points, interpolated value is
|
48
|
+
# returned based on selected interpolation method.
|
49
|
+
# Nils and NaNs are ignored.
|
50
|
+
# Nil is returned if there are no valid data points.
|
51
|
+
#
|
52
|
+
# @param prob [Float] probability.
|
53
|
+
# @param interpolation [Symbol] specifies interpolation method to use,
|
54
|
+
# when the quantile lies between the data i and j.
|
55
|
+
# - Default value is :linear, which returns i + (j - i) * fraction.
|
56
|
+
# - :lower returns i.
|
57
|
+
# - :higher returns j.
|
58
|
+
# - :nearest returns i or j, whichever is closer.
|
59
|
+
# - :midpoint returns (i + j) / 2.
|
60
|
+
# @param skip_nils [Boolean] whether to ignore nil.
|
61
|
+
# @param min_count [Integer] min count.
|
62
|
+
# @return [Float] quantile.
|
63
|
+
def quantile(prob = 0.5, interpolation: :linear, skip_nils: true, min_count: 0)
|
64
|
+
raise VectorArgumentError, "Invalid: probability #{prob} must be between 0 and 1" unless (0..1).cover? prob
|
65
|
+
|
66
|
+
datum = find(:quantile).execute([data],
|
67
|
+
q: prob,
|
68
|
+
interpolation: interpolation,
|
69
|
+
skip_nulls: skip_nils,
|
70
|
+
min_count: min_count)
|
71
|
+
datum.value.to_a.first
|
72
|
+
end
|
73
|
+
|
74
|
+
# Return quantiles in a DataFrame
|
75
|
+
#
|
76
|
+
def quantiles(probs = [1.0, 0.75, 0.5, 0.25, 0.0], interpolation: :linear, skip_nils: true, min_count: 0)
|
77
|
+
if probs.empty? || !probs.all? { |q| (0..1).cover?(q) }
|
78
|
+
raise VectorArgumentError, "Invarid probavilities #{probs}"
|
79
|
+
end
|
80
|
+
|
81
|
+
DataFrame.new(
|
82
|
+
probs: probs,
|
83
|
+
quantiles: probs.map do |q|
|
84
|
+
quantile(q, interpolation: interpolation, skip_nils: skip_nils, min_count: min_count)
|
85
|
+
end
|
86
|
+
)
|
87
|
+
end
|
88
|
+
|
45
89
|
# [Unary element-wise]: vector.func => vector
|
46
90
|
unary_element_wise =
|
47
91
|
%i[abs array_sort_indices atan bit_wise_not ceil cos fill_null_backward fill_null_forward floor is_finite
|
@@ -63,6 +107,7 @@ module RedAmber
|
|
63
107
|
|
64
108
|
alias_method :sort_indexes, :array_sort_indices
|
65
109
|
alias_method :sort_indices, :array_sort_indices
|
110
|
+
alias_method :sort_index, :array_sort_indices
|
66
111
|
|
67
112
|
alias_method :uniq, :unique
|
68
113
|
|
@@ -111,7 +111,7 @@ module RedAmber
|
|
111
111
|
|
112
112
|
index_array = Arrow::UInt64ArrayBuilder.build(normalized_indices.data) # round to integer array
|
113
113
|
|
114
|
-
datum = find(:
|
114
|
+
datum = find(:take).execute([data, index_array]) # :array_take will fail with ChunkedArray
|
115
115
|
Vector.new(datum.value)
|
116
116
|
end
|
117
117
|
|
data/lib/red_amber/version.rb
CHANGED
data/lib/red_amber.rb
CHANGED
@@ -1,11 +1,11 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
2
|
|
3
3
|
require 'arrow'
|
4
|
-
require 'rover-df'
|
5
4
|
|
6
5
|
require_relative 'red_amber/helper'
|
7
6
|
require_relative 'red_amber/data_frame_displayable'
|
8
7
|
require_relative 'red_amber/data_frame_indexable'
|
8
|
+
require_relative 'red_amber/data_frame_reshaping'
|
9
9
|
require_relative 'red_amber/data_frame_selectable'
|
10
10
|
require_relative 'red_amber/data_frame_variable_operation'
|
11
11
|
require_relative 'red_amber/data_frame'
|
data/red_amber.gemspec
CHANGED
@@ -30,7 +30,7 @@ Gem::Specification.new do |spec|
|
|
30
30
|
spec.executables = spec.files.grep(%r{\Aexe/}) { |f| File.basename(f) }
|
31
31
|
spec.require_paths = ['lib']
|
32
32
|
|
33
|
-
spec.add_dependency 'red-arrow', '>=
|
33
|
+
spec.add_dependency 'red-arrow', '>= 9.0.0'
|
34
34
|
|
35
35
|
# Development dependency has gone to the Gemfile (rubygems/bundler#7237)
|
36
36
|
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: red_amber
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.2.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Hirokazu SUZUKI (heronshoes)
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2022-08-
|
11
|
+
date: 2022-08-15 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: red-arrow
|
@@ -16,14 +16,14 @@ dependencies:
|
|
16
16
|
requirements:
|
17
17
|
- - ">="
|
18
18
|
- !ruby/object:Gem::Version
|
19
|
-
version:
|
19
|
+
version: 9.0.0
|
20
20
|
type: :runtime
|
21
21
|
prerelease: false
|
22
22
|
version_requirements: !ruby/object:Gem::Requirement
|
23
23
|
requirements:
|
24
24
|
- - ">="
|
25
25
|
- !ruby/object:Gem::Version
|
26
|
-
version:
|
26
|
+
version: 9.0.0
|
27
27
|
description: RedAmber is a simple dataframe library inspired by Rover-df and powered
|
28
28
|
by Red Arrow.
|
29
29
|
email:
|
@@ -69,6 +69,7 @@ files:
|
|
69
69
|
- lib/red_amber/data_frame.rb
|
70
70
|
- lib/red_amber/data_frame_displayable.rb
|
71
71
|
- lib/red_amber/data_frame_indexable.rb
|
72
|
+
- lib/red_amber/data_frame_reshaping.rb
|
72
73
|
- lib/red_amber/data_frame_selectable.rb
|
73
74
|
- lib/red_amber/data_frame_variable_operation.rb
|
74
75
|
- lib/red_amber/group.rb
|