red_amber 0.1.2 → 0.1.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.rubocop.yml +21 -10
- data/CHANGELOG.md +162 -6
- data/Gemfile +3 -0
- data/README.md +89 -303
- data/benchmark/csv_load_penguins.yml +15 -0
- data/benchmark/drop_nil.yml +11 -0
- data/doc/DataFrame.md +840 -0
- data/doc/Vector.md +317 -0
- data/doc/image/arrow_table_new.png +0 -0
- data/doc/image/dataframe/assign.png +0 -0
- data/doc/image/dataframe/drop.png +0 -0
- data/doc/image/dataframe/pick.png +0 -0
- data/doc/image/dataframe/remove.png +0 -0
- data/doc/image/dataframe/rename.png +0 -0
- data/doc/image/dataframe/slice.png +0 -0
- data/doc/image/dataframe_model.png +0 -0
- data/doc/image/example_in_red_arrow.png +0 -0
- data/doc/image/tdr.png +0 -0
- data/doc/image/tdr_and_table.png +0 -0
- data/doc/image/tidy_data_in_TDR.png +0 -0
- data/doc/image/vector/binary_element_wise.png +0 -0
- data/doc/image/vector/unary_aggregation.png +0 -0
- data/doc/image/vector/unary_aggregation_w_option.png +0 -0
- data/doc/image/vector/unary_element_wise.png +0 -0
- data/doc/tdr.md +56 -0
- data/doc/tdr_ja.md +56 -0
- data/lib/red_amber/data_frame.rb +68 -35
- data/lib/red_amber/data_frame_displayable.rb +132 -0
- data/lib/red_amber/data_frame_helper.rb +64 -0
- data/lib/red_amber/data_frame_indexable.rb +38 -0
- data/lib/red_amber/data_frame_observation_operation.rb +83 -0
- data/lib/red_amber/data_frame_selectable.rb +34 -43
- data/lib/red_amber/data_frame_variable_operation.rb +133 -0
- data/lib/red_amber/vector.rb +58 -6
- data/lib/red_amber/vector_compensable.rb +68 -0
- data/lib/red_amber/vector_functions.rb +147 -68
- data/lib/red_amber/version.rb +1 -1
- data/lib/red_amber.rb +9 -1
- data/red_amber.gemspec +3 -6
- metadata +36 -9
- data/lib/red_amber/data_frame_output.rb +0 -116
@@ -0,0 +1,132 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'stringio'
|
4
|
+
|
5
|
+
module RedAmber
|
6
|
+
# mix-ins for the class DataFrame
|
7
|
+
module DataFrameDisplayable
|
8
|
+
# Returns the string form of the wrapped table (delegates to @table.to_s).
def to_s
  @table.to_s
end
|
11
|
+
|
12
|
+
# def describe() end
|
13
|
+
|
14
|
+
# def summary() end
|
15
|
+
|
16
|
+
# Builds the inspection text: a "#<...>" header line made of the shape string
# (including the object id), followed by an overview limited to 3 Vectors.
def inspect
  header = shape_str(with_id: true)
  overview = dataframe_info(3)
  "#<#{header}>\n#{overview}"
end
|
19
|
+
|
20
|
+
# Prints the text built by #tdr_str to standard output.
#
# - limit: max num of Vectors to show
# - tally: max level to use tally mode
# - elements: max element to show values in each vector
def tdr(limit = 10, tally: 5, elements: 5)
  report = tdr_str(limit, tally: tally, elements: elements)
  puts report
end
|
26
|
+
|
27
|
+
# Returns the shape line followed by the per-Vector overview as one String.
# Takes the same arguments as #tdr.
def tdr_str(limit = 10, tally: 5, elements: 5)
  shape = shape_str
  overview = dataframe_info(limit, tally_level: tally, max_element: elements)
  "#{shape}\n#{overview}"
end
|
30
|
+
|
31
|
+
private # =====
|
32
|
+
|
33
|
+
# Returns the plural suffix for a count: '' for exactly one, 's' otherwise.
# Zero takes the plural form ("0 Vectors"), as English requires.
def pl(num)
  num == 1 ? '' : 's'
end
|
36
|
+
|
37
|
+
# Returns "<class> : <shape>" where shape is '(empty)' for an empty frame or
# "<size> x <n_keys> Vector(s)". With with_id: true, the object id is
# appended as a 16-digit hexadecimal number.
def shape_str(with_id: false)
  shape_info =
    if empty?
      '(empty)'
    else
      "#{size} x #{n_keys} Vector#{pl(n_keys)}"
    end
  id = with_id ? format(', 0x%016x', object_id) : ''
  "#{self.class} : #{shape_info}#{id}"
end
|
42
|
+
|
43
|
+
# Builds the multi-line overview used by #inspect and #tdr_str.
#
# limit       - max number of Vectors to list; :all or -1 lists every Vector.
# tally_level - a numeric/string/boolean Vector with at most this many
#               distinct values (and not all values distinct) is previewed as
#               its tally instead of its leading elements.
# max_element - number of leading elements shown by #shorthand.
#
# Returns '' for an empty DataFrame; otherwise a type summary line, a header
# row and one row per listed Vector.
def dataframe_info(limit, tally_level: 5, max_element: 5)
  return '' if empty?

  limit = n_keys if [:all, -1].include?(limit)

  column_tallies = vectors.map(&:tally)
  level_counts = column_tallies.map(&:size)
  groups = @table.columns.map { |column| type_group(column.data_type) }
  key_labels = keys.map(&:inspect)
  headers = { idx: '#', key: 'key', type: 'type', levels: 'level', data: 'data_preview' }
  row_format = make_header_format(level_counts, headers, key_labels)

  out = StringIO.new
  out.puts "Vector#{pl(n_keys)} : #{var_type_count(groups).join(', ')}"
  out.printf row_format, *headers.values

  vectors.each_with_index do |vector, i|
    if i >= limit
      # Summarize everything beyond the limit in one trailing line.
      remaining = n_keys - i
      out << " ... #{remaining} more Vector#{pl(remaining)} ...\n"
      break
    end

    tally = column_tallies[i]
    summarizable = %i[numeric string boolean].include?(groups[i])
    preview =
      if summarizable && tally.size <= tally_level && tally.size != size
        [tally.to_s]
      else
        parts = [shorthand(vector, size, max_element)]
        parts.concat(na_string(vector)) if summarizable
        parts
      end
    out.printf row_format, i + 1, key_labels[i], types[i], tally.size, preview.join(', ')
  end
  out.string
end
|
82
|
+
|
83
|
+
# Returns the printf format for one overview row. Each column is as wide as
# the longest of its values and its header label; the index, key and type
# columns are left-aligned, the level column is right-aligned.
def make_header_format(levels, headers, quoted_keys)
  widest = ->(lengths, label) { [lengths.max, label.size].max }

  idx_width = n_keys.to_s.size
  key_width = widest.call(quoted_keys.map(&:size), headers[:key])
  type_width = widest.call(types.map(&:size), headers[:type])
  level_width = widest.call(levels.map { |level| level.to_s.size }, headers[:levels])

  "%-#{idx_width}s %-#{key_width}s %-#{type_width}s %#{level_width}s %s\n"
end
|
91
|
+
|
92
|
+
# Classifies an Arrow data type into :numeric, :string, :boolean, :temporal
# or :other; the first matching class in that order wins.
def type_group(data_type)
  if data_type.is_a?(Arrow::NumericDataType)
    :numeric
  elsif data_type.is_a?(Arrow::StringDataType)
    :string
  elsif data_type.is_a?(Arrow::BooleanDataType)
    :boolean
  elsif data_type.is_a?(Arrow::TemporalDataType)
    :temporal
  else
    :other
  end
end
|
102
|
+
|
103
|
+
# Returns phrases such as "2 numeric" or "1 string" for the numeric, string,
# boolean and temporal groups present in type_groups, in that order. Only
# "string" receives a plural suffix; other groups (e.g. :other) are omitted.
def var_type_count(type_groups)
  counts = type_groups.tally
  %i[numeric string boolean temporal].filter_map do |group|
    n = counts[group]
    next unless n

    group == :string ? "#{n} string#{pl(n)}" : "#{n} #{group}"
  end
end
|
112
|
+
|
113
|
+
# Returns a bracketed preview of the vector's leading elements, each rendered
# with #inspect. Temporal vectors show at most 2 elements, others at most
# max_element; '... ' is appended when size exceeds the number shown.
def shorthand(vector, size, max_element)
  shown = vector.temporal? ? 2 : max_element
  items = vector.to_a.first(shown).map(&:inspect)
  items.push('... ') if size > shown
  "[#{items.join(', ')}]"
end
|
120
|
+
|
121
|
+
# Returns phrases describing missing values in the vector, e.g.
# ["1 NaN", "2 nils"]; NaN comes first and zero counts are left out.
def na_string(vector)
  { 'NaN' => vector.n_nans, 'nil' => vector.n_nils }
    .reject { |_label, count| count.zero? }
    .map { |label, count| "#{count} #{label}#{pl(count)}" }
end
|
131
|
+
end
|
132
|
+
end
|
@@ -0,0 +1,64 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module RedAmber
|
4
|
+
# mix-in for the class DataFrame
|
5
|
+
module DataFrameHelper
|
6
|
+
private
|
7
|
+
|
8
|
+
# Expands every Range in args into its elements (via #normalized_array) and
# converts negative Integers into offsets from the end by adding size.
# Elements of any other kind are passed through unchanged.
def expand_range(args)
  expanded = []
  args.each do |element|
    if element.is_a?(Range)
      expanded.concat(normalized_array(element))
    else
      expanded << element
    end
  end
  expanded.map { |e| e.is_a?(Integer) && e.negative? ? e + size : e }
end
|
20
|
+
|
21
|
+
# Converts a Range into an Array.
#
# A Range with an Integer end point, or with both ends nil, is treated as a
# range of row indices and resolved against 0...size (so negative and endless
# forms work). Any other Range is simply expanded with Range#to_a.
#
# Raises DataFrameArgumentError when an end point lies outside -size...size.
def normalized_array(range)
  first = range.begin
  last = range.end
  # Make an exclusive Integer end inclusive before the bounds check.
  last -= 1 if range.exclude_end? && last.is_a?(Integer)
  ends = [first, last]

  return range.to_a unless ends.any?(Integer) || ends.all?(&:nil?)

  if ends.any? { |e| e&.>=(size) || e&.<(-size) }
    raise DataFrameArgumentError, "Index out of range: #{range} for 0..#{size - 1}"
  end

  (0...size).to_a[range]
end
|
35
|
+
|
36
|
+
# Returns true when any index lies outside -size...size.
# An empty list has no offending index, so it returns false rather than
# failing on nil comparisons.
def out_of_range?(indeces)
  return false if indeces.empty?

  lowest, highest = indeces.minmax
  highest >= size || lowest < -size
end
|
39
|
+
|
40
|
+
# True when every element of enum is an Integer (also true for empty enum).
def integers?(enum)
  enum.all? { |element| element.is_a?(Integer) }
end
|
43
|
+
|
44
|
+
# True when every element of enum is a Symbol or a String
# (also true for empty enum).
def sym_or_str?(enum)
  enum.all? do |element|
    case element
    when Symbol, String then true
    else false
    end
  end
end
|
47
|
+
|
48
|
+
# True when every element of enum is true, false or nil
# (also true for empty enum).
def booleans?(enum)
  enum.all? { |element| [true, false, nil].include?(element) }
end
|
51
|
+
|
52
|
+
# Wraps a single vector's data in a new DataFrame under the given key.
def create_dataframe_from_vector(key, vector)
  column = { key => vector.data }
  DataFrame.new(column)
end
|
55
|
+
|
56
|
+
# Returns a new DataFrame built from @table.filter(array).
def select_obs_by_boolean(array)
  filtered = @table.filter(array)
  DataFrame.new(filtered)
end
|
59
|
+
|
60
|
+
# Returns the keys whose position holds a truthy value in booleans.
def keys_by_booleans(booleans)
  keys.each_with_index
      .select { |_key, index| booleans[index] }
      .map(&:first)
end
|
63
|
+
end
|
64
|
+
end
|
@@ -0,0 +1,38 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module RedAmber
|
4
|
+
# mix-ins for the class DataFrame
|
5
|
+
module DataFrameIndexable
|
6
|
+
# Common method.
# Returns a DataFrame made of the observations at the given indices.
# With no indices, self is returned. When the first argument is a Vector, its
# data is used as the index array.
def map_indices(*indices)
  return self if indices.empty?

  head = indices.first
  picked = head.is_a?(Vector) ? head.data : indices
  new_dataframe_by(picked)
end
|
14
|
+
|
15
|
+
# @param sort_keys [Arrow::SortKey]
#   :key, "key" or "+key" denotes ascending,
#   "-key" denotes descending order.
#   Nested arrays of keys are flattened.
# @return [RedAmber::Vector] Sorted indices in Vector
def sort_indices(*sort_keys)
  flat_keys = sort_keys.flatten
  Vector.new(@table.sort_indices(flat_keys))
end
|
23
|
+
|
24
|
+
# Sorts the observations by the given keys (same key notation as
# #sort_indices; nested arrays of keys are flattened).
# @return [RedAmber::DataFrame] Sorted DataFrame
def sort(*sort_keys)
  order = @table.sort_indices(sort_keys.flatten)
  new_dataframe_by(order)
end
|
30
|
+
|
31
|
+
private
|
32
|
+
|
33
|
+
# Applies Arrow's :take function to @table with index_array and wraps the
# resulting table in a new DataFrame.
def new_dataframe_by(index_array)
  take = Arrow::Function.find(:take)
  taken = take.execute([@table, index_array])
  RedAmber::DataFrame.new(taken.value)
end
|
37
|
+
end
|
38
|
+
end
|
@@ -0,0 +1,83 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module RedAmber
|
4
|
+
# mix-ins for the class DataFrame
|
5
|
+
module DataFrameObservationOperation
|
6
|
+
# Slice and select some observations to create a sub DataFrame.
#
# The selector comes either from the arguments or from the value of the block
# (evaluated with instance_eval), never both. It is interpreted as:
# - a filter: an Arrow::BooleanArray, a non-numeric Vector, or a list of
#   true/false/nil with the same length as the DataFrame;
# - indices: Integers, Ranges, or a numeric Vector.
#
# An empty selector, or one whose first element is nil, yields a DataFrame
# with the same keys and no observations.
#
# Raises DataFrameArgumentError when both arguments and a block are given, or
# when the selector is neither a filter nor a list of indices.
def slice(*args, &block)
  slicer = args
  if block
    raise DataFrameArgumentError, 'Must not specify both arguments and block.' unless args.empty?

    slicer = instance_eval(&block)
  end
  slicer = [slicer].flatten
  return remove_all_values if slicer.empty? || slicer[0].nil?

  head = slicer[0]
  # A numeric Vector carries row indices, not a filter; expand it so that it
  # takes the index path below instead of being handed to the boolean filter.
  index_vector = head.is_a?(Vector) && head.numeric?
  slicer = head.to_a if index_vector

  # filter with same length
  booleans =
    if head.is_a?(Arrow::BooleanArray) || (head.is_a?(Vector) && !index_vector)
      head.to_a
    elsif slicer.size == size && booleans?(slicer)
      slicer
    end
  return select_obs_by_boolean(booleans) if booleans

  # filter with indexes
  slicer = expand_range(slicer)
  return map_indices(*slicer) if integers?(slicer)

  raise DataFrameArgumentError, "Invalid argument #{args}"
end
|
32
|
+
|
33
|
+
# Remove selected observations to create a sub DataFrame.
#
# The selector comes either from the arguments or from the value of the block
# (evaluated with instance_eval), never both. It is interpreted as:
# - a filter: an Arrow::BooleanArray, a non-numeric Vector, or a list of
#   true/false/nil with the same length as the DataFrame. Each element is
#   negated with `!`, so nil counts as "keep";
# - indices: Integers, Ranges, or a numeric Vector.
#
# An empty selector returns self.
#
# Raises DataFrameArgumentError when both arguments and a block are given.
def remove(*args, &block)
  remover = args
  if block
    raise DataFrameArgumentError, 'Must not specify both arguments and block.' unless args.empty?

    remover = instance_eval(&block)
  end
  remover = [remover].flatten

  return self if remover.empty?

  head = remover[0]
  # A numeric Vector carries row indices, not a filter; expand it so that it
  # takes the index path below instead of being negated as booleans.
  index_vector = head.is_a?(Vector) && head.numeric?
  remover = head.to_a if index_vector

  # filter with same length
  booleans =
    if head.is_a?(Arrow::BooleanArray) || (head.is_a?(Vector) && !index_vector)
      head.to_a
    elsif remover.size == size && booleans?(remover)
      remover
    end
  return select_obs_by_boolean(booleans.map(&:!)) if booleans

  # filter with indexes
  slicer = indexes.to_a - expand_range(remover)
  return remove_all_values if slicer.empty?
  return map_indices(*slicer) if integers?(slicer)

  raise DataFrameArgumentError, "Invalid argument #{args}"
end
|
64
|
+
|
65
|
+
# Returns a new DataFrame holding the result of Arrow's :drop_null function
# applied to the table.
def remove_nil
  drop_null = Arrow::Function.find(:drop_null)
  result = drop_null.execute([table])
  DataFrame.new(result.value)
end
|
69
|
+
alias_method :drop_nil, :remove_nil
|
70
|
+
|
71
|
+
# Groups the table by aggregating_keys, calls the aggregation named func on
# the group with target_keys, and wraps the result in a new DataFrame.
#
# func is dispatched with public_send so that a caller-supplied name can only
# reach the group's public aggregation methods, never private ones.
def group(aggregating_keys, func, target_keys)
  grouped = table.group(*aggregating_keys)
  RedAmber::DataFrame.new(grouped.public_send(func, *target_keys))
end
|
75
|
+
|
76
|
+
private
|
77
|
+
|
78
|
+
# Returns a DataFrame with the same keys as self but without values
# (every key is mapped to an empty Array).
def remove_all_values
  DataFrame.new(keys.to_h { |key| [key, []] })
end
|
82
|
+
end
|
83
|
+
end
|
@@ -9,18 +9,40 @@ module RedAmber
|
|
9
9
|
raise DataFrameArgumentError, 'Empty dataframe' if empty?
|
10
10
|
raise DataFrameArgumentError, 'Empty argument' if args.empty?
|
11
11
|
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
12
|
+
if args.one?
|
13
|
+
case args[0]
|
14
|
+
when Vector
|
15
|
+
return select_obs_by_boolean(Arrow::BooleanArray.new(args[0].data))
|
16
|
+
when Arrow::BooleanArray
|
17
|
+
return select_obs_by_boolean(args[0])
|
18
|
+
when Array
|
19
|
+
return select_obs_by_boolean(Arrow::BooleanArray.new(args[0]))
|
20
|
+
|
21
|
+
# when Hash
|
22
|
+
# specify conditions to select by a Hash
|
16
23
|
end
|
24
|
+
end
|
17
25
|
|
18
|
-
return
|
19
|
-
|
26
|
+
return select_obs_by_boolean(args) if booleans?(args)
|
27
|
+
|
28
|
+
# expand Range like [1..3, 4] to [1, 2, 3, 4]
|
29
|
+
expanded = expand_range(args)
|
30
|
+
return map_indices(*expanded) if integers?(expanded)
|
31
|
+
return select_vars_by_keys(expanded.map(&:to_sym)) if sym_or_str?(expanded)
|
20
32
|
|
21
33
|
raise DataFrameArgumentError, "Invalid argument #{args}"
|
22
34
|
end
|
23
35
|
|
36
|
+
# Select a variable by a key in String or Symbol.
# Returns variables[key.to_sym].
# Raises DataFrameArgumentError when key is neither a Symbol nor a String, or
# when key?(key) is false.
def v(key)
  case key
  when Symbol, String
    raise DataFrameArgumentError, "Key not exist [#{key}]" unless key?(key)

    variables[key.to_sym]
  else
    raise DataFrameArgumentError, "Key is not a Symbol or String [#{key}]"
  end
end
|
45
|
+
|
24
46
|
def head(n_rows = 5)
|
25
47
|
raise DataFrameArgumentError, "Index is out of range #{n_rows}" if n_rows.negative?
|
26
48
|
|
@@ -41,48 +63,17 @@ module RedAmber
|
|
41
63
|
tail(n_rows)
|
42
64
|
end
|
43
65
|
|
44
|
-
private
|
66
|
+
private
|
45
67
|
|
46
|
-
def
|
68
|
+
def select_vars_by_keys(keys)
|
47
69
|
if keys.one?
|
48
|
-
|
49
|
-
|
50
|
-
DataFrame.new(@table[keys])
|
51
|
-
end
|
52
|
-
end
|
53
|
-
|
54
|
-
def select_rows(indeces)
|
55
|
-
out_of_range?(indeces) && raise(DataFrameArgumentError, "Invalid index: #{indeces} for 0..#{size - 1}")
|
70
|
+
key = keys[0].to_sym
|
71
|
+
raise DataFrameArgumentError, "Key does not exist #{keys}" unless key? key
|
56
72
|
|
57
|
-
|
58
|
-
DataFrame.new(@table.schema, a)
|
59
|
-
end
|
60
|
-
|
61
|
-
def normalized_array(range)
|
62
|
-
both_end = [range.begin, range.end]
|
63
|
-
both_end[1] -= 1 if range.exclude_end? && range.end.is_a?(Integer)
|
64
|
-
|
65
|
-
if both_end.any?(Integer) || both_end.all?(&:nil?)
|
66
|
-
if both_end.any? { |e| e&.>=(size) || e&.<(-size) }
|
67
|
-
raise DataFrameArgumentError, "Index out of range: #{range} for 0..#{size - 1}"
|
68
|
-
end
|
69
|
-
|
70
|
-
(0...size).to_a[range]
|
73
|
+
variables[key]
|
71
74
|
else
|
72
|
-
|
75
|
+
DataFrame.new(@table[keys])
|
73
76
|
end
|
74
77
|
end
|
75
|
-
|
76
|
-
def out_of_range?(indeces)
|
77
|
-
indeces.max >= size || indeces.min < -size
|
78
|
-
end
|
79
|
-
|
80
|
-
def integers?(enum)
|
81
|
-
enum.all?(Integer)
|
82
|
-
end
|
83
|
-
|
84
|
-
def sym_or_str?(enum)
|
85
|
-
enum.all? { |e| e.is_a?(Symbol) || e.is_a?(String) }
|
86
|
-
end
|
87
78
|
end
|
88
79
|
end
|
@@ -0,0 +1,133 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module RedAmber
|
4
|
+
# mix-ins for the class DataFrame
|
5
|
+
module DataFrameVariableOperation
|
6
|
+
# Pick up some variables to create a sub DataFrame.
#
# The picker comes either from the arguments or from the value of the block
# (evaluated with instance_eval), never both. It may be a list of
# Symbol/String keys, or a list of true/false/nil flags selecting keys by
# position.
#
# Returns an empty DataFrame when nothing is picked: no picker, a nil picker,
# or boolean flags that select no key (matching what #drop does when nothing
# remains).
#
# Raises DataFrameArgumentError when both arguments and a block are given, or
# when the picker is neither keys nor booleans.
def pick(*args, &block)
  picker = args
  if block
    raise DataFrameArgumentError, 'Must not specify both arguments and block.' unless args.empty?

    picker = instance_eval(&block)
  end
  picker = [picker].flatten
  return DataFrame.new if picker.empty? || picker == [nil]

  if booleans?(picker)
    picker = keys_by_booleans(picker)
    # All flags were false/nil: do not index the table with an empty selector.
    return DataFrame.new if picker.empty?
  end

  # DataFrame#[] creates a Vector when a single key is specified.
  # DataFrame#pick creates a DataFrame even with a single key.
  return DataFrame.new(@table[picker]) if sym_or_str?(picker)

  raise DataFrameArgumentError, "Invalid argument #{args}"
end
|
25
|
+
|
26
|
+
# Drop some variables to create the remaining sub DataFrame.
#
# The dropper comes either from the arguments or from the value of the block
# (evaluated with instance_eval), never both. It may be a list of
# Symbol/String keys, or a list of true/false/nil flags selecting keys by
# position. String keys are converted to Symbols before being subtracted from
# #keys (presumably Symbols — confirm against DataFrame#keys), so that
# drop('a') and drop(:a) behave alike.
#
# Returns an empty DataFrame when every variable is dropped.
#
# Raises DataFrameArgumentError when both arguments and a block are given, or
# when the dropper is neither keys nor booleans.
def drop(*args, &block)
  dropper = args
  if block
    raise DataFrameArgumentError, 'Must not specify both arguments and block.' unless args.empty?

    dropper = instance_eval(&block)
  end
  dropper = [dropper].flatten
  dropper = keys_by_booleans(dropper) if booleans?(dropper)

  # Validate before subtracting; afterwards only genuine keys are left and an
  # invalid argument could no longer be detected.
  raise DataFrameArgumentError, "Invalid argument #{args}" unless sym_or_str?(dropper)

  picker = keys - dropper.map(&:to_sym)
  return DataFrame.new if picker.empty?

  # DataFrame#[] creates a Vector when a single key is specified.
  # DataFrame#drop creates a DataFrame even with a single key.
  DataFrame.new(@table[picker])
end
|
46
|
+
|
47
|
+
# Rename variables to create a new DataFrame.
#
# The renamer comes either from the arguments or from the value of the block
# (evaluated with instance_eval), never both. Accepted forms:
#   rename(from, to)       - two Symbol/String keys
#   rename({ from => to }) - a single Hash of key pairs
# An empty renamer returns self.
#
# Raises DataFrameArgumentError when both arguments and a block are given, or
# when the renamer matches neither form.
def rename(*args, &block)
  renamer = args
  if block
    raise DataFrameArgumentError, 'Must not specify both arguments and a block' unless args.empty?

    renamer = instance_eval(&block)
  end
  renamer = [renamer].flatten
  return self if renamer.empty?

  if renamer.size == 2 && sym_or_str?(renamer)
    from, to = renamer
    rename_by_hash({ from => to })
  elsif renamer.one? && renamer[0].is_a?(Hash)
    rename_by_hash(renamer[0])
  else
    raise DataFrameArgumentError, "Invalid argument #{args}"
  end
end
|
63
|
+
|
64
|
+
# Assign variables to create a new DataFrame.
#
# The assigner comes either from the arguments or from the value of the block
# (evaluated with instance_eval), never both, and must be a single Hash of
# key => data. Data for a key already in #keys replaces that variable; data
# for any other key is appended as a new variable. String keys are converted
# to Symbols first, so that 'a' updates an existing :a instead of appending a
# duplicate column (presumably #keys holds Symbols — confirm).
#
# An empty or nil assigner returns self.
#
# Raises DataFrameArgumentError when both arguments and a block are given, or
# when the assigner is not a single Hash.
def assign(*args, &block)
  assigner = args
  if block
    raise DataFrameArgumentError, 'Must not specify both arguments and a block' unless args.empty?

    assigner = instance_eval(&block)
  end
  assigner = [assigner].flatten
  return self if assigner.empty? || assigner == [nil]

  raise DataFrameArgumentError, "Invalid argument #{args}" unless assigner.one? && assigner[0].is_a?(Hash)

  updater = {}
  appender = {}
  assigner[0].each do |key, value|
    key = key.to_sym if key.is_a?(String)
    if keys.include? key
      updater[key] = value
    else
      appender[key] = value
    end
  end
  fields, arrays = update_fields_and_arrays(updater)
  append_to_fields_and_arrays(appender, fields, arrays) unless appender.empty?

  DataFrame.new(Arrow::Table.new(Arrow::Schema.new(fields), arrays))
end
|
91
|
+
|
92
|
+
private
|
93
|
+
|
94
|
+
# Builds a new DataFrame whose fields are renamed according to key_pairs
# ({ from => to }); the column data is reused unchanged.
#
# String "from" names are converted to Symbols before lookup so that
# rename('a', 'b') matches the key :a (presumably #keys holds Symbols —
# confirm). Keys without an entry in key_pairs keep their existing field.
def rename_by_hash(key_pairs)
  renames = key_pairs.to_h { |from, to| [from.is_a?(String) ? from.to_sym : from, to] }
  fields = keys.map do |key|
    new_key = renames[key]
    if new_key
      Arrow::Field.new(new_key.to_sym, @table[key].data_type)
    else
      @table.schema[key]
    end
  end
  schema = Arrow::Schema.new(fields)
  DataFrame.new(Arrow::Table.new(schema, @table.columns))
end
|
106
|
+
|
107
|
+
# Returns [fields, arrays] for the current table, with the entries for every
# key that has data in updater replaced by a field/chunked array built from
# that data. Keys whose updater entry is nil or false are left untouched.
#
# Raises DataFrameArgumentError when the data size differs from size.
def update_fields_and_arrays(updater)
  columns = @table.columns
  fields = columns.map(&:field)
  arrays = columns.map(&:data) # chunked_arrays
  keys.each.with_index do |key, position|
    data = updater[key]
    next unless data

    raise DataFrameArgumentError, "Data size mismatch (#{data.size} != #{size})" unless data.size == size

    source = data.is_a?(Vector) ? data.to_a : data
    arrow_array = Arrow::Array.new(source)
    fields[position] = Arrow::Field.new(key, arrow_array.value_data_type)
    arrays[position] = Arrow::ChunkedArray.new([arrow_array])
  end
  [fields, arrays]
end
|
122
|
+
|
123
|
+
# Appends one field and one chunked array per entry of appender to the given
# fields and arrays (both are mutated in place). Keys are converted with
# to_sym.
#
# Raises DataFrameArgumentError when the data size differs from size.
def append_to_fields_and_arrays(appender, fields, arrays)
  appender.each do |key, data|
    raise DataFrameArgumentError, "Data size mismatch (#{data.size} != #{size})" unless data.size == size

    source = data.is_a?(Vector) ? data.to_a : data
    arrow_array = Arrow::Array.new(source)
    fields.push(Arrow::Field.new(key.to_sym, arrow_array.value_data_type))
    arrays.push(Arrow::ChunkedArray.new([arrow_array]))
  end
end
|
132
|
+
end
|
133
|
+
end
|
data/lib/red_amber/vector.rb
CHANGED
@@ -5,10 +5,12 @@ module RedAmber
|
|
5
5
|
# @data : holds Arrow::ChunkedArray
|
6
6
|
class Vector
|
7
7
|
# mix-in
|
8
|
+
include VectorCompensable
|
8
9
|
include VectorFunctions
|
9
10
|
|
10
11
|
# chunked_array may come from column.data
|
11
12
|
def initialize(array)
|
13
|
+
@key = nil # default is 'headless'
|
12
14
|
case array
|
13
15
|
when Vector
|
14
16
|
@data = array.data
|
@@ -17,18 +19,31 @@ module RedAmber
|
|
17
19
|
when Array
|
18
20
|
@data = Arrow::Array.new(array)
|
19
21
|
else
|
20
|
-
raise
|
22
|
+
raise VectorArgumentError, 'Unknown array in argument'
|
21
23
|
end
|
22
24
|
end
|
23
25
|
|
24
26
|
attr_reader :data
|
27
|
+
attr_accessor :key
|
25
28
|
|
26
29
|
def to_s
|
27
30
|
@data.to_a.inspect
|
28
31
|
end
|
29
32
|
|
30
|
-
def inspect
|
31
|
-
|
33
|
+
# Returns a two-line description: a "#<Class(:type, size=N):0x...>" header
# and a bracketed preview of the elements. Elements are added to the preview
# while it stays shorter than limit characters; after that ', ... ' marks the
# truncation.
def inspect(limit: 80)
  sio = StringIO.new << '['
  to_a.each_with_index do |element, index|
    separator = sio.size > 1 ? ', ' : ''
    next_str = "#{separator}#{element.inspect}"
    unless (sio.size + next_str.size) < limit
      sio << ', ... ' if index < size
      break
    end

    sio << next_str
  end
  sio << ']'

  format "#<#{self.class}(:#{type}, size=#{size}):0x%016x>\n%s\n", object_id, sio.string
end
|
33
48
|
|
34
49
|
def values
|
@@ -49,8 +64,24 @@ module RedAmber
|
|
49
64
|
@data.value_type.nick.to_sym
|
50
65
|
end
|
51
66
|
|
52
|
-
def
|
53
|
-
|
67
|
+
# True when the vector's type is :boolean.
def boolean?
  type == :boolean
end
|
70
|
+
|
71
|
+
# Truthy when the vector's data type class is a subclass of
# Arrow::NumericDataType (the result of Module#<, so it may be nil).
def numeric?
  type_class < Arrow::NumericDataType
end
|
74
|
+
|
75
|
+
# True when the vector's type is :string.
def string?
  type == :string
end
|
78
|
+
|
79
|
+
# Truthy when the vector's data type class is a subclass of
# Arrow::TemporalDataType (the result of Module#<, so it may be nil).
def temporal?
  type_class < Arrow::TemporalDataType
end
|
82
|
+
|
83
|
+
# Returns the class of the underlying data's value data type.
def type_class
  @data.value_data_type.class
end
|
55
86
|
|
56
87
|
# def each() end
|
@@ -66,11 +97,32 @@ module RedAmber
|
|
66
97
|
# def each_chunk() end
|
67
98
|
|
68
99
|
# Returns a Hash of value => count.
# For floating point data containing NaN, the separate NaN entries produced
# by Array#tally are removed and replaced by one Float::NAN entry, added
# last, holding their combined count.
def tally
  counts = values.tally
  return counts unless (type_class < Arrow::FloatingPointDataType) && is_nan.any

  nan_total = 0
  counts.delete_if do |key, count|
    next false unless key.is_a?(Float) && key.nan?

    nan_total += count
    true
  end
  counts[Float::NAN] = nan_total
  counts
end
|
113
|
+
|
114
|
+
# Runs Arrow's :value_counts function on the data and returns the result as a
# Hash built by zipping the two fields of the result (values and counts).
def value_counts
  result = Arrow::Function.find(:value_counts).execute([data]).value
  uniques, counts = result.fields
  uniques.zip(counts).to_h
end
|
71
118
|
|
72
119
|
def n_nulls
|
73
120
|
@data.n_nulls
|
74
121
|
end
|
122
|
+
alias_method :n_nils, :n_nulls
|
123
|
+
|
124
|
+
# Returns the number of NaN elements; 0 for non-numeric vectors.
def n_nans
  return 0 unless numeric?

  is_nan.to_a.count(true)
end
|
75
127
|
end
|
76
128
|
end
|