daru 0.1.2 → 0.1.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +2 -0
- data/.rubocop.yml +99 -0
- data/.rubocop_todo.yml +44 -0
- data/.travis.yml +3 -1
- data/CONTRIBUTING.md +5 -1
- data/History.md +43 -0
- data/README.md +3 -4
- data/benchmarks/duplicating.rb +45 -0
- data/benchmarks/group_by.rb +7 -7
- data/benchmarks/joining.rb +52 -0
- data/benchmarks/sorting.rb +9 -2
- data/benchmarks/statistics.rb +39 -0
- data/daru.gemspec +4 -4
- data/lib/daru.rb +9 -9
- data/lib/daru/accessors/array_wrapper.rb +15 -11
- data/lib/daru/accessors/dataframe_by_row.rb +1 -1
- data/lib/daru/accessors/gsl_wrapper.rb +30 -19
- data/lib/daru/accessors/mdarray_wrapper.rb +1 -3
- data/lib/daru/accessors/nmatrix_wrapper.rb +15 -15
- data/lib/daru/core/group_by.rb +69 -16
- data/lib/daru/core/merge.rb +135 -151
- data/lib/daru/core/query.rb +9 -30
- data/lib/daru/dataframe.rb +476 -439
- data/lib/daru/date_time/index.rb +150 -137
- data/lib/daru/date_time/offsets.rb +45 -41
- data/lib/daru/extensions/rserve.rb +4 -4
- data/lib/daru/index.rb +88 -64
- data/lib/daru/io/io.rb +33 -34
- data/lib/daru/io/sql_data_source.rb +11 -11
- data/lib/daru/maths/arithmetic/dataframe.rb +19 -19
- data/lib/daru/maths/arithmetic/vector.rb +9 -14
- data/lib/daru/maths/statistics/dataframe.rb +89 -61
- data/lib/daru/maths/statistics/vector.rb +226 -97
- data/lib/daru/monkeys.rb +23 -30
- data/lib/daru/plotting/dataframe.rb +27 -28
- data/lib/daru/plotting/vector.rb +12 -13
- data/lib/daru/vector.rb +221 -330
- data/lib/daru/version.rb +2 -2
- data/spec/core/group_by_spec.rb +16 -0
- data/spec/core/merge_spec.rb +30 -14
- data/spec/dataframe_spec.rb +268 -14
- data/spec/index_spec.rb +23 -5
- data/spec/io/io_spec.rb +37 -16
- data/spec/math/statistics/dataframe_spec.rb +40 -8
- data/spec/math/statistics/vector_spec.rb +135 -10
- data/spec/monkeys_spec.rb +3 -3
- data/spec/vector_spec.rb +157 -25
- metadata +41 -21
data/lib/daru/core/merge.rb
CHANGED
@@ -4,17 +4,17 @@ module Daru
|
|
4
4
|
class << self
|
5
5
|
def replace_keys_if_duplicates hash, matcher
|
6
6
|
matched = nil
|
7
|
-
hash.keys.each { |d|
|
7
|
+
hash.keys.each { |d|
|
8
8
|
if matcher.match(Regexp.new(d.to_s))
|
9
9
|
matched = d
|
10
10
|
break
|
11
|
-
end
|
11
|
+
end
|
12
12
|
}
|
13
13
|
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
14
|
+
return unless matched
|
15
|
+
|
16
|
+
hash[matcher] = hash[matched]
|
17
|
+
hash.delete matched
|
18
18
|
end
|
19
19
|
|
20
20
|
def resolve_duplicates df_hash1, df_hash2, on
|
@@ -29,198 +29,182 @@ module Daru
|
|
29
29
|
end
|
30
30
|
|
31
31
|
def hashify df
|
32
|
-
hsh = df.
|
32
|
+
hsh = df.to_h
|
33
33
|
hsh.each { |k,v| hsh[k] = v.to_a }
|
34
34
|
hsh
|
35
35
|
end
|
36
|
-
|
36
|
+
|
37
37
|
def arrayify df
|
38
38
|
arr = df.to_a
|
39
39
|
col_names = arr[0][0].keys
|
40
|
-
values = arr[0].map
|
40
|
+
values = arr[0].map(&:values)
|
41
41
|
|
42
|
-
|
42
|
+
[col_names, values]
|
43
43
|
end
|
44
44
|
|
45
|
-
def
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
end
|
45
|
+
def arrayify_with_sort_keys(size, df_hash, on)
|
46
|
+
# Converting to a hash and then to an array is more complex
|
47
|
+
# than using df.to_a or df.map(:row). However, it's
|
48
|
+
# substantially faster this way.
|
50
49
|
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
end
|
58
|
-
end
|
50
|
+
# idx_keys = on.map { |key| df_hash.keys.index(key) }
|
51
|
+
|
52
|
+
(0...size).reduce([]) do |r, idx|
|
53
|
+
key_values = on.map { |col| df_hash[col][idx] }
|
54
|
+
row_values = df_hash.map { |_col, val| val[idx] }
|
55
|
+
r << [key_values, row_values]
|
59
56
|
end
|
60
57
|
|
61
|
-
|
58
|
+
# Conceptually simpler and does the same thing, but slows down the
|
59
|
+
# total merge algorithm by 2x. Would be nice to improve the performance
|
60
|
+
# of df.map(:row)
|
61
|
+
#
|
62
|
+
# df.map(:row) do |row|
|
63
|
+
# key_values = on.map { |key| row[key] }
|
64
|
+
# [key_values, row.to_a]
|
65
|
+
# end
|
62
66
|
end
|
63
67
|
|
64
|
-
def
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
if (col_names1.include?(name))
|
73
|
-
col_names1[col_names1.index(name)] = (name.to_s + "_1").to_sym unless on.include?(name)
|
74
|
-
(name.to_s + "_2").to_sym
|
75
|
-
else
|
76
|
-
name
|
77
|
-
end
|
78
|
-
end
|
68
|
+
def verify_dataframes df_hash1, df_hash2, on
|
69
|
+
raise ArgumentError,
|
70
|
+
'All fields in :on must be present in self' unless on.all? { |e| df_hash1[e] }
|
71
|
+
raise ArgumentError,
|
72
|
+
'All fields in :on must be present in other DF' unless on.all? { |e| df_hash2[e] }
|
73
|
+
end
|
74
|
+
end
|
75
|
+
end
|
79
76
|
|
80
|
-
|
81
|
-
|
82
|
-
|
77
|
+
class MergeFrame
|
78
|
+
def initialize(df1, df2, on: nil)
|
79
|
+
@df1 = df1
|
80
|
+
@df2 = df2
|
81
|
+
@on = on
|
82
|
+
end
|
83
83
|
|
84
|
-
|
85
|
-
|
86
|
-
|
84
|
+
def inner _opts
|
85
|
+
merge_join(left: false, right: false)
|
86
|
+
end
|
87
87
|
|
88
|
-
|
89
|
-
|
88
|
+
def left _opts
|
89
|
+
merge_join(left: true, right: false)
|
90
|
+
end
|
90
91
|
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
if (bf.include?(x))
|
95
|
-
{x_ind => on_cols2.each_index.select{|y_ind| on_cols2[y_ind] == x}}
|
96
|
-
else
|
97
|
-
{x_ind => []}
|
98
|
-
end
|
99
|
-
end
|
100
|
-
.reduce({}) {|h,pairs| pairs.each {|k,v| (h[k] ||= []) << v}; h}
|
101
|
-
.flat_map{|ind1, inds2| inds2.flatten.map{|ind2| [table1[ind1], table2[ind2]].flatten} if inds2.flatten.size > 0}
|
92
|
+
def right _opts
|
93
|
+
merge_join(left: false, right: true)
|
94
|
+
end
|
102
95
|
|
103
|
-
|
104
|
-
|
105
|
-
|
96
|
+
def outer _opts
|
97
|
+
merge_join(left: true, right: true)
|
98
|
+
end
|
106
99
|
|
107
|
-
|
108
|
-
|
100
|
+
def merge_join(left: true, right: true)
|
101
|
+
MergeHelper.verify_dataframes df1_hash, df2_hash, @on
|
102
|
+
MergeHelper.resolve_duplicates df1_hash, df2_hash, @on
|
109
103
|
|
110
|
-
|
111
|
-
|
112
|
-
|
104
|
+
# TODO: Use native dataframe sorting.
|
105
|
+
# It would be ideal to reuse sorting functionality that is native
|
106
|
+
# to dataframes. Unfortunately, native dataframe sort introduces
|
107
|
+
# an overhead that reduces join performance by a factor of 4! Until
|
108
|
+
# that aspect is improved, we resort to a simpler array sort.
|
109
|
+
df1_array.sort_by! { |row| [row[0].nil? ? 0 : 1, row[0]] }
|
110
|
+
df2_array.sort_by! { |row| [row[0].nil? ? 0 : 1, row[0]] }
|
113
111
|
|
114
|
-
|
115
|
-
|
116
|
-
end
|
112
|
+
idx1 = 0
|
113
|
+
idx2 = 0
|
117
114
|
|
118
|
-
|
119
|
-
joined_hash = {}
|
120
|
-
((df_hash1.keys - on) | on | (df_hash2.keys - on)).each do |k|
|
121
|
-
joined_hash[k] = []
|
122
|
-
end
|
115
|
+
while idx1 < @df1.size || idx2 < @df2.size
|
123
116
|
|
124
|
-
|
125
|
-
|
126
|
-
joined = false
|
127
|
-
(0...df2.size).each do |id2|
|
128
|
-
if on.all? { |n| df_hash1[n][id1] == df_hash2[n][id2] }
|
129
|
-
joined = true
|
130
|
-
joined_hash.each do |k,v|
|
131
|
-
v << (df_hash1.has_key?(k) ? df_hash1[k][id1] : df_hash2[k][id2])
|
132
|
-
end
|
133
|
-
end
|
134
|
-
end
|
117
|
+
key1 = df1_array[idx1][0] if idx1 < @df1.size
|
118
|
+
key2 = df2_array[idx2][0] if idx2 < @df2.size
|
135
119
|
|
136
|
-
|
137
|
-
|
138
|
-
joined_hash[k] << df_hash1[k][id1]
|
139
|
-
end
|
120
|
+
if key1 == key2 && idx1 < @df1.size && idx2 < @df2.size
|
121
|
+
idx2_start = idx2
|
140
122
|
|
141
|
-
|
142
|
-
|
143
|
-
|
144
|
-
joined = false
|
123
|
+
while (idx2 < @df2.size) && (df1_array[idx1][0] == df2_array[idx2][0])
|
124
|
+
add_merge_row_to_hash([df1_array[idx1], df2_array[idx2]], joined_hash)
|
125
|
+
idx2 += 1
|
145
126
|
end
|
127
|
+
|
128
|
+
idx2 = idx2_start if idx1+1 < @df1.size && df1_array[idx1][0] == df1_array[idx1+1][0]
|
129
|
+
idx1 += 1
|
130
|
+
elsif ((key2.nil? || [key1,key2].sort == [key1,key2]) && idx1 < @df1.size) || idx2 == @df2.size
|
131
|
+
add_merge_row_to_hash([df1_array[idx1], nil], joined_hash) if left
|
132
|
+
idx1 += 1
|
133
|
+
elsif idx2 < @df2.size || idx1 == @df1.size
|
134
|
+
add_merge_row_to_hash([nil, df2_array[idx2]], joined_hash) if right
|
135
|
+
idx2 += 1
|
136
|
+
else
|
137
|
+
raise 'Unexpected condition met during merge'
|
146
138
|
end
|
139
|
+
end
|
147
140
|
|
148
|
-
|
149
|
-
|
141
|
+
Daru::DataFrame.new(joined_hash, order: joined_hash.keys)
|
142
|
+
end
|
143
|
+
|
144
|
+
private
|
145
|
+
|
146
|
+
def joined_hash
|
147
|
+
return @joined_hash if @joined_hash
|
148
|
+
@joined_hash ||= {}
|
149
|
+
|
150
|
+
((df1_keys - @on) | @on | (df2_keys - @on)).each do |k|
|
151
|
+
@joined_hash[k] = []
|
150
152
|
end
|
151
153
|
|
152
|
-
|
153
|
-
|
154
|
-
((df_hash1.keys - on) | on | (df_hash2.keys - on)).each do |k|
|
155
|
-
joined_hash[k] = []
|
156
|
-
end
|
154
|
+
@joined_hash
|
155
|
+
end
|
157
156
|
|
158
|
-
|
159
|
-
|
160
|
-
|
161
|
-
if on.all? { |n| df_hash2[n][id1] == df_hash1[n][id2] }
|
162
|
-
joined = true
|
163
|
-
joined_hash.each do |k,v|
|
164
|
-
v << (df_hash2.has_key?(k) ? df_hash2[k][id1] : df_hash1[k][id2])
|
165
|
-
end
|
166
|
-
end
|
167
|
-
end
|
157
|
+
def df1_hash
|
158
|
+
@df1_hash ||= MergeHelper.hashify @df1
|
159
|
+
end
|
168
160
|
|
169
|
-
|
170
|
-
|
171
|
-
|
172
|
-
end
|
161
|
+
def df2_hash
|
162
|
+
@df2_hash ||= MergeHelper.hashify @df2
|
163
|
+
end
|
173
164
|
|
174
|
-
|
175
|
-
|
176
|
-
|
177
|
-
joined = false
|
178
|
-
end
|
179
|
-
end
|
165
|
+
def df1_array
|
166
|
+
@df1_array ||= MergeHelper.arrayify_with_sort_keys @df1.size, df1_hash, @on
|
167
|
+
end
|
180
168
|
|
181
|
-
|
182
|
-
|
183
|
-
|
169
|
+
def df2_array
|
170
|
+
@df2_array ||= MergeHelper.arrayify_with_sort_keys @df2.size, df2_hash, @on
|
171
|
+
end
|
184
172
|
|
185
|
-
|
186
|
-
|
187
|
-
|
188
|
-
|
189
|
-
|
173
|
+
def df1_keys
|
174
|
+
df1_hash.keys
|
175
|
+
end
|
176
|
+
|
177
|
+
def df2_keys
|
178
|
+
df2_hash.keys
|
179
|
+
end
|
180
|
+
|
181
|
+
# Private: The merge row contains two elements, the first is the row from the
|
182
|
+
# first dataframe, the second is the row from the second dataframe.
|
183
|
+
def add_merge_row_to_hash row, hash
|
184
|
+
@df1_key_to_index ||= df1_keys.each_with_index.map { |k,idx| [k, idx] }.to_h
|
185
|
+
@df2_key_to_index ||= df2_keys.each_with_index.map { |k,idx| [k, idx] }.to_h
|
186
|
+
|
187
|
+
hash.each do |k,v|
|
188
|
+
v ||= []
|
189
|
+
|
190
|
+
left = df1_keys.include?(k) ? row[0] && row[0][1][@df1_key_to_index[k]] : nil
|
191
|
+
right = df2_keys.include?(k) ? row[1] && row[1][1][@df2_key_to_index[k]] : nil
|
192
|
+
|
193
|
+
v << (left || right)
|
190
194
|
end
|
191
195
|
end
|
192
196
|
end
|
197
|
+
|
193
198
|
# Private module containing methods for join, merge, concat operations on
|
194
199
|
# dataframes and vectors.
|
195
200
|
# @private
|
196
201
|
module Merge
|
197
202
|
class << self
|
198
203
|
def join df1, df2, opts={}
|
199
|
-
helper = MergeHelper
|
200
|
-
|
201
|
-
df_hash1 = helper.hashify df1
|
202
|
-
df_hash2 = helper.hashify df2
|
203
204
|
on = opts[:on]
|
204
205
|
|
205
|
-
|
206
|
-
|
207
|
-
|
208
|
-
case opts[:how]
|
209
|
-
when :inner
|
210
|
-
if Daru.has_bloomfilter_rb?
|
211
|
-
helper.bf_inner_join df1, df2, on
|
212
|
-
else
|
213
|
-
helper.inner_join df1, df2, df_hash1, df_hash2, on
|
214
|
-
end
|
215
|
-
when :outer
|
216
|
-
helper.full_outer_join df1, df2, df_hash1, df_hash2, on
|
217
|
-
when :left
|
218
|
-
helper.left_outer_join df1, df2, df_hash1, df_hash2, on
|
219
|
-
when :right
|
220
|
-
helper.right_outer_join df1, df2, df_hash1, df_hash2, on
|
221
|
-
else
|
222
|
-
raise ArgumentError, "Unrecognized option in :how => #{opts[:how]}"
|
223
|
-
end
|
206
|
+
mf = MergeFrame.new df1, df2, on: on
|
207
|
+
mf.send opts[:how], {}
|
224
208
|
end
|
225
209
|
end
|
226
210
|
end
|
data/lib/daru/core/query.rb
CHANGED
@@ -9,31 +9,19 @@ module Daru
|
|
9
9
|
end
|
10
10
|
|
11
11
|
def & other
|
12
|
-
|
13
|
-
other_barry = other.barry
|
14
|
-
@barry.each_with_index do |b, i|
|
15
|
-
new_bool << (b and other_barry[i])
|
16
|
-
end
|
17
|
-
|
18
|
-
BoolArray.new(new_bool)
|
12
|
+
BoolArray.new @barry.zip(other.barry).map { |b, o| b && o }
|
19
13
|
end
|
20
14
|
|
21
15
|
alias :and :&
|
22
16
|
|
23
17
|
def | other
|
24
|
-
|
25
|
-
other_barry = other.barry
|
26
|
-
@barry.each_with_index do |b, i|
|
27
|
-
new_bool << (b or other_barry[i])
|
28
|
-
end
|
29
|
-
|
30
|
-
BoolArray.new(new_bool)
|
18
|
+
BoolArray.new @barry.zip(other.barry).map { |b, o| b || o }
|
31
19
|
end
|
32
20
|
|
33
21
|
alias :or :|
|
34
22
|
|
35
23
|
def !
|
36
|
-
BoolArray.new(@barry.map
|
24
|
+
BoolArray.new(@barry.map(&:!))
|
37
25
|
end
|
38
26
|
|
39
27
|
def == other
|
@@ -45,27 +33,17 @@ module Daru
|
|
45
33
|
end
|
46
34
|
|
47
35
|
def inspect
|
48
|
-
"(#{self.class}:#{
|
36
|
+
"(#{self.class}:#{object_id} bool_arry=#{@barry})"
|
49
37
|
end
|
50
38
|
end
|
51
39
|
|
52
40
|
class << self
|
53
41
|
def apply_scalar_operator operator, data, other
|
54
|
-
|
55
|
-
memo << (d.send(operator, other) ? true : false)
|
56
|
-
memo
|
57
|
-
end
|
58
|
-
|
59
|
-
BoolArray.new(arry)
|
42
|
+
BoolArray.new data.map { |d| !!d.send(operator, other) }
|
60
43
|
end
|
61
44
|
|
62
45
|
def apply_vector_operator operator, vector, other
|
63
|
-
|
64
|
-
vector.each_with_index do |d, i|
|
65
|
-
bool_arry << (d.send(operator, other[i]) ? true : false)
|
66
|
-
end
|
67
|
-
|
68
|
-
BoolArray.new(bool_arry)
|
46
|
+
BoolArray.new vector.zip(other).map { |d, o| !!d.send(operator, o) }
|
69
47
|
end
|
70
48
|
|
71
49
|
def df_where data_frame, bool_array
|
@@ -74,7 +52,8 @@ module Daru
|
|
74
52
|
end
|
75
53
|
|
76
54
|
Daru::DataFrame.new(
|
77
|
-
vecs, order: data_frame.vectors, index: vecs[0].index, clone: false
|
55
|
+
vecs, order: data_frame.vectors, index: vecs[0].index, clone: false
|
56
|
+
)
|
78
57
|
end
|
79
58
|
|
80
59
|
def vector_where data, index, bool_array, dtype
|
@@ -92,4 +71,4 @@ module Daru
|
|
92
71
|
end
|
93
72
|
end
|
94
73
|
end
|
95
|
-
end
|
74
|
+
end
|
data/lib/daru/dataframe.rb
CHANGED
@@ -1,14 +1,11 @@
|
|
1
|
-
|
2
|
-
|
3
|
-
require '
|
4
|
-
require '
|
5
|
-
require '
|
6
|
-
require 'plotting/dataframe.rb'
|
7
|
-
require 'io/io.rb'
|
1
|
+
require 'daru/accessors/dataframe_by_row.rb'
|
2
|
+
require 'daru/maths/arithmetic/dataframe.rb'
|
3
|
+
require 'daru/maths/statistics/dataframe.rb'
|
4
|
+
require 'daru/plotting/dataframe.rb'
|
5
|
+
require 'daru/io/io.rb'
|
8
6
|
|
9
7
|
module Daru
|
10
8
|
class DataFrame
|
11
|
-
|
12
9
|
include Daru::Maths::Arithmetic::DataFrame
|
13
10
|
include Daru::Maths::Statistics::DataFrame
|
14
11
|
include Daru::Plotting::DataFrame if Daru.has_nyaplot?
|
@@ -115,31 +112,30 @@ module Daru
|
|
115
112
|
# Create DataFrame by specifying rows as an Array of Arrays or Array of
|
116
113
|
# Daru::Vector objects.
|
117
114
|
def rows source, opts={}
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
|
124
|
-
|
115
|
+
first = source.first
|
116
|
+
|
117
|
+
raise SizeError, 'All vectors must have same length' \
|
118
|
+
unless source.all? { |v| v.size == first.size }
|
119
|
+
|
120
|
+
index = []
|
121
|
+
opts[:order] ||=
|
122
|
+
case first
|
123
|
+
when Daru::Vector # assume that all are Vectors
|
124
|
+
index = source.map(&:name)
|
125
125
|
first.index.to_a
|
126
|
-
|
127
|
-
Array.new(first.size
|
126
|
+
when Array
|
127
|
+
Array.new(first.size, &:to_s)
|
128
128
|
end
|
129
129
|
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
|
130
|
+
if source.all? { |s| s.is_a?(Array) }
|
131
|
+
Daru::DataFrame.new(source.transpose, opts)
|
132
|
+
else # array of Daru::Vectors
|
133
|
+
Daru::DataFrame.new({}, opts).tap do |df|
|
134
134
|
source.each_with_index do |row, idx|
|
135
|
-
df[
|
135
|
+
df[index[idx] || idx, :row] = row
|
136
136
|
end
|
137
137
|
end
|
138
|
-
else
|
139
|
-
raise SizeError, "All vectors must have same length"
|
140
138
|
end
|
141
|
-
|
142
|
-
df
|
143
139
|
end
|
144
140
|
|
145
141
|
# Generates a new dataset, using three vectors
|
@@ -162,18 +158,16 @@ module Daru
|
|
162
158
|
#
|
163
159
|
# Useful to process outputs from databases
|
164
160
|
def crosstab_by_assignation rows, columns, values
|
165
|
-
raise
|
166
|
-
rows.size != columns.size
|
161
|
+
raise 'Three vectors should be equal size' if
|
162
|
+
rows.size != columns.size || rows.size!=values.size
|
167
163
|
|
168
164
|
cols_values = columns.factors
|
169
165
|
cols_n = cols_values.size
|
170
166
|
|
171
|
-
h_rows = rows.factors.
|
172
|
-
a[v] = cols_values.
|
167
|
+
h_rows = rows.factors.each_with_object({}) do |v, a|
|
168
|
+
a[v] = cols_values.each_with_object({}) do |v1, a1|
|
173
169
|
a1[v1]=nil
|
174
|
-
a1
|
175
170
|
end
|
176
|
-
a
|
177
171
|
end
|
178
172
|
|
179
173
|
values.each_index do |i|
|
@@ -250,7 +244,7 @@ module Daru
|
|
250
244
|
@data = []
|
251
245
|
|
252
246
|
temp_name = opts[:name]
|
253
|
-
@name
|
247
|
+
@name = temp_name || SecureRandom.uuid
|
254
248
|
|
255
249
|
if source.empty?
|
256
250
|
@vectors = try_create_index vectors
|
@@ -266,7 +260,7 @@ module Daru
|
|
266
260
|
@index = try_create_index(index || source[0].size)
|
267
261
|
@vectors = try_create_index(vectors)
|
268
262
|
|
269
|
-
@vectors.each_with_index do |
|
263
|
+
@vectors.each_with_index do |_vec,idx|
|
270
264
|
@data << Daru::Vector.new(source[idx], index: @index)
|
271
265
|
end
|
272
266
|
elsif source.all? { |s| s.is_a?(Daru::Vector) }
|
@@ -276,18 +270,18 @@ module Daru
|
|
276
270
|
end
|
277
271
|
initialize(hsh, index: index, order: vectors, name: @name, clone: clone)
|
278
272
|
else # array of hashes
|
279
|
-
|
280
|
-
|
281
|
-
|
282
|
-
|
283
|
-
(vectors + (source[0].keys - vectors)).uniq)
|
284
|
-
|
273
|
+
@vectors =
|
274
|
+
if vectors.nil?
|
275
|
+
Daru::Index.new source[0].keys
|
276
|
+
else
|
277
|
+
Daru::Index.new((vectors + (source[0].keys - vectors)).uniq)
|
278
|
+
end
|
285
279
|
@index = Daru::Index.new(index || source.size)
|
286
280
|
|
287
281
|
@vectors.each do |name|
|
288
282
|
v = []
|
289
|
-
source.each do |
|
290
|
-
v << (
|
283
|
+
source.each do |h|
|
284
|
+
v << (h[name] || h[name.to_s])
|
291
285
|
end
|
292
286
|
|
293
287
|
@data << Daru::Vector.new(v, name: set_name(name), index: @index)
|
@@ -296,10 +290,10 @@ module Daru
|
|
296
290
|
when Hash
|
297
291
|
create_vectors_index_with vectors, source
|
298
292
|
if all_daru_vectors_in_source? source
|
293
|
+
vectors_have_same_index = all_vectors_have_equal_indexes?(source)
|
299
294
|
if !index.nil?
|
300
295
|
@index = try_create_index index
|
301
|
-
elsif
|
302
|
-
vectors_have_same_index = true
|
296
|
+
elsif vectors_have_same_index
|
303
297
|
@index = source.values[0].index.dup
|
304
298
|
else
|
305
299
|
all_indexes = []
|
@@ -320,14 +314,10 @@ module Daru
|
|
320
314
|
if vectors_have_same_index
|
321
315
|
v = source[vector].dup
|
322
316
|
else
|
323
|
-
v = Daru::Vector.new([], name: vector, index: @index)
|
317
|
+
v = Daru::Vector.new([], name: vector, metadata: source[vector].metadata.dup, index: @index)
|
324
318
|
|
325
319
|
@index.each do |idx|
|
326
|
-
|
327
|
-
v[idx] = source[vector][idx]
|
328
|
-
else
|
329
|
-
v[idx] = nil
|
330
|
-
end
|
320
|
+
v[idx] = source[vector].index.include?(idx) ? source[vector][idx] : nil
|
331
321
|
end
|
332
322
|
end
|
333
323
|
@data << v
|
@@ -339,7 +329,8 @@ module Daru
|
|
339
329
|
@index = try_create_index(index || source.values[0].size)
|
340
330
|
|
341
331
|
@vectors.each do |name|
|
342
|
-
|
332
|
+
meta_opt = source[name].respond_to?(:metadata) ? {metadata: source[name].metadata.dup} : {}
|
333
|
+
@data << Daru::Vector.new(source[name].dup, name: set_name(name), **meta_opt, index: @index)
|
343
334
|
end
|
344
335
|
end
|
345
336
|
end
|
@@ -350,17 +341,16 @@ module Daru
|
|
350
341
|
update
|
351
342
|
end
|
352
343
|
|
353
|
-
def vector
|
354
|
-
$stderr.puts
|
344
|
+
def vector(*)
|
345
|
+
$stderr.puts '#vector has been deprecated in favour of #[]. Please use that.'
|
355
346
|
self[*names]
|
356
347
|
end
|
357
348
|
|
358
349
|
# Access row or vector. Specify name of row/vector followed by axis(:row, :vector).
|
359
350
|
# Defaults to *:vector*. Use of this method is not recommended for accessing
|
360
|
-
# rows
|
361
|
-
# df.vector[:vec] for accessing vector with index *:vec*.
|
351
|
+
# rows. Use df.row[:a] for accessing row with index ':a'.
|
362
352
|
def [](*names)
|
363
|
-
if names[-1] == :vector
|
353
|
+
if names[-1] == :vector || names[-1] == :row
|
364
354
|
axis = names[-1]
|
365
355
|
names = names[0..-2]
|
366
356
|
else
|
@@ -368,9 +358,9 @@ module Daru
|
|
368
358
|
end
|
369
359
|
|
370
360
|
if axis == :vector
|
371
|
-
access_vector
|
361
|
+
access_vector(*names)
|
372
362
|
elsif axis == :row
|
373
|
-
access_row
|
363
|
+
access_row(*names)
|
374
364
|
else
|
375
365
|
raise IndexError, "Expected axis to be row or vector not #{axis}"
|
376
366
|
end
|
@@ -433,7 +423,7 @@ module Daru
|
|
433
423
|
|
434
424
|
src = []
|
435
425
|
vectors_to_dup.each do |vec|
|
436
|
-
src << @data[@vectors[vec]].
|
426
|
+
src << @data[@vectors[vec]].dup
|
437
427
|
end
|
438
428
|
new_order = Daru::Index.new(vectors_to_dup)
|
439
429
|
|
@@ -454,11 +444,10 @@ module Daru
|
|
454
444
|
# a view of the whole data frame otherwise.
|
455
445
|
def clone *vectors_to_clone
|
456
446
|
vectors_to_clone.flatten! unless vectors_to_clone.all? { |a| !a.is_a?(Array) }
|
457
|
-
|
447
|
+
vectors_to_clone = @vectors.to_a if vectors_to_clone.empty?
|
458
448
|
|
459
|
-
h = vectors_to_clone.
|
449
|
+
h = vectors_to_clone.each_with_object({}) do |vec, hsh|
|
460
450
|
hsh[vec] = self[vec]
|
461
|
-
hsh
|
462
451
|
end
|
463
452
|
Daru::DataFrame.new(h, clone: false)
|
464
453
|
end
|
@@ -476,9 +465,8 @@ module Daru
|
|
476
465
|
# Creates a new duplicate dataframe containing only rows
|
477
466
|
# without a single missing value.
|
478
467
|
def dup_only_valid vecs=nil
|
479
|
-
rows_with_nil = @data.
|
468
|
+
rows_with_nil = @data.each_with_object([]) do |vector, memo|
|
480
469
|
memo.concat vector.missing_positions
|
481
|
-
memo
|
482
470
|
end.uniq
|
483
471
|
|
484
472
|
row_indexes = @index.to_a
|
@@ -505,7 +493,7 @@ module Daru
|
|
505
493
|
alias_method :each_column, :each_vector
|
506
494
|
|
507
495
|
# Iterate over each vector along with the name of the vector
|
508
|
-
def each_vector_with_index
|
496
|
+
def each_vector_with_index
|
509
497
|
return to_enum(:each_vector_with_index) unless block_given?
|
510
498
|
|
511
499
|
@vectors.each do |vector|
|
@@ -518,7 +506,7 @@ module Daru
|
|
518
506
|
alias_method :each_column_with_index, :each_vector_with_index
|
519
507
|
|
520
508
|
# Iterate over each row
|
521
|
-
def each_row
|
509
|
+
def each_row
|
522
510
|
return to_enum(:each_row) unless block_given?
|
523
511
|
|
524
512
|
@index.each do |index|
|
@@ -528,7 +516,7 @@ module Daru
|
|
528
516
|
self
|
529
517
|
end
|
530
518
|
|
531
|
-
def each_row_with_index
|
519
|
+
def each_row_with_index
|
532
520
|
return to_enum(:each_row_with_index) unless block_given?
|
533
521
|
|
534
522
|
@index.each do |index|
|
@@ -552,7 +540,7 @@ module Daru
|
|
552
540
|
# * +axis+ - The axis to iterate over. Can be :vector (or :column)
|
553
541
|
# or :row. Default to :vector.
|
554
542
|
def each axis=:vector, &block
|
555
|
-
if axis == :vector
|
543
|
+
if axis == :vector || axis == :column
|
556
544
|
each_vector(&block)
|
557
545
|
elsif axis == :row
|
558
546
|
each_row(&block)
|
@@ -577,7 +565,7 @@ module Daru
|
|
577
565
|
# * +axis+ - The axis to iterate over. Can be :vector (or :column)
|
578
566
|
# or :row. Default to :vector.
|
579
567
|
def collect axis=:vector, &block
|
580
|
-
if axis == :vector
|
568
|
+
if axis == :vector || axis == :column
|
581
569
|
collect_vectors(&block)
|
582
570
|
elsif axis == :row
|
583
571
|
collect_rows(&block)
|
@@ -603,7 +591,7 @@ module Daru
|
|
603
591
|
# * +axis+ - The axis to map over. Can be :vector (or :column) or :row.
|
604
592
|
# Default to :vector.
|
605
593
|
def map axis=:vector, &block
|
606
|
-
if axis == :vector
|
594
|
+
if axis == :vector || axis == :column
|
607
595
|
map_vectors(&block)
|
608
596
|
elsif axis == :row
|
609
597
|
map_rows(&block)
|
@@ -621,7 +609,7 @@ module Daru
|
|
621
609
|
# * +axis+ - The axis to map over. Can be :vector (or :column) or :row.
|
622
610
|
# Default to :vector.
|
623
611
|
def map! axis=:vector, &block
|
624
|
-
if axis == :vector
|
612
|
+
if axis == :vector || axis == :column
|
625
613
|
map_vectors!(&block)
|
626
614
|
elsif axis == :row
|
627
615
|
map_rows!(&block)
|
@@ -646,7 +634,7 @@ module Daru
|
|
646
634
|
# * +axis+ - The axis to map over. Can be :vector (or :column) or :row.
|
647
635
|
# Default to :vector.
|
648
636
|
def recode axis=:vector, &block
|
649
|
-
if axis == :vector
|
637
|
+
if axis == :vector || axis == :column
|
650
638
|
recode_vectors(&block)
|
651
639
|
elsif axis == :row
|
652
640
|
recode_rows(&block)
|
@@ -682,17 +670,17 @@ module Daru
|
|
682
670
|
# row[:a] + row[:d] < 100
|
683
671
|
# end
|
684
672
|
def filter axis=:vector, &block
|
685
|
-
if axis == :vector
|
673
|
+
if axis == :vector || axis == :column
|
686
674
|
filter_vectors(&block)
|
687
675
|
elsif axis == :row
|
688
676
|
filter_rows(&block)
|
689
677
|
end
|
690
678
|
end
|
691
679
|
|
692
|
-
def recode_vectors
|
680
|
+
def recode_vectors
|
693
681
|
block_given? or return to_enum(:recode_vectors)
|
694
682
|
|
695
|
-
df =
|
683
|
+
df = dup
|
696
684
|
df.each_vector_with_index do |v, i|
|
697
685
|
ret = yield v
|
698
686
|
ret.is_a?(Daru::Vector) or
|
@@ -703,10 +691,10 @@ module Daru
|
|
703
691
|
df
|
704
692
|
end
|
705
693
|
|
706
|
-
def recode_rows
|
694
|
+
def recode_rows
|
707
695
|
block_given? or return to_enum(:recode_rows)
|
708
696
|
|
709
|
-
df =
|
697
|
+
df = dup
|
710
698
|
df.each_row_with_index do |r, i|
|
711
699
|
ret = yield r
|
712
700
|
ret.is_a?(Daru::Vector) or raise TypeError, "Every iteration must return Daru::Vector not #{ret.class}"
|
@@ -717,7 +705,7 @@ module Daru
|
|
717
705
|
end
|
718
706
|
|
719
707
|
# Map each vector and return an Array.
|
720
|
-
def map_vectors
|
708
|
+
def map_vectors
|
721
709
|
return to_enum(:map_vectors) unless block_given?
|
722
710
|
|
723
711
|
arry = []
|
@@ -729,7 +717,7 @@ module Daru
|
|
729
717
|
end
|
730
718
|
|
731
719
|
# Destructive form of #map_vectors
|
732
|
-
def map_vectors!
|
720
|
+
def map_vectors!
|
733
721
|
return to_enum(:map_vectors!) unless block_given?
|
734
722
|
|
735
723
|
vectors.dup.each do |n|
|
@@ -742,7 +730,7 @@ module Daru
|
|
742
730
|
end
|
743
731
|
|
744
732
|
# Map vectors along with the index.
|
745
|
-
def map_vectors_with_index
|
733
|
+
def map_vectors_with_index
|
746
734
|
return to_enum(:map_vectors_with_index) unless block_given?
|
747
735
|
|
748
736
|
dt = []
|
@@ -754,7 +742,7 @@ module Daru
|
|
754
742
|
end
|
755
743
|
|
756
744
|
# Map each row
|
757
|
-
def map_rows
|
745
|
+
def map_rows
|
758
746
|
return to_enum(:map_rows) unless block_given?
|
759
747
|
|
760
748
|
dt = []
|
@@ -765,7 +753,7 @@ module Daru
|
|
765
753
|
dt
|
766
754
|
end
|
767
755
|
|
768
|
-
def map_rows_with_index
|
756
|
+
def map_rows_with_index
|
769
757
|
return to_enum(:map_rows_with_index) unless block_given?
|
770
758
|
|
771
759
|
dt = []
|
@@ -776,13 +764,13 @@ module Daru
|
|
776
764
|
dt
|
777
765
|
end
|
778
766
|
|
779
|
-
def map_rows!
|
767
|
+
def map_rows!
|
780
768
|
return to_enum(:map_rows!) unless block_given?
|
781
769
|
|
782
770
|
index.dup.each do |i|
|
783
|
-
r = yield
|
771
|
+
r = yield row[i]
|
784
772
|
r.is_a?(Daru::Vector) or raise TypeError, "Returned object must be Daru::Vector not #{r.class}"
|
785
|
-
|
773
|
+
row[i] = r
|
786
774
|
end
|
787
775
|
|
788
776
|
self
|
@@ -790,7 +778,7 @@ module Daru
|
|
790
778
|
|
791
779
|
# Retrieves a Daru::Vector, based on the result of calculation
|
792
780
|
# performed on each row.
|
793
|
-
def collect_rows
|
781
|
+
def collect_rows
|
794
782
|
return to_enum(:collect_rows) unless block_given?
|
795
783
|
|
796
784
|
data = []
|
@@ -801,7 +789,7 @@ module Daru
|
|
801
789
|
Daru::Vector.new(data, index: @index)
|
802
790
|
end
|
803
791
|
|
804
|
-
def collect_row_with_index
|
792
|
+
def collect_row_with_index
|
805
793
|
return to_enum(:collect_row_with_index) unless block_given?
|
806
794
|
|
807
795
|
data = []
|
@@ -814,7 +802,7 @@ module Daru
|
|
814
802
|
|
815
803
|
# Retrieves a Daru::Vector, based on the result of calculation
|
816
804
|
# performed on each vector.
|
817
|
-
def collect_vectors
|
805
|
+
def collect_vectors
|
818
806
|
return to_enum(:collect_vectors) unless block_given?
|
819
807
|
|
820
808
|
data = []
|
@@ -825,7 +813,7 @@ module Daru
|
|
825
813
|
Daru::Vector.new(data, index: @vectors)
|
826
814
|
end
|
827
815
|
|
828
|
-
def collect_vector_with_index
|
816
|
+
def collect_vector_with_index
|
829
817
|
return to_enum(:collect_vector_with_index) unless block_given?
|
830
818
|
|
831
819
|
data = []
|
@@ -852,15 +840,19 @@ module Daru
|
|
852
840
|
Matrix.rows(rows)
|
853
841
|
end
|
854
842
|
|
855
|
-
|
856
843
|
# Delete a vector
|
857
844
|
def delete_vector vector
|
858
|
-
|
859
|
-
|
860
|
-
|
861
|
-
|
862
|
-
|
863
|
-
|
845
|
+
raise IndexError, "Vector #{vector} does not exist." unless @vectors.include?(vector)
|
846
|
+
|
847
|
+
@data.delete_at @vectors[vector]
|
848
|
+
@vectors = Daru::Index.new @vectors.to_a - [vector]
|
849
|
+
|
850
|
+
self
|
851
|
+
end
|
852
|
+
|
853
|
+
# Deletes a list of vectors
|
854
|
+
def delete_vectors *vectors
|
855
|
+
Array(vectors).each { |vec| delete_vector vec }
|
864
856
|
|
865
857
|
self
|
866
858
|
end
|
@@ -869,13 +861,10 @@ module Daru
|
|
869
861
|
def delete_row index
|
870
862
|
idx = named_index_for index
|
871
863
|
|
872
|
-
|
873
|
-
|
874
|
-
|
875
|
-
|
876
|
-
end
|
877
|
-
else
|
878
|
-
raise IndexError, "Index #{index} does not exist."
|
864
|
+
raise IndexError, "Index #{index} does not exist." unless @index.include? idx
|
865
|
+
@index = Daru::Index.new(@index.to_a - [idx])
|
866
|
+
each_vector do |vector|
|
867
|
+
vector.delete_at idx
|
879
868
|
end
|
880
869
|
|
881
870
|
set_size
|
@@ -895,7 +884,7 @@ module Daru
|
|
895
884
|
ds_boot
|
896
885
|
end
|
897
886
|
|
898
|
-
def keep_row_if
|
887
|
+
def keep_row_if
|
899
888
|
deletion = []
|
900
889
|
|
901
890
|
@index.each do |index|
|
@@ -908,7 +897,7 @@ module Daru
|
|
908
897
|
}
|
909
898
|
end
|
910
899
|
|
911
|
-
def keep_vector_if
|
900
|
+
def keep_vector_if
|
912
901
|
@vectors.each do |vector|
|
913
902
|
keep_vector = yield @data[@vectors[vector]], vector
|
914
903
|
|
@@ -923,27 +912,17 @@ module Daru
|
|
923
912
|
d.push(row[vec]) if yield row
|
924
913
|
end
|
925
914
|
|
926
|
-
Daru::Vector.new(d)
|
915
|
+
Daru::Vector.new(d, metadata: self[vec].metadata.dup)
|
927
916
|
end
|
928
917
|
|
929
918
|
# Iterates over each row and retains it in a new DataFrame if the block returns
|
930
919
|
# true for that row.
|
931
|
-
def filter_rows
|
920
|
+
def filter_rows
|
932
921
|
return to_enum(:filter_rows) unless block_given?
|
933
922
|
|
934
|
-
|
935
|
-
marked = []
|
936
|
-
|
937
|
-
@index.each do |index|
|
938
|
-
keep_row = yield access_row(index)
|
939
|
-
marked << index if keep_row
|
940
|
-
end
|
941
|
-
|
942
|
-
marked.each do |idx|
|
943
|
-
df.row[idx] = self[idx, :row]
|
944
|
-
end
|
923
|
+
keep_rows = @index.map { |index| yield access_row(index) }
|
945
924
|
|
946
|
-
|
925
|
+
where keep_rows
|
947
926
|
end
|
948
927
|
|
949
928
|
# Iterates over each vector and retains it in a new DataFrame if the block returns
|
@@ -951,8 +930,8 @@ module Daru
|
|
951
930
|
def filter_vectors &block
|
952
931
|
return to_enum(:filter_vectors) unless block_given?
|
953
932
|
|
954
|
-
df =
|
955
|
-
df.keep_vector_if
|
933
|
+
df = dup
|
934
|
+
df.keep_vector_if(&block)
|
956
935
|
|
957
936
|
df
|
958
937
|
end
|
@@ -962,7 +941,7 @@ module Daru
|
|
962
941
|
#
|
963
942
|
# The function returns an array with all errors.
|
964
943
|
def verify(*tests)
|
965
|
-
if
|
944
|
+
if tests[0].is_a? Symbol
|
966
945
|
id = tests[0]
|
967
946
|
tests.shift
|
968
947
|
else
|
@@ -974,13 +953,12 @@ module Daru
|
|
974
953
|
each(:row) do |row|
|
975
954
|
i += 1
|
976
955
|
tests.each do |test|
|
977
|
-
if
|
978
|
-
|
979
|
-
|
980
|
-
|
981
|
-
end
|
982
|
-
vr.push("#{i} [#{row[id]}]: #{test[0]}#{values}")
|
956
|
+
next if test[2].call(row)
|
957
|
+
values = ''
|
958
|
+
unless test[1].empty?
|
959
|
+
values = ' (' + test[1].collect { |k| "#{k}=#{row[k]}" }.join(', ') + ')'
|
983
960
|
end
|
961
|
+
vr.push("#{i} [#{row[id]}]: #{test[0]}#{values}")
|
984
962
|
end
|
985
963
|
end
|
986
964
|
vr
|
@@ -1051,7 +1029,7 @@ module Daru
|
|
1051
1029
|
alias :vector_missing_values :missing_values_rows
|
1052
1030
|
|
1053
1031
|
def has_missing_data?
|
1054
|
-
!!@data.any?
|
1032
|
+
!!@data.any?(&:has_missing_data?)
|
1055
1033
|
end
|
1056
1034
|
|
1057
1035
|
alias :flawed? :has_missing_data?
|
@@ -1075,9 +1053,9 @@ module Daru
|
|
1075
1053
|
name = row[tree_keys.last]
|
1076
1054
|
if !block
|
1077
1055
|
current[name] ||= []
|
1078
|
-
current[name].push(row.
|
1056
|
+
current[name].push(row.to_h.delete_if { |key,_value| tree_keys.include? key })
|
1079
1057
|
else
|
1080
|
-
current[name] =
|
1058
|
+
current[name] = yield(row, current, name)
|
1081
1059
|
end
|
1082
1060
|
end
|
1083
1061
|
|
@@ -1087,7 +1065,7 @@ module Daru
|
|
1087
1065
|
def vector_count_characters vecs=nil
|
1088
1066
|
vecs ||= @vectors.to_a
|
1089
1067
|
|
1090
|
-
|
1068
|
+
collect_rows do |row|
|
1091
1069
|
vecs.inject(0) do |memo, vec|
|
1092
1070
|
memo + (row[vec].nil? ? 0 : row[vec].to_s.size)
|
1093
1071
|
end
|
@@ -1129,7 +1107,7 @@ module Daru
|
|
1129
1107
|
# row[:a] < 3 and row[:b] == 'b'
|
1130
1108
|
# end #=> true
|
1131
1109
|
def any? axis=:vector, &block
|
1132
|
-
if axis == :vector
|
1110
|
+
if axis == :vector || axis == :column
|
1133
1111
|
@data.any?(&block)
|
1134
1112
|
elsif axis == :row
|
1135
1113
|
each_row do |row|
|
@@ -1151,7 +1129,7 @@ module Daru
|
|
1151
1129
|
# row[:a] < 10
|
1152
1130
|
# end #=> true
|
1153
1131
|
def all? axis=:vector, &block
|
1154
|
-
if axis == :vector
|
1132
|
+
if axis == :vector || axis == :column
|
1155
1133
|
@data.all?(&block)
|
1156
1134
|
elsif axis == :row
|
1157
1135
|
each_row do |row|
|
@@ -1236,46 +1214,52 @@ module Daru
|
|
1236
1214
|
# # ["foo", "two", 3]=>[2, 4]}
|
1237
1215
|
def group_by *vectors
|
1238
1216
|
vectors.flatten!
|
1239
|
-
vectors.each { |v|
|
1240
|
-
has_vector?(v)
|
1217
|
+
vectors.each { |v|
|
1218
|
+
raise(ArgumentError, "Vector #{v} does not exist") unless has_vector?(v)
|
1219
|
+
}
|
1241
1220
|
|
1242
1221
|
Daru::Core::GroupBy.new(self, vectors)
|
1243
1222
|
end
|
1244
1223
|
|
1245
1224
|
def reindex_vectors new_vectors
|
1246
|
-
raise ArgumentError,
|
1247
|
-
"subclasses, not #{new_index.class}" unless new_vectors.
|
1225
|
+
raise ArgumentError, 'Must pass the new index of type Index or its '\
|
1226
|
+
"subclasses, not #{new_index.class}" unless new_vectors.is_a?(Daru::Index)
|
1248
1227
|
|
1249
1228
|
cl = Daru::DataFrame.new({}, order: new_vectors, index: @index, name: @name)
|
1250
1229
|
new_vectors.each do |vec|
|
1251
|
-
|
1252
|
-
cl[vec] = self[vec]
|
1253
|
-
else
|
1254
|
-
cl[vec] = [nil]*nrows
|
1255
|
-
end
|
1230
|
+
cl[vec] = @vectors.include?(vec) ? self[vec] : cl[vec] = [nil]*nrows
|
1256
1231
|
end
|
1257
1232
|
|
1258
1233
|
cl
|
1259
1234
|
end
|
1260
1235
|
|
1261
1236
|
# Concatenate another DataFrame along corresponding columns.
|
1262
|
-
#
|
1237
|
+
# If columns do not exist in both dataframes, they are filled with nils
|
1263
1238
|
def concat other_df
|
1264
|
-
vectors =
|
1265
|
-
|
1266
|
-
|
1239
|
+
vectors = @vectors.to_a
|
1240
|
+
data = []
|
1241
|
+
|
1242
|
+
vectors.each do |v|
|
1243
|
+
other_vec = other_df.vectors.include?(v) ? other_df[v].to_a : [nil] * other_df.size
|
1244
|
+
data << self[v].dup.to_a.concat(other_vec)
|
1245
|
+
end
|
1246
|
+
|
1247
|
+
other_df.vectors.each do |v|
|
1248
|
+
next if vectors.include?(v)
|
1249
|
+
vectors << v
|
1250
|
+
data << ([nil] * size).concat(other_df[v].to_a)
|
1267
1251
|
end
|
1268
1252
|
|
1269
|
-
Daru::DataFrame.new(
|
1253
|
+
Daru::DataFrame.new(data, order: vectors)
|
1270
1254
|
end
|
1271
1255
|
|
1272
1256
|
# Set a particular column as the new DF
|
1273
1257
|
def set_index new_index, opts={}
|
1274
|
-
raise ArgumentError,
|
1258
|
+
raise ArgumentError, 'All elements in new index must be unique.' if
|
1275
1259
|
@size != self[new_index].uniq.size
|
1276
1260
|
|
1277
1261
|
self.index = Daru::Index.new(self[new_index].to_a)
|
1278
|
-
|
1262
|
+
delete_vector(new_index) unless opts[:keep]
|
1279
1263
|
|
1280
1264
|
self
|
1281
1265
|
end
|
@@ -1303,16 +1287,12 @@ module Daru
|
|
1303
1287
|
# # a 1 11
|
1304
1288
|
# # g nil nil
|
1305
1289
|
def reindex new_index
|
1306
|
-
raise ArgumentError,
|
1307
|
-
"subclasses, not #{new_index.class}" unless new_index.
|
1290
|
+
raise ArgumentError, 'Must pass the new index of type Index or its '\
|
1291
|
+
"subclasses, not #{new_index.class}" unless new_index.is_a?(Daru::Index)
|
1308
1292
|
|
1309
1293
|
cl = Daru::DataFrame.new({}, order: @vectors, index: new_index, name: @name)
|
1310
1294
|
new_index.each do |idx|
|
1311
|
-
|
1312
|
-
cl.row[idx] = self.row[idx]
|
1313
|
-
else
|
1314
|
-
cl.row[idx] = [nil]*ncols
|
1315
|
-
end
|
1295
|
+
cl.row[idx] = @index.include?(idx) ? row[idx] : [nil]*ncols
|
1316
1296
|
end
|
1317
1297
|
|
1318
1298
|
cl
|
@@ -1330,7 +1310,7 @@ module Daru
|
|
1330
1310
|
# df.index.to_a #=> ['a','b','c','d']
|
1331
1311
|
# df.row['a'].to_a #=> [1,11]
|
1332
1312
|
def index= idx
|
1333
|
-
@data.each { |vec| vec.index = idx}
|
1313
|
+
@data.each { |vec| vec.index = idx }
|
1334
1314
|
@index = idx
|
1335
1315
|
|
1336
1316
|
self
|
@@ -1347,8 +1327,8 @@ module Daru
|
|
1347
1327
|
# df.vectors = Daru::Index.new([:foo, :bar, :baz])
|
1348
1328
|
# df.vectors.to_a #=> [:foo, :bar, :baz]
|
1349
1329
|
def vectors= idx
|
1350
|
-
raise ArgumentError,
|
1351
|
-
index.
|
1330
|
+
raise ArgumentError, 'Can only reindex with Index and its subclasses' unless
|
1331
|
+
index.is_a?(Daru::Index)
|
1352
1332
|
raise ArgumentError, "Specified index length #{idx.size} not equal to"\
|
1353
1333
|
"dataframe size #{ncols}" if idx.size != ncols
|
1354
1334
|
|
@@ -1356,13 +1336,35 @@ module Daru
|
|
1356
1336
|
self
|
1357
1337
|
end
|
1358
1338
|
|
1339
|
+
# Renames the vectors
|
1340
|
+
#
|
1341
|
+
# == Arguments
|
1342
|
+
#
|
1343
|
+
# * name_map - A hash where the keys are the exising vector names and
|
1344
|
+
# the values are the new names. If a vector is renamed
|
1345
|
+
# to a vector name that is already in use, the existing
|
1346
|
+
# one is overwritten.
|
1347
|
+
#
|
1348
|
+
# == Usage
|
1349
|
+
#
|
1350
|
+
# df = Daru::DataFrame.new({ a: [1,2,3,4], b: [:a,:b,:c,:d], c: [11,22,33,44] })
|
1351
|
+
# df.rename_vectors :a => :alpha, :c => :gamma
|
1352
|
+
# df.vectors.to_a #=> [:alpha, :b, :gamma]
|
1353
|
+
def rename_vectors name_map
|
1354
|
+
existing_targets = name_map.select { |k,v| k != v }.values & vectors.to_a
|
1355
|
+
delete_vectors(*existing_targets)
|
1356
|
+
|
1357
|
+
new_names = vectors.to_a.map { |v| name_map[v] ? name_map[v] : v }
|
1358
|
+
self.vectors = Daru::Index.new new_names
|
1359
|
+
end
|
1360
|
+
|
1359
1361
|
# Return the indexes of all the numeric vectors. Will include vectors with nils
|
1360
1362
|
# alongwith numbers.
|
1361
1363
|
def numeric_vectors
|
1362
1364
|
numerics = []
|
1363
1365
|
|
1364
1366
|
each_vector_with_index do |vec, i|
|
1365
|
-
numerics << i if
|
1367
|
+
numerics << i if vec.type == :numeric
|
1366
1368
|
end
|
1367
1369
|
numerics
|
1368
1370
|
end
|
@@ -1371,7 +1373,7 @@ module Daru
|
|
1371
1373
|
numerics = []
|
1372
1374
|
|
1373
1375
|
@vectors.each do |v|
|
1374
|
-
numerics << v if
|
1376
|
+
numerics << v if self[v].type == :numeric
|
1375
1377
|
end
|
1376
1378
|
numerics
|
1377
1379
|
end
|
@@ -1382,9 +1384,8 @@ module Daru
|
|
1382
1384
|
def only_numerics opts={}
|
1383
1385
|
cln = opts[:clone] == false ? false : true
|
1384
1386
|
nv = numeric_vectors
|
1385
|
-
arry = nv.
|
1387
|
+
arry = nv.each_with_object([]) do |v, arr|
|
1386
1388
|
arr << self[v]
|
1387
|
-
arr
|
1388
1389
|
end
|
1389
1390
|
|
1390
1391
|
order = Index.new(nv)
|
@@ -1392,12 +1393,12 @@ module Daru
|
|
1392
1393
|
end
|
1393
1394
|
|
1394
1395
|
# Generate a summary of this DataFrame with ReportBuilder.
|
1395
|
-
def summary(method
|
1396
|
+
def summary(method=:to_text)
|
1396
1397
|
ReportBuilder.new(no_title: true).add(self).send(method)
|
1397
1398
|
end
|
1398
1399
|
|
1399
1400
|
def report_building(b) # :nodoc: #
|
1400
|
-
b.section(:name
|
1401
|
+
b.section(name: @name) do |g|
|
1401
1402
|
g.text "Number of rows: #{nrows}"
|
1402
1403
|
@vectors.each do |v|
|
1403
1404
|
g.text "Element:[#{v}]"
|
@@ -1406,8 +1407,8 @@ module Daru
|
|
1406
1407
|
end
|
1407
1408
|
end
|
1408
1409
|
|
1409
|
-
# Sorts a dataframe (ascending/descending)
|
1410
|
-
# vectors,
|
1410
|
+
# Sorts a dataframe (ascending/descending) in the given pripority sequence of
|
1411
|
+
# vectors, with or without a block.
|
1411
1412
|
#
|
1412
1413
|
# @param order [Array] The order of vector names in which the DataFrame
|
1413
1414
|
# should be sorted.
|
@@ -1415,42 +1416,121 @@ module Daru
|
|
1415
1416
|
# @option opts [TrueClass,FalseClass,Array] :ascending (true) Sort in ascending
|
1416
1417
|
# or descending order. Specify Array corresponding to *order* for multiple
|
1417
1418
|
# sort orders.
|
1418
|
-
# @option opts [Hash] :by ({|a
|
1419
|
+
# @option opts [Hash] :by (lambda{|a| a }) Specify attributes of objects to
|
1419
1420
|
# to be used for sorting, for each vector name in *order* as a hash of
|
1420
|
-
# vector name and lambda
|
1421
|
+
# vector name and lambda expressions. In case a lambda for a vector is not
|
1421
1422
|
# specified, the default will be used.
|
1423
|
+
# @option opts [TrueClass,FalseClass,Array] :handle_nils (false) Handle nils
|
1424
|
+
# automatically or not when a block is provided.
|
1425
|
+
# If set to True, nils will appear at top after sorting.
|
1422
1426
|
#
|
1423
|
-
#
|
1427
|
+
# @example Sort a dataframe with a vector sequence.
|
1428
|
+
#
|
1429
|
+
#
|
1430
|
+
# df = Daru::DataFrame.new({a: [1,2,1,2,3], b: [5,4,3,2,1]})
|
1431
|
+
#
|
1432
|
+
# df.sort [:a, :b]
|
1433
|
+
# # =>
|
1434
|
+
# # <Daru::DataFrame:30604000 @name = d6a9294e-2c09-418f-b646-aa9244653444 @size = 5>
|
1435
|
+
# # a b
|
1436
|
+
# # 2 1 3
|
1437
|
+
# # 0 1 5
|
1438
|
+
# # 3 2 2
|
1439
|
+
# # 1 2 4
|
1440
|
+
# # 4 3 1
|
1441
|
+
#
|
1442
|
+
# @example Sort a dataframe without a block. Here nils will be handled automatically.
|
1443
|
+
#
|
1444
|
+
# df = Daru::DataFrame.new({a: [-3,nil,-1,nil,5], b: [4,3,2,1,4]})
|
1445
|
+
#
|
1446
|
+
# df.sort([:a])
|
1447
|
+
# # =>
|
1448
|
+
# # <Daru::DataFrame:14810920 @name = c07fb5c7-2201-458d-b679-6a1f7ebfe49f @size = 5>
|
1449
|
+
# # a b
|
1450
|
+
# # 1 nil 3
|
1451
|
+
# # 3 nil 1
|
1452
|
+
# # 0 -3 4
|
1453
|
+
# # 2 -1 2
|
1454
|
+
# # 4 5 4
|
1455
|
+
#
|
1456
|
+
# @example Sort a dataframe with a block with nils handled automatically.
|
1457
|
+
#
|
1458
|
+
# df = Daru::DataFrame.new({a: [nil,-1,1,nil,-1,1], b: ['aaa','aa',nil,'baaa','x',nil] })
|
1424
1459
|
#
|
1425
|
-
# df
|
1460
|
+
# df.sort [:b], by: {b: lambda { |a| a.length } }
|
1461
|
+
# # NoMethodError: undefined method `length' for nil:NilClass
|
1462
|
+
# # from (pry):8:in `block in __pry__'
|
1426
1463
|
#
|
1427
|
-
#
|
1428
|
-
#
|
1429
|
-
# #
|
1430
|
-
# #
|
1431
|
-
# #
|
1432
|
-
# #
|
1433
|
-
#
|
1464
|
+
# df.sort [:b], by: {b: lambda { |a| a.length } }, handle_nils: true
|
1465
|
+
#
|
1466
|
+
# # =>
|
1467
|
+
# # <Daru::DataFrame:28469540 @name = 5f986508-556f-468b-be0c-88cc3534445c @size = 6>
|
1468
|
+
# # a b
|
1469
|
+
# # 2 1 nil
|
1470
|
+
# # 5 1 nil
|
1471
|
+
# # 4 -1 x
|
1472
|
+
# # 1 -1 aa
|
1473
|
+
# # 0 nil aaa
|
1474
|
+
# # 3 nil baaa
|
1475
|
+
#
|
1476
|
+
# @example Sort a dataframe with a block with nils handled manually.
|
1477
|
+
#
|
1478
|
+
# df = Daru::DataFrame.new({a: [nil,-1,1,nil,-1,1], b: ['aaa','aa',nil,'baaa','x',nil] })
|
1479
|
+
#
|
1480
|
+
# # To print nils at the bottom one can use lambda { |a| (a.nil?)[1]:[0,a.length] }
|
1481
|
+
# df.sort [:b], by: {b: lambda { |a| (a.nil?)?[1]:[0,a.length] } }, handle_nils: true
|
1482
|
+
#
|
1483
|
+
# # =>
|
1484
|
+
# #<Daru::DataFrame:22214180 @name = cd7703c7-1dca-4560-840b-5ea51a852ef9 @size = 6>
|
1485
|
+
# # a b
|
1486
|
+
# # 4 -1 x
|
1487
|
+
# # 1 -1 aa
|
1488
|
+
# # 0 nil aaa
|
1489
|
+
# # 3 nil baaa
|
1490
|
+
# # 2 1 nil
|
1491
|
+
# # 5 1 nil
|
1492
|
+
|
1434
1493
|
def sort! vector_order, opts={}
|
1435
|
-
raise ArgumentError,
|
1494
|
+
raise ArgumentError, 'Required atleast one vector name' if vector_order.empty?
|
1436
1495
|
opts = {
|
1437
1496
|
ascending: true,
|
1438
|
-
|
1497
|
+
handle_nils: false,
|
1439
1498
|
by: {}
|
1440
1499
|
}.merge(opts)
|
1441
1500
|
|
1442
|
-
opts[:by] = create_logic_blocks vector_order, opts[:by]
|
1443
1501
|
opts[:ascending] = sort_order_array vector_order, opts[:ascending]
|
1444
|
-
|
1445
|
-
|
1446
|
-
|
1502
|
+
opts[:handle_nils] = handle_nils_array vector_order, opts[:handle_nils]
|
1503
|
+
blocks = create_logic_blocks vector_order, opts[:by], opts[:ascending]
|
1504
|
+
|
1505
|
+
block = lambda do |r1, r2|
|
1506
|
+
# Build left and right array to compare two rows
|
1507
|
+
left = build_array_from_blocks vector_order, opts, blocks, r1, r2
|
1508
|
+
right = build_array_from_blocks vector_order, opts, blocks, r2, r1
|
1509
|
+
|
1510
|
+
# Resolve conflict by Index if all attributes are same
|
1511
|
+
left << r1
|
1512
|
+
right << r2
|
1513
|
+
left <=> right
|
1514
|
+
end
|
1515
|
+
|
1516
|
+
idx = (0..@index.size-1).sort(&block)
|
1517
|
+
|
1518
|
+
old_index = @index.to_a
|
1519
|
+
self.index = Daru::Index.new(idx.map { |i| old_index[i] })
|
1520
|
+
|
1521
|
+
vectors.each do |v|
|
1522
|
+
@data[@vectors[v]] = Daru::Vector.new(
|
1523
|
+
idx.map { |i| @data[@vectors[v]].data[i] },
|
1524
|
+
name: self[v].name, metadata: self[v].metadata.dup, index: index
|
1525
|
+
)
|
1526
|
+
end
|
1447
1527
|
|
1448
1528
|
self
|
1449
1529
|
end
|
1450
1530
|
|
1451
1531
|
# Non-destructive version of #sort!
|
1452
1532
|
def sort vector_order, opts={}
|
1453
|
-
|
1533
|
+
dup.sort! vector_order, opts
|
1454
1534
|
end
|
1455
1535
|
|
1456
1536
|
# Pivots a data frame on specified vectors and applies an aggregate function
|
@@ -1489,25 +1569,27 @@ module Daru
|
|
1489
1569
|
# # [:foo] 10 12
|
1490
1570
|
def pivot_table opts={}
|
1491
1571
|
raise ArgumentError,
|
1492
|
-
|
1572
|
+
'Specify grouping index' if !opts[:index] || opts[:index].empty?
|
1493
1573
|
|
1494
1574
|
index = opts[:index]
|
1495
1575
|
vectors = opts[:vectors] || []
|
1496
1576
|
aggregate_function = opts[:agg] || :mean
|
1497
1577
|
values =
|
1498
|
-
|
1499
|
-
|
1500
|
-
|
1501
|
-
|
1502
|
-
|
1503
|
-
|
1504
|
-
|
1578
|
+
if opts[:values].is_a?(Symbol)
|
1579
|
+
[opts[:values]]
|
1580
|
+
elsif opts[:values].is_a?(Array)
|
1581
|
+
opts[:values]
|
1582
|
+
else # nil
|
1583
|
+
(@vectors.to_a - (index | vectors)) & numeric_vector_names
|
1584
|
+
end
|
1505
1585
|
|
1506
|
-
raise IndexError,
|
1586
|
+
raise IndexError, 'No numeric vectors to aggregate' if values.empty?
|
1507
1587
|
|
1508
|
-
grouped
|
1588
|
+
grouped = group_by(index)
|
1509
1589
|
|
1510
|
-
|
1590
|
+
if vectors.empty?
|
1591
|
+
grouped.send(aggregate_function)
|
1592
|
+
else
|
1511
1593
|
super_hash = {}
|
1512
1594
|
values.each do |value|
|
1513
1595
|
grouped.groups.each do |group_name, row_numbers|
|
@@ -1548,8 +1630,6 @@ module Daru
|
|
1548
1630
|
end
|
1549
1631
|
end
|
1550
1632
|
return pivoted_dataframe
|
1551
|
-
else
|
1552
|
-
grouped.send(aggregate_function)
|
1553
1633
|
end
|
1554
1634
|
end
|
1555
1635
|
|
@@ -1561,8 +1641,8 @@ module Daru
|
|
1561
1641
|
raise "Number of rows must be equal in this: #{nrows} and other: #{other_df.nrows}" unless nrows == other_df.nrows
|
1562
1642
|
|
1563
1643
|
new_fields = (@vectors.to_a + other_df.vectors.to_a)
|
1564
|
-
|
1565
|
-
|
1644
|
+
.recode_repeated
|
1645
|
+
.map(&:to_sym)
|
1566
1646
|
df_new = DataFrame.new({}, order: new_fields)
|
1567
1647
|
|
1568
1648
|
(0...nrows).to_a.each do |i|
|
@@ -1603,7 +1683,6 @@ module Daru
|
|
1603
1683
|
Daru::Core::Merge.join(self, other_df, opts)
|
1604
1684
|
end
|
1605
1685
|
|
1606
|
-
|
1607
1686
|
# Creates a new dataset for one to many relations
|
1608
1687
|
# on a dataset, based on pattern of field names.
|
1609
1688
|
#
|
@@ -1632,26 +1711,25 @@ module Daru
|
|
1632
1711
|
# # ["white", "2", 20]
|
1633
1712
|
# # ]
|
1634
1713
|
def one_to_many(parent_fields, pattern)
|
1635
|
-
re = Regexp.new pattern.gsub(
|
1714
|
+
re = Regexp.new pattern.gsub('%v','(.+?)').gsub('%n','(\\d+?)')
|
1636
1715
|
ds_vars = parent_fields.dup
|
1637
1716
|
vars = []
|
1638
1717
|
max_n = 0
|
1639
|
-
h = parent_fields.
|
1718
|
+
h = parent_fields.each_with_object({}) { |v, a|
|
1640
1719
|
a[v] = Daru::Vector.new([])
|
1641
|
-
a
|
1642
1720
|
}
|
1643
1721
|
# Adding _row_id
|
1644
1722
|
h['_col_id'] = Daru::Vector.new([])
|
1645
1723
|
ds_vars.push('_col_id')
|
1646
1724
|
|
1647
1725
|
@vectors.each do |f|
|
1648
|
-
|
1649
|
-
|
1650
|
-
|
1651
|
-
|
1652
|
-
end
|
1653
|
-
max_n = $2.to_i if max_n < $2.to_i
|
1726
|
+
next unless f =~ re
|
1727
|
+
unless vars.include? $1
|
1728
|
+
vars.push($1)
|
1729
|
+
h[$1] = Daru::Vector.new([])
|
1654
1730
|
end
|
1731
|
+
|
1732
|
+
max_n = $2.to_i if max_n < $2.to_i
|
1655
1733
|
end
|
1656
1734
|
ds = DataFrame.new(h, order: ds_vars+vars)
|
1657
1735
|
|
@@ -1662,12 +1740,12 @@ module Daru
|
|
1662
1740
|
end
|
1663
1741
|
|
1664
1742
|
max_n.times do |n1|
|
1665
|
-
n
|
1743
|
+
n = n1+1
|
1666
1744
|
any_data = false
|
1667
1745
|
vars.each do |v|
|
1668
|
-
data = row[pattern.gsub(
|
1746
|
+
data = row[pattern.gsub('%v',v.to_s).gsub('%n',n.to_s)]
|
1669
1747
|
row_out[v] = data
|
1670
|
-
any_data = true
|
1748
|
+
any_data = true unless data.nil?
|
1671
1749
|
end
|
1672
1750
|
|
1673
1751
|
if any_data
|
@@ -1685,7 +1763,7 @@ module Daru
|
|
1685
1763
|
i = 1
|
1686
1764
|
split.each { |k,v|
|
1687
1765
|
new_field = name_.to_s + join + i.to_s
|
1688
|
-
v.rename name_.to_s +
|
1766
|
+
v.rename name_.to_s + ':' + k.to_s
|
1689
1767
|
self[new_field.to_sym] = v
|
1690
1768
|
i += 1
|
1691
1769
|
}
|
@@ -1707,11 +1785,11 @@ module Daru
|
|
1707
1785
|
# ds.create_sql('names')
|
1708
1786
|
# #=>"CREATE TABLE names (id INTEGER,\n name VARCHAR (255)) CHARACTER SET=UTF8;"
|
1709
1787
|
#
|
1710
|
-
def create_sql(table,charset=
|
1788
|
+
def create_sql(table,charset='UTF8')
|
1711
1789
|
sql = "CREATE TABLE #{table} ("
|
1712
|
-
fields =
|
1790
|
+
fields = vectors.to_a.collect do |f|
|
1713
1791
|
v = self[f]
|
1714
|
-
f.to_s +
|
1792
|
+
f.to_s + ' ' + v.db_type
|
1715
1793
|
end
|
1716
1794
|
|
1717
1795
|
sql + fields.join(",\n ")+") CHARACTER SET=#{charset};"
|
@@ -1724,14 +1802,14 @@ module Daru
|
|
1724
1802
|
numerics_as_arrays << self[n].to_a
|
1725
1803
|
end
|
1726
1804
|
|
1727
|
-
GSL::Matrix.alloc
|
1805
|
+
GSL::Matrix.alloc(*numerics_as_arrays.transpose)
|
1728
1806
|
end
|
1729
1807
|
|
1730
1808
|
# Convert all vectors of type *:numeric* into a Matrix.
|
1731
1809
|
def to_matrix
|
1732
1810
|
numerics_as_arrays = []
|
1733
1811
|
each_vector do |vector|
|
1734
|
-
numerics_as_arrays << vector.to_a if
|
1812
|
+
numerics_as_arrays << vector.to_a if vector.type == :numeric
|
1735
1813
|
end
|
1736
1814
|
|
1737
1815
|
Matrix.columns numerics_as_arrays
|
@@ -1746,8 +1824,8 @@ module Daru
|
|
1746
1824
|
def to_nmatrix
|
1747
1825
|
numerics_as_arrays = []
|
1748
1826
|
each_vector do |vector|
|
1749
|
-
numerics_as_arrays << vector.to_a if
|
1750
|
-
|
1827
|
+
numerics_as_arrays << vector.to_a if vector.type == :numeric &&
|
1828
|
+
vector.missing_positions.empty?
|
1751
1829
|
end
|
1752
1830
|
|
1753
1831
|
numerics_as_arrays.transpose.to_nm
|
@@ -1760,8 +1838,8 @@ module Daru
|
|
1760
1838
|
# in the array of hashes, which has the same index.
|
1761
1839
|
def to_a
|
1762
1840
|
arry = [[],[]]
|
1763
|
-
|
1764
|
-
arry[0] << row.
|
1841
|
+
each_row do |row|
|
1842
|
+
arry[0] << row.to_h
|
1765
1843
|
end
|
1766
1844
|
arry[1] = @index.to_a
|
1767
1845
|
|
@@ -1772,15 +1850,15 @@ module Daru
|
|
1772
1850
|
# in the JSON thus created.
|
1773
1851
|
def to_json no_index=true
|
1774
1852
|
if no_index
|
1775
|
-
|
1853
|
+
to_a[0].to_json
|
1776
1854
|
else
|
1777
|
-
|
1855
|
+
to_a.to_json
|
1778
1856
|
end
|
1779
1857
|
end
|
1780
1858
|
|
1781
|
-
# Converts DataFrame to a hash with keys as vector names and values as
|
1859
|
+
# Converts DataFrame to a hash (explicit) with keys as vector names and values as
|
1782
1860
|
# the corresponding vectors.
|
1783
|
-
def
|
1861
|
+
def to_h
|
1784
1862
|
hsh = {}
|
1785
1863
|
@vectors.each_with_index do |vec_name, idx|
|
1786
1864
|
hsh[vec_name] = @data[idx]
|
@@ -1791,12 +1869,12 @@ module Daru
|
|
1791
1869
|
|
1792
1870
|
# Convert to html for IRuby.
|
1793
1871
|
def to_html threshold=30
|
1794
|
-
html =
|
1795
|
-
|
1796
|
-
"<th colspan=\"#{@vectors.size+1}\">"
|
1797
|
-
"Daru::DataFrame:#{
|
1798
|
-
|
1799
|
-
|
1872
|
+
html = '<table>' \
|
1873
|
+
'<tr>' \
|
1874
|
+
"<th colspan=\"#{@vectors.size+1}\">" \
|
1875
|
+
"Daru::DataFrame:#{object_id} " + " rows: #{nrows} " + " cols: #{ncols}" \
|
1876
|
+
'</th>' \
|
1877
|
+
'</tr>'
|
1800
1878
|
html +='<tr><th></th>'
|
1801
1879
|
@vectors.each { |vector| html += '<th>' + vector.to_s + '</th>' }
|
1802
1880
|
html += '</tr>'
|
@@ -1805,26 +1883,26 @@ module Daru
|
|
1805
1883
|
html += '<tr>'
|
1806
1884
|
html += '<td>' + index.to_s + '</td>'
|
1807
1885
|
|
1808
|
-
|
1886
|
+
row[index].each do |element|
|
1809
1887
|
html += '<td>' + element.to_s + '</td>'
|
1810
1888
|
end
|
1811
1889
|
|
1812
1890
|
html += '</tr>'
|
1813
|
-
if num
|
1814
|
-
|
1815
|
-
|
1816
|
-
|
1817
|
-
|
1818
|
-
|
1819
|
-
|
1820
|
-
|
1821
|
-
|
1822
|
-
|
1823
|
-
|
1824
|
-
|
1825
|
-
html += '</tr>'
|
1826
|
-
break
|
1891
|
+
next if num <= threshold
|
1892
|
+
|
1893
|
+
html += '<tr>'
|
1894
|
+
(@vectors.size + 1).times { html += '<td>...</td>' }
|
1895
|
+
html += '</tr>'
|
1896
|
+
|
1897
|
+
last_index = @index.to_a.last
|
1898
|
+
last_row = row[last_index]
|
1899
|
+
html += '<tr>'
|
1900
|
+
html += '<td>' + last_index.to_s + '</td>'
|
1901
|
+
(0..(ncols - 1)).to_a.each do |i|
|
1902
|
+
html += '<td>' + last_row[i].to_s + '</td>'
|
1827
1903
|
end
|
1904
|
+
html += '</tr>'
|
1905
|
+
break
|
1828
1906
|
end
|
1829
1907
|
html += '</table>'
|
1830
1908
|
|
@@ -1841,7 +1919,7 @@ module Daru
|
|
1841
1919
|
# assignment/deletion of elements is done. Updating data this way is called
|
1842
1920
|
# lazy loading. To set or unset lazy loading, see the .lazy_update= method.
|
1843
1921
|
def update
|
1844
|
-
@data.each
|
1922
|
+
@data.each(&:update) if Daru.lazy_update
|
1845
1923
|
end
|
1846
1924
|
|
1847
1925
|
# Rename the DataFrame.
|
@@ -1890,19 +1968,18 @@ module Daru
|
|
1890
1968
|
Daru::IO.dataframe_write_sql self, dbh, table
|
1891
1969
|
end
|
1892
1970
|
|
1893
|
-
|
1894
1971
|
# Use marshalling to save dataframe to a file.
|
1895
1972
|
def save filename
|
1896
1973
|
Daru::IO.save self, filename
|
1897
1974
|
end
|
1898
1975
|
|
1899
|
-
def _dump
|
1900
|
-
Marshal.dump(
|
1976
|
+
def _dump(_depth)
|
1977
|
+
Marshal.dump(
|
1901
1978
|
data: @data,
|
1902
1979
|
index: @index.to_a,
|
1903
1980
|
order: @vectors.to_a,
|
1904
1981
|
name: @name
|
1905
|
-
|
1982
|
+
)
|
1906
1983
|
end
|
1907
1984
|
|
1908
1985
|
def self._load data
|
@@ -1939,29 +2016,29 @@ module Daru
|
|
1939
2016
|
longest = [@name.to_s.size,
|
1940
2017
|
(@vectors.map(&:to_s).map(&:size).max || 0),
|
1941
2018
|
(@index .map(&:to_s).map(&:size).max || 0),
|
1942
|
-
(@data .map{ |v| v.map(&:to_s).map(&:size).max}.max || 0)].max
|
2019
|
+
(@data .map { |v| v.map(&:to_s).map(&:size).max }.max || 0)].max
|
1943
2020
|
|
1944
2021
|
name = @name || 'nil'
|
1945
|
-
content =
|
2022
|
+
content = ''
|
1946
2023
|
longest = spacing if longest > spacing
|
1947
2024
|
formatter = "\n"
|
1948
2025
|
|
1949
2026
|
(@vectors.size + 1).times { formatter += "%#{longest}.#{longest}s " }
|
1950
|
-
content += "\n#<" + self.class.to_s +
|
1951
|
-
|
1952
|
-
content +=
|
2027
|
+
content += "\n#<" + self.class.to_s + ':' + object_id.to_s + ' @name = ' +
|
2028
|
+
name.to_s + ' @size = ' + @size.to_s + '>'
|
2029
|
+
content += formatter % ['', *@vectors.map(&:to_s)]
|
1953
2030
|
row_num = 1
|
1954
2031
|
|
1955
|
-
|
1956
|
-
content +=
|
2032
|
+
each_row_with_index do |row, index|
|
2033
|
+
content += formatter % [index.to_s, *row.to_h.values.map { |e| (e || 'nil').to_s }]
|
1957
2034
|
row_num += 1
|
1958
|
-
if row_num
|
1959
|
-
dots = []
|
2035
|
+
next if row_num <= threshold
|
1960
2036
|
|
1961
|
-
|
1962
|
-
|
1963
|
-
|
1964
|
-
|
2037
|
+
dots = []
|
2038
|
+
|
2039
|
+
(@vectors.size + 1).times { dots << '...' }
|
2040
|
+
content += formatter % dots
|
2041
|
+
break
|
1965
2042
|
end
|
1966
2043
|
content += "\n"
|
1967
2044
|
|
@@ -1974,24 +2051,24 @@ module Daru
|
|
1974
2051
|
end
|
1975
2052
|
|
1976
2053
|
def == other
|
1977
|
-
self.class == other.class
|
1978
|
-
|
1979
|
-
|
1980
|
-
|
1981
|
-
|
2054
|
+
self.class == other.class &&
|
2055
|
+
@size == other.size &&
|
2056
|
+
@index == other.index &&
|
2057
|
+
@vectors == other.vectors &&
|
2058
|
+
@vectors.to_a.all? { |v| self[v] == other[v] }
|
1982
2059
|
end
|
1983
2060
|
|
1984
2061
|
def method_missing(name, *args, &block)
|
1985
|
-
if
|
1986
|
-
insert_or_modify_vector name[/(.+)\=/].delete(
|
1987
|
-
elsif
|
2062
|
+
if name =~ /(.+)\=/
|
2063
|
+
insert_or_modify_vector name[/(.+)\=/].delete('=').to_sym, args[0]
|
2064
|
+
elsif has_vector? name
|
1988
2065
|
self[name]
|
1989
2066
|
else
|
1990
2067
|
super(name, *args, &block)
|
1991
2068
|
end
|
1992
2069
|
end
|
1993
2070
|
|
1994
|
-
|
2071
|
+
private
|
1995
2072
|
|
1996
2073
|
def possibly_multi_index? index
|
1997
2074
|
if @index.is_a?(MultiIndex)
|
@@ -2001,101 +2078,51 @@ module Daru
|
|
2001
2078
|
end
|
2002
2079
|
end
|
2003
2080
|
|
2004
|
-
def
|
2005
|
-
|
2006
|
-
|
2007
|
-
|
2008
|
-
|
2009
|
-
|
2010
|
-
|
2011
|
-
|
2012
|
-
|
2013
|
-
|
2014
|
-
|
2015
|
-
# right_upper -
|
2016
|
-
def recursive_quick_sort vector_order, index, by, ascending, left_lower, right_upper
|
2017
|
-
if left_lower < right_upper
|
2018
|
-
left_upper, right_lower = partition(vector_order, index, by, ascending, left_lower, right_upper)
|
2019
|
-
if left_upper - left_lower < right_upper - right_lower
|
2020
|
-
recursive_quick_sort(vector_order, index, by, ascending, left_lower, left_upper)
|
2021
|
-
recursive_quick_sort(vector_order, index, by, ascending, right_lower, right_upper)
|
2022
|
-
else
|
2023
|
-
recursive_quick_sort(vector_order, index, by, ascending, right_lower, right_upper)
|
2024
|
-
recursive_quick_sort(vector_order, index, by, ascending, left_lower, left_upper)
|
2025
|
-
end
|
2026
|
-
end
|
2027
|
-
end
|
2028
|
-
|
2029
|
-
def partition vector_order, index, by, ascending, left_lower, right_upper
|
2030
|
-
mindex = (left_lower + right_upper) / 2
|
2031
|
-
mvalues = vector_order.inject([]) { |a, vector_name| a << self[vector_name][mindex]; a }
|
2032
|
-
i = left_lower
|
2033
|
-
j = right_upper
|
2034
|
-
descending = ascending.map { |a| !a }
|
2035
|
-
|
2036
|
-
i += 1 while(keep?(i, mvalues, vector_order, ascending , by, 0))
|
2037
|
-
j -= 1 while(keep?(j, mvalues, vector_order, descending, by, 0))
|
2038
|
-
|
2039
|
-
while i < j - 1
|
2040
|
-
@data.each do |vector|
|
2041
|
-
vector[i], vector[j] = vector[j], vector[i]
|
2042
|
-
end
|
2043
|
-
index[i], index[j] = index[j], index[i]
|
2044
|
-
i += 1
|
2045
|
-
j -= 1
|
2046
|
-
|
2047
|
-
i += 1 while(keep?(i, mvalues, vector_order, ascending , by,0))
|
2048
|
-
j -= 1 while(keep?(j, mvalues, vector_order, descending, by,0))
|
2049
|
-
end
|
2050
|
-
|
2051
|
-
if i <= j
|
2052
|
-
if i < j
|
2053
|
-
@data.each do |vector|
|
2054
|
-
vector[i], vector[j] = vector[j], vector[i]
|
2081
|
+
def create_logic_blocks vector_order, _by, ascending
|
2082
|
+
# Create blocks to handle nils
|
2083
|
+
blocks = {}
|
2084
|
+
universal_block_ascending = ->(a) { [a.nil? ? 0 : 1, a] }
|
2085
|
+
universal_block_decending = ->(a) { [a.nil? ? 1 : 0, a] }
|
2086
|
+
vector_order.each_with_index do |vector, i|
|
2087
|
+
blocks[vector] =
|
2088
|
+
if ascending[i]
|
2089
|
+
universal_block_ascending
|
2090
|
+
else
|
2091
|
+
universal_block_decending
|
2055
2092
|
end
|
2056
|
-
index[i], index[j] = index[j], index[i]
|
2057
|
-
end
|
2058
|
-
i += 1
|
2059
|
-
j -= 1
|
2060
2093
|
end
|
2061
2094
|
|
2062
|
-
|
2095
|
+
blocks
|
2063
2096
|
end
|
2064
2097
|
|
2065
|
-
def
|
2066
|
-
|
2067
|
-
|
2068
|
-
|
2069
|
-
|
2098
|
+
def build_array_from_blocks vector_order, opts, blocks, r1, r2
|
2099
|
+
# Create an array to be used for comparison of two rows in sorting
|
2100
|
+
vector_order.map.each_with_index do |v, i|
|
2101
|
+
value = if opts[:ascending][i]
|
2102
|
+
@data[@vectors[v]].data[r1]
|
2103
|
+
else
|
2104
|
+
@data[@vectors[v]].data[r2]
|
2105
|
+
end
|
2070
2106
|
|
2071
|
-
if
|
2072
|
-
|
2073
|
-
|
2074
|
-
if eval == 0
|
2075
|
-
keep?(current_index, mvalues, vector_order, sort_order, by, vector_order_index + 1)
|
2076
|
-
end
|
2077
|
-
else # sort in descending order
|
2078
|
-
return false if eval == -1
|
2079
|
-
return true if eval == 1
|
2080
|
-
if eval == 0
|
2081
|
-
keep?(current_index, mvalues, vector_order, sort_order, by, vector_order_index + 1)
|
2082
|
-
end
|
2083
|
-
end
|
2084
|
-
end
|
2085
|
-
end
|
2107
|
+
if opts[:by][v] && !opts[:handle_nils][i]
|
2108
|
+
# Block given and nils handled manually
|
2109
|
+
value = opts[:by][v].call value
|
2086
2110
|
|
2087
|
-
|
2088
|
-
|
2089
|
-
|
2090
|
-
|
2091
|
-
end
|
2111
|
+
elsif opts[:by][v] && opts[:handle_nils][i]
|
2112
|
+
# Block given and nils handled automatically
|
2113
|
+
value = opts[:by][v].call value rescue nil
|
2114
|
+
blocks[v].call value
|
2092
2115
|
|
2093
|
-
|
2116
|
+
else
|
2117
|
+
# Block not given and nils handled automatically
|
2118
|
+
blocks[v].call value
|
2119
|
+
end
|
2120
|
+
end
|
2094
2121
|
end
|
2095
2122
|
|
2096
2123
|
def sort_order_array vector_order, ascending
|
2097
|
-
if ascending.is_a?
|
2098
|
-
raise ArgumentError,
|
2124
|
+
if ascending.is_a? Array
|
2125
|
+
raise ArgumentError, 'Specify same number of vector names and sort orders' if
|
2099
2126
|
vector_order.size != ascending.size
|
2100
2127
|
return ascending
|
2101
2128
|
else
|
@@ -2103,6 +2130,16 @@ module Daru
|
|
2103
2130
|
end
|
2104
2131
|
end
|
2105
2132
|
|
2133
|
+
def handle_nils_array vector_order, handle_nils
|
2134
|
+
if handle_nils.is_a? Array
|
2135
|
+
raise ArgumentError, 'Specify same number of vector names and handle nils' if
|
2136
|
+
vector_order.size != handle_nils.size
|
2137
|
+
return handle_nils
|
2138
|
+
else
|
2139
|
+
Array.new(vector_order.size, handle_nils)
|
2140
|
+
end
|
2141
|
+
end
|
2142
|
+
|
2106
2143
|
def vectors_index_for location
|
2107
2144
|
if @vectors.include?(location)
|
2108
2145
|
@vectors[location]
|
@@ -2118,39 +2155,35 @@ module Daru
|
|
2118
2155
|
if @vectors.is_a?(MultiIndex)
|
2119
2156
|
pos = @vectors[names]
|
2120
2157
|
|
2121
|
-
if pos.is_a?(Integer)
|
2122
|
-
return @data[pos]
|
2123
|
-
else # MultiIndex
|
2124
|
-
new_vectors = pos.map do |tuple|
|
2125
|
-
@data[@vectors[tuple]]
|
2126
|
-
end
|
2158
|
+
return @data[pos] if pos.is_a?(Integer)
|
2127
2159
|
|
2128
|
-
|
2129
|
-
|
2130
|
-
|
2160
|
+
# MultiIndex
|
2161
|
+
new_vectors = pos.map do |tuple|
|
2162
|
+
@data[@vectors[tuple]]
|
2163
|
+
end
|
2131
2164
|
|
2132
|
-
|
2133
|
-
|
2165
|
+
if !location.is_a?(Range) && names.size < @vectors.width
|
2166
|
+
pos = pos.drop_left_level names.size
|
2134
2167
|
end
|
2168
|
+
|
2169
|
+
Daru::DataFrame.new(new_vectors, index: @index, order: pos)
|
2135
2170
|
else
|
2136
2171
|
unless names[1]
|
2137
2172
|
pos = @vectors[location]
|
2138
2173
|
|
2139
|
-
if pos.is_a?(Numeric)
|
2140
|
-
|
2141
|
-
|
2142
|
-
names = pos
|
2143
|
-
end
|
2174
|
+
return @data[pos] if pos.is_a?(Numeric)
|
2175
|
+
|
2176
|
+
names = pos
|
2144
2177
|
end
|
2145
2178
|
|
2146
|
-
|
2179
|
+
new_vectors = {}
|
2147
2180
|
names.each do |name|
|
2148
|
-
|
2181
|
+
new_vectors[name] = @data[@vectors[name]]
|
2149
2182
|
end
|
2150
2183
|
|
2151
2184
|
order = names.is_a?(Array) ? Daru::Index.new(names) : names
|
2152
|
-
Daru::DataFrame.new(
|
2153
|
-
|
2185
|
+
Daru::DataFrame.new(new_vectors, order: order,
|
2186
|
+
index: @index, name: @name)
|
2154
2187
|
end
|
2155
2188
|
end
|
2156
2189
|
|
@@ -2161,16 +2194,15 @@ module Daru
|
|
2161
2194
|
pos = @index[names]
|
2162
2195
|
if pos.is_a?(Integer)
|
2163
2196
|
return Daru::Vector.new(populate_row_for(pos), index: @vectors, name: pos)
|
2164
|
-
|
2165
|
-
new_rows = pos.map { |tuple| populate_row_for(tuple) }
|
2197
|
+
end
|
2166
2198
|
|
2167
|
-
|
2168
|
-
pos = pos.drop_left_level names.size
|
2169
|
-
end
|
2199
|
+
new_rows = pos.map { |tuple| populate_row_for(tuple) }
|
2170
2200
|
|
2171
|
-
|
2172
|
-
|
2201
|
+
if !location.is_a?(Range) && names.size < @index.width
|
2202
|
+
pos = pos.drop_left_level names.size
|
2173
2203
|
end
|
2204
|
+
|
2205
|
+
Daru::DataFrame.rows(new_rows, order: @vectors, name: @name, index: pos)
|
2174
2206
|
else
|
2175
2207
|
if names[1].nil?
|
2176
2208
|
names = @index[location]
|
@@ -2189,7 +2221,7 @@ module Daru
|
|
2189
2221
|
rows << self.row[name].to_a
|
2190
2222
|
end
|
2191
2223
|
|
2192
|
-
Daru::DataFrame.rows rows, index: names
|
2224
|
+
Daru::DataFrame.rows rows, index: names,name: @name, order: @vectors
|
2193
2225
|
end
|
2194
2226
|
end
|
2195
2227
|
|
@@ -2201,17 +2233,22 @@ module Daru
|
|
2201
2233
|
|
2202
2234
|
def insert_or_modify_vector name, vector
|
2203
2235
|
name = name[0] unless @vectors.is_a?(MultiIndex)
|
2204
|
-
|
2236
|
+
vec = nil
|
2205
2237
|
|
2206
2238
|
if @index.empty?
|
2207
|
-
|
2208
|
-
|
2209
|
-
|
2239
|
+
vec = if vector.is_a?(Daru::Vector)
|
2240
|
+
vector
|
2241
|
+
else
|
2242
|
+
Daru::Vector.new(vector.to_a, name: set_name(name))
|
2243
|
+
end
|
2244
|
+
|
2245
|
+
@index = vec.index
|
2246
|
+
assign_or_add_vector name, vec
|
2210
2247
|
set_size
|
2211
2248
|
|
2212
2249
|
@data.map! do |v|
|
2213
|
-
if v.
|
2214
|
-
Daru::Vector.new([nil]*@size, name: set_name(name), index: @index)
|
2250
|
+
if v.empty?
|
2251
|
+
Daru::Vector.new([nil]*@size, name: set_name(name), metadata: v.metadata, index: @index)
|
2215
2252
|
else
|
2216
2253
|
v
|
2217
2254
|
end
|
@@ -2219,15 +2256,11 @@ module Daru
|
|
2219
2256
|
else
|
2220
2257
|
if vector.is_a?(Daru::Vector)
|
2221
2258
|
if vector.index == @index # so that index-by-index assignment is avoided when possible.
|
2222
|
-
|
2259
|
+
vec = vector.dup
|
2223
2260
|
else
|
2224
|
-
|
2261
|
+
vec = Daru::Vector.new [], name: set_name(name), metadata: vector.metadata.dup, index: @index
|
2225
2262
|
@index.each do |idx|
|
2226
|
-
|
2227
|
-
v[idx] = vector[idx]
|
2228
|
-
else
|
2229
|
-
v[idx] = nil
|
2230
|
-
end
|
2263
|
+
vec[idx] = vector.index.include?(idx) ? vector[idx] : nil
|
2231
2264
|
end
|
2232
2265
|
end
|
2233
2266
|
else
|
@@ -2235,26 +2268,30 @@ module Daru
|
|
2235
2268
|
"Specified vector of length #{vector.size} cannot be inserted in DataFrame of size #{@size}" if
|
2236
2269
|
@size != vector.size
|
2237
2270
|
|
2238
|
-
|
2271
|
+
vec = Daru::Vector.new(vector, name: set_name(name), index: @index)
|
2239
2272
|
end
|
2240
2273
|
|
2241
|
-
assign_or_add_vector name,
|
2274
|
+
assign_or_add_vector name, vec
|
2242
2275
|
end
|
2243
2276
|
end
|
2244
2277
|
|
2245
2278
|
def assign_or_add_vector name, v
|
2246
|
-
#FIXME: fix this jugaad. need to make changes in Indexing itself.
|
2247
|
-
|
2279
|
+
# FIXME: fix this jugaad. need to make changes in Indexing itself.
|
2280
|
+
begin
|
2281
|
+
pos = @vectors[name]
|
2282
|
+
rescue IndexError
|
2283
|
+
pos = name
|
2284
|
+
end
|
2248
2285
|
|
2249
|
-
if !pos.
|
2250
|
-
|
2286
|
+
if !pos.is_a?(Daru::Index) && pos == name &&
|
2287
|
+
(@vectors.include?(name) || (pos.is_a?(Integer) && pos < @data.size))
|
2251
2288
|
@data[pos] = v
|
2252
|
-
elsif pos.
|
2289
|
+
elsif pos.is_a?(Daru::Index)
|
2253
2290
|
pos.each do |p|
|
2254
2291
|
@data[@vectors[p]] = v
|
2255
2292
|
end
|
2256
2293
|
else
|
2257
|
-
@vectors
|
2294
|
+
@vectors |= [name] unless @vectors.include?(name)
|
2258
2295
|
@data[@vectors[name]] = v
|
2259
2296
|
end
|
2260
2297
|
end
|
@@ -2264,21 +2301,21 @@ module Daru
|
|
2264
2301
|
# TODO
|
2265
2302
|
else
|
2266
2303
|
name = name[0]
|
2267
|
-
|
2268
|
-
|
2269
|
-
|
2270
|
-
|
2271
|
-
|
2272
|
-
|
2304
|
+
vec =
|
2305
|
+
if vector.is_a?(Daru::Vector)
|
2306
|
+
vector
|
2307
|
+
else
|
2308
|
+
Daru::Vector.new(vector, name: set_name(name), index: @vectors)
|
2309
|
+
end
|
2273
2310
|
|
2274
2311
|
if @index.include? name
|
2275
|
-
|
2276
|
-
|
2312
|
+
each_vector_with_index do |v,i|
|
2313
|
+
v[name] = vec.index.include?(i) ? vec[i] : nil
|
2277
2314
|
end
|
2278
2315
|
else
|
2279
|
-
@index
|
2280
|
-
|
2281
|
-
|
2316
|
+
@index |= [name]
|
2317
|
+
each_vector_with_index do |v,i|
|
2318
|
+
v.concat((vec.index.include?(i) ? vec[i] : nil), name)
|
2282
2319
|
end
|
2283
2320
|
end
|
2284
2321
|
|
@@ -2294,15 +2331,15 @@ module Daru
|
|
2294
2331
|
|
2295
2332
|
def validate_labels
|
2296
2333
|
raise IndexError, "Expected equal number of vector names (#{@vectors.size}) for number of vectors (#{@data.size})." if
|
2297
|
-
@vectors
|
2334
|
+
@vectors && @vectors.size != @data.size
|
2298
2335
|
|
2299
|
-
raise IndexError,
|
2300
|
-
@index
|
2336
|
+
raise IndexError, 'Expected number of indexes same as number of rows' if
|
2337
|
+
@index && @data[0] && @index.size != @data[0].size
|
2301
2338
|
end
|
2302
2339
|
|
2303
2340
|
def validate_vector_sizes
|
2304
2341
|
@data.each do |vector|
|
2305
|
-
raise IndexError,
|
2342
|
+
raise IndexError, 'Expected vectors with equal length' if vector.size != @size
|
2306
2343
|
end
|
2307
2344
|
end
|
2308
2345
|
|
@@ -2332,14 +2369,14 @@ module Daru
|
|
2332
2369
|
end
|
2333
2370
|
|
2334
2371
|
def create_vectors_index_with vectors, source
|
2335
|
-
vectors = source.keys.sort_by
|
2372
|
+
vectors = source.keys.sort_by(&:to_s) if vectors.nil?
|
2336
2373
|
|
2337
2374
|
@vectors =
|
2338
|
-
|
2339
|
-
|
2340
|
-
|
2341
|
-
|
2342
|
-
|
2375
|
+
if vectors.is_a?(Index) || vectors.is_a?(MultiIndex)
|
2376
|
+
vectors
|
2377
|
+
else
|
2378
|
+
Daru::Index.new((vectors + (source.keys - vectors)).uniq)
|
2379
|
+
end
|
2343
2380
|
end
|
2344
2381
|
|
2345
2382
|
def all_vectors_have_equal_indexes? source
|
@@ -2351,24 +2388,24 @@ module Daru
|
|
2351
2388
|
end
|
2352
2389
|
|
2353
2390
|
def try_create_index index
|
2354
|
-
index.
|
2391
|
+
index.is_a?(Index) ? index : Daru::Index.new(index)
|
2355
2392
|
end
|
2356
2393
|
|
2357
|
-
def set_name potential_name
|
2394
|
+
def set_name potential_name # rubocop:disable Style/AccessorMethodName
|
2358
2395
|
potential_name.is_a?(Array) ? potential_name.join : potential_name
|
2359
2396
|
end
|
2360
2397
|
|
2361
2398
|
def symbolize arry
|
2362
2399
|
symbolized_arry =
|
2363
|
-
|
2364
|
-
|
2365
|
-
|
2366
|
-
|
2400
|
+
if arry.all? { |e| e.is_a?(Array) }
|
2401
|
+
arry.map do |sub_arry|
|
2402
|
+
sub_arry.map do |e|
|
2403
|
+
e.is_a?(Numeric) ? e : e.to_sym
|
2404
|
+
end
|
2367
2405
|
end
|
2406
|
+
else
|
2407
|
+
arry.map { |e| e.is_a?(Numeric) ? e : e.to_sym }
|
2368
2408
|
end
|
2369
|
-
else
|
2370
|
-
arry.map { |e| e.is_a?(Numeric) ? e : e.to_sym }
|
2371
|
-
end
|
2372
2409
|
|
2373
2410
|
symbolized_arry
|
2374
2411
|
end
|