daru_lite 0.1.1 → 0.1.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.rubocop_todo.yml +35 -33
- data/lib/daru_lite/data_frame/aggregatable.rb +165 -0
- data/lib/daru_lite/data_frame/calculatable.rb +140 -0
- data/lib/daru_lite/data_frame/convertible.rb +107 -0
- data/lib/daru_lite/data_frame/duplicatable.rb +64 -0
- data/lib/daru_lite/data_frame/fetchable.rb +301 -0
- data/lib/daru_lite/data_frame/filterable.rb +144 -0
- data/lib/daru_lite/data_frame/i_o_able.rb +179 -0
- data/lib/daru_lite/data_frame/indexable.rb +168 -0
- data/lib/daru_lite/data_frame/iterable.rb +339 -0
- data/lib/daru_lite/data_frame/joinable.rb +152 -0
- data/lib/daru_lite/data_frame/missable.rb +75 -0
- data/lib/daru_lite/data_frame/pivotable.rb +108 -0
- data/lib/daru_lite/data_frame/queryable.rb +67 -0
- data/lib/daru_lite/data_frame/setable.rb +109 -0
- data/lib/daru_lite/data_frame/sortable.rb +241 -0
- data/lib/daru_lite/dataframe.rb +138 -2353
- data/lib/daru_lite/index/index.rb +13 -0
- data/lib/daru_lite/maths/statistics/vector.rb +1 -1
- data/lib/daru_lite/vector/aggregatable.rb +9 -0
- data/lib/daru_lite/vector/calculatable.rb +78 -0
- data/lib/daru_lite/vector/convertible.rb +77 -0
- data/lib/daru_lite/vector/duplicatable.rb +17 -0
- data/lib/daru_lite/vector/fetchable.rb +175 -0
- data/lib/daru_lite/vector/filterable.rb +128 -0
- data/lib/daru_lite/vector/indexable.rb +77 -0
- data/lib/daru_lite/vector/iterable.rb +95 -0
- data/lib/daru_lite/vector/joinable.rb +17 -0
- data/lib/daru_lite/vector/missable.rb +124 -0
- data/lib/daru_lite/vector/queryable.rb +45 -0
- data/lib/daru_lite/vector/setable.rb +47 -0
- data/lib/daru_lite/vector/sortable.rb +113 -0
- data/lib/daru_lite/vector.rb +36 -932
- data/lib/daru_lite/version.rb +1 -1
- data/spec/data_frame/aggregatable_example.rb +65 -0
- data/spec/data_frame/buildable_example.rb +109 -0
- data/spec/data_frame/calculatable_example.rb +135 -0
- data/spec/data_frame/convertible_example.rb +180 -0
- data/spec/data_frame/duplicatable_example.rb +111 -0
- data/spec/data_frame/fetchable_example.rb +476 -0
- data/spec/data_frame/filterable_example.rb +250 -0
- data/spec/data_frame/indexable_example.rb +221 -0
- data/spec/data_frame/iterable_example.rb +465 -0
- data/spec/data_frame/joinable_example.rb +106 -0
- data/spec/data_frame/missable_example.rb +47 -0
- data/spec/data_frame/pivotable_example.rb +297 -0
- data/spec/data_frame/queryable_example.rb +92 -0
- data/spec/data_frame/setable_example.rb +482 -0
- data/spec/data_frame/sortable_example.rb +350 -0
- data/spec/dataframe_spec.rb +181 -3289
- data/spec/index/index_spec.rb +8 -0
- data/spec/vector/aggregatable_example.rb +27 -0
- data/spec/vector/calculatable_example.rb +82 -0
- data/spec/vector/convertible_example.rb +126 -0
- data/spec/vector/duplicatable_example.rb +48 -0
- data/spec/vector/fetchable_example.rb +463 -0
- data/spec/vector/filterable_example.rb +165 -0
- data/spec/vector/indexable_example.rb +201 -0
- data/spec/vector/iterable_example.rb +111 -0
- data/spec/vector/joinable_example.rb +25 -0
- data/spec/vector/missable_example.rb +88 -0
- data/spec/vector/queryable_example.rb +91 -0
- data/spec/vector/setable_example.rb +300 -0
- data/spec/vector/sortable_example.rb +242 -0
- data/spec/vector_spec.rb +111 -1805
- metadata +86 -2
@@ -0,0 +1,168 @@
|
|
1
|
+
module DaruLite
|
2
|
+
class DataFrame
|
3
|
+
module Indexable
|
4
|
+
module SetSingleIndexStrategy
|
5
|
+
def self.uniq_size(df, col)
|
6
|
+
df[col].uniq.size
|
7
|
+
end
|
8
|
+
|
9
|
+
def self.new_index(df, col)
|
10
|
+
DaruLite::Index.new(df[col].to_a)
|
11
|
+
end
|
12
|
+
|
13
|
+
def self.delete_vector(df, col)
|
14
|
+
df.delete_vector(col)
|
15
|
+
end
|
16
|
+
end
|
17
|
+
|
18
|
+
module SetCategoricalIndexStrategy
|
19
|
+
def self.new_index(df, col)
|
20
|
+
DaruLite::CategoricalIndex.new(df[col].to_a)
|
21
|
+
end
|
22
|
+
|
23
|
+
def self.delete_vector(df, col)
|
24
|
+
df.delete_vector(col)
|
25
|
+
end
|
26
|
+
end
|
27
|
+
|
28
|
+
module SetMultiIndexStrategy
|
29
|
+
def self.uniq_size(df, cols)
|
30
|
+
df[*cols].uniq.size
|
31
|
+
end
|
32
|
+
|
33
|
+
def self.new_index(df, cols)
|
34
|
+
DaruLite::MultiIndex.from_arrays(df[*cols].map_vectors(&:to_a)).tap do |mi|
|
35
|
+
mi.name = cols
|
36
|
+
end
|
37
|
+
end
|
38
|
+
|
39
|
+
def self.delete_vector(df, cols)
|
40
|
+
df.delete_vectors(*cols)
|
41
|
+
end
|
42
|
+
end
|
43
|
+
|
44
|
+
# Set a particular column (or columns) as the new index of the DataFrame
|
45
|
+
def set_index(new_index_col, keep: false, categorical: false)
|
46
|
+
if categorical
|
47
|
+
strategy = SetCategoricalIndexStrategy
|
48
|
+
elsif new_index_col.respond_to?(:to_a)
|
49
|
+
strategy = SetMultiIndexStrategy
|
50
|
+
new_index_col = new_index_col.to_a
|
51
|
+
else
|
52
|
+
strategy = SetSingleIndexStrategy
|
53
|
+
end
|
54
|
+
|
55
|
+
unless categorical
|
56
|
+
uniq_size = strategy.uniq_size(self, new_index_col)
|
57
|
+
raise ArgumentError, 'All elements in new index must be unique.' if @size != uniq_size
|
58
|
+
end
|
59
|
+
|
60
|
+
self.index = strategy.new_index(self, new_index_col)
|
61
|
+
strategy.delete_vector(self, new_index_col) unless keep
|
62
|
+
self
|
63
|
+
end
|
64
|
+
|
65
|
+
# Change the index of the DataFrame and preserve the labels of the previous
|
66
|
+
# indexing. New index can be DaruLite::Index or any of its subclasses.
|
67
|
+
#
|
68
|
+
# @param [DaruLite::Index] new_index The new Index for reindexing the DataFrame.
|
69
|
+
# @example Reindexing DataFrame
|
70
|
+
# df = DaruLite::DataFrame.new({a: [1,2,3,4], b: [11,22,33,44]},
|
71
|
+
# index: ['a','b','c','d'])
|
72
|
+
# #=>
|
73
|
+
# ##<DaruLite::DataFrame:83278130 @name = b19277b8-c548-41da-ad9a-2ad8c060e273 @size = 4>
|
74
|
+
# # a b
|
75
|
+
# # a 1 11
|
76
|
+
# # b 2 22
|
77
|
+
# # c 3 33
|
78
|
+
# # d 4 44
|
79
|
+
# df.reindex DaruLite::Index.new(['b', 0, 'a', 'g'])
|
80
|
+
# #=>
|
81
|
+
# ##<DaruLite::DataFrame:83177070 @name = b19277b8-c548-41da-ad9a-2ad8c060e273 @size = 4>
|
82
|
+
# # a b
|
83
|
+
# # b 2 22
|
84
|
+
# # 0 nil nil
|
85
|
+
# # a 1 11
|
86
|
+
# # g nil nil
|
87
|
+
def reindex(new_index)
|
88
|
+
unless new_index.is_a?(DaruLite::Index)
|
89
|
+
raise ArgumentError, 'Must pass the new index of type Index or its ' \
|
90
|
+
"subclasses, not #{new_index.class}"
|
91
|
+
end
|
92
|
+
|
93
|
+
cl = DaruLite::DataFrame.new({}, order: @vectors, index: new_index, name: @name)
|
94
|
+
new_index.each_with_object(cl) do |idx, memo|
|
95
|
+
memo.row[idx] = @index.include?(idx) ? row[idx] : Array.new(ncols)
|
96
|
+
end
|
97
|
+
end
|
98
|
+
|
99
|
+
def reset_index
|
100
|
+
index_df = index.to_df
|
101
|
+
names = index.name
|
102
|
+
names = [names] unless names.instance_of?(Array)
|
103
|
+
new_vectors = names + vectors.to_a
|
104
|
+
self.index = index_df.index
|
105
|
+
names.each do |name|
|
106
|
+
self[name] = index_df[name]
|
107
|
+
end
|
108
|
+
self.order = new_vectors
|
109
|
+
self
|
110
|
+
end
|
111
|
+
|
112
|
+
# Reassign index with a new index of type DaruLite::Index or any of its subclasses.
|
113
|
+
#
|
114
|
+
# @param [DaruLite::Index] idx New index object on which the rows of the dataframe
|
115
|
+
# are to be indexed.
|
116
|
+
# @example Reassigning index of a DataFrame
|
117
|
+
# df = DaruLite::DataFrame.new({a: [1,2,3,4], b: [11,22,33,44]})
|
118
|
+
# df.index.to_a #=> [0,1,2,3]
|
119
|
+
#
|
120
|
+
# df.index = DaruLite::Index.new(['a','b','c','d'])
|
121
|
+
# df.index.to_a #=> ['a','b','c','d']
|
122
|
+
# df.row['a'].to_a #=> [1,11]
|
123
|
+
def index=(idx)
|
124
|
+
@index = Index.coerce idx
|
125
|
+
@data.each { |vec| vec.index = @index }
|
126
|
+
|
127
|
+
self
|
128
|
+
end
|
129
|
+
|
130
|
+
def reindex_vectors(new_vectors)
|
131
|
+
unless new_vectors.is_a?(DaruLite::Index)
|
132
|
+
raise ArgumentError, 'Must pass the new index of type Index or its ' \
|
133
|
+
"subclasses, not #{new_vectors.class}"
|
134
|
+
end
|
135
|
+
|
136
|
+
cl = DaruLite::DataFrame.new({}, order: new_vectors, index: @index, name: @name)
|
137
|
+
new_vectors.each_with_object(cl) do |vec, memo|
|
138
|
+
memo[vec] = @vectors.include?(vec) ? self[vec] : Array.new(nrows)
|
139
|
+
end
|
140
|
+
end
|
141
|
+
|
142
|
+
# Reassign vectors with a new index of type DaruLite::Index or any of its subclasses.
#
# @param new_index [DaruLite::Index] The new index object on which the vectors are to
#   be indexed. Must be of the same size as ncols.
# @return [DaruLite::DataFrame] self (only observable through #send; plain
#   assignment syntax evaluates to the right-hand side)
# @raise [ArgumentError] if new_index is not a DaruLite::Index, or if its size
#   differs from ncols
# @example Reassigning vectors of a DataFrame
#   df = DaruLite::DataFrame.new({a: [1,2,3,4], b: [:a,:b,:c,:d], c: [11,22,33,44]})
#   df.vectors.to_a #=> [:a, :b, :c]
#
#   df.vectors = DaruLite::Index.new([:foo, :bar, :baz])
#   df.vectors.to_a #=> [:foo, :bar, :baz]
def vectors=(new_index)
  raise ArgumentError, 'Can only reindex with Index and its subclasses' unless new_index.is_a?(DaruLite::Index)

  if new_index.size != ncols
    # The trailing space matters: the two literals are concatenated verbatim.
    raise ArgumentError, "Specified index length #{new_index.size} not equal to " \
                         "dataframe size #{ncols}"
  end

  @vectors = new_index
  # Rename every stored vector so its name matches its new label.
  @data.zip(new_index.to_a).each do |vect, name|
    vect.name = name
  end
  self
end
|
166
|
+
end
|
167
|
+
end
|
168
|
+
end
|
@@ -0,0 +1,339 @@
|
|
1
|
+
module DaruLite
|
2
|
+
class DataFrame
|
3
|
+
module Iterable
|
4
|
+
# Iterate over each index of the DataFrame.
|
5
|
+
def each_index(&block)
|
6
|
+
return to_enum(:each_index) unless block
|
7
|
+
|
8
|
+
@index.each(&block)
|
9
|
+
|
10
|
+
self
|
11
|
+
end
|
12
|
+
|
13
|
+
# Iterate over each vector
|
14
|
+
def each_vector(&block)
|
15
|
+
return to_enum(:each_vector) unless block
|
16
|
+
|
17
|
+
@data.each(&block)
|
18
|
+
|
19
|
+
self
|
20
|
+
end
|
21
|
+
|
22
|
+
alias each_column each_vector
|
23
|
+
|
24
|
+
# Iterate over each vector along with the name of the vector
|
25
|
+
def each_vector_with_index
|
26
|
+
return to_enum(:each_vector_with_index) unless block_given?
|
27
|
+
|
28
|
+
@vectors.each do |vector|
|
29
|
+
yield @data[@vectors[vector]], vector
|
30
|
+
end
|
31
|
+
|
32
|
+
self
|
33
|
+
end
|
34
|
+
|
35
|
+
alias each_column_with_index each_vector_with_index
|
36
|
+
|
37
|
+
# Iterate over each row
|
38
|
+
def each_row
|
39
|
+
return to_enum(:each_row) unless block_given?
|
40
|
+
|
41
|
+
@index.size.times do |pos|
|
42
|
+
yield row_at(pos)
|
43
|
+
end
|
44
|
+
|
45
|
+
self
|
46
|
+
end
|
47
|
+
|
48
|
+
def each_row_with_index
|
49
|
+
return to_enum(:each_row_with_index) unless block_given?
|
50
|
+
|
51
|
+
@index.each do |index|
|
52
|
+
yield access_row(index), index
|
53
|
+
end
|
54
|
+
|
55
|
+
self
|
56
|
+
end
|
57
|
+
|
58
|
+
# Iterate over each row or vector of the DataFrame. Specify axis
|
59
|
+
# by passing :vector or :row as the argument. Default to :vector.
|
60
|
+
#
|
61
|
+
# == Description
|
62
|
+
#
|
63
|
+
# `#each` works exactly like Array#each. The default mode for `each`
|
64
|
+
# is to iterate over the columns of the DataFrame. To iterate over
|
65
|
+
# rows you must pass the axis, i.e `:row` as an argument.
|
66
|
+
#
|
67
|
+
# == Arguments
|
68
|
+
#
|
69
|
+
# * +axis+ - The axis to iterate over. Can be :vector (or :column)
|
70
|
+
# or :row. Default to :vector.
|
71
|
+
def each(axis = :vector, &block)
|
72
|
+
dispatch_to_axis axis, :each, &block
|
73
|
+
end
|
74
|
+
|
75
|
+
# Iterate over a row or vector and return results in a DaruLite::Vector.
|
76
|
+
# Specify axis with :vector or :row. Default to :vector.
|
77
|
+
#
|
78
|
+
# == Description
|
79
|
+
#
|
80
|
+
# The #collect iterator works similar to #map, the only difference
|
81
|
+
# being that it returns a DaruLite::Vector comprising of the results of
|
82
|
+
# each block run. The resultant Vector has the same index as that
|
83
|
+
# of the axis over which collect has iterated. It also accepts the
|
84
|
+
# optional axis argument.
|
85
|
+
#
|
86
|
+
# == Arguments
|
87
|
+
#
|
88
|
+
# * +axis+ - The axis to iterate over. Can be :vector (or :column)
|
89
|
+
# or :row. Default to :vector.
|
90
|
+
def collect(axis = :vector, &block)
|
91
|
+
dispatch_to_axis_pl axis, :collect, &block
|
92
|
+
end
|
93
|
+
|
94
|
+
# Map over each vector or row of the data frame according to
|
95
|
+
# the argument specified. Will return an Array of the resulting
|
96
|
+
# elements. To map over each row/vector and get a DataFrame,
|
97
|
+
# see #recode.
|
98
|
+
#
|
99
|
+
# == Description
|
100
|
+
#
|
101
|
+
# The #map iterator works like Array#map. The value returned by
|
102
|
+
# each run of the block is added to an Array and the Array is
|
103
|
+
# returned. This method also accepts an axis argument, like #each.
|
104
|
+
# The default is :vector.
|
105
|
+
#
|
106
|
+
# == Arguments
|
107
|
+
#
|
108
|
+
# * +axis+ - The axis to map over. Can be :vector (or :column) or :row.
|
109
|
+
# Default to :vector.
|
110
|
+
def map(axis = :vector, &block)
|
111
|
+
dispatch_to_axis_pl axis, :map, &block
|
112
|
+
end
|
113
|
+
|
114
|
+
# Destructive map. Modifies the DataFrame. Each run of the block
# must return a DaruLite::Vector. You can specify the axis to map over
# as the argument. Default to :vector.
#
# == Arguments
#
# * +axis+ - The axis to map over. Can be :vector (or :column) or :row.
#   Default to :vector.
#
# @raise [ArgumentError] if +axis+ is not :vector, :column or :row
def map!(axis = :vector, &block)
  case axis
  when :vector, :column then map_vectors!(&block)
  when :row then map_rows!(&block)
  else
    # Fail loudly instead of returning nil without mapping anything.
    raise ArgumentError, "Unknown axis #{axis.inspect}"
  end
end
|
129
|
+
|
130
|
+
# Maps over the DataFrame and returns a DataFrame. Each run of the
|
131
|
+
# block must return a DaruLite::Vector object. You can specify the axis
|
132
|
+
# to map over. Default to :vector.
|
133
|
+
#
|
134
|
+
# == Description
|
135
|
+
#
|
136
|
+
# Recode works similarly to #map, but an important difference between
|
137
|
+
# the two is that recode returns a modified DaruLite::DataFrame instead
|
138
|
+
# of an Array. For this reason, #recode expects that every run of the
|
139
|
+
# block to return a DaruLite::Vector.
|
140
|
+
#
|
141
|
+
# Just like map and each, recode also accepts an optional _axis_ argument.
|
142
|
+
#
|
143
|
+
# == Arguments
|
144
|
+
#
|
145
|
+
# * +axis+ - The axis to map over. Can be :vector (or :column) or :row.
|
146
|
+
# Default to :vector.
|
147
|
+
def recode(axis = :vector, &block)
|
148
|
+
dispatch_to_axis_pl axis, :recode, &block
|
149
|
+
end
|
150
|
+
|
151
|
+
# Replace specified values with given value
|
152
|
+
# @param [Array] old_values values to replace with new value
|
153
|
+
# @param [object] new_value new value to replace with
|
154
|
+
# @return [DaruLite::DataFrame] Data Frame itself with old values replaced
|
155
|
+
# with new value
|
156
|
+
# @example
|
157
|
+
# df = DaruLite::DataFrame.new({
|
158
|
+
# a: [1, 2, 3, nil, Float::NAN, nil, 1, 7],
|
159
|
+
# b: [:a, :b, nil, Float::NAN, nil, 3, 5, 8],
|
160
|
+
# c: ['a', Float::NAN, 3, 4, 3, 5, nil, 7]
|
161
|
+
# }, index: 11..18)
|
162
|
+
# df.replace_values nil, Float::NAN
|
163
|
+
# # => #<DaruLite::DataFrame(8x3)>
|
164
|
+
# # a b c
|
165
|
+
# # 11 1 a a
|
166
|
+
# # 12 2 b NaN
|
167
|
+
# # 13 3 NaN 3
|
168
|
+
# # 14 NaN NaN 4
|
169
|
+
# # 15 NaN NaN 3
|
170
|
+
# # 16 NaN 3 5
|
171
|
+
# # 17 1 5 NaN
|
172
|
+
# # 18 7 8 7
|
173
|
+
def replace_values(old_values, new_value)
|
174
|
+
@data.each { |vec| vec.replace_values old_values, new_value }
|
175
|
+
self
|
176
|
+
end
|
177
|
+
|
178
|
+
# Test each row with one or more tests.
|
179
|
+
# @param tests [Proc] Each test is a Proc with the form
|
180
|
+
# *Proc.new {|row| row[:age] > 0}*
|
181
|
+
# The function returns an array with all errors.
|
182
|
+
#
|
183
|
+
# FIXME: description here is too sparse. As far as I can get,
|
184
|
+
# it should tell something about that each test is [descr, fields, block],
|
185
|
+
# and that first value may be column name to output. - zverok, 2016-05-18
|
186
|
+
def verify(*tests)
|
187
|
+
id = tests.first.is_a?(Symbol) ? tests.shift : @vectors.first
|
188
|
+
|
189
|
+
each_row_with_index.map do |row, i|
|
190
|
+
tests.reject { |*_, block| block.call(row) }
|
191
|
+
.map { |test| verify_error_message row, test, id, i }
|
192
|
+
end.flatten
|
193
|
+
end
|
194
|
+
|
195
|
+
# Builds a copy of the DataFrame (via #dup) in which each vector is replaced
# by the value the block returns for it.
#
# @yield [vector] each vector of the copy, as produced by #each_vector_with_index
# @yieldreturn [DaruLite::Vector] the replacement vector
# @return [DaruLite::DataFrame, Enumerator] the recoded copy, or an Enumerator
#   when no block is given
# @raise [TypeError] if the block returns something that is not a DaruLite::Vector
def recode_vectors
  return to_enum(:recode_vectors) unless block_given?

  dup.tap do |df|
    df.each_vector_with_index do |v, i|
      # Splat so that an Array label is passed as separate arguments to #[]=.
      df[*i] = should_be_vector!(yield(v))
    end
  end
end
|
204
|
+
|
205
|
+
# Builds a copy of the DataFrame (via #dup) in which each row is replaced
# by the value the block returns for it.
#
# @yield [row] each row of the copy, as produced by #each_row_with_index
# @yieldreturn [DaruLite::Vector] the replacement row
# @return [DaruLite::DataFrame, Enumerator] the recoded copy, or an Enumerator
#   when no block is given
# @raise [TypeError] if the block returns something that is not a DaruLite::Vector
def recode_rows
  return to_enum(:recode_rows) unless block_given?

  dup.tap do |df|
    df.each_row_with_index do |r, i|
      df.row[i] = should_be_vector!(yield(r))
    end
  end
end
|
214
|
+
|
215
|
+
# Map each vector and return an Array.
|
216
|
+
def map_vectors(&block)
|
217
|
+
return to_enum(:map_vectors) unless block
|
218
|
+
|
219
|
+
@data.map(&block)
|
220
|
+
end
|
221
|
+
|
222
|
+
# Destructive form of #map_vectors
|
223
|
+
def map_vectors!
|
224
|
+
return to_enum(:map_vectors!) unless block_given?
|
225
|
+
|
226
|
+
vectors.dup.each do |n|
|
227
|
+
self[n] = should_be_vector!(yield(self[n]))
|
228
|
+
end
|
229
|
+
|
230
|
+
self
|
231
|
+
end
|
232
|
+
|
233
|
+
# Map vectors along with the index.
|
234
|
+
def map_vectors_with_index(&block)
|
235
|
+
return to_enum(:map_vectors_with_index) unless block
|
236
|
+
|
237
|
+
each_vector_with_index.map(&block)
|
238
|
+
end
|
239
|
+
|
240
|
+
# Map each row
|
241
|
+
def map_rows(&block)
|
242
|
+
return to_enum(:map_rows) unless block
|
243
|
+
|
244
|
+
each_row.map(&block)
|
245
|
+
end
|
246
|
+
|
247
|
+
def map_rows_with_index(&block)
|
248
|
+
return to_enum(:map_rows_with_index) unless block
|
249
|
+
|
250
|
+
each_row_with_index.map(&block)
|
251
|
+
end
|
252
|
+
|
253
|
+
def map_rows!
|
254
|
+
return to_enum(:map_rows!) unless block_given?
|
255
|
+
|
256
|
+
index.dup.each do |i|
|
257
|
+
row[i] = should_be_vector!(yield(row[i]))
|
258
|
+
end
|
259
|
+
|
260
|
+
self
|
261
|
+
end
|
262
|
+
|
263
|
+
# Applies a method, a proc, or a list of them to this DataFrame, or to the
# sub-DataFrame selected by +keys+.
#
# @param method [Symbol, Proc, Array<Symbol, Proc>] a Symbol is sent to the
#   frame, a Proc is called with the frame, and an Array is applied element by
#   element (each element converted with #to_proc)
# @param keys [Object, nil] when truthy, forwarded to #get_sub_dataframe to
#   select the frame to operate on; otherwise the whole DataFrame is used
# @param by_position [Boolean] forwarded to #get_sub_dataframe together with +keys+
# @return [Object, Array] the result, or an Array of results when +method+ is an Array
# @raise [RuntimeError] if +method+ is not a Symbol, Proc or Array
def apply_method(method, keys: nil, by_position: true)
  df = keys ? get_sub_dataframe(keys, by_position: by_position) : self

  case method
  when Symbol then df.send(method)
  when Proc then method.call(df)
  when Array
    # #to_proc works for both Symbol and Proc elements.
    method.map(&:to_proc).map { |callable| callable.call(df) }
  else
    # An explicit message: a bare `raise` would re-raise any exception currently
    # being rescued by the caller instead of reporting the bad argument.
    raise "Invalid method: expected a Symbol, Proc or Array, got #{method.class}"
  end
end
|
274
|
+
alias apply_method_on_sub_df apply_method
|
275
|
+
|
276
|
+
# Retrieves a DaruLite::Vector, based on the result of calculation
|
277
|
+
# performed on each row.
|
278
|
+
def collect_rows(&block)
|
279
|
+
return to_enum(:collect_rows) unless block
|
280
|
+
|
281
|
+
DaruLite::Vector.new(each_row.map(&block), index: @index)
|
282
|
+
end
|
283
|
+
|
284
|
+
def collect_row_with_index(&block)
|
285
|
+
return to_enum(:collect_row_with_index) unless block
|
286
|
+
|
287
|
+
DaruLite::Vector.new(each_row_with_index.map(&block), index: @index)
|
288
|
+
end
|
289
|
+
|
290
|
+
# Retrieves a DaruLite::Vector, based on the result of calculation
|
291
|
+
# performed on each vector.
|
292
|
+
def collect_vectors(&block)
|
293
|
+
return to_enum(:collect_vectors) unless block
|
294
|
+
|
295
|
+
DaruLite::Vector.new(each_vector.map(&block), index: @vectors)
|
296
|
+
end
|
297
|
+
|
298
|
+
def collect_vector_with_index(&block)
|
299
|
+
return to_enum(:collect_vector_with_index) unless block
|
300
|
+
|
301
|
+
DaruLite::Vector.new(each_vector_with_index.map(&block), index: @vectors)
|
302
|
+
end
|
303
|
+
|
304
|
+
# Generate a matrix, based on vector names of the DataFrame.
|
305
|
+
#
|
306
|
+
# @return [::Matrix]
|
307
|
+
# :nocov:
|
308
|
+
# FIXME: Even not trying to cover this: I can't get, how it is expected
|
309
|
+
# to work.... -- zverok
|
310
|
+
def collect_matrix
|
311
|
+
return to_enum(:collect_matrix) unless block_given?
|
312
|
+
|
313
|
+
vecs = vectors.to_a
|
314
|
+
rows = vecs.collect do |row|
|
315
|
+
vecs.collect do |col|
|
316
|
+
yield row, col
|
317
|
+
end
|
318
|
+
end
|
319
|
+
|
320
|
+
Matrix.rows(rows)
|
321
|
+
end
|
322
|
+
# :nocov:
|
323
|
+
|
324
|
+
private
|
325
|
+
|
326
|
+
def should_be_vector!(val)
|
327
|
+
return val if val.is_a?(DaruLite::Vector)
|
328
|
+
|
329
|
+
raise TypeError, "Every iteration must return DaruLite::Vector not #{val.class}"
|
330
|
+
end
|
331
|
+
|
332
|
+
def verify_error_message(row, test, id, i)
|
333
|
+
description, fields, = test
|
334
|
+
values = fields.empty? ? '' : " (#{fields.collect { |k| "#{k}=#{row[k]}" }.join(', ')})"
|
335
|
+
"#{i + 1} [#{row[id]}]: #{description}#{values}"
|
336
|
+
end
|
337
|
+
end
|
338
|
+
end
|
339
|
+
end
|
@@ -0,0 +1,152 @@
|
|
1
|
+
module DaruLite
|
2
|
+
class DataFrame
|
3
|
+
module Joinable
|
4
|
+
# Concatenate another DataFrame along corresponding columns.
|
5
|
+
# If columns do not exist in both dataframes, they are filled with nils
|
6
|
+
def concat(other_df)
|
7
|
+
vectors = (@vectors.to_a + other_df.vectors.to_a).uniq
|
8
|
+
|
9
|
+
data = vectors.map do |v|
|
10
|
+
get_vector_anyways(v).dup.concat(other_df.get_vector_anyways(v))
|
11
|
+
end
|
12
|
+
|
13
|
+
DaruLite::DataFrame.new(data, order: vectors)
|
14
|
+
end
|
15
|
+
|
16
|
+
# Concatenates another DataFrame as #concat.
|
17
|
+
# Additionally it tries to preserve the index. If the indices contain
|
18
|
+
# common elements, #union will overwrite the according rows in the
|
19
|
+
# first dataframe.
|
20
|
+
def union(other_df)
|
21
|
+
index = (@index.to_a + other_df.index.to_a).uniq
|
22
|
+
df = row[*(@index.to_a - other_df.index.to_a)]
|
23
|
+
|
24
|
+
df = df.concat(other_df)
|
25
|
+
df.index = DaruLite::Index.new(index)
|
26
|
+
df
|
27
|
+
end
|
28
|
+
|
29
|
+
# Merge vectors from two DataFrames. In case of name collision,
|
30
|
+
# the vectors names are changed to x_1, x_2 ....
|
31
|
+
#
|
32
|
+
# @return [DaruLite::DataFrame]
|
33
|
+
def merge(other_df)
|
34
|
+
unless nrows == other_df.nrows
|
35
|
+
raise ArgumentError,
|
36
|
+
"Number of rows must be equal in this: #{nrows} and other: #{other_df.nrows}"
|
37
|
+
end
|
38
|
+
|
39
|
+
new_fields = (@vectors.to_a + other_df.vectors.to_a)
|
40
|
+
new_fields = ArrayHelper.recode_repeated(new_fields)
|
41
|
+
DataFrame.new({}, order: new_fields).tap do |df_new|
|
42
|
+
(0...nrows).each do |i|
|
43
|
+
df_new.add_row row[i].to_a + other_df.row[i].to_a
|
44
|
+
end
|
45
|
+
df_new.index = @index if @index == other_df.index
|
46
|
+
df_new.update
|
47
|
+
end
|
48
|
+
end
|
49
|
+
|
50
|
+
# Join 2 DataFrames with SQL style joins. Currently supports inner, left
|
51
|
+
# outer, right outer and full outer joins.
|
52
|
+
#
|
53
|
+
# @param [DaruLite::DataFrame] other_df Another DataFrame on which the join is
|
54
|
+
# to be performed.
|
55
|
+
# @param [Hash] opts Options Hash
|
56
|
+
# @option :how [Symbol] Can be one of :inner, :left, :right or :outer.
|
57
|
+
# @option :on [Array] The columns on which the join is to be performed.
|
58
|
+
# Column names specified here must be common to both DataFrames.
|
59
|
+
# @option :indicator [Symbol] The name of a vector to add to the resultant
|
60
|
+
# dataframe that indicates whether the record was in the left (:left_only),
|
61
|
+
# right (:right_only), or both (:both) joining dataframes.
|
62
|
+
# @return [DaruLite::DataFrame]
|
63
|
+
# @example Inner Join
|
64
|
+
# left = DaruLite::DataFrame.new({
|
65
|
+
# :id => [1,2,3,4],
|
66
|
+
# :name => ['Pirate', 'Monkey', 'Ninja', 'Spaghetti']
|
67
|
+
# })
|
68
|
+
# right = DaruLite::DataFrame.new({
|
69
|
+
# :id => [1,2,3,4],
|
70
|
+
# :name => ['Rutabaga', 'Pirate', 'Darth Vader', 'Ninja']
|
71
|
+
# })
|
72
|
+
# left.join(right, how: :inner, on: [:name])
|
73
|
+
# #=>
|
74
|
+
# ##<DaruLite::DataFrame:82416700 @name = 74c0811b-76c6-4c42-ac93-e6458e82afb0 @size = 2>
|
75
|
+
# # id_1 name id_2
|
76
|
+
# # 0 1 Pirate 2
|
77
|
+
# # 1 3 Ninja 4
|
78
|
+
def join(other_df, opts = {})
|
79
|
+
DaruLite::Core::Merge.join(self, other_df, opts)
|
80
|
+
end
|
81
|
+
|
82
|
+
# Creates a new dataset for one to many relations
|
83
|
+
# on a dataset, based on pattern of field names.
|
84
|
+
#
|
85
|
+
# for example, you have a survey for number of children
|
86
|
+
# with this structure:
|
87
|
+
# id, name, child_name_1, child_age_1, child_name_2, child_age_2
|
88
|
+
# with
|
89
|
+
# ds.one_to_many([:id], "child_%v_%n")
|
90
|
+
# the field of first parameters will be copied verbatim
|
91
|
+
# to new dataset, and fields which responds to second
|
92
|
+
# pattern will be added one case for each different %n.
|
93
|
+
#
|
94
|
+
# @example
|
95
|
+
# cases=[
|
96
|
+
# ['1','george','red',10,'blue',20,nil,nil],
|
97
|
+
# ['2','fred','green',15,'orange',30,'white',20],
|
98
|
+
# ['3','alfred',nil,nil,nil,nil,nil,nil]
|
99
|
+
# ]
|
100
|
+
# ds=DaruLite::DataFrame.rows(cases, order:
|
101
|
+
# [:id, :name,
|
102
|
+
# :car_color1, :car_value1,
|
103
|
+
# :car_color2, :car_value2,
|
104
|
+
# :car_color3, :car_value3])
|
105
|
+
# ds.one_to_many([:id],'car_%v%n').to_matrix
|
106
|
+
# #=> Matrix[
|
107
|
+
# # ["red", "1", 10],
|
108
|
+
# # ["blue", "1", 20],
|
109
|
+
# # ["green", "2", 15],
|
110
|
+
# # ["orange", "2", 30],
|
111
|
+
# # ["white", "2", 20]
|
112
|
+
# # ]
|
113
|
+
def one_to_many(parent_fields, pattern)
|
114
|
+
vars, numbers = one_to_many_components(pattern)
|
115
|
+
|
116
|
+
DataFrame.new([], order: [*parent_fields, '_col_id', *vars]).tap do |ds|
|
117
|
+
each_row do |row|
|
118
|
+
verbatim = parent_fields.map { |f| [f, row[f]] }.to_h
|
119
|
+
numbers.each do |n|
|
120
|
+
generated = one_to_many_row row, n, vars, pattern
|
121
|
+
next if generated.values.all?(&:nil?)
|
122
|
+
|
123
|
+
ds.add_row(verbatim.merge(generated).merge('_col_id' => n))
|
124
|
+
end
|
125
|
+
end
|
126
|
+
ds.update
|
127
|
+
end
|
128
|
+
end
|
129
|
+
|
130
|
+
private
|
131
|
+
|
132
|
+
# Extracts the variable names and case numbers encoded in the vector names
# that match +pattern+.
#
# @param pattern [String] name template in which %v marks the variable part
#   and %n the numeric part (e.g. 'car_%v%n')
# @return [Array(Array<String>, Array<Integer>)] the unique variable names and
#   the sorted unique numbers; both are empty when no vector name matches
def one_to_many_components(pattern)
  # Anchor to the whole name: unanchored, the lazy digit group stops after the
  # first digit, so 'car_color10' would be read as number 1 instead of 10.
  body = pattern.gsub('%v', '(.+?)').gsub('%n', '(\\d+?)')
  re = Regexp.new "\\A(?:#{body})\\z"

  matches = @vectors.map { |v| v.scan(re) }.reject(&:empty?).flatten(1)
  # Transposing an empty list yields no columns to destructure.
  return [[], []] if matches.empty?

  vars, numbers = matches.transpose
  [vars.uniq, numbers.map(&:to_i).sort.uniq]
end
|
142
|
+
|
143
|
+
def one_to_many_row(row, number, vars, pattern)
|
144
|
+
vars
|
145
|
+
.to_h do |v|
|
146
|
+
name = pattern.sub('%v', v).sub('%n', number.to_s)
|
147
|
+
[v, row[name]]
|
148
|
+
end
|
149
|
+
end
|
150
|
+
end
|
151
|
+
end
|
152
|
+
end
|