daru_lite 0.1.1 → 0.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.rubocop_todo.yml +35 -33
- data/lib/daru_lite/data_frame/aggregatable.rb +165 -0
- data/lib/daru_lite/data_frame/calculatable.rb +140 -0
- data/lib/daru_lite/data_frame/convertible.rb +107 -0
- data/lib/daru_lite/data_frame/duplicatable.rb +64 -0
- data/lib/daru_lite/data_frame/fetchable.rb +301 -0
- data/lib/daru_lite/data_frame/filterable.rb +144 -0
- data/lib/daru_lite/data_frame/i_o_able.rb +179 -0
- data/lib/daru_lite/data_frame/indexable.rb +168 -0
- data/lib/daru_lite/data_frame/iterable.rb +339 -0
- data/lib/daru_lite/data_frame/joinable.rb +152 -0
- data/lib/daru_lite/data_frame/missable.rb +75 -0
- data/lib/daru_lite/data_frame/pivotable.rb +108 -0
- data/lib/daru_lite/data_frame/queryable.rb +67 -0
- data/lib/daru_lite/data_frame/setable.rb +109 -0
- data/lib/daru_lite/data_frame/sortable.rb +241 -0
- data/lib/daru_lite/dataframe.rb +138 -2353
- data/lib/daru_lite/index/index.rb +13 -0
- data/lib/daru_lite/maths/statistics/vector.rb +1 -1
- data/lib/daru_lite/vector/aggregatable.rb +9 -0
- data/lib/daru_lite/vector/calculatable.rb +78 -0
- data/lib/daru_lite/vector/convertible.rb +77 -0
- data/lib/daru_lite/vector/duplicatable.rb +17 -0
- data/lib/daru_lite/vector/fetchable.rb +175 -0
- data/lib/daru_lite/vector/filterable.rb +128 -0
- data/lib/daru_lite/vector/indexable.rb +77 -0
- data/lib/daru_lite/vector/iterable.rb +95 -0
- data/lib/daru_lite/vector/joinable.rb +17 -0
- data/lib/daru_lite/vector/missable.rb +124 -0
- data/lib/daru_lite/vector/queryable.rb +45 -0
- data/lib/daru_lite/vector/setable.rb +47 -0
- data/lib/daru_lite/vector/sortable.rb +113 -0
- data/lib/daru_lite/vector.rb +36 -932
- data/lib/daru_lite/version.rb +1 -1
- data/spec/data_frame/aggregatable_example.rb +65 -0
- data/spec/data_frame/buildable_example.rb +109 -0
- data/spec/data_frame/calculatable_example.rb +135 -0
- data/spec/data_frame/convertible_example.rb +180 -0
- data/spec/data_frame/duplicatable_example.rb +111 -0
- data/spec/data_frame/fetchable_example.rb +476 -0
- data/spec/data_frame/filterable_example.rb +250 -0
- data/spec/data_frame/indexable_example.rb +221 -0
- data/spec/data_frame/iterable_example.rb +465 -0
- data/spec/data_frame/joinable_example.rb +106 -0
- data/spec/data_frame/missable_example.rb +47 -0
- data/spec/data_frame/pivotable_example.rb +297 -0
- data/spec/data_frame/queryable_example.rb +92 -0
- data/spec/data_frame/setable_example.rb +482 -0
- data/spec/data_frame/sortable_example.rb +350 -0
- data/spec/dataframe_spec.rb +181 -3289
- data/spec/index/index_spec.rb +8 -0
- data/spec/vector/aggregatable_example.rb +27 -0
- data/spec/vector/calculatable_example.rb +82 -0
- data/spec/vector/convertible_example.rb +126 -0
- data/spec/vector/duplicatable_example.rb +48 -0
- data/spec/vector/fetchable_example.rb +463 -0
- data/spec/vector/filterable_example.rb +165 -0
- data/spec/vector/indexable_example.rb +201 -0
- data/spec/vector/iterable_example.rb +111 -0
- data/spec/vector/joinable_example.rb +25 -0
- data/spec/vector/missable_example.rb +88 -0
- data/spec/vector/queryable_example.rb +91 -0
- data/spec/vector/setable_example.rb +300 -0
- data/spec/vector/sortable_example.rb +242 -0
- data/spec/vector_spec.rb +111 -1805
- metadata +86 -2
@@ -0,0 +1,168 @@
|
|
1
|
+
module DaruLite
|
2
|
+
class DataFrame
|
3
|
+
module Indexable
|
4
|
+
# Strategy used by #set_index when a single column becomes a plain index.
module SetSingleIndexStrategy
  class << self
    # Number of distinct values held by column +col+ of +df+.
    def uniq_size(df, col)
      column = df[col]
      column.uniq.size
    end

    # Builds a DaruLite::Index out of the values of column +col+.
    def new_index(df, col)
      values = df[col].to_a
      DaruLite::Index.new(values)
    end

    # Removes column +col+ from +df+.
    def delete_vector(df, col)
      df.delete_vector(col)
    end
  end
end
|
17
|
+
|
18
|
+
# Strategy used by #set_index when +categorical: true+ is requested.
# It exposes no +uniq_size+: #set_index skips the uniqueness check
# for categorical indexes.
module SetCategoricalIndexStrategy
  class << self
    # Wraps the values of column +col+ in a DaruLite::CategoricalIndex.
    def new_index(df, col)
      labels = df[col].to_a
      DaruLite::CategoricalIndex.new(labels)
    end

    # Removes column +col+ from +df+.
    def delete_vector(df, col)
      df.delete_vector(col)
    end
  end
end
|
27
|
+
|
28
|
+
# Strategy used by #set_index when several columns together form the index.
module SetMultiIndexStrategy
  class << self
    # Number of distinct entries in the sub-frame made of +cols+.
    def uniq_size(df, cols)
      subset = df[*cols]
      subset.uniq.size
    end

    # Builds a DaruLite::MultiIndex from the arrays of the selected columns
    # and names it after +cols+.
    def new_index(df, cols)
      arrays = df[*cols].map_vectors(&:to_a)
      multi_index = DaruLite::MultiIndex.from_arrays(arrays)
      multi_index.name = cols
      multi_index
    end

    # Removes every column listed in +cols+ from +df+.
    def delete_vector(df, cols)
      df.delete_vectors(*cols)
    end
  end
end
|
43
|
+
|
44
|
+
# Set a particular column (or several columns) as the new index of the DataFrame
|
45
|
+
# Uses the values of one or more columns as the index of this DataFrame.
#
# @param new_index_col the column name, or a collection of names (anything
#   responding to +to_a+) for a multi index
# @param keep [Boolean] when true the source column(s) are not deleted
# @param categorical [Boolean] when true a categorical index is built and
#   the uniqueness check is skipped
# @return [self]
# @raise [ArgumentError] if the (non categorical) index values are not unique
def set_index(new_index_col, keep: false, categorical: false)
  strategy =
    if categorical
      SetCategoricalIndexStrategy
    elsif new_index_col.respond_to?(:to_a)
      new_index_col = new_index_col.to_a
      SetMultiIndexStrategy
    else
      SetSingleIndexStrategy
    end

  unless categorical
    distinct = strategy.uniq_size(self, new_index_col)
    raise ArgumentError, 'All elements in new index must be unique.' unless @size == distinct
  end

  self.index = strategy.new_index(self, new_index_col)
  strategy.delete_vector(self, new_index_col) unless keep
  self
end
|
64
|
+
|
65
|
+
# Change the index of the DataFrame and preserve the labels of the previous
|
66
|
+
# indexing. New index can be DaruLite::Index or any of its subclasses.
|
67
|
+
#
|
68
|
+
# @param [DaruLite::Index] new_index The new Index for reindexing the DataFrame.
|
69
|
+
# @example Reindexing DataFrame
|
70
|
+
# df = DaruLite::DataFrame.new({a: [1,2,3,4], b: [11,22,33,44]},
|
71
|
+
# index: ['a','b','c','d'])
|
72
|
+
# #=>
|
73
|
+
# ##<DaruLite::DataFrame:83278130 @name = b19277b8-c548-41da-ad9a-2ad8c060e273 @size = 4>
|
74
|
+
# # a b
|
75
|
+
# # a 1 11
|
76
|
+
# # b 2 22
|
77
|
+
# # c 3 33
|
78
|
+
# # d 4 44
|
79
|
+
# df.reindex DaruLite::Index.new(['b', 0, 'a', 'g'])
|
80
|
+
# #=>
|
81
|
+
# ##<DaruLite::DataFrame:83177070 @name = b19277b8-c548-41da-ad9a-2ad8c060e273 @size = 4>
|
82
|
+
# # a b
|
83
|
+
# # b 2 22
|
84
|
+
# # 0 nil nil
|
85
|
+
# # a 1 11
|
86
|
+
# # g nil nil
|
87
|
+
# Builds a new DataFrame laid out on +new_index+. Labels already present in
# the current index keep their row; unknown labels get a row of nils.
#
# @param new_index [DaruLite::Index] index (or subclass) for the new frame
# @return [DaruLite::DataFrame] the newly built frame
# @raise [ArgumentError] if +new_index+ is not a DaruLite::Index
def reindex(new_index)
  valid = new_index.is_a?(DaruLite::Index)
  raise ArgumentError, "Must pass the new index of type Index or its subclasses, not #{new_index.class}" unless valid

  blank = DaruLite::DataFrame.new({}, order: @vectors, index: new_index, name: @name)
  new_index.each_with_object(blank) do |label, target|
    target.row[label] =
      if @index.include?(label)
        row[label]
      else
        Array.new(ncols)
      end
  end
end
|
98
|
+
|
99
|
+
# Turns the current index back into regular column(s): the index is converted
# with +to_df+, its column(s) are copied into this frame under the index
# name(s), and the frame takes the index of that converted frame.
# The restored columns are placed before the existing vectors.
#
# @return [self]
def reset_index
  index_df = index.to_df
  raw_name = index.name
  names = raw_name.instance_of?(Array) ? raw_name : [raw_name]
  # Computed before any column is added so only the pre-existing vectors follow.
  reordered = names + vectors.to_a

  self.index = index_df.index
  names.each { |name| self[name] = index_df[name] }
  self.order = reordered
  self
end
|
111
|
+
|
112
|
+
# Reassign index with a new index of type DaruLite::Index or any of its subclasses.
|
113
|
+
#
|
114
|
+
# @param [DaruLite::Index] idx New index object on which the rows of the dataframe
|
115
|
+
# are to be indexed.
|
116
|
+
# @example Reassigning index of a DataFrame
|
117
|
+
# df = DaruLite::DataFrame.new({a: [1,2,3,4], b: [11,22,33,44]})
|
118
|
+
# df.index.to_a #=> [0,1,2,3]
|
119
|
+
#
|
120
|
+
# df.index = DaruLite::Index.new(['a','b','c','d'])
|
121
|
+
# df.index.to_a #=> ['a','b','c','d']
|
122
|
+
# df.row['a'].to_a #=> [1,11]
|
123
|
+
# Replaces the row index. +idx+ is passed through +Index.coerce+ and the
# resulting index is also assigned to every vector stored in the frame.
#
# @param idx the new index (anything +Index.coerce+ accepts)
# @return [self]
def index=(idx)
  coerced = Index.coerce(idx)
  @index = coerced
  @data.each do |vector|
    vector.index = @index
  end
  self
end
|
129
|
+
|
130
|
+
# Builds a new DataFrame whose vectors follow +new_vectors+. Names that exist
# in this frame keep their data; unknown names get a column of nils.
#
# @param new_vectors [DaruLite::Index] vector names for the new frame
# @return [DaruLite::DataFrame] the newly built frame
# @raise [ArgumentError] if +new_vectors+ is not a DaruLite::Index
def reindex_vectors(new_vectors)
  valid = new_vectors.is_a?(DaruLite::Index)
  raise ArgumentError, "Must pass the new index of type Index or its subclasses, not #{new_vectors.class}" unless valid

  blank = DaruLite::DataFrame.new({}, order: new_vectors, index: @index, name: @name)
  new_vectors.each_with_object(blank) do |name, target|
    target[name] =
      if @vectors.include?(name)
        self[name]
      else
        Array.new(nrows)
      end
  end
end
|
141
|
+
|
142
|
+
# Reassign vectors with a new index of type DaruLite::Index or any of its subclasses.
|
143
|
+
#
|
144
|
+
# @param new_index [DaruLite::Index] The new index object on which the vectors are to
|
145
|
+
# be indexed. Must be of the same size as ncols.
|
146
|
+
# @example Reassigning vectors of a DataFrame
|
147
|
+
# df = DaruLite::DataFrame.new({a: [1,2,3,4], b: [:a,:b,:c,:d], c: [11,22,33,44]})
|
148
|
+
# df.vectors.to_a #=> [:a, :b, :c]
|
149
|
+
#
|
150
|
+
# df.vectors = DaruLite::Index.new([:foo, :bar, :baz])
|
151
|
+
# df.vectors.to_a #=> [:foo, :bar, :baz]
|
152
|
+
# Renames the vectors of the frame in place: +new_index+ becomes the vectors
# index and each stored vector is renamed, position by position, to the
# matching entry of +new_index+.
#
# @param new_index [DaruLite::Index] new vector names; its size must equal ncols
# @return [self]
# @raise [ArgumentError] if +new_index+ is not a DaruLite::Index or its size
#   differs from the number of columns
def vectors=(new_index)
  raise ArgumentError, 'Can only reindex with Index and its subclasses' unless new_index.is_a?(DaruLite::Index)

  if new_index.size != ncols
    # The trailing space keeps the two concatenated literals from running
    # together ("... equal todataframe ...").
    raise ArgumentError, "Specified index length #{new_index.size} not equal to " \
                         "dataframe size #{ncols}"
  end

  @vectors = new_index
  @data.zip(new_index.to_a).each do |vect, name|
    vect.name = name
  end
  self
end
|
166
|
+
end
|
167
|
+
end
|
168
|
+
end
|
@@ -0,0 +1,339 @@
|
|
1
|
+
module DaruLite
|
2
|
+
class DataFrame
|
3
|
+
module Iterable
|
4
|
+
# Iterate over each index of the DataFrame.
|
5
|
+
# Yields every entry of the row index to the block.
#
# @return [self] when a block is given
# @return [Enumerator] when called without a block
def each_index(&block)
  if block
    @index.each(&block)
    self
  else
    to_enum(:each_index)
  end
end
|
12
|
+
|
13
|
+
# Iterate over each vector
|
14
|
+
# Yields every stored vector (column) to the block.
#
# @return [self] when a block is given
# @return [Enumerator] when called without a block
def each_vector(&block)
  if block
    @data.each(&block)
    self
  else
    to_enum(:each_vector)
  end
end

alias each_column each_vector
|
23
|
+
|
24
|
+
# Iterate over each vector along with the name of the vector
|
25
|
+
# Yields each vector together with its name, following the order of the
# vectors index.
#
# @yieldparam vector the stored vector
# @yieldparam name the name of that vector
# @return [self] when a block is given
# @return [Enumerator] when called without a block
def each_vector_with_index
  return to_enum(:each_vector_with_index) unless block_given?

  @vectors.each do |name|
    # The vectors index maps a name to its slot in @data.
    slot = @vectors[name]
    yield @data[slot], name
  end

  self
end

alias each_column_with_index each_vector_with_index
|
36
|
+
|
37
|
+
# Iterate over each row
|
38
|
+
# Yields every row, fetched by position through +row_at+.
#
# @return [self] when a block is given
# @return [Enumerator] when called without a block
def each_row
  return to_enum(:each_row) unless block_given?

  (0...@index.size).each { |position| yield row_at(position) }

  self
end
|
47
|
+
|
48
|
+
# Yields every row together with its index label. Rows are fetched by label
# through +access_row+.
#
# @yieldparam row the row for the label
# @yieldparam label the index label
# @return [self] when a block is given
# @return [Enumerator] when called without a block
def each_row_with_index
  return to_enum(:each_row_with_index) unless block_given?

  @index.each { |label| yield access_row(label), label }

  self
end
|
57
|
+
|
58
|
+
# Iterate over each row or vector of the DataFrame. Specify axis
|
59
|
+
# by passing :vector or :row as the argument. Default to :vector.
|
60
|
+
#
|
61
|
+
# == Description
|
62
|
+
#
|
63
|
+
# `#each` works exactly like Array#each. The default mode for `each`
|
64
|
+
# is to iterate over the columns of the DataFrame. To iterate over
|
65
|
+
# rows you must pass the axis, i.e `:row` as an argument.
|
66
|
+
#
|
67
|
+
# == Arguments
|
68
|
+
#
|
69
|
+
# * +axis+ - The axis to iterate over. Can be :vector (or :column)
|
70
|
+
# or :row. Default to :vector.
|
71
|
+
# Iterates along +axis+ by delegating to +dispatch_to_axis+ (defined
# elsewhere) with the :each operation.
#
# @param axis [Symbol] axis to iterate over, :vector by default
# @return whatever +dispatch_to_axis+ returns
def each(axis = :vector, &block)
  dispatch_to_axis(axis, :each, &block)
end
|
74
|
+
|
75
|
+
# Iterate over a row or vector and return results in a DaruLite::Vector.
|
76
|
+
# Specify axis with :vector or :row. Default to :vector.
|
77
|
+
#
|
78
|
+
# == Description
|
79
|
+
#
|
80
|
+
# The #collect iterator works similar to #map, the only difference
|
81
|
+
# being that it returns a DaruLite::Vector comprising of the results of
|
82
|
+
# each block run. The resultant Vector has the same index as that
|
83
|
+
# of the axis over which collect has iterated. It also accepts the
|
84
|
+
# optional axis argument.
|
85
|
+
#
|
86
|
+
# == Arguments
|
87
|
+
#
|
88
|
+
# * +axis+ - The axis to iterate over. Can be :vector (or :column)
|
89
|
+
# or :row. Default to :vector.
|
90
|
+
# Collects block results along +axis+ by delegating to +dispatch_to_axis_pl+
# (defined elsewhere) with the :collect operation.
#
# @param axis [Symbol] axis to iterate over, :vector by default
# @return whatever +dispatch_to_axis_pl+ returns
def collect(axis = :vector, &block)
  dispatch_to_axis_pl(axis, :collect, &block)
end
|
93
|
+
|
94
|
+
# Map over each vector or row of the data frame according to
|
95
|
+
# the argument specified. Will return an Array of the resulting
|
96
|
+
# elements. To map over each row/vector and get a DataFrame,
|
97
|
+
# see #recode.
|
98
|
+
#
|
99
|
+
# == Description
|
100
|
+
#
|
101
|
+
# The #map iterator works like Array#map. The value returned by
|
102
|
+
# each run of the block is added to an Array and the Array is
|
103
|
+
# returned. This method also accepts an axis argument, like #each.
|
104
|
+
# The default is :vector.
|
105
|
+
#
|
106
|
+
# == Arguments
|
107
|
+
#
|
108
|
+
# * +axis+ - The axis to map over. Can be :vector (or :column) or :row.
|
109
|
+
# Default to :vector.
|
110
|
+
# Maps along +axis+ by delegating to +dispatch_to_axis_pl+ (defined
# elsewhere) with the :map operation.
#
# @param axis [Symbol] axis to map over, :vector by default
# @return whatever +dispatch_to_axis_pl+ returns
def map(axis = :vector, &block)
  dispatch_to_axis_pl(axis, :map, &block)
end
|
113
|
+
|
114
|
+
# Destructive map. Modifies the DataFrame. Each run of the block
|
115
|
+
# must return a DaruLite::Vector. You can specify the axis to map over
|
116
|
+
# as the argument. Default to :vector.
|
117
|
+
#
|
118
|
+
# == Arguments
|
119
|
+
#
|
120
|
+
# * +axis+ - The axis to map over. Can be :vector (or :column) or :row.
|
121
|
+
# Default to :vector.
|
122
|
+
# Destructive map along +axis+. :vector and :column are forwarded to
# #map_vectors!, :row to #map_rows!. Any other axis does nothing and
# returns nil.
#
# @param axis [Symbol] axis to map over, :vector by default
def map!(axis = :vector, &block)
  case axis
  when :vector, :column then map_vectors!(&block)
  when :row then map_rows!(&block)
  end
end
|
129
|
+
|
130
|
+
# Maps over the DataFrame and returns a DataFrame. Each run of the
|
131
|
+
# block must return a DaruLite::Vector object. You can specify the axis
|
132
|
+
# to map over. Default to :vector.
|
133
|
+
#
|
134
|
+
# == Description
|
135
|
+
#
|
136
|
+
# Recode works similarly to #map, but an important difference between
|
137
|
+
# the two is that recode returns a modified DaruLite::DataFrame instead
|
138
|
+
# of an Array. For this reason, #recode expects that every run of the
|
139
|
+
# block to return a DaruLite::Vector.
|
140
|
+
#
|
141
|
+
# Just like map and each, recode also accepts an optional _axis_ argument.
|
142
|
+
#
|
143
|
+
# == Arguments
|
144
|
+
#
|
145
|
+
# * +axis+ - The axis to map over. Can be :vector (or :column) or :row.
|
146
|
+
# Default to :vector.
|
147
|
+
# Recodes along +axis+ by delegating to +dispatch_to_axis_pl+ (defined
# elsewhere) with the :recode operation.
#
# @param axis [Symbol] axis to map over, :vector by default
# @return whatever +dispatch_to_axis_pl+ returns
def recode(axis = :vector, &block)
  dispatch_to_axis_pl(axis, :recode, &block)
end
|
150
|
+
|
151
|
+
# Replace specified values with given value
|
152
|
+
# @param [Array] old_values values to replace with new value
|
153
|
+
# @param [object] new_value new value to replace with
|
154
|
+
# @return [DaruLite::DataFrame] Data Frame itself with old values replaced
|
155
|
+
# with new value
|
156
|
+
# @example
|
157
|
+
# df = DaruLite::DataFrame.new({
|
158
|
+
# a: [1, 2, 3, nil, Float::NAN, nil, 1, 7],
|
159
|
+
# b: [:a, :b, nil, Float::NAN, nil, 3, 5, 8],
|
160
|
+
# c: ['a', Float::NAN, 3, 4, 3, 5, nil, 7]
|
161
|
+
# }, index: 11..18)
|
162
|
+
# df.replace_values nil, Float::NAN
|
163
|
+
# # => #<DaruLite::DataFrame(8x3)>
|
164
|
+
# # a b c
|
165
|
+
# # 11 1 a a
|
166
|
+
# # 12 2 b NaN
|
167
|
+
# # 13 3 NaN 3
|
168
|
+
# # 14 NaN NaN 4
|
169
|
+
# # 15 NaN NaN 3
|
170
|
+
# # 16 NaN 3 5
|
171
|
+
# # 17 1 5 NaN
|
172
|
+
# # 18 7 8 7
|
173
|
+
# Asks every stored vector to replace +old_values+ with +new_value+.
#
# @param old_values values to be replaced
# @param new_value the replacement value
# @return [self]
def replace_values(old_values, new_value)
  @data.each do |vector|
    vector.replace_values(old_values, new_value)
  end
  self
end
|
177
|
+
|
178
|
+
# Test each row with one or more tests.
|
179
|
+
# @param tests [Proc] Each test is a Proc with the form
|
180
|
+
# *Proc.new {|row| row[:age] > 0}*
|
181
|
+
# The function returns an array with all errors.
|
182
|
+
#
|
183
|
+
# FIXME: description here is too sparse. As far as I can get,
|
184
|
+
# it should tell something about that each test is [descr, fields, block],
|
185
|
+
# and that first value may be column name to output. - zverok, 2016-05-18
|
186
|
+
# Runs every test against every row and returns the messages of the failed
# ones. Each test is an array whose last element responds to +call+ and
# receives the row; a falsy result counts as a failure. If the first argument
# is a Symbol it is used as the identifying column, otherwise the first
# vector name is used.
#
# @param tests [Array] an optional Symbol followed by test arrays
# @return [Array] flattened list of messages built by +verify_error_message+
def verify(*tests)
  id =
    if tests.first.is_a?(Symbol)
      tests.shift
    else
      @vectors.first
    end

  report = each_row_with_index.map do |row, position|
    failed = tests.reject { |*_, check| check.call(row) }
    failed.map { |test| verify_error_message(row, test, id, position) }
  end
  report.flatten
end
|
194
|
+
|
195
|
+
# Works on a +dup+ of the frame: every vector of the copy is replaced by the
# block's result for it. The block result is checked by +should_be_vector!+.
#
# @return the modified copy when a block is given
# @return [Enumerator] when called without a block
def recode_vectors
  return to_enum(:recode_vectors) unless block_given?

  copy = dup
  copy.each_vector_with_index do |vector, name|
    copy[*name] = should_be_vector!(yield(vector))
  end
  copy
end
|
204
|
+
|
205
|
+
# Works on a +dup+ of the frame: every row of the copy is replaced by the
# block's result for it. The block result is checked by +should_be_vector!+.
#
# @return the modified copy when a block is given
# @return [Enumerator] when called without a block
def recode_rows
  return to_enum(:recode_rows) unless block_given?

  copy = dup
  copy.each_row_with_index do |current, label|
    copy.row[label] = should_be_vector!(yield(current))
  end
  copy
end
|
214
|
+
|
215
|
+
# Map each vector and return an Array.
|
216
|
+
# Maps the block over every stored vector.
#
# @return [Array] the block results, when a block is given
# @return [Enumerator] when called without a block
def map_vectors(&block)
  if block
    @data.map(&block)
  else
    to_enum(:map_vectors)
  end
end
|
221
|
+
|
222
|
+
# Destructive form of #map_vectors
|
223
|
+
# Destructive form of #map_vectors: each vector is replaced in place by the
# block's result for it. The block result is checked by +should_be_vector!+.
#
# @return [self] when a block is given
# @return [Enumerator] when called without a block
def map_vectors!
  return to_enum(:map_vectors!) unless block_given?

  # Iterate over a copy of the names because the frame is written to inside the loop.
  vectors.dup.each do |name|
    current = self[name]
    self[name] = should_be_vector!(yield(current))
  end

  self
end
|
232
|
+
|
233
|
+
# Map vectors along with the index.
|
234
|
+
# Maps the block over the (vector, name) pairs produced by
# #each_vector_with_index.
#
# @return [Array] the block results, when a block is given
# @return [Enumerator] when called without a block
def map_vectors_with_index(&block)
  if block
    each_vector_with_index.map(&block)
  else
    to_enum(:map_vectors_with_index)
  end
end
|
239
|
+
|
240
|
+
# Map each row
|
241
|
+
# Maps the block over the rows produced by #each_row.
#
# @return [Array] the block results, when a block is given
# @return [Enumerator] when called without a block
def map_rows(&block)
  if block
    each_row.map(&block)
  else
    to_enum(:map_rows)
  end
end
|
246
|
+
|
247
|
+
# Maps the block over the (row, label) pairs produced by
# #each_row_with_index.
#
# @return [Array] the block results, when a block is given
# @return [Enumerator] when called without a block
def map_rows_with_index(&block)
  if block
    each_row_with_index.map(&block)
  else
    to_enum(:map_rows_with_index)
  end
end
|
252
|
+
|
253
|
+
# Destructive row map: each row is replaced in place by the block's result
# for it. The block result is checked by +should_be_vector!+.
#
# @return [self] when a block is given
# @return [Enumerator] when called without a block
def map_rows!
  return to_enum(:map_rows!) unless block_given?

  # Iterate over a copy of the index because rows are written inside the loop.
  index.dup.each do |label|
    current = row[label]
    row[label] = should_be_vector!(yield(current))
  end

  self
end
|
262
|
+
|
263
|
+
# Applies +method+ to the frame, or to a sub-frame when +keys+ is given.
#
# @param method [Symbol, Proc, Array] a Symbol is sent to the frame, a Proc
#   is called with the frame, and an Array (of Symbols and/or Procs) has each
#   element converted with +to_proc+ and called with the frame
# @param keys selection handed to +get_sub_dataframe+ (defined elsewhere);
#   when nil the whole frame is used
# @param by_position [Boolean] forwarded to +get_sub_dataframe+
# @return the result of the call, or an Array of results for an Array +method+
# @raise [RuntimeError] if +method+ is not a Symbol, Proc or Array
def apply_method(method, keys: nil, by_position: true)
  df = keys ? get_sub_dataframe(keys, by_position: by_position) : self

  case method
  when Symbol then df.send(method)
  when Proc then method.call(df)
  when Array
    method.map(&:to_proc).map { |callable| callable.call(df) } # works with Array of both Symbol and/or Proc
  else
    # Still a RuntimeError (what a bare `raise` produced), but now it says why.
    raise "Unsupported method type #{method.class}; expected Symbol, Proc or Array"
  end
end

alias apply_method_on_sub_df apply_method
|
275
|
+
|
276
|
+
# Retrieves a DaruLite::Vector, based on the result of calculation
|
277
|
+
# performed on each row.
|
278
|
+
# Builds a DaruLite::Vector from the block's result for every row; the
# vector is indexed by the frame's row index.
#
# @return [DaruLite::Vector] when a block is given
# @return [Enumerator] when called without a block
def collect_rows(&block)
  return to_enum(:collect_rows) if block.nil?

  results = each_row.map(&block)
  DaruLite::Vector.new(results, index: @index)
end
|
283
|
+
|
284
|
+
# Like #collect_rows, but the block receives each row together with its
# index label.
#
# @return [DaruLite::Vector] when a block is given
# @return [Enumerator] when called without a block
def collect_row_with_index(&block)
  return to_enum(:collect_row_with_index) if block.nil?

  results = each_row_with_index.map(&block)
  DaruLite::Vector.new(results, index: @index)
end
|
289
|
+
|
290
|
+
# Retrieves a DaruLite::Vector, based on the result of calculation
|
291
|
+
# performed on each vector.
|
292
|
+
# Builds a DaruLite::Vector from the block's result for every vector; the
# result is indexed by the frame's vector names.
#
# @return [DaruLite::Vector] when a block is given
# @return [Enumerator] when called without a block
def collect_vectors(&block)
  return to_enum(:collect_vectors) if block.nil?

  results = each_vector.map(&block)
  DaruLite::Vector.new(results, index: @vectors)
end
|
297
|
+
|
298
|
+
# Like #collect_vectors, but the block receives each vector together with
# its name.
#
# @return [DaruLite::Vector] when a block is given
# @return [Enumerator] when called without a block
def collect_vector_with_index(&block)
  return to_enum(:collect_vector_with_index) if block.nil?

  results = each_vector_with_index.map(&block)
  DaruLite::Vector.new(results, index: @vectors)
end
|
303
|
+
|
304
|
+
# Generate a matrix, based on vector names of the DataFrame.
|
305
|
+
#
|
306
|
+
# @return {::Matrix}
|
307
|
+
# :nocov:
|
308
|
+
# FIXME: Even not trying to cover this: I can't get, how it is expected
|
309
|
+
# to work.... -- zverok
|
310
|
+
# Builds a square Matrix whose rows and columns both follow the vector names.
# The block is called with (row_name, column_name) and its result becomes
# the matching cell.
#
# @return [Matrix] when a block is given
# @return [Enumerator] when called without a block
def collect_matrix
  return to_enum(:collect_matrix) unless block_given?

  names = vectors.to_a
  cells = names.map do |row_name|
    names.map { |col_name| yield row_name, col_name }
  end

  Matrix.rows(cells)
end
|
322
|
+
# :nocov:
|
323
|
+
|
324
|
+
private
|
325
|
+
|
326
|
+
# Guard used by the recode/map! helpers: passes +val+ through when it is a
# DaruLite::Vector.
#
# @return [DaruLite::Vector] +val+ itself
# @raise [TypeError] if +val+ is not a DaruLite::Vector
def should_be_vector!(val)
  unless val.is_a?(DaruLite::Vector)
    raise TypeError, "Every iteration must return DaruLite::Vector not #{val.class}"
  end

  val
end
|
331
|
+
|
332
|
+
# Formats one #verify failure as "<position + 1> [<id value>]: <description>",
# followed by " (field=value, ...)" when the test lists fields.
#
# @param row the failing row, read with +row[key]+
# @param test [Array] the test; its first two elements are the description
#   and the list of fields to report
# @param id key of the identifying column
# @param i [Integer] zero-based position of the row
# @return [String]
def verify_error_message(row, test, id, i)
  description, fields, = test
  details = fields.map { |field| "#{field}=#{row[field]}" }.join(', ')
  suffix = fields.empty? ? '' : " (#{details})"
  "#{i + 1} [#{row[id]}]: #{description}#{suffix}"
end
|
337
|
+
end
|
338
|
+
end
|
339
|
+
end
|
@@ -0,0 +1,152 @@
|
|
1
|
+
module DaruLite
|
2
|
+
class DataFrame
|
3
|
+
module Joinable
|
4
|
+
# Concatenate another DataFrame along corresponding columns.
|
5
|
+
# If columns do not exist in both dataframes, they are filled with nils
|
6
|
+
# Stacks +other_df+ below this frame. The result has the union of both
# frames' vector names (this frame's first). Columns are fetched with
# +get_vector_anyways+ on both sides.
#
# @param other_df [DaruLite::DataFrame] frame to append
# @return [DaruLite::DataFrame] a new frame; neither operand is modified
def concat(other_df)
  names = @vectors.to_a | other_df.vectors.to_a

  columns = names.map do |name|
    # dup so the receiver's own vector is not extended in place
    own = get_vector_anyways(name).dup
    own.concat(other_df.get_vector_anyways(name))
  end

  DaruLite::DataFrame.new(columns, order: names)
end
|
15
|
+
|
16
|
+
# Concatenates another DataFrame as #concat.
|
17
|
+
# Additionally it tries to preserve the index. If the indices contain
|
18
|
+
# common elements, #union will overwrite the according rows in the
|
19
|
+
# first dataframe.
|
20
|
+
# Concatenates +other_df+ like #concat while keeping index labels. Rows whose
# label also exists in +other_df+ are dropped from this frame first, so the
# other frame's version of those rows wins.
#
# The result is laid out as: this frame's own-only rows, then every row of
# +other_df+. The index is built in that same order so each label stays with
# its row. (Building it as "(own + other).uniq" mislabels rows whenever a
# shared label comes before an own-only label.)
#
# @param other_df [DaruLite::DataFrame] frame to merge in
# @return the concatenated frame with the combined index assigned
def union(other_df)
  other_labels = other_df.index.to_a
  own_only = @index.to_a - other_labels

  df = row[*own_only]
  df = df.concat(other_df)
  df.index = DaruLite::Index.new(own_only + other_labels)
  df
end
|
28
|
+
|
29
|
+
# Merge vectors from two DataFrames. In case of name collision,
|
30
|
+
# the vectors names are changed to x_1, x_2 ....
|
31
|
+
#
|
32
|
+
# @return {DaruLite::DataFrame}
|
33
|
+
# Places the vectors of +other_df+ next to this frame's vectors, row by row.
# The combined vector names go through +ArrayHelper.recode_repeated+. The
# receiver's index is reused only when both frames have equal indexes.
#
# @param other_df [DaruLite::DataFrame] frame with the same number of rows
# @return [DaruLite::DataFrame] the merged frame
# @raise [ArgumentError] if the two frames have different row counts
def merge(other_df)
  if nrows != other_df.nrows
    raise ArgumentError,
          "Number of rows must be equal in this: #{nrows} and other: #{other_df.nrows}"
  end

  combined_names = ArrayHelper.recode_repeated(@vectors.to_a + other_df.vectors.to_a)
  merged = DataFrame.new({}, order: combined_names)

  nrows.times do |position|
    merged.add_row(row[position].to_a + other_df.row[position].to_a)
  end
  merged.index = @index if @index == other_df.index
  merged.update
  merged
end
|
49
|
+
|
50
|
+
# Join 2 DataFrames with SQL style joins. Currently supports inner, left
|
51
|
+
# outer, right outer and full outer joins.
|
52
|
+
#
|
53
|
+
# @param [DaruLite::DataFrame] other_df Another DataFrame on which the join is
|
54
|
+
# to be performed.
|
55
|
+
# @param [Hash] opts Options Hash
|
56
|
+
# @option :how [Symbol] Can be one of :inner, :left, :right or :outer.
|
57
|
+
# @option :on [Array] The columns on which the join is to be performed.
|
58
|
+
# Column names specified here must be common to both DataFrames.
|
59
|
+
# @option :indicator [Symbol] The name of a vector to add to the resultant
|
60
|
+
# dataframe that indicates whether the record was in the left (:left_only),
|
61
|
+
# right (:right_only), or both (:both) joining dataframes.
|
62
|
+
# @return [DaruLite::DataFrame]
|
63
|
+
# @example Inner Join
|
64
|
+
# left = DaruLite::DataFrame.new({
|
65
|
+
# :id => [1,2,3,4],
|
66
|
+
# :name => ['Pirate', 'Monkey', 'Ninja', 'Spaghetti']
|
67
|
+
# })
|
68
|
+
# right = DaruLite::DataFrame.new({
|
69
|
+
# :id => [1,2,3,4],
|
70
|
+
# :name => ['Rutabaga', 'Pirate', 'Darth Vader', 'Ninja']
|
71
|
+
# })
|
72
|
+
# left.join(right, how: :inner, on: [:name])
|
73
|
+
# #=>
|
74
|
+
# ##<DaruLite::DataFrame:82416700 @name = 74c0811b-76c6-4c42-ac93-e6458e82afb0 @size = 2>
|
75
|
+
# # id_1 name id_2
|
76
|
+
# # 0 1 Pirate 2
|
77
|
+
# # 1 3 Ninja 4
|
78
|
+
# Joins this frame with +other_df+ by delegating to
# +DaruLite::Core::Merge.join+, with the receiver as the left operand.
#
# @param other_df [DaruLite::DataFrame] the right-hand frame
# @param opts [Hash] join options, passed through untouched
# @return whatever +DaruLite::Core::Merge.join+ returns
def join(other_df, opts = {})
  left = self
  DaruLite::Core::Merge.join(left, other_df, opts)
end
|
81
|
+
|
82
|
+
# Creates a new dataset for one to many relations
|
83
|
+
# on a dataset, based on pattern of field names.
|
84
|
+
#
|
85
|
+
# for example, you have a survey for number of children
|
86
|
+
# with this structure:
|
87
|
+
# id, name, child_name_1, child_age_1, child_name_2, child_age_2
|
88
|
+
# with
|
89
|
+
# ds.one_to_many([:id], "child_%v_%n")
|
90
|
+
# the field of first parameters will be copied verbatim
|
91
|
+
# to new dataset, and fields which responds to second
|
92
|
+
# pattern will be added one case for each different %n.
|
93
|
+
#
|
94
|
+
# @example
|
95
|
+
# cases=[
|
96
|
+
# ['1','george','red',10,'blue',20,nil,nil],
|
97
|
+
# ['2','fred','green',15,'orange',30,'white',20],
|
98
|
+
# ['3','alfred',nil,nil,nil,nil,nil,nil]
|
99
|
+
# ]
|
100
|
+
# ds=DaruLite::DataFrame.rows(cases, order:
|
101
|
+
# [:id, :name,
|
102
|
+
# :car_color1, :car_value1,
|
103
|
+
# :car_color2, :car_value2,
|
104
|
+
# :car_color3, :car_value3])
|
105
|
+
# ds.one_to_many([:id],'car_%v%n').to_matrix
|
106
|
+
# #=> Matrix[
|
107
|
+
# # ["red", "1", 10],
|
108
|
+
# # ["blue", "1", 20],
|
109
|
+
# # ["green", "2", 15],
|
110
|
+
# # ["orange", "2", 30],
|
111
|
+
# # ["white", "2", 20]
|
112
|
+
# # ]
|
113
|
+
# Builds a new frame holding one row per (source row, number) pair for the
# numbered fields described by +pattern+.
#
# Each generated row carries the +parent_fields+ values copied from the
# source row, the number under '_col_id', and one value per variable found by
# +one_to_many_components+. Pairs whose generated values are all nil are
# skipped.
#
# @param parent_fields [Array] fields copied verbatim into every generated row
# @param pattern [String] field-name pattern using %v (variable) and %n (number)
# @return [DataFrame] the generated frame
def one_to_many(parent_fields, pattern)
  vars, numbers = one_to_many_components(pattern)
  result = DataFrame.new([], order: [*parent_fields, '_col_id', *vars])

  each_row do |row|
    verbatim = parent_fields.to_h { |field| [field, row[field]] }
    numbers.each do |n|
      generated = one_to_many_row(row, n, vars, pattern)
      next if generated.values.all?(&:nil?)

      result.add_row(verbatim.merge(generated).merge('_col_id' => n))
    end
  end

  result.update
  result
end
|
129
|
+
|
130
|
+
private
|
131
|
+
|
132
|
+
# Scans the vector names with +pattern+ and extracts what #one_to_many needs.
# %v is turned into a lazy "(.+?)" capture and %n into a lazy "(\d+?)"
# capture. The first capture of each match is read as the variable name and
# the second as the number.
#
# @param pattern [String] field-name pattern containing %v and %n
# @return [Array(Array, Array<Integer>)] the unique variable names and the
#   sorted unique numbers; two empty arrays when no vector name matches
def one_to_many_components(pattern)
  re = Regexp.new pattern.gsub('%v', '(.+?)').gsub('%n', '(\\d+?)')

  matches =
    @vectors
    .map { |v| v.scan(re) }
    .reject(&:empty?).flatten(1)

  # With no match, transpose yields [] and the destructuring below would
  # leave vars nil, so report "nothing found" explicitly.
  return [[], []] if matches.empty?

  vars, numbers = matches.transpose

  [vars.uniq, numbers.map(&:to_i).sort.uniq]
end
|
142
|
+
|
143
|
+
# For one source row and one number, looks up the value of every variable.
# The field name is +pattern+ with the first %v replaced by the variable and
# the first %n replaced by the number.
#
# @param row the source row, read with +row[field_name]+
# @param number [Integer] the number substituted for %n
# @param vars [Array<String>] variable names substituted for %v
# @param pattern [String] field-name pattern
# @return [Hash] variable name => value found in the row
def one_to_many_row(row, number, vars, pattern)
  suffix = number.to_s
  vars.each_with_object({}) do |var, generated|
    column = pattern.sub('%v', var).sub('%n', suffix)
    generated[var] = row[column]
  end
end
|
150
|
+
end
|
151
|
+
end
|
152
|
+
end
|