daru_lite 0.1 → 0.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.github/ISSUE_TEMPLATE/bug_report.md +38 -0
- data/.github/ISSUE_TEMPLATE/feature_request.md +20 -0
- data/.github/workflows/ci.yml +20 -0
- data/.rubocop_todo.yml +35 -33
- data/README.md +19 -115
- data/daru_lite.gemspec +1 -0
- data/lib/daru_lite/data_frame/aggregatable.rb +165 -0
- data/lib/daru_lite/data_frame/calculatable.rb +140 -0
- data/lib/daru_lite/data_frame/convertible.rb +107 -0
- data/lib/daru_lite/data_frame/duplicatable.rb +64 -0
- data/lib/daru_lite/data_frame/fetchable.rb +301 -0
- data/lib/daru_lite/data_frame/filterable.rb +144 -0
- data/lib/daru_lite/data_frame/i_o_able.rb +179 -0
- data/lib/daru_lite/data_frame/indexable.rb +168 -0
- data/lib/daru_lite/data_frame/iterable.rb +339 -0
- data/lib/daru_lite/data_frame/joinable.rb +152 -0
- data/lib/daru_lite/data_frame/missable.rb +75 -0
- data/lib/daru_lite/data_frame/pivotable.rb +108 -0
- data/lib/daru_lite/data_frame/queryable.rb +67 -0
- data/lib/daru_lite/data_frame/setable.rb +109 -0
- data/lib/daru_lite/data_frame/sortable.rb +241 -0
- data/lib/daru_lite/dataframe.rb +142 -2355
- data/lib/daru_lite/index/index.rb +13 -0
- data/lib/daru_lite/maths/statistics/vector.rb +1 -1
- data/lib/daru_lite/vector/aggregatable.rb +9 -0
- data/lib/daru_lite/vector/calculatable.rb +78 -0
- data/lib/daru_lite/vector/convertible.rb +77 -0
- data/lib/daru_lite/vector/duplicatable.rb +17 -0
- data/lib/daru_lite/vector/fetchable.rb +175 -0
- data/lib/daru_lite/vector/filterable.rb +128 -0
- data/lib/daru_lite/vector/indexable.rb +77 -0
- data/lib/daru_lite/vector/iterable.rb +95 -0
- data/lib/daru_lite/vector/joinable.rb +17 -0
- data/lib/daru_lite/vector/missable.rb +124 -0
- data/lib/daru_lite/vector/queryable.rb +45 -0
- data/lib/daru_lite/vector/setable.rb +47 -0
- data/lib/daru_lite/vector/sortable.rb +113 -0
- data/lib/daru_lite/vector.rb +36 -932
- data/lib/daru_lite/version.rb +1 -1
- data/spec/data_frame/aggregatable_example.rb +65 -0
- data/spec/data_frame/buildable_example.rb +109 -0
- data/spec/data_frame/calculatable_example.rb +135 -0
- data/spec/data_frame/convertible_example.rb +180 -0
- data/spec/data_frame/duplicatable_example.rb +111 -0
- data/spec/data_frame/fetchable_example.rb +476 -0
- data/spec/data_frame/filterable_example.rb +250 -0
- data/spec/data_frame/indexable_example.rb +221 -0
- data/spec/data_frame/iterable_example.rb +465 -0
- data/spec/data_frame/joinable_example.rb +106 -0
- data/spec/data_frame/missable_example.rb +47 -0
- data/spec/data_frame/pivotable_example.rb +297 -0
- data/spec/data_frame/queryable_example.rb +92 -0
- data/spec/data_frame/setable_example.rb +482 -0
- data/spec/data_frame/sortable_example.rb +350 -0
- data/spec/dataframe_spec.rb +181 -3243
- data/spec/index/index_spec.rb +8 -0
- data/spec/vector/aggregatable_example.rb +27 -0
- data/spec/vector/calculatable_example.rb +82 -0
- data/spec/vector/convertible_example.rb +126 -0
- data/spec/vector/duplicatable_example.rb +48 -0
- data/spec/vector/fetchable_example.rb +463 -0
- data/spec/vector/filterable_example.rb +165 -0
- data/spec/vector/indexable_example.rb +201 -0
- data/spec/vector/iterable_example.rb +111 -0
- data/spec/vector/joinable_example.rb +25 -0
- data/spec/vector/missable_example.rb +88 -0
- data/spec/vector/queryable_example.rb +91 -0
- data/spec/vector/setable_example.rb +300 -0
- data/spec/vector/sortable_example.rb +242 -0
- data/spec/vector_spec.rb +111 -1805
- metadata +102 -3
- data/.github/ISSUE_TEMPLATE.md +0 -18
@@ -0,0 +1,339 @@
|
|
1
|
+
module DaruLite
|
2
|
+
class DataFrame
|
3
|
+
module Iterable
|
4
|
+
# Iterate over each index of the DataFrame.
|
5
|
+
# Yields every element of @index in order and returns the receiver.
# Without a block an Enumerator for this method is returned instead.
def each_index(&block)
  if block
    @index.each(&block)
    self
  else
    to_enum(:each_index)
  end
end
|
12
|
+
|
13
|
+
# Iterate over each vector
|
14
|
+
# Yields every element of @data in order and returns the receiver.
# Without a block an Enumerator for this method is returned instead.
def each_vector(&block)
  if block
    @data.each(&block)
    self
  else
    to_enum(:each_vector)
  end
end
|
21
|
+
|
22
|
+
alias each_column each_vector
|
23
|
+
|
24
|
+
# Iterate over each vector along with the name of the vector
|
25
|
+
# Yields (vector, name) for every name in @vectors; the vector is read from
# @data at the position that @vectors reports for that name.
# Returns the receiver, or an Enumerator when no block is given.
def each_vector_with_index
  return to_enum(:each_vector_with_index) unless block_given?

  @vectors.each do |name|
    position = @vectors[name]
    yield @data[position], name
  end

  self
end
|
34
|
+
|
35
|
+
alias each_column_with_index each_vector_with_index
|
36
|
+
|
37
|
+
# Iterate over each row
|
38
|
+
# Yields the row at every position from 0 up to @index.size - 1 (fetched
# with #row_at) and returns the receiver.
# Without a block an Enumerator for this method is returned instead.
def each_row
  return to_enum(:each_row) unless block_given?

  (0...@index.size).each { |position| yield row_at(position) }

  self
end
|
47
|
+
|
48
|
+
# Yields (row, label) for every label in @index; the row is looked up with
# #access_row. Returns the receiver, or an Enumerator when no block is given.
def each_row_with_index
  return to_enum(:each_row_with_index) unless block_given?

  @index.each do |label|
    current_row = access_row(label)
    yield current_row, label
  end

  self
end
|
57
|
+
|
58
|
+
# Iterate over each row or vector of the DataFrame. Specify axis
|
59
|
+
# by passing :vector or :row as the argument. Default to :vector.
|
60
|
+
#
|
61
|
+
# == Description
|
62
|
+
#
|
63
|
+
# `#each` works exactly like Array#each. The default mode for `each`
|
64
|
+
# is to iterate over the columns of the DataFrame. To iterate over
|
65
|
+
# rows you must pass the axis, i.e `:row` as an argument.
|
66
|
+
#
|
67
|
+
# == Arguments
|
68
|
+
#
|
69
|
+
# * +axis+ - The axis to iterate over. Can be :vector (or :column)
|
70
|
+
# or :row. Default to :vector.
|
71
|
+
def each(axis = :vector, &block)
  # Axis handling is delegated to #dispatch_to_axis with the :each method name.
  dispatch_to_axis(axis, :each, &block)
end
|
74
|
+
|
75
|
+
# Iterate over a row or vector and return results in a DaruLite::Vector.
|
76
|
+
# Specify axis with :vector or :row. Default to :vector.
|
77
|
+
#
|
78
|
+
# == Description
|
79
|
+
#
|
80
|
+
# The #collect iterator works similar to #map, the only difference
|
81
|
+
# being that it returns a DaruLite::Vector comprising of the results of
|
82
|
+
# each block run. The resultant Vector has the same index as that
|
83
|
+
# of the axis over which collect has iterated. It also accepts the
|
84
|
+
# optional axis argument.
|
85
|
+
#
|
86
|
+
# == Arguments
|
87
|
+
#
|
88
|
+
# * +axis+ - The axis to iterate over. Can be :vector (or :column)
|
89
|
+
# or :row. Default to :vector.
|
90
|
+
def collect(axis = :vector, &block)
  # Axis handling is delegated to #dispatch_to_axis_pl with the :collect method name.
  dispatch_to_axis_pl(axis, :collect, &block)
end
|
93
|
+
|
94
|
+
# Map over each vector or row of the data frame according to
|
95
|
+
# the argument specified. Will return an Array of the resulting
|
96
|
+
# elements. To map over each row/vector and get a DataFrame,
|
97
|
+
# see #recode.
|
98
|
+
#
|
99
|
+
# == Description
|
100
|
+
#
|
101
|
+
# The #map iterator works like Array#map. The value returned by
|
102
|
+
# each run of the block is added to an Array and the Array is
|
103
|
+
# returned. This method also accepts an axis argument, like #each.
|
104
|
+
# The default is :vector.
|
105
|
+
#
|
106
|
+
# == Arguments
|
107
|
+
#
|
108
|
+
# * +axis+ - The axis to map over. Can be :vector (or :column) or :row.
|
109
|
+
# Default to :vector.
|
110
|
+
def map(axis = :vector, &block)
  # Axis handling is delegated to #dispatch_to_axis_pl with the :map method name.
  dispatch_to_axis_pl(axis, :map, &block)
end
|
113
|
+
|
114
|
+
# Destructive map. Modifies the DataFrame. Each run of the block
|
115
|
+
# must return a DaruLite::Vector. You can specify the axis to map over
|
116
|
+
# as the argument. Default to :vector.
|
117
|
+
#
|
118
|
+
# == Arguments
|
119
|
+
#
|
120
|
+
# * +axis+ - The axis to map over. Can be :vector (or :column) or :row.
|
121
|
+
# Default to :vector.
|
122
|
+
# Destructively maps over the chosen axis.
#
# @param axis [Symbol] :vector or :column to map over vectors (delegates to
#   #map_vectors!), :row to map over rows (delegates to #map_rows!).
# @return whatever the delegated method returns.
# @raise [ArgumentError] when +axis+ is none of :vector, :column or :row.
#   Previously an unrecognised axis was silently ignored and nil returned,
#   which hid typos such as :rows.
def map!(axis = :vector, &block)
  case axis
  when :vector, :column then map_vectors!(&block)
  when :row then map_rows!(&block)
  else raise ArgumentError, "Unknown axis #{axis}"
  end
end
|
129
|
+
|
130
|
+
# Maps over the DataFrame and returns a DataFrame. Each run of the
|
131
|
+
# block must return a DaruLite::Vector object. You can specify the axis
|
132
|
+
# to map over. Default to :vector.
|
133
|
+
#
|
134
|
+
# == Description
|
135
|
+
#
|
136
|
+
# Recode works similarly to #map, but an important difference between
|
137
|
+
# the two is that recode returns a modified DaruLite::DataFrame instead
|
138
|
+
# of an Array. For this reason, #recode expects that every run of the
|
139
|
+
# block to return a DaruLite::Vector.
|
140
|
+
#
|
141
|
+
# Just like map and each, recode also accepts an optional _axis_ argument.
|
142
|
+
#
|
143
|
+
# == Arguments
|
144
|
+
#
|
145
|
+
# * +axis+ - The axis to map over. Can be :vector (or :column) or :row.
|
146
|
+
# Default to :vector.
|
147
|
+
def recode(axis = :vector, &block)
  # Axis handling is delegated to #dispatch_to_axis_pl with the :recode method name.
  dispatch_to_axis_pl(axis, :recode, &block)
end
|
150
|
+
|
151
|
+
# Replace specified values with given value
|
152
|
+
# @param [Array] old_values values to replace with new value
|
153
|
+
# @param [object] new_value new value to replace with
|
154
|
+
# @return [DaruLite::DataFrame] Data Frame itself with old values replaced
|
155
|
+
# with new value
|
156
|
+
# @example
|
157
|
+
# df = DaruLite::DataFrame.new({
|
158
|
+
# a: [1, 2, 3, nil, Float::NAN, nil, 1, 7],
|
159
|
+
# b: [:a, :b, nil, Float::NAN, nil, 3, 5, 8],
|
160
|
+
# c: ['a', Float::NAN, 3, 4, 3, 5, nil, 7]
|
161
|
+
# }, index: 11..18)
|
162
|
+
# df.replace_values nil, Float::NAN
|
163
|
+
# # => #<DaruLite::DataFrame(8x3)>
|
164
|
+
# # a b c
|
165
|
+
# # 11 1 a a
|
166
|
+
# # 12 2 b NaN
|
167
|
+
# # 13 3 NaN 3
|
168
|
+
# # 14 NaN NaN 4
|
169
|
+
# # 15 NaN NaN 3
|
170
|
+
# # 16 NaN 3 5
|
171
|
+
# # 17 1 5 NaN
|
172
|
+
# # 18 7 8 7
|
173
|
+
def replace_values(old_values, new_value)
  # Forward the replacement to every element of @data, then return the
  # receiver so calls can be chained.
  @data.each do |vector|
    vector.replace_values(old_values, new_value)
  end

  self
end
|
177
|
+
|
178
|
+
# Test each row with one or more tests.
|
179
|
+
# @param tests [Proc] Each test is a Proc with the form
|
180
|
+
# *Proc.new {|row| row[:age] > 0}*
|
181
|
+
# The function returns an array with all errors.
|
182
|
+
#
|
183
|
+
# FIXME: description here is too sparse. As far as I can get,
|
184
|
+
# it should tell something about that each test is [descr, fields, block],
|
185
|
+
# and that first value may be column name to output. - zverok, 2016-05-18
|
186
|
+
def verify(*tests)
  # A leading Symbol names the vector used to identify rows in the messages;
  # otherwise the first entry of @vectors is used.
  id =
    if tests.first.is_a?(Symbol)
      tests.shift
    else
      @vectors.first
    end

  each_row_with_index.flat_map do |row, i|
    # The last element of each test is the callable; tests whose callable
    # returns a truthy value for the row are dropped.
    failed = tests.reject { |*_, check| check.call(row) }
    failed.map { |test| verify_error_message(row, test, id, i) }
  end
end
|
194
|
+
|
195
|
+
# Works on a #dup of the receiver: every vector is replaced by the block's
# result for it, which must be a DaruLite::Vector (checked by
# #should_be_vector!). Returns the copy, or an Enumerator without a block.
def recode_vectors
  return to_enum(:recode_vectors) unless block_given?

  copy = dup
  copy.each_vector_with_index do |vector, name|
    copy[*name] = should_be_vector!(yield(vector))
  end
  copy
end
|
204
|
+
|
205
|
+
# Works on a #dup of the receiver: every row is replaced by the block's
# result for it, which must be a DaruLite::Vector (checked by
# #should_be_vector!). Returns the copy, or an Enumerator without a block.
def recode_rows
  return to_enum(:recode_rows) unless block_given?

  copy = dup
  copy.each_row_with_index do |current_row, label|
    copy.row[label] = should_be_vector!(yield(current_row))
  end
  copy
end
|
214
|
+
|
215
|
+
# Map each vector and return an Array.
|
216
|
+
# Returns the Array produced by running the block over every element of
# @data, or an Enumerator when no block is given.
def map_vectors(&block)
  if block
    @data.map(&block)
  else
    to_enum(:map_vectors)
  end
end
|
221
|
+
|
222
|
+
# Destructive form of #map_vectors
|
223
|
+
# Replaces every vector of the receiver with the block's result for it.
# Each result must be a DaruLite::Vector (checked by #should_be_vector!).
# Returns the receiver, or an Enumerator when no block is given.
def map_vectors!
  return to_enum(:map_vectors!) unless block_given?

  # Iterate over a copy of the names because the loop assigns into self.
  vectors.dup.each do |name|
    replacement = yield(self[name])
    self[name] = should_be_vector!(replacement)
  end

  self
end
|
232
|
+
|
233
|
+
# Map vectors along with the index.
|
234
|
+
# Maps the block over the enumerator returned by #each_vector_with_index and
# returns the resulting Array, or an Enumerator when no block is given.
def map_vectors_with_index(&block)
  if block
    each_vector_with_index.map(&block)
  else
    to_enum(:map_vectors_with_index)
  end
end
|
239
|
+
|
240
|
+
# Map each row
|
241
|
+
# Maps the block over the enumerator returned by #each_row and returns the
# resulting Array, or an Enumerator when no block is given.
def map_rows(&block)
  if block
    each_row.map(&block)
  else
    to_enum(:map_rows)
  end
end
|
246
|
+
|
247
|
+
# Maps the block over the enumerator returned by #each_row_with_index and
# returns the resulting Array, or an Enumerator when no block is given.
def map_rows_with_index(&block)
  if block
    each_row_with_index.map(&block)
  else
    to_enum(:map_rows_with_index)
  end
end
|
252
|
+
|
253
|
+
# Replaces every row of the receiver with the block's result for it.
# Each result must be a DaruLite::Vector (checked by #should_be_vector!).
# Returns the receiver, or an Enumerator when no block is given.
def map_rows!
  return to_enum(:map_rows!) unless block_given?

  # Iterate over a copy of the index because the loop assigns rows into self.
  index.dup.each do |label|
    replacement = yield(row[label])
    row[label] = should_be_vector!(replacement)
  end

  self
end
|
262
|
+
|
263
|
+
# Applies +method+ to the receiver, or to the sub-dataframe selected by
# +keys+ (obtained from #get_sub_dataframe) when +keys+ is given.
#
# @param method [Symbol, Proc, Array<Symbol, Proc>] a Symbol is sent to the
#   target, a Proc is called with the target, and an Array is converted
#   element-wise with #to_proc and each proc is called with the target.
# @param keys passed to #get_sub_dataframe; nil means "use self".
# @param by_position forwarded to #get_sub_dataframe.
# @return the result of the call, or an Array of results for an Array +method+.
# @raise [RuntimeError] when +method+ is none of the supported kinds.
def apply_method(method, keys: nil, by_position: true)
  df = keys ? get_sub_dataframe(keys, by_position: by_position) : self

  case method
  when Symbol then df.send(method)
  when Proc then method.call(df)
  when Array
    method.map(&:to_proc).map { |callable| callable.call(df) } # works with Array of both Symbol and/or Proc
  else
    # An explicit message is required here: a bare `raise` would re-raise an
    # unrelated in-flight exception ($!) when this method is called from
    # inside a rescue clause, and would otherwise carry no useful message.
    raise "Unsupported method #{method.inspect}: expected a Symbol, a Proc or an Array of them"
  end
end
|
274
|
+
alias apply_method_on_sub_df apply_method
|
275
|
+
|
276
|
+
# Retrieves a DaruLite::Vector, based on the result of calculation
|
277
|
+
# performed on each row.
|
278
|
+
def collect_rows(&block)
  return to_enum(:collect_rows) unless block

  # One result per row, wrapped in a vector that reuses this frame's @index.
  results = each_row.map(&block)
  DaruLite::Vector.new(results, index: @index)
end
|
283
|
+
|
284
|
+
# Like #collect_rows, but the block receives what #each_row_with_index
# yields. Returns a DaruLite::Vector that reuses @index, or an Enumerator
# when no block is given.
def collect_row_with_index(&block)
  return to_enum(:collect_row_with_index) unless block

  results = each_row_with_index.map(&block)
  DaruLite::Vector.new(results, index: @index)
end
|
289
|
+
|
290
|
+
# Retrieves a DaruLite::Vector, based on the result of calculation
|
291
|
+
# performed on each vector.
|
292
|
+
def collect_vectors(&block)
  return to_enum(:collect_vectors) unless block

  # One result per vector, wrapped in a vector indexed by @vectors.
  results = each_vector.map(&block)
  DaruLite::Vector.new(results, index: @vectors)
end
|
297
|
+
|
298
|
+
# Like #collect_vectors, but the block receives what #each_vector_with_index
# yields. Returns a DaruLite::Vector indexed by @vectors, or an Enumerator
# when no block is given.
def collect_vector_with_index(&block)
  return to_enum(:collect_vector_with_index) unless block

  results = each_vector_with_index.map(&block)
  DaruLite::Vector.new(results, index: @vectors)
end
|
303
|
+
|
304
|
+
# Generate a matrix, based on vector names of the DataFrame.
|
305
|
+
#
|
306
|
+
# @return {::Matrix}
|
307
|
+
# :nocov:
|
308
|
+
# FIXME: Even not trying to cover this: I can't get, how it is expected
|
309
|
+
# to work.... -- zverok
|
310
|
+
def collect_matrix
  return to_enum(:collect_matrix) unless block_given?

  # The block is called once per ordered pair of vector names
  # (row name, column name); its results become the matrix cells.
  names = vectors.to_a
  cells = names.collect do |row_name|
    names.collect { |col_name| yield(row_name, col_name) }
  end

  Matrix.rows(cells)
end
|
322
|
+
# :nocov:
|
323
|
+
|
324
|
+
private
|
325
|
+
|
326
|
+
# Returns +val+ unchanged when it is a DaruLite::Vector.
# @raise [TypeError] for any other object.
def should_be_vector!(val)
  unless val.is_a?(DaruLite::Vector)
    raise TypeError, "Every iteration must return DaruLite::Vector not #{val.class}"
  end

  val
end
|
331
|
+
|
332
|
+
# Builds one message for a failed test in the form
# "<i + 1> [<row[id]>]: <description>", followed by " (field=value, ...)"
# when the test lists any fields.
def verify_error_message(row, test, id, i)
  description, fields, = test

  values =
    if fields.empty?
      ''
    else
      pairs = fields.collect { |k| "#{k}=#{row[k]}" }
      " (#{pairs.join(', ')})"
    end

  "#{i + 1} [#{row[id]}]: #{description}#{values}"
end
|
337
|
+
end
|
338
|
+
end
|
339
|
+
end
|
@@ -0,0 +1,152 @@
|
|
1
|
+
module DaruLite
|
2
|
+
class DataFrame
|
3
|
+
module Joinable
|
4
|
+
# Concatenate another DataFrame along corresponding columns.
|
5
|
+
# If columns do not exist in both dataframes, they are filled with nils
|
6
|
+
def concat(other_df)
  # Vector names of both frames, first occurrence wins, original order kept.
  names = [*@vectors.to_a, *other_df.vectors.to_a].uniq

  columns = names.map do |name|
    own = get_vector_anyways(name).dup
    own.concat(other_df.get_vector_anyways(name))
  end

  DaruLite::DataFrame.new(columns, order: names)
end
|
15
|
+
|
16
|
+
# Concatenates another DataFrame as #concat.
|
17
|
+
# Additionally it tries to preserve the index. If the indices contain
|
18
|
+
# common elements, #union will overwrite the according rows in the
|
19
|
+
# first dataframe.
|
20
|
+
# Concatenates +other_df+ below the rows of the receiver whose labels do
# not occur in +other_df+, so rows sharing a label come from +other_df+.
#
# The resulting index is built in the same order as the rows: first the
# labels kept from the receiver, then every label of +other_df+. (Building
# it as `(own + other).uniq` attached labels to the wrong rows whenever an
# overlapping label was not at the end of the receiver's index, and produced
# an index of the wrong length when labels were repeated.)
#
# @param other_df the frame to append; must respond to #index.
# @return the concatenated frame with its index replaced.
def union(other_df)
  other_labels = other_df.index.to_a
  kept_labels = @index.to_a - other_labels

  df = row[*kept_labels].concat(other_df)
  df.index = DaruLite::Index.new(kept_labels + other_labels)
  df
end
|
28
|
+
|
29
|
+
# Merge vectors from two DataFrames. In case of name collision,
|
30
|
+
# the vectors names are changed to x_1, x_2 ....
|
31
|
+
#
|
32
|
+
# @return {DaruLite::DataFrame}
|
33
|
+
# Places the vectors of +other_df+ next to those of the receiver, row by row.
# Repeated vector names are recoded by ArrayHelper.recode_repeated.
#
# @param other_df a frame with the same number of rows.
# @return the new frame; it keeps @index when both frames have equal indexes.
# @raise [ArgumentError] when the row counts differ.
def merge(other_df)
  unless nrows == other_df.nrows
    raise ArgumentError,
          "Number of rows must be equal in this: #{nrows} and other: #{other_df.nrows}"
  end

  new_fields = ArrayHelper.recode_repeated(@vectors.to_a + other_df.vectors.to_a)

  DataFrame.new({}, order: new_fields).tap do |df_new|
    nrows.times do |position|
      # Rows are paired by position, so the positional accessor is used.
      # Going through the label accessor (`row[position]`) selects the wrong
      # rows when the index holds integer labels other than 0..n-1.
      df_new.add_row row_at(position).to_a + other_df.row_at(position).to_a
    end
    df_new.index = @index if @index == other_df.index
    df_new.update
  end
end
|
49
|
+
|
50
|
+
# Join 2 DataFrames with SQL style joins. Currently supports inner, left
|
51
|
+
# outer, right outer and full outer joins.
|
52
|
+
#
|
53
|
+
# @param [DaruLite::DataFrame] other_df Another DataFrame on which the join is
|
54
|
+
# to be performed.
|
55
|
+
# @param [Hash] opts Options Hash
|
56
|
+
# @option :how [Symbol] Can be one of :inner, :left, :right or :outer.
|
57
|
+
# @option :on [Array] The columns on which the join is to be performed.
|
58
|
+
# Column names specified here must be common to both DataFrames.
|
59
|
+
# @option :indicator [Symbol] The name of a vector to add to the resultant
|
60
|
+
# dataframe that indicates whether the record was in the left (:left_only),
|
61
|
+
# right (:right_only), or both (:both) joining dataframes.
|
62
|
+
# @return [DaruLite::DataFrame]
|
63
|
+
# @example Inner Join
|
64
|
+
# left = DaruLite::DataFrame.new({
|
65
|
+
# :id => [1,2,3,4],
|
66
|
+
# :name => ['Pirate', 'Monkey', 'Ninja', 'Spaghetti']
|
67
|
+
# })
|
68
|
+
# right = DaruLite::DataFrame.new({
|
69
|
+
# :id => [1,2,3,4],
|
70
|
+
# :name => ['Rutabaga', 'Pirate', 'Darth Vader', 'Ninja']
|
71
|
+
# })
|
72
|
+
# left.join(right, how: :inner, on: [:name])
|
73
|
+
# #=>
|
74
|
+
# ##<DaruLite::DataFrame:82416700 @name = 74c0811b-76c6-4c42-ac93-e6458e82afb0 @size = 2>
|
75
|
+
# # id_1 name id_2
|
76
|
+
# # 0 1 Pirate 2
|
77
|
+
# # 1 3 Ninja 4
|
78
|
+
def join(other_df, opts = {})
  # The join itself is implemented by DaruLite::Core::Merge; this method only
  # forwards the receiver, the other frame and the options.
  merger = DaruLite::Core::Merge
  merger.join(self, other_df, opts)
end
|
81
|
+
|
82
|
+
# Creates a new dataset for one to many relations
|
83
|
+
# on a dataset, based on pattern of field names.
|
84
|
+
#
|
85
|
+
# for example, you have a survey for number of children
|
86
|
+
# with this structure:
|
87
|
+
# id, name, child_name_1, child_age_1, child_name_2, child_age_2
|
88
|
+
# with
|
89
|
+
# ds.one_to_many([:id], "child_%v_%n")
|
90
|
+
# the field of first parameters will be copied verbatim
|
91
|
+
# to new dataset, and fields which responds to second
|
92
|
+
# pattern will be added one case for each different %n.
|
93
|
+
#
|
94
|
+
# @example
|
95
|
+
# cases=[
|
96
|
+
# ['1','george','red',10,'blue',20,nil,nil],
|
97
|
+
# ['2','fred','green',15,'orange',30,'white',20],
|
98
|
+
# ['3','alfred',nil,nil,nil,nil,nil,nil]
|
99
|
+
# ]
|
100
|
+
# ds=DaruLite::DataFrame.rows(cases, order:
|
101
|
+
# [:id, :name,
|
102
|
+
# :car_color1, :car_value1,
|
103
|
+
# :car_color2, :car_value2,
|
104
|
+
# :car_color3, :car_value3])
|
105
|
+
# ds.one_to_many([:id],'car_%v%n').to_matrix
|
106
|
+
# #=> Matrix[
|
107
|
+
# # ["red", "1", 10],
|
108
|
+
# # ["blue", "1", 20],
|
109
|
+
# # ["green", "2", 15],
|
110
|
+
# # ["orange", "2", 30],
|
111
|
+
# # ["white", "2", 20]
|
112
|
+
# # ]
|
113
|
+
def one_to_many(parent_fields, pattern)
  vars, numbers = one_to_many_components(pattern)
  result = DataFrame.new([], order: [*parent_fields, '_col_id', *vars])

  each_row do |row|
    # Values copied unchanged into every generated row for this source row.
    verbatim = parent_fields.to_h { |field| [field, row[field]] }

    numbers.each do |n|
      generated = one_to_many_row(row, n, vars, pattern)
      # Skip numbers for which every generated value is nil.
      next if generated.values.all?(&:nil?)

      result.add_row(verbatim.merge(generated).merge('_col_id' => n))
    end
  end

  result.update
  result
end
|
129
|
+
|
130
|
+
private
|
131
|
+
|
132
|
+
# Extracts the variable names and numbers encoded in the vector names that
# match +pattern+, where '%v' stands for the variable part and '%n' for the
# number. The first capture is treated as the variable and the second as the
# number, so '%v' is expected to come before '%n' in the pattern.
#
# @param pattern [String] literal name template such as 'car_%v%n'.
# @return [Array(Array<String>, Array<Integer>)] unique variable names and
#   the sorted unique numbers; two empty Arrays when no vector name matches.
def one_to_many_components(pattern)
  # The pattern is a literal template (#one_to_many_row substitutes into it
  # verbatim), so escape everything except the two placeholders. Block-form
  # gsub keeps the backslash in the replacement untouched.
  template = Regexp.escape(pattern).gsub('%v') { '(.+?)' }.gsub('%n') { '(\d+?)' }
  # Anchor the whole name: unanchored, the lazy number group stops after a
  # single digit, so 'car_color10' was read as number 1.
  re = /\A#{template}\z/

  # to_s lets non-String vector names (e.g. Symbols) be scanned as well.
  matches = @vectors.map { |name| name.to_s.scan(re) }.reject(&:empty?).flatten(1)
  return [[], []] if matches.empty?

  vars, numbers = matches.transpose
  [vars.uniq, numbers.map(&:to_i).sort.uniq]
end
|
142
|
+
|
143
|
+
# Builds a Hash mapping each variable name in +vars+ to the value found in
# +row+ under the name obtained by substituting the variable for '%v' and
# +number+ for '%n' in +pattern+.
def one_to_many_row(row, number, vars, pattern)
  suffix = number.to_s

  vars.to_h do |var|
    column = pattern.sub('%v', var).sub('%n', suffix)
    [var, row[column]]
  end
end
|
150
|
+
end
|
151
|
+
end
|
152
|
+
end
|
@@ -0,0 +1,75 @@
|
|
1
|
+
module DaruLite
|
2
|
+
class DataFrame
|
3
|
+
module Missable
|
4
|
+
extend Gem::Deprecate
|
5
|
+
|
6
|
+
# Rolling fillna
|
7
|
+
# Replace all Float::NAN and nil values with the preceding or following value
|
8
|
+
#
|
9
|
+
# @param direction [Symbol] (:forward, :backward) whether replacement value is preceding or following
|
10
|
+
#
|
11
|
+
# @example
|
12
|
+
# df = DaruLite::DataFrame.new({
|
13
|
+
# a: [1, 2, 3, nil, Float::NAN, nil, 1, 7],
|
14
|
+
# b: [:a, :b, nil, Float::NAN, nil, 3, 5, nil],
|
15
|
+
# c: ['a', Float::NAN, 3, 4, 3, 5, nil, 7]
|
16
|
+
# })
|
17
|
+
#
|
18
|
+
# => #<DaruLite::DataFrame(8x3)>
|
19
|
+
# a b c
|
20
|
+
# 0 1 a a
|
21
|
+
# 1 2 b NaN
|
22
|
+
# 2 3 nil 3
|
23
|
+
# 3 nil NaN 4
|
24
|
+
# 4 NaN nil 3
|
25
|
+
# 5 nil 3 5
|
26
|
+
# 6 1 5 nil
|
27
|
+
# 7 7 nil 7
|
28
|
+
#
|
29
|
+
# 2.3.3 :068 > df.rolling_fillna(:forward)
|
30
|
+
# => #<DaruLite::DataFrame(8x3)>
|
31
|
+
# a b c
|
32
|
+
# 0 1 a a
|
33
|
+
# 1 2 b a
|
34
|
+
# 2 3 b 3
|
35
|
+
# 3 3 b 4
|
36
|
+
# 4 3 b 3
|
37
|
+
# 5 3 3 5
|
38
|
+
# 6 1 5 5
|
39
|
+
# 7 7 5 7
|
40
|
+
#
|
41
|
+
def rolling_fillna!(direction = :forward)
  # Each element of @data fills its own gaps in place; the receiver is
  # returned so calls can be chained.
  @data.each do |vector|
    vector.rolling_fillna!(direction)
  end

  self
end
|
45
|
+
|
46
|
+
# Non-destructive variant: runs #rolling_fillna! on a #dup of the receiver
# and returns what that call returns.
def rolling_fillna(direction = :forward)
  copy = dup
  copy.rolling_fillna!(direction)
end
|
49
|
+
|
50
|
+
# Return a vector with the number of missing values in each row.
|
51
|
+
#
|
52
|
+
# == Arguments
|
53
|
+
#
|
54
|
+
# * +missing_values+ - An Array of the values that should be
|
55
|
+
# treated as 'missing'. The default missing value is *nil*.
|
56
|
+
def missing_values_rows(missing_values = [nil])
  # For every row, count the positions that row#indexes reports for the
  # given values.
  counts = each_row.map { |row| row.indexes(*missing_values).size }

  DaruLite::Vector.new(counts, index: @index, name: "#{@name}_missing_rows")
end
|
63
|
+
|
64
|
+
# TODO: remove next version
|
65
|
+
alias vector_missing_values missing_values_rows
|
66
|
+
|
67
|
+
# True when any element of @data reports, via #include_values?, that it
# contains one of DaruLite::MISSING_VALUES.
def has_missing_data?
  @data.any? do |vector|
    vector.include_values?(*DaruLite::MISSING_VALUES)
  end
end
|
70
|
+
alias flawed? has_missing_data?
|
71
|
+
deprecate :has_missing_data?, :include_values?, 2016, 10
|
72
|
+
deprecate :flawed?, :include_values?, 2016, 10
|
73
|
+
end
|
74
|
+
end
|
75
|
+
end
|
@@ -0,0 +1,108 @@
|
|
1
|
+
module DaruLite
|
2
|
+
class DataFrame
|
3
|
+
module Pivotable
|
4
|
+
# Pivots a data frame on specified vectors and applies an aggregate function
|
5
|
+
# to quickly generate a summary.
|
6
|
+
#
|
7
|
+
# == Options
|
8
|
+
#
|
9
|
+
# +:index+ - Keys to group by on the pivot table row index. Pass vector names
|
10
|
+
# contained in an Array.
|
11
|
+
#
|
12
|
+
# +:vectors+ - Keys to group by on the pivot table column index. Pass vector
|
13
|
+
# names contained in an Array.
|
14
|
+
#
|
15
|
+
# +:agg+ - Function to aggregate the grouped values. Default to *:mean*. Can
|
16
|
+
# use any of the statistics functions applicable on Vectors that can be found in
|
17
|
+
# the DaruLite::Statistics::Vector module.
|
18
|
+
#
|
19
|
+
# +:values+ - Columns to aggregate. Will consider all numeric columns not
|
20
|
+
# specified in *:index* or *:vectors*. Optional.
|
21
|
+
#
|
22
|
+
# == Usage
|
23
|
+
#
|
24
|
+
# df = DaruLite::DataFrame.new({
|
25
|
+
# a: ['foo' , 'foo', 'foo', 'foo', 'foo', 'bar', 'bar', 'bar', 'bar'],
|
26
|
+
# b: ['one' , 'one', 'one', 'two', 'two', 'one', 'one', 'two', 'two'],
|
27
|
+
# c: ['small','large','large','small','small','large','small','large','small'],
|
28
|
+
# d: [1,2,2,3,3,4,5,6,7],
|
29
|
+
# e: [2,4,4,6,6,8,10,12,14]
|
30
|
+
# })
|
31
|
+
# df.pivot_table(index: [:a], vectors: [:b], agg: :sum, values: :e)
|
32
|
+
#
|
33
|
+
# #=>
|
34
|
+
# # #<DaruLite::DataFrame:88342020 @name = 08cdaf4e-b154-4186-9084-e76dd191b2c9 @size = 2>
|
35
|
+
# # [:e, :one] [:e, :two]
|
36
|
+
# # [:bar] 18 26
|
37
|
+
# # [:foo] 10 12
|
38
|
+
def pivot_table(opts = {})
  # Normalise :index and :vectors with Array() so a single vector name works
  # as well as an Array of names. The emptiness guard already accepted a
  # scalar, but the raw value was then passed on and failed later
  # (`index | vectors` in #prepare_pivot_values, `vectors.map` in
  # #make_pivot_hash).
  index = Array(opts[:index])
  raise ArgumentError, 'Specify grouping index' if index.empty?

  vectors = Array(opts[:vectors])
  aggregate_function = opts[:agg] || :mean
  values = prepare_pivot_values index, vectors, opts
  raise IndexError, 'No numeric vectors to aggregate' if values.empty?

  grouped = group_by(index)
  # Without column keys the aggregate of the grouping is the whole answer.
  return grouped.send(aggregate_function) if vectors.empty?

  super_hash = make_pivot_hash grouped, vectors, values, aggregate_function

  pivot_dataframe super_hash
end
|
54
|
+
|
55
|
+
private
|
56
|
+
|
57
|
+
# Resolves the vectors to aggregate from opts[:values]:
# * nil      - the names in @vectors that are neither in +index+ nor in
#              +vectors+ and are also listed by #numeric_vector_names
# * an Array - returned as given
# * other    - wrapped in a one-element Array
def prepare_pivot_values(index, vectors, opts)
  requested = opts[:values]

  if requested.nil?
    (@vectors.to_a - (index | vectors)) & numeric_vector_names
  elsif requested.is_a?(Array)
    requested
  else
    [requested]
  end
end
|
67
|
+
|
68
|
+
# Builds { group_name => { [value, *column keys] => aggregate } }.
# The raw cell values are first collected per key and then reduced in place
# by #setup_pivot_aggregates.
def make_pivot_hash(grouped, vectors, values, aggregate_function)
  groups = grouped.groups
  pivot = groups.transform_values { |_| {} }

  values.each do |value|
    groups.each do |group_name, row_numbers|
      bucket = pivot[group_name]

      row_numbers.each do |num|
        # Key: the aggregated vector's name followed by this row's value in
        # each of the column-key vectors.
        key = [value, *vectors.map { |v| self[v][num] }]
        (bucket[key] ||= []) << self[value][num]
      end
    end
  end

  setup_pivot_aggregates(pivot, aggregate_function)
  pivot
end
|
85
|
+
|
86
|
+
# Replaces, in place, every collected Array of cell values with the result
# of sending +aggregate_function+ to a DaruLite::Vector built from it.
# Returns +super_hash+.
def setup_pivot_aggregates(super_hash, aggregate_function)
  super_hash.each_value do |cells|
    cells.transform_values! do |samples|
      DaruLite::Vector.new(samples).send(aggregate_function)
    end
  end
end
|
93
|
+
|
94
|
+
# Turns { row tuple => { column tuple => value } } into a DataFrame whose
# index and vectors are MultiIndexes built from those tuples.
def pivot_dataframe(super_hash)
  row_tuples = super_hash.keys
  column_tuples = super_hash.values.flat_map(&:keys).uniq

  frame = DaruLite::DataFrame.new(
    {},
    index: DaruLite::MultiIndex.from_tuples(row_tuples),
    order: DaruLite::MultiIndex.from_tuples(column_tuples)
  )

  super_hash.each do |row_index, cells|
    cells.each do |vector_index, val|
      frame[vector_index][row_index] = val
    end
  end

  frame
end
|
106
|
+
end
|
107
|
+
end
|
108
|
+
end
|