daru_lite 0.1 → 0.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (73) hide show
  1. checksums.yaml +4 -4
  2. data/.github/ISSUE_TEMPLATE/bug_report.md +38 -0
  3. data/.github/ISSUE_TEMPLATE/feature_request.md +20 -0
  4. data/.github/workflows/ci.yml +20 -0
  5. data/.rubocop_todo.yml +35 -33
  6. data/README.md +19 -115
  7. data/daru_lite.gemspec +1 -0
  8. data/lib/daru_lite/data_frame/aggregatable.rb +165 -0
  9. data/lib/daru_lite/data_frame/calculatable.rb +140 -0
  10. data/lib/daru_lite/data_frame/convertible.rb +107 -0
  11. data/lib/daru_lite/data_frame/duplicatable.rb +64 -0
  12. data/lib/daru_lite/data_frame/fetchable.rb +301 -0
  13. data/lib/daru_lite/data_frame/filterable.rb +144 -0
  14. data/lib/daru_lite/data_frame/i_o_able.rb +179 -0
  15. data/lib/daru_lite/data_frame/indexable.rb +168 -0
  16. data/lib/daru_lite/data_frame/iterable.rb +339 -0
  17. data/lib/daru_lite/data_frame/joinable.rb +152 -0
  18. data/lib/daru_lite/data_frame/missable.rb +75 -0
  19. data/lib/daru_lite/data_frame/pivotable.rb +108 -0
  20. data/lib/daru_lite/data_frame/queryable.rb +67 -0
  21. data/lib/daru_lite/data_frame/setable.rb +109 -0
  22. data/lib/daru_lite/data_frame/sortable.rb +241 -0
  23. data/lib/daru_lite/dataframe.rb +142 -2355
  24. data/lib/daru_lite/index/index.rb +13 -0
  25. data/lib/daru_lite/maths/statistics/vector.rb +1 -1
  26. data/lib/daru_lite/vector/aggregatable.rb +9 -0
  27. data/lib/daru_lite/vector/calculatable.rb +78 -0
  28. data/lib/daru_lite/vector/convertible.rb +77 -0
  29. data/lib/daru_lite/vector/duplicatable.rb +17 -0
  30. data/lib/daru_lite/vector/fetchable.rb +175 -0
  31. data/lib/daru_lite/vector/filterable.rb +128 -0
  32. data/lib/daru_lite/vector/indexable.rb +77 -0
  33. data/lib/daru_lite/vector/iterable.rb +95 -0
  34. data/lib/daru_lite/vector/joinable.rb +17 -0
  35. data/lib/daru_lite/vector/missable.rb +124 -0
  36. data/lib/daru_lite/vector/queryable.rb +45 -0
  37. data/lib/daru_lite/vector/setable.rb +47 -0
  38. data/lib/daru_lite/vector/sortable.rb +113 -0
  39. data/lib/daru_lite/vector.rb +36 -932
  40. data/lib/daru_lite/version.rb +1 -1
  41. data/spec/data_frame/aggregatable_example.rb +65 -0
  42. data/spec/data_frame/buildable_example.rb +109 -0
  43. data/spec/data_frame/calculatable_example.rb +135 -0
  44. data/spec/data_frame/convertible_example.rb +180 -0
  45. data/spec/data_frame/duplicatable_example.rb +111 -0
  46. data/spec/data_frame/fetchable_example.rb +476 -0
  47. data/spec/data_frame/filterable_example.rb +250 -0
  48. data/spec/data_frame/indexable_example.rb +221 -0
  49. data/spec/data_frame/iterable_example.rb +465 -0
  50. data/spec/data_frame/joinable_example.rb +106 -0
  51. data/spec/data_frame/missable_example.rb +47 -0
  52. data/spec/data_frame/pivotable_example.rb +297 -0
  53. data/spec/data_frame/queryable_example.rb +92 -0
  54. data/spec/data_frame/setable_example.rb +482 -0
  55. data/spec/data_frame/sortable_example.rb +350 -0
  56. data/spec/dataframe_spec.rb +181 -3243
  57. data/spec/index/index_spec.rb +8 -0
  58. data/spec/vector/aggregatable_example.rb +27 -0
  59. data/spec/vector/calculatable_example.rb +82 -0
  60. data/spec/vector/convertible_example.rb +126 -0
  61. data/spec/vector/duplicatable_example.rb +48 -0
  62. data/spec/vector/fetchable_example.rb +463 -0
  63. data/spec/vector/filterable_example.rb +165 -0
  64. data/spec/vector/indexable_example.rb +201 -0
  65. data/spec/vector/iterable_example.rb +111 -0
  66. data/spec/vector/joinable_example.rb +25 -0
  67. data/spec/vector/missable_example.rb +88 -0
  68. data/spec/vector/queryable_example.rb +91 -0
  69. data/spec/vector/setable_example.rb +300 -0
  70. data/spec/vector/sortable_example.rb +242 -0
  71. data/spec/vector_spec.rb +111 -1805
  72. metadata +102 -3
  73. data/.github/ISSUE_TEMPLATE.md +0 -18
@@ -0,0 +1,339 @@
1
+ module DaruLite
2
+ class DataFrame
3
+ module Iterable
4
# Walks the DataFrame's index, handing every entry to the block.
#
# @yield each entry produced by the index's own #each
# @return [self] when a block is supplied
# @return [Enumerator] when called without a block
def each_index(&block)
  if block
    @index.each(&block)
    self
  else
    to_enum(:each_index)
  end
end
12
+
13
# Walks the stored vectors (columns), handing each one to the block.
#
# @yield each element of the DataFrame's internal vector list
# @return [self] when a block is supplied
# @return [Enumerator] when called without a block
def each_vector(&block)
  if block
    @data.each(&block)
    self
  else
    to_enum(:each_vector)
  end
end

alias each_column each_vector
23
+
24
# Walks the vectors together with their names.
#
# @yield [vector, name] the stored vector followed by its name
# @return [self] when a block is supplied
# @return [Enumerator] when called without a block
def each_vector_with_index
  return to_enum(:each_vector_with_index) unless block_given?

  @vectors.each do |name|
    # The vector index maps a name to its slot in @data.
    slot = @vectors[name]
    yield(@data[slot], name)
  end

  self
end

alias each_column_with_index each_vector_with_index
36
+
37
# Walks the rows by position, from the first to the last.
#
# @yield the result of #row_at for every position in the index
# @return [self] when a block is supplied
# @return [Enumerator] when called without a block
def each_row
  return to_enum(:each_row) unless block_given?

  (0...@index.size).each { |position| yield row_at(position) }

  self
end
47
+
48
# Walks the rows together with their index labels.
#
# @yield [row, label] the result of #access_row for the label, then the label
# @return [self] when a block is supplied
# @return [Enumerator] when called without a block
def each_row_with_index
  return to_enum(:each_row_with_index) unless block_given?

  @index.each do |label|
    yield(access_row(label), label)
  end

  self
end
57
+
58
# Iterate over the vectors or the rows of the DataFrame.
#
# == Description
#
# `#each` behaves like Array#each. By default it walks the columns
# (vectors); pass `:row` to walk the rows instead. The work is handed
# to #dispatch_to_axis with the `:each` operation.
#
# == Arguments
#
# * +axis+ - The axis to iterate over. Can be :vector (or :column)
#   or :row. Default to :vector.
def each(axis = :vector, &block)
  dispatch_to_axis(axis, :each, &block)
end
74
+
75
# Run the block over every vector or row and gather the results in a
# DaruLite::Vector.
#
# == Description
#
# #collect is the Vector-returning sibling of #map: the results of the
# block runs are collected in a DaruLite::Vector whose index is that of
# the axis that was iterated. The work is handed to
# #dispatch_to_axis_pl with the `:collect` operation.
#
# == Arguments
#
# * +axis+ - The axis to iterate over. Can be :vector (or :column)
#   or :row. Default to :vector.
def collect(axis = :vector, &block)
  dispatch_to_axis_pl(axis, :collect, &block)
end
93
+
94
# Run the block over every vector or row and return the results as an
# Array. To get a DataFrame back instead, see #recode.
#
# == Description
#
# #map behaves like Array#map: each block result is appended to an
# Array, which is returned. Like #each it accepts an axis argument.
# The work is handed to #dispatch_to_axis_pl with the `:map` operation.
#
# == Arguments
#
# * +axis+ - The axis to map over. Can be :vector (or :column) or :row.
#   Default to :vector.
def map(axis = :vector, &block)
  dispatch_to_axis_pl(axis, :map, &block)
end
113
+
114
# Destructive map. Modifies the DataFrame in place. Each run of the
# block must return a DaruLite::Vector.
#
# == Arguments
#
# * +axis+ - The axis to map over. Can be :vector (or :column) or :row.
#   Default to :vector.
#
# @return whatever #map_vectors! or #map_rows! returns for the chosen axis
# @raise [ArgumentError] if +axis+ is not :vector, :column or :row
#   (previously an unknown axis was ignored and nil was returned, which
#   hid typos such as passing 'row' instead of :row)
def map!(axis = :vector, &block)
  case axis
  when :vector, :column then map_vectors!(&block)
  when :row then map_rows!(&block)
  else
    raise ArgumentError, "Unknown axis #{axis.inspect}; expected :vector, :column or :row"
  end
end
129
+
130
# Map over the DataFrame and get a DataFrame back. Every run of the
# block must return a DaruLite::Vector.
#
# == Description
#
# #recode is like #map, except that the result is a modified
# DaruLite::DataFrame rather than an Array, which is why each block run
# is expected to produce a DaruLite::Vector. As with #map and #each, an
# optional _axis_ may be given. The work is handed to
# #dispatch_to_axis_pl with the `:recode` operation.
#
# == Arguments
#
# * +axis+ - The axis to map over. Can be :vector (or :column) or :row.
#   Default to :vector.
def recode(axis = :vector, &block)
  dispatch_to_axis_pl(axis, :recode, &block)
end
150
+
151
# Replace the given values with a new value in every vector, in place.
#
# @param [Array] old_values values to be replaced
# @param [object] new_value the value to put in their place
# @return [DaruLite::DataFrame] the DataFrame itself, after each vector
#   has been asked to replace the old values
# @example
#   df = DaruLite::DataFrame.new({
#     a: [1,    2,          3,   nil,        Float::NAN, nil, 1,   7],
#     b: [:a,  :b,          nil, Float::NAN, nil,        3,   5,   8],
#     c: ['a',  Float::NAN, 3,   4,          3,          5,   nil, 7]
#   }, index: 11..18)
#   df.replace_values nil, Float::NAN
#   # => #<DaruLite::DataFrame(8x3)>
#   #        a   b   c
#   #   11   1   a   a
#   #   12   2   b NaN
#   #   13   3 NaN   3
#   #   14 NaN NaN   4
#   #   15 NaN NaN   3
#   #   16 NaN   3   5
#   #   17   1   5 NaN
#   #   18   7   8   7
def replace_values(old_values, new_value)
  @data.each do |column|
    column.replace_values(old_values, new_value)
  end

  self
end
177
+
178
# Run one or more tests against every row and collect the failures.
#
# @param tests [Array] each test is an Array whose last element is a
#   callable of the form *Proc.new {|row| row[:age] > 0}*; a test
#   passes for a row when the callable returns a truthy value. If the
#   first argument is a Symbol it is taken as the name of the vector
#   used to identify rows in the messages; otherwise the first vector
#   name is used.
# @return [Array] one message (built by #verify_error_message) per
#   failing row/test combination
#
# FIXME: description here is too sparse. As far as I can get,
# it should tell something about that each test is [descr, fields, block],
# and that first value may be column name to output. - zverok, 2016-05-18
def verify(*tests)
  id =
    if tests.first.is_a?(Symbol)
      tests.shift
    else
      @vectors.first
    end

  each_row_with_index.flat_map do |row, i|
    failing = tests.reject { |*_, block| block.call(row) }
    failing.map { |test| verify_error_message(row, test, id, i) }
  end
end
194
+
195
# Build a copy of the DataFrame in which every vector is replaced by
# the block's result for that vector.
#
# @yield [vector] each vector of the copy
# @return a duplicate of the receiver with the vectors reassigned
# @return [Enumerator] when called without a block
# @raise [TypeError] (via #should_be_vector!) if a block run does not
#   return a DaruLite::Vector
def recode_vectors
  return to_enum(:recode_vectors) unless block_given?

  copy = dup
  copy.each_vector_with_index do |vector, name|
    # The name is splatted on assignment, so an Array name is spread
    # into several arguments.
    copy[*name] = should_be_vector!(yield(vector))
  end
  copy
end
204
+
205
# Build a copy of the DataFrame in which every row is replaced by the
# block's result for that row.
#
# @yield [row] each row of the copy
# @return a duplicate of the receiver with the rows reassigned
# @return [Enumerator] when called without a block
# @raise [TypeError] (via #should_be_vector!) if a block run does not
#   return a DaruLite::Vector
def recode_rows
  return to_enum(:recode_rows) unless block_given?

  copy = dup
  copy.each_row_with_index do |row_vector, label|
    copy.row[label] = should_be_vector!(yield(row_vector))
  end
  copy
end
214
+
215
# Run the block over every stored vector and return the results as an
# Array.
#
# @return [Array] the block results, one per vector
# @return [Enumerator] when called without a block
def map_vectors(&block)
  block ? @data.map(&block) : to_enum(:map_vectors)
end
221
+
222
# Destructive form of #map_vectors: each vector is replaced, in place,
# by the block's result for it.
#
# @yield [vector] the current vector stored under each name
# @return [self] when a block is supplied
# @return [Enumerator] when called without a block
# @raise [TypeError] (via #should_be_vector!) if a block run does not
#   return a DaruLite::Vector
def map_vectors!
  return to_enum(:map_vectors!) unless block_given?

  # Iterate over a copy of the names because the frame is reassigned
  # while looping.
  vectors.dup.each do |name|
    replacement = yield(self[name])
    self[name] = should_be_vector!(replacement)
  end

  self
end
232
+
233
# Map over the vectors along with their names and return the results
# as an Array.
#
# @yield [vector, name] as produced by #each_vector_with_index
# @return [Array] the block results
# @return [Enumerator] when called without a block
def map_vectors_with_index(&block)
  block ? each_vector_with_index.map(&block) : to_enum(:map_vectors_with_index)
end
239
+
240
# Run the block over every row and return the results as an Array.
#
# @yield [row] as produced by #each_row
# @return [Array] the block results, one per row
# @return [Enumerator] when called without a block
def map_rows(&block)
  block ? each_row.map(&block) : to_enum(:map_rows)
end
246
+
247
# Map over the rows along with their index labels and return the
# results as an Array.
#
# @yield [row, label] as produced by #each_row_with_index
# @return [Array] the block results
# @return [Enumerator] when called without a block
def map_rows_with_index(&block)
  block ? each_row_with_index.map(&block) : to_enum(:map_rows_with_index)
end
252
+
253
# Destructive form of #map_rows: each row is replaced, in place, by the
# block's result for it.
#
# @yield [row] the current row stored under each index label
# @return [self] when a block is supplied
# @return [Enumerator] when called without a block
# @raise [TypeError] (via #should_be_vector!) if a block run does not
#   return a DaruLite::Vector
def map_rows!
  return to_enum(:map_rows!) unless block_given?

  # Iterate over a copy of the index because rows are reassigned while
  # looping.
  index.dup.each do |label|
    replacement = yield(row[label])
    row[label] = should_be_vector!(replacement)
  end

  self
end
262
+
263
# Apply a method, a callable, or a list of them to the DataFrame or to
# a sub-DataFrame of it.
#
# @param method [Symbol, Proc, Array] a Symbol is sent to the frame, a
#   Proc is called with the frame, and an Array (of Symbols and/or
#   Procs) is applied element by element
# @param keys when given, the operation runs on the result of
#   #get_sub_dataframe(keys, by_position: by_position) instead of self
# @param by_position [Boolean] forwarded to #get_sub_dataframe
# @return the result of the call, or an Array of results for an Array
# @raise [RuntimeError] if +method+ is none of the supported kinds
def apply_method(method, keys: nil, by_position: true)
  df = keys ? get_sub_dataframe(keys, by_position: by_position) : self

  case method
  when Symbol then df.send(method)
  when Proc then method.call(df)
  when Array
    method.map(&:to_proc).map { |proc| proc.call(df) } # works with Array of both Symbol and/or Proc
  else
    # Still a RuntimeError, as the bare `raise` was, but with a message
    # that says what went wrong.
    raise "Unsupported method specification #{method.inspect} (#{method.class}); " \
          'expected a Symbol, a Proc or an Array of them'
  end
end
alias apply_method_on_sub_df apply_method
275
+
276
# Build a DaruLite::Vector from the result of the block for every row.
# The vector is indexed by the DataFrame's index.
#
# @yield [row] as produced by #each_row
# @return [DaruLite::Vector]
# @return [Enumerator] when called without a block
def collect_rows(&block)
  if block
    results = each_row.map(&block)
    DaruLite::Vector.new(results, index: @index)
  else
    to_enum(:collect_rows)
  end
end
283
+
284
# Build a DaruLite::Vector from the result of the block for every
# row/label pair. The vector is indexed by the DataFrame's index.
#
# @yield [row, label] as produced by #each_row_with_index
# @return [DaruLite::Vector]
# @return [Enumerator] when called without a block
def collect_row_with_index(&block)
  if block
    results = each_row_with_index.map(&block)
    DaruLite::Vector.new(results, index: @index)
  else
    to_enum(:collect_row_with_index)
  end
end
289
+
290
# Build a DaruLite::Vector from the result of the block for every
# vector. The result is indexed by the DataFrame's vector names.
#
# @yield [vector] as produced by #each_vector
# @return [DaruLite::Vector]
# @return [Enumerator] when called without a block
def collect_vectors(&block)
  if block
    results = each_vector.map(&block)
    DaruLite::Vector.new(results, index: @vectors)
  else
    to_enum(:collect_vectors)
  end
end
297
+
298
# Build a DaruLite::Vector from the result of the block for every
# vector/name pair. The result is indexed by the DataFrame's vector
# names.
#
# @yield [vector, name] as produced by #each_vector_with_index
# @return [DaruLite::Vector]
# @return [Enumerator] when called without a block
def collect_vector_with_index(&block)
  if block
    results = each_vector_with_index.map(&block)
    DaruLite::Vector.new(results, index: @vectors)
  else
    to_enum(:collect_vector_with_index)
  end
end
303
+
304
# Generate a square matrix from the vector names of the DataFrame: the
# block is called with every (row name, column name) pair of vector
# names and its results become the matrix cells.
#
# @yield [row_name, col_name] a pair of vector names
# @return {::Matrix}
# @return [Enumerator] when called without a block
# :nocov:
# FIXME: Even not trying to cover this: I can't get, how it is expected
# to work.... -- zverok
def collect_matrix
  return to_enum(:collect_matrix) unless block_given?

  names = vectors.to_a
  cells = names.map do |row_name|
    names.map { |col_name| yield(row_name, col_name) }
  end

  Matrix.rows(cells)
end
# :nocov:
323
+
324
+ private
325
+
326
# Guard used by the recode/map! helpers: passes a DaruLite::Vector
# through untouched and rejects everything else.
#
# @param val the value returned by a user block
# @return [DaruLite::Vector] +val+ itself
# @raise [TypeError] if +val+ is not a DaruLite::Vector
def should_be_vector!(val)
  unless val.is_a?(DaruLite::Vector)
    raise TypeError, "Every iteration must return DaruLite::Vector not #{val.class}"
  end

  val
end
331
+
332
# Format the message reported by #verify for one failed test on one row.
#
# @param row the row that failed; read with +row[name]+
# @param test [Array] the test definition; its first two elements are
#   the description and the list of field names to print
# @param id the name of the vector whose value identifies the row
# @param i the row's index value; +i + 1+ is printed, so it must
#   support +
# @return [String] "<i + 1> [<row[id]>]: <description>" followed by
#   " (field=value, ...)" when fields are listed
def verify_error_message(row, test, id, i)
  description, fields = test
  suffix =
    if fields.empty?
      ''
    else
      pairs = fields.map { |field| "#{field}=#{row[field]}" }
      " (#{pairs.join(', ')})"
    end

  "#{i + 1} [#{row[id]}]: #{description}#{suffix}"
end
337
+ end
338
+ end
339
+ end
@@ -0,0 +1,152 @@
1
+ module DaruLite
2
+ class DataFrame
3
+ module Joinable
4
# Concatenate another DataFrame along corresponding columns.
# Columns that do not exist in both dataframes are filled with nils
# (each side's column is obtained through #get_vector_anyways).
#
# @param other_df the DataFrame whose rows are appended
# @return [DaruLite::DataFrame] a new frame whose vectors are this
#   frame's names followed by any names only +other_df+ has
def concat(other_df)
  names = @vectors.to_a | other_df.vectors.to_a

  columns = names.map do |name|
    # Work on a copy so the receiver's own vector is not appended to.
    own_copy = get_vector_anyways(name).dup
    own_copy.concat(other_df.get_vector_anyways(name))
  end

  DaruLite::DataFrame.new(columns, order: names)
end
15
+
16
# Concatenates another DataFrame as #concat does, and additionally
# tries to preserve the index. If the indices contain common elements,
# the rows of +other_df+ win: the receiver's rows with those labels
# are dropped before concatenating.
#
# The result holds the receiver-only rows first, followed by all rows
# of +other_df+, and its index lists the labels in that same order.
# Earlier the labels were taken from
# `(self.index + other.index).uniq`, which attached labels to the
# wrong rows whenever a shared label was not at the end of the
# receiver's index.
#
# NOTE(review): `row[*labels]` is assumed to return a DataFrame; with
# zero or one receiver-only label it may not — confirm against the row
# accessor.
#
# @param other_df the DataFrame to merge in
# @return the concatenated frame with a rebuilt DaruLite::Index
def union(other_df)
  other_labels = other_df.index.to_a
  own_labels = @index.to_a - other_labels

  df = row[*own_labels].concat(other_df)
  # Labels must follow the physical row order produced above.
  df.index = DaruLite::Index.new(own_labels + other_labels)
  df
end
28
+
29
# Merge vectors from two DataFrames side by side. In case of name
# collision, the vector names are changed to x_1, x_2 ....
#
# Rows are paired strictly by position (first with first, and so on)
# using #row_at, the same positional access #each_row uses. The
# previous `row[i]` went through the label-based row accessor, which
# pairs the wrong rows when the index holds integers other than 0...n.
#
# The result keeps this frame's index only when both frames have equal
# indices.
#
# @param other_df the DataFrame whose vectors are appended
# @return {DaruLite::DataFrame}
# @raise [ArgumentError] if the two frames differ in number of rows
def merge(other_df)
  unless nrows == other_df.nrows
    raise ArgumentError,
          "Number of rows must be equal in this: #{nrows} and other: #{other_df.nrows}"
  end

  new_fields = ArrayHelper.recode_repeated(@vectors.to_a + other_df.vectors.to_a)
  DataFrame.new({}, order: new_fields).tap do |df_new|
    nrows.times do |pos|
      df_new.add_row(row_at(pos).to_a + other_df.row_at(pos).to_a)
    end
    df_new.index = @index if @index == other_df.index
    df_new.update
  end
end
49
+
50
# Join 2 DataFrames with SQL style joins. Currently supports inner, left
# outer, right outer and full outer joins. The work is delegated to
# DaruLite::Core::Merge.join.
#
# @param [DaruLite::DataFrame] other_df Another DataFrame on which the join is
#   to be performed.
# @param [Hash] opts Options Hash
# @option :how [Symbol] Can be one of :inner, :left, :right or :outer.
# @option :on [Array] The columns on which the join is to be performed.
#   Column names specified here must be common to both DataFrames.
# @option :indicator [Symbol] The name of a vector to add to the resultant
#   dataframe that indicates whether the record was in the left (:left_only),
#   right (:right_only), or both (:both) joining dataframes.
# @return [DaruLite::DataFrame]
# @example Inner Join
#   left = DaruLite::DataFrame.new({
#     :id   => [1,2,3,4],
#     :name => ['Pirate', 'Monkey', 'Ninja', 'Spaghetti']
#   })
#   right = DaruLite::DataFrame.new({
#     :id   => [1,2,3,4],
#     :name => ['Rutabaga', 'Pirate', 'Darth Vader', 'Ninja']
#   })
#   left.join(right, how: :inner, on: [:name])
#   #=>
#   ##<DaruLite::DataFrame:82416700 @name = 74c0811b-76c6-4c42-ac93-e6458e82afb0 @size = 2>
#   #                 id_1       name       id_2
#   #         0          1     Pirate          2
#   #         1          3      Ninja          4
def join(other_df, opts = {})
  merger = DaruLite::Core::Merge
  merger.join(self, other_df, opts)
end
81
+
82
# Creates a new dataset for one to many relations
# on a dataset, based on pattern of field names.
#
# For example, you have a survey for number of children
# with this structure:
#   id, name, child_name_1, child_age_1, child_name_2, child_age_2
# With
#   ds.one_to_many([:id], "child_%v_%n")
# the fields named in the first parameter are copied verbatim
# to the new dataset, and the fields matching the second
# pattern contribute one case for each different %n. A case whose
# generated values are all nil is skipped. The number is stored
# in the '_col_id' vector.
#
# @param parent_fields [Array] names of the vectors copied as-is
# @param pattern [String] field-name pattern with %v (variable) and
#   %n (number) placeholders
# @return the new dataset
#
# @example
#   cases=[
#     ['1','george','red',10,'blue',20,nil,nil],
#     ['2','fred','green',15,'orange',30,'white',20],
#     ['3','alfred',nil,nil,nil,nil,nil,nil]
#   ]
#   ds=DaruLite::DataFrame.rows(cases, order:
#     [:id, :name,
#      :car_color1, :car_value1,
#      :car_color2, :car_value2,
#      :car_color3, :car_value3])
#   ds.one_to_many([:id],'car_%v%n').to_matrix
#   #=> Matrix[
#   #   ["red", "1", 10],
#   #   ["blue", "1", 20],
#   #   ["green", "2", 15],
#   #   ["orange", "2", 30],
#   #   ["white", "2", 20]
#   #   ]
def one_to_many(parent_fields, pattern)
  vars, numbers = one_to_many_components(pattern)
  dataset = DataFrame.new([], order: [*parent_fields, '_col_id', *vars])

  each_row do |row|
    verbatim = parent_fields.to_h { |field| [field, row[field]] }

    numbers.each do |n|
      generated = one_to_many_row(row, n, vars, pattern)
      next if generated.values.all?(&:nil?)

      dataset.add_row(verbatim.merge(generated).merge('_col_id' => n))
    end
  end

  dataset.update
  dataset
end
129
+
130
+ private
131
+
132
# Work out which variable names and which numbers occur among the
# vector names for a #one_to_many pattern.
#
# The pattern is turned into a regular expression: literal text is
# escaped, %v becomes a capture for the variable name and %n a capture
# for the number. The expression is anchored to the whole vector name.
# Without the anchors the lazy digit capture stopped after one digit,
# so "car_color10" was read as number 1.
#
# The first capture is taken as the variable and the second as the
# number, so %v is expected to appear before %n in the pattern.
#
# @param pattern [String] field-name pattern with %v and %n
# @return [Array(Array, Array<Integer>)] unique variable names (in
#   order of appearance) and sorted unique numbers; both empty when no
#   vector name matches
def one_to_many_components(pattern)
  # Block form of gsub inserts the replacement text verbatim.
  template = Regexp.escape(pattern)
                   .gsub('%v') { '(.+?)' }
                   .gsub('%n') { '(\d+)' }
  re = /\A#{template}\z/

  matches = @vectors.flat_map { |v| v.to_s.scan(re) }
  # Transposing an empty list would leave both components nil.
  return [[], []] if matches.empty?

  vars, numbers = matches.transpose

  [vars.uniq, numbers.map(&:to_i).sort.uniq]
end
142
+
143
# For one source row and one number, look up the value of every
# variable by substituting %v and %n in the pattern.
#
# @param row the source row; read with +row[column_name]+
# @param number the value substituted for %n (via #to_s)
# @param vars [Array<String>] variable names substituted for %v
# @param pattern [String] field-name pattern with %v and %n
# @return [Hash] variable name => value found in +row+
def one_to_many_row(row, number, vars, pattern)
  vars.each_with_object({}) do |var, generated|
    column = pattern.sub('%v', var).sub('%n', number.to_s)
    generated[var] = row[column]
  end
end
150
+ end
151
+ end
152
+ end
@@ -0,0 +1,75 @@
1
+ module DaruLite
2
+ class DataFrame
3
+ module Missable
4
+ extend Gem::Deprecate
5
+
6
# Rolling fillna, in place.
# Replaces all Float::NAN and nil values with the preceding or
# following value, by asking every vector to do so.
#
# @param direction [Symbol] (:forward, :backward) whether the
#   replacement value is the preceding or the following one
# @return [self]
#
# @example
#   df = DaruLite::DataFrame.new({
#     a: [1,    2,          3,   nil,        Float::NAN, nil, 1,   7],
#     b: [:a,  :b,          nil, Float::NAN, nil,        3,   5,   nil],
#     c: ['a',  Float::NAN, 3,   4,          3,          5,   nil, 7]
#   })
#
#   => #<DaruLite::DataFrame(8x3)>
#          a   b   c
#      0   1   a   a
#      1   2   b NaN
#      2   3 nil   3
#      3 nil NaN   4
#      4 NaN nil   3
#      5 nil   3   5
#      6   1   5 nil
#      7   7 nil   7
#
#   2.3.3 :068 > df.rolling_fillna(:forward)
#   => #<DaruLite::DataFrame(8x3)>
#          a   b   c
#      0   1   a   a
#      1   2   b   a
#      2   3   b   3
#      3   3   b   4
#      4   3   b   3
#      5   3   3   5
#      6   1   5   5
#      7   7   5   7
#
def rolling_fillna!(direction = :forward)
  @data.each do |column|
    column.rolling_fillna!(direction)
  end

  self
end
45
+
46
# Non-destructive rolling fillna: duplicates the DataFrame and runs
# #rolling_fillna! on the duplicate.
#
# @param direction [Symbol] (:forward, :backward) forwarded to
#   #rolling_fillna!
# @return whatever #rolling_fillna! returns for the duplicate
def rolling_fillna(direction = :forward)
  copy = dup
  copy.rolling_fillna!(direction)
end
49
+
50
# Return a vector with the number of missing values in each row.
# The vector shares the DataFrame's index and is named
# "<frame name>_missing_rows".
#
# == Arguments
#
# * +missing_values+ - An Array of the values that should be
#   treated as 'missing'. The default missing value is *nil*.
def missing_values_rows(missing_values = [nil])
  counts = each_row.map do |row|
    hits = row.indexes(*missing_values)
    hits.size
  end

  DaruLite::Vector.new(counts, index: @index, name: "#{@name}_missing_rows")
end

# TODO: remove next version
alias vector_missing_values missing_values_rows
66
+
67
# Tell whether any vector contains one of DaruLite::MISSING_VALUES.
#
# @return [Boolean] true as soon as one vector reports, through
#   #include_values?, that it holds a missing value
def has_missing_data?
  @data.any? do |column|
    column.include_values?(*DaruLite::MISSING_VALUES)
  end
end
alias flawed? has_missing_data?
71
+ deprecate :has_missing_data?, :include_values?, 2016, 10
72
+ deprecate :flawed?, :include_values?, 2016, 10
73
+ end
74
+ end
75
+ end
@@ -0,0 +1,108 @@
1
+ module DaruLite
2
+ class DataFrame
3
+ module Pivotable
4
# Pivots a data frame on specified vectors and applies an aggregate function
# to quickly generate a summary.
#
# == Options
#
# +:index+ - Keys to group by on the pivot table row index. Pass vector names
# contained in an Array (a single name is wrapped in an Array for you).
#
# +:vectors+ - Keys to group by on the pivot table column index. Pass vector
# names contained in an Array (a single name is wrapped in an Array for you).
#
# +:agg+ - Function to aggregate the grouped values. Default to *:mean*. Can
# use any of the statistics functions applicable on Vectors that can be found in
# the DaruLite::Statistics::Vector module.
#
# +:values+ - Columns to aggregate. Will consider all numeric columns not
# specified in *:index* or *:vectors*. Optional.
#
# == Usage
#
#   df = DaruLite::DataFrame.new({
#     a: ['foo'  ,  'foo',  'foo',  'foo',  'foo',  'bar',  'bar',  'bar',  'bar'],
#     b: ['one'  ,  'one',  'one',  'two',  'two',  'one',  'one',  'two',  'two'],
#     c: ['small','large','large','small','small','large','small','large','small'],
#     d: [1,2,2,3,3,4,5,6,7],
#     e: [2,4,4,6,6,8,10,12,14]
#   })
#   df.pivot_table(index: [:a], vectors: [:b], agg: :sum, values: :e)
#
#   #=>
#   # #<DaruLite::DataFrame:88342020 @name = 08cdaf4e-b154-4186-9084-e76dd191b2c9 @size = 2>
#   #            [:e, :one] [:e, :two]
#   #     [:bar]         18         26
#   #     [:foo]         10         12
#
# @raise [ArgumentError] if no grouping index is given
# @raise [IndexError] if there is nothing to aggregate
def pivot_table(opts = {})
  # The emptiness guard already tolerated a scalar :index; normalise
  # both options so the scalar form also works further down, where the
  # helpers combine them with Array operators.
  index = Array(opts[:index])
  raise ArgumentError, 'Specify grouping index' if index.empty?

  vectors = Array(opts[:vectors])
  aggregate_function = opts[:agg] || :mean
  values = prepare_pivot_values index, vectors, opts
  raise IndexError, 'No numeric vectors to aggregate' if values.empty?

  grouped = group_by(index)
  # Without column keys the grouped aggregate is already the answer.
  return grouped.send(aggregate_function) if vectors.empty?

  super_hash = make_pivot_hash grouped, vectors, values, aggregate_function

  pivot_dataframe super_hash
end
54
+
55
+ private
56
+
57
# Decide which vectors a pivot table aggregates.
#
# @param index [Array] names used for the pivot rows
# @param vectors [Array] names used for the pivot columns
# @param opts [Hash] the pivot options; only +:values+ is read
# @return [Array] +opts[:values]+ as given when it is an Array, wrapped
#   in an Array when it is a single value, and otherwise every numeric
#   vector name not already used by +index+ or +vectors+
def prepare_pivot_values(index, vectors, opts)
  requested = opts[:values]

  # multiple values specified.
  return requested if requested.is_a?(Array)
  # single value specified.
  return [requested] unless requested.nil?

  # values not specified at all.
  (@vectors.to_a - (index | vectors)) & numeric_vector_names
end
67
+
68
# Build the nested hash behind a pivot table:
#   { group name => { [value name, *column-key values] => aggregate } }
#
# For every value vector and every group, the group's rows are
# bucketed by the values found in the +vectors+ columns; the buckets
# are then reduced by #setup_pivot_aggregates.
#
# @param grouped an object whose #groups maps a group name to the row
#   numbers belonging to it
# @param vectors [Array] names of the column-key vectors
# @param values [Array] names of the vectors to aggregate
# @param aggregate_function forwarded to #setup_pivot_aggregates
# @return [Hash] the nested hash described above
def make_pivot_hash(grouped, vectors, values, aggregate_function)
  super_hash = grouped.groups.transform_values { {} }

  values.each do |value|
    grouped.groups.each do |group_name, row_numbers|
      bucket_for_group = super_hash[group_name]

      row_numbers.each do |num|
        column_key = [value, *vectors.map { |v| self[v][num] }]
        (bucket_for_group[column_key] ||= []) << self[value][num]
      end
    end
  end

  setup_pivot_aggregates(super_hash, aggregate_function)
  super_hash
end
85
+
86
# Reduce every bucket of the pivot hash, in place, to a single value:
# the bucket's values are wrapped in a DaruLite::Vector and the
# aggregate function is sent to it.
#
# @param super_hash [Hash] group name => { column key => Array of values }
# @param aggregate_function the method name sent to each vector
# @return [Hash] +super_hash+ itself
def setup_pivot_aggregates(super_hash, aggregate_function)
  super_hash.each_value do |sub_hash|
    sub_hash.transform_values! do |aggregates|
      DaruLite::Vector.new(aggregates).send(aggregate_function)
    end
  end
end
93
+
94
# Turn the aggregated pivot hash into a DataFrame.
#
# The outer keys become the row MultiIndex, the unique inner keys
# become the vector MultiIndex, and each aggregate is stored in the
# cell addressed by its (vector key, row key).
#
# @param super_hash [Hash] row key => { vector key => aggregate }
# @return [DaruLite::DataFrame]
def pivot_dataframe(super_hash)
  row_tuples = super_hash.keys
  column_tuples = super_hash.values.flat_map(&:keys).uniq

  df_index = DaruLite::MultiIndex.from_tuples(row_tuples)
  df_vectors = DaruLite::MultiIndex.from_tuples(column_tuples)
  pivoted_dataframe = DaruLite::DataFrame.new({}, index: df_index, order: df_vectors)

  super_hash.each do |row_index, sub_h|
    sub_h.each do |vector_index, val|
      pivoted_dataframe[vector_index][row_index] = val
    end
  end

  pivoted_dataframe
end
106
+ end
107
+ end
108
+ end