daru_lite 0.1.1 → 0.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (67) hide show
  1. checksums.yaml +4 -4
  2. data/.rubocop_todo.yml +35 -33
  3. data/lib/daru_lite/data_frame/aggregatable.rb +165 -0
  4. data/lib/daru_lite/data_frame/calculatable.rb +140 -0
  5. data/lib/daru_lite/data_frame/convertible.rb +107 -0
  6. data/lib/daru_lite/data_frame/duplicatable.rb +64 -0
  7. data/lib/daru_lite/data_frame/fetchable.rb +301 -0
  8. data/lib/daru_lite/data_frame/filterable.rb +144 -0
  9. data/lib/daru_lite/data_frame/i_o_able.rb +179 -0
  10. data/lib/daru_lite/data_frame/indexable.rb +168 -0
  11. data/lib/daru_lite/data_frame/iterable.rb +339 -0
  12. data/lib/daru_lite/data_frame/joinable.rb +152 -0
  13. data/lib/daru_lite/data_frame/missable.rb +75 -0
  14. data/lib/daru_lite/data_frame/pivotable.rb +108 -0
  15. data/lib/daru_lite/data_frame/queryable.rb +67 -0
  16. data/lib/daru_lite/data_frame/setable.rb +109 -0
  17. data/lib/daru_lite/data_frame/sortable.rb +241 -0
  18. data/lib/daru_lite/dataframe.rb +138 -2353
  19. data/lib/daru_lite/index/index.rb +13 -0
  20. data/lib/daru_lite/maths/statistics/vector.rb +1 -1
  21. data/lib/daru_lite/vector/aggregatable.rb +9 -0
  22. data/lib/daru_lite/vector/calculatable.rb +78 -0
  23. data/lib/daru_lite/vector/convertible.rb +77 -0
  24. data/lib/daru_lite/vector/duplicatable.rb +17 -0
  25. data/lib/daru_lite/vector/fetchable.rb +175 -0
  26. data/lib/daru_lite/vector/filterable.rb +128 -0
  27. data/lib/daru_lite/vector/indexable.rb +77 -0
  28. data/lib/daru_lite/vector/iterable.rb +95 -0
  29. data/lib/daru_lite/vector/joinable.rb +17 -0
  30. data/lib/daru_lite/vector/missable.rb +124 -0
  31. data/lib/daru_lite/vector/queryable.rb +45 -0
  32. data/lib/daru_lite/vector/setable.rb +47 -0
  33. data/lib/daru_lite/vector/sortable.rb +113 -0
  34. data/lib/daru_lite/vector.rb +36 -932
  35. data/lib/daru_lite/version.rb +1 -1
  36. data/spec/data_frame/aggregatable_example.rb +65 -0
  37. data/spec/data_frame/buildable_example.rb +109 -0
  38. data/spec/data_frame/calculatable_example.rb +135 -0
  39. data/spec/data_frame/convertible_example.rb +180 -0
  40. data/spec/data_frame/duplicatable_example.rb +111 -0
  41. data/spec/data_frame/fetchable_example.rb +476 -0
  42. data/spec/data_frame/filterable_example.rb +250 -0
  43. data/spec/data_frame/indexable_example.rb +221 -0
  44. data/spec/data_frame/iterable_example.rb +465 -0
  45. data/spec/data_frame/joinable_example.rb +106 -0
  46. data/spec/data_frame/missable_example.rb +47 -0
  47. data/spec/data_frame/pivotable_example.rb +297 -0
  48. data/spec/data_frame/queryable_example.rb +92 -0
  49. data/spec/data_frame/setable_example.rb +482 -0
  50. data/spec/data_frame/sortable_example.rb +350 -0
  51. data/spec/dataframe_spec.rb +181 -3289
  52. data/spec/index/index_spec.rb +8 -0
  53. data/spec/vector/aggregatable_example.rb +27 -0
  54. data/spec/vector/calculatable_example.rb +82 -0
  55. data/spec/vector/convertible_example.rb +126 -0
  56. data/spec/vector/duplicatable_example.rb +48 -0
  57. data/spec/vector/fetchable_example.rb +463 -0
  58. data/spec/vector/filterable_example.rb +165 -0
  59. data/spec/vector/indexable_example.rb +201 -0
  60. data/spec/vector/iterable_example.rb +111 -0
  61. data/spec/vector/joinable_example.rb +25 -0
  62. data/spec/vector/missable_example.rb +88 -0
  63. data/spec/vector/queryable_example.rb +91 -0
  64. data/spec/vector/setable_example.rb +300 -0
  65. data/spec/vector/sortable_example.rb +242 -0
  66. data/spec/vector_spec.rb +111 -1805
  67. metadata +86 -2
@@ -0,0 +1,168 @@
1
+ module DaruLite
2
+ class DataFrame
3
+ module Indexable
4
# Strategy used by #set_index when a single column becomes a plain index.
module SetSingleIndexStrategy
  class << self
    # Number of distinct values held in column +col+ of +df+.
    def uniq_size(df, col)
      column = df[col]
      column.uniq.size
    end

    # Builds a DaruLite::Index out of the values of column +col+.
    def new_index(df, col)
      values = df[col].to_a
      DaruLite::Index.new(values)
    end

    # Removes column +col+ from +df+.
    def delete_vector(df, col)
      df.delete_vector(col)
    end
  end
end
17
+
18
# Strategy used by #set_index when the new index is categorical.
# No uniqueness helper is defined here; #set_index skips that check
# for categorical indexes.
module SetCategoricalIndexStrategy
  class << self
    # Builds a DaruLite::CategoricalIndex out of the values of column +col+.
    def new_index(df, col)
      values = df[col].to_a
      DaruLite::CategoricalIndex.new(values)
    end

    # Removes column +col+ from +df+.
    def delete_vector(df, col)
      df.delete_vector(col)
    end
  end
end
27
+
28
# Strategy used by #set_index when several columns form a multi index.
module SetMultiIndexStrategy
  class << self
    # Number of distinct entries in the selection +df[*cols]+.
    def uniq_size(df, cols)
      selection = df[*cols]
      selection.uniq.size
    end

    # Builds a DaruLite::MultiIndex from the selected columns and names
    # it after +cols+.
    def new_index(df, cols)
      arrays = df[*cols].map_vectors(&:to_a)
      multi_index = DaruLite::MultiIndex.from_arrays(arrays)
      multi_index.name = cols
      multi_index
    end

    # Removes every column listed in +cols+ from +df+.
    def delete_vector(df, cols)
      df.delete_vectors(*cols)
    end
  end
end
43
+
44
# Promote one or more columns to be the index of this DataFrame (in place).
#
# @param new_index_col a column name, or a collection of names (anything
#   responding to +to_a+) to build a multi index
# @param keep [Boolean] when true the source column(s) stay in the frame
# @param categorical [Boolean] when true a categorical index is built and
#   the uniqueness check is skipped
# @return [DaruLite::DataFrame] self
# @raise [ArgumentError] when the candidate index holds duplicate entries
def set_index(new_index_col, keep: false, categorical: false)
  strategy =
    if categorical
      SetCategoricalIndexStrategy
    elsif new_index_col.respond_to?(:to_a)
      new_index_col = new_index_col.to_a
      SetMultiIndexStrategy
    else
      SetSingleIndexStrategy
    end

  # Categorical indexes may repeat labels, so only the other kinds are checked.
  if !categorical && @size != strategy.uniq_size(self, new_index_col)
    raise ArgumentError, 'All elements in new index must be unique.'
  end

  self.index = strategy.new_index(self, new_index_col)
  strategy.delete_vector(self, new_index_col) unless keep
  self
end
64
+
65
# Build a new DataFrame laid out along +new_index+, carrying over the rows
# whose labels exist in the current index. Labels unknown to the current
# index produce rows filled with nil.
#
# @param [DaruLite::Index] new_index The new Index (or subclass instance).
# @return [DaruLite::DataFrame] a new, reindexed DataFrame
# @raise [ArgumentError] when +new_index+ is not a DaruLite::Index
# @example Reindexing DataFrame
#   df = DaruLite::DataFrame.new({a: [1,2,3,4], b: [11,22,33,44]},
#     index: ['a','b','c','d'])
#   #=>
#   ##<DaruLite::DataFrame:83278130 @name = b19277b8-c548-41da-ad9a-2ad8c060e273 @size = 4>
#   #     a  b
#   #  a  1 11
#   #  b  2 22
#   #  c  3 33
#   #  d  4 44
#   df.reindex DaruLite::Index.new(['b', 0, 'a', 'g'])
#   #=>
#   ##<DaruLite::DataFrame:83177070 @name = b19277b8-c548-41da-ad9a-2ad8c060e273 @size = 4>
#   #      a   b
#   #  b   2  22
#   #  0 nil nil
#   #  a   1  11
#   #  g nil nil
def reindex(new_index)
  unless new_index.is_a?(DaruLite::Index)
    raise ArgumentError,
          "Must pass the new index of type Index or its subclasses, not #{new_index.class}"
  end

  reindexed = DaruLite::DataFrame.new({}, order: @vectors, index: new_index, name: @name)
  new_index.each do |label|
    reindexed.row[label] =
      if @index.include?(label)
        row[label]
      else
        Array.new(ncols)
      end
  end
  reindexed
end
98
+
99
# Move the current index back into the frame as leading column(s) and
# replace it with the index of +index.to_df+. Works in place.
#
# @return [DaruLite::DataFrame] self
def reset_index
  frame_from_index = index.to_df
  index_names = index.name
  index_names = [index_names] unless index_names.instance_of?(Array)

  # Capture the final column order before the frame is modified.
  final_order = index_names + vectors.to_a

  self.index = frame_from_index.index
  index_names.each { |column| self[column] = frame_from_index[column] }
  self.order = final_order
  self
end
111
+
112
# Replace the row index. The argument is passed through Index.coerce and
# the resulting index is assigned to every vector of the frame.
#
# @param [DaruLite::Index] idx New index object on which the rows of the
#   dataframe are to be indexed.
# @example Reassigning index of a DataFrame
#   df = DaruLite::DataFrame.new({a: [1,2,3,4], b: [11,22,33,44]})
#   df.index.to_a #=> [0,1,2,3]
#
#   df.index = DaruLite::Index.new(['a','b','c','d'])
#   df.index.to_a #=> ['a','b','c','d']
#   df.row['a'].to_a #=> [1,11]
def index=(idx)
  coerced = Index.coerce(idx)
  @index = coerced
  @data.each do |vector|
    vector.index = @index
  end

  self
end
129
+
130
# Build a new DataFrame whose columns follow +new_vectors+. Columns known
# to this frame are carried over; unknown names yield columns of nil.
#
# @param [DaruLite::Index] new_vectors the wanted column index
# @return [DaruLite::DataFrame] a new DataFrame
# @raise [ArgumentError] when +new_vectors+ is not a DaruLite::Index
def reindex_vectors(new_vectors)
  unless new_vectors.is_a?(DaruLite::Index)
    raise ArgumentError,
          "Must pass the new index of type Index or its subclasses, not #{new_vectors.class}"
  end

  reindexed = DaruLite::DataFrame.new({}, order: new_vectors, index: @index, name: @name)
  new_vectors.each do |name|
    reindexed[name] =
      if @vectors.include?(name)
        self[name]
      else
        Array.new(nrows)
      end
  end
  reindexed
end
141
+
142
# Reassign vectors with a new index of type DaruLite::Index or any of its subclasses.
# Each existing vector is renamed, position by position, to the matching
# entry of the new index.
#
# @param new_index [DaruLite::Index] The new index object on which the
#   vectors are to be indexed. Must be of the same size as ncols.
# @return [DaruLite::DataFrame] self
# @raise [ArgumentError] when +new_index+ is not a DaruLite::Index or its
#   size differs from the number of columns
# @example Reassigning vectors of a DataFrame
#   df = DaruLite::DataFrame.new({a: [1,2,3,4], b: [:a,:b,:c,:d], c: [11,22,33,44]})
#   df.vectors.to_a #=> [:a, :b, :c]
#
#   df.vectors = DaruLite::Index.new([:foo, :bar, :baz])
#   df.vectors.to_a #=> [:foo, :bar, :baz]
def vectors=(new_index)
  raise ArgumentError, 'Can only reindex with Index and its subclasses' unless new_index.is_a?(DaruLite::Index)

  if new_index.size != ncols
    # The trailing space keeps the two message halves from running together.
    raise ArgumentError, "Specified index length #{new_index.size} not equal to " \
                         "dataframe size #{ncols}"
  end

  @vectors = new_index
  @data.zip(new_index.to_a).each do |vect, name|
    vect.name = name
  end
  self
end
166
+ end
167
+ end
168
+ end
@@ -0,0 +1,339 @@
1
+ module DaruLite
2
+ class DataFrame
3
+ module Iterable
4
# Yield every entry of the row index in turn.
#
# @return [DaruLite::DataFrame, Enumerator] self when a block is given,
#   otherwise an Enumerator
def each_index(&block)
  if block
    @index.each(&block)
    self
  else
    to_enum(:each_index)
  end
end
12
+
13
# Yield every vector (column) stored in the frame.
#
# @return [DaruLite::DataFrame, Enumerator] self when a block is given,
#   otherwise an Enumerator
def each_vector(&block)
  if block
    @data.each(&block)
    self
  else
    to_enum(:each_vector)
  end
end
21
+
22
+ alias each_column each_vector
23
+
24
# Yield every vector together with its name.
#
# @yield [vector, name]
# @return [DaruLite::DataFrame, Enumerator] self when a block is given,
#   otherwise an Enumerator
def each_vector_with_index
  return to_enum(:each_vector_with_index) unless block_given?

  @vectors.each do |name|
    # @vectors[name] is used as the subscript into @data for this name.
    position = @vectors[name]
    yield @data[position], name
  end

  self
end
34
+
35
+ alias each_column_with_index each_vector_with_index
36
+
37
# Yield every row, fetched positionally through #row_at.
#
# @return [DaruLite::DataFrame, Enumerator] self when a block is given,
#   otherwise an Enumerator
def each_row
  return to_enum(:each_row) unless block_given?

  (0...@index.size).each { |position| yield row_at(position) }

  self
end
47
+
48
# Yield every row (looked up through #access_row) together with its
# index label.
#
# @yield [row, label]
# @return [DaruLite::DataFrame, Enumerator] self when a block is given,
#   otherwise an Enumerator
def each_row_with_index
  return to_enum(:each_row_with_index) unless block_given?

  @index.each do |label|
    current_row = access_row(label)
    yield current_row, label
  end

  self
end
57
+
58
# Iterate over the vectors or the rows of the DataFrame.
#
# == Description
#
# Behaves like Array#each. Columns are visited by default; pass +:row+
# to walk over rows instead.
#
# == Arguments
#
# * +axis+ - The axis to iterate over. Can be :vector (or :column)
#   or :row. Default to :vector.
def each(axis = :vector, &block)
  dispatch_to_axis(axis, :each, &block)
end
74
+
75
# Run the block over each vector or row and gather the results in a
# DaruLite::Vector.
#
# == Description
#
# Similar to #map, except that the block results are returned as a
# DaruLite::Vector indexed like the axis that was iterated over.
#
# == Arguments
#
# * +axis+ - The axis to iterate over. Can be :vector (or :column)
#   or :row. Default to :vector.
def collect(axis = :vector, &block)
  dispatch_to_axis_pl(axis, :collect, &block)
end
93
+
94
# Map over each vector or row and return an Array of the block results.
# Use #recode to obtain a DataFrame instead.
#
# == Description
#
# Works like Array#map: every block result is appended to an Array,
# which is returned. Accepts the same axis argument as #each.
#
# == Arguments
#
# * +axis+ - The axis to map over. Can be :vector (or :column) or :row.
#   Default to :vector.
def map(axis = :vector, &block)
  dispatch_to_axis_pl(axis, :map, &block)
end
113
+
114
# Destructive map. Modifies the DataFrame. Each run of the block
# must return a DaruLite::Vector. You can specify the axis to map over
# as the argument. Default to :vector.
#
# == Arguments
#
# * +axis+ - The axis to map over. Can be :vector (or :column) or :row.
#   Default to :vector.
#
# Raises ArgumentError for any other axis instead of silently returning
# nil and leaving the frame untouched.
def map!(axis = :vector, &block)
  case axis
  when :vector, :column then map_vectors!(&block)
  when :row             then map_rows!(&block)
  else raise ArgumentError, "Unknown axis #{axis}"
  end
end
129
+
130
# Map over the DataFrame and return a DataFrame. Every block run is
# expected to return a DaruLite::Vector.
#
# == Description
#
# Comparable to #map, except that the outcome is a modified
# DaruLite::DataFrame rather than an Array, which is why each block run
# has to produce a DaruLite::Vector. Accepts the same optional _axis_
# argument as #map and #each.
#
# == Arguments
#
# * +axis+ - The axis to map over. Can be :vector (or :column) or :row.
#   Default to :vector.
def recode(axis = :vector, &block)
  dispatch_to_axis_pl(axis, :recode, &block)
end
150
+
151
# Replace the given values with a new value in every vector of the frame
# by delegating to each vector's own #replace_values.
#
# @param [Array] old_values values to replace with new value
# @param [object] new_value new value to replace with
# @return [DaruLite::DataFrame] the DataFrame itself, with the old values
#   replaced by the new value
# @example
#   df = DaruLite::DataFrame.new({
#     a: [1,    2,          3,   nil,        Float::NAN, nil, 1,   7],
#     b: [:a,  :b,          nil, Float::NAN, nil,        3,   5,   8],
#     c: ['a',  Float::NAN, 3,   4,          3,          5,   nil, 7]
#   }, index: 11..18)
#   df.replace_values nil, Float::NAN
#   # => #<DaruLite::DataFrame(8x3)>
#   #        a   b   c
#   #   11   1   a   a
#   #   12   2   b NaN
#   #   13   3 NaN   3
#   #   14 NaN NaN   4
#   #   15 NaN NaN   3
#   #   16 NaN   3   5
#   #   17   1   5 NaN
#   #   18   7   8   7
def replace_values(old_values, new_value)
  @data.each do |vector|
    vector.replace_values(old_values, new_value)
  end
  self
end
177
+
178
# Test each row with one or more tests and return an array with one
# message per failed test.
#
# Each test is destructured as an array whose last element is callable
# with the row (e.g. *Proc.new {|row| row[:age] > 0}*); the first two
# elements are read as a description and a list of fields when the error
# message is built. If the first argument is a Symbol it is used as the
# column that identifies a row in the messages; otherwise the first
# vector name is used.
#
# FIXME: description here is too sparse. As far as I can get,
# it should tell something about that each test is [descr, fields, block],
# and that first value may be column name to output. - zverok, 2016-05-18
def verify(*tests)
  id = tests.first.is_a?(Symbol) ? tests.shift : @vectors.first

  each_row_with_index.flat_map do |row, i|
    failed = tests.reject { |*_, check| check.call(row) }
    failed.map { |test| verify_error_message(row, test, id, i) }
  end
end
194
+
195
# Return a duplicate of the frame in which every vector has been replaced
# by the block's result for it. The block must return a DaruLite::Vector.
#
# @return [DaruLite::DataFrame, Enumerator] the recoded duplicate, or an
#   Enumerator when no block is given
def recode_vectors
  return to_enum(:recode_vectors) unless block_given?

  copy = dup
  copy.each_vector_with_index do |vector, name|
    copy[*name] = should_be_vector!(yield(vector))
  end
  copy
end
204
+
205
# Return a duplicate of the frame in which every row has been replaced
# by the block's result for it. The block must return a DaruLite::Vector.
#
# @return [DaruLite::DataFrame, Enumerator] the recoded duplicate, or an
#   Enumerator when no block is given
def recode_rows
  return to_enum(:recode_rows) unless block_given?

  copy = dup
  copy.each_row_with_index do |current_row, label|
    copy.row[label] = should_be_vector!(yield(current_row))
  end
  copy
end
214
+
215
# Map over the vectors and return the block results as an Array.
# Returns an Enumerator when no block is given.
def map_vectors(&block)
  block ? @data.map(&block) : to_enum(:map_vectors)
end
221
+
222
# In-place variant of #map_vectors: every vector is replaced by the
# block's result, which must be a DaruLite::Vector.
#
# @return [DaruLite::DataFrame, Enumerator] self, or an Enumerator when
#   no block is given
def map_vectors!
  return to_enum(:map_vectors!) unless block_given?

  # Iterate over a copy of the names because the frame is reassigned
  # during the loop.
  names = vectors.dup
  names.each do |name|
    replacement = should_be_vector!(yield(self[name]))
    self[name] = replacement
  end

  self
end
232
+
233
# Map over the vectors together with their names and return the block
# results as an Array. Returns an Enumerator when no block is given.
def map_vectors_with_index(&block)
  if block
    each_vector_with_index.map(&block)
  else
    to_enum(:map_vectors_with_index)
  end
end
239
+
240
# Map over the rows and return the block results as an Array.
# Returns an Enumerator when no block is given.
def map_rows(&block)
  if block
    each_row.map(&block)
  else
    to_enum(:map_rows)
  end
end
246
+
247
# Map over the rows together with their index labels and return the
# block results as an Array. Returns an Enumerator when no block is given.
def map_rows_with_index(&block)
  if block
    each_row_with_index.map(&block)
  else
    to_enum(:map_rows_with_index)
  end
end
252
+
253
# In-place row mapping: every row is replaced by the block's result,
# which must be a DaruLite::Vector.
#
# @return [DaruLite::DataFrame, Enumerator] self, or an Enumerator when
#   no block is given
def map_rows!
  return to_enum(:map_rows!) unless block_given?

  # Iterate over a copy of the index because rows are reassigned
  # during the loop.
  labels = index.dup
  labels.each do |label|
    replacement = should_be_vector!(yield(row[label]))
    row[label] = replacement
  end

  self
end
262
+
263
# Apply one or several methods to the frame, or to a sub-frame of it.
#
# @param method [Symbol, Proc, Array] a Symbol is sent to the frame, a
#   Proc is called with the frame, and an Array of Symbols and/or Procs
#   yields an Array with one result per element
# @param keys when given, the method is applied to
#   +get_sub_dataframe(keys, by_position: by_position)+ instead of self
# @param by_position [Boolean] forwarded to +get_sub_dataframe+
# @return the result of the call, or an Array of results for an Array
# @raise [RuntimeError] when +method+ is not a Symbol, Proc or Array
def apply_method(method, keys: nil, by_position: true)
  df = keys ? get_sub_dataframe(keys, by_position: by_position) : self

  case method
  when Symbol then df.send(method)
  when Proc then method.call(df)
  when Array
    # works with Array of both Symbol and/or Proc
    method.map { |callable| callable.to_proc.call(df) }
  else
    # Still a RuntimeError, as the bare `raise` produced, but with a
    # message that names the offending argument.
    raise "Invalid method #{method.inspect}: expected a Symbol, a Proc or an Array of them"
  end
end
274
+ alias apply_method_on_sub_df apply_method
275
+
276
# Build a DaruLite::Vector, indexed like the rows of the frame, from the
# block's result for each row. Returns an Enumerator without a block.
def collect_rows(&block)
  return to_enum(:collect_rows) unless block

  results = each_row.map(&block)
  DaruLite::Vector.new(results, index: @index)
end
283
+
284
# Like #collect_rows, but the block also receives the index label of
# each row. Returns an Enumerator without a block.
def collect_row_with_index(&block)
  return to_enum(:collect_row_with_index) unless block

  results = each_row_with_index.map(&block)
  DaruLite::Vector.new(results, index: @index)
end
289
+
290
# Build a DaruLite::Vector, indexed by the vector names, from the block's
# result for each vector. Returns an Enumerator without a block.
def collect_vectors(&block)
  return to_enum(:collect_vectors) unless block

  results = each_vector.map(&block)
  DaruLite::Vector.new(results, index: @vectors)
end
297
+
298
# Like #collect_vectors, but the block also receives the name of each
# vector. Returns an Enumerator without a block.
def collect_vector_with_index(&block)
  return to_enum(:collect_vector_with_index) unless block

  results = each_vector_with_index.map(&block)
  DaruLite::Vector.new(results, index: @vectors)
end
303
+
304
# Generate a square matrix by yielding every ordered pair of vector names
# (row name, column name) to the block.
#
# @return {::Matrix}
# :nocov:
# FIXME: Even not trying to cover this: I can't get, how it is expected
# to work.... -- zverok
def collect_matrix
  return to_enum(:collect_matrix) unless block_given?

  names = vectors.to_a
  cells = names.map do |row_name|
    names.map { |col_name| yield row_name, col_name }
  end

  Matrix.rows(cells)
end
# :nocov:
323
+
324
+ private
325
+
326
# Guard used by the recode/map! helpers: passes +val+ through when it is
# a DaruLite::Vector and raises TypeError otherwise.
def should_be_vector!(val)
  unless val.is_a?(DaruLite::Vector)
    raise TypeError, "Every iteration must return DaruLite::Vector not #{val.class}"
  end

  val
end
331
+
332
# Format the message reported by #verify for one failed test.
#
# @param row the row that failed; read with +row[key]+
# @param test [Array] the failed test; its first two elements are used as
#   the description and the list of fields to print
# @param id key of the value that identifies the row in the message
# @param i index label of the row. Integer labels are printed 1-based
#   (+i + 1+); any other label is printed as is, because it cannot
#   meaningfully be incremented.
# @return [String] "<position> [<row id>]: <description>[ (<field>=<value>, ...)]"
def verify_error_message(row, test, id, i)
  description, fields, = test
  values = fields.empty? ? '' : " (#{fields.map { |k| "#{k}=#{row[k]}" }.join(', ')})"
  position = i.is_a?(Integer) ? i + 1 : i
  "#{position} [#{row[id]}]: #{description}#{values}"
end
337
+ end
338
+ end
339
+ end
@@ -0,0 +1,152 @@
1
+ module DaruLite
2
+ class DataFrame
3
+ module Joinable
4
# Concatenate another DataFrame along corresponding columns.
# A column missing from one of the two frames is obtained through
# #get_vector_anyways, so such columns are filled with nils.
#
# @param other_df [DaruLite::DataFrame] the frame to append
# @return [DaruLite::DataFrame] a new DataFrame
def concat(other_df)
  # Array#| keeps first-seen order and drops duplicates.
  all_names = @vectors.to_a | other_df.vectors.to_a

  columns = all_names.map do |name|
    own = get_vector_anyways(name).dup
    own.concat(other_df.get_vector_anyways(name))
  end

  DaruLite::DataFrame.new(columns, order: all_names)
end
15
+
16
# Concatenates another DataFrame as #concat.
# Additionally it tries to preserve the index. If the indices contain
# common elements, #union will overwrite the according rows in the
# first dataframe.
#
# The result holds the rows found only in this frame, followed by every
# row of +other_df+. The new index is built in that same order so each
# label stays attached to its own row (building it as
# +(index + other index).uniq+ mislabels rows when the indices overlap).
#
# @param other_df [DaruLite::DataFrame] the frame to unite with
# @return [DaruLite::DataFrame] a new DataFrame
def union(other_df)
  other_labels = other_df.index.to_a
  kept_labels = @index.to_a - other_labels

  df = row[*kept_labels]
  df = df.concat(other_df)
  df.index = DaruLite::Index.new(kept_labels + other_labels)
  df
end
28
+
29
# Merge the vectors of two DataFrames side by side. The combined vector
# names are passed through ArrayHelper.recode_repeated, so colliding
# names are changed to x_1, x_2 ....
#
# @param other_df [DaruLite::DataFrame] frame with the same number of rows
# @return {DaruLite::DataFrame}
# @raise [ArgumentError] when the two frames differ in number of rows
def merge(other_df)
  if nrows != other_df.nrows
    raise ArgumentError,
          "Number of rows must be equal in this: #{nrows} and other: #{other_df.nrows}"
  end

  combined_names = ArrayHelper.recode_repeated(@vectors.to_a + other_df.vectors.to_a)
  merged = DataFrame.new({}, order: combined_names)

  nrows.times do |i|
    merged.add_row(row[i].to_a + other_df.row[i].to_a)
  end

  # The index is carried over only when both frames share it.
  merged.index = @index if @index == other_df.index
  merged.update
  merged
end
49
+
50
# Join 2 DataFrames with SQL style joins by delegating to
# DaruLite::Core::Merge.join. Currently supports inner, left outer,
# right outer and full outer joins.
#
# @param [DaruLite::DataFrame] other_df Another DataFrame on which the join is
#   to be performed.
# @param [Hash] opts Options Hash
# @option :how [Symbol] Can be one of :inner, :left, :right or :outer.
# @option :on [Array] The columns on which the join is to be performed.
#   Column names specified here must be common to both DataFrames.
# @option :indicator [Symbol] The name of a vector to add to the resultant
#   dataframe that indicates whether the record was in the left (:left_only),
#   right (:right_only), or both (:both) joining dataframes.
# @return [DaruLite::DataFrame]
# @example Inner Join
#   left = DaruLite::DataFrame.new({
#     :id   => [1,2,3,4],
#     :name => ['Pirate', 'Monkey', 'Ninja', 'Spaghetti']
#   })
#   right = DaruLite::DataFrame.new({
#     :id   => [1,2,3,4],
#     :name => ['Rutabaga', 'Pirate', 'Darth Vader', 'Ninja']
#   })
#   left.join(right, how: :inner, on: [:name])
#   #=>
#   ##<DaruLite::DataFrame:82416700 @name = 74c0811b-76c6-4c42-ac93-e6458e82afb0 @size = 2>
#   #                 id_1       name       id_2
#   #         0          1     Pirate          2
#   #         1          3      Ninja          4
def join(other_df, opts = {})
  merger = DaruLite::Core::Merge
  merger.join(self, other_df, opts)
end
81
+
82
# Creates a new dataset for one to many relations
# on a dataset, based on pattern of field names.
#
# For example, you have a survey for number of children
# with this structure:
#   id, name, child_name_1, child_age_1, child_name_2, child_age_2
# with
#   ds.one_to_many([:id], "child_%v_%n")
# the fields named in the first parameter are copied verbatim
# to the new dataset, and the fields matching the second
# pattern are added as one case for each different %n.
# Every generated row also stores its %n under '_col_id'; combinations
# whose generated values are all nil are skipped.
#
# @param parent_fields [Array] fields copied verbatim into every row
# @param pattern [String] field name pattern using %v and %n
# @return [DaruLite::DataFrame]
# @example
#   cases=[
#     ['1','george','red',10,'blue',20,nil,nil],
#     ['2','fred','green',15,'orange',30,'white',20],
#     ['3','alfred',nil,nil,nil,nil,nil,nil]
#   ]
#   ds=DaruLite::DataFrame.rows(cases, order:
#     [:id, :name,
#      :car_color1, :car_value1,
#      :car_color2, :car_value2,
#      :car_color3, :car_value3])
#   ds.one_to_many([:id],'car_%v%n').to_matrix
#   #=> Matrix[
#   #   ["red", "1", 10],
#   #   ["blue", "1", 20],
#   #   ["green", "2", 15],
#   #   ["orange", "2", 30],
#   #   ["white", "2", 20]
#   #   ]
def one_to_many(parent_fields, pattern)
  vars, numbers = one_to_many_components(pattern)
  result = DataFrame.new([], order: [*parent_fields, '_col_id', *vars])

  each_row do |row|
    verbatim = parent_fields.to_h { |field| [field, row[field]] }

    numbers.each do |number|
      generated = one_to_many_row(row, number, vars, pattern)
      next if generated.values.all?(&:nil?)

      result.add_row(verbatim.merge(generated).merge('_col_id' => number))
    end
  end

  result.update
  result
end
129
+
130
+ private
131
+
132
# Extract, from the vector names, the distinct %v variable names and the
# distinct %n numbers that match +pattern+.
#
# The pattern is anchored to the whole vector name: #one_to_many_row
# rebuilds full names from the pattern, and anchoring also makes the lazy
# digit group capture multi-digit numbers completely ("car_color10"
# yields 10, not 1). Names are converted with #to_s so Symbol names can
# be scanned as well.
#
# @param pattern [String] field name pattern using %v and %n
# @return [Array(Array<String>, Array<Integer>)] unique variable names and
#   sorted unique numbers; two empty arrays when no vector matches
def one_to_many_components(pattern)
  body = pattern.gsub('%v', '(.+?)').gsub('%n', '(\\d+?)')
  re = Regexp.new("\\A#{body}\\z")

  matches = @vectors.flat_map { |name| name.to_s.scan(re) }
  # Transposing an empty list would leave vars and numbers nil.
  return [[], []] if matches.empty?

  vars, numbers = matches.transpose
  [vars.uniq, numbers.map(&:to_i).sort.uniq]
end
142
+
143
# For one row and one %n value, map each variable name to the value of
# the field whose name is +pattern+ with %v and %n substituted.
#
# @param row the source row; read with +row[field_name]+
# @param number the %n value to substitute
# @param vars [Array<String>] the %v variable names
# @param pattern [String] field name pattern using %v and %n
# @return [Hash] variable name => value found in +row+
def one_to_many_row(row, number, vars, pattern)
  suffix = number.to_s
  vars.each_with_object({}) do |var, generated|
    field_name = pattern.sub('%v', var).sub('%n', suffix)
    generated[var] = row[field_name]
  end
end
150
+ end
151
+ end
152
+ end