daru_lite 0.1.1 → 0.1.3

Sign up to get free protection for your applications and to get access to all the features.
Files changed (70) hide show
  1. checksums.yaml +4 -4
  2. data/.rubocop_todo.yml +35 -33
  3. data/lib/daru_lite/data_frame/aggregatable.rb +165 -0
  4. data/lib/daru_lite/data_frame/calculatable.rb +140 -0
  5. data/lib/daru_lite/data_frame/convertible.rb +107 -0
  6. data/lib/daru_lite/data_frame/duplicatable.rb +64 -0
  7. data/lib/daru_lite/data_frame/fetchable.rb +301 -0
  8. data/lib/daru_lite/data_frame/filterable.rb +144 -0
  9. data/lib/daru_lite/data_frame/i_o_able.rb +179 -0
  10. data/lib/daru_lite/data_frame/indexable.rb +168 -0
  11. data/lib/daru_lite/data_frame/iterable.rb +339 -0
  12. data/lib/daru_lite/data_frame/joinable.rb +152 -0
  13. data/lib/daru_lite/data_frame/missable.rb +75 -0
  14. data/lib/daru_lite/data_frame/pivotable.rb +108 -0
  15. data/lib/daru_lite/data_frame/queryable.rb +67 -0
  16. data/lib/daru_lite/data_frame/setable.rb +109 -0
  17. data/lib/daru_lite/data_frame/sortable.rb +241 -0
  18. data/lib/daru_lite/dataframe.rb +138 -2353
  19. data/lib/daru_lite/index/index.rb +14 -1
  20. data/lib/daru_lite/index/multi_index.rb +9 -0
  21. data/lib/daru_lite/maths/statistics/vector.rb +1 -1
  22. data/lib/daru_lite/vector/aggregatable.rb +9 -0
  23. data/lib/daru_lite/vector/calculatable.rb +78 -0
  24. data/lib/daru_lite/vector/convertible.rb +77 -0
  25. data/lib/daru_lite/vector/duplicatable.rb +17 -0
  26. data/lib/daru_lite/vector/fetchable.rb +175 -0
  27. data/lib/daru_lite/vector/filterable.rb +128 -0
  28. data/lib/daru_lite/vector/indexable.rb +77 -0
  29. data/lib/daru_lite/vector/iterable.rb +95 -0
  30. data/lib/daru_lite/vector/joinable.rb +17 -0
  31. data/lib/daru_lite/vector/missable.rb +124 -0
  32. data/lib/daru_lite/vector/queryable.rb +45 -0
  33. data/lib/daru_lite/vector/setable.rb +47 -0
  34. data/lib/daru_lite/vector/sortable.rb +113 -0
  35. data/lib/daru_lite/vector.rb +36 -932
  36. data/lib/daru_lite/version.rb +1 -1
  37. data/spec/data_frame/aggregatable_example.rb +65 -0
  38. data/spec/data_frame/buildable_example.rb +109 -0
  39. data/spec/data_frame/calculatable_example.rb +135 -0
  40. data/spec/data_frame/convertible_example.rb +180 -0
  41. data/spec/data_frame/duplicatable_example.rb +111 -0
  42. data/spec/data_frame/fetchable_example.rb +476 -0
  43. data/spec/data_frame/filterable_example.rb +409 -0
  44. data/spec/data_frame/indexable_example.rb +221 -0
  45. data/spec/data_frame/iterable_example.rb +465 -0
  46. data/spec/data_frame/joinable_example.rb +106 -0
  47. data/spec/data_frame/missable_example.rb +47 -0
  48. data/spec/data_frame/pivotable_example.rb +297 -0
  49. data/spec/data_frame/queryable_example.rb +92 -0
  50. data/spec/data_frame/setable_example.rb +482 -0
  51. data/spec/data_frame/sortable_example.rb +350 -0
  52. data/spec/dataframe_spec.rb +181 -3289
  53. data/spec/index/categorical_index_spec.rb +27 -8
  54. data/spec/index/index_spec.rb +21 -0
  55. data/spec/index/multi_index_spec.rb +85 -76
  56. data/spec/vector/aggregatable_example.rb +27 -0
  57. data/spec/vector/calculatable_example.rb +82 -0
  58. data/spec/vector/convertible_example.rb +126 -0
  59. data/spec/vector/duplicatable_example.rb +48 -0
  60. data/spec/vector/fetchable_example.rb +463 -0
  61. data/spec/vector/filterable_example.rb +165 -0
  62. data/spec/vector/indexable_example.rb +201 -0
  63. data/spec/vector/iterable_example.rb +111 -0
  64. data/spec/vector/joinable_example.rb +25 -0
  65. data/spec/vector/missable_example.rb +88 -0
  66. data/spec/vector/queryable_example.rb +91 -0
  67. data/spec/vector/setable_example.rb +300 -0
  68. data/spec/vector/sortable_example.rb +242 -0
  69. data/spec/vector_spec.rb +111 -1805
  70. metadata +86 -2
@@ -0,0 +1,168 @@
1
module DaruLite
  class DataFrame
    # Row-index and vector-index (column label) management for a DataFrame:
    # promoting columns to an index, reindexing rows/vectors and reassigning
    # either index in place.
    module Indexable
      # Strategy used by #set_index when one column becomes a plain Index.
      module SetSingleIndexStrategy
        # @return [Integer] number of distinct values in column +col+ of +df+
        def self.uniq_size(df, col)
          df[col].uniq.size
        end

        # @return [DaruLite::Index] index built from the values of column +col+
        def self.new_index(df, col)
          DaruLite::Index.new(df[col].to_a)
        end

        # Removes column +col+ from +df+.
        def self.delete_vector(df, col)
          df.delete_vector(col)
        end
      end

      # Strategy used by #set_index when +categorical: true+ is requested.
      # It deliberately has no +uniq_size+: #set_index skips the uniqueness
      # check for categorical indexes.
      module SetCategoricalIndexStrategy
        # @return [DaruLite::CategoricalIndex] index built from column +col+
        def self.new_index(df, col)
          DaruLite::CategoricalIndex.new(df[col].to_a)
        end

        # Removes column +col+ from +df+.
        def self.delete_vector(df, col)
          df.delete_vector(col)
        end
      end

      # Strategy used by #set_index when several columns become a MultiIndex.
      module SetMultiIndexStrategy
        # @return [Integer] number of distinct rows in the sub-frame made of +cols+
        def self.uniq_size(df, cols)
          df[*cols].uniq.size
        end

        # @return [DaruLite::MultiIndex] multi index built from the values of
        #   +cols+, named after those columns
        def self.new_index(df, cols)
          DaruLite::MultiIndex.from_arrays(df[*cols].map_vectors(&:to_a)).tap do |mi|
            mi.name = cols
          end
        end

        # Removes every column listed in +cols+ from +df+.
        def self.delete_vector(df, cols)
          df.delete_vectors(*cols)
        end
      end

      # Set a particular column (or several columns) as the new index of the
      # DataFrame. Modifies the receiver.
      #
      # @param new_index_col [Object, #to_a] name of the column to promote; when
      #   it responds to +to_a+ (and +categorical+ is false) it is treated as a
      #   list of columns and a MultiIndex is built
      # @param keep [Boolean] when false (default) the promoted column(s) are
      #   deleted from the DataFrame
      # @param categorical [Boolean] build a CategoricalIndex; in this mode
      #   duplicated values are allowed
      # @raise [ArgumentError] if the new index values are not unique
      #   (non-categorical mode only)
      # @return [DaruLite::DataFrame] self
      def set_index(new_index_col, keep: false, categorical: false)
        if categorical
          strategy = SetCategoricalIndexStrategy
        elsif new_index_col.respond_to?(:to_a)
          strategy = SetMultiIndexStrategy
          new_index_col = new_index_col.to_a
        else
          strategy = SetSingleIndexStrategy
        end

        unless categorical
          uniq_size = strategy.uniq_size(self, new_index_col)
          raise ArgumentError, 'All elements in new index must be unique.' if @size != uniq_size
        end

        self.index = strategy.new_index(self, new_index_col)
        strategy.delete_vector(self, new_index_col) unless keep
        self
      end

      # Change the index of the DataFrame and preserve the labels of the previous
      # indexing. New index can be DaruLite::Index or any of its subclasses.
      # Returns a new DataFrame; labels missing from the current index yield
      # rows filled with nil.
      #
      # @param [DaruLite::Index] new_index The new Index for reindexing the DataFrame.
      # @raise [ArgumentError] if +new_index+ is not a DaruLite::Index
      # @return [DaruLite::DataFrame] the reindexed copy
      # @example Reindexing DataFrame
      #   df = DaruLite::DataFrame.new({a: [1,2,3,4], b: [11,22,33,44]},
      #     index: ['a','b','c','d'])
      #   #=>
      #   ##<DaruLite::DataFrame:83278130 @name = b19277b8-c548-41da-ad9a-2ad8c060e273 @size = 4>
      #   #       a   b
      #   #   a   1  11
      #   #   b   2  22
      #   #   c   3  33
      #   #   d   4  44
      #   df.reindex DaruLite::Index.new(['b', 0, 'a', 'g'])
      #   #=>
      #   ##<DaruLite::DataFrame:83177070 @name = b19277b8-c548-41da-ad9a-2ad8c060e273 @size = 4>
      #   #       a   b
      #   #   b   2  22
      #   #   0 nil nil
      #   #   a   1  11
      #   #   g nil nil
      def reindex(new_index)
        unless new_index.is_a?(DaruLite::Index)
          raise ArgumentError, 'Must pass the new index of type Index or its ' \
                               "subclasses, not #{new_index.class}"
        end

        cl = DaruLite::DataFrame.new({}, order: @vectors, index: new_index, name: @name)
        new_index.each_with_object(cl) do |idx, memo|
          memo.row[idx] = @index.include?(idx) ? row[idx] : Array.new(ncols)
        end
      end

      # Moves the current index back into the DataFrame as regular vector(s),
      # placed before the existing vectors, and replaces the index with the
      # index of +index.to_df+. Modifies the receiver.
      #
      # The new vectors are named after +index.name+ (an Array of names is
      # used as-is, a single name is wrapped in an Array).
      #
      # @return [DaruLite::DataFrame] self
      def reset_index
        index_df = index.to_df
        names = index.name
        names = [names] unless names.instance_of?(Array)
        new_vectors = names + vectors.to_a
        self.index = index_df.index
        names.each do |name|
          self[name] = index_df[name]
        end
        self.order = new_vectors
        self
      end

      # Reassign index with a new index of type DaruLite::Index or any of its subclasses.
      # The argument is passed through Index.coerce and the resulting index is
      # also assigned to every vector held by the DataFrame.
      #
      # @param [DaruLite::Index] idx New index object on which the rows of the dataframe
      #   are to be indexed.
      # @example Reassigning index of a DataFrame
      #   df = DaruLite::DataFrame.new({a: [1,2,3,4], b: [11,22,33,44]})
      #   df.index.to_a #=> [0,1,2,3]
      #
      #   df.index = DaruLite::Index.new(['a','b','c','d'])
      #   df.index.to_a #=> ['a','b','c','d']
      #   df.row['a'].to_a #=> [1,11]
      def index=(idx)
        @index = Index.coerce idx
        @data.each { |vec| vec.index = @index }

        self
      end

      # Column-wise counterpart of #reindex: returns a new DataFrame whose
      # vectors follow +new_vectors+. Vectors missing from the current
      # DataFrame are filled with nil.
      #
      # @param [DaruLite::Index] new_vectors the vector names of the result
      # @raise [ArgumentError] if +new_vectors+ is not a DaruLite::Index
      # @return [DaruLite::DataFrame] the reindexed copy
      def reindex_vectors(new_vectors)
        unless new_vectors.is_a?(DaruLite::Index)
          raise ArgumentError, 'Must pass the new index of type Index or its ' \
                               "subclasses, not #{new_vectors.class}"
        end

        cl = DaruLite::DataFrame.new({}, order: new_vectors, index: @index, name: @name)
        new_vectors.each_with_object(cl) do |vec, memo|
          memo[vec] = @vectors.include?(vec) ? self[vec] : Array.new(nrows)
        end
      end

      # Reassign vectors with a new index of type DaruLite::Index or any of its subclasses.
      # Every underlying vector is renamed to the label at the same position.
      #
      # @param new_index [DaruLite::Index] The new index object on which the vectors are to
      #   be indexed. Must be of the same size as ncols.
      # @raise [ArgumentError] if +new_index+ is not a DaruLite::Index or its
      #   size differs from +ncols+
      # @example Reassigning vectors of a DataFrame
      #   df = DaruLite::DataFrame.new({a: [1,2,3,4], b: [:a,:b,:c,:d], c: [11,22,33,44]})
      #   df.vectors.to_a #=> [:a, :b, :c]
      #
      #   df.vectors = DaruLite::Index.new([:foo, :bar, :baz])
      #   df.vectors.to_a #=> [:foo, :bar, :baz]
      def vectors=(new_index)
        raise ArgumentError, 'Can only reindex with Index and its subclasses' unless new_index.is_a?(DaruLite::Index)

        if new_index.size != ncols
          # Trailing space matters: the two literals are concatenated verbatim.
          raise ArgumentError, "Specified index length #{new_index.size} not equal to " \
                               "dataframe size #{ncols}"
        end

        @vectors = new_index
        @data.zip(new_index.to_a).each do |vect, name|
          vect.name = name
        end
        self
      end
    end
  end
end
@@ -0,0 +1,339 @@
1
module DaruLite
  class DataFrame
    # Iteration helpers for a DataFrame: each/map/collect/recode over vectors
    # (columns) or rows, plus a few row-validation and value-replacement
    # utilities. Most iterators return an Enumerator when no block is given.
    module Iterable
      # Iterate over each index of the DataFrame.
      #
      # @return [DaruLite::DataFrame, Enumerator] self, or an Enumerator when
      #   called without a block
      def each_index(&block)
        return to_enum(:each_index) unless block

        @index.each(&block)

        self
      end

      # Iterate over each vector
      #
      # @return [DaruLite::DataFrame, Enumerator] self, or an Enumerator when
      #   called without a block
      def each_vector(&block)
        return to_enum(:each_vector) unless block

        @data.each(&block)

        self
      end

      alias each_column each_vector

      # Iterate over each vector along with the name of the vector
      #
      # @yield [vector, name]
      # @return [DaruLite::DataFrame, Enumerator]
      def each_vector_with_index
        return to_enum(:each_vector_with_index) unless block_given?

        @vectors.each do |vector|
          yield @data[@vectors[vector]], vector
        end

        self
      end

      alias each_column_with_index each_vector_with_index

      # Iterate over each row, by position.
      #
      # @return [DaruLite::DataFrame, Enumerator]
      def each_row
        return to_enum(:each_row) unless block_given?

        @index.size.times do |pos|
          yield row_at(pos)
        end

        self
      end

      # Iterate over each row along with its index label.
      #
      # @yield [row, index]
      # @return [DaruLite::DataFrame, Enumerator]
      def each_row_with_index
        return to_enum(:each_row_with_index) unless block_given?

        @index.each do |index|
          yield access_row(index), index
        end

        self
      end

      # Iterate over each row or vector of the DataFrame. Specify axis
      # by passing :vector or :row as the argument. Default to :vector.
      #
      # == Description
      #
      # `#each` works exactly like Array#each. The default mode for `each`
      # is to iterate over the columns of the DataFrame. To iterate over
      # rows you must pass the axis, i.e `:row` as an argument.
      #
      # == Arguments
      #
      # * +axis+ - The axis to iterate over. Can be :vector (or :column)
      #   or :row. Default to :vector.
      def each(axis = :vector, &block)
        dispatch_to_axis axis, :each, &block
      end

      # Iterate over a row or vector and return results in a DaruLite::Vector.
      # Specify axis with :vector or :row. Default to :vector.
      #
      # == Description
      #
      # The #collect iterator works similar to #map, the only difference
      # being that it returns a DaruLite::Vector comprising of the results of
      # each block run. The resultant Vector has the same index as that
      # of the axis over which collect has iterated. It also accepts the
      # optional axis argument.
      #
      # == Arguments
      #
      # * +axis+ - The axis to iterate over. Can be :vector (or :column)
      #   or :row. Default to :vector.
      def collect(axis = :vector, &block)
        dispatch_to_axis_pl axis, :collect, &block
      end

      # Map over each vector or row of the data frame according to
      # the argument specified. Will return an Array of the resulting
      # elements. To map over each row/vector and get a DataFrame,
      # see #recode.
      #
      # == Description
      #
      # The #map iterator works like Array#map. The value returned by
      # each run of the block is added to an Array and the Array is
      # returned. This method also accepts an axis argument, like #each.
      # The default is :vector.
      #
      # == Arguments
      #
      # * +axis+ - The axis to map over. Can be :vector (or :column) or :row.
      #   Default to :vector.
      def map(axis = :vector, &block)
        dispatch_to_axis_pl axis, :map, &block
      end

      # Destructive map. Modifies the DataFrame. Each run of the block
      # must return a DaruLite::Vector. You can specify the axis to map over
      # as the argument. Default to :vector.
      #
      # Any axis other than :vector, :column or :row is ignored: nothing is
      # modified and nil is returned.
      #
      # == Arguments
      #
      # * +axis+ - The axis to map over. Can be :vector (or :column) or :row.
      #   Default to :vector.
      def map!(axis = :vector, &block)
        if %i[vector column].include?(axis)
          map_vectors!(&block)
        elsif axis == :row
          map_rows!(&block)
        end
      end

      # Maps over the DataFrame and returns a DataFrame. Each run of the
      # block must return a DaruLite::Vector object. You can specify the axis
      # to map over. Default to :vector.
      #
      # == Description
      #
      # Recode works similarly to #map, but an important difference between
      # the two is that recode returns a modified DaruLite::DataFrame instead
      # of an Array. For this reason, #recode expects that every run of the
      # block to return a DaruLite::Vector.
      #
      # Just like map and each, recode also accepts an optional _axis_ argument.
      #
      # == Arguments
      #
      # * +axis+ - The axis to map over. Can be :vector (or :column) or :row.
      #   Default to :vector.
      def recode(axis = :vector, &block)
        dispatch_to_axis_pl axis, :recode, &block
      end

      # Replace specified values with given value
      # @param [Array] old_values values to replace with new value
      # @param [object] new_value new value to replace with
      # @return [DaruLite::DataFrame] Data Frame itself with old values replace
      #   with new value
      # @example
      #   df = DaruLite::DataFrame.new({
      #     a: [1,    2,          3,   nil,        Float::NAN, nil, 1,   7],
      #     b: [:a,  :b,          nil, Float::NAN, nil,        3,   5,   8],
      #     c: ['a',  Float::NAN, 3,   4,          3,          5,   nil, 7]
      #   }, index: 11..18)
      #   df.replace_values nil, Float::NAN
      #   # => #<DaruLite::DataFrame(8x3)>
      #   #       a   b   c
      #   #   11   1   a   a
      #   #   12   2   b NaN
      #   #   13   3 NaN   3
      #   #   14 NaN NaN   4
      #   #   15 NaN NaN   3
      #   #   16 NaN   3   5
      #   #   17   1   5 NaN
      #   #   18   7   8   7
      def replace_values(old_values, new_value)
        @data.each { |vec| vec.replace_values old_values, new_value }
        self
      end

      # Test each row with one or more tests.
      # @param tests [Proc] Each test is a Proc with the form
      #                     *Proc.new {|row| row[:age] > 0}*
      # The function returns an array with all errors.
      #
      # If the first argument is a Symbol it is taken as the name of the
      # vector used to identify the row in the error messages; otherwise the
      # first vector is used. The last element of each test is called with
      # the row; a falsy result produces one error message.
      #
      # FIXME: description here is too sparse. As far as I can get,
      # it should tell something about that each test is [descr, fields, block],
      # and that first value may be column name to output. - zverok, 2016-05-18
      def verify(*tests)
        id = tests.first.is_a?(Symbol) ? tests.shift : @vectors.first

        each_row_with_index.map do |row, i|
          tests.reject { |*_, block| block.call(row) }
               .map { |test| verify_error_message row, test, id, i }
        end.flatten
      end

      # Non-destructive recode over vectors: works on a copy (+dup+) and
      # replaces each vector with the block's result.
      #
      # @yield [vector] must return a DaruLite::Vector
      # @raise [TypeError] if the block returns anything but a DaruLite::Vector
      # @return [DaruLite::DataFrame, Enumerator]
      def recode_vectors
        return to_enum(:recode_vectors) unless block_given?

        dup.tap do |df|
          df.each_vector_with_index do |v, i|
            df[*i] = should_be_vector!(yield(v))
          end
        end
      end

      # Non-destructive recode over rows: works on a copy (+dup+) and
      # replaces each row with the block's result.
      #
      # @yield [row] must return a DaruLite::Vector
      # @raise [TypeError] if the block returns anything but a DaruLite::Vector
      # @return [DaruLite::DataFrame, Enumerator]
      def recode_rows
        return to_enum(:recode_rows) unless block_given?

        dup.tap do |df|
          df.each_row_with_index do |r, i|
            df.row[i] = should_be_vector!(yield(r))
          end
        end
      end

      # Map each vector and return an Array.
      def map_vectors(&block)
        return to_enum(:map_vectors) unless block

        @data.map(&block)
      end

      # Destructive form of #map_vectors
      #
      # @raise [TypeError] if the block returns anything but a DaruLite::Vector
      # @return [DaruLite::DataFrame, Enumerator] self, or an Enumerator
      def map_vectors!
        return to_enum(:map_vectors!) unless block_given?

        # Iterate over a copy of the vector index because self[n]= is called
        # while looping.
        vectors.dup.each do |n|
          self[n] = should_be_vector!(yield(self[n]))
        end

        self
      end

      # Map vectors along with the index.
      def map_vectors_with_index(&block)
        return to_enum(:map_vectors_with_index) unless block

        each_vector_with_index.map(&block)
      end

      # Map each row
      def map_rows(&block)
        return to_enum(:map_rows) unless block

        each_row.map(&block)
      end

      # Map each row along with its index label.
      def map_rows_with_index(&block)
        return to_enum(:map_rows_with_index) unless block

        each_row_with_index.map(&block)
      end

      # Destructive form of #map_rows
      #
      # @raise [TypeError] if the block returns anything but a DaruLite::Vector
      # @return [DaruLite::DataFrame, Enumerator] self, or an Enumerator
      def map_rows!
        return to_enum(:map_rows!) unless block_given?

        # Iterate over a copy of the index because row[i]= is called while looping.
        index.dup.each do |i|
          row[i] = should_be_vector!(yield(row[i]))
        end

        self
      end

      # Applies +method+ to the DataFrame, or to the sub-DataFrame selected
      # by +keys+ when given.
      #
      # @param method [Symbol, Proc, Array<Symbol, Proc>] a method name sent to
      #   the (sub-)DataFrame, a callable receiving it, or a list of either
      # @param keys [Object, nil] passed to +get_sub_dataframe+; nil means
      #   the whole DataFrame
      # @param by_position [Boolean] forwarded to +get_sub_dataframe+
      # @raise [RuntimeError] if +method+ is none of the supported types
      # @return [Object, Array] the result, or an Array of results when
      #   +method+ is an Array
      def apply_method(method, keys: nil, by_position: true)
        df = keys ? get_sub_dataframe(keys, by_position: by_position) : self

        case method
        when Symbol then df.send(method)
        when Proc then method.call(df)
        when Array
          method.map(&:to_proc).map { |proc| proc.call(df) } # works with Array of both Symbol and/or Proc
        else
          # Same exception class as the former bare `raise`, now with a message.
          raise "Unsupported method type #{method.class}; expected Symbol, Proc or Array"
        end
      end
      alias apply_method_on_sub_df apply_method

      # Retrieves a DaruLite::Vector, based on the result of calculation
      # performed on each row.
      def collect_rows(&block)
        return to_enum(:collect_rows) unless block

        DaruLite::Vector.new(each_row.map(&block), index: @index)
      end

      # Same as #collect_rows, but the block also receives the row's index label.
      def collect_row_with_index(&block)
        return to_enum(:collect_row_with_index) unless block

        DaruLite::Vector.new(each_row_with_index.map(&block), index: @index)
      end

      # Retrieves a DaruLite::Vector, based on the result of calculation
      # performed on each vector.
      def collect_vectors(&block)
        return to_enum(:collect_vectors) unless block

        DaruLite::Vector.new(each_vector.map(&block), index: @vectors)
      end

      # Same as #collect_vectors, but the block also receives the vector's name.
      def collect_vector_with_index(&block)
        return to_enum(:collect_vector_with_index) unless block

        DaruLite::Vector.new(each_vector_with_index.map(&block), index: @vectors)
      end

      # Generate a matrix, based on vector names of the DataFrame.
      # The block is called with every ordered pair of vector names and its
      # results fill the corresponding matrix cell.
      #
      # @return {::Matrix}
      # :nocov:
      # FIXME: Even not trying to cover this: I can't get, how it is expected
      # to work.... -- zverok
      def collect_matrix
        return to_enum(:collect_matrix) unless block_given?

        vecs = vectors.to_a
        rows = vecs.collect do |row|
          vecs.collect do |col|
            yield row, col
          end
        end

        Matrix.rows(rows)
      end
      # :nocov:

      private

      # Returns +val+ when it is a DaruLite::Vector, raises TypeError otherwise.
      def should_be_vector!(val)
        return val if val.is_a?(DaruLite::Vector)

        raise TypeError, "Every iteration must return DaruLite::Vector not #{val.class}"
      end

      # Builds one #verify error line: "<i + 1> [<row id>]: <description>",
      # followed by " (field=value, ...)" when the test lists fields.
      def verify_error_message(row, test, id, i)
        description, fields, = test
        values = fields.empty? ? '' : " (#{fields.collect { |k| "#{k}=#{row[k]}" }.join(', ')})"
        "#{i + 1} [#{row[id]}]: #{description}#{values}"
      end
    end
  end
end
@@ -0,0 +1,152 @@
1
module DaruLite
  class DataFrame
    # Operations combining a DataFrame with another one (concat, union,
    # merge, join) and the one-to-many reshaping helper.
    module Joinable
      # Concatenate another DataFrame along corresponding columns.
      # If columns do not exist in both dataframes, they are filled with nils
      #
      # @param other_df [DaruLite::DataFrame]
      # @return [DaruLite::DataFrame] a new DataFrame
      def concat(other_df)
        vectors = (@vectors.to_a + other_df.vectors.to_a).uniq

        data = vectors.map do |v|
          get_vector_anyways(v).dup.concat(other_df.get_vector_anyways(v))
        end

        DaruLite::DataFrame.new(data, order: vectors)
      end

      # Concatenates another DataFrame as #concat.
      # Additionally it tries to preserve the index. If the indices contain
      # common elements, #union will overwrite the according rows in the
      # first dataframe.
      #
      # @param other_df [DaruLite::DataFrame]
      # @return [DaruLite::DataFrame] a new DataFrame
      def union(other_df)
        index = (@index.to_a + other_df.index.to_a).uniq
        # Keep only the rows of self whose labels are absent from other_df.
        df = row[*(@index.to_a - other_df.index.to_a)]

        df = df.concat(other_df)
        df.index = DaruLite::Index.new(index)
        df
      end

      # Merge vectors from two DataFrames. In case of name collision,
      # the vectors names are changed to x_1, x_2 ....
      #
      # Rows are paired by position. The index of the receiver is kept only
      # when both DataFrames have equal indexes.
      #
      # @param other_df [DaruLite::DataFrame]
      # @raise [ArgumentError] if the two DataFrames differ in number of rows
      # @return {DaruLite::DataFrame}
      def merge(other_df)
        unless nrows == other_df.nrows
          raise ArgumentError,
                "Number of rows must be equal in this: #{nrows} and other: #{other_df.nrows}"
        end

        new_fields = (@vectors.to_a + other_df.vectors.to_a)
        new_fields = ArrayHelper.recode_repeated(new_fields)
        DataFrame.new({}, order: new_fields).tap do |df_new|
          (0...nrows).each do |i|
            # i is a position, so use the positional accessor; row[i] would
            # first try i as an index label.
            df_new.add_row row_at(i).to_a + other_df.row_at(i).to_a
          end
          df_new.index = @index if @index == other_df.index
          df_new.update
        end
      end

      # Join 2 DataFrames with SQL style joins. Currently supports inner, left
      # outer, right outer and full outer joins.
      #
      # @param [DaruLite::DataFrame] other_df Another DataFrame on which the join is
      #   to be performed.
      # @param [Hash] opts Options Hash
      # @option :how [Symbol] Can be one of :inner, :left, :right or :outer.
      # @option :on [Array] The columns on which the join is to be performed.
      #   Column names specified here must be common to both DataFrames.
      # @option :indicator [Symbol] The name of a vector to add to the resultant
      #   dataframe that indicates whether the record was in the left (:left_only),
      #   right (:right_only), or both (:both) joining dataframes.
      # @return [DaruLite::DataFrame]
      # @example Inner Join
      #   left = DaruLite::DataFrame.new({
      #     :id   => [1,2,3,4],
      #     :name => ['Pirate', 'Monkey', 'Ninja', 'Spaghetti']
      #   })
      #   right = DaruLite::DataFrame.new({
      #     :id => [1,2,3,4],
      #     :name => ['Rutabaga', 'Pirate', 'Darth Vader', 'Ninja']
      #   })
      #   left.join(right, how: :inner, on: [:name])
      #   #=>
      #   ##<DaruLite::DataFrame:82416700 @name = 74c0811b-76c6-4c42-ac93-e6458e82afb0 @size = 2>
      #   #                 id_1       name       id_2
      #   #         0          1     Pirate          2
      #   #         1          3      Ninja          4
      def join(other_df, opts = {})
        DaruLite::Core::Merge.join(self, other_df, opts)
      end

      # Creates a new dataset for one to many relations
      # on a dataset, based on pattern of field names.
      #
      # for example, you have a survey for number of children
      # with this structure:
      #   id, name, child_name_1, child_age_1, child_name_2, child_age_2
      # with
      #   ds.one_to_many(['id'], 'child_%v_%n')
      # the field of first parameters will be copied verbatim
      # to new dataset, and fields which responds to second
      # pattern will be added one case for each different %n.
      #
      # The resulting vectors are the parent fields, then '_col_id' (the %n
      # value), then one vector per distinct %v. A case is skipped when all
      # of its generated values are nil. Vector names are matched with
      # String#scan and rebuilt as Strings, so they must be Strings. The
      # pattern is used as a regular expression source and is not escaped.
      #
      # @param parent_fields [Array] vectors copied verbatim into every case
      # @param pattern [String] field name pattern; %v marks the variable
      #   name and %n the number, with %v appearing before %n
      # @return [DaruLite::DataFrame]
      # @example
      #   cases=[
      #     ['1','george','red',10,'blue',20,nil,nil],
      #     ['2','fred','green',15,'orange',30,'white',20],
      #     ['3','alfred',nil,nil,nil,nil,nil,nil]
      #   ]
      #   ds=DaruLite::DataFrame.rows(cases, order:
      #     %w[id name
      #        car_color1 car_value1
      #        car_color2 car_value2
      #        car_color3 car_value3])
      #   ds.one_to_many(['id'],'car_%v%n')
      #   # => DataFrame with vectors id, _col_id, color, value and one row
      #   #    per car: george 1 and 2, fred 1, 2 and 3, none for alfred
      def one_to_many(parent_fields, pattern)
        vars, numbers = one_to_many_components(pattern)

        DataFrame.new([], order: [*parent_fields, '_col_id', *vars]).tap do |ds|
          each_row do |row|
            verbatim = parent_fields.map { |f| [f, row[f]] }.to_h
            numbers.each do |n|
              generated = one_to_many_row row, n, vars, pattern
              next if generated.values.all?(&:nil?)

              ds.add_row(verbatim.merge(generated).merge('_col_id' => n))
            end
          end
          ds.update
        end
      end

      private

      # Extracts the distinct %v names and the sorted distinct %n numbers
      # from the vector names matching +pattern+.
      #
      # The regexp is anchored to the whole vector name: #one_to_many_row
      # rebuilds names by exact substitution, and without the anchors the
      # lazy quantifiers would stop early (e.g. read "car_color10" as 1).
      #
      # @return [Array(Array<String>, Array<Integer>)] +[vars, numbers]+;
      #   both empty when no vector name matches
      def one_to_many_components(pattern)
        re = Regexp.new "\\A#{pattern.gsub('%v', '(.+?)').gsub('%n', '(\\d+?)')}\\z"

        vars, numbers =
          @vectors
          .map { |v| v.scan(re) }
          .reject(&:empty?).flatten(1).transpose

        # transpose of an empty list yields [], leaving both locals nil.
        [Array(vars).uniq, Array(numbers).map(&:to_i).sort.uniq]
      end

      # Builds a Hash of +{var => row[name]}+ for case +number+, where name
      # is +pattern+ with %v and %n substituted.
      def one_to_many_row(row, number, vars, pattern)
        vars
          .to_h do |v|
            name = pattern.sub('%v', v).sub('%n', number.to_s)
            [v, row[name]]
          end
      end
    end
  end
end