daru 0.1.3.1 → 0.1.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (92) hide show
  1. checksums.yaml +4 -4
  2. data/.gitignore +2 -0
  3. data/.rspec +2 -1
  4. data/.rspec_formatter.rb +33 -0
  5. data/.rubocop.yml +26 -2
  6. data/History.md +38 -0
  7. data/README.md +22 -13
  8. data/Rakefile +50 -2
  9. data/benchmarks/csv_reading.rb +22 -0
  10. data/daru.gemspec +9 -2
  11. data/lib/daru.rb +36 -4
  12. data/lib/daru/accessors/array_wrapper.rb +6 -1
  13. data/lib/daru/accessors/dataframe_by_row.rb +10 -2
  14. data/lib/daru/accessors/gsl_wrapper.rb +1 -3
  15. data/lib/daru/accessors/nmatrix_wrapper.rb +9 -0
  16. data/lib/daru/category.rb +935 -0
  17. data/lib/daru/core/group_by.rb +29 -38
  18. data/lib/daru/core/merge.rb +186 -145
  19. data/lib/daru/core/query.rb +22 -11
  20. data/lib/daru/dataframe.rb +976 -885
  21. data/lib/daru/date_time/index.rb +166 -166
  22. data/lib/daru/date_time/offsets.rb +66 -77
  23. data/lib/daru/formatters/table.rb +54 -0
  24. data/lib/daru/helpers/array.rb +40 -0
  25. data/lib/daru/index.rb +476 -73
  26. data/lib/daru/io/io.rb +66 -45
  27. data/lib/daru/io/sql_data_source.rb +33 -62
  28. data/lib/daru/iruby/helpers.rb +38 -0
  29. data/lib/daru/iruby/templates/dataframe.html.erb +52 -0
  30. data/lib/daru/iruby/templates/dataframe_mi.html.erb +58 -0
  31. data/lib/daru/iruby/templates/multi_index.html.erb +12 -0
  32. data/lib/daru/iruby/templates/vector.html.erb +27 -0
  33. data/lib/daru/iruby/templates/vector_mi.html.erb +36 -0
  34. data/lib/daru/maths/arithmetic/dataframe.rb +16 -18
  35. data/lib/daru/maths/arithmetic/vector.rb +4 -6
  36. data/lib/daru/maths/statistics/dataframe.rb +8 -15
  37. data/lib/daru/maths/statistics/vector.rb +120 -98
  38. data/lib/daru/monkeys.rb +12 -40
  39. data/lib/daru/plotting/gruff.rb +3 -0
  40. data/lib/daru/plotting/gruff/category.rb +49 -0
  41. data/lib/daru/plotting/gruff/dataframe.rb +91 -0
  42. data/lib/daru/plotting/gruff/vector.rb +57 -0
  43. data/lib/daru/plotting/nyaplot.rb +3 -0
  44. data/lib/daru/plotting/nyaplot/category.rb +34 -0
  45. data/lib/daru/plotting/nyaplot/dataframe.rb +187 -0
  46. data/lib/daru/plotting/nyaplot/vector.rb +46 -0
  47. data/lib/daru/vector.rb +694 -421
  48. data/lib/daru/version.rb +1 -1
  49. data/profile/_base.rb +23 -0
  50. data/profile/df_to_a.rb +10 -0
  51. data/profile/filter.rb +13 -0
  52. data/profile/joining.rb +13 -0
  53. data/profile/sorting.rb +12 -0
  54. data/profile/vector_each_with_index.rb +9 -0
  55. data/spec/accessors/wrappers_spec.rb +2 -4
  56. data/spec/categorical_spec.rb +1734 -0
  57. data/spec/core/group_by_spec.rb +52 -2
  58. data/spec/core/merge_spec.rb +63 -2
  59. data/spec/core/query_spec.rb +236 -80
  60. data/spec/dataframe_spec.rb +1373 -79
  61. data/spec/date_time/data_spec.rb +3 -5
  62. data/spec/date_time/index_spec.rb +154 -17
  63. data/spec/date_time/offsets_spec.rb +3 -4
  64. data/spec/fixtures/empties.dat +2 -0
  65. data/spec/fixtures/strings.dat +2 -0
  66. data/spec/formatters/table_formatter_spec.rb +99 -0
  67. data/spec/helpers_spec.rb +8 -0
  68. data/spec/index/categorical_index_spec.rb +168 -0
  69. data/spec/index/index_spec.rb +283 -0
  70. data/spec/index/multi_index_spec.rb +570 -0
  71. data/spec/io/io_spec.rb +31 -4
  72. data/spec/io/sql_data_source_spec.rb +0 -1
  73. data/spec/iruby/dataframe_spec.rb +172 -0
  74. data/spec/iruby/helpers_spec.rb +49 -0
  75. data/spec/iruby/multi_index_spec.rb +37 -0
  76. data/spec/iruby/vector_spec.rb +107 -0
  77. data/spec/math/arithmetic/dataframe_spec.rb +71 -13
  78. data/spec/math/arithmetic/vector_spec.rb +8 -10
  79. data/spec/math/statistics/dataframe_spec.rb +3 -5
  80. data/spec/math/statistics/vector_spec.rb +45 -55
  81. data/spec/monkeys_spec.rb +32 -9
  82. data/spec/plotting/dataframe_spec.rb +386 -0
  83. data/spec/plotting/vector_spec.rb +230 -0
  84. data/spec/shared/vector_display_spec.rb +215 -0
  85. data/spec/spec_helper.rb +23 -0
  86. data/spec/vector_spec.rb +905 -138
  87. metadata +143 -11
  88. data/.rubocop_todo.yml +0 -44
  89. data/lib/daru/plotting/dataframe.rb +0 -104
  90. data/lib/daru/plotting/vector.rb +0 -38
  91. data/spec/daru_spec.rb +0 -58
  92. data/spec/index_spec.rb +0 -375
@@ -33,7 +33,7 @@ module Daru
33
33
  end
34
34
 
35
35
  def inspect
36
- "(#{self.class}:#{object_id} bool_arry=#{@barry})"
36
+ "#<#{self.class}:#{object_id} bool_arry=#{@barry}>"
37
37
  end
38
38
  end
39
39
 
@@ -56,17 +56,28 @@ module Daru
56
56
  )
57
57
  end
58
58
 
59
- def vector_where data, index, bool_array, dtype
60
- new_data = []
61
- new_index = []
62
- bool_array.to_a.each_with_index do |b, i|
63
- if b
64
- new_data << data[i]
65
- new_index << index[i]
66
- end
67
- end
59
+ def vector_where dv, bool_array
60
+ new_data, new_index = fetch_new_data_and_index dv, bool_array
61
+
62
+ resultant_dv = Daru::Vector.new new_data,
63
+ index: dv.index.class.new(new_index),
64
+ dtype: dv.dtype,
65
+ type: dv.type,
66
+ name: dv.name
67
+
68
+ # Preserve categories order for category vector
69
+ resultant_dv.categories = dv.categories if dv.category?
70
+ resultant_dv
71
+ end
72
+
73
+ private
68
74
 
69
- Daru::Vector.new(new_data, index: new_index, dtype: dtype)
75
+ def fetch_new_data_and_index dv, bool_array
76
+ barry = bool_array.to_a
77
+ positions = dv.size.times.select { |i| barry[i] }
78
+ new_data = dv.to_a.values_at(*positions)
79
+ new_index = dv.index.to_a.values_at(*positions)
80
+ [new_data, new_index]
70
81
  end
71
82
  end
72
83
  end
@@ -1,14 +1,17 @@
1
1
  require 'daru/accessors/dataframe_by_row.rb'
2
2
  require 'daru/maths/arithmetic/dataframe.rb'
3
3
  require 'daru/maths/statistics/dataframe.rb'
4
- require 'daru/plotting/dataframe.rb'
4
+ require 'daru/plotting/gruff.rb'
5
+ require 'daru/plotting/nyaplot.rb'
5
6
  require 'daru/io/io.rb'
6
7
 
7
8
  module Daru
8
- class DataFrame
9
+ class DataFrame # rubocop:disable Metrics/ClassLength
9
10
  include Daru::Maths::Arithmetic::DataFrame
10
11
  include Daru::Maths::Statistics::DataFrame
11
- include Daru::Plotting::DataFrame if Daru.has_nyaplot?
12
+ # TODO: Remove this line, but it's causing errors due to an unknown reason
13
+ include Daru::Plotting::DataFrame::NyaplotLibrary if Daru.has_nyaplot?
14
+ extend Gem::Deprecate
12
15
 
13
16
  class << self
14
17
  # Load data from a CSV file. Specify an optional block to grab the CSV
@@ -112,29 +115,17 @@ module Daru
112
115
  # Create DataFrame by specifying rows as an Array of Arrays or Array of
113
116
  # Daru::Vector objects.
114
117
  def rows source, opts={}
115
- first = source.first
116
-
117
118
  raise SizeError, 'All vectors must have same length' \
118
- unless source.all? { |v| v.size == first.size }
119
-
120
- index = []
121
- opts[:order] ||=
122
- case first
123
- when Daru::Vector # assume that all are Vectors
124
- index = source.map(&:name)
125
- first.index.to_a
126
- when Array
127
- Array.new(first.size, &:to_s)
128
- end
119
+ unless source.all? { |v| v.size == source.first.size }
129
120
 
130
- if source.all? { |s| s.is_a?(Array) }
131
- Daru::DataFrame.new(source.transpose, opts)
132
- else # array of Daru::Vectors
133
- Daru::DataFrame.new({}, opts).tap do |df|
134
- source.each_with_index do |row, idx|
135
- df[index[idx] || idx, :row] = row
136
- end
137
- end
121
+ opts[:order] ||= guess_order(source)
122
+
123
+ if ArrayHelper.array_of?(source, Array)
124
+ DataFrame.new(source.transpose, opts)
125
+ elsif ArrayHelper.array_of?(source, Vector)
126
+ from_vector_rows(source, opts)
127
+ else
128
+ raise ArgumentError, "Can't create DataFrame from #{source}"
138
129
  end
139
130
  end
140
131
 
@@ -161,36 +152,47 @@ module Daru
161
152
  raise 'Three vectors should be equal size' if
162
153
  rows.size != columns.size || rows.size!=values.size
163
154
 
164
- cols_values = columns.factors
165
- cols_n = cols_values.size
155
+ data = Hash.new { |h, col|
156
+ h[col] = rows.factors.map { |r| [r, nil] }.to_h
157
+ }
158
+ columns.zip(rows, values).each { |c, r, v| data[c][r] = v }
166
159
 
167
- h_rows = rows.factors.each_with_object({}) do |v, a|
168
- a[v] = cols_values.each_with_object({}) do |v1, a1|
169
- a1[v1]=nil
170
- end
171
- end
160
+ # FIXME: in fact, WITHOUT this line you'll obtain more "right"
161
+ # data: with vectors having "rows" as an index...
162
+ data = data.map { |c, r| [c, r.values] }.to_h
163
+ data[:_id] = rows.factors
164
+
165
+ DataFrame.new(data)
166
+ end
167
+
168
+ private
172
169
 
173
- values.each_index do |i|
174
- h_rows[rows[i]][columns[i]] = values[i]
170
+ def guess_order source
171
+ case source.first
172
+ when Vector # assume that all are Vectors
173
+ source.first.index.to_a
174
+ when Array
175
+ Array.new(source.first.size, &:to_s)
175
176
  end
176
- df = Daru::DataFrame.new({}, order: [:_id] + cols_values.to_a)
177
+ end
177
178
 
178
- rows.factors.each do |row|
179
- n_row = Array.new(cols_n+1)
180
- n_row[0] = row
181
- cols_values.each_index do |i|
182
- n_row[i+1] = h_rows[row][cols_values[i]]
183
- end
179
+ def from_vector_rows source, opts
180
+ index = source.map(&:name)
181
+ .each_with_index.map { |n, i| n || i }
182
+ index = ArrayHelper.recode_repeated(index)
184
183
 
185
- df.add_row(n_row)
184
+ DataFrame.new({}, opts).tap do |df|
185
+ source.each_with_index do |row, idx|
186
+ df[index[idx] || idx, :row] = row
187
+ end
186
188
  end
187
- df.update
188
- df
189
189
  end
190
190
  end
191
191
 
192
192
  # The vectors (columns) index of the DataFrame
193
193
  attr_reader :vectors
194
+ # TOREMOVE
195
+ attr_reader :data
194
196
 
195
197
  # The index of the rows of the DataFrame
196
198
  attr_reader :index
@@ -237,135 +239,181 @@ module Daru
237
239
  # # b 7 2
238
240
  # # c 8 3
239
241
  # # d 9 4
240
- def initialize source, opts={}
241
- vectors = opts[:order]
242
- index = opts[:index]
243
- clone = opts[:clone] == false ? false : true
244
- @data = []
245
-
246
- temp_name = opts[:name]
247
- @name = temp_name || SecureRandom.uuid
248
-
249
- if source.empty?
250
- @vectors = try_create_index vectors
251
- @index = try_create_index index
242
+ def initialize source, opts={} # rubocop:disable Metrics/MethodLength
243
+ vectors, index = opts[:order], opts[:index] # FIXME: just keyword arges after Ruby 2.1
244
+ @data = []
245
+ @name = opts[:name]
246
+
247
+ case source
248
+ when ->(s) { s.empty? }
249
+ @vectors = Index.coerce vectors
250
+ @index = Index.coerce index
252
251
  create_empty_vectors
253
- else
254
- case source
255
- when Array
256
- if source.all? { |s| s.is_a?(Array) }
257
- raise ArgumentError, "Number of vectors (#{vectors.size}) should \
258
- equal order size (#{source.size})" if source.size != vectors.size
259
-
260
- @index = try_create_index(index || source[0].size)
261
- @vectors = try_create_index(vectors)
262
-
263
- @vectors.each_with_index do |_vec,idx|
264
- @data << Daru::Vector.new(source[idx], index: @index)
265
- end
266
- elsif source.all? { |s| s.is_a?(Daru::Vector) }
267
- hsh = {}
268
- vectors.each_with_index do |name, idx|
269
- hsh[name] = source[idx]
270
- end
271
- initialize(hsh, index: index, order: vectors, name: @name, clone: clone)
272
- else # array of hashes
273
- @vectors =
274
- if vectors.nil?
275
- Daru::Index.new source[0].keys
276
- else
277
- Daru::Index.new((vectors + (source[0].keys - vectors)).uniq)
278
- end
279
- @index = Daru::Index.new(index || source.size)
280
-
281
- @vectors.each do |name|
282
- v = []
283
- source.each do |h|
284
- v << (h[name] || h[name.to_s])
285
- end
286
-
287
- @data << Daru::Vector.new(v, name: set_name(name), index: @index)
288
- end
289
- end
290
- when Hash
291
- create_vectors_index_with vectors, source
292
- if all_daru_vectors_in_source? source
293
- vectors_have_same_index = all_vectors_have_equal_indexes?(source)
294
- if !index.nil?
295
- @index = try_create_index index
296
- elsif vectors_have_same_index
297
- @index = source.values[0].index.dup
298
- else
299
- all_indexes = []
300
- source.each_value do |vector|
301
- all_indexes << vector.index.to_a
302
- end
303
- # sort only if missing indexes detected
304
- all_indexes.flatten!.uniq!.sort!
305
-
306
- @index = Daru::Index.new all_indexes
307
- clone = true
308
- end
309
-
310
- if clone
311
- @vectors.each do |vector|
312
- # avoids matching indexes of vectors if all the supplied vectors
313
- # have the same index.
314
- if vectors_have_same_index
315
- v = source[vector].dup
316
- else
317
- v = Daru::Vector.new([], name: vector, metadata: source[vector].metadata.dup, index: @index)
318
-
319
- @index.each do |idx|
320
- v[idx] = source[vector].index.include?(idx) ? source[vector][idx] : nil
321
- end
322
- end
323
- @data << v
324
- end
325
- else
326
- @data.concat source.values
327
- end
328
- else
329
- @index = try_create_index(index || source.values[0].size)
330
-
331
- @vectors.each do |name|
332
- meta_opt = source[name].respond_to?(:metadata) ? {metadata: source[name].metadata.dup} : {}
333
- @data << Daru::Vector.new(source[name].dup, name: set_name(name), **meta_opt, index: @index)
334
- end
335
- end
336
- end
252
+ when Array
253
+ initialize_from_array source, vectors, index, opts
254
+ when Hash
255
+ initialize_from_hash source, vectors, index, opts
337
256
  end
338
257
 
339
258
  set_size
340
259
  validate
341
260
  update
261
+ self.plotting_library = Daru.plotting_library
342
262
  end
343
263
 
344
- def vector(*)
345
- $stderr.puts '#vector has been deprecated in favour of #[]. Please use that.'
346
- self[*names]
264
+ def plotting_library= lib
265
+ case lib
266
+ when :gruff, :nyaplot
267
+ @plotting_library = lib
268
+ extend Module.const_get(
269
+ "Daru::Plotting::DataFrame::#{lib.to_s.capitalize}Library"
270
+ ) if Daru.send("has_#{lib}?".to_sym)
271
+ else
272
+ raise ArguementError, "Plotting library #{lib} not supported. "\
273
+ 'Supported libraries are :nyaplot and :gruff'
274
+ end
347
275
  end
348
276
 
349
277
  # Access row or vector. Specify name of row/vector followed by axis(:row, :vector).
350
278
  # Defaults to *:vector*. Use of this method is not recommended for accessing
351
279
  # rows. Use df.row[:a] for accessing row with index ':a'.
352
280
  def [](*names)
353
- if names[-1] == :vector || names[-1] == :row
354
- axis = names[-1]
355
- names = names[0..-2]
281
+ axis = extract_axis(names, :vector)
282
+ dispatch_to_axis axis, :access, *names
283
+ end
284
+
285
+ # Retrieve rows by positions
286
+ # @param [Array<Integer>] *positions positions of rows to retrieve
287
+ # @return [Daru::Vector, Daru::DataFrame] vector for single position and dataframe for multiple positions
288
+ # @example
289
+ # df = Daru::DataFrame.new({
290
+ # a: [1, 2, 3],
291
+ # b: ['a', 'b', 'c']
292
+ # })
293
+ # df.row_at 1, 2
294
+ # # => #<Daru::DataFrame(2x2)>
295
+ # # a b
296
+ # # 1 2 b
297
+ # # 2 3 c
298
+ def row_at *positions
299
+ original_positions = positions
300
+ positions = coerce_positions(*positions, nrows)
301
+ validate_positions(*positions, nrows)
302
+
303
+ if positions.is_a? Integer
304
+ return Daru::Vector.new @data.map { |vec| vec.at(*positions) },
305
+ index: @vectors
356
306
  else
357
- axis = :vector
307
+ new_rows = @data.map { |vec| vec.at(*original_positions) }
308
+ return Daru::DataFrame.new new_rows,
309
+ index: @index.at(*original_positions),
310
+ order: @vectors
358
311
  end
312
+ end
359
313
 
360
- if axis == :vector
361
- access_vector(*names)
362
- elsif axis == :row
363
- access_row(*names)
314
+ # Set rows by positions
315
+ # @param [Array<Integer>] positions positions of rows to set
316
+ # @param [Array, Daru::Vector] vector vector to be assigned
317
+ # @example
318
+ # df = Daru::DataFrame.new({
319
+ # a: [1, 2, 3],
320
+ # b: ['a', 'b', 'c']
321
+ # })
322
+ # df.set_row_at [0, 1], ['x', 'x']
323
+ # df
324
+ # #=> #<Daru::DataFrame(3x2)>
325
+ # # a b
326
+ # # 0 x x
327
+ # # 1 x x
328
+ # # 2 3 c
329
+ def set_row_at positions, vector
330
+ validate_positions(*positions, nrows)
331
+ vector =
332
+ if vector.is_a? Daru::Vector
333
+ vector.reindex @vectors
334
+ else
335
+ Daru::Vector.new vector
336
+ end
337
+
338
+ raise SizeError, 'Vector length should match row length' if
339
+ vector.size != @vectors.size
340
+
341
+ @data.each_with_index do |vec, pos|
342
+ vec.set_at(positions, vector.at(pos))
343
+ end
344
+ @index = @data[0].index
345
+ set_size
346
+ end
347
+
348
+ # Retrieve vectors by positions
349
+ # @param [Array<Integer>] *positions positions of vectors to retrieve
350
+ # @return [Daru::Vector, Daru::DataFrame] vector for single position and dataframe for multiple positions
351
+ # @example
352
+ # df = Daru::DataFrame.new({
353
+ # a: [1, 2, 3],
354
+ # b: ['a', 'b', 'c']
355
+ # })
356
+ # df.at 0
357
+ # # => #<Daru::Vector(3)>
358
+ # # a
359
+ # # 0 1
360
+ # # 1 2
361
+ # # 2 3
362
+ def at *positions
363
+ if AXES.include? positions.last
364
+ axis = positions.pop
365
+ return row_at(*positions) if axis == :row
366
+ end
367
+
368
+ original_positions = positions
369
+ positions = coerce_positions(*positions, ncols)
370
+ validate_positions(*positions, ncols)
371
+
372
+ if positions.is_a? Integer
373
+ @data[positions].dup
364
374
  else
365
- raise IndexError, "Expected axis to be row or vector not #{axis}"
375
+ Daru::DataFrame.new positions.map { |pos| @data[pos].dup },
376
+ index: @index,
377
+ order: @vectors.at(*original_positions),
378
+ name: @name
366
379
  end
367
380
  end
368
381
 
382
+ # Set vectors by positions
383
+ # @param [Array<Integer>] positions positions of vectors to set
384
+ # @param [Array, Daru::Vector] vector vector to be assigned
385
+ # @example
386
+ # df = Daru::DataFrame.new({
387
+ # a: [1, 2, 3],
388
+ # b: ['a', 'b', 'c']
389
+ # })
390
+ # df.set_at [0], ['x', 'y', 'z']
391
+ # df
392
+ # #=> #<Daru::DataFrame(3x2)>
393
+ # # a b
394
+ # # 0 x a
395
+ # # 1 y b
396
+ # # 2 z c
397
+ def set_at positions, vector
398
+ if positions.last == :row
399
+ positions.pop
400
+ return set_row_at(positions, vector)
401
+ end
402
+
403
+ validate_positions(*positions, ncols)
404
+ vector =
405
+ if vector.is_a? Daru::Vector
406
+ vector.reindex @index
407
+ else
408
+ Daru::Vector.new vector
409
+ end
410
+
411
+ raise SizeError, 'Vector length should match index length' if
412
+ vector.size != @index.size
413
+
414
+ positions.each { |pos| @data[pos] = vector }
415
+ end
416
+
369
417
  # Insert a new row/vector of the specified name or modify a previous row.
370
418
  # Instead of using this method directly, use df.row[:a] = [1,2,3] to set/create
371
419
  # a row ':a' to [1,2,3], or df.vector[:vec] = [1,2,3] for vectors.
@@ -374,25 +422,11 @@ module Daru
374
422
  # of the vector will be matched against the row/vector indexes of the DataFrame
375
423
  # before an insertion is performed. Unmatched indexes will be set to nil.
376
424
  def []=(*args)
377
- axis = args.include?(:row) ? :row : :vector
378
- args.delete :vector
379
- args.delete :row
425
+ vector = args.pop
426
+ axis = extract_axis(args)
427
+ names = args
380
428
 
381
- name = args[0..-2]
382
- vector = args[-1]
383
-
384
- if axis == :vector
385
- insert_or_modify_vector name, vector
386
- elsif axis == :row
387
- insert_or_modify_row name, vector
388
- else
389
- raise IndexError, "Expected axis to be row or vector, not #{axis}."
390
- end
391
- end
392
-
393
- # Access a vector by name.
394
- def column name
395
- vector[name]
429
+ dispatch_to_axis axis, :insert_or_modify, names, vector
396
430
  end
397
431
 
398
432
  def add_row row, index=nil
@@ -421,10 +455,7 @@ module Daru
421
455
  def dup vectors_to_dup=nil
422
456
  vectors_to_dup = @vectors.to_a unless vectors_to_dup
423
457
 
424
- src = []
425
- vectors_to_dup.each do |vec|
426
- src << @data[@vectors[vec]].dup
427
- end
458
+ src = vectors_to_dup.map { |vec| @data[@vectors[vec]].dup }
428
459
  new_order = Daru::Index.new(vectors_to_dup)
429
460
 
430
461
  Daru::DataFrame.new src, order: new_order, index: @index.dup, name: @name, clone: true
@@ -443,20 +474,18 @@ module Daru
443
474
  # +vectors_to_clone+ - Names of vectors to clone. Optional. Will return
444
475
  # a view of the whole data frame otherwise.
445
476
  def clone *vectors_to_clone
446
- vectors_to_clone.flatten! unless vectors_to_clone.all? { |a| !a.is_a?(Array) }
477
+ vectors_to_clone.flatten! if ArrayHelper.array_of?(vectors_to_clone, Array)
447
478
  vectors_to_clone = @vectors.to_a if vectors_to_clone.empty?
448
479
 
449
- h = vectors_to_clone.each_with_object({}) do |vec, hsh|
450
- hsh[vec] = self[vec]
451
- end
452
- Daru::DataFrame.new(h, clone: false)
480
+ h = vectors_to_clone.map { |vec| [vec, self[vec]] }.to_h
481
+ Daru::DataFrame.new(h, clone: false, order: vectors_to_clone, name: @name)
453
482
  end
454
483
 
455
484
  # Returns a 'shallow' copy of DataFrame if missing data is not present,
456
485
  # or a full copy of only valid data if missing data is present.
457
486
  def clone_only_valid
458
- if has_missing_data?
459
- dup_only_valid
487
+ if include_values?(*Daru::MISSING_VALUES)
488
+ reject_values(*Daru::MISSING_VALUES)
460
489
  else
461
490
  clone
462
491
  end
@@ -465,19 +494,76 @@ module Daru
465
494
  # Creates a new duplicate dataframe containing only rows
466
495
  # without a single missing value.
467
496
  def dup_only_valid vecs=nil
468
- rows_with_nil = @data.each_with_object([]) do |vector, memo|
469
- memo.concat vector.missing_positions
470
- end.uniq
497
+ rows_with_nil = @data.map { |vec| vec.indexes(*Daru::MISSING_VALUES) }
498
+ .inject(&:concat)
499
+ .uniq
471
500
 
472
501
  row_indexes = @index.to_a
473
502
  (vecs.nil? ? self : dup(vecs)).row[*(row_indexes - rows_with_nil)]
474
503
  end
504
+ deprecate :dup_only_valid, :reject_values, 2016, 10
505
+
506
+ # Returns a dataframe in which rows with any of the mentioned values
507
+ # are ignored.
508
+ # @param [Array] *values values to reject to form the new dataframe
509
+ # @return [Daru::DataFrame] Data Frame with only rows which don't
510
+ # contain the mentioned values
511
+ # @example
512
+ # df = Daru::DataFrame.new({
513
+ # a: [1, 2, 3, nil, Float::NAN, nil, 1, 7],
514
+ # b: [:a, :b, nil, Float::NAN, nil, 3, 5, 8],
515
+ # c: ['a', Float::NAN, 3, 4, 3, 5, nil, 7]
516
+ # }, index: 11..18)
517
+ # df.reject_values nil, Float::NAN
518
+ # # => #<Daru::DataFrame(2x3)>
519
+ # # a b c
520
+ # # 11 1 a a
521
+ # # 18 7 8 7
522
+ def reject_values(*values)
523
+ positions =
524
+ size.times.to_a - @data.flat_map { |vec| vec.positions(*values) }
525
+ # Handle the case when positions size is 1 and #row_at wouldn't return a df
526
+ if positions.size == 1
527
+ pos = positions.first
528
+ row_at(pos..pos)
529
+ else
530
+ row_at(*positions)
531
+ end
532
+ end
533
+
534
+ # Replace specified values with given value
535
+ # @param [Array] old_values values to replace with new value
536
+ # @param [object] new_value new value to replace with
537
+ # @return [Daru::DataFrame] Data Frame itself with old values replaced
538
+ # with new value
539
+ # @example
540
+ # df = Daru::DataFrame.new({
541
+ # a: [1, 2, 3, nil, Float::NAN, nil, 1, 7],
542
+ # b: [:a, :b, nil, Float::NAN, nil, 3, 5, 8],
543
+ # c: ['a', Float::NAN, 3, 4, 3, 5, nil, 7]
544
+ # }, index: 11..18)
545
+ # df
546
+ # # => #<Daru::DataFrame(8x3)>
547
+ # # a b c
548
+ # # 11 1 a a
549
+ # # 12 2 b NaN
550
+ # # 13 3 NaN 3
551
+ # # 14 NaN NaN 4
552
+ # # 15 NaN NaN 3
553
+ # # 16 NaN 3 5
554
+ # # 17 1 5 NaN
555
+ # # 18 7 8 7
556
+ def replace_values old_values, new_value
557
+ @data.each { |vec| vec.replace_values old_values, new_value }
558
+ self
559
+ end
475
560
 
476
561
  # Iterate over each index of the DataFrame.
477
562
  def each_index &block
478
563
  return to_enum(:each_index) unless block_given?
479
564
 
480
565
  @index.each(&block)
566
+
481
567
  self
482
568
  end
483
569
 
@@ -509,8 +595,8 @@ module Daru
509
595
  def each_row
510
596
  return to_enum(:each_row) unless block_given?
511
597
 
512
- @index.each do |index|
513
- yield access_row(index)
598
+ @index.size.times do |pos|
599
+ yield row_at(pos)
514
600
  end
515
601
 
516
602
  self
@@ -540,13 +626,7 @@ module Daru
540
626
  # * +axis+ - The axis to iterate over. Can be :vector (or :column)
541
627
  # or :row. Default to :vector.
542
628
  def each axis=:vector, &block
543
- if axis == :vector || axis == :column
544
- each_vector(&block)
545
- elsif axis == :row
546
- each_row(&block)
547
- else
548
- raise ArgumentError, "Unknown axis #{axis}"
549
- end
629
+ dispatch_to_axis axis, :each, &block
550
630
  end
551
631
 
552
632
  # Iterate over a row or vector and return results in a Daru::Vector.
@@ -565,13 +645,7 @@ module Daru
565
645
  # * +axis+ - The axis to iterate over. Can be :vector (or :column)
566
646
  # or :row. Default to :vector.
567
647
  def collect axis=:vector, &block
568
- if axis == :vector || axis == :column
569
- collect_vectors(&block)
570
- elsif axis == :row
571
- collect_rows(&block)
572
- else
573
- raise ArgumentError, "Unknown axis #{axis}"
574
- end
648
+ dispatch_to_axis_pl axis, :collect, &block
575
649
  end
576
650
 
577
651
  # Map over each vector or row of the data frame according to
@@ -591,13 +665,7 @@ module Daru
591
665
  # * +axis+ - The axis to map over. Can be :vector (or :column) or :row.
592
666
  # Default to :vector.
593
667
  def map axis=:vector, &block
594
- if axis == :vector || axis == :column
595
- map_vectors(&block)
596
- elsif axis == :row
597
- map_rows(&block)
598
- else
599
- raise ArgumentError, "Unknown axis #{axis}"
600
- end
668
+ dispatch_to_axis_pl axis, :map, &block
601
669
  end
602
670
 
603
671
  # Destructive map. Modifies the DataFrame. Each run of the block
@@ -634,11 +702,7 @@ module Daru
634
702
  # * +axis+ - The axis to map over. Can be :vector (or :column) or :row.
635
703
  # Default to :vector.
636
704
  def recode axis=:vector, &block
637
- if axis == :vector || axis == :column
638
- recode_vectors(&block)
639
- elsif axis == :row
640
- recode_rows(&block)
641
- end
705
+ dispatch_to_axis_pl axis, :recode, &block
642
706
  end
643
707
 
644
708
  # Retain vectors or rows if the block returns a truthy value.
@@ -670,50 +734,34 @@ module Daru
670
734
  # row[:a] + row[:d] < 100
671
735
  # end
672
736
  def filter axis=:vector, &block
673
- if axis == :vector || axis == :column
674
- filter_vectors(&block)
675
- elsif axis == :row
676
- filter_rows(&block)
677
- end
737
+ dispatch_to_axis_pl axis, :filter, &block
678
738
  end
679
739
 
680
740
  def recode_vectors
681
741
  block_given? or return to_enum(:recode_vectors)
682
742
 
683
- df = dup
684
- df.each_vector_with_index do |v, i|
685
- ret = yield v
686
- ret.is_a?(Daru::Vector) or
687
- raise TypeError, "Every iteration must return Daru::Vector not #{ret.class}"
688
- df[*i] = ret
743
+ dup.tap do |df|
744
+ df.each_vector_with_index do |v, i|
745
+ df[*i] = should_be_vector!(yield(v))
746
+ end
689
747
  end
690
-
691
- df
692
748
  end
693
749
 
694
750
  def recode_rows
695
751
  block_given? or return to_enum(:recode_rows)
696
752
 
697
- df = dup
698
- df.each_row_with_index do |r, i|
699
- ret = yield r
700
- ret.is_a?(Daru::Vector) or raise TypeError, "Every iteration must return Daru::Vector not #{ret.class}"
701
- df.row[i] = ret
753
+ dup.tap do |df|
754
+ df.each_row_with_index do |r, i|
755
+ df.row[i] = should_be_vector!(yield(r))
756
+ end
702
757
  end
703
-
704
- df
705
758
  end
706
759
 
707
760
  # Map each vector and return an Array.
708
- def map_vectors
761
+ def map_vectors &block
709
762
  return to_enum(:map_vectors) unless block_given?
710
763
 
711
- arry = []
712
- @data.each do |vec|
713
- arry << yield(vec)
714
- end
715
-
716
- arry
764
+ @data.map(&block)
717
765
  end
718
766
 
719
767
  # Destructive form of #map_vectors
@@ -721,56 +769,37 @@ module Daru
721
769
  return to_enum(:map_vectors!) unless block_given?
722
770
 
723
771
  vectors.dup.each do |n|
724
- v = yield self[n]
725
- v.is_a?(Daru::Vector) or raise TypeError, "Must return a Daru::Vector not #{v.class}"
726
- self[n] = v
772
+ self[n] = should_be_vector!(yield(self[n]))
727
773
  end
728
774
 
729
775
  self
730
776
  end
731
777
 
732
778
  # Map vectors along with the index.
733
- def map_vectors_with_index
779
+ def map_vectors_with_index &block
734
780
  return to_enum(:map_vectors_with_index) unless block_given?
735
781
 
736
- dt = []
737
- each_vector_with_index do |vector, name|
738
- dt << yield(vector, name)
739
- end
740
-
741
- dt
782
+ each_vector_with_index.map(&block)
742
783
  end
743
784
 
744
785
  # Map each row
745
- def map_rows
786
+ def map_rows &block
746
787
  return to_enum(:map_rows) unless block_given?
747
788
 
748
- dt = []
749
- each_row do |row|
750
- dt << yield(row)
751
- end
752
-
753
- dt
789
+ each_row.map(&block)
754
790
  end
755
791
 
756
- def map_rows_with_index
792
+ def map_rows_with_index &block
757
793
  return to_enum(:map_rows_with_index) unless block_given?
758
794
 
759
- dt = []
760
- each_row_with_index do |row, index|
761
- dt << yield(row, index)
762
- end
763
-
764
- dt
795
+ each_row_with_index.map(&block)
765
796
  end
766
797
 
767
798
  def map_rows!
768
799
  return to_enum(:map_rows!) unless block_given?
769
800
 
770
801
  index.dup.each do |i|
771
- r = yield row[i]
772
- r.is_a?(Daru::Vector) or raise TypeError, "Returned object must be Daru::Vector not #{r.class}"
773
- row[i] = r
802
+ row[i] = should_be_vector!(yield(row[i]))
774
803
  end
775
804
 
776
805
  self
@@ -778,55 +807,38 @@ module Daru
778
807
 
779
808
  # Retrieves a Daru::Vector, based on the result of calculation
780
809
  # performed on each row.
781
- def collect_rows
810
+ def collect_rows &block
782
811
  return to_enum(:collect_rows) unless block_given?
783
812
 
784
- data = []
785
- each_row do |row|
786
- data.push yield(row)
787
- end
788
-
789
- Daru::Vector.new(data, index: @index)
813
+ Daru::Vector.new(each_row.map(&block), index: @index)
790
814
  end
791
815
 
792
- def collect_row_with_index
816
+ def collect_row_with_index &block
793
817
  return to_enum(:collect_row_with_index) unless block_given?
794
818
 
795
- data = []
796
- each_row_with_index do |row, i|
797
- data.push yield(row, i)
798
- end
799
-
800
- Daru::Vector.new(data, index: @index)
819
+ Daru::Vector.new(each_row_with_index.map(&block), index: @index)
801
820
  end
802
821
 
803
822
  # Retrieves a Daru::Vector, based on the result of calculation
804
823
  # performed on each vector.
805
- def collect_vectors
824
+ def collect_vectors &block
806
825
  return to_enum(:collect_vectors) unless block_given?
807
826
 
808
- data = []
809
- each_vector do |vec|
810
- data.push yield(vec)
811
- end
812
-
813
- Daru::Vector.new(data, index: @vectors)
827
+ Daru::Vector.new(each_vector.map(&block), index: @vectors)
814
828
  end
815
829
 
816
- def collect_vector_with_index
830
+ def collect_vector_with_index &block
817
831
  return to_enum(:collect_vector_with_index) unless block_given?
818
832
 
819
- data = []
820
- each_vector_with_index do |vec, i|
821
- data.push yield(vec, i)
822
- end
823
-
824
- Daru::Vector.new(data, index: @vectors)
833
+ Daru::Vector.new(each_vector_with_index.map(&block), index: @vectors)
825
834
  end
826
835
 
827
836
  # Generate a matrix, based on vector names of the DataFrame.
828
837
  #
829
838
  # @return {::Matrix}
839
+ # :nocov:
840
+ # FIXME: Even not trying to cover this: I can't get, how it is expected
841
+ # to work.... -- zverok
830
842
  def collect_matrix
831
843
  return to_enum(:collect_matrix) unless block_given?
832
844
 
@@ -839,6 +851,7 @@ module Daru
839
851
 
840
852
  Matrix.rows(rows)
841
853
  end
854
+ # :nocov:
842
855
 
843
856
  # Delete a vector
844
857
  def delete_vector vector
@@ -876,43 +889,29 @@ module Daru
876
889
  # @return {Daru::DataFrame}
877
890
  def bootstrap(n=nil)
878
891
  n ||= nrows
879
- ds_boot = Daru::DataFrame.new({}, order: @vectors)
880
- n.times do
881
- ds_boot.add_row(row[rand(n)])
892
+ Daru::DataFrame.new({}, order: @vectors).tap do |df_boot|
893
+ n.times do
894
+ df_boot.add_row(row[rand(n)])
895
+ end
896
+ df_boot.update
882
897
  end
883
- ds_boot.update
884
- ds_boot
885
898
  end
886
899
 
887
900
  def keep_row_if
888
- deletion = []
889
-
890
- @index.each do |index|
891
- keep_row = yield access_row(index)
892
-
893
- deletion << index unless keep_row
894
- end
895
- deletion.each { |idx|
896
- delete_row idx
897
- }
901
+ @index
902
+ .reject { |idx| yield access_row(idx) }
903
+ .each { |idx| delete_row idx }
898
904
  end
899
905
 
900
906
  def keep_vector_if
901
907
  @vectors.each do |vector|
902
- keep_vector = yield @data[@vectors[vector]], vector
903
-
904
- delete_vector vector unless keep_vector
908
+ delete_vector(vector) unless yield(@data[@vectors[vector]], vector)
905
909
  end
906
910
  end
907
911
 
908
912
  # creates a new vector with the data of a given field which the block returns true
909
- def filter_vector vec
910
- d = []
911
- each_row do |row|
912
- d.push(row[vec]) if yield row
913
- end
914
-
915
- Daru::Vector.new(d, metadata: self[vec].metadata.dup)
913
+ def filter_vector vec, &block
914
+ Daru::Vector.new each_row.select(&block).map { |row| row[vec] }
916
915
  end
917
916
 
918
917
  # Iterates over each row and retains it in a new DataFrame if the block returns
@@ -930,38 +929,24 @@ module Daru
930
929
  def filter_vectors &block
931
930
  return to_enum(:filter_vectors) unless block_given?
932
931
 
933
- df = dup
934
- df.keep_vector_if(&block)
935
-
936
- df
932
+ dup.tap { |df| df.keep_vector_if(&block) }
937
933
  end
938
934
 
939
935
  # Test each row with one or more tests. Each test is a Proc with the form
940
936
  # *Proc.new {|row| row[:age] > 0}*
941
937
  #
942
938
  # The function returns an array with all errors.
939
+ #
940
+ # FIXME: description here is too sparse. As far as I can get,
941
+ # it should tell something about that each test is [descr, fields, block],
942
+ # and that first value may be column name to output. - zverok, 2016-05-18
943
943
  def verify(*tests)
944
- if tests[0].is_a? Symbol
945
- id = tests[0]
946
- tests.shift
947
- else
948
- id = @vectors.first
949
- end
950
-
951
- vr = []
952
- i = 0
953
- each(:row) do |row|
954
- i += 1
955
- tests.each do |test|
956
- next if test[2].call(row)
957
- values = ''
958
- unless test[1].empty?
959
- values = ' (' + test[1].collect { |k| "#{k}=#{row[k]}" }.join(', ') + ')'
960
- end
961
- vr.push("#{i} [#{row[id]}]: #{test[0]}#{values}")
962
- end
963
- end
964
- vr
944
+ id = tests.first.is_a?(Symbol) ? tests.shift : @vectors.first
945
+
946
+ each_row_with_index.map do |row, i|
947
+ tests.reject { |*_, block| block.call(row) }
948
+ .map { |test| verify_error_message row, test, id, i }
949
+ end.flatten
965
950
  end
966
951
 
967
952
  # DSL for yielding each row and returning a Daru::Vector based on the
@@ -984,10 +969,7 @@ module Daru
984
969
  # # 5 666
985
970
  # # 6 777
986
971
  def vector_by_calculation &block
987
- a = []
988
- each_row do |r|
989
- a.push r.instance_eval(&block)
990
- end
972
+ a = each_row.map { |r| r.instance_eval(&block) }
991
973
 
992
974
  Daru::Vector.new a, index: @index
993
975
  end
@@ -1016,10 +998,8 @@ module Daru
1016
998
  # * +missing_values+ - An Array of the values that should be
1017
999
  # treated as 'missing'. The default missing value is *nil*.
1018
1000
  def missing_values_rows missing_values=[nil]
1019
- number_of_missing = []
1020
- each_row do |row|
1021
- row.missing_values = missing_values
1022
- number_of_missing << row.missing_positions.size
1001
+ number_of_missing = each_row.map do |row|
1002
+ row.indexes(*missing_values).size
1023
1003
  end
1024
1004
 
1025
1005
  Daru::Vector.new number_of_missing, index: @index, name: "#{@name}_missing_rows"
@@ -1029,67 +1009,77 @@ module Daru
1029
1009
  alias :vector_missing_values :missing_values_rows
1030
1010
 
1031
1011
  def has_missing_data?
1032
- !!@data.any?(&:has_missing_data?)
1012
+ !!@data.any? { |vec| vec.include_values?(*Daru::MISSING_VALUES) }
1033
1013
  end
1034
-
1035
1014
  alias :flawed? :has_missing_data?
1015
+ deprecate :has_missing_data?, :include_values?, 2016, 10
1016
+ deprecate :flawed?, :include_values?, 2016, 10
1017
+
1018
+ # Check if any of given values occur in the data frame
1019
+ # @param [Array] *values values to check for
1020
+ # @return [true, false] true if any of the given values occur in the
1021
+ # dataframe, false otherwise
1022
+ # @example
1023
+ # df = Daru::DataFrame.new({
1024
+ # a: [1, 2, 3, nil, Float::NAN, nil, 1, 7],
1025
+ # b: [:a, :b, nil, Float::NAN, nil, 3, 5, 8],
1026
+ # c: ['a', Float::NAN, 3, 4, 3, 5, nil, 7]
1027
+ # }, index: 11..18)
1028
+ # df.include_values? nil
1029
+ # # => true
1030
+ def include_values?(*values)
1031
+ @data.any? { |vec| vec.include_values?(*values) }
1032
+ end
1036
1033
 
1037
1034
  # Return a nested hash using vector names as keys and an array constructed of
1038
1035
  # hashes with other values. If block provided, is used to provide the
1039
1036
  # values, with parameters +row+ of dataset, +current+ last hash on
1040
1037
  # hierarchy and +name+ of the key to include
1041
- def nest *tree_keys, &block
1038
+ def nest *tree_keys, &_block
1042
1039
  tree_keys = tree_keys[0] if tree_keys[0].is_a? Array
1043
- out = {}
1044
1040
 
1045
- each_row do |row|
1046
- current = out
1041
+ each_row.each_with_object({}) do |row, current|
1047
1042
  # Create tree
1048
- tree_keys[0, tree_keys.size-1].each do |f|
1049
- root = row[f]
1050
- current[root] ||= {}
1051
- current = current[root]
1052
- end
1053
- name = row[tree_keys.last]
1054
- if !block
1043
+ *keys, last = tree_keys
1044
+ current = keys.inject(current) { |c, f| c[row[f]] ||= {} }
1045
+ name = row[last]
1046
+
1047
+ if block_given?
1048
+ current[name] = yield(row, current, name)
1049
+ else
1055
1050
  current[name] ||= []
1056
1051
  current[name].push(row.to_h.delete_if { |key,_value| tree_keys.include? key })
1057
- else
1058
- current[name] = yield(row, current, name)
1059
1052
  end
1060
1053
  end
1061
-
1062
- out
1063
1054
  end
1064
1055
 
1065
1056
  def vector_count_characters vecs=nil
1066
1057
  vecs ||= @vectors.to_a
1067
1058
 
1068
1059
  collect_rows do |row|
1069
- vecs.inject(0) do |memo, vec|
1070
- memo + (row[vec].nil? ? 0 : row[vec].to_s.size)
1071
- end
1060
+ vecs.map { |v| row[v].to_s.size }.inject(:+)
1072
1061
  end
1073
1062
  end
1074
1063
 
1075
1064
  def add_vectors_by_split(name,join='-',sep=Daru::SPLIT_TOKEN)
1076
- split = self[name].split_by_separator(sep)
1077
- split.each { |k,v| self[(name.to_s + join + k.to_s).to_sym] = v }
1065
+ self[name]
1066
+ .split_by_separator(sep)
1067
+ .each { |k,v| self["#{name}#{join}#{k}".to_sym] = v }
1078
1068
  end
1079
1069
 
1080
1070
  # Return the number of rows and columns of the DataFrame in an Array.
1081
1071
  def shape
1082
- [@index.size, @vectors.size]
1072
+ [nrows, ncols]
1083
1073
  end
1084
1074
 
1085
1075
  # The number of rows
1086
1076
  def nrows
1087
- shape[0]
1077
+ @index.size
1088
1078
  end
1089
1079
 
1090
1080
  # The number of vectors
1091
1081
  def ncols
1092
- shape[1]
1082
+ @vectors.size
1093
1083
  end
1094
1084
 
1095
1085
  # Check if a vector is present
@@ -1132,10 +1122,7 @@ module Daru
1132
1122
  if axis == :vector || axis == :column
1133
1123
  @data.all?(&block)
1134
1124
  elsif axis == :row
1135
- each_row do |row|
1136
- return false unless yield(row)
1137
- end
1138
- return true
1125
+ each_row.all?(&block)
1139
1126
  else
1140
1127
  raise ArgumentError, "Unidentified axis #{axis}"
1141
1128
  end
@@ -1145,7 +1132,7 @@ module Daru
1145
1132
  #
1146
1133
  # @param [Fixnum] quantity (10) The number of elements to display from the top.
1147
1134
  def head quantity=10
1148
- self[0..(quantity-1), :row]
1135
+ row.at 0..(quantity-1)
1149
1136
  end
1150
1137
 
1151
1138
  alias :first :head
@@ -1154,22 +1141,19 @@ module Daru
1154
1141
  #
1155
1142
  # @param [Fixnum] quantity (10) The number of elements to display from the bottom.
1156
1143
  def tail quantity=10
1157
- self[(@size - quantity)..(@size-1), :row]
1144
+ start = [-quantity, -size].max
1145
+ row.at start..-1
1158
1146
  end
1159
1147
 
1160
1148
  alias :last :tail
1161
1149
 
1162
1150
  # Returns a vector with sum of all vectors specified in the argument.
1163
- # Tf vecs parameter is empty, sum all numeric vector.
1151
+ # If vecs parameter is empty, sum all numeric vector.
1164
1152
  def vector_sum vecs=nil
1165
1153
  vecs ||= numeric_vectors
1166
1154
  sum = Daru::Vector.new [0]*@size, index: @index, name: @name, dtype: @dtype
1167
1155
 
1168
- vecs.each do |n|
1169
- sum += self[n]
1170
- end
1171
-
1172
- sum
1156
+ vecs.inject(sum) { |memo, n| memo + self[n] }
1173
1157
  end
1174
1158
 
1175
1159
  # Calculate mean of the rows of the dataframe.
@@ -1179,13 +1163,13 @@ module Daru
1179
1163
  # * +max_missing+ - The maximum number of elements in the row that can be
1180
1164
  # zero for the mean calculation to happen. Default to 0.
1181
1165
  def vector_mean max_missing=0
1166
+ # FIXME: in vector_sum we preserve created vector dtype, but
1167
+ # here we are not. Is this by design or ...? - zverok, 2016-05-18
1182
1168
  mean_vec = Daru::Vector.new [0]*@size, index: @index, name: "mean_#{@name}"
1183
1169
 
1184
- each_row_with_index do |row, i|
1185
- mean_vec[i] = row.missing_positions.size > max_missing ? nil : row.mean
1170
+ each_row_with_index.each_with_object(mean_vec) do |(row, i), memo|
1171
+ memo[i] = row.indexes(*Daru::MISSING_VALUES).size > max_missing ? nil : row.mean
1186
1172
  end
1187
-
1188
- mean_vec
1189
1173
  end
1190
1174
 
1191
1175
  # Group elements by vector to perform operations on them. Returns a
@@ -1214,6 +1198,8 @@ module Daru
1214
1198
  # # ["foo", "two", 3]=>[2, 4]}
1215
1199
  def group_by *vectors
1216
1200
  vectors.flatten!
1201
+ # FIXME: wouldn't it better to do vectors - @vectors here and
1202
+ # raise one error with all non-existent vector names?.. - zverok, 2016-05-18
1217
1203
  vectors.each { |v|
1218
1204
  raise(ArgumentError, "Vector #{v} does not exist") unless has_vector?(v)
1219
1205
  }
@@ -1226,28 +1212,22 @@ module Daru
1226
1212
  "subclasses, not #{new_index.class}" unless new_vectors.is_a?(Daru::Index)
1227
1213
 
1228
1214
  cl = Daru::DataFrame.new({}, order: new_vectors, index: @index, name: @name)
1229
- new_vectors.each do |vec|
1230
- cl[vec] = @vectors.include?(vec) ? self[vec] : cl[vec] = [nil]*nrows
1215
+ new_vectors.each_with_object(cl) do |vec, memo|
1216
+ memo[vec] = @vectors.include?(vec) ? self[vec] : [nil]*nrows
1231
1217
  end
1218
+ end
1232
1219
 
1233
- cl
1220
+ def get_vector_anyways(v)
1221
+ @vectors.include?(v) ? self[v].to_a : [nil] * size
1234
1222
  end
1235
1223
 
1236
1224
  # Concatenate another DataFrame along corresponding columns.
1237
1225
  # If columns do not exist in both dataframes, they are filled with nils
1238
1226
  def concat other_df
1239
- vectors = @vectors.to_a
1240
- data = []
1227
+ vectors = (@vectors.to_a + other_df.vectors.to_a).uniq
1241
1228
 
1242
- vectors.each do |v|
1243
- other_vec = other_df.vectors.include?(v) ? other_df[v].to_a : [nil] * other_df.size
1244
- data << self[v].dup.to_a.concat(other_vec)
1245
- end
1246
-
1247
- other_df.vectors.each do |v|
1248
- next if vectors.include?(v)
1249
- vectors << v
1250
- data << ([nil] * size).concat(other_df[v].to_a)
1229
+ data = vectors.map do |v|
1230
+ get_vector_anyways(v).dup.concat(other_df.get_vector_anyways(v))
1251
1231
  end
1252
1232
 
1253
1233
  Daru::DataFrame.new(data, order: vectors)
@@ -1291,11 +1271,9 @@ module Daru
1291
1271
  "subclasses, not #{new_index.class}" unless new_index.is_a?(Daru::Index)
1292
1272
 
1293
1273
  cl = Daru::DataFrame.new({}, order: @vectors, index: new_index, name: @name)
1294
- new_index.each do |idx|
1295
- cl.row[idx] = @index.include?(idx) ? row[idx] : [nil]*ncols
1274
+ new_index.each_with_object(cl) do |idx, memo|
1275
+ memo.row[idx] = @index.include?(idx) ? row[idx] : [nil]*ncols
1296
1276
  end
1297
-
1298
- cl
1299
1277
  end
1300
1278
 
1301
1279
  # Reassign index with a new index of type Daru::Index or any of its subclasses.
@@ -1310,8 +1288,8 @@ module Daru
1310
1288
  # df.index.to_a #=> ['a','b','c','d']
1311
1289
  # df.row['a'].to_a #=> [1,11]
1312
1290
  def index= idx
1313
- @data.each { |vec| vec.index = idx }
1314
- @index = idx
1291
+ @index = Index.coerce idx
1292
+ @data.each { |vec| vec.index = @index }
1315
1293
 
1316
1294
  self
1317
1295
  end
@@ -1361,21 +1339,14 @@ module Daru
1361
1339
  # Return the indexes of all the numeric vectors. Will include vectors with nils
1362
1340
  # alongwith numbers.
1363
1341
  def numeric_vectors
1364
- numerics = []
1365
-
1366
- each_vector_with_index do |vec, i|
1367
- numerics << i if vec.type == :numeric
1368
- end
1369
- numerics
1342
+ # FIXME: Why _with_index ?..
1343
+ each_vector_with_index
1344
+ .select { |vec, _i| vec.numeric? }
1345
+ .map(&:last)
1370
1346
  end
1371
1347
 
1372
1348
  def numeric_vector_names
1373
- numerics = []
1374
-
1375
- @vectors.each do |v|
1376
- numerics << v if self[v].type == :numeric
1377
- end
1378
- numerics
1349
+ @vectors.select { |v| self[v].numeric? }
1379
1350
  end
1380
1351
 
1381
1352
  # Return a DataFrame of only the numerical Vectors. If clone: false
@@ -1383,12 +1354,9 @@ module Daru
1383
1354
  # returned. Defaults to clone: true.
1384
1355
  def only_numerics opts={}
1385
1356
  cln = opts[:clone] == false ? false : true
1386
- nv = numeric_vectors
1387
- arry = nv.each_with_object([]) do |v, arr|
1388
- arr << self[v]
1389
- end
1357
+ arry = numeric_vectors.map { |v| self[v] }
1390
1358
 
1391
- order = Index.new(nv)
1359
+ order = Index.new(numeric_vectors)
1392
1360
  Daru::DataFrame.new(arry, clone: cln, order: order, index: @index)
1393
1361
  end
1394
1362
 
@@ -1492,39 +1460,24 @@ module Daru
1492
1460
 
1493
1461
  def sort! vector_order, opts={}
1494
1462
  raise ArgumentError, 'Required atleast one vector name' if vector_order.empty?
1495
- opts = {
1496
- ascending: true,
1497
- handle_nils: false,
1498
- by: {}
1499
- }.merge(opts)
1500
1463
 
1501
- opts[:ascending] = sort_order_array vector_order, opts[:ascending]
1502
- opts[:handle_nils] = handle_nils_array vector_order, opts[:handle_nils]
1503
- blocks = create_logic_blocks vector_order, opts[:by], opts[:ascending]
1464
+ # To enable sorting with categorical data,
1465
+ # map categories to integers preserving their order
1466
+ old = convert_categorical_vectors vector_order
1467
+ block = sort_prepare_block vector_order, opts
1504
1468
 
1505
- block = lambda do |r1, r2|
1506
- # Build left and right array to compare two rows
1507
- left = build_array_from_blocks vector_order, opts, blocks, r1, r2
1508
- right = build_array_from_blocks vector_order, opts, blocks, r2, r1
1469
+ order = @index.size.times.sort(&block)
1470
+ new_index = @index.reorder order
1509
1471
 
1510
- # Resolve conflict by Index if all attributes are same
1511
- left << r1
1512
- right << r2
1513
- left <=> right
1514
- end
1472
+ # To reverse map mapping of categorical data to integers
1473
+ restore_categorical_vectors old
1515
1474
 
1516
- idx = (0..@index.size-1).sort(&block)
1517
-
1518
- old_index = @index.to_a
1519
- self.index = Daru::Index.new(idx.map { |i| old_index[i] })
1520
-
1521
- vectors.each do |v|
1522
- @data[@vectors[v]] = Daru::Vector.new(
1523
- idx.map { |i| @data[@vectors[v]].data[i] },
1524
- name: self[v].name, metadata: self[v].metadata.dup, index: index
1525
- )
1475
+ @data.each do |vector|
1476
+ vector.reorder! order
1526
1477
  end
1527
1478
 
1479
+ self.index = new_index
1480
+
1528
1481
  self
1529
1482
  end
1530
1483
 
@@ -1568,90 +1521,41 @@ module Daru
1568
1521
  # # [:bar] 18 26
1569
1522
  # # [:foo] 10 12
1570
1523
  def pivot_table opts={}
1571
- raise ArgumentError,
1572
- 'Specify grouping index' if !opts[:index] || opts[:index].empty?
1573
-
1574
- index = opts[:index]
1575
- vectors = opts[:vectors] || []
1576
- aggregate_function = opts[:agg] || :mean
1577
- values =
1578
- if opts[:values].is_a?(Symbol)
1579
- [opts[:values]]
1580
- elsif opts[:values].is_a?(Array)
1581
- opts[:values]
1582
- else # nil
1583
- (@vectors.to_a - (index | vectors)) & numeric_vector_names
1584
- end
1524
+ raise ArgumentError, 'Specify grouping index' if opts[:index].to_a.empty?
1585
1525
 
1526
+ index = opts[:index]
1527
+ vectors = opts[:vectors] || []
1528
+ aggregate_function = opts[:agg] || :mean
1529
+ values = prepare_pivot_values index, vectors, opts
1586
1530
  raise IndexError, 'No numeric vectors to aggregate' if values.empty?
1587
1531
 
1588
1532
  grouped = group_by(index)
1533
+ return grouped.send(aggregate_function) if vectors.empty?
1589
1534
 
1590
- if vectors.empty?
1591
- grouped.send(aggregate_function)
1592
- else
1593
- super_hash = {}
1594
- values.each do |value|
1595
- grouped.groups.each do |group_name, row_numbers|
1596
- super_hash[group_name] ||= {}
1535
+ super_hash = make_pivot_hash grouped, vectors, values, aggregate_function
1597
1536
 
1598
- row_numbers.each do |num|
1599
- arry = []
1600
- arry << value
1601
- vectors.each { |v| arry << self[v][num] }
1602
- sub_hash = super_hash[group_name]
1603
- sub_hash[arry] ||= []
1604
-
1605
- sub_hash[arry] << self[value][num]
1606
- end
1607
- end
1608
- end
1609
-
1610
- super_hash.each_value do |sub_hash|
1611
- sub_hash.each do |group_name, aggregates|
1612
- sub_hash[group_name] = Daru::Vector.new(aggregates).send(aggregate_function)
1613
- end
1614
- end
1615
-
1616
- df_index = Daru::MultiIndex.from_tuples super_hash.keys
1617
-
1618
- vector_indexes = []
1619
- super_hash.each_value do |sub_hash|
1620
- vector_indexes.concat sub_hash.keys
1621
- end
1622
-
1623
- df_vectors = Daru::MultiIndex.from_tuples vector_indexes.uniq
1624
- pivoted_dataframe = Daru::DataFrame.new({}, index: df_index, order: df_vectors)
1625
-
1626
- super_hash.each do |row_index, sub_h|
1627
- sub_h.each do |vector_index, val|
1628
- # pivoted_dataframe[symbolize(vector_index)][symbolize(row_index)] = val
1629
- pivoted_dataframe[vector_index][row_index] = val
1630
- end
1631
- end
1632
- return pivoted_dataframe
1633
- end
1537
+ pivot_dataframe super_hash
1634
1538
  end
1635
1539
 
1636
1540
  # Merge vectors from two DataFrames. In case of name collision,
1637
1541
  # the vectors names are changed to x_1, x_2 ....
1638
1542
  #
1639
1543
  # @return {Daru::DataFrame}
1640
- def merge other_df
1641
- raise "Number of rows must be equal in this: #{nrows} and other: #{other_df.nrows}" unless nrows == other_df.nrows
1544
+ def merge other_df # rubocop:disable Metrics/AbcSize
1545
+ raise ArgumentError,
1546
+ "Number of rows must be equal in this: #{nrows} and other: #{other_df.nrows}" \
1547
+ unless nrows == other_df.nrows
1642
1548
 
1643
1549
  new_fields = (@vectors.to_a + other_df.vectors.to_a)
1644
- .recode_repeated
1645
- .map(&:to_sym)
1646
- df_new = DataFrame.new({}, order: new_fields)
1550
+ new_fields = ArrayHelper.recode_repeated(new_fields)
1647
1551
 
1648
- (0...nrows).to_a.each do |i|
1649
- row = self.row[i].to_a + other_df.row[i].to_a
1650
- df_new.add_row(row)
1651
- end
1552
+ DataFrame.new({}, order: new_fields).tap do |df_new|
1553
+ (0...nrows).each do |i|
1554
+ df_new.add_row row[i].to_a + other_df.row[i].to_a
1555
+ end
1652
1556
 
1653
- df_new.update
1654
- df_new
1557
+ df_new.update
1558
+ end
1655
1559
  end
1656
1560
 
1657
1561
  # Join 2 DataFrames with SQL style joins. Currently supports inner, left
@@ -1701,7 +1605,11 @@ module Daru
1701
1605
  # ['2','fred','green',15,'orange',30,'white',20],
1702
1606
  # ['3','alfred',nil,nil,nil,nil,nil,nil]
1703
1607
  # ]
1704
- # ds=Daru::DataFrame.rows(cases, order: [:id, :name, :car_color1, :car_value1, :car_color2, :car_value2, :car_color3, :car_value3])
1608
+ # ds=Daru::DataFrame.rows(cases, order:
1609
+ # [:id, :name,
1610
+ # :car_color1, :car_value1,
1611
+ # :car_color2, :car_value2,
1612
+ # :car_color3, :car_value3])
1705
1613
  # ds.one_to_many([:id],'car_%v%n').to_matrix
1706
1614
  # #=> Matrix[
1707
1615
  # # ["red", "1", 10],
@@ -1711,62 +1619,29 @@ module Daru
1711
1619
  # # ["white", "2", 20]
1712
1620
  # # ]
1713
1621
  def one_to_many(parent_fields, pattern)
1714
- re = Regexp.new pattern.gsub('%v','(.+?)').gsub('%n','(\\d+?)')
1715
- ds_vars = parent_fields.dup
1716
- vars = []
1717
- max_n = 0
1718
- h = parent_fields.each_with_object({}) { |v, a|
1719
- a[v] = Daru::Vector.new([])
1720
- }
1721
- # Adding _row_id
1722
- h['_col_id'] = Daru::Vector.new([])
1723
- ds_vars.push('_col_id')
1724
-
1725
- @vectors.each do |f|
1726
- next unless f =~ re
1727
- unless vars.include? $1
1728
- vars.push($1)
1729
- h[$1] = Daru::Vector.new([])
1730
- end
1622
+ vars, numbers = one_to_many_components(pattern)
1731
1623
 
1732
- max_n = $2.to_i if max_n < $2.to_i
1733
- end
1734
- ds = DataFrame.new(h, order: ds_vars+vars)
1735
-
1736
- each_row do |row|
1737
- row_out = {}
1738
- parent_fields.each do |f|
1739
- row_out[f] = row[f]
1740
- end
1741
-
1742
- max_n.times do |n1|
1743
- n = n1+1
1744
- any_data = false
1745
- vars.each do |v|
1746
- data = row[pattern.gsub('%v',v.to_s).gsub('%n',n.to_s)]
1747
- row_out[v] = data
1748
- any_data = true unless data.nil?
1749
- end
1624
+ DataFrame.new([], order: [*parent_fields, '_col_id', *vars]).tap do |ds|
1625
+ each_row do |row|
1626
+ verbatim = parent_fields.map { |f| [f, row[f]] }.to_h
1627
+ numbers.each do |n|
1628
+ generated = one_to_many_row row, n, vars, pattern
1629
+ next if generated.values.all?(&:nil?)
1750
1630
 
1751
- if any_data
1752
- row_out['_col_id'] = n
1753
- ds.add_row(row_out)
1631
+ ds.add_row(verbatim.merge(generated).merge('_col_id' => n))
1754
1632
  end
1755
1633
  end
1634
+ ds.update
1756
1635
  end
1757
- ds.update
1758
- ds
1759
1636
  end
1760
1637
 
1761
- def add_vectors_by_split_recode(name_, join='-', sep=Daru::SPLIT_TOKEN)
1762
- split = self[name_].split_by_separator(sep)
1763
- i = 1
1764
- split.each { |k,v|
1765
- new_field = name_.to_s + join + i.to_s
1766
- v.rename name_.to_s + ':' + k.to_s
1767
- self[new_field.to_sym] = v
1768
- i += 1
1769
- }
1638
+ def add_vectors_by_split_recode(nm, join='-', sep=Daru::SPLIT_TOKEN)
1639
+ self[nm]
1640
+ .split_by_separator(sep)
1641
+ .each_with_index do |(k, v), i|
1642
+ v.rename "#{nm}:#{k}"
1643
+ self["#{nm}#{join}#{i + 1}".to_sym] = v
1644
+ end
1770
1645
  end
1771
1646
 
1772
1647
  # Create a sql, basen on a given Dataset
@@ -1795,40 +1670,37 @@ module Daru
1795
1670
  sql + fields.join(",\n ")+") CHARACTER SET=#{charset};"
1796
1671
  end
1797
1672
 
1673
+ # Returns the dataframe. This can be convenient when the user does not
1674
+ # know whether the object is a vector or a dataframe.
1675
+ # @return [self] the dataframe
1676
+ def to_df
1677
+ self
1678
+ end
1679
+
1798
1680
  # Convert all numeric vectors to GSL::Matrix
1799
1681
  def to_gsl
1800
- numerics_as_arrays = []
1801
- numeric_vectors.each do |n|
1802
- numerics_as_arrays << self[n].to_a
1803
- end
1682
+ numerics_as_arrays = numeric_vectors.map { |n| self[n].to_a }
1804
1683
 
1805
1684
  GSL::Matrix.alloc(*numerics_as_arrays.transpose)
1806
1685
  end
1807
1686
 
1808
1687
  # Convert all vectors of type *:numeric* into a Matrix.
1809
1688
  def to_matrix
1810
- numerics_as_arrays = []
1811
- each_vector do |vector|
1812
- numerics_as_arrays << vector.to_a if vector.type == :numeric
1813
- end
1814
-
1815
- Matrix.columns numerics_as_arrays
1689
+ Matrix.columns each_vector.select(&:numeric?).map(&:to_a)
1816
1690
  end
1817
1691
 
1818
1692
  # Return a Nyaplot::DataFrame from the data of this DataFrame.
1693
+ # :nocov:
1819
1694
  def to_nyaplotdf
1820
1695
  Nyaplot::DataFrame.new(to_a[0])
1821
1696
  end
1697
+ # :nocov:
1822
1698
 
1823
1699
  # Convert all vectors of type *:numeric* and not containing nils into an NMatrix.
1824
1700
  def to_nmatrix
1825
- numerics_as_arrays = []
1826
- each_vector do |vector|
1827
- numerics_as_arrays << vector.to_a if vector.type == :numeric &&
1828
- vector.missing_positions.empty?
1829
- end
1830
-
1831
- numerics_as_arrays.transpose.to_nm
1701
+ each_vector.select do |vector|
1702
+ vector.numeric? && !vector.include_values?(*Daru::MISSING_VALUES)
1703
+ end.map(&:to_a).transpose.to_nm
1832
1704
  end
1833
1705
 
1834
1706
  # Converts the DataFrame into an array of hashes where key is vector name
@@ -1837,13 +1709,7 @@ module Daru
1837
1709
  # of the dataframe. Each element in the index array corresponds to its row
1838
1710
  # in the array of hashes, which has the same index.
1839
1711
  def to_a
1840
- arry = [[],[]]
1841
- each_row do |row|
1842
- arry[0] << row.to_h
1843
- end
1844
- arry[1] = @index.to_a
1845
-
1846
- arry
1712
+ [each_row.map(&:to_h), @index.to_a]
1847
1713
  end
1848
1714
 
1849
1715
  # Convert to json. If no_index is false then the index will NOT be included
@@ -1859,54 +1725,19 @@ module Daru
1859
1725
  # Converts DataFrame to a hash (explicit) with keys as vector names and values as
1860
1726
  # the corresponding vectors.
1861
1727
  def to_h
1862
- hsh = {}
1863
- @vectors.each_with_index do |vec_name, idx|
1864
- hsh[vec_name] = @data[idx]
1865
- end
1866
-
1867
- hsh
1728
+ @vectors
1729
+ .each_with_index
1730
+ .map { |vec_name, idx| [vec_name, @data[idx]] }.to_h
1868
1731
  end
1869
1732
 
1870
1733
  # Convert to html for IRuby.
1871
1734
  def to_html threshold=30
1872
- html = '<table>' \
1873
- '<tr>' \
1874
- "<th colspan=\"#{@vectors.size+1}\">" \
1875
- "Daru::DataFrame:#{object_id} " + " rows: #{nrows} " + " cols: #{ncols}" \
1876
- '</th>' \
1877
- '</tr>'
1878
- html +='<tr><th></th>'
1879
- @vectors.each { |vector| html += '<th>' + vector.to_s + '</th>' }
1880
- html += '</tr>'
1881
-
1882
- @index.each_with_index do |index, num|
1883
- html += '<tr>'
1884
- html += '<td>' + index.to_s + '</td>'
1885
-
1886
- row[index].each do |element|
1887
- html += '<td>' + element.to_s + '</td>'
1888
- end
1889
-
1890
- html += '</tr>'
1891
- next if num <= threshold
1892
-
1893
- html += '<tr>'
1894
- (@vectors.size + 1).times { html += '<td>...</td>' }
1895
- html += '</tr>'
1896
-
1897
- last_index = @index.to_a.last
1898
- last_row = row[last_index]
1899
- html += '<tr>'
1900
- html += '<td>' + last_index.to_s + '</td>'
1901
- (0..(ncols - 1)).to_a.each do |i|
1902
- html += '<td>' + last_row[i].to_s + '</td>'
1903
- end
1904
- html += '</tr>'
1905
- break
1906
- end
1907
- html += '</table>'
1908
-
1909
- html
1735
+ path = if index.is_a?(MultiIndex)
1736
+ File.expand_path('../iruby/templates/dataframe_mi.html.erb', __FILE__)
1737
+ else
1738
+ File.expand_path('../iruby/templates/dataframe.html.erb', __FILE__)
1739
+ end
1740
+ ERB.new(File.read(path).strip).result(binding)
1910
1741
  end
1911
1742
 
1912
1743
  def to_s
@@ -1925,8 +1756,11 @@ module Daru
1925
1756
  # Rename the DataFrame.
1926
1757
  def rename new_name
1927
1758
  @name = new_name
1759
+ self
1928
1760
  end
1929
1761
 
1762
+ alias_method :name=, :rename
1763
+
1930
1764
  # Write this DataFrame to a CSV file.
1931
1765
  #
1932
1766
  # == Arguements
@@ -2003,46 +1837,28 @@ module Daru
2003
1837
 
2004
1838
  # Transpose a DataFrame, tranposing elements and row, column indexing.
2005
1839
  def transpose
2006
- arrys = []
2007
- each_vector do |vec|
2008
- arrys << vec.to_a
2009
- end
2010
-
2011
- Daru::DataFrame.new(arrys.transpose, index: @vectors, order: @index, dtype: @dtype, name: @name)
1840
+ Daru::DataFrame.new(
1841
+ each_vector.map(&:to_a).transpose,
1842
+ index: @vectors,
1843
+ order: @index,
1844
+ dtype: @dtype,
1845
+ name: @name
1846
+ )
2012
1847
  end
2013
1848
 
2014
1849
  # Pretty print in a nice table format for the command line (irb/pry/iruby)
2015
1850
  def inspect spacing=10, threshold=15
2016
- longest = [@name.to_s.size,
2017
- (@vectors.map(&:to_s).map(&:size).max || 0),
2018
- (@index .map(&:to_s).map(&:size).max || 0),
2019
- (@data .map { |v| v.map(&:to_s).map(&:size).max }.max || 0)].max
2020
-
2021
- name = @name || 'nil'
2022
- content = ''
2023
- longest = spacing if longest > spacing
2024
- formatter = "\n"
2025
-
2026
- (@vectors.size + 1).times { formatter += "%#{longest}.#{longest}s " }
2027
- content += "\n#<" + self.class.to_s + ':' + object_id.to_s + ' @name = ' +
2028
- name.to_s + ' @size = ' + @size.to_s + '>'
2029
- content += formatter % ['', *@vectors.map(&:to_s)]
2030
- row_num = 1
2031
-
2032
- each_row_with_index do |row, index|
2033
- content += formatter % [index.to_s, *row.to_h.values.map { |e| (e || 'nil').to_s }]
2034
- row_num += 1
2035
- next if row_num <= threshold
2036
-
2037
- dots = []
2038
-
2039
- (@vectors.size + 1).times { dots << '...' }
2040
- content += formatter % dots
2041
- break
2042
- end
2043
- content += "\n"
2044
-
2045
- content
1851
+ row_headers = index.is_a?(MultiIndex) ? index.sparse_tuples : index.to_a
1852
+ name_part = @name ? ": #{@name} " : ''
1853
+
1854
+ "#<#{self.class}#{name_part}(#{nrows}x#{ncols})>\n" +
1855
+ Formatters::Table.format(
1856
+ each_row.lazy,
1857
+ row_headers: row_headers,
1858
+ headers: vectors,
1859
+ threshold: threshold,
1860
+ spacing: spacing
1861
+ )
2046
1862
  end
2047
1863
 
2048
1864
  # Query a DataFrame by passing a Daru::Core::Query::BoolArray object.
@@ -2058,218 +1874,202 @@ module Daru
2058
1874
  @vectors.to_a.all? { |v| self[v] == other[v] }
2059
1875
  end
2060
1876
 
1877
+ # Converts the specified non category type vectors to category type vectors
1878
+ # @param [Array] *names names of non category type vectors to be converted
1879
+ # @return [Daru::DataFrame] data frame in which specified vectors have been
1880
+ # converted to category type
1881
+ # @example
1882
+ # df = Daru::DataFrame.new({
1883
+ # a: [1, 2, 3],
1884
+ # b: ['a', 'a', 'b']
1885
+ # })
1886
+ # df.to_category :b
1887
+ # df[:b].type
1888
+ # # => :category
1889
+ def to_category *names
1890
+ names.each { |n| self[n] = self[n].to_category }
1891
+ self
1892
+ end
1893
+
2061
1894
  def method_missing(name, *args, &block)
2062
1895
  if name =~ /(.+)\=/
2063
- insert_or_modify_vector name[/(.+)\=/].delete('=').to_sym, args[0]
1896
+ insert_or_modify_vector [name[/(.+)\=/].delete('=').to_sym], args[0]
2064
1897
  elsif has_vector? name
2065
1898
  self[name]
2066
1899
  else
2067
- super(name, *args, &block)
1900
+ super
2068
1901
  end
2069
1902
  end
2070
1903
 
2071
- private
1904
+ def respond_to_missing?(name, include_private=false)
1905
+ name.to_s.end_with?('=') || has_vector?(name) || super
1906
+ end
2072
1907
 
2073
- def possibly_multi_index? index
2074
- if @index.is_a?(MultiIndex)
2075
- Daru::MultiIndex.from_tuples(index)
2076
- else
2077
- Daru::Index.new(index)
1908
+ def interact_code vector_names, full
1909
+ dfs = vector_names.zip(full).map do |vec_name, f|
1910
+ self[vec_name].contrast_code(full: f).each.to_a
2078
1911
  end
1912
+
1913
+ all_vectors = recursive_product(dfs)
1914
+ Daru::DataFrame.new all_vectors,
1915
+ order: all_vectors.map(&:name)
2079
1916
  end
2080
1917
 
2081
- def create_logic_blocks vector_order, _by, ascending
2082
- # Create blocks to handle nils
2083
- blocks = {}
2084
- universal_block_ascending = ->(a) { [a.nil? ? 0 : 1, a] }
2085
- universal_block_decending = ->(a) { [a.nil? ? 1 : 0, a] }
2086
- vector_order.each_with_index do |vector, i|
2087
- blocks[vector] =
2088
- if ascending[i]
2089
- universal_block_ascending
2090
- else
2091
- universal_block_decending
2092
- end
1918
+ # Split the dataframe into many dataframes based on category vector
1919
+ # @param [object] cat_name name of category vector to split the dataframe
1920
+ # @return [Array] array of dataframes split by category with category vector
1921
+ # used to split not included
1922
+ # @example
1923
+ # df = Daru::DataFrame.new({
1924
+ # a: [1, 2, 3],
1925
+ # b: ['a', 'a', 'b']
1926
+ # })
1927
+ # df.to_category :b
1928
+ # df.split_by_category :b
1929
+ # # => [#<Daru::DataFrame: a (2x1)>
1930
+ # # a
1931
+ # # 0 1
1932
+ # # 1 2,
1933
+ # # #<Daru::DataFrame: b (1x1)>
1934
+ # # a
1935
+ # # 2 3]
1936
+ def split_by_category cat_name
1937
+ cat_dv = self[cat_name]
1938
+ raise ArguementError, "#{cat_name} is not a category vector" unless
1939
+ cat_dv.category?
1940
+
1941
+ cat_dv.categories.map do |cat|
1942
+ where(cat_dv.eq cat)
1943
+ .rename(cat)
1944
+ .delete_vector cat_name
2093
1945
  end
1946
+ end
1947
+
1948
+ private
2094
1949
 
2095
- blocks
1950
+ def convert_categorical_vectors names
1951
+ names.map do |n|
1952
+ next unless self[n].category?
1953
+ old = [n, self[n]]
1954
+ self[n] = Daru::Vector.new(self[n].to_ints)
1955
+ old
1956
+ end.compact
2096
1957
  end
2097
1958
 
2098
- def build_array_from_blocks vector_order, opts, blocks, r1, r2
2099
- # Create an array to be used for comparison of two rows in sorting
2100
- vector_order.map.each_with_index do |v, i|
2101
- value = if opts[:ascending][i]
2102
- @data[@vectors[v]].data[r1]
2103
- else
2104
- @data[@vectors[v]].data[r2]
2105
- end
2106
-
2107
- if opts[:by][v] && !opts[:handle_nils][i]
2108
- # Block given and nils handled manually
2109
- value = opts[:by][v].call value
2110
-
2111
- elsif opts[:by][v] && opts[:handle_nils][i]
2112
- # Block given and nils handled automatically
2113
- value = opts[:by][v].call value rescue nil
2114
- blocks[v].call value
1959
+ def restore_categorical_vectors old
1960
+ old.each { |name, vector| self[name] = vector }
1961
+ end
2115
1962
 
2116
- else
2117
- # Block not given and nils handled automatically
2118
- blocks[v].call value
2119
- end
1963
+ def recursive_product dfs
1964
+ return dfs.first if dfs.size == 1
1965
+
1966
+ left = dfs.first
1967
+ dfs.shift
1968
+ right = recursive_product dfs
1969
+ left.product(right).map do |dv1, dv2|
1970
+ (dv1*dv2).rename "#{dv1.name}:#{dv2.name}"
2120
1971
  end
2121
1972
  end
2122
1973
 
2123
- def sort_order_array vector_order, ascending
2124
- if ascending.is_a? Array
2125
- raise ArgumentError, 'Specify same number of vector names and sort orders' if
2126
- vector_order.size != ascending.size
2127
- return ascending
1974
+ def should_be_vector! val
1975
+ return val if val.is_a?(Daru::Vector)
1976
+ raise TypeError, "Every iteration must return Daru::Vector not #{val.class}"
1977
+ end
1978
+
1979
+ def dispatch_to_axis(axis, method, *args, &block)
1980
+ if axis == :vector || axis == :column
1981
+ send("#{method}_vector", *args, &block)
1982
+ elsif axis == :row
1983
+ send("#{method}_row", *args, &block)
2128
1984
  else
2129
- Array.new(vector_order.size, ascending)
1985
+ raise ArgumentError, "Unknown axis #{axis}"
2130
1986
  end
2131
1987
  end
2132
1988
 
2133
- def handle_nils_array vector_order, handle_nils
2134
- if handle_nils.is_a? Array
2135
- raise ArgumentError, 'Specify same number of vector names and handle nils' if
2136
- vector_order.size != handle_nils.size
2137
- return handle_nils
1989
+ def dispatch_to_axis_pl(axis, method, *args, &block)
1990
+ if axis == :vector || axis == :column
1991
+ send("#{method}_vectors", *args, &block)
1992
+ elsif axis == :row
1993
+ send("#{method}_rows", *args, &block)
2138
1994
  else
2139
- Array.new(vector_order.size, handle_nils)
1995
+ raise ArgumentError, "Unknown axis #{axis}"
2140
1996
  end
2141
1997
  end
2142
1998
 
2143
- def vectors_index_for location
2144
- if @vectors.include?(location)
2145
- @vectors[location]
2146
- elsif location[0].is_a?(Integer)
2147
- location[0]
1999
+ AXES = [:row, :vector].freeze
2000
+
2001
+ def extract_axis names, default=:vector
2002
+ if AXES.include?(names.last)
2003
+ names.pop
2004
+ else
2005
+ default
2148
2006
  end
2149
2007
  end
2150
2008
 
2151
2009
  def access_vector *names
2152
- location = names[0]
2010
+ if names.first.is_a?(Range)
2011
+ dup(@vectors[names.first])
2012
+ elsif @vectors.is_a?(MultiIndex)
2013
+ access_vector_multi_index(*names)
2014
+ else
2015
+ access_vector_single_index(*names)
2016
+ end
2017
+ end
2153
2018
 
2154
- return dup(@vectors[location]) if location.is_a?(Range)
2155
- if @vectors.is_a?(MultiIndex)
2156
- pos = @vectors[names]
2019
+ def access_vector_multi_index *names
2020
+ pos = @vectors[names]
2157
2021
 
2158
- return @data[pos] if pos.is_a?(Integer)
2022
+ return @data[pos] if pos.is_a?(Integer)
2159
2023
 
2160
- # MultiIndex
2161
- new_vectors = pos.map do |tuple|
2162
- @data[@vectors[tuple]]
2163
- end
2024
+ new_vectors = pos.map { |tuple| @data[@vectors[tuple]] }
2164
2025
 
2165
- if !location.is_a?(Range) && names.size < @vectors.width
2166
- pos = pos.drop_left_level names.size
2167
- end
2026
+ pos = pos.drop_left_level(names.size) if names.size < @vectors.width
2168
2027
 
2169
- Daru::DataFrame.new(new_vectors, index: @index, order: pos)
2170
- else
2171
- unless names[1]
2172
- pos = @vectors[location]
2173
-
2174
- return @data[pos] if pos.is_a?(Numeric)
2028
+ Daru::DataFrame.new(new_vectors, index: @index, order: pos)
2029
+ end
2175
2030
 
2176
- names = pos
2177
- end
2031
+ def access_vector_single_index *names
2032
+ if names.count < 2
2033
+ pos = @vectors[names.first]
2178
2034
 
2179
- new_vectors = {}
2180
- names.each do |name|
2181
- new_vectors[name] = @data[@vectors[name]]
2182
- end
2035
+ return @data[pos] if pos.is_a?(Numeric)
2183
2036
 
2184
- order = names.is_a?(Array) ? Daru::Index.new(names) : names
2185
- Daru::DataFrame.new(new_vectors, order: order,
2186
- index: @index, name: @name)
2037
+ names = pos
2187
2038
  end
2188
- end
2189
2039
 
2190
- def access_row *names
2191
- location = names[0]
2040
+ new_vectors = names.map { |name| [name, @data[@vectors[name]]] }.to_h
2192
2041
 
2193
- if @index.is_a?(MultiIndex)
2194
- pos = @index[names]
2195
- if pos.is_a?(Integer)
2196
- return Daru::Vector.new(populate_row_for(pos), index: @vectors, name: pos)
2197
- end
2042
+ order = names.is_a?(Array) ? Daru::Index.new(names) : names
2043
+ Daru::DataFrame.new(new_vectors, order: order,
2044
+ index: @index, name: @name)
2045
+ end
2198
2046
 
2199
- new_rows = pos.map { |tuple| populate_row_for(tuple) }
2047
+ def access_row *indexes
2048
+ positions = @index.pos(*indexes)
2200
2049
 
2201
- if !location.is_a?(Range) && names.size < @index.width
2202
- pos = pos.drop_left_level names.size
2203
- end
2204
-
2205
- Daru::DataFrame.rows(new_rows, order: @vectors, name: @name, index: pos)
2050
+ if positions.is_a? Numeric
2051
+ return Daru::Vector.new populate_row_for(positions),
2052
+ index: @vectors,
2053
+ name: indexes.first
2206
2054
  else
2207
- if names[1].nil?
2208
- names = @index[location]
2209
- if names.is_a?(Numeric)
2210
- row = []
2211
- @data.each do |vector|
2212
- row << vector[location]
2213
- end
2214
-
2215
- return Daru::Vector.new(row, index: @vectors, name: set_name(location))
2216
- end
2217
- end
2218
- # Access multiple rows
2219
- rows = []
2220
- names.each do |name|
2221
- rows << self.row[name].to_a
2222
- end
2223
-
2224
- Daru::DataFrame.rows rows, index: names,name: @name, order: @vectors
2055
+ new_rows = @data.map { |vec| vec[*indexes] }
2056
+ return Daru::DataFrame.new new_rows,
2057
+ index: @index.subset(*indexes),
2058
+ order: @vectors
2225
2059
  end
2226
2060
  end
2227
2061
 
2228
2062
  def populate_row_for pos
2229
- @data.map do |vector|
2230
- vector[pos]
2231
- end
2063
+ @data.map { |vector| vector[pos] }
2232
2064
  end
2233
2065
 
2234
2066
  def insert_or_modify_vector name, vector
2235
2067
  name = name[0] unless @vectors.is_a?(MultiIndex)
2236
- vec = nil
2237
2068
 
2238
2069
  if @index.empty?
2239
- vec = if vector.is_a?(Daru::Vector)
2240
- vector
2241
- else
2242
- Daru::Vector.new(vector.to_a, name: set_name(name))
2243
- end
2244
-
2245
- @index = vec.index
2246
- assign_or_add_vector name, vec
2247
- set_size
2248
-
2249
- @data.map! do |v|
2250
- if v.empty?
2251
- Daru::Vector.new([nil]*@size, name: set_name(name), metadata: v.metadata, index: @index)
2252
- else
2253
- v
2254
- end
2255
- end
2070
+ insert_vector_in_empty name, vector
2256
2071
  else
2257
- if vector.is_a?(Daru::Vector)
2258
- if vector.index == @index # so that index-by-index assignment is avoided when possible.
2259
- vec = vector.dup
2260
- else
2261
- vec = Daru::Vector.new [], name: set_name(name), metadata: vector.metadata.dup, index: @index
2262
- @index.each do |idx|
2263
- vec[idx] = vector.index.include?(idx) ? vector[idx] : nil
2264
- end
2265
- end
2266
- else
2267
- raise SizeError,
2268
- "Specified vector of length #{vector.size} cannot be inserted in DataFrame of size #{@size}" if
2269
- @size != vector.size
2270
-
2271
- vec = Daru::Vector.new(vector, name: set_name(name), index: @index)
2272
- end
2072
+ vec = prepare_vector_for_insert name, vector
2273
2073
 
2274
2074
  assign_or_add_vector name, vec
2275
2075
  end
@@ -2283,54 +2083,82 @@ module Daru
2283
2083
  pos = name
2284
2084
  end
2285
2085
 
2286
- if !pos.is_a?(Daru::Index) && pos == name &&
2287
- (@vectors.include?(name) || (pos.is_a?(Integer) && pos < @data.size))
2086
+ case
2087
+ when pos.is_a?(Daru::Index)
2088
+ assign_multiple_vectors pos, v
2089
+ when pos == name &&
2090
+ (@vectors.include?(name) || (pos.is_a?(Integer) && pos < @data.size))
2091
+
2288
2092
  @data[pos] = v
2289
- elsif pos.is_a?(Daru::Index)
2290
- pos.each do |p|
2291
- @data[@vectors[p]] = v
2292
- end
2293
2093
  else
2294
- @vectors |= [name] unless @vectors.include?(name)
2295
- @data[@vectors[name]] = v
2094
+ assign_or_add_vector_rough name, v
2296
2095
  end
2297
2096
  end
2298
2097
 
2299
- def insert_or_modify_row name, vector
2300
- if index.is_a?(MultiIndex)
2301
- # TODO
2302
- else
2303
- name = name[0]
2304
- vec =
2305
- if vector.is_a?(Daru::Vector)
2306
- vector
2307
- else
2308
- Daru::Vector.new(vector, name: set_name(name), index: @vectors)
2309
- end
2098
+ def assign_multiple_vectors pos, v
2099
+ pos.each do |p|
2100
+ @data[@vectors[p]] = v
2101
+ end
2102
+ end
2310
2103
 
2311
- if @index.include? name
2312
- each_vector_with_index do |v,i|
2313
- v[name] = vec.index.include?(i) ? vec[i] : nil
2314
- end
2315
- else
2316
- @index |= [name]
2317
- each_vector_with_index do |v,i|
2318
- v.concat((vec.index.include?(i) ? vec[i] : nil), name)
2104
+ def assign_or_add_vector_rough name, v
2105
+ @vectors |= [name] unless @vectors.include?(name)
2106
+ @data[@vectors[name]] = v
2107
+ end
2108
+
2109
+ def insert_vector_in_empty name, vector
2110
+ vec = Vector.coerce(vector.to_a, name: coerce_name(name))
2111
+
2112
+ @index = vec.index
2113
+ assign_or_add_vector name, vec
2114
+ set_size
2115
+
2116
+ @data.map! { |v| v.empty? ? v.reindex(@index) : v }
2117
+ end
2118
+
2119
+ def prepare_vector_for_insert name, vector
2120
+ if vector.is_a?(Daru::Vector)
2121
+ # so that index-by-index assignment is avoided when possible.
2122
+ return vector.dup if vector.index == @index
2123
+
2124
+ Daru::Vector.new([], name: coerce_name(name), index: @index).tap { |v|
2125
+ @index.each do |idx|
2126
+ v[idx] = vector.index.include?(idx) ? vector[idx] : nil
2319
2127
  end
2320
- end
2128
+ }
2129
+ else
2130
+ # FIXME: No spec checks this case... And SizeError is not a thing - zverok, 2016-05-08
2131
+ raise SizeError,
2132
+ "Specified vector of length #{vector.size} cannot be inserted in DataFrame of size #{@size}" if
2133
+ @size != vector.size
2134
+
2135
+ Daru::Vector.new(vector, name: coerce_name(name), index: @index)
2136
+ end
2137
+ end
2138
+
2139
+ def insert_or_modify_row indexes, vector
2140
+ vector = coerce_vector vector
2141
+
2142
+ raise SizeError, 'Vector length should match row length' if
2143
+ vector.size != @vectors.size
2321
2144
 
2322
- set_size
2145
+ @data.each_with_index do |vec, pos|
2146
+ vec.send(:set, indexes, vector.at(pos))
2323
2147
  end
2148
+ @index = @data[0].index
2149
+
2150
+ set_size
2324
2151
  end
2325
2152
 
2326
2153
  def create_empty_vectors
2327
- @vectors.each do |name|
2328
- @data << Daru::Vector.new([], name: set_name(name), index: @index)
2154
+ @data = @vectors.map do |name|
2155
+ Daru::Vector.new([], name: coerce_name(name), index: @index)
2329
2156
  end
2330
2157
  end
2331
2158
 
2332
2159
  def validate_labels
2333
- raise IndexError, "Expected equal number of vector names (#{@vectors.size}) for number of vectors (#{@data.size})." if
2160
+ raise IndexError, "Expected equal number of vector names (#{@vectors.size}) " \
2161
+ "for number of vectors (#{@data.size})." if
2334
2162
  @vectors && @vectors.size != @data.size
2335
2163
 
2336
2164
  raise IndexError, 'Expected number of indexes same as number of rows' if
@@ -2348,12 +2176,6 @@ module Daru
2348
2176
  validate_vector_sizes
2349
2177
  end
2350
2178
 
2351
- def all_daru_vectors_in_source? source
2352
- source.values.all? do |vector|
2353
- vector.is_a?(Daru::Vector)
2354
- end
2355
- end
2356
-
2357
2179
  def set_size
2358
2180
  @size = @index.size
2359
2181
  end
@@ -2382,32 +2204,301 @@ module Daru
2382
2204
  def all_vectors_have_equal_indexes? source
2383
2205
  idx = source.values[0].index
2384
2206
 
2385
- source.values.all? do |vector|
2386
- idx == vector.index
2207
+ source.values.all? { |vector| idx == vector.index }
2208
+ end
2209
+
2210
+ def coerce_name potential_name
2211
+ potential_name.is_a?(Array) ? potential_name.join : potential_name
2212
+ end
2213
+
2214
+ def initialize_from_array source, vectors, index, opts
2215
+ raise ArgumentError, 'All objects in data source should be same class' \
2216
+ unless source.map(&:class).uniq.size == 1
2217
+
2218
+ case source.first
2219
+ when Array
2220
+ initialize_from_array_of_arrays source, vectors, index, opts
2221
+ when Vector
2222
+ initialize_from_array_of_vectors source, vectors, index, opts
2223
+ when Hash
2224
+ initialize_from_array_of_hashes source, vectors, index, opts
2225
+ else
2226
+ raise ArgumentError, "Can't create DataFrame from #{source}"
2387
2227
  end
2388
2228
  end
2389
2229
 
2390
- def try_create_index index
2391
- index.is_a?(Index) ? index : Daru::Index.new(index)
2230
+ def initialize_from_array_of_arrays source, vectors, index, _opts
2231
+ raise ArgumentError, "Number of vectors (#{vectors.size}) should \
2232
+ equal order size (#{source.size})" if source.size != vectors.size
2233
+
2234
+ @index = Index.coerce(index || source[0].size)
2235
+ @vectors = Index.coerce(vectors)
2236
+
2237
+ @data = @vectors.each_with_index.map do |_vec,idx|
2238
+ Daru::Vector.new(source[idx], index: @index, name: vectors[idx])
2239
+ end
2392
2240
  end
2393
2241
 
2394
- def set_name potential_name # rubocop:disable Style/AccessorMethodName
2395
- potential_name.is_a?(Array) ? potential_name.join : potential_name
2242
+ def initialize_from_array_of_vectors source, vectors, index, opts
2243
+ clone = opts[:clone] != false
2244
+ hsh = vectors.each_with_index.map do |name, idx|
2245
+ [name, source[idx]]
2246
+ end.to_h
2247
+ initialize(hsh, index: index, order: vectors, name: @name, clone: clone)
2248
+ end
2249
+
2250
+ def initialize_from_array_of_hashes source, vectors, index, _opts
2251
+ names =
2252
+ if vectors.nil?
2253
+ source[0].keys
2254
+ else
2255
+ (vectors + source[0].keys).uniq
2256
+ end
2257
+ @vectors = Daru::Index.new(names)
2258
+ @index = Daru::Index.new(index || source.size)
2259
+
2260
+ @data = @vectors.map do |name|
2261
+ v = source.map { |h| h[name] || h[name.to_s] }
2262
+ Daru::Vector.new(v, name: coerce_name(name), index: @index)
2263
+ end
2264
+ end
2265
+
2266
+ def initialize_from_hash source, vectors, index, opts
2267
+ create_vectors_index_with vectors, source
2268
+
2269
+ if ArrayHelper.array_of?(source.values, Vector)
2270
+ initialize_from_hash_with_vectors source, index, opts
2271
+ else
2272
+ initialize_from_hash_with_arrays source, index, opts
2273
+ end
2274
+ end
2275
+
2276
+ def initialize_from_hash_with_vectors source, index, opts
2277
+ vectors_have_same_index = all_vectors_have_equal_indexes?(source)
2278
+
2279
+ clone = opts[:clone] != false
2280
+ clone = true unless index || vectors_have_same_index
2281
+
2282
+ @index = deduce_index index, source, vectors_have_same_index
2283
+
2284
+ if clone
2285
+ @data = clone_vectors source, vectors_have_same_index
2286
+ else
2287
+ @data.concat source.values
2288
+ end
2289
+ end
2290
+
2291
+ def deduce_index index, source, vectors_have_same_index
2292
+ if !index.nil?
2293
+ Index.coerce index
2294
+ elsif vectors_have_same_index
2295
+ source.values[0].index.dup
2296
+ else
2297
+ all_indexes = source
2298
+ .values.map { |v| v.index.to_a }
2299
+ .flatten.uniq.sort # sort only if missing indexes detected
2300
+
2301
+ Daru::Index.new all_indexes
2302
+ end
2396
2303
  end
2397
2304
 
2398
- def symbolize arry
2399
- symbolized_arry =
2400
- if arry.all? { |e| e.is_a?(Array) }
2401
- arry.map do |sub_arry|
2402
- sub_arry.map do |e|
2403
- e.is_a?(Numeric) ? e : e.to_sym
2305
+ def clone_vectors source, vectors_have_same_index
2306
+ @vectors.map do |vector|
2307
+ # avoids matching indexes of vectors if all the supplied vectors
2308
+ # have the same index.
2309
+ if vectors_have_same_index
2310
+ source[vector].dup
2311
+ else
2312
+ Daru::Vector.new([], name: vector, index: @index).tap do |v|
2313
+ @index.each do |idx|
2314
+ v[idx] = source[vector].index.include?(idx) ? source[vector][idx] : nil
2404
2315
  end
2405
2316
  end
2317
+ end
2318
+ end
2319
+ end
2320
+
2321
+ def initialize_from_hash_with_arrays source, index, _opts
2322
+ @index = Index.coerce(index || source.values[0].size)
2323
+
2324
+ @vectors.each do |name|
2325
+ @data << Daru::Vector.new(source[name].dup, name: coerce_name(name), index: @index)
2326
+ end
2327
+ end
2328
+
2329
+ def sort_build_row vector_locs, by_blocks, ascending, handle_nils, r1, r2 # rubocop:disable Metrics/ParameterLists
2330
+ # Create an array to be used for comparison of two rows in sorting
2331
+ vector_locs
2332
+ .zip(by_blocks, ascending, handle_nils)
2333
+ .map do |vector_loc, by, asc, handle_nil|
2334
+ value = @data[vector_loc].data[asc ? r1 : r2]
2335
+
2336
+ value = by.call(value) rescue nil if by
2337
+
2338
+ sort_handle_nils value, asc, handle_nil || !by
2339
+ end
2340
+ end
2341
+
2342
+ def sort_handle_nils value, asc, handle_nil
2343
+ case
2344
+ when !handle_nil
2345
+ value
2346
+ when asc
2347
+ [value.nil? ? 0 : 1, value]
2348
+ else
2349
+ [value.nil? ? 1 : 0, value]
2350
+ end
2351
+ end
2352
+
2353
+ def sort_coerce_boolean opts, symbol, default, size
2354
+ val = opts[symbol]
2355
+ case val
2356
+ when true, false
2357
+ Array.new(size, val)
2358
+ when nil
2359
+ Array.new(size, default)
2360
+ when Array
2361
+ raise ArgumentError, "Specify same number of vector names and #{symbol}" if
2362
+ size != val.size
2363
+ val
2364
+ else
2365
+ raise ArgumentError, "Can't coerce #{symbol} from #{val.class} to boolean option"
2366
+ end
2367
+ end
2368
+
2369
+ def sort_prepare_block vector_order, opts
2370
+ ascending = sort_coerce_boolean opts, :ascending, true, vector_order.size
2371
+ handle_nils = sort_coerce_boolean opts, :handle_nils, false, vector_order.size
2372
+
2373
+ by_blocks = vector_order.map { |v| (opts[:by] || {})[v] }
2374
+ vector_locs = vector_order.map { |v| @vectors[v] }
2375
+
2376
+ lambda do |index1, index2|
2377
+ # Build left and right array to compare two rows
2378
+ left = sort_build_row vector_locs, by_blocks, ascending, handle_nils, index1, index2
2379
+ right = sort_build_row vector_locs, by_blocks, ascending, handle_nils, index2, index1
2380
+
2381
+ # Resolve conflict by Index if all attributes are same
2382
+ left << index1
2383
+ right << index2
2384
+ left <=> right
2385
+ end
2386
+ end
2387
+
2388
+ def verify_error_message row, test, id, i
2389
+ description, fields, = test
2390
+ values =
2391
+ if fields.empty?
2392
+ ''
2406
2393
  else
2407
- arry.map { |e| e.is_a?(Numeric) ? e : e.to_sym }
2394
+ ' (' + fields.collect { |k| "#{k}=#{row[k]}" }.join(', ') + ')'
2395
+ end
2396
+ "#{i+1} [#{row[id]}]: #{description}#{values}"
2397
+ end
2398
+
2399
+ def prepare_pivot_values index, vectors, opts
2400
+ case opts[:values]
2401
+ when nil # values not specified at all.
2402
+ (@vectors.to_a - (index | vectors)) & numeric_vector_names
2403
+ when Array # multiple values specified.
2404
+ opts[:values]
2405
+ else # single value specified.
2406
+ [opts[:values]]
2407
+ end
2408
+ end
2409
+
2410
+ def make_pivot_hash grouped, vectors, values, aggregate_function
2411
+ grouped.groups.map { |n, _| [n, {}] }.to_h.tap do |super_hash|
2412
+ values.each do |value|
2413
+ grouped.groups.each do |group_name, row_numbers|
2414
+ row_numbers.each do |num|
2415
+ arry = [value, *vectors.map { |v| self[v][num] }]
2416
+ sub_hash = super_hash[group_name]
2417
+ sub_hash[arry] ||= []
2418
+
2419
+ sub_hash[arry] << self[value][num]
2420
+ end
2421
+ end
2422
+ end
2423
+
2424
+ setup_pivot_aggregates super_hash, aggregate_function
2425
+ end
2426
+ end
2427
+
2428
+ def setup_pivot_aggregates super_hash, aggregate_function
2429
+ super_hash.each_value do |sub_hash|
2430
+ sub_hash.each do |group_name, aggregates|
2431
+ sub_hash[group_name] = Daru::Vector.new(aggregates).send(aggregate_function)
2408
2432
  end
2433
+ end
2434
+ end
2435
+
2436
+ def pivot_dataframe super_hash
2437
+ df_index = Daru::MultiIndex.from_tuples super_hash.keys
2438
+ df_vectors = Daru::MultiIndex.from_tuples super_hash.values.flat_map(&:keys).uniq
2409
2439
 
2410
- symbolized_arry
2440
+ Daru::DataFrame.new({}, index: df_index, order: df_vectors).tap do |pivoted_dataframe|
2441
+ super_hash.each do |row_index, sub_h|
2442
+ sub_h.each do |vector_index, val|
2443
+ pivoted_dataframe[vector_index][row_index] = val
2444
+ end
2445
+ end
2446
+ end
2447
+ end
2448
+
2449
+ def one_to_many_components pattern
2450
+ re = Regexp.new pattern.gsub('%v','(.+?)').gsub('%n','(\\d+?)')
2451
+
2452
+ vars, numbers =
2453
+ @vectors
2454
+ .map { |v| v.scan(re) }
2455
+ .reject(&:empty?).flatten(1).transpose
2456
+
2457
+ [vars.uniq, numbers.map(&:to_i).sort.uniq]
2458
+ end
2459
+
2460
+ def one_to_many_row row, number, vars, pattern
2461
+ vars
2462
+ .map { |v|
2463
+ name = pattern.sub('%v', v).sub('%n', number.to_s)
2464
+ [v, row[name]]
2465
+ }.to_h
2466
+ end
2467
+
2468
+ # Raises IndexError when one of the positions is not a valid position
2469
+ def validate_positions *positions, size
2470
+ positions = [positions] if positions.is_a? Integer
2471
+ positions.each do |pos|
2472
+ raise IndexError, "#{pos} is not a valid position." if pos >= size
2473
+ end
2474
+ end
2475
+
2476
+ # Accepts a hash, enumerable or vector and aligns them properly so they can be added
2477
+ def coerce_vector vector
2478
+ case vector
2479
+ when Daru::Vector
2480
+ vector.reindex @vectors
2481
+ when Hash
2482
+ Daru::Vector.new(vector).reindex @vectors
2483
+ else
2484
+ Daru::Vector.new vector
2485
+ end
2486
+ end
2487
+
2488
+ # Coerces ranges, integers and arrays in appropriate ways
2489
+ def coerce_positions *positions, size
2490
+ if positions.size == 1
2491
+ case positions.first
2492
+ when Integer
2493
+ positions.first
2494
+ when Range
2495
+ size.times.to_a[positions.first]
2496
+ else
2497
+ raise ArgumentError, 'Unkown position type.'
2498
+ end
2499
+ else
2500
+ positions
2501
+ end
2411
2502
  end
2412
2503
  end
2413
2504
  end