daru 0.1.3.1 → 0.1.4

Sign up to get free protection for your applications and to get access to all the features.
Files changed (92) hide show
  1. checksums.yaml +4 -4
  2. data/.gitignore +2 -0
  3. data/.rspec +2 -1
  4. data/.rspec_formatter.rb +33 -0
  5. data/.rubocop.yml +26 -2
  6. data/History.md +38 -0
  7. data/README.md +22 -13
  8. data/Rakefile +50 -2
  9. data/benchmarks/csv_reading.rb +22 -0
  10. data/daru.gemspec +9 -2
  11. data/lib/daru.rb +36 -4
  12. data/lib/daru/accessors/array_wrapper.rb +6 -1
  13. data/lib/daru/accessors/dataframe_by_row.rb +10 -2
  14. data/lib/daru/accessors/gsl_wrapper.rb +1 -3
  15. data/lib/daru/accessors/nmatrix_wrapper.rb +9 -0
  16. data/lib/daru/category.rb +935 -0
  17. data/lib/daru/core/group_by.rb +29 -38
  18. data/lib/daru/core/merge.rb +186 -145
  19. data/lib/daru/core/query.rb +22 -11
  20. data/lib/daru/dataframe.rb +976 -885
  21. data/lib/daru/date_time/index.rb +166 -166
  22. data/lib/daru/date_time/offsets.rb +66 -77
  23. data/lib/daru/formatters/table.rb +54 -0
  24. data/lib/daru/helpers/array.rb +40 -0
  25. data/lib/daru/index.rb +476 -73
  26. data/lib/daru/io/io.rb +66 -45
  27. data/lib/daru/io/sql_data_source.rb +33 -62
  28. data/lib/daru/iruby/helpers.rb +38 -0
  29. data/lib/daru/iruby/templates/dataframe.html.erb +52 -0
  30. data/lib/daru/iruby/templates/dataframe_mi.html.erb +58 -0
  31. data/lib/daru/iruby/templates/multi_index.html.erb +12 -0
  32. data/lib/daru/iruby/templates/vector.html.erb +27 -0
  33. data/lib/daru/iruby/templates/vector_mi.html.erb +36 -0
  34. data/lib/daru/maths/arithmetic/dataframe.rb +16 -18
  35. data/lib/daru/maths/arithmetic/vector.rb +4 -6
  36. data/lib/daru/maths/statistics/dataframe.rb +8 -15
  37. data/lib/daru/maths/statistics/vector.rb +120 -98
  38. data/lib/daru/monkeys.rb +12 -40
  39. data/lib/daru/plotting/gruff.rb +3 -0
  40. data/lib/daru/plotting/gruff/category.rb +49 -0
  41. data/lib/daru/plotting/gruff/dataframe.rb +91 -0
  42. data/lib/daru/plotting/gruff/vector.rb +57 -0
  43. data/lib/daru/plotting/nyaplot.rb +3 -0
  44. data/lib/daru/plotting/nyaplot/category.rb +34 -0
  45. data/lib/daru/plotting/nyaplot/dataframe.rb +187 -0
  46. data/lib/daru/plotting/nyaplot/vector.rb +46 -0
  47. data/lib/daru/vector.rb +694 -421
  48. data/lib/daru/version.rb +1 -1
  49. data/profile/_base.rb +23 -0
  50. data/profile/df_to_a.rb +10 -0
  51. data/profile/filter.rb +13 -0
  52. data/profile/joining.rb +13 -0
  53. data/profile/sorting.rb +12 -0
  54. data/profile/vector_each_with_index.rb +9 -0
  55. data/spec/accessors/wrappers_spec.rb +2 -4
  56. data/spec/categorical_spec.rb +1734 -0
  57. data/spec/core/group_by_spec.rb +52 -2
  58. data/spec/core/merge_spec.rb +63 -2
  59. data/spec/core/query_spec.rb +236 -80
  60. data/spec/dataframe_spec.rb +1373 -79
  61. data/spec/date_time/data_spec.rb +3 -5
  62. data/spec/date_time/index_spec.rb +154 -17
  63. data/spec/date_time/offsets_spec.rb +3 -4
  64. data/spec/fixtures/empties.dat +2 -0
  65. data/spec/fixtures/strings.dat +2 -0
  66. data/spec/formatters/table_formatter_spec.rb +99 -0
  67. data/spec/helpers_spec.rb +8 -0
  68. data/spec/index/categorical_index_spec.rb +168 -0
  69. data/spec/index/index_spec.rb +283 -0
  70. data/spec/index/multi_index_spec.rb +570 -0
  71. data/spec/io/io_spec.rb +31 -4
  72. data/spec/io/sql_data_source_spec.rb +0 -1
  73. data/spec/iruby/dataframe_spec.rb +172 -0
  74. data/spec/iruby/helpers_spec.rb +49 -0
  75. data/spec/iruby/multi_index_spec.rb +37 -0
  76. data/spec/iruby/vector_spec.rb +107 -0
  77. data/spec/math/arithmetic/dataframe_spec.rb +71 -13
  78. data/spec/math/arithmetic/vector_spec.rb +8 -10
  79. data/spec/math/statistics/dataframe_spec.rb +3 -5
  80. data/spec/math/statistics/vector_spec.rb +45 -55
  81. data/spec/monkeys_spec.rb +32 -9
  82. data/spec/plotting/dataframe_spec.rb +386 -0
  83. data/spec/plotting/vector_spec.rb +230 -0
  84. data/spec/shared/vector_display_spec.rb +215 -0
  85. data/spec/spec_helper.rb +23 -0
  86. data/spec/vector_spec.rb +905 -138
  87. metadata +143 -11
  88. data/.rubocop_todo.yml +0 -44
  89. data/lib/daru/plotting/dataframe.rb +0 -104
  90. data/lib/daru/plotting/vector.rb +0 -38
  91. data/spec/daru_spec.rb +0 -58
  92. data/spec/index_spec.rb +0 -375
@@ -33,7 +33,7 @@ module Daru
33
33
  end
34
34
 
35
35
  def inspect
36
- "(#{self.class}:#{object_id} bool_arry=#{@barry})"
36
+ "#<#{self.class}:#{object_id} bool_arry=#{@barry}>"
37
37
  end
38
38
  end
39
39
 
@@ -56,17 +56,28 @@ module Daru
56
56
  )
57
57
  end
58
58
 
59
- def vector_where data, index, bool_array, dtype
60
- new_data = []
61
- new_index = []
62
- bool_array.to_a.each_with_index do |b, i|
63
- if b
64
- new_data << data[i]
65
- new_index << index[i]
66
- end
67
- end
59
+ def vector_where dv, bool_array
60
+ new_data, new_index = fetch_new_data_and_index dv, bool_array
61
+
62
+ resultant_dv = Daru::Vector.new new_data,
63
+ index: dv.index.class.new(new_index),
64
+ dtype: dv.dtype,
65
+ type: dv.type,
66
+ name: dv.name
67
+
68
+ # Preserve categories order for category vector
69
+ resultant_dv.categories = dv.categories if dv.category?
70
+ resultant_dv
71
+ end
72
+
73
+ private
68
74
 
69
- Daru::Vector.new(new_data, index: new_index, dtype: dtype)
75
+ def fetch_new_data_and_index dv, bool_array
76
+ barry = bool_array.to_a
77
+ positions = dv.size.times.select { |i| barry[i] }
78
+ new_data = dv.to_a.values_at(*positions)
79
+ new_index = dv.index.to_a.values_at(*positions)
80
+ [new_data, new_index]
70
81
  end
71
82
  end
72
83
  end
@@ -1,14 +1,17 @@
1
1
  require 'daru/accessors/dataframe_by_row.rb'
2
2
  require 'daru/maths/arithmetic/dataframe.rb'
3
3
  require 'daru/maths/statistics/dataframe.rb'
4
- require 'daru/plotting/dataframe.rb'
4
+ require 'daru/plotting/gruff.rb'
5
+ require 'daru/plotting/nyaplot.rb'
5
6
  require 'daru/io/io.rb'
6
7
 
7
8
  module Daru
8
- class DataFrame
9
+ class DataFrame # rubocop:disable Metrics/ClassLength
9
10
  include Daru::Maths::Arithmetic::DataFrame
10
11
  include Daru::Maths::Statistics::DataFrame
11
- include Daru::Plotting::DataFrame if Daru.has_nyaplot?
12
+ # TODO: Remove this line, but it's causing errors due to an unknown reason
13
+ include Daru::Plotting::DataFrame::NyaplotLibrary if Daru.has_nyaplot?
14
+ extend Gem::Deprecate
12
15
 
13
16
  class << self
14
17
  # Load data from a CSV file. Specify an optional block to grab the CSV
@@ -112,29 +115,17 @@ module Daru
112
115
  # Create DataFrame by specifying rows as an Array of Arrays or Array of
113
116
  # Daru::Vector objects.
114
117
  def rows source, opts={}
115
- first = source.first
116
-
117
118
  raise SizeError, 'All vectors must have same length' \
118
- unless source.all? { |v| v.size == first.size }
119
-
120
- index = []
121
- opts[:order] ||=
122
- case first
123
- when Daru::Vector # assume that all are Vectors
124
- index = source.map(&:name)
125
- first.index.to_a
126
- when Array
127
- Array.new(first.size, &:to_s)
128
- end
119
+ unless source.all? { |v| v.size == source.first.size }
129
120
 
130
- if source.all? { |s| s.is_a?(Array) }
131
- Daru::DataFrame.new(source.transpose, opts)
132
- else # array of Daru::Vectors
133
- Daru::DataFrame.new({}, opts).tap do |df|
134
- source.each_with_index do |row, idx|
135
- df[index[idx] || idx, :row] = row
136
- end
137
- end
121
+ opts[:order] ||= guess_order(source)
122
+
123
+ if ArrayHelper.array_of?(source, Array)
124
+ DataFrame.new(source.transpose, opts)
125
+ elsif ArrayHelper.array_of?(source, Vector)
126
+ from_vector_rows(source, opts)
127
+ else
128
+ raise ArgumentError, "Can't create DataFrame from #{source}"
138
129
  end
139
130
  end
140
131
 
@@ -161,36 +152,47 @@ module Daru
161
152
  raise 'Three vectors should be equal size' if
162
153
  rows.size != columns.size || rows.size!=values.size
163
154
 
164
- cols_values = columns.factors
165
- cols_n = cols_values.size
155
+ data = Hash.new { |h, col|
156
+ h[col] = rows.factors.map { |r| [r, nil] }.to_h
157
+ }
158
+ columns.zip(rows, values).each { |c, r, v| data[c][r] = v }
166
159
 
167
- h_rows = rows.factors.each_with_object({}) do |v, a|
168
- a[v] = cols_values.each_with_object({}) do |v1, a1|
169
- a1[v1]=nil
170
- end
171
- end
160
+ # FIXME: in fact, WITHOUT this line you'll obtain more "right"
161
+ # data: with vectors having "rows" as an index...
162
+ data = data.map { |c, r| [c, r.values] }.to_h
163
+ data[:_id] = rows.factors
164
+
165
+ DataFrame.new(data)
166
+ end
167
+
168
+ private
172
169
 
173
- values.each_index do |i|
174
- h_rows[rows[i]][columns[i]] = values[i]
170
+ def guess_order source
171
+ case source.first
172
+ when Vector # assume that all are Vectors
173
+ source.first.index.to_a
174
+ when Array
175
+ Array.new(source.first.size, &:to_s)
175
176
  end
176
- df = Daru::DataFrame.new({}, order: [:_id] + cols_values.to_a)
177
+ end
177
178
 
178
- rows.factors.each do |row|
179
- n_row = Array.new(cols_n+1)
180
- n_row[0] = row
181
- cols_values.each_index do |i|
182
- n_row[i+1] = h_rows[row][cols_values[i]]
183
- end
179
+ def from_vector_rows source, opts
180
+ index = source.map(&:name)
181
+ .each_with_index.map { |n, i| n || i }
182
+ index = ArrayHelper.recode_repeated(index)
184
183
 
185
- df.add_row(n_row)
184
+ DataFrame.new({}, opts).tap do |df|
185
+ source.each_with_index do |row, idx|
186
+ df[index[idx] || idx, :row] = row
187
+ end
186
188
  end
187
- df.update
188
- df
189
189
  end
190
190
  end
191
191
 
192
192
  # The vectors (columns) index of the DataFrame
193
193
  attr_reader :vectors
194
+ # TOREMOVE
195
+ attr_reader :data
194
196
 
195
197
  # The index of the rows of the DataFrame
196
198
  attr_reader :index
@@ -237,135 +239,181 @@ module Daru
237
239
  # # b 7 2
238
240
  # # c 8 3
239
241
  # # d 9 4
240
- def initialize source, opts={}
241
- vectors = opts[:order]
242
- index = opts[:index]
243
- clone = opts[:clone] == false ? false : true
244
- @data = []
245
-
246
- temp_name = opts[:name]
247
- @name = temp_name || SecureRandom.uuid
248
-
249
- if source.empty?
250
- @vectors = try_create_index vectors
251
- @index = try_create_index index
242
+ def initialize source, opts={} # rubocop:disable Metrics/MethodLength
243
+ vectors, index = opts[:order], opts[:index] # FIXME: just keyword args after Ruby 2.1
244
+ @data = []
245
+ @name = opts[:name]
246
+
247
+ case source
248
+ when ->(s) { s.empty? }
249
+ @vectors = Index.coerce vectors
250
+ @index = Index.coerce index
252
251
  create_empty_vectors
253
- else
254
- case source
255
- when Array
256
- if source.all? { |s| s.is_a?(Array) }
257
- raise ArgumentError, "Number of vectors (#{vectors.size}) should \
258
- equal order size (#{source.size})" if source.size != vectors.size
259
-
260
- @index = try_create_index(index || source[0].size)
261
- @vectors = try_create_index(vectors)
262
-
263
- @vectors.each_with_index do |_vec,idx|
264
- @data << Daru::Vector.new(source[idx], index: @index)
265
- end
266
- elsif source.all? { |s| s.is_a?(Daru::Vector) }
267
- hsh = {}
268
- vectors.each_with_index do |name, idx|
269
- hsh[name] = source[idx]
270
- end
271
- initialize(hsh, index: index, order: vectors, name: @name, clone: clone)
272
- else # array of hashes
273
- @vectors =
274
- if vectors.nil?
275
- Daru::Index.new source[0].keys
276
- else
277
- Daru::Index.new((vectors + (source[0].keys - vectors)).uniq)
278
- end
279
- @index = Daru::Index.new(index || source.size)
280
-
281
- @vectors.each do |name|
282
- v = []
283
- source.each do |h|
284
- v << (h[name] || h[name.to_s])
285
- end
286
-
287
- @data << Daru::Vector.new(v, name: set_name(name), index: @index)
288
- end
289
- end
290
- when Hash
291
- create_vectors_index_with vectors, source
292
- if all_daru_vectors_in_source? source
293
- vectors_have_same_index = all_vectors_have_equal_indexes?(source)
294
- if !index.nil?
295
- @index = try_create_index index
296
- elsif vectors_have_same_index
297
- @index = source.values[0].index.dup
298
- else
299
- all_indexes = []
300
- source.each_value do |vector|
301
- all_indexes << vector.index.to_a
302
- end
303
- # sort only if missing indexes detected
304
- all_indexes.flatten!.uniq!.sort!
305
-
306
- @index = Daru::Index.new all_indexes
307
- clone = true
308
- end
309
-
310
- if clone
311
- @vectors.each do |vector|
312
- # avoids matching indexes of vectors if all the supplied vectors
313
- # have the same index.
314
- if vectors_have_same_index
315
- v = source[vector].dup
316
- else
317
- v = Daru::Vector.new([], name: vector, metadata: source[vector].metadata.dup, index: @index)
318
-
319
- @index.each do |idx|
320
- v[idx] = source[vector].index.include?(idx) ? source[vector][idx] : nil
321
- end
322
- end
323
- @data << v
324
- end
325
- else
326
- @data.concat source.values
327
- end
328
- else
329
- @index = try_create_index(index || source.values[0].size)
330
-
331
- @vectors.each do |name|
332
- meta_opt = source[name].respond_to?(:metadata) ? {metadata: source[name].metadata.dup} : {}
333
- @data << Daru::Vector.new(source[name].dup, name: set_name(name), **meta_opt, index: @index)
334
- end
335
- end
336
- end
252
+ when Array
253
+ initialize_from_array source, vectors, index, opts
254
+ when Hash
255
+ initialize_from_hash source, vectors, index, opts
337
256
  end
338
257
 
339
258
  set_size
340
259
  validate
341
260
  update
261
+ self.plotting_library = Daru.plotting_library
342
262
  end
343
263
 
344
- def vector(*)
345
- $stderr.puts '#vector has been deprecated in favour of #[]. Please use that.'
346
- self[*names]
264
+ def plotting_library= lib
265
+ case lib
266
+ when :gruff, :nyaplot
267
+ @plotting_library = lib
268
+ extend Module.const_get(
269
+ "Daru::Plotting::DataFrame::#{lib.to_s.capitalize}Library"
270
+ ) if Daru.send("has_#{lib}?".to_sym)
271
+ else
272
+ raise ArguementError, "Plotting library #{lib} not supported. "\
273
+ 'Supported libraries are :nyaplot and :gruff'
274
+ end
347
275
  end
348
276
 
349
277
  # Access row or vector. Specify name of row/vector followed by axis(:row, :vector).
350
278
  # Defaults to *:vector*. Use of this method is not recommended for accessing
351
279
  # rows. Use df.row[:a] for accessing row with index ':a'.
352
280
  def [](*names)
353
- if names[-1] == :vector || names[-1] == :row
354
- axis = names[-1]
355
- names = names[0..-2]
281
+ axis = extract_axis(names, :vector)
282
+ dispatch_to_axis axis, :access, *names
283
+ end
284
+
285
+ # Retrieve rows by positions
286
+ # @param [Array<Integer>] *positions positions of rows to retrieve
287
+ # @return [Daru::Vector, Daru::DataFrame] vector for single position and dataframe for multiple positions
288
+ # @example
289
+ # df = Daru::DataFrame.new({
290
+ # a: [1, 2, 3],
291
+ # b: ['a', 'b', 'c']
292
+ # })
293
+ # df.row_at 1, 2
294
+ # # => #<Daru::DataFrame(2x2)>
295
+ # # a b
296
+ # # 1 2 b
297
+ # # 2 3 c
298
+ def row_at *positions
299
+ original_positions = positions
300
+ positions = coerce_positions(*positions, nrows)
301
+ validate_positions(*positions, nrows)
302
+
303
+ if positions.is_a? Integer
304
+ return Daru::Vector.new @data.map { |vec| vec.at(*positions) },
305
+ index: @vectors
356
306
  else
357
- axis = :vector
307
+ new_rows = @data.map { |vec| vec.at(*original_positions) }
308
+ return Daru::DataFrame.new new_rows,
309
+ index: @index.at(*original_positions),
310
+ order: @vectors
358
311
  end
312
+ end
359
313
 
360
- if axis == :vector
361
- access_vector(*names)
362
- elsif axis == :row
363
- access_row(*names)
314
+ # Set rows by positions
315
+ # @param [Array<Integer>] positions positions of rows to set
316
+ # @vector [Array, Daru::Vector] vector vector to be assigned
317
+ # @example
318
+ # df = Daru::DataFrame.new({
319
+ # a: [1, 2, 3],
320
+ # b: ['a', 'b', 'c']
321
+ # })
322
+ # df.set_row_at [0, 1], ['x', 'x']
323
+ # df
324
+ # #=> #<Daru::DataFrame(3x2)>
325
+ # # a b
326
+ # # 0 x x
327
+ # # 1 x x
328
+ # # 2 3 c
329
+ def set_row_at positions, vector
330
+ validate_positions(*positions, nrows)
331
+ vector =
332
+ if vector.is_a? Daru::Vector
333
+ vector.reindex @vectors
334
+ else
335
+ Daru::Vector.new vector
336
+ end
337
+
338
+ raise SizeError, 'Vector length should match row length' if
339
+ vector.size != @vectors.size
340
+
341
+ @data.each_with_index do |vec, pos|
342
+ vec.set_at(positions, vector.at(pos))
343
+ end
344
+ @index = @data[0].index
345
+ set_size
346
+ end
347
+
348
+ # Retrive vectors by positions
349
+ # @param [Array<Integer>] *positions positions of vectors to retrive
350
+ # @return [Daru::Vector, Daru::DataFrame] vector for single position and dataframe for multiple positions
351
+ # @example
352
+ # df = Daru::DataFrame.new({
353
+ # a: [1, 2, 3],
354
+ # b: ['a', 'b', 'c']
355
+ # })
356
+ # df.at 0
357
+ # # => #<Daru::Vector(3)>
358
+ # # a
359
+ # # 0 1
360
+ # # 1 2
361
+ # # 2 3
362
+ def at *positions
363
+ if AXES.include? positions.last
364
+ axis = positions.pop
365
+ return row_at(*positions) if axis == :row
366
+ end
367
+
368
+ original_positions = positions
369
+ positions = coerce_positions(*positions, ncols)
370
+ validate_positions(*positions, ncols)
371
+
372
+ if positions.is_a? Integer
373
+ @data[positions].dup
364
374
  else
365
- raise IndexError, "Expected axis to be row or vector not #{axis}"
375
+ Daru::DataFrame.new positions.map { |pos| @data[pos].dup },
376
+ index: @index,
377
+ order: @vectors.at(*original_positions),
378
+ name: @name
366
379
  end
367
380
  end
368
381
 
382
+ # Set vectors by positions
383
+ # @param [Array<Integer>] positions positions of vectors to set
384
+ # @param [Array, Daru::Vector] vector vector to be assigned
385
+ # @example
386
+ # df = Daru::DataFrame.new({
387
+ # a: [1, 2, 3],
388
+ # b: ['a', 'b', 'c']
389
+ # })
390
+ # df.set_at [0], ['x', 'y', 'z']
391
+ # df
392
+ # #=> #<Daru::DataFrame(3x2)>
393
+ # # a b
394
+ # # 0 x a
395
+ # # 1 y b
396
+ # # 2 z c
397
+ def set_at positions, vector
398
+ if positions.last == :row
399
+ positions.pop
400
+ return set_row_at(positions, vector)
401
+ end
402
+
403
+ validate_positions(*positions, ncols)
404
+ vector =
405
+ if vector.is_a? Daru::Vector
406
+ vector.reindex @index
407
+ else
408
+ Daru::Vector.new vector
409
+ end
410
+
411
+ raise SizeError, 'Vector length should match index length' if
412
+ vector.size != @index.size
413
+
414
+ positions.each { |pos| @data[pos] = vector }
415
+ end
416
+
369
417
  # Insert a new row/vector of the specified name or modify a previous row.
370
418
  # Instead of using this method directly, use df.row[:a] = [1,2,3] to set/create
371
419
  # a row ':a' to [1,2,3], or df.vector[:vec] = [1,2,3] for vectors.
@@ -374,25 +422,11 @@ module Daru
374
422
  # of the vector will be matched against the row/vector indexes of the DataFrame
375
423
  # before an insertion is performed. Unmatched indexes will be set to nil.
376
424
  def []=(*args)
377
- axis = args.include?(:row) ? :row : :vector
378
- args.delete :vector
379
- args.delete :row
425
+ vector = args.pop
426
+ axis = extract_axis(args)
427
+ names = args
380
428
 
381
- name = args[0..-2]
382
- vector = args[-1]
383
-
384
- if axis == :vector
385
- insert_or_modify_vector name, vector
386
- elsif axis == :row
387
- insert_or_modify_row name, vector
388
- else
389
- raise IndexError, "Expected axis to be row or vector, not #{axis}."
390
- end
391
- end
392
-
393
- # Access a vector by name.
394
- def column name
395
- vector[name]
429
+ dispatch_to_axis axis, :insert_or_modify, names, vector
396
430
  end
397
431
 
398
432
  def add_row row, index=nil
@@ -421,10 +455,7 @@ module Daru
421
455
  def dup vectors_to_dup=nil
422
456
  vectors_to_dup = @vectors.to_a unless vectors_to_dup
423
457
 
424
- src = []
425
- vectors_to_dup.each do |vec|
426
- src << @data[@vectors[vec]].dup
427
- end
458
+ src = vectors_to_dup.map { |vec| @data[@vectors[vec]].dup }
428
459
  new_order = Daru::Index.new(vectors_to_dup)
429
460
 
430
461
  Daru::DataFrame.new src, order: new_order, index: @index.dup, name: @name, clone: true
@@ -443,20 +474,18 @@ module Daru
443
474
  # +vectors_to_clone+ - Names of vectors to clone. Optional. Will return
444
475
  # a view of the whole data frame otherwise.
445
476
  def clone *vectors_to_clone
446
- vectors_to_clone.flatten! unless vectors_to_clone.all? { |a| !a.is_a?(Array) }
477
+ vectors_to_clone.flatten! if ArrayHelper.array_of?(vectors_to_clone, Array)
447
478
  vectors_to_clone = @vectors.to_a if vectors_to_clone.empty?
448
479
 
449
- h = vectors_to_clone.each_with_object({}) do |vec, hsh|
450
- hsh[vec] = self[vec]
451
- end
452
- Daru::DataFrame.new(h, clone: false)
480
+ h = vectors_to_clone.map { |vec| [vec, self[vec]] }.to_h
481
+ Daru::DataFrame.new(h, clone: false, order: vectors_to_clone, name: @name)
453
482
  end
454
483
 
455
484
  # Returns a 'shallow' copy of DataFrame if missing data is not present,
456
485
  # or a full copy of only valid data if missing data is present.
457
486
  def clone_only_valid
458
- if has_missing_data?
459
- dup_only_valid
487
+ if include_values?(*Daru::MISSING_VALUES)
488
+ reject_values(*Daru::MISSING_VALUES)
460
489
  else
461
490
  clone
462
491
  end
@@ -465,19 +494,76 @@ module Daru
465
494
  # Creates a new duplicate dataframe containing only rows
466
495
  # without a single missing value.
467
496
  def dup_only_valid vecs=nil
468
- rows_with_nil = @data.each_with_object([]) do |vector, memo|
469
- memo.concat vector.missing_positions
470
- end.uniq
497
+ rows_with_nil = @data.map { |vec| vec.indexes(*Daru::MISSING_VALUES) }
498
+ .inject(&:concat)
499
+ .uniq
471
500
 
472
501
  row_indexes = @index.to_a
473
502
  (vecs.nil? ? self : dup(vecs)).row[*(row_indexes - rows_with_nil)]
474
503
  end
504
+ deprecate :dup_only_valid, :reject_values, 2016, 10
505
+
506
+ # Returns a dataframe in which rows with any of the mentioned values
507
+ # are ignored.
508
+ # @param [Array] *values values to reject to form the new dataframe
509
+ # @return [Daru::DataFrame] Data Frame with only rows which doesn't
510
+ # contain the mentioned values
511
+ # @example
512
+ # df = Daru::DataFrame.new({
513
+ # a: [1, 2, 3, nil, Float::NAN, nil, 1, 7],
514
+ # b: [:a, :b, nil, Float::NAN, nil, 3, 5, 8],
515
+ # c: ['a', Float::NAN, 3, 4, 3, 5, nil, 7]
516
+ # }, index: 11..18)
517
+ # df.reject_values nil, Float::NAN
518
+ # # => #<Daru::DataFrame(2x3)>
519
+ # # a b c
520
+ # # 11 1 a a
521
+ # # 18 7 8 7
522
+ def reject_values(*values)
523
+ positions =
524
+ size.times.to_a - @data.flat_map { |vec| vec.positions(*values) }
525
+ # Handle the case when positions size is 1 and #row_at wouldn't return a df
526
+ if positions.size == 1
527
+ pos = positions.first
528
+ row_at(pos..pos)
529
+ else
530
+ row_at(*positions)
531
+ end
532
+ end
533
+
534
+ # Replace specified values with given value
535
+ # @param [Array] old_values values to replace with new value
536
+ # @param [object] new_value new value to replace with
537
+ # @return [Daru::DataFrame] Data Frame itself with old values replace
538
+ # with new value
539
+ # @example
540
+ # df = Daru::DataFrame.new({
541
+ # a: [1, 2, 3, nil, Float::NAN, nil, 1, 7],
542
+ # b: [:a, :b, nil, Float::NAN, nil, 3, 5, 8],
543
+ # c: ['a', Float::NAN, 3, 4, 3, 5, nil, 7]
544
+ # }, index: 11..18)
545
+ # df
546
+ # # => #<Daru::DataFrame(8x3)>
547
+ # # a b c
548
+ # # 11 1 a a
549
+ # # 12 2 b NaN
550
+ # # 13 3 NaN 3
551
+ # # 14 NaN NaN 4
552
+ # # 15 NaN NaN 3
553
+ # # 16 NaN 3 5
554
+ # # 17 1 5 NaN
555
+ # # 18 7 8 7
556
+ def replace_values old_values, new_value
557
+ @data.each { |vec| vec.replace_values old_values, new_value }
558
+ self
559
+ end
475
560
 
476
561
  # Iterate over each index of the DataFrame.
477
562
  def each_index &block
478
563
  return to_enum(:each_index) unless block_given?
479
564
 
480
565
  @index.each(&block)
566
+
481
567
  self
482
568
  end
483
569
 
@@ -509,8 +595,8 @@ module Daru
509
595
  def each_row
510
596
  return to_enum(:each_row) unless block_given?
511
597
 
512
- @index.each do |index|
513
- yield access_row(index)
598
+ @index.size.times do |pos|
599
+ yield row_at(pos)
514
600
  end
515
601
 
516
602
  self
@@ -540,13 +626,7 @@ module Daru
540
626
  # * +axis+ - The axis to iterate over. Can be :vector (or :column)
541
627
  # or :row. Default to :vector.
542
628
  def each axis=:vector, &block
543
- if axis == :vector || axis == :column
544
- each_vector(&block)
545
- elsif axis == :row
546
- each_row(&block)
547
- else
548
- raise ArgumentError, "Unknown axis #{axis}"
549
- end
629
+ dispatch_to_axis axis, :each, &block
550
630
  end
551
631
 
552
632
  # Iterate over a row or vector and return results in a Daru::Vector.
@@ -565,13 +645,7 @@ module Daru
565
645
  # * +axis+ - The axis to iterate over. Can be :vector (or :column)
566
646
  # or :row. Default to :vector.
567
647
  def collect axis=:vector, &block
568
- if axis == :vector || axis == :column
569
- collect_vectors(&block)
570
- elsif axis == :row
571
- collect_rows(&block)
572
- else
573
- raise ArgumentError, "Unknown axis #{axis}"
574
- end
648
+ dispatch_to_axis_pl axis, :collect, &block
575
649
  end
576
650
 
577
651
  # Map over each vector or row of the data frame according to
@@ -591,13 +665,7 @@ module Daru
591
665
  # * +axis+ - The axis to map over. Can be :vector (or :column) or :row.
592
666
  # Default to :vector.
593
667
  def map axis=:vector, &block
594
- if axis == :vector || axis == :column
595
- map_vectors(&block)
596
- elsif axis == :row
597
- map_rows(&block)
598
- else
599
- raise ArgumentError, "Unknown axis #{axis}"
600
- end
668
+ dispatch_to_axis_pl axis, :map, &block
601
669
  end
602
670
 
603
671
  # Destructive map. Modifies the DataFrame. Each run of the block
@@ -634,11 +702,7 @@ module Daru
634
702
  # * +axis+ - The axis to map over. Can be :vector (or :column) or :row.
635
703
  # Default to :vector.
636
704
  def recode axis=:vector, &block
637
- if axis == :vector || axis == :column
638
- recode_vectors(&block)
639
- elsif axis == :row
640
- recode_rows(&block)
641
- end
705
+ dispatch_to_axis_pl axis, :recode, &block
642
706
  end
643
707
 
644
708
  # Retain vectors or rows if the block returns a truthy value.
@@ -670,50 +734,34 @@ module Daru
670
734
  # row[:a] + row[:d] < 100
671
735
  # end
672
736
  def filter axis=:vector, &block
673
- if axis == :vector || axis == :column
674
- filter_vectors(&block)
675
- elsif axis == :row
676
- filter_rows(&block)
677
- end
737
+ dispatch_to_axis_pl axis, :filter, &block
678
738
  end
679
739
 
680
740
  def recode_vectors
681
741
  block_given? or return to_enum(:recode_vectors)
682
742
 
683
- df = dup
684
- df.each_vector_with_index do |v, i|
685
- ret = yield v
686
- ret.is_a?(Daru::Vector) or
687
- raise TypeError, "Every iteration must return Daru::Vector not #{ret.class}"
688
- df[*i] = ret
743
+ dup.tap do |df|
744
+ df.each_vector_with_index do |v, i|
745
+ df[*i] = should_be_vector!(yield(v))
746
+ end
689
747
  end
690
-
691
- df
692
748
  end
693
749
 
694
750
  def recode_rows
695
751
  block_given? or return to_enum(:recode_rows)
696
752
 
697
- df = dup
698
- df.each_row_with_index do |r, i|
699
- ret = yield r
700
- ret.is_a?(Daru::Vector) or raise TypeError, "Every iteration must return Daru::Vector not #{ret.class}"
701
- df.row[i] = ret
753
+ dup.tap do |df|
754
+ df.each_row_with_index do |r, i|
755
+ df.row[i] = should_be_vector!(yield(r))
756
+ end
702
757
  end
703
-
704
- df
705
758
  end
706
759
 
707
760
  # Map each vector and return an Array.
708
- def map_vectors
761
+ def map_vectors &block
709
762
  return to_enum(:map_vectors) unless block_given?
710
763
 
711
- arry = []
712
- @data.each do |vec|
713
- arry << yield(vec)
714
- end
715
-
716
- arry
764
+ @data.map(&block)
717
765
  end
718
766
 
719
767
  # Destructive form of #map_vectors
@@ -721,56 +769,37 @@ module Daru
721
769
  return to_enum(:map_vectors!) unless block_given?
722
770
 
723
771
  vectors.dup.each do |n|
724
- v = yield self[n]
725
- v.is_a?(Daru::Vector) or raise TypeError, "Must return a Daru::Vector not #{v.class}"
726
- self[n] = v
772
+ self[n] = should_be_vector!(yield(self[n]))
727
773
  end
728
774
 
729
775
  self
730
776
  end
731
777
 
732
778
  # Map vectors alongwith the index.
733
- def map_vectors_with_index
779
+ def map_vectors_with_index &block
734
780
  return to_enum(:map_vectors_with_index) unless block_given?
735
781
 
736
- dt = []
737
- each_vector_with_index do |vector, name|
738
- dt << yield(vector, name)
739
- end
740
-
741
- dt
782
+ each_vector_with_index.map(&block)
742
783
  end
743
784
 
744
785
  # Map each row
745
- def map_rows
786
+ def map_rows &block
746
787
  return to_enum(:map_rows) unless block_given?
747
788
 
748
- dt = []
749
- each_row do |row|
750
- dt << yield(row)
751
- end
752
-
753
- dt
789
+ each_row.map(&block)
754
790
  end
755
791
 
756
- def map_rows_with_index
792
+ def map_rows_with_index &block
757
793
  return to_enum(:map_rows_with_index) unless block_given?
758
794
 
759
- dt = []
760
- each_row_with_index do |row, index|
761
- dt << yield(row, index)
762
- end
763
-
764
- dt
795
+ each_row_with_index.map(&block)
765
796
  end
766
797
 
767
798
  def map_rows!
768
799
  return to_enum(:map_rows!) unless block_given?
769
800
 
770
801
  index.dup.each do |i|
771
- r = yield row[i]
772
- r.is_a?(Daru::Vector) or raise TypeError, "Returned object must be Daru::Vector not #{r.class}"
773
- row[i] = r
802
+ row[i] = should_be_vector!(yield(row[i]))
774
803
  end
775
804
 
776
805
  self
@@ -778,55 +807,38 @@ module Daru
778
807
 
779
808
  # Retrieves a Daru::Vector, based on the result of calculation
780
809
  # performed on each row.
781
- def collect_rows
810
+ def collect_rows &block
782
811
  return to_enum(:collect_rows) unless block_given?
783
812
 
784
- data = []
785
- each_row do |row|
786
- data.push yield(row)
787
- end
788
-
789
- Daru::Vector.new(data, index: @index)
813
+ Daru::Vector.new(each_row.map(&block), index: @index)
790
814
  end
791
815
 
792
- def collect_row_with_index
816
+ def collect_row_with_index &block
793
817
  return to_enum(:collect_row_with_index) unless block_given?
794
818
 
795
- data = []
796
- each_row_with_index do |row, i|
797
- data.push yield(row, i)
798
- end
799
-
800
- Daru::Vector.new(data, index: @index)
819
+ Daru::Vector.new(each_row_with_index.map(&block), index: @index)
801
820
  end
802
821
 
803
822
  # Retrieves a Daru::Vector, based on the result of calculation
804
823
  # performed on each vector.
805
- def collect_vectors
824
+ def collect_vectors &block
806
825
  return to_enum(:collect_vectors) unless block_given?
807
826
 
808
- data = []
809
- each_vector do |vec|
810
- data.push yield(vec)
811
- end
812
-
813
- Daru::Vector.new(data, index: @vectors)
827
+ Daru::Vector.new(each_vector.map(&block), index: @vectors)
814
828
  end
815
829
 
816
- def collect_vector_with_index
830
+ def collect_vector_with_index &block
817
831
  return to_enum(:collect_vector_with_index) unless block_given?
818
832
 
819
- data = []
820
- each_vector_with_index do |vec, i|
821
- data.push yield(vec, i)
822
- end
823
-
824
- Daru::Vector.new(data, index: @vectors)
833
+ Daru::Vector.new(each_vector_with_index.map(&block), index: @vectors)
825
834
  end
826
835
 
827
836
  # Generate a matrix, based on vector names of the DataFrame.
828
837
  #
829
838
  # @return {::Matrix}
839
+ # :nocov:
840
+ # FIXME: Even not trying to cover this: I can't get, how it is expected
841
+ # to work.... -- zverok
830
842
  def collect_matrix
831
843
  return to_enum(:collect_matrix) unless block_given?
832
844
 
@@ -839,6 +851,7 @@ module Daru
839
851
 
840
852
  Matrix.rows(rows)
841
853
  end
854
+ # :nocov:
842
855
 
843
856
  # Delete a vector
844
857
  def delete_vector vector
@@ -876,43 +889,29 @@ module Daru
876
889
  # @return {Daru::DataFrame}
877
890
  def bootstrap(n=nil)
878
891
  n ||= nrows
879
- ds_boot = Daru::DataFrame.new({}, order: @vectors)
880
- n.times do
881
- ds_boot.add_row(row[rand(n)])
892
+ Daru::DataFrame.new({}, order: @vectors).tap do |df_boot|
893
+ n.times do
894
+ df_boot.add_row(row[rand(n)])
895
+ end
896
+ df_boot.update
882
897
  end
883
- ds_boot.update
884
- ds_boot
885
898
  end
886
899
 
887
900
  def keep_row_if
888
- deletion = []
889
-
890
- @index.each do |index|
891
- keep_row = yield access_row(index)
892
-
893
- deletion << index unless keep_row
894
- end
895
- deletion.each { |idx|
896
- delete_row idx
897
- }
901
+ @index
902
+ .reject { |idx| yield access_row(idx) }
903
+ .each { |idx| delete_row idx }
898
904
  end
899
905
 
900
906
  def keep_vector_if
901
907
  @vectors.each do |vector|
902
- keep_vector = yield @data[@vectors[vector]], vector
903
-
904
- delete_vector vector unless keep_vector
908
+ delete_vector(vector) unless yield(@data[@vectors[vector]], vector)
905
909
  end
906
910
  end
907
911
 
908
912
  # Creates a new vector with the data of a given field for the rows where the block returns true
909
- def filter_vector vec
910
- d = []
911
- each_row do |row|
912
- d.push(row[vec]) if yield row
913
- end
914
-
915
- Daru::Vector.new(d, metadata: self[vec].metadata.dup)
913
+ def filter_vector vec, &block
914
+ Daru::Vector.new each_row.select(&block).map { |row| row[vec] }
916
915
  end
917
916
 
918
917
  # Iterates over each row and retains it in a new DataFrame if the block returns
@@ -930,38 +929,24 @@ module Daru
930
929
  def filter_vectors &block
931
930
  return to_enum(:filter_vectors) unless block_given?
932
931
 
933
- df = dup
934
- df.keep_vector_if(&block)
935
-
936
- df
932
+ dup.tap { |df| df.keep_vector_if(&block) }
937
933
  end
938
934
 
939
935
  # Test each row with one or more tests. Each test is a Proc with the form
940
936
  # *Proc.new {|row| row[:age] > 0}*
941
937
  #
942
938
  # The function returns an array with all errors.
939
+ #
940
+ # FIXME: description here is too sparse. As far as I can get,
941
+ # it should tell something about that each test is [descr, fields, block],
942
+ # and that first value may be column name to output. - zverok, 2016-05-18
943
943
  def verify(*tests)
944
- if tests[0].is_a? Symbol
945
- id = tests[0]
946
- tests.shift
947
- else
948
- id = @vectors.first
949
- end
950
-
951
- vr = []
952
- i = 0
953
- each(:row) do |row|
954
- i += 1
955
- tests.each do |test|
956
- next if test[2].call(row)
957
- values = ''
958
- unless test[1].empty?
959
- values = ' (' + test[1].collect { |k| "#{k}=#{row[k]}" }.join(', ') + ')'
960
- end
961
- vr.push("#{i} [#{row[id]}]: #{test[0]}#{values}")
962
- end
963
- end
964
- vr
944
+ id = tests.first.is_a?(Symbol) ? tests.shift : @vectors.first
945
+
946
+ each_row_with_index.map do |row, i|
947
+ tests.reject { |*_, block| block.call(row) }
948
+ .map { |test| verify_error_message row, test, id, i }
949
+ end.flatten
965
950
  end
966
951
 
967
952
  # DSL for yielding each row and returning a Daru::Vector based on the
@@ -984,10 +969,7 @@ module Daru
984
969
  # # 5 666
985
970
  # # 6 777
986
971
  def vector_by_calculation &block
987
- a = []
988
- each_row do |r|
989
- a.push r.instance_eval(&block)
990
- end
972
+ a = each_row.map { |r| r.instance_eval(&block) }
991
973
 
992
974
  Daru::Vector.new a, index: @index
993
975
  end
@@ -1016,10 +998,8 @@ module Daru
1016
998
  # * +missing_values+ - An Array of the values that should be
1017
999
  # treated as 'missing'. The default missing value is *nil*.
1018
1000
  def missing_values_rows missing_values=[nil]
1019
- number_of_missing = []
1020
- each_row do |row|
1021
- row.missing_values = missing_values
1022
- number_of_missing << row.missing_positions.size
1001
+ number_of_missing = each_row.map do |row|
1002
+ row.indexes(*missing_values).size
1023
1003
  end
1024
1004
 
1025
1005
  Daru::Vector.new number_of_missing, index: @index, name: "#{@name}_missing_rows"
@@ -1029,67 +1009,77 @@ module Daru
1029
1009
  alias :vector_missing_values :missing_values_rows
1030
1010
 
1031
1011
  def has_missing_data?
1032
- !!@data.any?(&:has_missing_data?)
1012
+ !!@data.any? { |vec| vec.include_values?(*Daru::MISSING_VALUES) }
1033
1013
  end
1034
-
1035
1014
  alias :flawed? :has_missing_data?
1015
+ deprecate :has_missing_data?, :include_values?, 2016, 10
1016
+ deprecate :flawed?, :include_values?, 2016, 10
1017
+
1018
+ # Check if any of given values occur in the data frame
1019
+ # @param [Array] *values values to check for
1020
+ # @return [true, false] true if any of the given values occur in the
1021
+ # dataframe, false otherwise
1022
+ # @example
1023
+ # df = Daru::DataFrame.new({
1024
+ # a: [1, 2, 3, nil, Float::NAN, nil, 1, 7],
1025
+ # b: [:a, :b, nil, Float::NAN, nil, 3, 5, 8],
1026
+ # c: ['a', Float::NAN, 3, 4, 3, 5, nil, 7]
1027
+ # }, index: 11..18)
1028
+ # df.include_values? nil
1029
+ # # => true
1030
+ def include_values?(*values)
1031
+ @data.any? { |vec| vec.include_values?(*values) }
1032
+ end
1036
1033
 
1037
1034
  # Return a nested hash using vector names as keys and an array constructed of
1038
1035
  # hashes with other values. If block provided, is used to provide the
1039
1036
  # values, with parameters +row+ of dataset, +current+ last hash on
1040
1037
  # hierarchy and +name+ of the key to include
1041
- def nest *tree_keys, &block
1038
+ def nest *tree_keys, &_block
1042
1039
  tree_keys = tree_keys[0] if tree_keys[0].is_a? Array
1043
- out = {}
1044
1040
 
1045
- each_row do |row|
1046
- current = out
1041
+ each_row.each_with_object({}) do |row, current|
1047
1042
  # Create tree
1048
- tree_keys[0, tree_keys.size-1].each do |f|
1049
- root = row[f]
1050
- current[root] ||= {}
1051
- current = current[root]
1052
- end
1053
- name = row[tree_keys.last]
1054
- if !block
1043
+ *keys, last = tree_keys
1044
+ current = keys.inject(current) { |c, f| c[row[f]] ||= {} }
1045
+ name = row[last]
1046
+
1047
+ if block_given?
1048
+ current[name] = yield(row, current, name)
1049
+ else
1055
1050
  current[name] ||= []
1056
1051
  current[name].push(row.to_h.delete_if { |key,_value| tree_keys.include? key })
1057
- else
1058
- current[name] = yield(row, current, name)
1059
1052
  end
1060
1053
  end
1061
-
1062
- out
1063
1054
  end
1064
1055
 
1065
1056
  def vector_count_characters vecs=nil
1066
1057
  vecs ||= @vectors.to_a
1067
1058
 
1068
1059
  collect_rows do |row|
1069
- vecs.inject(0) do |memo, vec|
1070
- memo + (row[vec].nil? ? 0 : row[vec].to_s.size)
1071
- end
1060
+ vecs.map { |v| row[v].to_s.size }.inject(:+)
1072
1061
  end
1073
1062
  end
1074
1063
 
1075
1064
  def add_vectors_by_split(name,join='-',sep=Daru::SPLIT_TOKEN)
1076
- split = self[name].split_by_separator(sep)
1077
- split.each { |k,v| self[(name.to_s + join + k.to_s).to_sym] = v }
1065
+ self[name]
1066
+ .split_by_separator(sep)
1067
+ .each { |k,v| self["#{name}#{join}#{k}".to_sym] = v }
1078
1068
  end
1079
1069
 
1080
1070
  # Return the number of rows and columns of the DataFrame in an Array.
1081
1071
  def shape
1082
- [@index.size, @vectors.size]
1072
+ [nrows, ncols]
1083
1073
  end
1084
1074
 
1085
1075
  # The number of rows
1086
1076
  def nrows
1087
- shape[0]
1077
+ @index.size
1088
1078
  end
1089
1079
 
1090
1080
  # The number of vectors
1091
1081
  def ncols
1092
- shape[1]
1082
+ @vectors.size
1093
1083
  end
1094
1084
 
1095
1085
  # Check if a vector is present
@@ -1132,10 +1122,7 @@ module Daru
1132
1122
  if axis == :vector || axis == :column
1133
1123
  @data.all?(&block)
1134
1124
  elsif axis == :row
1135
- each_row do |row|
1136
- return false unless yield(row)
1137
- end
1138
- return true
1125
+ each_row.all?(&block)
1139
1126
  else
1140
1127
  raise ArgumentError, "Unidentified axis #{axis}"
1141
1128
  end
@@ -1145,7 +1132,7 @@ module Daru
1145
1132
  #
1146
1133
  # @param [Fixnum] quantity (10) The number of elements to display from the top.
1147
1134
  def head quantity=10
1148
- self[0..(quantity-1), :row]
1135
+ row.at 0..(quantity-1)
1149
1136
  end
1150
1137
 
1151
1138
  alias :first :head
@@ -1154,22 +1141,19 @@ module Daru
1154
1141
  #
1155
1142
  # @param [Fixnum] quantity (10) The number of elements to display from the bottom.
1156
1143
  def tail quantity=10
1157
- self[(@size - quantity)..(@size-1), :row]
1144
+ start = [-quantity, -size].max
1145
+ row.at start..-1
1158
1146
  end
1159
1147
 
1160
1148
  alias :last :tail
1161
1149
 
1162
1150
  # Returns a vector with sum of all vectors specified in the argument.
1163
- # Tf vecs parameter is empty, sum all numeric vector.
1151
+ # If vecs parameter is empty, sum all numeric vectors.
1164
1152
  def vector_sum vecs=nil
1165
1153
  vecs ||= numeric_vectors
1166
1154
  sum = Daru::Vector.new [0]*@size, index: @index, name: @name, dtype: @dtype
1167
1155
 
1168
- vecs.each do |n|
1169
- sum += self[n]
1170
- end
1171
-
1172
- sum
1156
+ vecs.inject(sum) { |memo, n| memo + self[n] }
1173
1157
  end
1174
1158
 
1175
1159
  # Calculate mean of the rows of the dataframe.
@@ -1179,13 +1163,13 @@ module Daru
1179
1163
  # * +max_missing+ - The maximum number of elements in the row that can be
1180
1164
  # zero for the mean calculation to happen. Default to 0.
1181
1165
  def vector_mean max_missing=0
1166
+ # FIXME: in vector_sum we preserve created vector dtype, but
1167
+ # here we are not. Is this by design or ...? - zverok, 2016-05-18
1182
1168
  mean_vec = Daru::Vector.new [0]*@size, index: @index, name: "mean_#{@name}"
1183
1169
 
1184
- each_row_with_index do |row, i|
1185
- mean_vec[i] = row.missing_positions.size > max_missing ? nil : row.mean
1170
+ each_row_with_index.each_with_object(mean_vec) do |(row, i), memo|
1171
+ memo[i] = row.indexes(*Daru::MISSING_VALUES).size > max_missing ? nil : row.mean
1186
1172
  end
1187
-
1188
- mean_vec
1189
1173
  end
1190
1174
 
1191
1175
  # Group elements by vector to perform operations on them. Returns a
@@ -1214,6 +1198,8 @@ module Daru
1214
1198
  # # ["foo", "two", 3]=>[2, 4]}
1215
1199
  def group_by *vectors
1216
1200
  vectors.flatten!
1201
+ # FIXME: wouldn't it better to do vectors - @vectors here and
1202
+ # raise one error with all non-existent vector names?.. - zverok, 2016-05-18
1217
1203
  vectors.each { |v|
1218
1204
  raise(ArgumentError, "Vector #{v} does not exist") unless has_vector?(v)
1219
1205
  }
@@ -1226,28 +1212,22 @@ module Daru
1226
1212
  "subclasses, not #{new_index.class}" unless new_vectors.is_a?(Daru::Index)
1227
1213
 
1228
1214
  cl = Daru::DataFrame.new({}, order: new_vectors, index: @index, name: @name)
1229
- new_vectors.each do |vec|
1230
- cl[vec] = @vectors.include?(vec) ? self[vec] : cl[vec] = [nil]*nrows
1215
+ new_vectors.each_with_object(cl) do |vec, memo|
1216
+ memo[vec] = @vectors.include?(vec) ? self[vec] : [nil]*nrows
1231
1217
  end
1218
+ end
1232
1219
 
1233
- cl
1220
+ def get_vector_anyways(v)
1221
+ @vectors.include?(v) ? self[v].to_a : [nil] * size
1234
1222
  end
1235
1223
 
1236
1224
  # Concatenate another DataFrame along corresponding columns.
1237
1225
  # If columns do not exist in both dataframes, they are filled with nils
1238
1226
  def concat other_df
1239
- vectors = @vectors.to_a
1240
- data = []
1227
+ vectors = (@vectors.to_a + other_df.vectors.to_a).uniq
1241
1228
 
1242
- vectors.each do |v|
1243
- other_vec = other_df.vectors.include?(v) ? other_df[v].to_a : [nil] * other_df.size
1244
- data << self[v].dup.to_a.concat(other_vec)
1245
- end
1246
-
1247
- other_df.vectors.each do |v|
1248
- next if vectors.include?(v)
1249
- vectors << v
1250
- data << ([nil] * size).concat(other_df[v].to_a)
1229
+ data = vectors.map do |v|
1230
+ get_vector_anyways(v).dup.concat(other_df.get_vector_anyways(v))
1251
1231
  end
1252
1232
 
1253
1233
  Daru::DataFrame.new(data, order: vectors)
@@ -1291,11 +1271,9 @@ module Daru
1291
1271
  "subclasses, not #{new_index.class}" unless new_index.is_a?(Daru::Index)
1292
1272
 
1293
1273
  cl = Daru::DataFrame.new({}, order: @vectors, index: new_index, name: @name)
1294
- new_index.each do |idx|
1295
- cl.row[idx] = @index.include?(idx) ? row[idx] : [nil]*ncols
1274
+ new_index.each_with_object(cl) do |idx, memo|
1275
+ memo.row[idx] = @index.include?(idx) ? row[idx] : [nil]*ncols
1296
1276
  end
1297
-
1298
- cl
1299
1277
  end
1300
1278
 
1301
1279
  # Reassign index with a new index of type Daru::Index or any of its subclasses.
@@ -1310,8 +1288,8 @@ module Daru
1310
1288
  # df.index.to_a #=> ['a','b','c','d']
1311
1289
  # df.row['a'].to_a #=> [1,11]
1312
1290
  def index= idx
1313
- @data.each { |vec| vec.index = idx }
1314
- @index = idx
1291
+ @index = Index.coerce idx
1292
+ @data.each { |vec| vec.index = @index }
1315
1293
 
1316
1294
  self
1317
1295
  end
@@ -1361,21 +1339,14 @@ module Daru
1361
1339
  # Return the indexes of all the numeric vectors. Will include vectors with nils
1362
1340
  # along with numbers.
1363
1341
  def numeric_vectors
1364
- numerics = []
1365
-
1366
- each_vector_with_index do |vec, i|
1367
- numerics << i if vec.type == :numeric
1368
- end
1369
- numerics
1342
+ # FIXME: Why _with_index ?..
1343
+ each_vector_with_index
1344
+ .select { |vec, _i| vec.numeric? }
1345
+ .map(&:last)
1370
1346
  end
1371
1347
 
1372
1348
  def numeric_vector_names
1373
- numerics = []
1374
-
1375
- @vectors.each do |v|
1376
- numerics << v if self[v].type == :numeric
1377
- end
1378
- numerics
1349
+ @vectors.select { |v| self[v].numeric? }
1379
1350
  end
1380
1351
 
1381
1352
  # Return a DataFrame of only the numerical Vectors. If clone: false
@@ -1383,12 +1354,9 @@ module Daru
1383
1354
  # returned. Defaults to clone: true.
1384
1355
  def only_numerics opts={}
1385
1356
  cln = opts[:clone] == false ? false : true
1386
- nv = numeric_vectors
1387
- arry = nv.each_with_object([]) do |v, arr|
1388
- arr << self[v]
1389
- end
1357
+ arry = numeric_vectors.map { |v| self[v] }
1390
1358
 
1391
- order = Index.new(nv)
1359
+ order = Index.new(numeric_vectors)
1392
1360
  Daru::DataFrame.new(arry, clone: cln, order: order, index: @index)
1393
1361
  end
1394
1362
 
@@ -1492,39 +1460,24 @@ module Daru
1492
1460
 
1493
1461
  def sort! vector_order, opts={}
1494
1462
  raise ArgumentError, 'Required atleast one vector name' if vector_order.empty?
1495
- opts = {
1496
- ascending: true,
1497
- handle_nils: false,
1498
- by: {}
1499
- }.merge(opts)
1500
1463
 
1501
- opts[:ascending] = sort_order_array vector_order, opts[:ascending]
1502
- opts[:handle_nils] = handle_nils_array vector_order, opts[:handle_nils]
1503
- blocks = create_logic_blocks vector_order, opts[:by], opts[:ascending]
1464
+ # To enable sorting with categorical data,
1465
+ # map categories to integers preserving their order
1466
+ old = convert_categorical_vectors vector_order
1467
+ block = sort_prepare_block vector_order, opts
1504
1468
 
1505
- block = lambda do |r1, r2|
1506
- # Build left and right array to compare two rows
1507
- left = build_array_from_blocks vector_order, opts, blocks, r1, r2
1508
- right = build_array_from_blocks vector_order, opts, blocks, r2, r1
1469
+ order = @index.size.times.sort(&block)
1470
+ new_index = @index.reorder order
1509
1471
 
1510
- # Resolve conflict by Index if all attributes are same
1511
- left << r1
1512
- right << r2
1513
- left <=> right
1514
- end
1472
+ # To reverse map mapping of categorical data to integers
1473
+ restore_categorical_vectors old
1515
1474
 
1516
- idx = (0..@index.size-1).sort(&block)
1517
-
1518
- old_index = @index.to_a
1519
- self.index = Daru::Index.new(idx.map { |i| old_index[i] })
1520
-
1521
- vectors.each do |v|
1522
- @data[@vectors[v]] = Daru::Vector.new(
1523
- idx.map { |i| @data[@vectors[v]].data[i] },
1524
- name: self[v].name, metadata: self[v].metadata.dup, index: index
1525
- )
1475
+ @data.each do |vector|
1476
+ vector.reorder! order
1526
1477
  end
1527
1478
 
1479
+ self.index = new_index
1480
+
1528
1481
  self
1529
1482
  end
1530
1483
 
@@ -1568,90 +1521,41 @@ module Daru
1568
1521
  # # [:bar] 18 26
1569
1522
  # # [:foo] 10 12
1570
1523
  def pivot_table opts={}
1571
- raise ArgumentError,
1572
- 'Specify grouping index' if !opts[:index] || opts[:index].empty?
1573
-
1574
- index = opts[:index]
1575
- vectors = opts[:vectors] || []
1576
- aggregate_function = opts[:agg] || :mean
1577
- values =
1578
- if opts[:values].is_a?(Symbol)
1579
- [opts[:values]]
1580
- elsif opts[:values].is_a?(Array)
1581
- opts[:values]
1582
- else # nil
1583
- (@vectors.to_a - (index | vectors)) & numeric_vector_names
1584
- end
1524
+ raise ArgumentError, 'Specify grouping index' if opts[:index].to_a.empty?
1585
1525
 
1526
+ index = opts[:index]
1527
+ vectors = opts[:vectors] || []
1528
+ aggregate_function = opts[:agg] || :mean
1529
+ values = prepare_pivot_values index, vectors, opts
1586
1530
  raise IndexError, 'No numeric vectors to aggregate' if values.empty?
1587
1531
 
1588
1532
  grouped = group_by(index)
1533
+ return grouped.send(aggregate_function) if vectors.empty?
1589
1534
 
1590
- if vectors.empty?
1591
- grouped.send(aggregate_function)
1592
- else
1593
- super_hash = {}
1594
- values.each do |value|
1595
- grouped.groups.each do |group_name, row_numbers|
1596
- super_hash[group_name] ||= {}
1535
+ super_hash = make_pivot_hash grouped, vectors, values, aggregate_function
1597
1536
 
1598
- row_numbers.each do |num|
1599
- arry = []
1600
- arry << value
1601
- vectors.each { |v| arry << self[v][num] }
1602
- sub_hash = super_hash[group_name]
1603
- sub_hash[arry] ||= []
1604
-
1605
- sub_hash[arry] << self[value][num]
1606
- end
1607
- end
1608
- end
1609
-
1610
- super_hash.each_value do |sub_hash|
1611
- sub_hash.each do |group_name, aggregates|
1612
- sub_hash[group_name] = Daru::Vector.new(aggregates).send(aggregate_function)
1613
- end
1614
- end
1615
-
1616
- df_index = Daru::MultiIndex.from_tuples super_hash.keys
1617
-
1618
- vector_indexes = []
1619
- super_hash.each_value do |sub_hash|
1620
- vector_indexes.concat sub_hash.keys
1621
- end
1622
-
1623
- df_vectors = Daru::MultiIndex.from_tuples vector_indexes.uniq
1624
- pivoted_dataframe = Daru::DataFrame.new({}, index: df_index, order: df_vectors)
1625
-
1626
- super_hash.each do |row_index, sub_h|
1627
- sub_h.each do |vector_index, val|
1628
- # pivoted_dataframe[symbolize(vector_index)][symbolize(row_index)] = val
1629
- pivoted_dataframe[vector_index][row_index] = val
1630
- end
1631
- end
1632
- return pivoted_dataframe
1633
- end
1537
+ pivot_dataframe super_hash
1634
1538
  end
1635
1539
 
1636
1540
  # Merge vectors from two DataFrames. In case of name collision,
1637
1541
  # the vectors names are changed to x_1, x_2 ....
1638
1542
  #
1639
1543
  # @return {Daru::DataFrame}
1640
- def merge other_df
1641
- raise "Number of rows must be equal in this: #{nrows} and other: #{other_df.nrows}" unless nrows == other_df.nrows
1544
+ def merge other_df # rubocop:disable Metrics/AbcSize
1545
+ raise ArgumentError,
1546
+ "Number of rows must be equal in this: #{nrows} and other: #{other_df.nrows}" \
1547
+ unless nrows == other_df.nrows
1642
1548
 
1643
1549
  new_fields = (@vectors.to_a + other_df.vectors.to_a)
1644
- .recode_repeated
1645
- .map(&:to_sym)
1646
- df_new = DataFrame.new({}, order: new_fields)
1550
+ new_fields = ArrayHelper.recode_repeated(new_fields)
1647
1551
 
1648
- (0...nrows).to_a.each do |i|
1649
- row = self.row[i].to_a + other_df.row[i].to_a
1650
- df_new.add_row(row)
1651
- end
1552
+ DataFrame.new({}, order: new_fields).tap do |df_new|
1553
+ (0...nrows).each do |i|
1554
+ df_new.add_row row[i].to_a + other_df.row[i].to_a
1555
+ end
1652
1556
 
1653
- df_new.update
1654
- df_new
1557
+ df_new.update
1558
+ end
1655
1559
  end
1656
1560
 
1657
1561
  # Join 2 DataFrames with SQL style joins. Currently supports inner, left
@@ -1701,7 +1605,11 @@ module Daru
1701
1605
  # ['2','fred','green',15,'orange',30,'white',20],
1702
1606
  # ['3','alfred',nil,nil,nil,nil,nil,nil]
1703
1607
  # ]
1704
- # ds=Daru::DataFrame.rows(cases, order: [:id, :name, :car_color1, :car_value1, :car_color2, :car_value2, :car_color3, :car_value3])
1608
+ # ds=Daru::DataFrame.rows(cases, order:
1609
+ # [:id, :name,
1610
+ # :car_color1, :car_value1,
1611
+ # :car_color2, :car_value2,
1612
+ # :car_color3, :car_value3])
1705
1613
  # ds.one_to_many([:id],'car_%v%n').to_matrix
1706
1614
  # #=> Matrix[
1707
1615
  # # ["red", "1", 10],
@@ -1711,62 +1619,29 @@ module Daru
1711
1619
  # # ["white", "2", 20]
1712
1620
  # # ]
1713
1621
  def one_to_many(parent_fields, pattern)
1714
- re = Regexp.new pattern.gsub('%v','(.+?)').gsub('%n','(\\d+?)')
1715
- ds_vars = parent_fields.dup
1716
- vars = []
1717
- max_n = 0
1718
- h = parent_fields.each_with_object({}) { |v, a|
1719
- a[v] = Daru::Vector.new([])
1720
- }
1721
- # Adding _row_id
1722
- h['_col_id'] = Daru::Vector.new([])
1723
- ds_vars.push('_col_id')
1724
-
1725
- @vectors.each do |f|
1726
- next unless f =~ re
1727
- unless vars.include? $1
1728
- vars.push($1)
1729
- h[$1] = Daru::Vector.new([])
1730
- end
1622
+ vars, numbers = one_to_many_components(pattern)
1731
1623
 
1732
- max_n = $2.to_i if max_n < $2.to_i
1733
- end
1734
- ds = DataFrame.new(h, order: ds_vars+vars)
1735
-
1736
- each_row do |row|
1737
- row_out = {}
1738
- parent_fields.each do |f|
1739
- row_out[f] = row[f]
1740
- end
1741
-
1742
- max_n.times do |n1|
1743
- n = n1+1
1744
- any_data = false
1745
- vars.each do |v|
1746
- data = row[pattern.gsub('%v',v.to_s).gsub('%n',n.to_s)]
1747
- row_out[v] = data
1748
- any_data = true unless data.nil?
1749
- end
1624
+ DataFrame.new([], order: [*parent_fields, '_col_id', *vars]).tap do |ds|
1625
+ each_row do |row|
1626
+ verbatim = parent_fields.map { |f| [f, row[f]] }.to_h
1627
+ numbers.each do |n|
1628
+ generated = one_to_many_row row, n, vars, pattern
1629
+ next if generated.values.all?(&:nil?)
1750
1630
 
1751
- if any_data
1752
- row_out['_col_id'] = n
1753
- ds.add_row(row_out)
1631
+ ds.add_row(verbatim.merge(generated).merge('_col_id' => n))
1754
1632
  end
1755
1633
  end
1634
+ ds.update
1756
1635
  end
1757
- ds.update
1758
- ds
1759
1636
  end
1760
1637
 
1761
- def add_vectors_by_split_recode(name_, join='-', sep=Daru::SPLIT_TOKEN)
1762
- split = self[name_].split_by_separator(sep)
1763
- i = 1
1764
- split.each { |k,v|
1765
- new_field = name_.to_s + join + i.to_s
1766
- v.rename name_.to_s + ':' + k.to_s
1767
- self[new_field.to_sym] = v
1768
- i += 1
1769
- }
1638
+ def add_vectors_by_split_recode(nm, join='-', sep=Daru::SPLIT_TOKEN)
1639
+ self[nm]
1640
+ .split_by_separator(sep)
1641
+ .each_with_index do |(k, v), i|
1642
+ v.rename "#{nm}:#{k}"
1643
+ self["#{nm}#{join}#{i + 1}".to_sym] = v
1644
+ end
1770
1645
  end
1771
1646
 
1772
1647
  # Create a sql, based on a given Dataset
@@ -1795,40 +1670,37 @@ module Daru
1795
1670
  sql + fields.join(",\n ")+") CHARACTER SET=#{charset};"
1796
1671
  end
1797
1672
 
1673
+ # Returns the dataframe. This can be convenient when the user does not
1674
+ # know whether the object is a vector or a dataframe.
1675
+ # @return [self] the dataframe
1676
+ def to_df
1677
+ self
1678
+ end
1679
+
1798
1680
  # Convert all numeric vectors to GSL::Matrix
1799
1681
  def to_gsl
1800
- numerics_as_arrays = []
1801
- numeric_vectors.each do |n|
1802
- numerics_as_arrays << self[n].to_a
1803
- end
1682
+ numerics_as_arrays = numeric_vectors.map { |n| self[n].to_a }
1804
1683
 
1805
1684
  GSL::Matrix.alloc(*numerics_as_arrays.transpose)
1806
1685
  end
1807
1686
 
1808
1687
  # Convert all vectors of type *:numeric* into a Matrix.
1809
1688
  def to_matrix
1810
- numerics_as_arrays = []
1811
- each_vector do |vector|
1812
- numerics_as_arrays << vector.to_a if vector.type == :numeric
1813
- end
1814
-
1815
- Matrix.columns numerics_as_arrays
1689
+ Matrix.columns each_vector.select(&:numeric?).map(&:to_a)
1816
1690
  end
1817
1691
 
1818
1692
  # Return a Nyaplot::DataFrame from the data of this DataFrame.
1693
+ # :nocov:
1819
1694
  def to_nyaplotdf
1820
1695
  Nyaplot::DataFrame.new(to_a[0])
1821
1696
  end
1697
+ # :nocov:
1822
1698
 
1823
1699
  # Convert all vectors of type *:numeric* and not containing nils into an NMatrix.
1824
1700
  def to_nmatrix
1825
- numerics_as_arrays = []
1826
- each_vector do |vector|
1827
- numerics_as_arrays << vector.to_a if vector.type == :numeric &&
1828
- vector.missing_positions.empty?
1829
- end
1830
-
1831
- numerics_as_arrays.transpose.to_nm
1701
+ each_vector.select do |vector|
1702
+ vector.numeric? && !vector.include_values?(*Daru::MISSING_VALUES)
1703
+ end.map(&:to_a).transpose.to_nm
1832
1704
  end
1833
1705
 
1834
1706
  # Converts the DataFrame into an array of hashes where key is vector name
@@ -1837,13 +1709,7 @@ module Daru
1837
1709
  # of the dataframe. Each element in the index array corresponds to its row
1838
1710
  # in the array of hashes, which has the same index.
1839
1711
  def to_a
1840
- arry = [[],[]]
1841
- each_row do |row|
1842
- arry[0] << row.to_h
1843
- end
1844
- arry[1] = @index.to_a
1845
-
1846
- arry
1712
+ [each_row.map(&:to_h), @index.to_a]
1847
1713
  end
1848
1714
 
1849
1715
  # Convert to json. If no_index is false then the index will NOT be included
@@ -1859,54 +1725,19 @@ module Daru
1859
1725
  # Converts DataFrame to a hash (explicit) with keys as vector names and values as
1860
1726
  # the corresponding vectors.
1861
1727
  def to_h
1862
- hsh = {}
1863
- @vectors.each_with_index do |vec_name, idx|
1864
- hsh[vec_name] = @data[idx]
1865
- end
1866
-
1867
- hsh
1728
+ @vectors
1729
+ .each_with_index
1730
+ .map { |vec_name, idx| [vec_name, @data[idx]] }.to_h
1868
1731
  end
1869
1732
 
1870
1733
  # Convert to html for IRuby.
1871
1734
  def to_html threshold=30
1872
- html = '<table>' \
1873
- '<tr>' \
1874
- "<th colspan=\"#{@vectors.size+1}\">" \
1875
- "Daru::DataFrame:#{object_id} " + " rows: #{nrows} " + " cols: #{ncols}" \
1876
- '</th>' \
1877
- '</tr>'
1878
- html +='<tr><th></th>'
1879
- @vectors.each { |vector| html += '<th>' + vector.to_s + '</th>' }
1880
- html += '</tr>'
1881
-
1882
- @index.each_with_index do |index, num|
1883
- html += '<tr>'
1884
- html += '<td>' + index.to_s + '</td>'
1885
-
1886
- row[index].each do |element|
1887
- html += '<td>' + element.to_s + '</td>'
1888
- end
1889
-
1890
- html += '</tr>'
1891
- next if num <= threshold
1892
-
1893
- html += '<tr>'
1894
- (@vectors.size + 1).times { html += '<td>...</td>' }
1895
- html += '</tr>'
1896
-
1897
- last_index = @index.to_a.last
1898
- last_row = row[last_index]
1899
- html += '<tr>'
1900
- html += '<td>' + last_index.to_s + '</td>'
1901
- (0..(ncols - 1)).to_a.each do |i|
1902
- html += '<td>' + last_row[i].to_s + '</td>'
1903
- end
1904
- html += '</tr>'
1905
- break
1906
- end
1907
- html += '</table>'
1908
-
1909
- html
1735
+ path = if index.is_a?(MultiIndex)
1736
+ File.expand_path('../iruby/templates/dataframe_mi.html.erb', __FILE__)
1737
+ else
1738
+ File.expand_path('../iruby/templates/dataframe.html.erb', __FILE__)
1739
+ end
1740
+ ERB.new(File.read(path).strip).result(binding)
1910
1741
  end
1911
1742
 
1912
1743
  def to_s
@@ -1925,8 +1756,11 @@ module Daru
1925
1756
  # Rename the DataFrame.
1926
1757
  def rename new_name
1927
1758
  @name = new_name
1759
+ self
1928
1760
  end
1929
1761
 
1762
+ alias_method :name=, :rename
1763
+
1930
1764
  # Write this DataFrame to a CSV file.
1931
1765
  #
1932
1766
  # == Arguments
@@ -2003,46 +1837,28 @@ module Daru
2003
1837
 
2004
1838
  # Transpose a DataFrame, tranposing elements and row, column indexing.
2005
1839
  def transpose
2006
- arrys = []
2007
- each_vector do |vec|
2008
- arrys << vec.to_a
2009
- end
2010
-
2011
- Daru::DataFrame.new(arrys.transpose, index: @vectors, order: @index, dtype: @dtype, name: @name)
1840
+ Daru::DataFrame.new(
1841
+ each_vector.map(&:to_a).transpose,
1842
+ index: @vectors,
1843
+ order: @index,
1844
+ dtype: @dtype,
1845
+ name: @name
1846
+ )
2012
1847
  end
2013
1848
 
2014
1849
  # Pretty print in a nice table format for the command line (irb/pry/iruby)
2015
1850
  def inspect spacing=10, threshold=15
2016
- longest = [@name.to_s.size,
2017
- (@vectors.map(&:to_s).map(&:size).max || 0),
2018
- (@index .map(&:to_s).map(&:size).max || 0),
2019
- (@data .map { |v| v.map(&:to_s).map(&:size).max }.max || 0)].max
2020
-
2021
- name = @name || 'nil'
2022
- content = ''
2023
- longest = spacing if longest > spacing
2024
- formatter = "\n"
2025
-
2026
- (@vectors.size + 1).times { formatter += "%#{longest}.#{longest}s " }
2027
- content += "\n#<" + self.class.to_s + ':' + object_id.to_s + ' @name = ' +
2028
- name.to_s + ' @size = ' + @size.to_s + '>'
2029
- content += formatter % ['', *@vectors.map(&:to_s)]
2030
- row_num = 1
2031
-
2032
- each_row_with_index do |row, index|
2033
- content += formatter % [index.to_s, *row.to_h.values.map { |e| (e || 'nil').to_s }]
2034
- row_num += 1
2035
- next if row_num <= threshold
2036
-
2037
- dots = []
2038
-
2039
- (@vectors.size + 1).times { dots << '...' }
2040
- content += formatter % dots
2041
- break
2042
- end
2043
- content += "\n"
2044
-
2045
- content
1851
+ row_headers = index.is_a?(MultiIndex) ? index.sparse_tuples : index.to_a
1852
+ name_part = @name ? ": #{@name} " : ''
1853
+
1854
+ "#<#{self.class}#{name_part}(#{nrows}x#{ncols})>\n" +
1855
+ Formatters::Table.format(
1856
+ each_row.lazy,
1857
+ row_headers: row_headers,
1858
+ headers: vectors,
1859
+ threshold: threshold,
1860
+ spacing: spacing
1861
+ )
2046
1862
  end
2047
1863
 
2048
1864
  # Query a DataFrame by passing a Daru::Core::Query::BoolArray object.
@@ -2058,218 +1874,202 @@ module Daru
2058
1874
  @vectors.to_a.all? { |v| self[v] == other[v] }
2059
1875
  end
2060
1876
 
1877
+ # Converts the specified non category type vectors to category type vectors
1878
+ # @param [Array] *names names of non category type vectors to be converted
1879
+ # @return [Daru::DataFrame] data frame in which specified vectors have been
1880
+ # converted to category type
1881
+ # @example
1882
+ # df = Daru::DataFrame.new({
1883
+ # a: [1, 2, 3],
1884
+ # b: ['a', 'a', 'b']
1885
+ # })
1886
+ # df.to_category :b
1887
+ # df[:b].type
1888
+ # # => :category
1889
+ def to_category *names
1890
+ names.each { |n| self[n] = self[n].to_category }
1891
+ self
1892
+ end
1893
+
2061
1894
  def method_missing(name, *args, &block)
2062
1895
  if name =~ /(.+)\=/
2063
- insert_or_modify_vector name[/(.+)\=/].delete('=').to_sym, args[0]
1896
+ insert_or_modify_vector [name[/(.+)\=/].delete('=').to_sym], args[0]
2064
1897
  elsif has_vector? name
2065
1898
  self[name]
2066
1899
  else
2067
- super(name, *args, &block)
1900
+ super
2068
1901
  end
2069
1902
  end
2070
1903
 
2071
- private
1904
+ def respond_to_missing?(name, include_private=false)
1905
+ name.to_s.end_with?('=') || has_vector?(name) || super
1906
+ end
2072
1907
 
2073
- def possibly_multi_index? index
2074
- if @index.is_a?(MultiIndex)
2075
- Daru::MultiIndex.from_tuples(index)
2076
- else
2077
- Daru::Index.new(index)
1908
+ def interact_code vector_names, full
1909
+ dfs = vector_names.zip(full).map do |vec_name, f|
1910
+ self[vec_name].contrast_code(full: f).each.to_a
2078
1911
  end
1912
+
1913
+ all_vectors = recursive_product(dfs)
1914
+ Daru::DataFrame.new all_vectors,
1915
+ order: all_vectors.map(&:name)
2079
1916
  end
2080
1917
 
2081
- def create_logic_blocks vector_order, _by, ascending
2082
- # Create blocks to handle nils
2083
- blocks = {}
2084
- universal_block_ascending = ->(a) { [a.nil? ? 0 : 1, a] }
2085
- universal_block_decending = ->(a) { [a.nil? ? 1 : 0, a] }
2086
- vector_order.each_with_index do |vector, i|
2087
- blocks[vector] =
2088
- if ascending[i]
2089
- universal_block_ascending
2090
- else
2091
- universal_block_decending
2092
- end
1918
+ # Split the dataframe into many dataframes based on category vector
1919
+ # @param [object] cat_name name of category vector to split the dataframe
1920
+ # @return [Array] array of dataframes split by category with category vector
1921
+ # used to split not included
1922
+ # @example
1923
+ # df = Daru::DataFrame.new({
1924
+ # a: [1, 2, 3],
1925
+ # b: ['a', 'a', 'b']
1926
+ # })
1927
+ # df.to_category :b
1928
+ # df.split_by_category :b
1929
+ # # => [#<Daru::DataFrame: a (2x1)>
1930
+ # # a
1931
+ # # 0 1
1932
+ # # 1 2,
1933
+ # # #<Daru::DataFrame: b (1x1)>
1934
+ # # a
1935
+ # # 2 3]
1936
+ def split_by_category cat_name
1937
+ cat_dv = self[cat_name]
1938
+ raise ArguementError, "#{cat_name} is not a category vector" unless
1939
+ cat_dv.category?
1940
+
1941
+ cat_dv.categories.map do |cat|
1942
+ where(cat_dv.eq cat)
1943
+ .rename(cat)
1944
+ .delete_vector cat_name
2093
1945
  end
1946
+ end
1947
+
1948
+ private
2094
1949
 
2095
- blocks
1950
+ def convert_categorical_vectors names
1951
+ names.map do |n|
1952
+ next unless self[n].category?
1953
+ old = [n, self[n]]
1954
+ self[n] = Daru::Vector.new(self[n].to_ints)
1955
+ old
1956
+ end.compact
2096
1957
  end
2097
1958
 
2098
- def build_array_from_blocks vector_order, opts, blocks, r1, r2
2099
- # Create an array to be used for comparison of two rows in sorting
2100
- vector_order.map.each_with_index do |v, i|
2101
- value = if opts[:ascending][i]
2102
- @data[@vectors[v]].data[r1]
2103
- else
2104
- @data[@vectors[v]].data[r2]
2105
- end
2106
-
2107
- if opts[:by][v] && !opts[:handle_nils][i]
2108
- # Block given and nils handled manually
2109
- value = opts[:by][v].call value
2110
-
2111
- elsif opts[:by][v] && opts[:handle_nils][i]
2112
- # Block given and nils handled automatically
2113
- value = opts[:by][v].call value rescue nil
2114
- blocks[v].call value
1959
+ def restore_categorical_vectors old
1960
+ old.each { |name, vector| self[name] = vector }
1961
+ end
2115
1962
 
2116
- else
2117
- # Block not given and nils handled automatically
2118
- blocks[v].call value
2119
- end
1963
+ def recursive_product dfs
1964
+ return dfs.first if dfs.size == 1
1965
+
1966
+ left = dfs.first
1967
+ dfs.shift
1968
+ right = recursive_product dfs
1969
+ left.product(right).map do |dv1, dv2|
1970
+ (dv1*dv2).rename "#{dv1.name}:#{dv2.name}"
2120
1971
  end
2121
1972
  end
2122
1973
 
2123
- def sort_order_array vector_order, ascending
2124
- if ascending.is_a? Array
2125
- raise ArgumentError, 'Specify same number of vector names and sort orders' if
2126
- vector_order.size != ascending.size
2127
- return ascending
1974
+ def should_be_vector! val
1975
+ return val if val.is_a?(Daru::Vector)
1976
+ raise TypeError, "Every iteration must return Daru::Vector not #{val.class}"
1977
+ end
1978
+
1979
+ def dispatch_to_axis(axis, method, *args, &block)
1980
+ if axis == :vector || axis == :column
1981
+ send("#{method}_vector", *args, &block)
1982
+ elsif axis == :row
1983
+ send("#{method}_row", *args, &block)
2128
1984
  else
2129
- Array.new(vector_order.size, ascending)
1985
+ raise ArgumentError, "Unknown axis #{axis}"
2130
1986
  end
2131
1987
  end
2132
1988
 
2133
- def handle_nils_array vector_order, handle_nils
2134
- if handle_nils.is_a? Array
2135
- raise ArgumentError, 'Specify same number of vector names and handle nils' if
2136
- vector_order.size != handle_nils.size
2137
- return handle_nils
1989
+ def dispatch_to_axis_pl(axis, method, *args, &block)
1990
+ if axis == :vector || axis == :column
1991
+ send("#{method}_vectors", *args, &block)
1992
+ elsif axis == :row
1993
+ send("#{method}_rows", *args, &block)
2138
1994
  else
2139
- Array.new(vector_order.size, handle_nils)
1995
+ raise ArgumentError, "Unknown axis #{axis}"
2140
1996
  end
2141
1997
  end
2142
1998
 
2143
- def vectors_index_for location
2144
- if @vectors.include?(location)
2145
- @vectors[location]
2146
- elsif location[0].is_a?(Integer)
2147
- location[0]
1999
+ AXES = [:row, :vector].freeze
2000
+
2001
+ def extract_axis names, default=:vector
2002
+ if AXES.include?(names.last)
2003
+ names.pop
2004
+ else
2005
+ default
2148
2006
  end
2149
2007
  end
2150
2008
 
2151
2009
  def access_vector *names
2152
- location = names[0]
2010
+ if names.first.is_a?(Range)
2011
+ dup(@vectors[names.first])
2012
+ elsif @vectors.is_a?(MultiIndex)
2013
+ access_vector_multi_index(*names)
2014
+ else
2015
+ access_vector_single_index(*names)
2016
+ end
2017
+ end
2153
2018
 
2154
- return dup(@vectors[location]) if location.is_a?(Range)
2155
- if @vectors.is_a?(MultiIndex)
2156
- pos = @vectors[names]
2019
+ def access_vector_multi_index *names
2020
+ pos = @vectors[names]
2157
2021
 
2158
- return @data[pos] if pos.is_a?(Integer)
2022
+ return @data[pos] if pos.is_a?(Integer)
2159
2023
 
2160
- # MultiIndex
2161
- new_vectors = pos.map do |tuple|
2162
- @data[@vectors[tuple]]
2163
- end
2024
+ new_vectors = pos.map { |tuple| @data[@vectors[tuple]] }
2164
2025
 
2165
- if !location.is_a?(Range) && names.size < @vectors.width
2166
- pos = pos.drop_left_level names.size
2167
- end
2026
+ pos = pos.drop_left_level(names.size) if names.size < @vectors.width
2168
2027
 
2169
- Daru::DataFrame.new(new_vectors, index: @index, order: pos)
2170
- else
2171
- unless names[1]
2172
- pos = @vectors[location]
2173
-
2174
- return @data[pos] if pos.is_a?(Numeric)
2028
+ Daru::DataFrame.new(new_vectors, index: @index, order: pos)
2029
+ end
2175
2030
 
2176
- names = pos
2177
- end
2031
+ def access_vector_single_index *names
2032
+ if names.count < 2
2033
+ pos = @vectors[names.first]
2178
2034
 
2179
- new_vectors = {}
2180
- names.each do |name|
2181
- new_vectors[name] = @data[@vectors[name]]
2182
- end
2035
+ return @data[pos] if pos.is_a?(Numeric)
2183
2036
 
2184
- order = names.is_a?(Array) ? Daru::Index.new(names) : names
2185
- Daru::DataFrame.new(new_vectors, order: order,
2186
- index: @index, name: @name)
2037
+ names = pos
2187
2038
  end
2188
- end
2189
2039
 
2190
- def access_row *names
2191
- location = names[0]
2040
+ new_vectors = names.map { |name| [name, @data[@vectors[name]]] }.to_h
2192
2041
 
2193
- if @index.is_a?(MultiIndex)
2194
- pos = @index[names]
2195
- if pos.is_a?(Integer)
2196
- return Daru::Vector.new(populate_row_for(pos), index: @vectors, name: pos)
2197
- end
2042
+ order = names.is_a?(Array) ? Daru::Index.new(names) : names
2043
+ Daru::DataFrame.new(new_vectors, order: order,
2044
+ index: @index, name: @name)
2045
+ end
2198
2046
 
2199
- new_rows = pos.map { |tuple| populate_row_for(tuple) }
2047
+ def access_row *indexes
2048
+ positions = @index.pos(*indexes)
2200
2049
 
2201
- if !location.is_a?(Range) && names.size < @index.width
2202
- pos = pos.drop_left_level names.size
2203
- end
2204
-
2205
- Daru::DataFrame.rows(new_rows, order: @vectors, name: @name, index: pos)
2050
+ if positions.is_a? Numeric
2051
+ return Daru::Vector.new populate_row_for(positions),
2052
+ index: @vectors,
2053
+ name: indexes.first
2206
2054
  else
2207
- if names[1].nil?
2208
- names = @index[location]
2209
- if names.is_a?(Numeric)
2210
- row = []
2211
- @data.each do |vector|
2212
- row << vector[location]
2213
- end
2214
-
2215
- return Daru::Vector.new(row, index: @vectors, name: set_name(location))
2216
- end
2217
- end
2218
- # Access multiple rows
2219
- rows = []
2220
- names.each do |name|
2221
- rows << self.row[name].to_a
2222
- end
2223
-
2224
- Daru::DataFrame.rows rows, index: names,name: @name, order: @vectors
2055
+ new_rows = @data.map { |vec| vec[*indexes] }
2056
+ return Daru::DataFrame.new new_rows,
2057
+ index: @index.subset(*indexes),
2058
+ order: @vectors
2225
2059
  end
2226
2060
  end
2227
2061
 
2228
2062
  def populate_row_for pos
2229
- @data.map do |vector|
2230
- vector[pos]
2231
- end
2063
+ @data.map { |vector| vector[pos] }
2232
2064
  end
2233
2065
 
2234
2066
  def insert_or_modify_vector name, vector
2235
2067
  name = name[0] unless @vectors.is_a?(MultiIndex)
2236
- vec = nil
2237
2068
 
2238
2069
  if @index.empty?
2239
- vec = if vector.is_a?(Daru::Vector)
2240
- vector
2241
- else
2242
- Daru::Vector.new(vector.to_a, name: set_name(name))
2243
- end
2244
-
2245
- @index = vec.index
2246
- assign_or_add_vector name, vec
2247
- set_size
2248
-
2249
- @data.map! do |v|
2250
- if v.empty?
2251
- Daru::Vector.new([nil]*@size, name: set_name(name), metadata: v.metadata, index: @index)
2252
- else
2253
- v
2254
- end
2255
- end
2070
+ insert_vector_in_empty name, vector
2256
2071
  else
2257
- if vector.is_a?(Daru::Vector)
2258
- if vector.index == @index # so that index-by-index assignment is avoided when possible.
2259
- vec = vector.dup
2260
- else
2261
- vec = Daru::Vector.new [], name: set_name(name), metadata: vector.metadata.dup, index: @index
2262
- @index.each do |idx|
2263
- vec[idx] = vector.index.include?(idx) ? vector[idx] : nil
2264
- end
2265
- end
2266
- else
2267
- raise SizeError,
2268
- "Specified vector of length #{vector.size} cannot be inserted in DataFrame of size #{@size}" if
2269
- @size != vector.size
2270
-
2271
- vec = Daru::Vector.new(vector, name: set_name(name), index: @index)
2272
- end
2072
+ vec = prepare_vector_for_insert name, vector
2273
2073
 
2274
2074
  assign_or_add_vector name, vec
2275
2075
  end
@@ -2283,54 +2083,82 @@ module Daru
2283
2083
  pos = name
2284
2084
  end
2285
2085
 
2286
- if !pos.is_a?(Daru::Index) && pos == name &&
2287
- (@vectors.include?(name) || (pos.is_a?(Integer) && pos < @data.size))
2086
+ case
2087
+ when pos.is_a?(Daru::Index)
2088
+ assign_multiple_vectors pos, v
2089
+ when pos == name &&
2090
+ (@vectors.include?(name) || (pos.is_a?(Integer) && pos < @data.size))
2091
+
2288
2092
  @data[pos] = v
2289
- elsif pos.is_a?(Daru::Index)
2290
- pos.each do |p|
2291
- @data[@vectors[p]] = v
2292
- end
2293
2093
  else
2294
- @vectors |= [name] unless @vectors.include?(name)
2295
- @data[@vectors[name]] = v
2094
+ assign_or_add_vector_rough name, v
2296
2095
  end
2297
2096
  end
2298
2097
 
2299
- def insert_or_modify_row name, vector
2300
- if index.is_a?(MultiIndex)
2301
- # TODO
2302
- else
2303
- name = name[0]
2304
- vec =
2305
- if vector.is_a?(Daru::Vector)
2306
- vector
2307
- else
2308
- Daru::Vector.new(vector, name: set_name(name), index: @vectors)
2309
- end
2098
+ def assign_multiple_vectors pos, v
2099
+ pos.each do |p|
2100
+ @data[@vectors[p]] = v
2101
+ end
2102
+ end
2310
2103
 
2311
- if @index.include? name
2312
- each_vector_with_index do |v,i|
2313
- v[name] = vec.index.include?(i) ? vec[i] : nil
2314
- end
2315
- else
2316
- @index |= [name]
2317
- each_vector_with_index do |v,i|
2318
- v.concat((vec.index.include?(i) ? vec[i] : nil), name)
2104
+ def assign_or_add_vector_rough name, v
2105
+ @vectors |= [name] unless @vectors.include?(name)
2106
+ @data[@vectors[name]] = v
2107
+ end
2108
+
2109
+ def insert_vector_in_empty name, vector
2110
+ vec = Vector.coerce(vector.to_a, name: coerce_name(name))
2111
+
2112
+ @index = vec.index
2113
+ assign_or_add_vector name, vec
2114
+ set_size
2115
+
2116
+ @data.map! { |v| v.empty? ? v.reindex(@index) : v }
2117
+ end
2118
+
2119
+ def prepare_vector_for_insert name, vector
2120
+ if vector.is_a?(Daru::Vector)
2121
+ # so that index-by-index assignment is avoided when possible.
2122
+ return vector.dup if vector.index == @index
2123
+
2124
+ Daru::Vector.new([], name: coerce_name(name), index: @index).tap { |v|
2125
+ @index.each do |idx|
2126
+ v[idx] = vector.index.include?(idx) ? vector[idx] : nil
2319
2127
  end
2320
- end
2128
+ }
2129
+ else
2130
+ # FIXME: No spec checks this case... And SizeError is not a thing - zverok, 2016-05-08
2131
+ raise SizeError,
2132
+ "Specified vector of length #{vector.size} cannot be inserted in DataFrame of size #{@size}" if
2133
+ @size != vector.size
2134
+
2135
+ Daru::Vector.new(vector, name: coerce_name(name), index: @index)
2136
+ end
2137
+ end
2138
+
2139
+ def insert_or_modify_row indexes, vector
2140
+ vector = coerce_vector vector
2141
+
2142
+ raise SizeError, 'Vector length should match row length' if
2143
+ vector.size != @vectors.size
2321
2144
 
2322
- set_size
2145
+ @data.each_with_index do |vec, pos|
2146
+ vec.send(:set, indexes, vector.at(pos))
2323
2147
  end
2148
+ @index = @data[0].index
2149
+
2150
+ set_size
2324
2151
  end
2325
2152
 
2326
2153
  def create_empty_vectors
2327
- @vectors.each do |name|
2328
- @data << Daru::Vector.new([], name: set_name(name), index: @index)
2154
+ @data = @vectors.map do |name|
2155
+ Daru::Vector.new([], name: coerce_name(name), index: @index)
2329
2156
  end
2330
2157
  end
2331
2158
 
2332
2159
  def validate_labels
2333
- raise IndexError, "Expected equal number of vector names (#{@vectors.size}) for number of vectors (#{@data.size})." if
2160
+ raise IndexError, "Expected equal number of vector names (#{@vectors.size}) " \
2161
+ "for number of vectors (#{@data.size})." if
2334
2162
  @vectors && @vectors.size != @data.size
2335
2163
 
2336
2164
  raise IndexError, 'Expected number of indexes same as number of rows' if
@@ -2348,12 +2176,6 @@ module Daru
2348
2176
  validate_vector_sizes
2349
2177
  end
2350
2178
 
2351
- def all_daru_vectors_in_source? source
2352
- source.values.all? do |vector|
2353
- vector.is_a?(Daru::Vector)
2354
- end
2355
- end
2356
-
2357
2179
  def set_size
2358
2180
  @size = @index.size
2359
2181
  end
@@ -2382,32 +2204,301 @@ module Daru
2382
2204
  def all_vectors_have_equal_indexes? source
2383
2205
  idx = source.values[0].index
2384
2206
 
2385
- source.values.all? do |vector|
2386
- idx == vector.index
2207
+ source.values.all? { |vector| idx == vector.index }
2208
+ end
2209
+
2210
+ def coerce_name potential_name
2211
+ potential_name.is_a?(Array) ? potential_name.join : potential_name
2212
+ end
2213
+
2214
+ def initialize_from_array source, vectors, index, opts
2215
+ raise ArgumentError, 'All objects in data source should be same class' \
2216
+ unless source.map(&:class).uniq.size == 1
2217
+
2218
+ case source.first
2219
+ when Array
2220
+ initialize_from_array_of_arrays source, vectors, index, opts
2221
+ when Vector
2222
+ initialize_from_array_of_vectors source, vectors, index, opts
2223
+ when Hash
2224
+ initialize_from_array_of_hashes source, vectors, index, opts
2225
+ else
2226
+ raise ArgumentError, "Can't create DataFrame from #{source}"
2387
2227
  end
2388
2228
  end
2389
2229
 
2390
- def try_create_index index
2391
- index.is_a?(Index) ? index : Daru::Index.new(index)
2230
+ def initialize_from_array_of_arrays source, vectors, index, _opts
2231
+ raise ArgumentError, "Number of vectors (#{vectors.size}) should \
2232
+ equal order size (#{source.size})" if source.size != vectors.size
2233
+
2234
+ @index = Index.coerce(index || source[0].size)
2235
+ @vectors = Index.coerce(vectors)
2236
+
2237
+ @data = @vectors.each_with_index.map do |_vec,idx|
2238
+ Daru::Vector.new(source[idx], index: @index, name: vectors[idx])
2239
+ end
2392
2240
  end
2393
2241
 
2394
- def set_name potential_name # rubocop:disable Style/AccessorMethodName
2395
- potential_name.is_a?(Array) ? potential_name.join : potential_name
2242
+ def initialize_from_array_of_vectors source, vectors, index, opts
2243
+ clone = opts[:clone] != false
2244
+ hsh = vectors.each_with_index.map do |name, idx|
2245
+ [name, source[idx]]
2246
+ end.to_h
2247
+ initialize(hsh, index: index, order: vectors, name: @name, clone: clone)
2248
+ end
2249
+
2250
+ def initialize_from_array_of_hashes source, vectors, index, _opts
2251
+ names =
2252
+ if vectors.nil?
2253
+ source[0].keys
2254
+ else
2255
+ (vectors + source[0].keys).uniq
2256
+ end
2257
+ @vectors = Daru::Index.new(names)
2258
+ @index = Daru::Index.new(index || source.size)
2259
+
2260
+ @data = @vectors.map do |name|
2261
+ v = source.map { |h| h[name] || h[name.to_s] }
2262
+ Daru::Vector.new(v, name: coerce_name(name), index: @index)
2263
+ end
2264
+ end
2265
+
2266
+ def initialize_from_hash source, vectors, index, opts
2267
+ create_vectors_index_with vectors, source
2268
+
2269
+ if ArrayHelper.array_of?(source.values, Vector)
2270
+ initialize_from_hash_with_vectors source, index, opts
2271
+ else
2272
+ initialize_from_hash_with_arrays source, index, opts
2273
+ end
2274
+ end
2275
+
2276
+ def initialize_from_hash_with_vectors source, index, opts
2277
+ vectors_have_same_index = all_vectors_have_equal_indexes?(source)
2278
+
2279
+ clone = opts[:clone] != false
2280
+ clone = true unless index || vectors_have_same_index
2281
+
2282
+ @index = deduce_index index, source, vectors_have_same_index
2283
+
2284
+ if clone
2285
+ @data = clone_vectors source, vectors_have_same_index
2286
+ else
2287
+ @data.concat source.values
2288
+ end
2289
+ end
2290
+
2291
+ def deduce_index index, source, vectors_have_same_index
2292
+ if !index.nil?
2293
+ Index.coerce index
2294
+ elsif vectors_have_same_index
2295
+ source.values[0].index.dup
2296
+ else
2297
+ all_indexes = source
2298
+ .values.map { |v| v.index.to_a }
2299
+ .flatten.uniq.sort # sort only if missing indexes detected
2300
+
2301
+ Daru::Index.new all_indexes
2302
+ end
2396
2303
  end
2397
2304
 
2398
- def symbolize arry
2399
- symbolized_arry =
2400
- if arry.all? { |e| e.is_a?(Array) }
2401
- arry.map do |sub_arry|
2402
- sub_arry.map do |e|
2403
- e.is_a?(Numeric) ? e : e.to_sym
2305
+ def clone_vectors source, vectors_have_same_index
2306
+ @vectors.map do |vector|
2307
+ # avoids matching indexes of vectors if all the supplied vectors
2308
+ # have the same index.
2309
+ if vectors_have_same_index
2310
+ source[vector].dup
2311
+ else
2312
+ Daru::Vector.new([], name: vector, index: @index).tap do |v|
2313
+ @index.each do |idx|
2314
+ v[idx] = source[vector].index.include?(idx) ? source[vector][idx] : nil
2404
2315
  end
2405
2316
  end
2317
+ end
2318
+ end
2319
+ end
2320
+
2321
+ def initialize_from_hash_with_arrays source, index, _opts
2322
+ @index = Index.coerce(index || source.values[0].size)
2323
+
2324
+ @vectors.each do |name|
2325
+ @data << Daru::Vector.new(source[name].dup, name: coerce_name(name), index: @index)
2326
+ end
2327
+ end
2328
+
2329
+ def sort_build_row vector_locs, by_blocks, ascending, handle_nils, r1, r2 # rubocop:disable Metrics/ParameterLists
2330
+ # Create an array to be used for comparison of two rows in sorting
2331
+ vector_locs
2332
+ .zip(by_blocks, ascending, handle_nils)
2333
+ .map do |vector_loc, by, asc, handle_nil|
2334
+ value = @data[vector_loc].data[asc ? r1 : r2]
2335
+
2336
+ value = by.call(value) rescue nil if by
2337
+
2338
+ sort_handle_nils value, asc, handle_nil || !by
2339
+ end
2340
+ end
2341
+
2342
+ def sort_handle_nils value, asc, handle_nil
2343
+ case
2344
+ when !handle_nil
2345
+ value
2346
+ when asc
2347
+ [value.nil? ? 0 : 1, value]
2348
+ else
2349
+ [value.nil? ? 1 : 0, value]
2350
+ end
2351
+ end
2352
+
2353
+ def sort_coerce_boolean opts, symbol, default, size
2354
+ val = opts[symbol]
2355
+ case val
2356
+ when true, false
2357
+ Array.new(size, val)
2358
+ when nil
2359
+ Array.new(size, default)
2360
+ when Array
2361
+ raise ArgumentError, "Specify same number of vector names and #{symbol}" if
2362
+ size != val.size
2363
+ val
2364
+ else
2365
+ raise ArgumentError, "Can't coerce #{symbol} from #{val.class} to boolean option"
2366
+ end
2367
+ end
2368
+
2369
+ def sort_prepare_block vector_order, opts
2370
+ ascending = sort_coerce_boolean opts, :ascending, true, vector_order.size
2371
+ handle_nils = sort_coerce_boolean opts, :handle_nils, false, vector_order.size
2372
+
2373
+ by_blocks = vector_order.map { |v| (opts[:by] || {})[v] }
2374
+ vector_locs = vector_order.map { |v| @vectors[v] }
2375
+
2376
+ lambda do |index1, index2|
2377
+ # Build left and right array to compare two rows
2378
+ left = sort_build_row vector_locs, by_blocks, ascending, handle_nils, index1, index2
2379
+ right = sort_build_row vector_locs, by_blocks, ascending, handle_nils, index2, index1
2380
+
2381
+ # Resolve conflict by Index if all attributes are same
2382
+ left << index1
2383
+ right << index2
2384
+ left <=> right
2385
+ end
2386
+ end
2387
+
2388
+ def verify_error_message row, test, id, i
2389
+ description, fields, = test
2390
+ values =
2391
+ if fields.empty?
2392
+ ''
2406
2393
  else
2407
- arry.map { |e| e.is_a?(Numeric) ? e : e.to_sym }
2394
+ ' (' + fields.collect { |k| "#{k}=#{row[k]}" }.join(', ') + ')'
2395
+ end
2396
+ "#{i+1} [#{row[id]}]: #{description}#{values}"
2397
+ end
2398
+
2399
+ def prepare_pivot_values index, vectors, opts
2400
+ case opts[:values]
2401
+ when nil # values not specified at all.
2402
+ (@vectors.to_a - (index | vectors)) & numeric_vector_names
2403
+ when Array # multiple values specified.
2404
+ opts[:values]
2405
+ else # single value specified.
2406
+ [opts[:values]]
2407
+ end
2408
+ end
2409
+
2410
+ def make_pivot_hash grouped, vectors, values, aggregate_function
2411
+ grouped.groups.map { |n, _| [n, {}] }.to_h.tap do |super_hash|
2412
+ values.each do |value|
2413
+ grouped.groups.each do |group_name, row_numbers|
2414
+ row_numbers.each do |num|
2415
+ arry = [value, *vectors.map { |v| self[v][num] }]
2416
+ sub_hash = super_hash[group_name]
2417
+ sub_hash[arry] ||= []
2418
+
2419
+ sub_hash[arry] << self[value][num]
2420
+ end
2421
+ end
2422
+ end
2423
+
2424
+ setup_pivot_aggregates super_hash, aggregate_function
2425
+ end
2426
+ end
2427
+
2428
+ def setup_pivot_aggregates super_hash, aggregate_function
2429
+ super_hash.each_value do |sub_hash|
2430
+ sub_hash.each do |group_name, aggregates|
2431
+ sub_hash[group_name] = Daru::Vector.new(aggregates).send(aggregate_function)
2408
2432
  end
2433
+ end
2434
+ end
2435
+
2436
+ def pivot_dataframe super_hash
2437
+ df_index = Daru::MultiIndex.from_tuples super_hash.keys
2438
+ df_vectors = Daru::MultiIndex.from_tuples super_hash.values.flat_map(&:keys).uniq
2409
2439
 
2410
- symbolized_arry
2440
+ Daru::DataFrame.new({}, index: df_index, order: df_vectors).tap do |pivoted_dataframe|
2441
+ super_hash.each do |row_index, sub_h|
2442
+ sub_h.each do |vector_index, val|
2443
+ pivoted_dataframe[vector_index][row_index] = val
2444
+ end
2445
+ end
2446
+ end
2447
+ end
2448
+
2449
+ def one_to_many_components pattern
2450
+ re = Regexp.new pattern.gsub('%v','(.+?)').gsub('%n','(\\d+?)')
2451
+
2452
+ vars, numbers =
2453
+ @vectors
2454
+ .map { |v| v.scan(re) }
2455
+ .reject(&:empty?).flatten(1).transpose
2456
+
2457
+ [vars.uniq, numbers.map(&:to_i).sort.uniq]
2458
+ end
2459
+
2460
+ def one_to_many_row row, number, vars, pattern
2461
+ vars
2462
+ .map { |v|
2463
+ name = pattern.sub('%v', v).sub('%n', number.to_s)
2464
+ [v, row[name]]
2465
+ }.to_h
2466
+ end
2467
+
2468
+ # Raises IndexError when one of the positions is not a valid position
2469
+ def validate_positions *positions, size
2470
+ positions = [positions] if positions.is_a? Integer
2471
+ positions.each do |pos|
2472
+ raise IndexError, "#{pos} is not a valid position." if pos >= size
2473
+ end
2474
+ end
2475
+
2476
+ # Accepts hash, enumerable and vector and align it properly so it can be added
2477
+ def coerce_vector vector
2478
+ case vector
2479
+ when Daru::Vector
2480
+ vector.reindex @vectors
2481
+ when Hash
2482
+ Daru::Vector.new(vector).reindex @vectors
2483
+ else
2484
+ Daru::Vector.new vector
2485
+ end
2486
+ end
2487
+
2488
+ # coerce ranges, integers and array in appropriate ways
2489
+ def coerce_positions *positions, size
2490
+ if positions.size == 1
2491
+ case positions.first
2492
+ when Integer
2493
+ positions.first
2494
+ when Range
2495
+ size.times.to_a[positions.first]
2496
+ else
2497
+ raise ArgumentError, 'Unkown position type.'
2498
+ end
2499
+ else
2500
+ positions
2501
+ end
2411
2502
  end
2412
2503
  end
2413
2504
  end