daru 0.0.4 → 0.0.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40) hide show
  1. checksums.yaml +4 -4
  2. data/CONTRIBUTING.md +0 -0
  3. data/Gemfile +0 -1
  4. data/History.txt +35 -0
  5. data/README.md +178 -198
  6. data/daru.gemspec +5 -7
  7. data/lib/daru.rb +10 -2
  8. data/lib/daru/accessors/array_wrapper.rb +36 -198
  9. data/lib/daru/accessors/nmatrix_wrapper.rb +60 -209
  10. data/lib/daru/core/group_by.rb +183 -0
  11. data/lib/daru/dataframe.rb +615 -167
  12. data/lib/daru/index.rb +17 -16
  13. data/lib/daru/io/io.rb +5 -12
  14. data/lib/daru/maths/arithmetic/dataframe.rb +72 -8
  15. data/lib/daru/maths/arithmetic/vector.rb +19 -6
  16. data/lib/daru/maths/statistics/dataframe.rb +103 -2
  17. data/lib/daru/maths/statistics/vector.rb +102 -61
  18. data/lib/daru/monkeys.rb +8 -0
  19. data/lib/daru/multi_index.rb +199 -0
  20. data/lib/daru/plotting/dataframe.rb +24 -24
  21. data/lib/daru/plotting/vector.rb +14 -15
  22. data/lib/daru/vector.rb +402 -98
  23. data/lib/version.rb +1 -1
  24. data/notebooks/grouping_splitting_pivots.ipynb +529 -0
  25. data/notebooks/intro_with_music_data_.ipynb +104 -119
  26. data/spec/accessors/wrappers_spec.rb +36 -0
  27. data/spec/core/group_by_spec.rb +331 -0
  28. data/spec/dataframe_spec.rb +1237 -475
  29. data/spec/fixtures/sales-funnel.csv +18 -0
  30. data/spec/index_spec.rb +10 -21
  31. data/spec/io/io_spec.rb +4 -14
  32. data/spec/math/arithmetic/dataframe_spec.rb +66 -0
  33. data/spec/math/arithmetic/vector_spec.rb +45 -4
  34. data/spec/math/statistics/dataframe_spec.rb +91 -1
  35. data/spec/math/statistics/vector_spec.rb +32 -6
  36. data/spec/monkeys_spec.rb +10 -1
  37. data/spec/multi_index_spec.rb +216 -0
  38. data/spec/spec_helper.rb +1 -0
  39. data/spec/vector_spec.rb +505 -57
  40. metadata +21 -15
@@ -0,0 +1,183 @@
1
module Daru
  module Core
    # Partitions the rows of a context (a DataFrame-like object) according to
    # the values found in one or more of its vectors, and exposes per-group
    # selections (#head, #tail, #first, #last, #get_group) and aggregations
    # (#mean, #median, #sum, #std, #max, #min, #size, #count).
    class GroupBy

      # Frozen Hash mapping each group key (an Array holding one value per
      # grouping vector) to the Array of row positions that belong to it.
      # Keys are stored in sorted order.
      attr_reader :groups

      # context - object to group; must respond to #vectors and #vector
      #           (the selection helpers additionally use #row, #index and
      #           #each_vector).
      # names   - Array of the vector names to group by.
      def initialize context, names
        @context           = context
        @non_group_vectors = context.vectors.to_a - names

        columns = names.map { |name| context.vector[name].to_a }
        tuples  = columns[0].zip(*columns[1..-1])

        # One pass over the tuples collects the row positions of every
        # distinct tuple; this avoids re-scanning the whole array per group.
        positions = Hash.new { |hash, key| hash[key] = [] }
        tuples.each_with_index { |tuple, position| positions[tuple] << position }

        @groups = {}
        positions.keys.sort.each { |key| @groups[key] = positions[key] }
        @groups.freeze
      end

      # Number of rows in each group, returned as a Daru::Vector named :size
      # and indexed by the (symbolized) group keys.
      def size
        index =
          if multi_indexed_grouping?
            Daru::MultiIndex.new symbolize(@groups.keys)
          else
            Daru::Index.new symbolize(@groups.keys.flatten)
          end

        Daru::Vector.new(@groups.values.map(&:size), index: index, name: :size)
      end

      # First row of every group.
      def first
        head(1)
      end

      # Last row of every group.
      def last
        tail(1)
      end

      # The first +quantity+ rows of every group, as a DataFrame.
      def head quantity=5
        select_groups_from :first, quantity
      end

      # The last +quantity+ rows of every group, as a DataFrame.
      def tail quantity=5
        select_groups_from :last, quantity
      end

      # Calculate mean of numeric groups, excluding missing values.
      def mean
        apply_method :numeric, :mean
      end

      # Calculate the median of numeric groups, excluding missing values.
      def median
        apply_method :numeric, :median
      end

      # Calculate sum of numeric groups, excluding missing values.
      def sum
        apply_method :numeric, :sum
      end

      # DataFrame with one vector per non-grouping vector, each of them being
      # the result of #size.
      def count
        width = @non_group_vectors.size
        Daru::DataFrame.new([size]*width, order: @non_group_vectors)
      end

      # Calculate sample standard deviation of numeric vector groups, excluding
      # missing values.
      def std
        apply_method :numeric, :std
      end

      # Find the max element of each numeric vector group.
      def max
        apply_method :numeric, :max
      end

      # Find the min element of each numeric vector group.
      def min
        apply_method :numeric, :min
      end

      # Returns one of the selected groups as a DataFrame.
      #
      # group - a key of #groups (an Array with one value per grouping vector).
      def get_group group
        indexes  = @groups[group]
        elements = []

        @context.each_vector do |vector|
          elements << vector.to_a
        end
        # elements holds one Array per vector; transposing yields one Array per row.
        transpose = elements.transpose
        rows      = indexes.map { |idx| transpose[idx] }

        Daru::DataFrame.rows(rows, index: @context.index[indexes], order: @context.vectors)
      end

     private

      # Builds a DataFrame from the rows obtained by sending +method+
      # (:first or :last) with +quantity+ to each group's position list.
      def select_groups_from method, quantity
        rows, indexes = [], []

        @groups.each_value do |positions|
          positions.send(method, quantity).each do |idx|
            rows    << @context.row[idx].to_a
            indexes << idx
          end
        end

        Daru::DataFrame.rows(rows, order: @context.vectors, index: indexes)
      end

      # Sends +method+ to the slice of every aggregated vector for every group
      # and returns the results as a DataFrame indexed by the group keys.
      # Only vectors whose #type is :numeric are aggregated, and only when
      # +method_type+ is :numeric.
      def apply_method method_type, method
        # Decide once which vectors take part, so that each name appears
        # exactly once in the resulting vector order.
        aggregated = @non_group_vectors.select do |name|
          method_type == :numeric && @context.vector[name].type == :numeric
        end

        rows = @groups.values.map do |indexes|
          aggregated.map do |name|
            slice = @context.vector[name][*indexes]
            # A group with a single row yields a bare number instead of a slice.
            slice.is_a?(Numeric) ? slice : slice.send(method)
          end
        end

        index = symbolize @groups.keys
        index = multi_indexed_grouping? ? Daru::MultiIndex.new(index) : Daru::Index.new(index.flatten)
        order = symbolize aggregated
        order =
          if order.all? { |e| e.is_a?(Array) }
            Daru::MultiIndex.new(order)
          else
            Daru::Index.new(order)
          end

        Daru::DataFrame.new(rows.transpose, index: index, order: order)
      end

      # Converts every non-numeric element of +arry+ (or of its sub-arrays,
      # when all elements are Arrays) to a Symbol.
      def symbolize arry
        to_label = lambda { |e| e.is_a?(Numeric) ? e : e.to_sym }

        if arry.all? { |e| e.is_a?(Array) }
          arry.map { |sub_arry| sub_arry.map(&to_label) }
        else
          arry.map(&to_label)
        end
      end

      # True when the group keys have more than one component, i.e. when
      # grouping was done on several vectors. False when there are no groups.
      def multi_indexed_grouping?
        first_key = @groups.keys.first
        !first_key.nil? && first_key.size > 1
      end
    end
  end
end
@@ -1,9 +1,11 @@
1
- require_relative 'accessors/dataframe_by_row.rb'
2
- require_relative 'accessors/dataframe_by_vector.rb'
3
- require_relative 'maths/arithmetic/dataframe.rb'
4
- require_relative 'maths/statistics/dataframe.rb'
5
- require_relative 'plotting/dataframe.rb'
6
- require_relative 'io/io.rb'
1
+ $:.unshift File.dirname(__FILE__)
2
+
3
+ require 'accessors/dataframe_by_row.rb'
4
+ require 'accessors/dataframe_by_vector.rb'
5
+ require 'maths/arithmetic/dataframe.rb'
6
+ require 'maths/statistics/dataframe.rb'
7
+ require 'plotting/dataframe.rb'
8
+ require 'io/io.rb'
7
9
 
8
10
  module Daru
9
11
  class DataFrame
@@ -14,7 +16,7 @@ module Daru
14
16
 
15
17
  class << self
16
18
  # Load data from a CSV file.
17
- # Arguments - path, options, block(optional)
19
+ # Arguments - path, options, block(optional)
18
20
  #
19
21
  # Accepts a block for pre-conditioning of CSV data if any.
20
22
  def from_csv path, opts={}, &block
@@ -24,25 +26,25 @@ module Daru
24
26
  # Create DataFrame by specifying rows as an Array of Arrays or Array of
25
27
  # Daru::Vector objects.
26
28
  def rows source, opts={}
29
+ df = nil
27
30
  if source.all? { |v| v.size == source[0].size }
28
31
  first = source[0]
29
32
  index = []
30
- order =
31
- unless opts[:order]
32
- if first.is_a?(Daru::Vector) # assume that all are Vectors only
33
- source.each { |vec| index << vec.name }
34
- first.index.to_a
35
- elsif first.is_a?(Array)
36
- Array.new(first.size) { |i| i.to_s }
37
- end
38
- else
39
- opts[:order]
33
+ opts[:order] ||=
34
+ if first.is_a?(Daru::Vector) # assume that all are Vectors
35
+ source.each { |vec| index << vec.name }
36
+ first.index.to_a
37
+ elsif first.is_a?(Array)
38
+ Array.new(first.size) { |i| i.to_s }
40
39
  end
41
40
 
42
- opts[:order] = order
43
- df = Daru::DataFrame.new({}, opts)
44
- source.each_with_index do |row,idx|
45
- df[(index[idx] || idx), :row] = row
41
+ if source.all? { |s| s.is_a?(Array) }
42
+ df = Daru::DataFrame.new(source.transpose, opts)
43
+ else # array of Daru::Vectors
44
+ df = Daru::DataFrame.new({}, opts)
45
+ source.each_with_index do |row, idx|
46
+ df[(index[idx] || idx), :row] = row
47
+ end
46
48
  end
47
49
  else
48
50
  raise SizeError, "All vectors must have same length"
@@ -65,8 +67,8 @@ module Daru
65
67
  attr_reader :size
66
68
 
67
69
  # DataFrame basically consists of an Array of Vector objects.
68
- # These objects are indexed by row and column by vectors and index Index objects.
69
- # Arguments - source, vectors, index, name.
70
+ # These objects are indexed by row and column by vectors and index Index objects.
71
+ # Arguments - source, vectors, index, name.
70
72
  #
71
73
  # == Usage
72
74
  # df = Daru::DataFrame.new({a: [1,2,3,4], b: [6,7,8,9]}, order: [:b, :a],
@@ -82,43 +84,55 @@ module Daru
82
84
  def initialize source, opts={}
83
85
  vectors = opts[:order]
84
86
  index = opts[:index]
85
- @dtype = opts[:dtype] || Array
86
87
  @name = (opts[:name] || SecureRandom.uuid).to_sym
87
88
  @data = []
88
89
 
89
90
  if source.empty?
90
- @vectors = Daru::Index.new vectors
91
- @index = Daru::Index.new index
91
+ @vectors = create_index vectors
92
+ @index = create_index index
92
93
  create_empty_vectors
93
94
  else
94
95
  case source
95
96
  when Array
96
- if vectors.nil?
97
- @vectors = Daru::Index.new source[0].keys.map(&:to_sym)
98
- else
99
- @vectors = Daru::Index.new (vectors + (source[0].keys - vectors)).uniq.map(&:to_sym)
100
- end
97
+ if source.all? { |s| s.is_a?(Array) }
98
+ raise ArgumentError, "Number of vectors (#{vectors.size}) should \
99
+ equal order size (#{source.size})" if source.size != vectors.size
101
100
 
102
- if index.nil?
103
- @index = Daru::Index.new source.size
104
- else
105
- @index = Daru::Index.new index
106
- end
101
+ @index = create_index(index || source[0].size)
102
+ @vectors = create_index(vectors)
107
103
 
108
- @vectors.each do |name|
109
- v = []
110
- source.each do |hsh|
111
- v << (hsh[name] || hsh[name.to_s])
104
+ @vectors.each_with_index do |vec,idx|
105
+ @data << Daru::Vector.new(source[idx], index: @index)
106
+ end
107
+ elsif source.all? { |s| s.is_a?(Daru::Vector) }
108
+ hsh = {}
109
+ vectors.each_with_index do |name, idx|
110
+ hsh[name] = source[idx]
111
+ end
112
+ initialize(hsh, index: index, order: vectors, name: @name)
113
+ else # array of hashes
114
+ if vectors.nil?
115
+ @vectors = Daru::Index.new source[0].keys.map(&:to_sym)
116
+ else
117
+ @vectors = Daru::Index.new (vectors + (source[0].keys - vectors)).uniq.map(&:to_sym)
112
118
  end
119
+ @index = Daru::Index.new(index || source.size)
120
+
121
+ @vectors.each do |name|
122
+ v = []
123
+ source.each do |hsh|
124
+ v << (hsh[name] || hsh[name.to_s])
125
+ end
113
126
 
114
- @data << v.dv(name, @index, @dtype)
127
+ @data << Daru::Vector.new(v, name: set_name(name), index: @index)
128
+ end
115
129
  end
116
130
  when Hash
117
131
  create_vectors_index_with vectors, source
118
132
  if all_daru_vectors_in_source? source
119
133
  if !index.nil?
120
- @index = index.to_index
121
- elsif all_vectors_have_equal_indexes? source
134
+ @index = create_index index
135
+ elsif all_vectors_have_equal_indexes?(source)
122
136
  @index = source.values[0].index.dup
123
137
  else
124
138
  all_indexes = []
@@ -131,29 +145,17 @@ module Daru
131
145
  @index = Daru::Index.new all_indexes
132
146
  end
133
147
  @vectors.each do |vector|
134
- @data << Daru::Vector.new([], name: vector, index: @index, dtype: @dtype)
148
+ @data << Daru::Vector.new([], name: vector, index: @index)
135
149
 
136
150
  @index.each do |idx|
137
- begin
138
- @data[@vectors[vector]][idx] = source[vector][idx]
139
- rescue IndexError
140
- # If the index is not present in the vector under consideration
141
- # (in source) then an error is raised. Put a nil in that place if
142
- # that is the case.
143
- @data[@vectors[vector]][idx] = nil
144
- end
151
+ @data[@vectors[vector]][idx] = source[vector][idx]
145
152
  end
146
153
  end
147
- else
148
- index = source.values[0].size if index.nil?
149
- if index.is_a?(Daru::Index)
150
- @index = index.to_index
151
- else
152
- @index = Daru::Index.new index
153
- end
154
+ else
155
+ @index = create_index(index || source.values[0].size)
154
156
 
155
157
  @vectors.each do |name|
156
- @data << source[name].dup.dv(name, @index, @dtype)
158
+ @data << Daru::Vector.new(source[name].dup, name: set_name(name), index: @index)
157
159
  end
158
160
  end
159
161
  end
@@ -164,10 +166,17 @@ module Daru
164
166
  end
165
167
 
166
168
  # Access row or vector. Specify name of row/vector followed by axis(:row, :vector).
167
- # Use of this method is not recommended for accessing rows or vectors.
168
- # Use df.row[:a] for accessing row with index ':a' or df.vector[:vec] for
169
- # accessing vector with index ':vec'
170
- def [](*names, axis)
169
+ # Defaults to *:vector*. Use of this method is not recommended for accessing
170
+ # rows or vectors. Use df.row[:a] for accessing row with index ':a' or
171
+ # df.vector[:vec] for accessing vector with index *:vec*.
172
+ def [](*names)
173
+ if names[-1] == :vector or names[-1] == :row
174
+ axis = names[-1]
175
+ names = names[0..-2]
176
+ else
177
+ axis = :vector
178
+ end
179
+
171
180
  if axis == :vector
172
181
  access_vector *names
173
182
  elsif axis == :row
@@ -184,7 +193,12 @@ module Daru
184
193
  # In case a Daru::Vector is specified after the equality the sign, the indexes
185
194
  # of the vector will be matched against the row/vector indexes of the DataFrame
186
195
  # before an insertion is performed. Unmatched indexes will be set to nil.
187
- def []=(name, axis ,vector)
196
+ def []=(*args)
197
+ name = args[0]
198
+ axis = args[1]
199
+ vector = args[-1]
200
+
201
+ axis = (!axis.is_a?(Symbol) and (axis != :vector or axis != :row)) ? :vector : axis
188
202
  if axis == :vector
189
203
  insert_or_modify_vector name, vector
190
204
  elsif axis == :row
@@ -203,6 +217,11 @@ module Daru
203
217
  Daru::Accessors::DataFrameByVector.new(self)
204
218
  end
205
219
 
220
+ # Access a vector by name.
221
+ def column name
222
+ vector[name]
223
+ end
224
+
206
225
  # Access a row or set/create a row. Refer #[] and #[]= docs for details.
207
226
  #
208
227
  # == Usage
@@ -219,18 +238,24 @@ module Daru
219
238
  src[vector] = @data[@vectors[vector]].dup
220
239
  end
221
240
 
222
- Daru::DataFrame.new src, order: @vectors.dup, index: @index.dup, name: @name, dtype: @dtype
241
+ Daru::DataFrame.new src, order: @vectors.dup, index: @index.dup, name: @name
223
242
  end
224
243
 
225
244
  # Iterate over each vector
226
245
  def each_vector(&block)
246
+ return to_enum(:each_vector) unless block_given?
247
+
227
248
  @data.each(&block)
228
249
 
229
250
  self
230
251
  end
231
252
 
253
+ alias_method :each_column, :each_vector
254
+
232
255
  # Iterate over each vector along with the name of the vector
233
256
  def each_vector_with_index(&block)
257
+ return to_enum(:each_vector_with_index) unless block_given?
258
+
234
259
  @vectors.each do |vector|
235
260
  yield @data[@vectors[vector]], vector
236
261
  end
@@ -238,8 +263,12 @@ module Daru
238
263
  self
239
264
  end
240
265
 
266
+ alias_method :each_column_with_index, :each_vector_with_index
267
+
241
268
  # Iterate over each row
242
269
  def each_row(&block)
270
+ return to_enum(:each_row) unless block_given?
271
+
243
272
  @index.each do |index|
244
273
  yield access_row(index)
245
274
  end
@@ -248,6 +277,8 @@ module Daru
248
277
  end
249
278
 
250
279
  def each_row_with_index(&block)
280
+ return to_enum(:each_row_with_index) unless block_given?
281
+
251
282
  @index.each do |index|
252
283
  yield access_row(index), index
253
284
  end
@@ -256,17 +287,27 @@ module Daru
256
287
  end
257
288
 
258
289
  # Map each vector. Returns a DataFrame whose vectors are modified according
259
- # to the value returned by the block.
290
+ # to the value returned by the block. As is the case with Enumerable#map,
291
+ # the object returned by each block must be a Daru::Vector for the dataframe
292
+ # to remain relevant.
260
293
  def map_vectors(&block)
261
- df = self.dup
262
- df.each_vector_with_index do |vector, name|
263
- df[name, :vector] = yield(vector)
264
- end
294
+ return to_enum(:map_vectors) unless block_given?
265
295
 
266
- df
296
+ self.dup.map_vectors!(&block)
267
297
  end
268
298
 
299
+ # Destructive form of #map_vectors
300
+ def map_vectors!(&block)
301
+ return to_enum(:map_vectors!) unless block_given?
302
+
303
+ @data.map!(&block)
304
+ self
305
+ end
306
+
307
+ # Map vectors along with the index.
269
308
  def map_vectors_with_index(&block)
309
+ return to_enum(:map_vectors_with_index) unless block_given?
310
+
270
311
  df = self.dup
271
312
  df.each_vector_with_index do |vector, name|
272
313
  df[name, :vector] = yield(vector, name)
@@ -277,6 +318,8 @@ module Daru
277
318
 
278
319
  # Map each row
279
320
  def map_rows(&block)
321
+ return to_enum(:map_rows) unless block_given?
322
+
280
323
  df = self.dup
281
324
  df.each_row_with_index do |row, index|
282
325
  df[index, :row] = yield(row)
@@ -286,6 +329,8 @@ module Daru
286
329
  end
287
330
 
288
331
  def map_rows_with_index(&block)
332
+ return to_enum(:map_rows_with_index) unless block_given?
333
+
289
334
  df = self.dup
290
335
  df.each_row_with_index do |row, index|
291
336
  df[index, :row] = yield(row, index)
@@ -302,13 +347,16 @@ module Daru
302
347
  else
303
348
  raise IndexError, "Vector #{vector} does not exist."
304
349
  end
350
+
351
+ self
305
352
  end
306
353
 
354
+ # Delete a row
307
355
  def delete_row index
308
356
  idx = named_index_for index
309
357
 
310
358
  if @index.include? idx
311
- @index = (@index.to_a - [idx]).to_index
359
+ @index = reassign_index_as(@index.to_a - [idx])
312
360
  self.each_vector do |vector|
313
361
  vector.delete_at idx
314
362
  end
@@ -343,6 +391,8 @@ module Daru
343
391
  # Iterates over each row and retains it in a new DataFrame if the block returns
344
392
  # true for that row.
345
393
  def filter_rows &block
394
+ return to_enum(:filter_rows) unless block_given?
395
+
346
396
  df = Daru::DataFrame.new({}, order: @vectors.to_a)
347
397
  marked = []
348
398
 
@@ -361,39 +411,255 @@ module Daru
361
411
  # Iterates over each vector and retains it in a new DataFrame if the block returns
362
412
  # true for that vector.
363
413
  def filter_vectors &block
414
+ return to_enum(:filter_vectors) unless block_given?
415
+
364
416
  df = self.dup
365
417
  df.keep_vector_if &block
366
418
 
367
419
  df
368
420
  end
369
421
 
422
+ # Return the number of rows and columns of the DataFrame in an Array.
423
+ def shape
424
+ [@index.size, @vectors.size]
425
+ end
426
+
427
+ # The number of rows
428
+ def rows
429
+ shape[0]
430
+ end
431
+
432
+ # The number of vectors
433
+ def cols
434
+ shape[1]
435
+ end
436
+
370
437
  # Check if a vector is present
371
- def has_vector? name
372
- !!@vectors[name]
438
+ def has_vector? vector
439
+ !!@vectors[*vector]
373
440
  end
374
441
 
442
+ # The first ten elements of the DataFrame
443
+ #
444
+ # @param [Fixnum] quantity (10) The number of elements to display from the top.
375
445
  def head quantity=10
376
446
  self[0..quantity, :row]
377
447
  end
378
448
 
449
+ # The last ten elements of the DataFrame
450
+ #
451
+ # @param [Fixnum] quantity (10) The number of elements to display from the bottom.
379
452
  def tail quantity=10
380
- self[(@size - quantity)..@size, :row]
453
+ self[(@size - quantity)..(@size-1), :row]
381
454
  end
382
455
 
383
- # def sort_by_row name
384
-
385
- # end
456
+ # Group elements by vector to perform operations on them.
457
+ def group_by vectors
458
+ vectors = [vectors] if vectors.is_a?(Symbol)
459
+ vectors.each { |v| raise(ArgumentError, "Vector #{v} does not exist") unless
460
+ has_vector?(v) }
461
+
462
+ Daru::Core::GroupBy.new(self, vectors)
463
+ end
464
+
465
+ # Change the index of the DataFrame and its underlying vectors. Destructive.
466
+ #
467
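+ # @param [Symbol, Array] new_index Specify an Array of new index labels (its size must equal the DataFrame size), or :seq to build the index from the DataFrame size.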
468
+ def reindex! new_index
469
+ raise ArgumentError, "Index size must equal dataframe size" if new_index.is_a?(Array) and new_index.size != @size
470
+
471
+ @index = possibly_multi_index?(new_index == :seq ? @size : new_index)
472
+ @data.map! do |vector|
473
+ vector.reindex possibly_multi_index?(@index.to_a)
474
+ end
475
+
476
+ self
477
+ end
478
+
479
+ # Non-destructive version of #reindex!
480
+ def reindex new_index
481
+ self.dup.reindex! new_index
482
+ end
483
+
484
+ # Return the names of all the numeric vectors. Will include vectors with nils
485
+ # along with numbers.
486
+ def numeric_vectors
487
+ numerics = []
488
+
489
+ each_vector do |vec|
490
+ numerics << vec.name if(vec.type == :numeric)
491
+ end
492
+ numerics
493
+ end
494
+
495
+ # Sorts a dataframe (ascending/descending) according to the given sequence of
496
+ # vectors, using the attributes provided in the blocks. Works for 2 LEVELS ONLY.
497
+ #
498
+ # @param order [Array] The order of vector names in which the DataFrame
499
+ # should be sorted.
500
+ # @param [Hash] opts The options to sort with.
501
+ # @option opts [TrueClass,FalseClass,Array] :ascending (true) Sort in ascending
502
+ # or descending order. Specify Array corresponding to *order* for multiple
503
+ # sort orders.
504
+ # @option opts [Hash] :by ({|a,b| a <=> b}) Specify attributes of objects to
505
+ # be used for sorting, for each vector name in *order* as a hash of
506
+ # vector name and lambda pairs. In case a lambda for a vector is not
507
+ # specified, the default will be used.
508
+ #
509
+ # == Usage
510
+ #
511
+ # df = Daru::DataFrame.new({a: [-3,2,-1,4], b: [4,3,2,1]})
512
+ #
513
+ # #<Daru::DataFrame:140630680 @name = 04e00197-f8d5-4161-bca2-93266bfabc6f @size = 4>
514
+ # # a b
515
+ # # 0 -3 4
516
+ # # 1 2 3
517
+ # # 2 -1 2
518
+ # # 3 4 1
519
+ # df.sort([:a], by: { a: lambda { |a,b| a.abs <=> b.abs } })
520
+ def sort! vector_order, opts={}
521
+ raise ArgumentError, "Required atleast one vector name" if vector_order.size < 1
522
+ opts = {
523
+ ascending: true,
524
+ type: :quick_sort,
525
+ by: {}
526
+ }.merge(opts)
527
+
528
+ opts[:by] = create_logic_blocks vector_order, opts[:by]
529
+ opts[:ascending] = sort_order_array vector_order, opts[:ascending]
530
+ index = @index.to_a
531
+ send(opts[:type], vector_order, index, opts[:by], opts[:ascending])
532
+ reindex! index
533
+ end
534
+
535
+ # Non-destructive version of #sort!
536
+ def sort vector_order, opts={}
537
+ self.dup.sort! vector_order, opts
538
+ end
539
+
540
+ # Pivots a data frame on specified vectors and applies an aggregate function
541
+ # to quickly generate a summary.
542
+ #
543
+ # == Options
544
+ #
545
+ # +:index+ - Keys to group by on the pivot table row index. Pass vector names
546
+ # contained in an Array.
547
+ #
548
+ # +:vectors+ - Keys to group by on the pivot table column index. Pass vector
549
+ # names contained in an Array.
550
+ #
551
+ # +:agg+ - Function to aggregate the grouped values. Default to *:mean*. Can
552
+ # use any of the statistics functions applicable on Vectors that can be found in
553
+ # the Daru::Statistics::Vector module.
554
+ #
555
+ # +:values+ - Columns to aggregate. Will consider all numeric columns not
556
+ # specified in *:index* or *:vectors*. Optional.
557
+ #
558
+ # == Usage
559
+ #
560
+ # df = Daru::DataFrame.new({
561
+ # a: ['foo' , 'foo', 'foo', 'foo', 'foo', 'bar', 'bar', 'bar', 'bar'],
562
+ # b: ['one' , 'one', 'one', 'two', 'two', 'one', 'one', 'two', 'two'],
563
+ # c: ['small','large','large','small','small','large','small','large','small'],
564
+ # d: [1,2,2,3,3,4,5,6,7],
565
+ # e: [2,4,4,6,6,8,10,12,14]
566
+ # })
567
+ # df.pivot_table(index: [:a], vectors: [:b], agg: :sum, values: :e)
568
+ #
569
+ # #=>
570
+ # # #<Daru::DataFrame:88342020 @name = 08cdaf4e-b154-4186-9084-e76dd191b2c9 @size = 2>
571
+ # # [:e, :one] [:e, :two]
572
+ # # [:bar] 18 26
573
+ # # [:foo] 10 12
574
+ def pivot_table opts={}
575
+ raise ArgumentError, "Specify grouping index" if !opts[:index] or opts[:index].empty?
386
576
 
387
- # def sort_by_vector name
577
+ index = opts[:index]
578
+ vectors = opts[:vectors] || []
579
+ aggregate_function = opts[:agg] || :mean
580
+ values =
581
+ if opts[:values].is_a?(Symbol)
582
+ [opts[:values]]
583
+ elsif opts[:values].is_a?(Array)
584
+ opts[:values]
585
+ else # nil
586
+ (@vectors.to_a - (index | vectors)) & numeric_vectors
587
+ end
388
588
 
389
- # end
589
+ raise IndexError, "No numeric vectors to aggregate" if values.empty?
590
+
591
+ grouped = group_by(index)
592
+
593
+ unless vectors.empty?
594
+ super_hash = {}
595
+ values.each do |value|
596
+ grouped.groups.each do |group_name, row_numbers|
597
+ super_hash[group_name] ||= {}
598
+
599
+ row_numbers.each do |num|
600
+ arry = []
601
+ arry << value
602
+ vectors.each { |v| arry << self[v][num] }
603
+ sub_hash = super_hash[group_name]
604
+ sub_hash[arry] ||= []
605
+
606
+ sub_hash[arry] << self[value][num]
607
+ end
608
+ end
609
+ end
610
+
611
+ super_hash.each_value do |sub_hash|
612
+ sub_hash.each do |group_name, aggregates|
613
+ sub_hash[group_name] = Daru::Vector.new(aggregates).send(aggregate_function)
614
+ end
615
+ end
616
+
617
+ df_index = Daru::MultiIndex.new(symbolize(super_hash.keys))
618
+
619
+ vector_indexes = []
620
+ super_hash.each_value do |sub_hash|
621
+ vector_indexes.concat sub_hash.keys
622
+ end
623
+ df_vectors = Daru::MultiIndex.new symbolize(vector_indexes.uniq)
624
+ pivoted_dataframe = Daru::DataFrame.new({}, index: df_index, order: df_vectors)
625
+
626
+ super_hash.each do |row_index, sub_h|
627
+ sub_h.each do |vector_index, val|
628
+ pivoted_dataframe[symbolize(vector_index)][symbolize(row_index)] = val
629
+ end
630
+ end
631
+ return pivoted_dataframe
632
+ else
633
+ grouped.send(aggregate_function)
634
+ end
635
+ end
636
+
637
+ # Convert all vectors of type *:numeric* into a Matrix.
638
+ def to_matrix
639
+ numerics_as_arrays = []
640
+ each_vector do |vector|
641
+ numerics_as_arrays << vector.to_a if(vector.type == :numeric)
642
+ end
643
+
644
+ Matrix.columns numerics_as_arrays
645
+ end
646
+
647
+ # Convert all vectors of type *:numeric* and not containing nils into an NMatrix.
648
+ def to_nmatrix
649
+ numerics_as_arrays = []
650
+ each_vector do |vector|
651
+ numerics_as_arrays << vector.to_a if(vector.type == :numeric and
652
+ vector.nil_positions.size == 0)
653
+ end
654
+
655
+ numerics_as_arrays.transpose.to_nm
656
+ end
390
657
 
391
658
  # Converts the DataFrame into an array of hashes where key is vector name
392
- # and value is the corresponding element.
393
- # The 0th index of the array contains the array of hashes while the 1th
394
- # index contains the indexes of each row of the dataframe. Each element in
395
- # the index array corresponds to its row in the array of hashes, which has
396
- # the same index.
659
+ # and value is the corresponding element. The 0th index of the array contains
660
+ # the array of hashes while the 1st index contains the indexes of each row
661
+ # of the dataframe. Each element in the index array corresponds to its row
662
+ # in the array of hashes, which has the same index.
397
663
  def to_a
398
664
  arry = [[],[]]
399
665
  self.each_row do |row|
@@ -443,7 +709,28 @@ module Daru
443
709
  to_html
444
710
  end
445
711
 
446
- # Pretty print in a nice table format for the command line (irb)
712
+ # Change dtypes of vectors by supplying a hash of :vector_name => :new_dtype
713
+ #
714
+ # == Usage
715
+ # df = Daru::DataFrame.new({a: [1,2,3], b: [1,2,3], c: [1,2,3]})
716
+ # df.recast a: :nmatrix, c: :nmatrix
717
+ def recast opts={}
718
+ opts.each do |vector_name, dtype|
719
+ vector[vector_name].cast(dtype: dtype)
720
+ end
721
+ end
722
+
723
+ # Transpose a DataFrame, transposing elements and row, column indexing.
724
+ def transpose
725
+ arrys = []
726
+ each_vector do |vec|
727
+ arrys << vec.to_a
728
+ end
729
+
730
+ Daru::DataFrame.new(arrys.transpose, index: @vectors, order: @index, dtype: @dtype, name: @name)
731
+ end
732
+
733
+ # Pretty print in a nice table format for the command line (irb/pry/iruby)
447
734
  def inspect spacing=10, threshold=15
448
735
  longest = [@name.to_s.size,
449
736
  @vectors.map(&:to_s).map(&:size).max,
@@ -477,23 +764,14 @@ module Daru
477
764
  content
478
765
  end
479
766
 
480
- def dtype= dtype
481
- @dtype = dtype
482
-
483
- @vectors.each do |vec|
484
- pos = @vectors[vec]
485
- @data[pos] = @data[pos].coerce(@dtype)
486
- end
487
- end
488
-
489
767
  def == other
490
- @index == other.index and @size == other.size and @vectors.all? { |vector|
491
- self[vector, :vector] == other[vector, :vector] }
768
+ @index == other.index and @size == other.size and @vectors == other.vectors and
769
+ @vectors.all? { |vector| self[vector, :vector] == other[vector, :vector] }
492
770
  end
493
771
 
494
772
  def method_missing(name, *args, &block)
495
773
  if md = name.match(/(.+)\=/)
496
- insert_or_modify_vector name[/(.+)\=/].delete("="), args[0]
774
+ insert_or_modify_vector name[/(.+)\=/].delete("=").to_sym, args[0]
497
775
  elsif self.has_vector? name
498
776
  self[name, :vector]
499
777
  else
@@ -503,81 +781,234 @@ module Daru
503
781
 
504
782
  private
505
783
 
506
- def access_vector *names
507
- unless names[1]
508
- if @vectors.include? names[0]
509
- return @data[@vectors[names[0]]]
510
- elsif @vectors.key names[0]
511
- return @data[names[0]]
784
+ def possibly_multi_index? index
785
+ if @index.is_a?(MultiIndex)
786
+ Daru::MultiIndex.new(index)
787
+ else
788
+ Daru::Index.new(index)
789
+ end
790
+ end
791
+
792
+ def quick_sort vector_order, index, by, ascending
793
+ recursive_quick_sort vector_order, index, by, ascending, 0, @size-1
794
+ end
795
+
796
+ # == Arguments
797
+ #
798
+ # vector_order -
799
+ # index -
800
+ # by -
801
+ # ascending -
802
+ # left_lower -
803
+ # right_upper -
804
+ def recursive_quick_sort vector_order, index, by, ascending, left_lower, right_upper
805
+ if left_lower < right_upper
806
+ left_upper, right_lower = partition(vector_order, index, by, ascending, left_lower, right_upper)
807
+ if left_upper - left_lower < right_upper - right_lower
808
+ recursive_quick_sort(vector_order, index, by, ascending, left_lower, left_upper)
809
+ recursive_quick_sort(vector_order, index, by, ascending, right_lower, right_upper)
512
810
  else
513
- raise IndexError, "Specified index #{names[0]} does not exist."
811
+ recursive_quick_sort(vector_order, index, by, ascending, right_lower, right_upper)
812
+ recursive_quick_sort(vector_order, index, by, ascending, left_lower, left_upper)
514
813
  end
515
814
  end
516
- new_vcs = {}
815
+ end
517
816
 
518
- names.each do |name|
519
- name = name.to_sym unless name.is_a?(Integer)
817
+ def partition vector_order, index, by, ascending, left_lower, right_upper
818
+ mindex = (left_lower + right_upper) / 2
819
+ mvalues = vector_order.inject([]) { |a, vector_name| a << vector[vector_name][mindex]; a }
820
+ i = left_lower
821
+ j = right_upper
822
+ descending = ascending.map { |a| !a }
823
+
824
+ i += 1 while(keep?(i, mvalues, vector_order, ascending , by, 0))
825
+ j -= 1 while(keep?(j, mvalues, vector_order, descending, by, 0))
826
+
827
+ while i < j - 1
828
+ @data.each do |vector|
829
+ vector[i], vector[j] = vector[j], vector[i]
830
+ end
831
+ index[i], index[j] = index[j], index[i]
832
+ i += 1
833
+ j -= 1
520
834
 
521
- new_vcs[name] = @data[@vectors[name]]
835
+ i += 1 while(keep?(i, mvalues, vector_order, ascending , by,0))
836
+ j -= 1 while(keep?(j, mvalues, vector_order, descending, by,0))
522
837
  end
523
- Daru::DataFrame.new new_vcs, order: new_vcs.keys, index: @index, name: @name
838
+
839
+ if i <= j
840
+ if i < j
841
+ @data.each do |vector|
842
+ vector[i], vector[j] = vector[j], vector[i]
843
+ end
844
+ index[i], index[j] = index[j], index[i]
845
+ end
846
+ i += 1
847
+ j -= 1
848
+ end
849
+
850
+ [j,i]
524
851
  end
525
852
 
526
- def access_row *names
527
- if names[1].nil?
528
- access_token = names[0]
529
- if access_token.is_a?(Range)
530
- index_arry = @index.to_a
531
-
532
- range =
533
- if access_token.first.is_a?(Numeric)
534
- access_token
535
- else
536
- first_index = index_arry.index access_token.first
537
- last_index = index_arry.index access_token.last
853
+ def keep? current_index, mvalues, vector_order, sort_order, by, vector_order_index
854
+ vector_name = vector_order[vector_order_index]
855
+ if vector_name
856
+ vec = vector[vector_name]
857
+ eval = by[vector_name].call(vec[current_index], mvalues[vector_order_index])
858
+
859
+ if sort_order[vector_order_index] # sort in ascending order
860
+ return false if eval == 1
861
+ return true if eval == -1
862
+ if eval == 0
863
+ keep?(current_index, mvalues, vector_order, sort_order, by, vector_order_index + 1)
864
+ end
865
+ else # sort in descending order
866
+ return false if eval == -1
867
+ return true if eval == 1
868
+ if eval == 0
869
+ keep?(current_index, mvalues, vector_order, sort_order, by, vector_order_index + 1)
870
+ end
871
+ end
872
+ end
873
+ end
538
874
 
539
- first_index..last_index
875
+ def create_logic_blocks vector_order, by={}
876
+ universal_block = lambda { |a,b| a <=> b }
877
+ vector_order.each do |vector|
878
+ by[vector] ||= universal_block
879
+ end
880
+
881
+ by
882
+ end
883
+
884
+ def sort_order_array vector_order, ascending
885
+ if ascending.is_a?(Array)
886
+ raise ArgumentError, "Specify same number of vector names and sort orders" if
887
+ vector_order.size != ascending.size
888
+ return ascending
889
+ else
890
+ Array.new(vector_order.size, ascending)
891
+ end
892
+ end
893
+
894
+ def vectors_index_for location
895
+ if @vectors.include?(location)
896
+ @vectors[location]
897
+ elsif location[0].is_a?(Integer)
898
+ location[0]
899
+ end
900
+ end
901
+
902
+ def access_vector *names
903
+ location = names[0]
904
+ if @vectors.is_a?(MultiIndex)
905
+ pos = vectors_index_for names
906
+
907
+ if pos.is_a?(Integer)
908
+ return @data[pos]
909
+ else # MultiIndex
910
+ new_vectors = pos.map do |tuple|
911
+ @data[vectors_index_for(names + tuple)]
540
912
  end
913
+ Daru::DataFrame.new(new_vectors, index: @index, order: Daru::MultiIndex.new(pos.to_a))
914
+ end
915
+ else
916
+ unless names[1]
917
+ pos = vectors_index_for location
918
+ return @data[pos]
919
+ end
920
+
921
+ new_vcs = {}
922
+ names.each do |name|
923
+ name = name.to_sym unless name.is_a?(Integer)
924
+ new_vcs[name] = @data[@vectors[name]]
925
+ end
926
+ Daru::DataFrame.new new_vcs, order: new_vcs.keys, index: @index, name: @name
927
+ end
928
+ end
929
+
930
+ def access_row *names
931
+ location = names[0]
541
932
 
542
- names = index_arry[range]
933
+ if @index.is_a?(MultiIndex)
934
+ pos = row_index_for names
935
+ if pos.is_a?(Integer)
936
+ return Daru::Vector.new(populate_row_for(pos), index: @vectors, name: pos)
543
937
  else
544
- row = []
545
- name = named_index_for names[0]
546
- @vectors.each do |vector|
547
- row << @data[@vectors[vector]][name]
938
+ new_rows =
939
+ if location.is_a?(Range)
940
+ pos.map { |tuple| populate_row_for(tuple) }
941
+ else
942
+ pos.map { |tuple| populate_row_for(names + tuple) }
548
943
  end
944
+
945
+ Daru::DataFrame.rows(new_rows, order: @vectors, name: @name,
946
+ index: Daru::MultiIndex.new(pos.to_a))
947
+ end
948
+ else
949
+ if names[1].nil?
950
+ if location.is_a?(Range)
951
+ index_arry = @index.to_a
549
952
 
550
- return Daru::Vector.new(row, index: @vectors, name: name, dtype: @dtype)
953
+ range =
954
+ if location.first.is_a?(Numeric)
955
+ location
956
+ else
957
+ first_index = index_arry.index location.first
958
+ last_index = index_arry.index location.last
959
+
960
+ first_index..last_index
961
+ end
962
+
963
+ names = index_arry[range]
964
+ else
965
+ row = []
966
+ name = named_index_for names[0]
967
+ @vectors.each do |vector|
968
+ row << @data[@vectors[vector]][name]
969
+ end
970
+
971
+ return Daru::Vector.new(row, index: @vectors, name: set_name(name))
972
+ end
551
973
  end
974
+ # Access multiple rows
975
+ rows = []
976
+ names.each do |name|
977
+ rows << self.row[name]
978
+ end
979
+
980
+ Daru::DataFrame.rows rows, name: @name
552
981
  end
553
- # Access multiple rows
554
- rows = []
555
- names.each do |name|
556
- rows << self.row[name]
982
+ end
983
+
984
+ def row_index_for location
985
+ if @index.include?(location) or location[0].is_a?(Range)
986
+ @index[location]
987
+ elsif location[0].is_a?(Integer)
988
+ location[0]
989
+ end
990
+ end
991
+
992
+ def populate_row_for pos
993
+ @vectors.map do |vector|
994
+ @data[@vectors[vector]][pos]
557
995
  end
558
-
559
- Daru::DataFrame.rows rows, name: @name, dtype: @dtype
560
996
  end
561
997
 
562
998
  def insert_or_modify_vector name, vector
563
- @vectors = @vectors.re_index(@vectors + name)
999
+ @vectors = reassign_index_as(@vectors + name)
564
1000
  v = nil
565
1001
 
566
1002
  if vector.is_a?(Daru::Vector)
567
- v = Daru::Vector.new [], name: name, index: @index, dtype: @dtype
568
- nil_data = false
1003
+ v = Daru::Vector.new [], name: set_name(name), index: @index
569
1004
  @index.each do |idx|
570
- begin
571
- v[idx] = vector[idx]
572
- rescue IndexError
573
- v[idx] = nil
574
- end
1005
+ v[idx] = vector[idx]
575
1006
  end
576
1007
  else
577
1008
  raise Exception, "Specified vector of length #{vector.size} cannot be inserted in DataFrame of size #{@size}" if
578
1009
  @size != vector.size
579
1010
 
580
- v = vector.dv(name, @index, @dtype)
1011
+ v = Daru::Vector.new(vector, name: set_name(name), index: @index)
581
1012
  end
582
1013
 
583
1014
  @data[@vectors[name]] = v
@@ -585,25 +1016,17 @@ module Daru
585
1016
 
586
1017
  def insert_or_modify_row name, vector
587
1018
  if @index.include? name
588
- v = vector.dv(name, @vectors, @dtype)
1019
+ v = vector.dv(name, @vectors, @dtype)
589
1020
 
590
1021
  @vectors.each do |vector|
591
- begin
592
- @data[@vectors[vector]][name] = v[vector]
593
- rescue IndexError
594
- @data[@vectors[vector]][name] = nil
595
- end
1022
+ @data[@vectors[vector]][name] = v[vector]
596
1023
  end
597
1024
  else
598
- @index = @index.re_index(@index + name)
599
- v = vector.dv(name, @vectors, @dtype)
1025
+ @index = reassign_index_as(@index + name)
1026
+ v = Daru::Vector.new(vector, name: set_name(name), index: @vectors)
600
1027
 
601
1028
  @vectors.each do |vector|
602
- begin
603
- @data[@vectors[vector]].concat v[vector], name
604
- rescue IndexError
605
- @data[@vectors[vector]].concat nil, name
606
- end
1029
+ @data[@vectors[vector]].concat v[vector], name
607
1030
  end
608
1031
  end
609
1032
 
@@ -612,16 +1035,16 @@ module Daru
612
1035
 
613
1036
  def create_empty_vectors
614
1037
  @vectors.each do |name|
615
- @data << Daru::Vector.new([],name: name, index: @index, dtype: @dtype)
1038
+ @data << Daru::Vector.new([], name: set_name(name), index: @index)
616
1039
  end
617
1040
  end
618
1041
 
619
1042
  def validate_labels
620
- raise IndexError, "Expected equal number of vectors for number of Hash pairs" if
621
- @vectors.size != @data.size
1043
+ raise IndexError, "Expected equal number of vector names (#{@vectors.size}) for number of vectors (#{@data.size})." if
1044
+ @vectors and @vectors.size != @data.size
622
1045
 
623
1046
  raise IndexError, "Expected number of indexes same as number of rows" if
624
- @index.size != @data[0].size
1047
+ @index and @data[0] and @index.size != @data[0].size
625
1048
  end
626
1049
 
627
1050
  def validate_vector_sizes
@@ -631,8 +1054,6 @@ module Daru
631
1054
  end
632
1055
 
633
1056
  def validate
634
- # TODO: [IMP] when vectors of different dimensions are specified, they should
635
- # be inserted into the dataframe by inserting nils wherever necessary.
636
1057
  validate_labels
637
1058
  validate_vector_sizes
638
1059
  end
@@ -660,10 +1081,10 @@ module Daru
660
1081
  def create_vectors_index_with vectors, source
661
1082
  vectors = source.keys.sort if vectors.nil?
662
1083
 
663
- if vectors.is_a?(Daru::Index)
664
- @vectors = vectors.to_index
665
- else
1084
+ unless vectors.is_a?(Index) or vectors.is_a?(MultiIndex)
666
1085
  @vectors = Daru::Index.new (vectors + (source.keys - vectors)).uniq.map(&:to_sym)
1086
+ else
1087
+ @vectors = vectors
667
1088
  end
668
1089
  end
669
1090
 
@@ -674,5 +1095,32 @@ module Daru
674
1095
  index == vector.index
675
1096
  end
676
1097
  end
1098
+
1099
+ def reassign_index_as new_index
1100
+ Daru::Index.new new_index
1101
+ end
1102
+
1103
+ def create_index index
1104
+ index.is_a?(MultiIndex) ? index : Daru::Index.new(index)
1105
+ end
1106
+
1107
+ def set_name potential_name
1108
+ potential_name.is_a?(Array) ? potential_name.join.to_sym : potential_name
1109
+ end
1110
+
1111
+ def symbolize arry
1112
+ symbolized_arry =
1113
+ if arry.all? { |e| e.is_a?(Array) }
1114
+ arry.map do |sub_arry|
1115
+ sub_arry.map do |e|
1116
+ e.is_a?(Numeric) ? e : e.to_sym
1117
+ end
1118
+ end
1119
+ else
1120
+ arry.map { |e| e.is_a?(Numeric) ? e : e.to_sym }
1121
+ end
1122
+
1123
+ symbolized_arry
1124
+ end
677
1125
  end
678
1126
  end