daru 0.0.4 → 0.0.5

Sign up to get free protection for your applications and to get access to all the features.
Files changed (40) hide show
  1. checksums.yaml +4 -4
  2. data/CONTRIBUTING.md +0 -0
  3. data/Gemfile +0 -1
  4. data/History.txt +35 -0
  5. data/README.md +178 -198
  6. data/daru.gemspec +5 -7
  7. data/lib/daru.rb +10 -2
  8. data/lib/daru/accessors/array_wrapper.rb +36 -198
  9. data/lib/daru/accessors/nmatrix_wrapper.rb +60 -209
  10. data/lib/daru/core/group_by.rb +183 -0
  11. data/lib/daru/dataframe.rb +615 -167
  12. data/lib/daru/index.rb +17 -16
  13. data/lib/daru/io/io.rb +5 -12
  14. data/lib/daru/maths/arithmetic/dataframe.rb +72 -8
  15. data/lib/daru/maths/arithmetic/vector.rb +19 -6
  16. data/lib/daru/maths/statistics/dataframe.rb +103 -2
  17. data/lib/daru/maths/statistics/vector.rb +102 -61
  18. data/lib/daru/monkeys.rb +8 -0
  19. data/lib/daru/multi_index.rb +199 -0
  20. data/lib/daru/plotting/dataframe.rb +24 -24
  21. data/lib/daru/plotting/vector.rb +14 -15
  22. data/lib/daru/vector.rb +402 -98
  23. data/lib/version.rb +1 -1
  24. data/notebooks/grouping_splitting_pivots.ipynb +529 -0
  25. data/notebooks/intro_with_music_data_.ipynb +104 -119
  26. data/spec/accessors/wrappers_spec.rb +36 -0
  27. data/spec/core/group_by_spec.rb +331 -0
  28. data/spec/dataframe_spec.rb +1237 -475
  29. data/spec/fixtures/sales-funnel.csv +18 -0
  30. data/spec/index_spec.rb +10 -21
  31. data/spec/io/io_spec.rb +4 -14
  32. data/spec/math/arithmetic/dataframe_spec.rb +66 -0
  33. data/spec/math/arithmetic/vector_spec.rb +45 -4
  34. data/spec/math/statistics/dataframe_spec.rb +91 -1
  35. data/spec/math/statistics/vector_spec.rb +32 -6
  36. data/spec/monkeys_spec.rb +10 -1
  37. data/spec/multi_index_spec.rb +216 -0
  38. data/spec/spec_helper.rb +1 -0
  39. data/spec/vector_spec.rb +505 -57
  40. metadata +21 -15
@@ -0,0 +1,183 @@
1
+ module Daru
2
+ module Core
3
+ class GroupBy
4
+
5
+ attr_reader :groups
6
+
7
+ def initialize context, names
8
+ @groups = {}
9
+ @non_group_vectors = context.vectors.to_a - names
10
+ @context = context
11
+ vectors = names.map { |vec| context.vector[vec].to_a }
12
+ tuples = vectors[0].zip(*vectors[1..-1])
13
+ keys = tuples.uniq.sort
14
+
15
+ keys.each do |key|
16
+ @groups[key] = all_indices_for(tuples, key)
17
+ end
18
+ @groups.freeze
19
+ end
20
+
21
+ def size
22
+ index =
23
+ if multi_indexed_grouping?
24
+ Daru::MultiIndex.new symbolize(@groups.keys)
25
+ else
26
+ Daru::Index.new symbolize(@groups.keys.flatten)
27
+ end
28
+
29
+ values = @groups.values.map { |e| e.size }
30
+ Daru::Vector.new(values, index: index, name: :size)
31
+ end
32
+
33
+ def first
34
+ head(1)
35
+ end
36
+
37
+ def last
38
+ tail(1)
39
+ end
40
+
41
+ def head quantity=5
42
+ select_groups_from :first, quantity
43
+ end
44
+
45
+ def tail quantity=5
46
+ select_groups_from :last, quantity
47
+ end
48
+
49
+ # Calculate mean of numeric groups, excluding missing values.
50
+ def mean
51
+ apply_method :numeric, :mean
52
+ end
53
+
54
+ # Calculate the median of numeric groups, excluding missing values.
55
+ def median
56
+ apply_method :numeric, :median
57
+ end
58
+
59
+ # Calculate sum of numeric groups, excluding missing values.
60
+ def sum
61
+ apply_method :numeric, :sum
62
+ end
63
+
64
+ def count
65
+ width = @non_group_vectors.size
66
+ Daru::DataFrame.new([size]*width, order: @non_group_vectors)
67
+ end
68
+
69
+ # Calculate sample standard deviation of numeric vector groups, excluding
70
+ # missing values.
71
+ def std
72
+ apply_method :numeric, :std
73
+ end
74
+
75
+ # Find the max element of each numeric vector group.
76
+ def max
77
+ apply_method :numeric, :max
78
+ end
79
+
80
+ # Find the min element of each numeric vector group.
81
+ def min
82
+ apply_method :numeric, :min
83
+ end
84
+
85
+ # Returns one of the selected groups as a DataFrame.
86
+ def get_group group
87
+ indexes = @groups[group]
88
+ elements = []
89
+
90
+ @context.each_vector do |vector|
91
+ elements << vector.to_a
92
+ end
93
+ rows = []
94
+ transpose = elements.transpose
95
+
96
+ indexes.each do |idx|
97
+ rows << transpose[idx]
98
+ end
99
+ Daru::DataFrame.rows(rows, index: @context.index[indexes], order: @context.vectors)
100
+ end
101
+
102
+ private
103
+
104
+ def select_groups_from method, quantity
105
+ selection = @context
106
+ rows, indexes = [], []
107
+
108
+ @groups.each_value do |index|
109
+ index.send(method, quantity).each do |idx|
110
+ rows << selection.row[idx].to_a
111
+ indexes << idx
112
+ end
113
+ end
114
+ indexes.flatten!
115
+
116
+ Daru::DataFrame.rows(rows, order: @context.vectors, index: indexes)
117
+ end
118
+
119
+ def apply_method method_type, method
120
+ multi_index = multi_indexed_grouping?
121
+ rows, order = [], []
122
+
123
+ @groups.each do |group, indexes|
124
+ single_row = []
125
+ @non_group_vectors.each do |ngvector|
126
+ vector = @context.vector[ngvector]
127
+ if method_type == :numeric and vector.type == :numeric
128
+ slice = vector[*indexes]
129
+
130
+ single_row << (slice.is_a?(Numeric) ? slice : slice.send(method))
131
+ order << ngvector
132
+ end
133
+ end
134
+
135
+ rows << single_row
136
+ end
137
+
138
+ index = symbolize @groups.keys
139
+ index = multi_index ? Daru::MultiIndex.new(index) : Daru::Index.new(index.flatten)
140
+ order = symbolize order
141
+ order =
142
+ if order.all?{ |e| e.is_a?(Array) }
143
+ Daru::MultiIndex.new(order)
144
+ else
145
+ Daru::Index.new(order)
146
+ end
147
+
148
+ Daru::DataFrame.new(rows.transpose, index: index, order: order)
149
+ end
150
+
151
+ def all_indices_for arry, element
152
+ found, index, indexes = -1, -1, []
153
+ while found
154
+ found = arry[index+1..-1].index(element)
155
+ if found
156
+ index = index + found + 1
157
+ indexes << index
158
+ end
159
+ end
160
+ indexes
161
+ end
162
+
163
+ def symbolize arry
164
+ symbolized_arry =
165
+ if arry.all? { |e| e.is_a?(Array) }
166
+ arry.map do |sub_arry|
167
+ sub_arry.map do |e|
168
+ e.is_a?(Numeric) ? e : e.to_sym
169
+ end
170
+ end
171
+ else
172
+ arry.map { |e| e.is_a?(Numeric) ? e : e.to_sym }
173
+ end
174
+
175
+ symbolized_arry
176
+ end
177
+
178
+ def multi_indexed_grouping?
179
+ @groups.keys[0][1] ? true : false
180
+ end
181
+ end
182
+ end
183
+ end
@@ -1,9 +1,11 @@
1
- require_relative 'accessors/dataframe_by_row.rb'
2
- require_relative 'accessors/dataframe_by_vector.rb'
3
- require_relative 'maths/arithmetic/dataframe.rb'
4
- require_relative 'maths/statistics/dataframe.rb'
5
- require_relative 'plotting/dataframe.rb'
6
- require_relative 'io/io.rb'
1
+ $:.unshift File.dirname(__FILE__)
2
+
3
+ require 'accessors/dataframe_by_row.rb'
4
+ require 'accessors/dataframe_by_vector.rb'
5
+ require 'maths/arithmetic/dataframe.rb'
6
+ require 'maths/statistics/dataframe.rb'
7
+ require 'plotting/dataframe.rb'
8
+ require 'io/io.rb'
7
9
 
8
10
  module Daru
9
11
  class DataFrame
@@ -14,7 +16,7 @@ module Daru
14
16
 
15
17
  class << self
16
18
  # Load data from a CSV file.
17
- # Arguments - path, options, block(optional)
19
+ # Arguments - path, options, block(optional)
18
20
  #
19
21
  # Accepts a block for pre-conditioning of CSV data if any.
20
22
  def from_csv path, opts={}, &block
@@ -24,25 +26,25 @@ module Daru
24
26
  # Create DataFrame by specifying rows as an Array of Arrays or Array of
25
27
  # Daru::Vector objects.
26
28
  def rows source, opts={}
29
+ df = nil
27
30
  if source.all? { |v| v.size == source[0].size }
28
31
  first = source[0]
29
32
  index = []
30
- order =
31
- unless opts[:order]
32
- if first.is_a?(Daru::Vector) # assume that all are Vectors only
33
- source.each { |vec| index << vec.name }
34
- first.index.to_a
35
- elsif first.is_a?(Array)
36
- Array.new(first.size) { |i| i.to_s }
37
- end
38
- else
39
- opts[:order]
33
+ opts[:order] ||=
34
+ if first.is_a?(Daru::Vector) # assume that all are Vectors
35
+ source.each { |vec| index << vec.name }
36
+ first.index.to_a
37
+ elsif first.is_a?(Array)
38
+ Array.new(first.size) { |i| i.to_s }
40
39
  end
41
40
 
42
- opts[:order] = order
43
- df = Daru::DataFrame.new({}, opts)
44
- source.each_with_index do |row,idx|
45
- df[(index[idx] || idx), :row] = row
41
+ if source.all? { |s| s.is_a?(Array) }
42
+ df = Daru::DataFrame.new(source.transpose, opts)
43
+ else # array of Daru::Vectors
44
+ df = Daru::DataFrame.new({}, opts)
45
+ source.each_with_index do |row, idx|
46
+ df[(index[idx] || idx), :row] = row
47
+ end
46
48
  end
47
49
  else
48
50
  raise SizeError, "All vectors must have same length"
@@ -65,8 +67,8 @@ module Daru
65
67
  attr_reader :size
66
68
 
67
69
  # DataFrame basically consists of an Array of Vector objects.
68
- # These objects are indexed by row and column by vectors and index Index objects.
69
- # Arguments - source, vectors, index, name.
70
+ # These objects are indexed by row and column by vectors and index Index objects.
71
+ # Arguments - source, vectors, index, name.
70
72
  #
71
73
  # == Usage
72
74
  # df = Daru::DataFrame.new({a: [1,2,3,4], b: [6,7,8,9]}, order: [:b, :a],
@@ -82,43 +84,55 @@ module Daru
82
84
  def initialize source, opts={}
83
85
  vectors = opts[:order]
84
86
  index = opts[:index]
85
- @dtype = opts[:dtype] || Array
86
87
  @name = (opts[:name] || SecureRandom.uuid).to_sym
87
88
  @data = []
88
89
 
89
90
  if source.empty?
90
- @vectors = Daru::Index.new vectors
91
- @index = Daru::Index.new index
91
+ @vectors = create_index vectors
92
+ @index = create_index index
92
93
  create_empty_vectors
93
94
  else
94
95
  case source
95
96
  when Array
96
- if vectors.nil?
97
- @vectors = Daru::Index.new source[0].keys.map(&:to_sym)
98
- else
99
- @vectors = Daru::Index.new (vectors + (source[0].keys - vectors)).uniq.map(&:to_sym)
100
- end
97
+ if source.all? { |s| s.is_a?(Array) }
98
+ raise ArgumentError, "Number of vectors (#{vectors.size}) should \
99
+ equal order size (#{source.size})" if source.size != vectors.size
101
100
 
102
- if index.nil?
103
- @index = Daru::Index.new source.size
104
- else
105
- @index = Daru::Index.new index
106
- end
101
+ @index = create_index(index || source[0].size)
102
+ @vectors = create_index(vectors)
107
103
 
108
- @vectors.each do |name|
109
- v = []
110
- source.each do |hsh|
111
- v << (hsh[name] || hsh[name.to_s])
104
+ @vectors.each_with_index do |vec,idx|
105
+ @data << Daru::Vector.new(source[idx], index: @index)
106
+ end
107
+ elsif source.all? { |s| s.is_a?(Daru::Vector) }
108
+ hsh = {}
109
+ vectors.each_with_index do |name, idx|
110
+ hsh[name] = source[idx]
111
+ end
112
+ initialize(hsh, index: index, order: vectors, name: @name)
113
+ else # array of hashes
114
+ if vectors.nil?
115
+ @vectors = Daru::Index.new source[0].keys.map(&:to_sym)
116
+ else
117
+ @vectors = Daru::Index.new (vectors + (source[0].keys - vectors)).uniq.map(&:to_sym)
112
118
  end
119
+ @index = Daru::Index.new(index || source.size)
120
+
121
+ @vectors.each do |name|
122
+ v = []
123
+ source.each do |hsh|
124
+ v << (hsh[name] || hsh[name.to_s])
125
+ end
113
126
 
114
- @data << v.dv(name, @index, @dtype)
127
+ @data << Daru::Vector.new(v, name: set_name(name), index: @index)
128
+ end
115
129
  end
116
130
  when Hash
117
131
  create_vectors_index_with vectors, source
118
132
  if all_daru_vectors_in_source? source
119
133
  if !index.nil?
120
- @index = index.to_index
121
- elsif all_vectors_have_equal_indexes? source
134
+ @index = create_index index
135
+ elsif all_vectors_have_equal_indexes?(source)
122
136
  @index = source.values[0].index.dup
123
137
  else
124
138
  all_indexes = []
@@ -131,29 +145,17 @@ module Daru
131
145
  @index = Daru::Index.new all_indexes
132
146
  end
133
147
  @vectors.each do |vector|
134
- @data << Daru::Vector.new([], name: vector, index: @index, dtype: @dtype)
148
+ @data << Daru::Vector.new([], name: vector, index: @index)
135
149
 
136
150
  @index.each do |idx|
137
- begin
138
- @data[@vectors[vector]][idx] = source[vector][idx]
139
- rescue IndexError
140
- # If the index is not present in the vector under consideration
141
- # (in source) then an error is raised. Put a nil in that place if
142
- # that is the case.
143
- @data[@vectors[vector]][idx] = nil
144
- end
151
+ @data[@vectors[vector]][idx] = source[vector][idx]
145
152
  end
146
153
  end
147
- else
148
- index = source.values[0].size if index.nil?
149
- if index.is_a?(Daru::Index)
150
- @index = index.to_index
151
- else
152
- @index = Daru::Index.new index
153
- end
154
+ else
155
+ @index = create_index(index || source.values[0].size)
154
156
 
155
157
  @vectors.each do |name|
156
- @data << source[name].dup.dv(name, @index, @dtype)
158
+ @data << Daru::Vector.new(source[name].dup, name: set_name(name), index: @index)
157
159
  end
158
160
  end
159
161
  end
@@ -164,10 +166,17 @@ module Daru
164
166
  end
165
167
 
166
168
  # Access row or vector. Specify name of row/vector followed by axis(:row, :vector).
167
- # Use of this method is not recommended for accessing rows or vectors.
168
- # Use df.row[:a] for accessing row with index ':a' or df.vector[:vec] for
169
- # accessing vector with index ':vec'
170
- def [](*names, axis)
169
+ # Defaults to *:vector*. Use of this method is not recommended for accessing
170
+ # rows or vectors. Use df.row[:a] for accessing row with index ':a' or
171
+ # df.vector[:vec] for accessing vector with index *:vec*.
172
+ def [](*names)
173
+ if names[-1] == :vector or names[-1] == :row
174
+ axis = names[-1]
175
+ names = names[0..-2]
176
+ else
177
+ axis = :vector
178
+ end
179
+
171
180
  if axis == :vector
172
181
  access_vector *names
173
182
  elsif axis == :row
@@ -184,7 +193,12 @@ module Daru
184
193
  # In case a Daru::Vector is specified after the equality sign, the indexes
185
194
  # of the vector will be matched against the row/vector indexes of the DataFrame
186
195
  # before an insertion is performed. Unmatched indexes will be set to nil.
187
- def []=(name, axis ,vector)
196
+ def []=(*args)
197
+ name = args[0]
198
+ axis = args[1]
199
+ vector = args[-1]
200
+
201
+ axis = (!axis.is_a?(Symbol) and (axis != :vector or axis != :row)) ? :vector : axis
188
202
  if axis == :vector
189
203
  insert_or_modify_vector name, vector
190
204
  elsif axis == :row
@@ -203,6 +217,11 @@ module Daru
203
217
  Daru::Accessors::DataFrameByVector.new(self)
204
218
  end
205
219
 
220
+ # Access a vector by name.
221
+ def column name
222
+ vector[name]
223
+ end
224
+
206
225
  # Access a row or set/create a row. Refer #[] and #[]= docs for details.
207
226
  #
208
227
  # == Usage
@@ -219,18 +238,24 @@ module Daru
219
238
  src[vector] = @data[@vectors[vector]].dup
220
239
  end
221
240
 
222
- Daru::DataFrame.new src, order: @vectors.dup, index: @index.dup, name: @name, dtype: @dtype
241
+ Daru::DataFrame.new src, order: @vectors.dup, index: @index.dup, name: @name
223
242
  end
224
243
 
225
244
  # Iterate over each vector
226
245
  def each_vector(&block)
246
+ return to_enum(:each_vector) unless block_given?
247
+
227
248
  @data.each(&block)
228
249
 
229
250
  self
230
251
  end
231
252
 
253
+ alias_method :each_column, :each_vector
254
+
232
255
  # Iterate over each vector along with the name of the vector
233
256
  def each_vector_with_index(&block)
257
+ return to_enum(:each_vector_with_index) unless block_given?
258
+
234
259
  @vectors.each do |vector|
235
260
  yield @data[@vectors[vector]], vector
236
261
  end
@@ -238,8 +263,12 @@ module Daru
238
263
  self
239
264
  end
240
265
 
266
+ alias_method :each_column_with_index, :each_vector_with_index
267
+
241
268
  # Iterate over each row
242
269
  def each_row(&block)
270
+ return to_enum(:each_row) unless block_given?
271
+
243
272
  @index.each do |index|
244
273
  yield access_row(index)
245
274
  end
@@ -248,6 +277,8 @@ module Daru
248
277
  end
249
278
 
250
279
  def each_row_with_index(&block)
280
+ return to_enum(:each_row_with_index) unless block_given?
281
+
251
282
  @index.each do |index|
252
283
  yield access_row(index), index
253
284
  end
@@ -256,17 +287,27 @@ module Daru
256
287
  end
257
288
 
258
289
  # Map each vector. Returns a DataFrame whose vectors are modified according
259
- # to the value returned by the block.
290
+ # to the value returned by the block. As is the case with Enumerable#map,
291
+ # the object returned by each block must be a Daru::Vector for the dataframe
292
+ # to remain relevant.
260
293
  def map_vectors(&block)
261
- df = self.dup
262
- df.each_vector_with_index do |vector, name|
263
- df[name, :vector] = yield(vector)
264
- end
294
+ return to_enum(:map_vectors) unless block_given?
265
295
 
266
- df
296
+ self.dup.map_vectors!(&block)
267
297
  end
268
298
 
299
+ # Destructive form of #map_vectors
300
+ def map_vectors!(&block)
301
+ return to_enum(:map_vectors!) unless block_given?
302
+
303
+ @data.map!(&block)
304
+ self
305
+ end
306
+
307
+ # Map vectors along with the index.
269
308
  def map_vectors_with_index(&block)
309
+ return to_enum(:map_vectors_with_index) unless block_given?
310
+
270
311
  df = self.dup
271
312
  df.each_vector_with_index do |vector, name|
272
313
  df[name, :vector] = yield(vector, name)
@@ -277,6 +318,8 @@ module Daru
277
318
 
278
319
  # Map each row
279
320
  def map_rows(&block)
321
+ return to_enum(:map_rows) unless block_given?
322
+
280
323
  df = self.dup
281
324
  df.each_row_with_index do |row, index|
282
325
  df[index, :row] = yield(row)
@@ -286,6 +329,8 @@ module Daru
286
329
  end
287
330
 
288
331
  def map_rows_with_index(&block)
332
+ return to_enum(:map_rows_with_index) unless block_given?
333
+
289
334
  df = self.dup
290
335
  df.each_row_with_index do |row, index|
291
336
  df[index, :row] = yield(row, index)
@@ -302,13 +347,16 @@ module Daru
302
347
  else
303
348
  raise IndexError, "Vector #{vector} does not exist."
304
349
  end
350
+
351
+ self
305
352
  end
306
353
 
354
+ # Delete a row
307
355
  def delete_row index
308
356
  idx = named_index_for index
309
357
 
310
358
  if @index.include? idx
311
- @index = (@index.to_a - [idx]).to_index
359
+ @index = reassign_index_as(@index.to_a - [idx])
312
360
  self.each_vector do |vector|
313
361
  vector.delete_at idx
314
362
  end
@@ -343,6 +391,8 @@ module Daru
343
391
  # Iterates over each row and retains it in a new DataFrame if the block returns
344
392
  # true for that row.
345
393
  def filter_rows &block
394
+ return to_enum(:filter_rows) unless block_given?
395
+
346
396
  df = Daru::DataFrame.new({}, order: @vectors.to_a)
347
397
  marked = []
348
398
 
@@ -361,39 +411,255 @@ module Daru
361
411
  # Iterates over each vector and retains it in a new DataFrame if the block returns
362
412
  # true for that vector.
363
413
  def filter_vectors &block
414
+ return to_enum(:filter_vectors) unless block_given?
415
+
364
416
  df = self.dup
365
417
  df.keep_vector_if &block
366
418
 
367
419
  df
368
420
  end
369
421
 
422
+ # Return the number of rows and columns of the DataFrame in an Array.
423
+ def shape
424
+ [@index.size, @vectors.size]
425
+ end
426
+
427
+ # The number of rows
428
+ def rows
429
+ shape[0]
430
+ end
431
+
432
+ # The number of vectors
433
+ def cols
434
+ shape[1]
435
+ end
436
+
370
437
  # Check if a vector is present
371
- def has_vector? name
372
- !!@vectors[name]
438
+ def has_vector? vector
439
+ !!@vectors[*vector]
373
440
  end
374
441
 
442
+ # The first ten elements of the DataFrame
443
+ #
444
+ # @param [Fixnum] quantity (10) The number of elements to display from the top.
375
445
  def head quantity=10
376
446
  self[0..quantity, :row]
377
447
  end
378
448
 
449
+ # The last ten elements of the DataFrame
450
+ #
451
+ # @param [Fixnum] quantity (10) The number of elements to display from the bottom.
379
452
  def tail quantity=10
380
- self[(@size - quantity)..@size, :row]
453
+ self[(@size - quantity)..(@size-1), :row]
381
454
  end
382
455
 
383
- # def sort_by_row name
384
-
385
- # end
456
+ # Group elements by vector to perform operations on them.
457
+ def group_by vectors
458
+ vectors = [vectors] if vectors.is_a?(Symbol)
459
+ vectors.each { |v| raise(ArgumentError, "Vector #{v} does not exist") unless
460
+ has_vector?(v) }
461
+
462
+ Daru::Core::GroupBy.new(self, vectors)
463
+ end
464
+
465
+ # Change the index of the DataFrame and its underlying vectors. Destructive.
466
+ #
467
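+ # @param [Symbol, Array] new_index Specify an Array of new index labels (its size must equal the DataFrame size), or :seq to build the index from the DataFrame size.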
468
+ def reindex! new_index
469
+ raise ArgumentError, "Index size must equal dataframe size" if new_index.is_a?(Array) and new_index.size != @size
470
+
471
+ @index = possibly_multi_index?(new_index == :seq ? @size : new_index)
472
+ @data.map! do |vector|
473
+ vector.reindex possibly_multi_index?(@index.to_a)
474
+ end
475
+
476
+ self
477
+ end
478
+
479
+ # Non-destructive version of #reindex!
480
+ def reindex new_index
481
+ self.dup.reindex! new_index
482
+ end
483
+
484
+ # Return the names of all the numeric vectors. Will include vectors with nils
485
+ # along with numbers.
486
+ def numeric_vectors
487
+ numerics = []
488
+
489
+ each_vector do |vec|
490
+ numerics << vec.name if(vec.type == :numeric)
491
+ end
492
+ numerics
493
+ end
494
+
495
+ # Sorts a dataframe (ascending/descending) according to the given sequence of
496
+ # vectors, using the attributes provided in the blocks. Works for 2 LEVELS ONLY.
497
+ #
498
+ # @param order [Array] The order of vector names in which the DataFrame
499
+ # should be sorted.
500
+ # @param [Hash] opts The options to sort with.
501
+ # @option opts [TrueClass,FalseClass,Array] :ascending (true) Sort in ascending
502
+ # or descending order. Specify Array corresponding to *order* for multiple
503
+ # sort orders.
504
+ # @option opts [Hash] :by ({|a,b| a <=> b}) Specify attributes of objects to
505
+ # be used for sorting, for each vector name in *order* as a hash of
506
+ # vector name and lambda pairs. In case a lambda for a vector is not
507
+ # specified, the default will be used.
508
+ #
509
+ # == Usage
510
+ #
511
+ # df = Daru::DataFrame.new({a: [-3,2,-1,4], b: [4,3,2,1]})
512
+ #
513
+ # #<Daru::DataFrame:140630680 @name = 04e00197-f8d5-4161-bca2-93266bfabc6f @size = 4>
514
+ # # a b
515
+ # # 0 -3 4
516
+ # # 1 2 3
517
+ # # 2 -1 2
518
+ # # 3 4 1
519
+ # df.sort([:a], by: { a: lambda { |a,b| a.abs <=> b.abs } })
520
+ def sort! vector_order, opts={}
521
+ raise ArgumentError, "Required atleast one vector name" if vector_order.size < 1
522
+ opts = {
523
+ ascending: true,
524
+ type: :quick_sort,
525
+ by: {}
526
+ }.merge(opts)
527
+
528
+ opts[:by] = create_logic_blocks vector_order, opts[:by]
529
+ opts[:ascending] = sort_order_array vector_order, opts[:ascending]
530
+ index = @index.to_a
531
+ send(opts[:type], vector_order, index, opts[:by], opts[:ascending])
532
+ reindex! index
533
+ end
534
+
535
+ # Non-destructive version of #sort!
536
+ def sort vector_order, opts={}
537
+ self.dup.sort! vector_order, opts
538
+ end
539
+
540
+ # Pivots a data frame on specified vectors and applies an aggregate function
541
+ # to quickly generate a summary.
542
+ #
543
+ # == Options
544
+ #
545
+ # +:index+ - Keys to group by on the pivot table row index. Pass vector names
546
+ # contained in an Array.
547
+ #
548
+ # +:vectors+ - Keys to group by on the pivot table column index. Pass vector
549
+ # names contained in an Array.
550
+ #
551
+ # +:agg+ - Function to aggregate the grouped values. Defaults to *:mean*. Can
552
+ # use any of the statistics functions applicable on Vectors that can be found in
553
+ # the Daru::Statistics::Vector module.
554
+ #
555
+ # +:values+ - Columns to aggregate. Will consider all numeric columns not
556
+ # specified in *:index* or *:vectors*. Optional.
557
+ #
558
+ # == Usage
559
+ #
560
+ # df = Daru::DataFrame.new({
561
+ # a: ['foo' , 'foo', 'foo', 'foo', 'foo', 'bar', 'bar', 'bar', 'bar'],
562
+ # b: ['one' , 'one', 'one', 'two', 'two', 'one', 'one', 'two', 'two'],
563
+ # c: ['small','large','large','small','small','large','small','large','small'],
564
+ # d: [1,2,2,3,3,4,5,6,7],
565
+ # e: [2,4,4,6,6,8,10,12,14]
566
+ # })
567
+ # df.pivot_table(index: [:a], vectors: [:b], agg: :sum, values: :e)
568
+ #
569
+ # #=>
570
+ # # #<Daru::DataFrame:88342020 @name = 08cdaf4e-b154-4186-9084-e76dd191b2c9 @size = 2>
571
+ # # [:e, :one] [:e, :two]
572
+ # # [:bar] 18 26
573
+ # # [:foo] 10 12
574
+ def pivot_table opts={}
575
+ raise ArgumentError, "Specify grouping index" if !opts[:index] or opts[:index].empty?
386
576
 
387
- # def sort_by_vector name
577
+ index = opts[:index]
578
+ vectors = opts[:vectors] || []
579
+ aggregate_function = opts[:agg] || :mean
580
+ values =
581
+ if opts[:values].is_a?(Symbol)
582
+ [opts[:values]]
583
+ elsif opts[:values].is_a?(Array)
584
+ opts[:values]
585
+ else # nil
586
+ (@vectors.to_a - (index | vectors)) & numeric_vectors
587
+ end
388
588
 
389
- # end
589
+ raise IndexError, "No numeric vectors to aggregate" if values.empty?
590
+
591
+ grouped = group_by(index)
592
+
593
+ unless vectors.empty?
594
+ super_hash = {}
595
+ values.each do |value|
596
+ grouped.groups.each do |group_name, row_numbers|
597
+ super_hash[group_name] ||= {}
598
+
599
+ row_numbers.each do |num|
600
+ arry = []
601
+ arry << value
602
+ vectors.each { |v| arry << self[v][num] }
603
+ sub_hash = super_hash[group_name]
604
+ sub_hash[arry] ||= []
605
+
606
+ sub_hash[arry] << self[value][num]
607
+ end
608
+ end
609
+ end
610
+
611
+ super_hash.each_value do |sub_hash|
612
+ sub_hash.each do |group_name, aggregates|
613
+ sub_hash[group_name] = Daru::Vector.new(aggregates).send(aggregate_function)
614
+ end
615
+ end
616
+
617
+ df_index = Daru::MultiIndex.new(symbolize(super_hash.keys))
618
+
619
+ vector_indexes = []
620
+ super_hash.each_value do |sub_hash|
621
+ vector_indexes.concat sub_hash.keys
622
+ end
623
+ df_vectors = Daru::MultiIndex.new symbolize(vector_indexes.uniq)
624
+ pivoted_dataframe = Daru::DataFrame.new({}, index: df_index, order: df_vectors)
625
+
626
+ super_hash.each do |row_index, sub_h|
627
+ sub_h.each do |vector_index, val|
628
+ pivoted_dataframe[symbolize(vector_index)][symbolize(row_index)] = val
629
+ end
630
+ end
631
+ return pivoted_dataframe
632
+ else
633
+ grouped.send(aggregate_function)
634
+ end
635
+ end
636
+
637
+ # Convert all vectors of type *:numeric* into a Matrix.
638
+ def to_matrix
639
+ numerics_as_arrays = []
640
+ each_vector do |vector|
641
+ numerics_as_arrays << vector.to_a if(vector.type == :numeric)
642
+ end
643
+
644
+ Matrix.columns numerics_as_arrays
645
+ end
646
+
647
+ # Convert all vectors of type *:numeric* and not containing nils into an NMatrix.
648
+ def to_nmatrix
649
+ numerics_as_arrays = []
650
+ each_vector do |vector|
651
+ numerics_as_arrays << vector.to_a if(vector.type == :numeric and
652
+ vector.nil_positions.size == 0)
653
+ end
654
+
655
+ numerics_as_arrays.transpose.to_nm
656
+ end
390
657
 
391
658
  # Converts the DataFrame into an array of hashes where key is vector name
392
- # and value is the corresponding element.
393
- # The 0th index of the array contains the array of hashes while the 1th
394
- # index contains the indexes of each row of the dataframe. Each element in
395
- # the index array corresponds to its row in the array of hashes, which has
396
- # the same index.
659
+ # and value is the corresponding element. The 0th index of the array contains
660
+ # the array of hashes while the 1st index contains the indexes of each row
661
+ # of the dataframe. Each element in the index array corresponds to its row
662
+ # in the array of hashes, which has the same index.
397
663
  def to_a
398
664
  arry = [[],[]]
399
665
  self.each_row do |row|
@@ -443,7 +709,28 @@ module Daru
443
709
  to_html
444
710
  end
445
711
 
446
- # Pretty print in a nice table format for the command line (irb)
712
+ # Change dtypes of vectors by supplying a hash of :vector_name => :new_dtype
713
+ #
714
+ # == Usage
715
+ # df = Daru::DataFrame.new({a: [1,2,3], b: [1,2,3], c: [1,2,3]})
716
+ # df.recast a: :nmatrix, c: :nmatrix
717
+ def recast opts={}
718
+ opts.each do |vector_name, dtype|
719
+ vector[vector_name].cast(dtype: dtype)
720
+ end
721
+ end
722
+
723
+ # Transpose a DataFrame, transposing elements and row, column indexing.
724
+ def transpose
725
+ arrys = []
726
+ each_vector do |vec|
727
+ arrys << vec.to_a
728
+ end
729
+
730
+ Daru::DataFrame.new(arrys.transpose, index: @vectors, order: @index, dtype: @dtype, name: @name)
731
+ end
732
+
733
+ # Pretty print in a nice table format for the command line (irb/pry/iruby)
447
734
  def inspect spacing=10, threshold=15
448
735
  longest = [@name.to_s.size,
449
736
  @vectors.map(&:to_s).map(&:size).max,
@@ -477,23 +764,14 @@ module Daru
477
764
  content
478
765
  end
479
766
 
480
- def dtype= dtype
481
- @dtype = dtype
482
-
483
- @vectors.each do |vec|
484
- pos = @vectors[vec]
485
- @data[pos] = @data[pos].coerce(@dtype)
486
- end
487
- end
488
-
489
767
  def == other
490
- @index == other.index and @size == other.size and @vectors.all? { |vector|
491
- self[vector, :vector] == other[vector, :vector] }
768
+ @index == other.index and @size == other.size and @vectors == other.vectors and
769
+ @vectors.all? { |vector| self[vector, :vector] == other[vector, :vector] }
492
770
  end
493
771
 
494
772
  def method_missing(name, *args, &block)
495
773
  if md = name.match(/(.+)\=/)
496
- insert_or_modify_vector name[/(.+)\=/].delete("="), args[0]
774
+ insert_or_modify_vector name[/(.+)\=/].delete("=").to_sym, args[0]
497
775
  elsif self.has_vector? name
498
776
  self[name, :vector]
499
777
  else
@@ -503,81 +781,234 @@ module Daru
503
781
 
504
782
  private
505
783
 
506
- def access_vector *names
507
- unless names[1]
508
- if @vectors.include? names[0]
509
- return @data[@vectors[names[0]]]
510
- elsif @vectors.key names[0]
511
- return @data[names[0]]
784
# Wrap +index+ in the same kind of index object this DataFrame uses for its
# rows: a Daru::MultiIndex when @index is a MultiIndex, otherwise a Daru::Index.
# NOTE: despite the trailing '?', this returns an index object, not a boolean.
def possibly_multi_index? index
  index_class = @index.is_a?(MultiIndex) ? Daru::MultiIndex : Daru::Index
  index_class.new(index)
end
791
+
792
# Entry point of the quick sort: sorts every row position, 0 through @size - 1.
def quick_sort vector_order, index, by, ascending
  first_row = 0
  last_row = @size - 1
  recursive_quick_sort(vector_order, index, by, ascending, first_row, last_row)
end
795
+
796
+ # == Arguments
797
+ #
798
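+ # vector_order - Names of the vectors to sort by, in order of precedence.
799
+ # index - Row index labels; reordered in step with the rows.
800
+ # by - Hash of vector name => callable comparing two values of that vector.
801
+ # ascending - Array with one boolean per entry of vector_order (true sorts ascending).
802
+ # left_lower - First row position of the range being sorted.
803
+ # right_upper - Last row position of the range being sorted.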
804
# Sort the rows between positions left_lower and right_upper: partition the
# range, then recurse into both halves, handling the shorter half first.
def recursive_quick_sort vector_order, index, by, ascending, left_lower, right_upper
  return unless left_lower < right_upper

  left_upper, right_lower = partition(vector_order, index, by, ascending, left_lower, right_upper)
  left_half = [left_lower, left_upper]
  right_half = [right_lower, right_upper]
  left_is_shorter = left_upper - left_lower < right_upper - right_lower
  sequence = left_is_shorter ? [left_half, right_half] : [right_half, left_half]

  sequence.each do |lower, upper|
    recursive_quick_sort(vector_order, index, by, ascending, lower, upper)
  end
  nil
end
517
816
 
518
- names.each do |name|
519
- name = name.to_sym unless name.is_a?(Integer)
817
# Partition the rows between left_lower and right_upper around the row that
# sits at the midpoint when the call starts. Rows are swapped in every vector
# of @data and in +index+ together. Returns [j, i]: the upper bound of the
# left part and the lower bound of the right part.
def partition vector_order, index, by, ascending, left_lower, right_upper
  pivot_position = (left_lower + right_upper) / 2
  # Copy the pivot's values up front; the pivot row itself may be moved by a swap.
  pivot_values = vector_order.map { |vector_name| vector[vector_name][pivot_position] }
  descending = ascending.map { |flag| !flag }
  low = left_lower
  high = right_upper

  swap_rows = lambda do |a, b|
    @data.each { |vec| vec[a], vec[b] = vec[b], vec[a] }
    index[a], index[b] = index[b], index[a]
  end

  low += 1 while keep?(low, pivot_values, vector_order, ascending, by, 0)
  high -= 1 while keep?(high, pivot_values, vector_order, descending, by, 0)

  while low < high - 1
    swap_rows.call(low, high)
    low += 1
    high -= 1

    low += 1 while keep?(low, pivot_values, vector_order, ascending, by, 0)
    high -= 1 while keep?(high, pivot_values, vector_order, descending, by, 0)
  end

  if low <= high
    swap_rows.call(low, high) if low < high
    low += 1
    high -= 1
  end

  [high, low]
end
525
852
 
526
- def access_row *names
527
- if names[1].nil?
528
- access_token = names[0]
529
- if access_token.is_a?(Range)
530
- index_arry = @index.to_a
531
-
532
- range =
533
- if access_token.first.is_a?(Numeric)
534
- access_token
535
- else
536
- first_index = index_arry.index access_token.first
537
- last_index = index_arry.index access_token.last
853
# Decide whether the row at +current_index+ sorts strictly before the pivot
# row (+mvalues+) under +sort_order+, i.e. whether a partition scan may step
# past it.
#
# current_index      - Position of the row being examined.
# mvalues            - Pivot values, one per entry of vector_order.
# vector_order       - Names of the vectors to compare, in order of precedence.
# sort_order         - One boolean per vector; true means ascending.
# by                 - Hash of vector name => callable(value, pivot_value).
# vector_order_index - Which entry of vector_order to compare first.
#
# Vectors are compared in order; a tie on one vector defers to the next one.
# Returns true or false. A row that ties on every vector, or whose values
# cannot be compared (the callable returns nil), yields false.
def keep? current_index, mvalues, vector_order, sort_order, by, vector_order_index
  vector_name = vector_order[vector_order_index]
  return false unless vector_name # every vector tied: the row equals the pivot

  comparison = by[vector_name].call(vector[vector_name][current_index], mvalues[vector_order_index])
  return false if comparison.nil?

  if comparison == 0
    return keep?(current_index, mvalues, vector_order, sort_order, by, vector_order_index + 1)
  end

  # Judge by sign rather than by exactly 1/-1, so comparators that return any
  # negative or positive number (e.g. a - b) are honoured as well.
  sort_order[vector_order_index] ? comparison < 0 : comparison > 0
end
538
874
 
539
- first_index..last_index
875
# Make sure every vector in +vector_order+ has a comparison callable in +by+.
# Vectors without one get a default that compares with <=>. The +by+ Hash is
# filled in place and returned.
def create_logic_blocks vector_order, by={}
  default_comparator = lambda { |left, right| left <=> right }

  vector_order.each_with_object(by) do |vector_name, comparators|
    comparators[vector_name] ||= default_comparator
  end
end
883
+
884
# Normalise +ascending+ into one sort flag per vector in +vector_order+.
# A single value is repeated for every vector; an Array is returned as-is
# after checking that it has one entry per vector.
#
# Raises ArgumentError when an Array of the wrong length is given.
def sort_order_array vector_order, ascending
  return Array.new(vector_order.size, ascending) unless ascending.is_a?(Array)

  if vector_order.size != ascending.size
    raise ArgumentError, "Specify same number of vector names and sort orders"
  end

  ascending
end
893
+
894
# Resolve +location+ to whatever @vectors maps it to, or to a raw integer
# position.
#
# location - A vector label known to @vectors, a bare Integer position, or
#            an Array whose first element is an Integer position.
#
# Returns the value of @vectors[location] for a known label, the Integer
# position otherwise, or nil when the location cannot be resolved.
def vectors_index_for location
  if @vectors.include?(location)
    @vectors[location]
  elsif location.is_a?(Integer)
    # A bare Integer must be returned as-is: Integer#[] is a bit reference,
    # so location[0] would turn position 2 into 0 and position 3 into 1.
    location
  elsif location[0].is_a?(Integer)
    location[0]
  end
end
901
+
902
# Fetch one or more vectors by name or position.
#
# With a MultiIndex over the vectors, +names+ is resolved as a whole: an
# Integer result selects a single vector, anything else is treated as a
# collection of tuples and a new DataFrame is built from the vectors found at
# names + tuple. With a plain index, a single name returns that vector and
# several names return a DataFrame containing just those vectors.
def access_vector *names
  if @vectors.is_a?(MultiIndex)
    pos = vectors_index_for names
    return @data[pos] if pos.is_a?(Integer)

    sub_vectors = pos.map { |tuple| @data[vectors_index_for(names + tuple)] }
    return Daru::DataFrame.new(sub_vectors, index: @index, order: Daru::MultiIndex.new(pos.to_a))
  end

  return @data[vectors_index_for(names[0])] unless names[1]

  selected = names.each_with_object({}) do |name, picked|
    key = name.is_a?(Integer) ? name : name.to_sym
    picked[key] = @data[@vectors[key]]
  end
  Daru::DataFrame.new selected, order: selected.keys, index: @index, name: @name
end
929
+
930
# Fetch one or more rows by index label, position or Range.
#
# With a MultiIndex over the rows, +names+ is resolved as a whole: an Integer
# result yields that single row as a Vector, anything else is treated as a
# collection of tuples and yields a DataFrame of the matching rows. With a
# plain index, a single label yields a Vector, while a Range or several
# labels yield a DataFrame built from the individual rows.
def access_row *names
  location = names[0]

  if @index.is_a?(MultiIndex)
    pos = row_index_for names
    if pos.is_a?(Integer)
      return Daru::Vector.new(populate_row_for(pos), index: @vectors, name: pos)
    end

    # For a Range the tuples are passed on unchanged; otherwise each tuple is
    # appended to the requested names before the row is looked up.
    new_rows = pos.map do |tuple|
      populate_row_for(location.is_a?(Range) ? tuple : names + tuple)
    end
    return Daru::DataFrame.rows(new_rows, order: @vectors, name: @name,
      index: Daru::MultiIndex.new(pos.to_a))
  end

  if names[1].nil?
    unless location.is_a?(Range)
      name = named_index_for location
      row = @vectors.map { |vector| @data[@vectors[vector]][name] }
      return Daru::Vector.new(row, index: @vectors, name: set_name(name))
    end

    # Expand the Range into the list of index labels it covers. A Range of
    # labels is first translated into a Range of positions.
    index_arry = @index.to_a
    range =
      if location.first.is_a?(Numeric)
        location
      else
        index_arry.index(location.first)..index_arry.index(location.last)
      end
    names = index_arry[range]
  end

  # Access multiple rows
  rows = names.map { |name| self.row[name] }
  Daru::DataFrame.rows rows, name: @name
end
983
+
984
# Resolve +location+ to whatever @index maps it to, or to a raw integer
# position.
#
# location - An index label known to @index, an Array whose first element is
#            a Range or an Integer position, or a bare Integer position.
#
# Returns the value of @index[location] for a known label or a Range
# request, the Integer position otherwise, or nil when the location cannot
# be resolved.
def row_index_for location
  if @index.include?(location)
    @index[location]
  elsif location.is_a?(Integer)
    # A bare Integer must be returned as-is: Integer#[] is a bit reference,
    # so location[0] would turn position 2 into 0.
    location
  elsif location[0].is_a?(Range)
    @index[location]
  elsif location[0].is_a?(Integer)
    location[0]
  end
end
991
+
992
# Collect the element stored at +pos+ from every vector, in vector order.
def populate_row_for pos
  @vectors.map { |vector_name| @data[@vectors[vector_name]][pos] }
end
561
997
 
562
998
  def insert_or_modify_vector name, vector
563
- @vectors = @vectors.re_index(@vectors + name)
999
+ @vectors = reassign_index_as(@vectors + name)
564
1000
  v = nil
565
1001
 
566
1002
  if vector.is_a?(Daru::Vector)
567
- v = Daru::Vector.new [], name: name, index: @index, dtype: @dtype
568
- nil_data = false
1003
+ v = Daru::Vector.new [], name: set_name(name), index: @index
569
1004
  @index.each do |idx|
570
- begin
571
- v[idx] = vector[idx]
572
- rescue IndexError
573
- v[idx] = nil
574
- end
1005
+ v[idx] = vector[idx]
575
1006
  end
576
1007
  else
577
1008
  raise Exception, "Specified vector of length #{vector.size} cannot be inserted in DataFrame of size #{@size}" if
578
1009
  @size != vector.size
579
1010
 
580
- v = vector.dv(name, @index, @dtype)
1011
+ v = Daru::Vector.new(vector, name: set_name(name), index: @index)
581
1012
  end
582
1013
 
583
1014
  @data[@vectors[name]] = v
@@ -585,25 +1016,17 @@ module Daru
585
1016
 
586
1017
  def insert_or_modify_row name, vector
587
1018
  if @index.include? name
588
- v = vector.dv(name, @vectors, @dtype)
1019
+ v = vector.dv(name, @vectors, @dtype)
589
1020
 
590
1021
  @vectors.each do |vector|
591
- begin
592
- @data[@vectors[vector]][name] = v[vector]
593
- rescue IndexError
594
- @data[@vectors[vector]][name] = nil
595
- end
1022
+ @data[@vectors[vector]][name] = v[vector]
596
1023
  end
597
1024
  else
598
- @index = @index.re_index(@index + name)
599
- v = vector.dv(name, @vectors, @dtype)
1025
+ @index = reassign_index_as(@index + name)
1026
+ v = Daru::Vector.new(vector, name: set_name(name), index: @vectors)
600
1027
 
601
1028
  @vectors.each do |vector|
602
- begin
603
- @data[@vectors[vector]].concat v[vector], name
604
- rescue IndexError
605
- @data[@vectors[vector]].concat nil, name
606
- end
1029
+ @data[@vectors[vector]].concat v[vector], name
607
1030
  end
608
1031
  end
609
1032
 
@@ -612,16 +1035,16 @@ module Daru
612
1035
 
613
1036
  def create_empty_vectors
614
1037
  @vectors.each do |name|
615
- @data << Daru::Vector.new([],name: name, index: @index, dtype: @dtype)
1038
+ @data << Daru::Vector.new([], name: set_name(name), index: @index)
616
1039
  end
617
1040
  end
618
1041
 
619
1042
  def validate_labels
620
- raise IndexError, "Expected equal number of vectors for number of Hash pairs" if
621
- @vectors.size != @data.size
1043
+ raise IndexError, "Expected equal number of vector names (#{@vectors.size}) for number of vectors (#{@data.size})." if
1044
+ @vectors and @vectors.size != @data.size
622
1045
 
623
1046
  raise IndexError, "Expected number of indexes same as number of rows" if
624
- @index.size != @data[0].size
1047
+ @index and @data[0] and @index.size != @data[0].size
625
1048
  end
626
1049
 
627
1050
  def validate_vector_sizes
@@ -631,8 +1054,6 @@ module Daru
631
1054
  end
632
1055
 
633
1056
  def validate
634
- # TODO: [IMP] when vectors of different dimensions are specified, they should
635
- # be inserted into the dataframe by inserting nils wherever necessary.
636
1057
  validate_labels
637
1058
  validate_vector_sizes
638
1059
  end
@@ -660,10 +1081,10 @@ module Daru
660
1081
  def create_vectors_index_with vectors, source
661
1082
  vectors = source.keys.sort if vectors.nil?
662
1083
 
663
- if vectors.is_a?(Daru::Index)
664
- @vectors = vectors.to_index
665
- else
1084
+ unless vectors.is_a?(Index) or vectors.is_a?(MultiIndex)
666
1085
  @vectors = Daru::Index.new (vectors + (source.keys - vectors)).uniq.map(&:to_sym)
1086
+ else
1087
+ @vectors = vectors
667
1088
  end
668
1089
  end
669
1090
 
@@ -674,5 +1095,32 @@ module Daru
674
1095
  index == vector.index
675
1096
  end
676
1097
  end
1098
+
1099
# Build a fresh Daru::Index from +new_index+.
def reassign_index_as new_index
  Daru::Index.new(new_index)
end
1102
+
1103
# Return +index+ untouched when it already is a MultiIndex; otherwise wrap it
# in a Daru::Index.
def create_index index
  return index if index.is_a?(MultiIndex)

  Daru::Index.new(index)
end
1106
+
1107
# Turn a label into a name: an Array is joined into a single Symbol, anything
# else is returned unchanged.
def set_name potential_name
  return potential_name unless potential_name.is_a?(Array)

  potential_name.join.to_sym
end
1110
+
1111
# Convert every non-numeric element of +arry+ to a Symbol, leaving numbers
# as they are. When every element is itself an Array, the conversion is
# applied one level down, to the elements of each sub-array.
def symbolize arry
  to_label = lambda { |element| element.is_a?(Numeric) ? element : element.to_sym }

  if arry.all? { |element| element.is_a?(Array) }
    arry.map { |sub_arry| sub_arry.map { |element| to_label.call(element) } }
  else
    arry.map { |element| to_label.call(element) }
  end
end
677
1125
  end
678
1126
  end