daru 0.0.4 → 0.0.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40) hide show
  1. checksums.yaml +4 -4
  2. data/CONTRIBUTING.md +0 -0
  3. data/Gemfile +0 -1
  4. data/History.txt +35 -0
  5. data/README.md +178 -198
  6. data/daru.gemspec +5 -7
  7. data/lib/daru.rb +10 -2
  8. data/lib/daru/accessors/array_wrapper.rb +36 -198
  9. data/lib/daru/accessors/nmatrix_wrapper.rb +60 -209
  10. data/lib/daru/core/group_by.rb +183 -0
  11. data/lib/daru/dataframe.rb +615 -167
  12. data/lib/daru/index.rb +17 -16
  13. data/lib/daru/io/io.rb +5 -12
  14. data/lib/daru/maths/arithmetic/dataframe.rb +72 -8
  15. data/lib/daru/maths/arithmetic/vector.rb +19 -6
  16. data/lib/daru/maths/statistics/dataframe.rb +103 -2
  17. data/lib/daru/maths/statistics/vector.rb +102 -61
  18. data/lib/daru/monkeys.rb +8 -0
  19. data/lib/daru/multi_index.rb +199 -0
  20. data/lib/daru/plotting/dataframe.rb +24 -24
  21. data/lib/daru/plotting/vector.rb +14 -15
  22. data/lib/daru/vector.rb +402 -98
  23. data/lib/version.rb +1 -1
  24. data/notebooks/grouping_splitting_pivots.ipynb +529 -0
  25. data/notebooks/intro_with_music_data_.ipynb +104 -119
  26. data/spec/accessors/wrappers_spec.rb +36 -0
  27. data/spec/core/group_by_spec.rb +331 -0
  28. data/spec/dataframe_spec.rb +1237 -475
  29. data/spec/fixtures/sales-funnel.csv +18 -0
  30. data/spec/index_spec.rb +10 -21
  31. data/spec/io/io_spec.rb +4 -14
  32. data/spec/math/arithmetic/dataframe_spec.rb +66 -0
  33. data/spec/math/arithmetic/vector_spec.rb +45 -4
  34. data/spec/math/statistics/dataframe_spec.rb +91 -1
  35. data/spec/math/statistics/vector_spec.rb +32 -6
  36. data/spec/monkeys_spec.rb +10 -1
  37. data/spec/multi_index_spec.rb +216 -0
  38. data/spec/spec_helper.rb +1 -0
  39. data/spec/vector_spec.rb +505 -57
  40. metadata +21 -15
@@ -2,10 +2,13 @@ module Daru
2
2
  class Index
3
3
  include Enumerable
4
4
 
5
- # needs to iterate over keys sorted by their values. Happens right now by
6
- # virtue of ordered Hashes (ruby).
7
5
  def each(&block)
8
6
  @relation_hash.each_key(&block)
7
+ self
8
+ end
9
+
10
+ def map(&block)
11
+ to_a.map(&block)
9
12
  end
10
13
 
11
14
  attr_reader :relation_hash
@@ -19,6 +22,7 @@ module Daru
19
22
 
20
23
  index = 0 if index.nil?
21
24
  index = Array.new(index) { |i| i} if index.is_a? Integer
25
+ index = index.to_a if index.is_a? Daru::Index
22
26
 
23
27
  if values.nil?
24
28
  index.each_with_index do |n, idx|
@@ -36,7 +40,6 @@ module Daru
36
40
  end
37
41
 
38
42
  @relation_hash.freeze
39
-
40
43
  @size = @relation_hash.size
41
44
 
42
45
  if index[0].is_a?(Integer)
@@ -49,22 +52,28 @@ module Daru
49
52
  def ==(other)
50
53
  return false if other.size != @size
51
54
 
52
- @relation_hash.keys == other.to_a
55
+ @relation_hash.keys == other.to_a and @relation_hash.values == other.relation_hash.values
53
56
  end
54
57
 
55
58
  def [](key)
56
59
  case key
57
60
  when Range
58
- first = @relation_hash[key.first]
59
- last = @relation_hash[key.last]
61
+ if key.first.is_a?(Integer) and key.last.is_a?(Integer)
62
+ first = key.first
63
+ last = key.last
64
+ else
65
+ first = @relation_hash[key.first]
66
+ last = @relation_hash[key.last]
67
+ end
60
68
 
61
69
  indexes = []
62
-
63
70
  (first..last).each do |idx|
64
71
  indexes << @relation_hash.key(idx)
65
72
  end
66
73
 
67
74
  Daru::Index.new indexes, (first..last).to_a
75
+ when Array # works only with numeric indices
76
+ Daru::Index.new key.map { |k| @relation_hash.key(k) }, key
68
77
  else
69
78
  @relation_hash[key]
70
79
  end
@@ -85,11 +94,7 @@ module Daru
85
94
  end
86
95
 
87
96
  def key(value)
88
- @relation_hash.key value
89
- end
90
-
91
- def re_index new_index
92
- new_index.to_index
97
+ @relation_hash.keys[value]
93
98
  end
94
99
 
95
100
  def include? index
@@ -99,9 +104,5 @@ module Daru
99
104
  def dup
100
105
  Daru::Index.new @relation_hash.keys
101
106
  end
102
-
103
- def to_index
104
- self
105
- end
106
107
  end
107
108
  end
@@ -7,23 +7,16 @@ module Daru
7
7
  opts[:converters] ||= :numeric
8
8
  opts[:header_converters] ||= :symbol
9
9
 
10
- csv = CSV.open(path, 'r', opts)
10
+ csv = CSV.read(path, 'r', opts)
11
11
 
12
12
  yield csv if block_given?
13
13
 
14
- first = true
15
- df = nil
16
-
17
- csv.each_with_index do |row, index|
18
- if first
19
- df = Daru::DataFrame.new({}, order: csv.headers, name: opts[:name])
20
- first = false
21
- end
22
-
23
- df.row[index] = row.fields
14
+ hsh = {}
15
+ csv.by_col!.each do |col_name, values|
16
+ hsh[col_name] = values
24
17
  end
25
18
 
26
- df
19
+ Daru::DataFrame.new(hsh)
27
20
  end
28
21
  end
29
22
  end
@@ -1,26 +1,90 @@
1
1
  module Daru
2
2
  module Maths
3
+ # Module encapsulating all arithmetic methods on DataFrame.
3
4
  module Arithmetic
4
- module DataFrame
5
-
5
+ module DataFrame
6
+
7
+ # Add a scalar or another DataFrame
6
8
  def + other
7
-
9
+ binary_operation :+, other
8
10
  end
9
11
 
10
- def - other
11
-
12
+ # Subtract a scalar or another DataFrame.
13
+ def - other
14
+ binary_operation :-, other
12
15
  end
13
16
 
17
+ # Multiply a scalar or another DataFrame.
14
18
  def * other
15
-
19
+ binary_operation :*, other
16
20
  end
17
21
 
22
+ # Divide a scalar or another DataFrame.
18
23
  def / other
19
-
24
+ binary_operation :/, other
20
25
  end
21
26
 
27
+ # Modulus with a scalar or another DataFrame.
22
28
  def % other
23
-
29
+ binary_operation :%, other
30
+ end
31
+
32
+ # Exponent with a scalar or another DataFrame.
33
+ def ** other
34
+ binary_operation :**, other
35
+ end
36
+
37
+ # Calculate exponential of all vectors with numeric values.
38
+ def exp
39
+ self.dup.map_vectors! { |v| v.exp if v.type == :numeric }
40
+ end
41
+
42
+ def sqrt
43
+ self.dup.map_vectors! { |v| v.sqrt if v.type == :numeric }
44
+ end
45
+
46
+ def round precision=0
47
+ self.dup.map_vectors! { |v| v.round(precision) if v.type == :numeric }
48
+ end
49
+ private
50
+
51
+ def binary_operation operation, other
52
+ case other
53
+ when Daru::DataFrame
54
+ dataframe_binary_operation operation, other
55
+ else
56
+ scalar_binary_operation operation, other
57
+ end
58
+ end
59
+
60
+ def dataframe_binary_operation operation, other
61
+ all_vectors = (self.vectors.to_a | other.vectors.to_a).sort
62
+ all_indexes = (self.index.to_a | other.index.to_a).sort
63
+
64
+ hsh = {}
65
+ all_vectors.each do |vector_name|
66
+ this = self .has_vector?(vector_name) ? self .vector[vector_name] : nil
67
+ that = other.has_vector?(vector_name) ? other.vector[vector_name] : nil
68
+
69
+ if this and that
70
+ hsh[vector_name] = this.send(operation, that)
71
+ else
72
+ hsh[vector_name] = Daru::Vector.new([], index: all_indexes,
73
+ name: vector_name)
74
+ end
75
+ end
76
+
77
+ Daru::DataFrame.new(hsh, index: all_indexes, name: @name, dtype: @dtype)
78
+ end
79
+
80
+ def scalar_binary_operation operation, other
81
+ clone = self.dup
82
+ clone.map_vectors! do |vector|
83
+ vector = vector.send(operation, other) if vector.type == :numeric
84
+ vector
85
+ end
86
+
87
+ clone
24
88
  end
25
89
  end
26
90
  end
@@ -27,19 +27,27 @@ module Daru
27
27
  end
28
28
 
29
29
  def exp
30
-
30
+ math_unary_op :exp
31
31
  end
32
32
 
33
33
  def sqrt
34
-
34
+ math_unary_op :sqrt
35
35
  end
36
36
 
37
- def round
38
-
37
+ def abs
38
+ self.dup.map! { |e| e.abs unless e.nil? }
39
+ end
40
+
41
+ def round precision=0
42
+ self.dup.map! { |e| e.round(precision) unless e.nil? }
39
43
  end
40
44
 
41
45
  private
42
46
 
47
+ def math_unary_op operation
48
+ self.dup.map! { |e| Math.send(operation, e) unless e.nil? }
49
+ end
50
+
43
51
  def binary_op operation, other
44
52
  case other
45
53
  when Daru::Vector
@@ -50,20 +58,25 @@ module Daru
50
58
  end
51
59
 
52
60
  def v2o_binary operation, other
53
- Daru::Vector.new self.map { |e| e.send(operation, other) }, name: @name, index: @index
61
+ Daru::Vector.new self.map { |e| e.nil? ? nil : e.send(operation, other) },
62
+ name: @name, index: @index
54
63
  end
55
64
 
56
65
  def v2v_binary operation, other
57
66
  common_idxs = []
58
67
  elements = []
68
+ index = (@index.to_a + other.index.to_a).uniq.sort
59
69
 
60
- @index.each do |idx|
70
+ index.each do |idx|
61
71
  this = self[idx]
62
72
  that = other[idx]
63
73
 
64
74
  if this and that
65
75
  elements << this.send(operation ,that)
66
76
  common_idxs << idx
77
+ else
78
+ elements << nil
79
+ common_idxs << idx
67
80
  end
68
81
  end
69
82
 
@@ -2,8 +2,109 @@ module Daru
2
2
  module Maths
3
3
  module Statistics
4
4
  module DataFrame
5
-
6
-
5
+ # Calculate mean of numeric vectors.
6
+ def mean
7
+ compute_stats :mean
8
+ end
9
+
10
+ # Calculate sample standard deviation of numeric vectors.
11
+ def std
12
+ compute_stats :std
13
+ end
14
+
15
+ # Calculate sum of numeric vectors
16
+ def sum
17
+ compute_stats :sum
18
+ end
19
+
20
+ # Count the number of non-nil values in each vector.
21
+ def count
22
+ compute_stats :count
23
+ end
24
+
25
+ # Calculate the maximum value of each numeric vector.
26
+ def max
27
+ compute_stats :max
28
+ end
29
+
30
+ # Calculate the minimum value of each numeric vector.
31
+ def min
32
+ compute_stats :min
33
+ end
34
+
35
+ # Compute the product of each numeric vector.
36
+ def product
37
+ compute_stats :product
38
+ end
39
+
40
+ # Create a summary of mean, standard deviation, count, max and min of
41
+ # each numeric vector in the dataframe in one shot.
42
+ #
43
+ # == Arguments
44
+ #
45
+ # +methods+ - An array with aggregation methods specified as symbols to
46
+ # be applied to numeric vectors. Default is [:count, :mean, :std, :min,
47
+ # :max]. Methods will be applied in the specified order.
48
+ def describe methods=nil
49
+ methods ||= [:count, :mean, :std, :min, :max]
50
+
51
+ description_hash = {}
52
+ numeric_vectors.each do |vec|
53
+ description_hash[vec] = methods.map { |m| self[vec].send(m) }
54
+ end
55
+ Daru::DataFrame.new(description_hash, index: methods)
56
+ end
57
+
58
+ # Calculate variance-covariance between the numeric vectors.
59
+ #
60
+ # == Arguments
61
+ #
62
+ # +for_sample_data+ - If set to false, will calculate the population
63
+ # covariance (denominator N), otherwise calculates the sample covariance
64
+ # matrix. Default to true.
65
+ def covariance for_sample_data=true
66
+ cov_arry =
67
+ if defined? NMatrix and NMatrix.respond_to?(:cov)
68
+ to_nmatrix.cov(for_sample_data).to_a
69
+ else
70
+ df_as_matrix = to_matrix
71
+ denominator = for_sample_data ? rows - 1 : rows
72
+ ones = Matrix.column_vector [1]*rows
73
+ deviation_scores = df_as_matrix - (ones * ones.transpose * df_as_matrix) / rows
74
+ ((deviation_scores.transpose * deviation_scores) / denominator).to_a
75
+ end
76
+
77
+ Daru::DataFrame.rows(cov_arry, index: numeric_vectors, order: numeric_vectors)
78
+ end
79
+
80
+ alias :cov :covariance
81
+
82
+ # Calculate the correlation between the numeric vectors.
83
+ def correlation
84
+ corr_arry =
85
+ if defined? NMatrix and NMatrix.respond_to?(:corr)
86
+ to_nmatrix.corr.to_a
87
+ else
88
+ standard_deviation = std.to_matrix
89
+ (cov.to_matrix.elementwise_division(standard_deviation.transpose *
90
+ standard_deviation)).to_a
91
+ end
92
+
93
+ Daru::DataFrame.rows(corr_arry, index: numeric_vectors, order: numeric_vectors)
94
+ end
95
+
96
+ alias :corr :correlation
97
+
98
+ private
99
+
100
+ def compute_stats method
101
+ Daru::Vector.new(
102
+ numeric_vectors.inject({}) do |hash, vec|
103
+ hash[vec] = self[vec].send(method)
104
+ hash
105
+ end
106
+ )
107
+ end
7
108
  end
8
109
  end
9
110
  end
@@ -1,47 +1,59 @@
1
1
  module Daru
2
2
  module Maths
3
+ # Encapsulates statistics methods for vectors. Most basic stuff like mean, etc.
4
+ # is done inside the wrapper, so that native methods can be used for most of
5
+ # the computationally intensive tasks.
3
6
  module Statistics
4
7
  module Vector
5
-
6
8
  def mean
7
- @vector.mean
9
+ @data.mean
8
10
  end
9
11
 
10
- def median
11
- @vector.median
12
+ def sum
13
+ @data.sum
12
14
  end
13
15
 
14
- def mode
15
- @vector.mode
16
+ def product
17
+ @data.product
16
18
  end
17
19
 
18
- def sum
19
- @vector.sum
20
+ def min
21
+ @data.min
20
22
  end
21
23
 
22
- def product
23
- @vector.product
24
+ def range
25
+ max - min
26
+ end
27
+
28
+ def median
29
+ percentile 50
30
+ end
31
+
32
+ def mode
33
+ freqs = frequencies.values
34
+ @data[freqs.index(freqs.max)]
24
35
  end
25
36
 
26
37
  def median_absolute_deviation
27
- @vector.median_absolute_deviation
38
+ m = median
39
+ map {|val| (val - m).abs }.median
28
40
  end
29
41
 
30
42
  def standard_error
31
- @vector.standard_error
43
+ standard_deviation_sample/(Math::sqrt((@size - @nil_positions.size)))
32
44
  end
33
45
 
34
46
  def sum_of_squared_deviation
35
- @vector.sum_of_squared_deviation
47
+ (@data.to_a.inject(0) { |a,x| x.square + a } - (sum.square.quo((@size - @nil_positions.size)))).to_f
36
48
  end
37
49
 
38
50
  # Maximum element of the vector.
39
51
  #
40
52
  # @param return_type [Symbol] Data type of the returned value. Defaults
41
- # to returning only the maximum number but passing *:vector* will return
42
- # a Daru::Vector with the index of the corresponding maximum value.
53
+ # to returning only the maximum number but passing *:vector* will return
54
+ # a Daru::Vector with the index of the corresponding maximum value.
43
55
  def max return_type=:stored_type
44
- max_value = @vector.max
56
+ max_value = @data.max
45
57
  if return_type == :vector
46
58
  Daru::Vector.new({index_of(max_value) => max_value}, name: @name, dtype: @dtype)
47
59
  else
@@ -49,98 +61,127 @@ module Daru
49
61
  end
50
62
  end
51
63
 
52
- def min
53
- @vector.min
54
- end
55
-
56
- def has_missing_data?
57
- @vector.has_missing_data?
58
- end
59
-
60
- def range
61
- @vector.range
64
+ # Return a Vector with the max element and its index.
65
+ # @return [Daru::Vector]
66
+ def max_index
67
+ max :vector
62
68
  end
63
69
 
64
70
  def frequencies
65
- @vector.frequencies
71
+ @data.inject({}) do |hash, element|
72
+ hash[element] ||= 0
73
+ hash[element] += 1
74
+ hash
75
+ end
66
76
  end
67
77
 
68
78
  def proportions
69
- @vector.proportions
79
+ len = n_valid
80
+ frequencies.inject({}) { |hash, arr| hash[arr[0]] = arr[1] / len; hash }
70
81
  end
71
82
 
72
83
  def ranked
73
- @vector.ranked
84
+ sum = 0
85
+ r = frequencies.sort.inject( {} ) do |memo, val|
86
+ memo[val[0]] = ((sum + 1) + (sum + val[1])) / 2
87
+ sum += val[1]
88
+ memo
89
+ end
90
+
91
+ Daru::Vector.new @data.map { |e| r[e] }, index: self.index,
92
+ name: self.name, dtype: self.dtype, nm_dtype: self.nm_dtype
74
93
  end
75
94
 
76
95
  def coefficient_of_variation
77
- @vector.coefficient_of_variation
96
+ standard_deviation_sample / mean
78
97
  end
79
98
 
80
- # Retrieves number of cases which comply condition.
81
- # If block given, retrieves number of instances where
82
- # block returns true.
83
- # If other values given, retrieves the frequency for
84
- # this value.
99
+ # Retrieves number of cases which comply condition. If block given,
100
+ # retrieves number of instances where block returns true. If other
101
+ # values given, retrieves the frequency for this value. If no value
102
+ # given, counts the number of non-nil elements in the Vector.
85
103
  def count value=false
86
- @vector.count value
104
+ if block_given?
105
+ @data.inject(0){ |memo, val| memo += 1 if yield val; memo}
106
+ elsif value
107
+ val = frequencies[value]
108
+ val.nil? ? 0 : val
109
+ else
110
+ size - @nil_positions.size
111
+ end
87
112
  end
88
113
 
89
114
  def proportion value=1
90
- @vector.proportion value
91
- end
92
-
93
- # Population variance with denominator (N)
94
- def variance_population m=nil
95
- @vector.variance_population m
115
+ frequencies[value] / n_valid
96
116
  end
97
117
 
98
118
  # Sample variance with denominator (N-1)
99
119
  def variance_sample m=nil
100
- @vector.variance_sample m
120
+ m ||= self.mean
121
+ sum_of_squares(m).quo((@size - @nil_positions.size) - 1)
101
122
  end
102
123
 
103
- def sum_of_squares m=nil
104
- @vector.sum_of_squares m
124
+ # Population variance with denominator (N)
125
+ def variance_population m=nil
126
+ m ||= mean
127
+ sum_of_squares(m).quo((@size - @nil_positions.size)).to_f
105
128
  end
106
129
 
107
- def standard_deviation_sample m=nil
108
- @vector.standard_deviation_sample m
130
+ def sum_of_squares(m=nil)
131
+ m ||= mean
132
+ @data.inject(0) { |memo, val| memo + (val - m)**2 }
109
133
  end
110
134
 
111
135
  def standard_deviation_population m=nil
112
- @vector.standard_deviation_population m
136
+ m ||= mean
137
+ Math::sqrt(variance_population(m))
138
+ end
139
+
140
+ def standard_deviation_sample m=nil
141
+ Math::sqrt(variance_sample(m))
113
142
  end
114
143
 
115
144
  # Calculate skewness using (sigma(xi - mean)^3)/((N)*std_dev_sample^3)
116
145
  def skew m=nil
117
- @vector.skew m
146
+ m ||= mean
147
+ th = @data.inject(0) { |memo, val| memo + ((val - m)**3) }
148
+ th.quo ((@size - @nil_positions.size) * (standard_deviation_sample(m)**3))
118
149
  end
119
150
 
120
151
  def kurtosis m=nil
121
- @vector.kurtosis m
152
+ m ||= mean
153
+ fo = @data.inject(0){ |a, x| a + ((x - m) ** 4) }
154
+ fo.quo((@size - @nil_positions.size) * standard_deviation_sample(m) ** 4) - 3
122
155
  end
123
156
 
124
157
  def average_deviation_population m=nil
125
- @vector.average_deviation_population m
158
+ m ||= mean
159
+ (@data.inject(0) {|memo, val| val + (val - m).abs }) / n_valid
126
160
  end
127
161
 
128
162
  def recode!(&block)
129
- @vector.recode!(&block)
163
+ @data.recode!(&block)
130
164
  end
131
165
 
132
166
  def percentile percent
133
- @vector.percentile percent
167
+ sorted = @data.sort
168
+ v = (n_valid * percent).quo(100)
169
+ if v.to_i != v
170
+ sorted[v.round]
171
+ else
172
+ (sorted[(v - 0.5).round].to_f + sorted[(v + 0.5).round]).quo(2)
173
+ end
134
174
  end
135
175
 
136
- alias_method :sdp, :standard_deviation_population
137
- alias_method :sds, :standard_deviation_sample
138
- alias_method :adp, :average_deviation_population
139
- # alias_method :cov, :coefficient_of_variation
140
- # alias_method :variance, :variance_sample
141
- alias_method :sd, :standard_deviation_sample
142
- alias_method :ss, :sum_of_squares
143
- alias_method :percentil, :percentile
176
+ alias :sdp :standard_deviation_population
177
+ alias :sds :standard_deviation_sample
178
+ alias :std :sds
179
+ alias :adp :average_deviation_population
180
+ alias :cov :coefficient_of_variation
181
+ alias :variance :variance_sample
182
+ alias :sd :standard_deviation_sample
183
+ alias :ss :sum_of_squares
184
+ alias :percentil :percentile
144
185
  end
145
186
  end
146
187
  end