daru 0.0.4 → 0.0.5

Sign up to get free protection for your applications and to get access to all the features.
Files changed (40) hide show
  1. checksums.yaml +4 -4
  2. data/CONTRIBUTING.md +0 -0
  3. data/Gemfile +0 -1
  4. data/History.txt +35 -0
  5. data/README.md +178 -198
  6. data/daru.gemspec +5 -7
  7. data/lib/daru.rb +10 -2
  8. data/lib/daru/accessors/array_wrapper.rb +36 -198
  9. data/lib/daru/accessors/nmatrix_wrapper.rb +60 -209
  10. data/lib/daru/core/group_by.rb +183 -0
  11. data/lib/daru/dataframe.rb +615 -167
  12. data/lib/daru/index.rb +17 -16
  13. data/lib/daru/io/io.rb +5 -12
  14. data/lib/daru/maths/arithmetic/dataframe.rb +72 -8
  15. data/lib/daru/maths/arithmetic/vector.rb +19 -6
  16. data/lib/daru/maths/statistics/dataframe.rb +103 -2
  17. data/lib/daru/maths/statistics/vector.rb +102 -61
  18. data/lib/daru/monkeys.rb +8 -0
  19. data/lib/daru/multi_index.rb +199 -0
  20. data/lib/daru/plotting/dataframe.rb +24 -24
  21. data/lib/daru/plotting/vector.rb +14 -15
  22. data/lib/daru/vector.rb +402 -98
  23. data/lib/version.rb +1 -1
  24. data/notebooks/grouping_splitting_pivots.ipynb +529 -0
  25. data/notebooks/intro_with_music_data_.ipynb +104 -119
  26. data/spec/accessors/wrappers_spec.rb +36 -0
  27. data/spec/core/group_by_spec.rb +331 -0
  28. data/spec/dataframe_spec.rb +1237 -475
  29. data/spec/fixtures/sales-funnel.csv +18 -0
  30. data/spec/index_spec.rb +10 -21
  31. data/spec/io/io_spec.rb +4 -14
  32. data/spec/math/arithmetic/dataframe_spec.rb +66 -0
  33. data/spec/math/arithmetic/vector_spec.rb +45 -4
  34. data/spec/math/statistics/dataframe_spec.rb +91 -1
  35. data/spec/math/statistics/vector_spec.rb +32 -6
  36. data/spec/monkeys_spec.rb +10 -1
  37. data/spec/multi_index_spec.rb +216 -0
  38. data/spec/spec_helper.rb +1 -0
  39. data/spec/vector_spec.rb +505 -57
  40. metadata +21 -15
@@ -2,10 +2,13 @@ module Daru
2
2
  class Index
3
3
  include Enumerable
4
4
 
5
- # needs to iterate over keys sorted by their values. Happens right now by
6
- # virtue of ordered Hashes (ruby).
7
5
  def each(&block)
8
6
  @relation_hash.each_key(&block)
7
+ self
8
+ end
9
+
10
+ def map(&block)
11
+ to_a.map(&block)
9
12
  end
10
13
 
11
14
  attr_reader :relation_hash
@@ -19,6 +22,7 @@ module Daru
19
22
 
20
23
  index = 0 if index.nil?
21
24
  index = Array.new(index) { |i| i} if index.is_a? Integer
25
+ index = index.to_a if index.is_a? Daru::Index
22
26
 
23
27
  if values.nil?
24
28
  index.each_with_index do |n, idx|
@@ -36,7 +40,6 @@ module Daru
36
40
  end
37
41
 
38
42
  @relation_hash.freeze
39
-
40
43
  @size = @relation_hash.size
41
44
 
42
45
  if index[0].is_a?(Integer)
@@ -49,22 +52,28 @@ module Daru
49
52
  def ==(other)
50
53
  return false if other.size != @size
51
54
 
52
- @relation_hash.keys == other.to_a
55
+ @relation_hash.keys == other.to_a and @relation_hash.values == other.relation_hash.values
53
56
  end
54
57
 
55
58
  def [](key)
56
59
  case key
57
60
  when Range
58
- first = @relation_hash[key.first]
59
- last = @relation_hash[key.last]
61
+ if key.first.is_a?(Integer) and key.last.is_a?(Integer)
62
+ first = key.first
63
+ last = key.last
64
+ else
65
+ first = @relation_hash[key.first]
66
+ last = @relation_hash[key.last]
67
+ end
60
68
 
61
69
  indexes = []
62
-
63
70
  (first..last).each do |idx|
64
71
  indexes << @relation_hash.key(idx)
65
72
  end
66
73
 
67
74
  Daru::Index.new indexes, (first..last).to_a
75
+ when Array # works only with numeric indices
76
+ Daru::Index.new key.map { |k| @relation_hash.key(k) }, key
68
77
  else
69
78
  @relation_hash[key]
70
79
  end
@@ -85,11 +94,7 @@ module Daru
85
94
  end
86
95
 
87
96
  def key(value)
88
- @relation_hash.key value
89
- end
90
-
91
- def re_index new_index
92
- new_index.to_index
97
+ @relation_hash.keys[value]
93
98
  end
94
99
 
95
100
  def include? index
@@ -99,9 +104,5 @@ module Daru
99
104
  def dup
100
105
  Daru::Index.new @relation_hash.keys
101
106
  end
102
-
103
- def to_index
104
- self
105
- end
106
107
  end
107
108
  end
@@ -7,23 +7,16 @@ module Daru
7
7
  opts[:converters] ||= :numeric
8
8
  opts[:header_converters] ||= :symbol
9
9
 
10
- csv = CSV.open(path, 'r', opts)
10
+ csv = CSV.read(path, 'r', opts)
11
11
 
12
12
  yield csv if block_given?
13
13
 
14
- first = true
15
- df = nil
16
-
17
- csv.each_with_index do |row, index|
18
- if first
19
- df = Daru::DataFrame.new({}, order: csv.headers, name: opts[:name])
20
- first = false
21
- end
22
-
23
- df.row[index] = row.fields
14
+ hsh = {}
15
+ csv.by_col!.each do |col_name, values|
16
+ hsh[col_name] = values
24
17
  end
25
18
 
26
- df
19
+ Daru::DataFrame.new(hsh)
27
20
  end
28
21
  end
29
22
  end
@@ -1,26 +1,90 @@
1
1
  module Daru
2
2
  module Maths
3
+ # Module encapsulating all arithmetic methods on DataFrame.
3
4
  module Arithmetic
4
- module DataFrame
5
-
5
+ module DataFrame
6
+
7
+ # Add a scalar or another DataFrame
6
8
  def + other
7
-
9
+ binary_operation :+, other
8
10
  end
9
11
 
10
- def - other
11
-
12
+ # Subtract a scalar or another DataFrame.
13
+ def - other
14
+ binary_operation :-, other
12
15
  end
13
16
 
17
+ # Multiply a scalar or another DataFrame.
14
18
  def * other
15
-
19
+ binary_operation :*, other
16
20
  end
17
21
 
22
+ # Divide a scalar or another DataFrame.
18
23
  def / other
19
-
24
+ binary_operation :/, other
20
25
  end
21
26
 
27
+ # Modulus with a scalar or another DataFrame.
22
28
  def % other
23
-
29
+ binary_operation :%, other
30
+ end
31
+
32
+ # Exponent with a scalar or another DataFrame.
33
+ def ** other
34
+ binary_operation :**, other
35
+ end
36
+
37
+ # Calculate exponential of all vectors with numeric values.
38
+ def exp
39
+ self.dup.map_vectors! { |v| v.exp if v.type == :numeric }
40
+ end
41
+
42
+ def sqrt
43
+ self.dup.map_vectors! { |v| v.sqrt if v.type == :numeric }
44
+ end
45
+
46
+ def round precision=0
47
+ self.dup.map_vectors! { |v| v.round(precision) if v.type == :numeric }
48
+ end
49
+ private
50
+
51
+ def binary_operation operation, other
52
+ case other
53
+ when Daru::DataFrame
54
+ dataframe_binary_operation operation, other
55
+ else
56
+ scalar_binary_operation operation, other
57
+ end
58
+ end
59
+
60
+ def dataframe_binary_operation operation, other
61
+ all_vectors = (self.vectors.to_a | other.vectors.to_a).sort
62
+ all_indexes = (self.index.to_a | other.index.to_a).sort
63
+
64
+ hsh = {}
65
+ all_vectors.each do |vector_name|
66
+ this = self .has_vector?(vector_name) ? self .vector[vector_name] : nil
67
+ that = other.has_vector?(vector_name) ? other.vector[vector_name] : nil
68
+
69
+ if this and that
70
+ hsh[vector_name] = this.send(operation, that)
71
+ else
72
+ hsh[vector_name] = Daru::Vector.new([], index: all_indexes,
73
+ name: vector_name)
74
+ end
75
+ end
76
+
77
+ Daru::DataFrame.new(hsh, index: all_indexes, name: @name, dtype: @dtype)
78
+ end
79
+
80
+ def scalar_binary_operation operation, other
81
+ clone = self.dup
82
+ clone.map_vectors! do |vector|
83
+ vector = vector.send(operation, other) if vector.type == :numeric
84
+ vector
85
+ end
86
+
87
+ clone
24
88
  end
25
89
  end
26
90
  end
@@ -27,19 +27,27 @@ module Daru
27
27
  end
28
28
 
29
29
  def exp
30
-
30
+ math_unary_op :exp
31
31
  end
32
32
 
33
33
  def sqrt
34
-
34
+ math_unary_op :sqrt
35
35
  end
36
36
 
37
- def round
38
-
37
+ def abs
38
+ self.dup.map! { |e| e.abs unless e.nil? }
39
+ end
40
+
41
+ def round precision=0
42
+ self.dup.map! { |e| e.round(precision) unless e.nil? }
39
43
  end
40
44
 
41
45
  private
42
46
 
47
+ def math_unary_op operation
48
+ self.dup.map! { |e| Math.send(operation, e) unless e.nil? }
49
+ end
50
+
43
51
  def binary_op operation, other
44
52
  case other
45
53
  when Daru::Vector
@@ -50,20 +58,25 @@ module Daru
50
58
  end
51
59
 
52
60
  def v2o_binary operation, other
53
- Daru::Vector.new self.map { |e| e.send(operation, other) }, name: @name, index: @index
61
+ Daru::Vector.new self.map { |e| e.nil? ? nil : e.send(operation, other) },
62
+ name: @name, index: @index
54
63
  end
55
64
 
56
65
  def v2v_binary operation, other
57
66
  common_idxs = []
58
67
  elements = []
68
+ index = (@index.to_a + other.index.to_a).uniq.sort
59
69
 
60
- @index.each do |idx|
70
+ index.each do |idx|
61
71
  this = self[idx]
62
72
  that = other[idx]
63
73
 
64
74
  if this and that
65
75
  elements << this.send(operation ,that)
66
76
  common_idxs << idx
77
+ else
78
+ elements << nil
79
+ common_idxs << idx
67
80
  end
68
81
  end
69
82
 
@@ -2,8 +2,109 @@ module Daru
2
2
  module Maths
3
3
  module Statistics
4
4
  module DataFrame
5
-
6
-
5
+ # Calculate mean of numeric vectors.
6
+ def mean
7
+ compute_stats :mean
8
+ end
9
+
10
+ # Calculate sample standard deviation of numeric vectors.
11
+ def std
12
+ compute_stats :std
13
+ end
14
+
15
+ # Calculate sum of numeric vectors
16
+ def sum
17
+ compute_stats :sum
18
+ end
19
+
20
+ # Count the number of non-nil values in each vector.
21
+ def count
22
+ compute_stats :count
23
+ end
24
+
25
+ # Calculate the maximum value of each numeric vector.
26
+ def max
27
+ compute_stats :max
28
+ end
29
+
30
+ # Calculate the minimum value of each numeric vector.
31
+ def min
32
+ compute_stats :min
33
+ end
34
+
35
+ # Compute the product of each numeric vector.
36
+ def product
37
+ compute_stats :product
38
+ end
39
+
40
+ # Create a summary of mean, standard deviation, count, max and min of
41
+ # each numeric vector in the dataframe in one shot.
42
+ #
43
+ # == Arguments
44
+ #
45
+ # +methods+ - An array with aggregation methods specified as symbols to
46
+ # be applied to numeric vectors. Default is [:count, :mean, :std, :min,
47
+ # :max]. Methods will be applied in the specified order.
48
+ def describe methods=nil
49
+ methods ||= [:count, :mean, :std, :min, :max]
50
+
51
+ description_hash = {}
52
+ numeric_vectors.each do |vec|
53
+ description_hash[vec] = methods.map { |m| self[vec].send(m) }
54
+ end
55
+ Daru::DataFrame.new(description_hash, index: methods)
56
+ end
57
+
58
+ # Calculate variance-covariance between the numeric vectors.
59
+ #
60
+ # == Arguments
61
+ #
62
+ # +for_sample_data+ - If set to false, will calculate the population
63
+ # covariance (denominator N), otherwise calculates the sample covariance
64
+ # matrix. Default to true.
65
+ def covariance for_sample_data=true
66
+ cov_arry =
67
+ if defined? NMatrix and NMatrix.respond_to?(:cov)
68
+ to_nmatrix.cov(for_sample_data).to_a
69
+ else
70
+ df_as_matrix = to_matrix
71
+ denominator = for_sample_data ? rows - 1 : rows
72
+ ones = Matrix.column_vector [1]*rows
73
+ deviation_scores = df_as_matrix - (ones * ones.transpose * df_as_matrix) / rows
74
+ ((deviation_scores.transpose * deviation_scores) / denominator).to_a
75
+ end
76
+
77
+ Daru::DataFrame.rows(cov_arry, index: numeric_vectors, order: numeric_vectors)
78
+ end
79
+
80
+ alias :cov :covariance
81
+
82
+ # Calculate the correlation between the numeric vectors.
83
+ def correlation
84
+ corr_arry =
85
+ if defined? NMatrix and NMatrix.respond_to?(:corr)
86
+ to_nmatrix.corr.to_a
87
+ else
88
+ standard_deviation = std.to_matrix
89
+ (cov.to_matrix.elementwise_division(standard_deviation.transpose *
90
+ standard_deviation)).to_a
91
+ end
92
+
93
+ Daru::DataFrame.rows(corr_arry, index: numeric_vectors, order: numeric_vectors)
94
+ end
95
+
96
+ alias :corr :correlation
97
+
98
+ private
99
+
100
+ def compute_stats method
101
+ Daru::Vector.new(
102
+ numeric_vectors.inject({}) do |hash, vec|
103
+ hash[vec] = self[vec].send(method)
104
+ hash
105
+ end
106
+ )
107
+ end
7
108
  end
8
109
  end
9
110
  end
@@ -1,47 +1,59 @@
1
1
  module Daru
2
2
  module Maths
3
+ # Encapsulates statistics methods for vectors. Most basic stuff like mean, etc.
4
+ # is done inside the wrapper, so that native methods can be used for most of
5
+ # the computationally intensive tasks.
3
6
  module Statistics
4
7
  module Vector
5
-
6
8
  def mean
7
- @vector.mean
9
+ @data.mean
8
10
  end
9
11
 
10
- def median
11
- @vector.median
12
+ def sum
13
+ @data.sum
12
14
  end
13
15
 
14
- def mode
15
- @vector.mode
16
+ def product
17
+ @data.product
16
18
  end
17
19
 
18
- def sum
19
- @vector.sum
20
+ def min
21
+ @data.min
20
22
  end
21
23
 
22
- def product
23
- @vector.product
24
+ def range
25
+ max - min
26
+ end
27
+
28
+ def median
29
+ percentile 50
30
+ end
31
+
32
+ def mode
33
+ freqs = frequencies.values
34
+ @data[freqs.index(freqs.max)]
24
35
  end
25
36
 
26
37
  def median_absolute_deviation
27
- @vector.median_absolute_deviation
38
+ m = median
39
+ map {|val| (val - m).abs }.median
28
40
  end
29
41
 
30
42
  def standard_error
31
- @vector.standard_error
43
+ standard_deviation_sample/(Math::sqrt((@size - @nil_positions.size)))
32
44
  end
33
45
 
34
46
  def sum_of_squared_deviation
35
- @vector.sum_of_squared_deviation
47
+ (@data.to_a.inject(0) { |a,x| x.square + a } - (sum.square.quo((@size - @nil_positions.size)))).to_f
36
48
  end
37
49
 
38
50
  # Maximum element of the vector.
39
51
  #
40
52
  # @param return_type [Symbol] Data type of the returned value. Defaults
41
- # to returning only the maximum number but passing *:vector* will return
42
- # a Daru::Vector with the index of the corresponding maximum value.
53
+ # to returning only the maximum number but passing *:vector* will return
54
+ # a Daru::Vector with the index of the corresponding maximum value.
43
55
  def max return_type=:stored_type
44
- max_value = @vector.max
56
+ max_value = @data.max
45
57
  if return_type == :vector
46
58
  Daru::Vector.new({index_of(max_value) => max_value}, name: @name, dtype: @dtype)
47
59
  else
@@ -49,98 +61,127 @@ module Daru
49
61
  end
50
62
  end
51
63
 
52
- def min
53
- @vector.min
54
- end
55
-
56
- def has_missing_data?
57
- @vector.has_missing_data?
58
- end
59
-
60
- def range
61
- @vector.range
64
+ # Return a Vector with the max element and its index.
65
+ # @return [Daru::Vector]
66
+ def max_index
67
+ max :vector
62
68
  end
63
69
 
64
70
  def frequencies
65
- @vector.frequencies
71
+ @data.inject({}) do |hash, element|
72
+ hash[element] ||= 0
73
+ hash[element] += 1
74
+ hash
75
+ end
66
76
  end
67
77
 
68
78
  def proportions
69
- @vector.proportions
79
+ len = n_valid
80
+ frequencies.inject({}) { |hash, arr| hash[arr[0]] = arr[1] / len; hash }
70
81
  end
71
82
 
72
83
  def ranked
73
- @vector.ranked
84
+ sum = 0
85
+ r = frequencies.sort.inject( {} ) do |memo, val|
86
+ memo[val[0]] = ((sum + 1) + (sum + val[1])) / 2
87
+ sum += val[1]
88
+ memo
89
+ end
90
+
91
+ Daru::Vector.new @data.map { |e| r[e] }, index: self.index,
92
+ name: self.name, dtype: self.dtype, nm_dtype: self.nm_dtype
74
93
  end
75
94
 
76
95
  def coefficient_of_variation
77
- @vector.coefficient_of_variation
96
+ standard_deviation_sample / mean
78
97
  end
79
98
 
80
- # Retrieves number of cases which comply condition.
81
- # If block given, retrieves number of instances where
82
- # block returns true.
83
- # If other values given, retrieves the frequency for
84
- # this value.
99
+ # Retrieves number of cases which comply condition. If block given,
100
+ # retrieves number of instances where block returns true. If other
101
+ # values given, retrieves the frequency for this value. If no value
102
+ # given, counts the number of non-nil elements in the Vector.
85
103
  def count value=false
86
- @vector.count value
104
+ if block_given?
105
+ @data.inject(0){ |memo, val| memo += 1 if yield val; memo}
106
+ elsif value
107
+ val = frequencies[value]
108
+ val.nil? ? 0 : val
109
+ else
110
+ size - @nil_positions.size
111
+ end
87
112
  end
88
113
 
89
114
  def proportion value=1
90
- @vector.proportion value
91
- end
92
-
93
- # Population variance with denominator (N)
94
- def variance_population m=nil
95
- @vector.variance_population m
115
+ frequencies[value] / n_valid
96
116
  end
97
117
 
98
118
  # Sample variance with denominator (N-1)
99
119
  def variance_sample m=nil
100
- @vector.variance_sample m
120
+ m ||= self.mean
121
+ sum_of_squares(m).quo((@size - @nil_positions.size) - 1)
101
122
  end
102
123
 
103
- def sum_of_squares m=nil
104
- @vector.sum_of_squares m
124
+ # Population variance with denominator (N)
125
+ def variance_population m=nil
126
+ m ||= mean
127
+ sum_of_squares(m).quo((@size - @nil_positions.size)).to_f
105
128
  end
106
129
 
107
- def standard_deviation_sample m=nil
108
- @vector.standard_deviation_sample m
130
+ def sum_of_squares(m=nil)
131
+ m ||= mean
132
+ @data.inject(0) { |memo, val| memo + (val - m)**2 }
109
133
  end
110
134
 
111
135
  def standard_deviation_population m=nil
112
- @vector.standard_deviation_population m
136
+ m ||= mean
137
+ Math::sqrt(variance_population(m))
138
+ end
139
+
140
+ def standard_deviation_sample m=nil
141
+ Math::sqrt(variance_sample(m))
113
142
  end
114
143
 
115
144
  # Calculate skewness using (sigma(xi - mean)^3)/((N)*std_dev_sample^3)
116
145
  def skew m=nil
117
- @vector.skew m
146
+ m ||= mean
147
+ th = @data.inject(0) { |memo, val| memo + ((val - m)**3) }
148
+ th.quo ((@size - @nil_positions.size) * (standard_deviation_sample(m)**3))
118
149
  end
119
150
 
120
151
  def kurtosis m=nil
121
- @vector.kurtosis m
152
+ m ||= mean
153
+ fo = @data.inject(0){ |a, x| a + ((x - m) ** 4) }
154
+ fo.quo((@size - @nil_positions.size) * standard_deviation_sample(m) ** 4) - 3
122
155
  end
123
156
 
124
157
  def average_deviation_population m=nil
125
- @vector.average_deviation_population m
158
+ m ||= mean
159
+ (@data.inject(0) {|memo, val| val + (val - m).abs }) / n_valid
126
160
  end
127
161
 
128
162
  def recode!(&block)
129
- @vector.recode!(&block)
163
+ @data.recode!(&block)
130
164
  end
131
165
 
132
166
  def percentile percent
133
- @vector.percentile percent
167
+ sorted = @data.sort
168
+ v = (n_valid * percent).quo(100)
169
+ if v.to_i != v
170
+ sorted[v.round]
171
+ else
172
+ (sorted[(v - 0.5).round].to_f + sorted[(v + 0.5).round]).quo(2)
173
+ end
134
174
  end
135
175
 
136
- alias_method :sdp, :standard_deviation_population
137
- alias_method :sds, :standard_deviation_sample
138
- alias_method :adp, :average_deviation_population
139
- # alias_method :cov, :coefficient_of_variation
140
- # alias_method :variance, :variance_sample
141
- alias_method :sd, :standard_deviation_sample
142
- alias_method :ss, :sum_of_squares
143
- alias_method :percentil, :percentile
176
+ alias :sdp :standard_deviation_population
177
+ alias :sds :standard_deviation_sample
178
+ alias :std :sds
179
+ alias :adp :average_deviation_population
180
+ alias :cov :coefficient_of_variation
181
+ alias :variance :variance_sample
182
+ alias :sd :standard_deviation_sample
183
+ alias :ss :sum_of_squares
184
+ alias :percentil :percentile
144
185
  end
145
186
  end
146
187
  end