daru 0.0.4 → 0.0.5

Sign up to get free protection for your applications and to get access to all the features.
Files changed (40) hide show
  1. checksums.yaml +4 -4
  2. data/CONTRIBUTING.md +0 -0
  3. data/Gemfile +0 -1
  4. data/History.txt +35 -0
  5. data/README.md +178 -198
  6. data/daru.gemspec +5 -7
  7. data/lib/daru.rb +10 -2
  8. data/lib/daru/accessors/array_wrapper.rb +36 -198
  9. data/lib/daru/accessors/nmatrix_wrapper.rb +60 -209
  10. data/lib/daru/core/group_by.rb +183 -0
  11. data/lib/daru/dataframe.rb +615 -167
  12. data/lib/daru/index.rb +17 -16
  13. data/lib/daru/io/io.rb +5 -12
  14. data/lib/daru/maths/arithmetic/dataframe.rb +72 -8
  15. data/lib/daru/maths/arithmetic/vector.rb +19 -6
  16. data/lib/daru/maths/statistics/dataframe.rb +103 -2
  17. data/lib/daru/maths/statistics/vector.rb +102 -61
  18. data/lib/daru/monkeys.rb +8 -0
  19. data/lib/daru/multi_index.rb +199 -0
  20. data/lib/daru/plotting/dataframe.rb +24 -24
  21. data/lib/daru/plotting/vector.rb +14 -15
  22. data/lib/daru/vector.rb +402 -98
  23. data/lib/version.rb +1 -1
  24. data/notebooks/grouping_splitting_pivots.ipynb +529 -0
  25. data/notebooks/intro_with_music_data_.ipynb +104 -119
  26. data/spec/accessors/wrappers_spec.rb +36 -0
  27. data/spec/core/group_by_spec.rb +331 -0
  28. data/spec/dataframe_spec.rb +1237 -475
  29. data/spec/fixtures/sales-funnel.csv +18 -0
  30. data/spec/index_spec.rb +10 -21
  31. data/spec/io/io_spec.rb +4 -14
  32. data/spec/math/arithmetic/dataframe_spec.rb +66 -0
  33. data/spec/math/arithmetic/vector_spec.rb +45 -4
  34. data/spec/math/statistics/dataframe_spec.rb +91 -1
  35. data/spec/math/statistics/vector_spec.rb +32 -6
  36. data/spec/monkeys_spec.rb +10 -1
  37. data/spec/multi_index_spec.rb +216 -0
  38. data/spec/spec_helper.rb +1 -0
  39. data/spec/vector_spec.rb +505 -57
  40. metadata +21 -15
@@ -4,14 +4,12 @@ $:.unshift File.expand_path("../lib", __FILE__)
4
4
  require 'version.rb'
5
5
 
6
6
  DESCRIPTION = <<MSG
7
- Daru (Data Analysis in RUby) is a library for storage, analysis and manipulation
7
+ Daru (Data Analysis in RUby) is a library for analysis, manipulation and visualization
8
8
  of data.
9
9
 
10
- Daru works with Ruby arrays, NMatrix and MDArray, thus working seamlessly accross
11
- ruby interpreters, at the same time providing speed for those who need it.
12
-
13
- This library is under active development so NMatrix and MDArray support is
14
- somewhat limited, but should be available soon!
10
+ Daru works with Ruby arrays and NMatrix, thus working seamlessly accross
11
+ ruby interpreters, at the same time providing speed for those who need it, while
12
+ making working with data super simple and intuitive.
15
13
  MSG
16
14
 
17
15
  Gem::Specification.new do |spec|
@@ -35,6 +33,6 @@ Gem::Specification.new do |spec|
35
33
  spec.add_development_dependency 'awesome_print'
36
34
  spec.add_development_dependency 'nyaplot'
37
35
  if RUBY_ENGINE != 'jruby'
38
- spec.add_development_dependency 'nmatrix', '~> 0.1.0.rc5'
36
+ spec.add_development_dependency 'nmatrix', '~> 0.1.0'
39
37
  end
40
38
  end
@@ -1,7 +1,15 @@
1
- require 'securerandom'
1
+ def jruby?
2
+ RUBY_ENGINE == 'jruby'
3
+ end
4
+
2
5
  require 'csv'
6
+ require 'matrix'
7
+ require 'securerandom'
3
8
 
4
9
  require 'daru/index.rb'
10
+ require 'daru/multi_index.rb'
5
11
  require 'daru/vector.rb'
6
12
  require 'daru/dataframe.rb'
7
- require 'daru/monkeys.rb'
13
+ require 'daru/monkeys.rb'
14
+
15
+ require 'daru/core/group_by.rb'
@@ -2,254 +2,92 @@ module Daru
2
2
  module Accessors
3
3
  # Internal class for wrapping ruby array
4
4
  class ArrayWrapper
5
- module Statistics
6
-
7
- def average_deviation_population m=nil
8
- m ||= mean
9
- (@vector.inject(0) {|memo, val| val + (val - m).abs }) / n_valid
10
- end
11
-
12
- def coefficient_of_variation
13
- standard_deviation_sample / mean
14
- end
15
-
16
- def count value=false
17
- if block_given?
18
- @vector.inject(0){ |memo, val| memo += 1 if yield val; memo}
19
- else
20
- val = frequencies[value]
21
- val.nil? ? 0 : val
22
- end
23
- end
24
-
25
- def factors
26
- index = @data.sorted_indices
27
- index.reduce([]){|memo, val| memo.push(@data[val]) if memo.last != @data[val]; memo}
28
- end # TODO
29
-
30
- def frequencies
31
- @vector.inject({}) do |hash, element|
32
- hash[element] ||= 0
33
- hash[element] += 1
34
- hash
35
- end
36
- end
37
-
38
- def has_missing_data?
39
- has_missing_data
40
- end
41
-
42
- def kurtosis m=nil
43
- m ||= mean
44
- fo = @vector.inject(0){ |a, x| a + ((x - m) ** 4) }
45
- fo.quo(@size * standard_deviation_sample(m) ** 4) - 3
46
- end
47
-
48
- def mean
49
- sum.quo(@size).to_f
50
- end
51
-
52
- def median
53
- percentile 50
54
- end
55
-
56
- def median_absolute_deviation
57
- m = median
58
- recode {|val| (val - m).abs }.median
59
- end
60
-
61
- def mode
62
- freqs = frequencies.values
63
-
64
- @vector[freqs.index(freqs.max)]
65
- end
66
-
67
- def n_valid
68
- @size
69
- end
70
-
71
- def percentile percent
72
- sorted = @vector.sort
73
- v = (n_valid * percent).quo(100)
74
- if v.to_i != v
75
- sorted[v.round]
76
- else
77
- (sorted[(v - 0.5).round].to_f + sorted[(v + 0.5).round]).quo(2)
78
- end
79
- end
80
-
81
- def product
82
- @vector.inject(:*)
83
- end
84
-
85
- def max
86
- @vector.max
87
- end
88
-
89
- def min
90
- @vector.min
91
- end
92
-
93
- def proportion value=1
94
- frequencies[value] / n_valid
95
- end
96
-
97
- def proportions
98
- len = n_valid
99
- frequencies.inject({}) { |hash, arr| hash[arr[0]] = arr[1] / len; hash }
100
- end
101
-
102
- def range
103
- max - min
104
- end
105
-
106
- def ranked
107
- sum = 0
108
- r = frequencies.sort.inject( {} ) do |memo, val|
109
- memo[val[0]] = ((sum + 1) + (sum + val[1])) / 2
110
- sum += val[1]
111
- memo
112
- end
113
-
114
- Daru::Vector.new @vector.map { |e| r[e] }, index: @caller.index,
115
- name: @caller.name, dtype: @caller.dtype
116
- end
117
-
118
- def recode(&block)
119
- @vector.map(&block)
120
- end
121
-
122
- def recode!(&block)
123
- @vector.map!(&block)
124
- end
125
-
126
- # Calculate skewness using (sigma(xi - mean)^3)/((N)*std_dev_sample^3)
127
- def skew m=nil
128
- m ||= mean
129
- th = @vector.inject(0) { |memo, val| memo + ((val - m)**3) }
130
- th.quo (@size * (standard_deviation_sample(m)**3))
131
- end
132
-
133
- def standard_deviation_population m=nil
134
- m ||= mean
135
- Math::sqrt(variance_population(m))
136
- end
137
-
138
- def standard_deviation_sample m=nil
139
- Math::sqrt(variance_sample(m))
140
- end
141
-
142
- def standard_error
143
- standard_deviation_sample/(Math::sqrt(@size))
144
- end
145
-
146
- def sum_of_squared_deviation
147
- (@vector.inject(0) { |a,x| x.square + a } - (sum.square.quo(@size))).to_f
148
- end
149
-
150
- def sum_of_squares(m=nil)
151
- m ||= mean
152
- @vector.inject(0) { |memo, val| memo + (val - m)**2 }
153
- end
154
-
155
- def sum
156
- @vector.inject(:+)
157
- end
158
-
159
- # Sample variance with denominator (N-1)
160
- def variance_sample m=nil
161
- m ||= self.mean
162
-
163
- sum_of_squares(m).quo(@size - 1)
164
- end
165
-
166
- # Population variance with denominator (N)
167
- def variance_population m=nil
168
- m ||= mean
169
-
170
- sum_of_squares(m).quo(@size).to_f
171
- end
172
- end # module Statistics
173
-
174
- include Statistics
175
5
  include Enumerable
176
6
 
177
7
  def each(&block)
178
- @vector.each(&block)
8
+ @data.each(&block)
179
9
  end
180
10
 
181
11
  def map!(&block)
182
- @vector.map!(&block)
12
+ @data.map!(&block)
183
13
  end
184
14
 
185
15
  attr_accessor :size
186
- attr_reader :vector
187
- attr_reader :has_missing_data
16
+ attr_reader :data
188
17
 
189
- def initialize vector, caller
190
- @vector = vector
191
- @caller = caller
18
+ def initialize vector, context
19
+ @data = vector.to_a
20
+ @context = context
192
21
 
193
22
  set_size
194
23
  end
195
24
 
196
25
  def [] index
197
- @vector[index]
26
+ @data[index]
198
27
  end
199
28
 
200
29
  def []= index, value
201
- has_missing_data = true if value.nil?
202
- @vector[index] = value
30
+ @data[index] = value
203
31
  set_size
204
32
  end
205
33
 
206
34
  def == other
207
- @vector == other
35
+ @data == other
208
36
  end
209
37
 
210
38
  def delete_at index
211
- @vector.delete_at index
39
+ @data.delete_at index
212
40
  set_size
213
41
  end
214
42
 
215
43
  def index key
216
- @vector.index key
44
+ @data.index key
217
45
  end
218
46
 
219
47
  def << element
220
- @vector << element
48
+ @data << element
221
49
  set_size
222
50
  end
223
51
 
224
52
  def uniq
225
- @vector.uniq
53
+ @data.uniq
226
54
  end
227
55
 
228
56
  def to_a
229
- @vector
57
+ @data
230
58
  end
231
59
 
232
60
  def dup
233
- ArrayWrapper.new @vector.dup, @caller
61
+ ArrayWrapper.new @data.dup, @context
62
+ end
63
+
64
+ def mean
65
+ sum.quo(@size - @context.nil_positions.size).to_f
66
+ end
67
+
68
+ def product
69
+ @data.inject(1) { |m,e| m*e unless e.nil? }
70
+ end
71
+
72
+ def max
73
+ @data.max
74
+ end
75
+
76
+ def min
77
+ @data.min
234
78
  end
235
79
 
236
- def coerce dtype
237
- case
238
- when dtype == Array
239
- self
240
- when dtype == NMatrix
241
- Daru::Accessors::NMatrixWrapper.new @vector, @caller
242
- when dtype == MDArray
243
- raise NotImplementedError
244
- else
245
- raise ArgumentError, "Cant coerce to dtype #{dtype}"
80
+ def sum
81
+ @data.inject(0) do |memo ,e|
82
+ memo += e unless e.nil? #TODO: Remove this conditional somehow!
83
+ memo
246
84
  end
247
85
  end
248
86
 
249
87
  private
250
88
 
251
89
  def set_size
252
- @size = @vector.size
90
+ @size = @data.size
253
91
  end
254
92
  end
255
93
  end
@@ -1,260 +1,111 @@
1
- require 'nmatrix'
1
+ begin
2
+ require 'nmatrix' unless jruby?
3
+ rescue LoadError => e
4
+ puts "Please install the nmatrix gem for fast and efficient data storage."
5
+ end
2
6
 
3
7
  module Daru
4
8
  module Accessors
5
-
6
9
  # Internal class for wrapping NMatrix
7
10
  class NMatrixWrapper
8
- module Statistics
9
- # def average_deviation_population m=nil
10
- # m ||= self.mean
11
- # (self.reduce(0){|memo, val| val + (val - m).abs})/self.length
12
- # end
13
-
14
- # def coefficient_of_variation
15
- # self.standard_deviation_sample/self.mean
16
- # end
17
-
18
- # def count x=false
19
- # if block_given?
20
- # self.reduce(0){|memo, val| memo += 1 if yield val; memo}
21
- # else
22
- # val = self.frequencies[x]
23
- # val.nil? ? 0 : val
24
- # end
25
- # end
26
-
27
- # def factors
28
- # index = @data.sorted_indices
29
- # index.reduce([]){|memo, val| memo.push(@data[val]) if memo.last != @data[val]; memo}
30
- # end
31
-
32
- # def frequencies
33
- # index = @data.sorted_indices
34
- # index.reduce({}){|memo, val| memo[@data[val]] ||= 0; memo[@data[val]] += 1; memo}
35
- # end
36
-
37
- # def has_missing_data?
38
- # @missing_data
39
- # end
40
-
41
- # def is_valid?
42
- # true
43
- # end
44
-
45
- # def kurtosis(m=nil)
46
- # m ||= self.mean
47
- # fo=self.reduce(0){|a, x| a+((x-m)**4)}
48
- # fo.quo(self.length*sd(m)**4)-3
49
- # end
50
-
51
- # def mean
52
- # @vector[0...@size].mean.first
53
- # end
54
-
55
- # def median
56
- # self.percentil(50)
57
- # end
58
-
59
- # def median_absolute_deviation
60
- # m = self.median
61
- # self.recode{|val| (val-m).abls}.median
62
- # end
63
-
64
- # def mode
65
- # self.frequencies.max
66
- # end
67
-
68
- # def ==(other)
69
- # @data==other
70
- # end
71
-
72
- # def n_valid
73
- # self.length
74
- # end
75
-
76
- # def percentil(percent)
77
- # index = @data.sorted_indices
78
- # pos = (self.length * percent)/100
79
- # if pos.to_i == pos
80
- # @data[index[pos.to_i]]
81
- # else
82
- # pos = (pos-0.5).to_i
83
- # (@data[index[pos]] + @data[index[pos+1]])/2
84
- # end
85
- # end
86
-
87
- # def product
88
- # @data.inject(1){|memo, val| memo*val}
89
- # end
90
-
91
- # def proportion(val=1)
92
- # self.frequencies[val]/self.n_valid
93
- # end
94
-
95
- # def proportion_confidence_interval_t
96
- # raise "NotImplementedError"
97
- # end
98
-
99
- # def proportion_confidence_interval_z
100
- # raise "NotImplementedError"
101
- # end
102
-
103
- # def proportions
104
- # len = self.n_valid
105
- # self.frequencies.reduce({}){|memo, arr| memo[arr[0]] = arr[1]/len}
106
- # end
107
-
108
- # def push(val)
109
- # self.expand(self.length+1)
110
- # self[self.length-1] = recode
111
- # end
112
-
113
- # def range
114
- # max - min
115
- # end
116
-
117
- # def ranked
118
- # sum = 0
119
- # r = self.frequencies.sort.reduce({}) do |memo, val|
120
- # memo[val[0]] = ((sum+1) + (sum+val[1]))/2
121
- # sum += val[1]
122
- # memo
123
- # end
124
- # Mikon::DArray.new(self.reduce{|val| r[val]})
125
- # end
126
-
127
- # def recode(&block)
128
- # Mikon::DArray.new(@data.map(&block))
129
- # end
130
-
131
- # def recode!(&block)
132
- # @data.map!(&block)
133
- # end
134
-
135
- # def skew(m=nil)
136
- # m ||= self.mean
137
- # th = self.reduce(0){|memo, val| memo + ((val - m)**3)}
138
- # th/((self.length)*self.sd(m)**3)
139
- # end
140
-
141
- # def standard_deviation_population(m=nil)
142
- # m ||= self.mean
143
- # Maths.sqrt(self.variance_population(m))
144
- # end
145
-
146
- # def standard_deviation_sample(m=nil)
147
- # if !m.nil?
148
- # Maths.sqrt(variance_sample(m))
149
- # else
150
- # @data.std.first
151
- # end
152
- # end
153
-
154
- # def standard_error
155
- # self.standard_deviation_sample/(Maths.sqrt(self.length))
156
- # end
157
-
158
- # def sum_of_squared_deviation
159
- # self.reduce(0){|memo, val| val**2 + memo}
160
- # end
161
-
162
- # def sum_of_squares(m=nil)
163
- # m ||= self.mean
164
- # self.reduce(0){|memo, val| memo + (val-m)**2}
165
- # end
11
+ include Enumerable
166
12
 
167
- # def sum
168
- # @data.sum.first
169
- # end
13
+ def each(&block)
14
+ @data[0...@size].each(&block)
15
+ end
170
16
 
171
- # def variance_sample(m=nil)
172
- # m ||= self.mean
173
- # self.sum_of_squares(m)/(self.length-1)
174
- # end
175
- end # module Statistics
17
+ def map(&block)
18
+ @data[0...@size].map(&block)
19
+ end
176
20
 
177
- include Statistics
178
- include Enumerable
21
+ def map!(&block)
22
+ @data = NMatrix.new [@size*2], map(&block).to_a, dtype: nm_dtype
23
+ end
179
24
 
180
- def each(&block)
181
- @vector.each(&block)
25
+ def inject(*args, &block)
26
+ @data[0...@size].inject(*args, &block)
182
27
  end
183
28
 
184
- attr_reader :size, :vector, :missing_data
29
+ alias_method :recode, :map
30
+ alias_method :recode!, :map!
185
31
 
186
- def initialize vector, caller
32
+ attr_reader :size, :data, :nm_dtype
33
+
34
+ def initialize vector, context, nm_dtype=:int32
187
35
  @size = vector.size
188
- @vector = NMatrix.new [@size*2], vector.to_a
189
- @missing_data = false
190
- @caller = caller
36
+ @data = NMatrix.new [@size*2], vector.to_a, dtype: nm_dtype
37
+ @context = context
38
+ @nm_dtype = @data.dtype
191
39
  # init with twice the storage for reducing the need to resize
192
40
  end
193
41
 
194
42
  def [] index
195
- @vector[index]
43
+ return @data[index] if index < @size
44
+ nil
196
45
  end
197
46
 
198
47
  def []= index, value
199
- resize if index >= @size
200
-
201
- if value.nil?
202
- @missing_data = true
203
- @vector = @vector.cast(dtype: :object)
204
- end
205
- @vector[index] = value
48
+ resize if index >= @data.size
49
+ @size += 1 if index == @size
50
+
51
+ @data = @data.cast(dtype: :object) if value.nil?
52
+ @data[index] = value
206
53
  end
207
54
 
208
55
  def == other
209
- @vector == other and @size == other.size
56
+ @data == other and @size == other.size
210
57
  end
211
58
 
212
59
  def delete_at index
213
- arry = @vector.to_a
60
+ arry = @data.to_a
214
61
  arry.delete_at index
215
- @vector = NMatrix.new [@size-1], arry
62
+ @data = NMatrix.new [(2*@size-1)], arry, dtype: @nm_dtype
216
63
  @size -= 1
217
64
  end
218
65
 
219
66
  def index key
220
- @vector.to_a.index key
67
+ @data.to_a.index key
221
68
  end
222
69
 
223
70
  def << element
224
- if @size >= @vector.size
225
- resize
226
- end
227
-
71
+ resize if @size >= @data.size
228
72
  self[@size] = element
229
73
 
230
74
  @size += 1
231
75
  end
232
76
 
233
77
  def to_a
234
- @vector.to_a
78
+ @data[0...@size].to_a
235
79
  end
236
80
 
237
81
  def dup
238
- NMatrixWrapper.new @vector.to_a
82
+ NMatrixWrapper.new @data.to_a, @context, @nm_dtype
239
83
  end
240
84
 
241
- def coerce dtype
242
- case
243
- when dtype == Array
244
- Daru::Accessors::ArrayWrapper.new @vector[0..(@size-1)].to_a, @caller
245
- when dtype == NMatrix
246
- self
247
- when dtype == MDArray
248
- raise NotImplementedError
249
- else
250
- raise ArgumentError, "Cant coerce to dtype #{dtype}"
251
- end
85
+ def resize size = @size*2
86
+ raise ArgumentError, "Size must be greater than current size" if size < @size
87
+
88
+ @data = NMatrix.new [size], @data.to_a
252
89
  end
253
90
 
254
- def resize size = @size*2
255
- raise "Size must be greater than current size" if size < @size
91
+ def mean
92
+ @data[0...@size].mean.first
93
+ end
94
+
95
+ def product
96
+ @data[0...@size].inject(1) { |m,e| m*e }
97
+ end
98
+
99
+ def sum
100
+ @data[0...@size].inject(:+)
101
+ end
102
+
103
+ def max
104
+ @data.max
105
+ end
256
106
 
257
- @vector = NMatrix.new [size], @vector.to_a
107
+ def min
108
+ @data.min
258
109
  end
259
110
  end
260
111
  end