daru 0.0.4 → 0.0.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40)
  1. checksums.yaml +4 -4
  2. data/CONTRIBUTING.md +0 -0
  3. data/Gemfile +0 -1
  4. data/History.txt +35 -0
  5. data/README.md +178 -198
  6. data/daru.gemspec +5 -7
  7. data/lib/daru.rb +10 -2
  8. data/lib/daru/accessors/array_wrapper.rb +36 -198
  9. data/lib/daru/accessors/nmatrix_wrapper.rb +60 -209
  10. data/lib/daru/core/group_by.rb +183 -0
  11. data/lib/daru/dataframe.rb +615 -167
  12. data/lib/daru/index.rb +17 -16
  13. data/lib/daru/io/io.rb +5 -12
  14. data/lib/daru/maths/arithmetic/dataframe.rb +72 -8
  15. data/lib/daru/maths/arithmetic/vector.rb +19 -6
  16. data/lib/daru/maths/statistics/dataframe.rb +103 -2
  17. data/lib/daru/maths/statistics/vector.rb +102 -61
  18. data/lib/daru/monkeys.rb +8 -0
  19. data/lib/daru/multi_index.rb +199 -0
  20. data/lib/daru/plotting/dataframe.rb +24 -24
  21. data/lib/daru/plotting/vector.rb +14 -15
  22. data/lib/daru/vector.rb +402 -98
  23. data/lib/version.rb +1 -1
  24. data/notebooks/grouping_splitting_pivots.ipynb +529 -0
  25. data/notebooks/intro_with_music_data_.ipynb +104 -119
  26. data/spec/accessors/wrappers_spec.rb +36 -0
  27. data/spec/core/group_by_spec.rb +331 -0
  28. data/spec/dataframe_spec.rb +1237 -475
  29. data/spec/fixtures/sales-funnel.csv +18 -0
  30. data/spec/index_spec.rb +10 -21
  31. data/spec/io/io_spec.rb +4 -14
  32. data/spec/math/arithmetic/dataframe_spec.rb +66 -0
  33. data/spec/math/arithmetic/vector_spec.rb +45 -4
  34. data/spec/math/statistics/dataframe_spec.rb +91 -1
  35. data/spec/math/statistics/vector_spec.rb +32 -6
  36. data/spec/monkeys_spec.rb +10 -1
  37. data/spec/multi_index_spec.rb +216 -0
  38. data/spec/spec_helper.rb +1 -0
  39. data/spec/vector_spec.rb +505 -57
  40. metadata +21 -15
@@ -4,14 +4,12 @@ $:.unshift File.expand_path("../lib", __FILE__)
4
4
  require 'version.rb'
5
5
 
6
6
  DESCRIPTION = <<MSG
7
- Daru (Data Analysis in RUby) is a library for storage, analysis and manipulation
7
+ Daru (Data Analysis in RUby) is a library for analysis, manipulation and visualization
8
8
  of data.
9
9
 
10
- Daru works with Ruby arrays, NMatrix and MDArray, thus working seamlessly accross
11
- ruby interpreters, at the same time providing speed for those who need it.
12
-
13
- This library is under active development so NMatrix and MDArray support is
14
- somewhat limited, but should be available soon!
10
+ Daru works with Ruby arrays and NMatrix, thus working seamlessly accross
11
+ ruby interpreters, at the same time providing speed for those who need it, while
12
+ making working with data super simple and intuitive.
15
13
  MSG
16
14
 
17
15
  Gem::Specification.new do |spec|
@@ -35,6 +33,6 @@ Gem::Specification.new do |spec|
35
33
  spec.add_development_dependency 'awesome_print'
36
34
  spec.add_development_dependency 'nyaplot'
37
35
  if RUBY_ENGINE != 'jruby'
38
- spec.add_development_dependency 'nmatrix', '~> 0.1.0.rc5'
36
+ spec.add_development_dependency 'nmatrix', '~> 0.1.0'
39
37
  end
40
38
  end
@@ -1,7 +1,15 @@
1
- require 'securerandom'
1
+ def jruby?
2
+ RUBY_ENGINE == 'jruby'
3
+ end
4
+
2
5
  require 'csv'
6
+ require 'matrix'
7
+ require 'securerandom'
3
8
 
4
9
  require 'daru/index.rb'
10
+ require 'daru/multi_index.rb'
5
11
  require 'daru/vector.rb'
6
12
  require 'daru/dataframe.rb'
7
- require 'daru/monkeys.rb'
13
+ require 'daru/monkeys.rb'
14
+
15
+ require 'daru/core/group_by.rb'
@@ -2,254 +2,92 @@ module Daru
2
2
  module Accessors
3
3
  # Internal class for wrapping ruby array
4
4
  class ArrayWrapper
5
- module Statistics
6
-
7
- def average_deviation_population m=nil
8
- m ||= mean
9
- (@vector.inject(0) {|memo, val| val + (val - m).abs }) / n_valid
10
- end
11
-
12
- def coefficient_of_variation
13
- standard_deviation_sample / mean
14
- end
15
-
16
- def count value=false
17
- if block_given?
18
- @vector.inject(0){ |memo, val| memo += 1 if yield val; memo}
19
- else
20
- val = frequencies[value]
21
- val.nil? ? 0 : val
22
- end
23
- end
24
-
25
- def factors
26
- index = @data.sorted_indices
27
- index.reduce([]){|memo, val| memo.push(@data[val]) if memo.last != @data[val]; memo}
28
- end # TODO
29
-
30
- def frequencies
31
- @vector.inject({}) do |hash, element|
32
- hash[element] ||= 0
33
- hash[element] += 1
34
- hash
35
- end
36
- end
37
-
38
- def has_missing_data?
39
- has_missing_data
40
- end
41
-
42
- def kurtosis m=nil
43
- m ||= mean
44
- fo = @vector.inject(0){ |a, x| a + ((x - m) ** 4) }
45
- fo.quo(@size * standard_deviation_sample(m) ** 4) - 3
46
- end
47
-
48
- def mean
49
- sum.quo(@size).to_f
50
- end
51
-
52
- def median
53
- percentile 50
54
- end
55
-
56
- def median_absolute_deviation
57
- m = median
58
- recode {|val| (val - m).abs }.median
59
- end
60
-
61
- def mode
62
- freqs = frequencies.values
63
-
64
- @vector[freqs.index(freqs.max)]
65
- end
66
-
67
- def n_valid
68
- @size
69
- end
70
-
71
- def percentile percent
72
- sorted = @vector.sort
73
- v = (n_valid * percent).quo(100)
74
- if v.to_i != v
75
- sorted[v.round]
76
- else
77
- (sorted[(v - 0.5).round].to_f + sorted[(v + 0.5).round]).quo(2)
78
- end
79
- end
80
-
81
- def product
82
- @vector.inject(:*)
83
- end
84
-
85
- def max
86
- @vector.max
87
- end
88
-
89
- def min
90
- @vector.min
91
- end
92
-
93
- def proportion value=1
94
- frequencies[value] / n_valid
95
- end
96
-
97
- def proportions
98
- len = n_valid
99
- frequencies.inject({}) { |hash, arr| hash[arr[0]] = arr[1] / len; hash }
100
- end
101
-
102
- def range
103
- max - min
104
- end
105
-
106
- def ranked
107
- sum = 0
108
- r = frequencies.sort.inject( {} ) do |memo, val|
109
- memo[val[0]] = ((sum + 1) + (sum + val[1])) / 2
110
- sum += val[1]
111
- memo
112
- end
113
-
114
- Daru::Vector.new @vector.map { |e| r[e] }, index: @caller.index,
115
- name: @caller.name, dtype: @caller.dtype
116
- end
117
-
118
- def recode(&block)
119
- @vector.map(&block)
120
- end
121
-
122
- def recode!(&block)
123
- @vector.map!(&block)
124
- end
125
-
126
- # Calculate skewness using (sigma(xi - mean)^3)/((N)*std_dev_sample^3)
127
- def skew m=nil
128
- m ||= mean
129
- th = @vector.inject(0) { |memo, val| memo + ((val - m)**3) }
130
- th.quo (@size * (standard_deviation_sample(m)**3))
131
- end
132
-
133
- def standard_deviation_population m=nil
134
- m ||= mean
135
- Math::sqrt(variance_population(m))
136
- end
137
-
138
- def standard_deviation_sample m=nil
139
- Math::sqrt(variance_sample(m))
140
- end
141
-
142
- def standard_error
143
- standard_deviation_sample/(Math::sqrt(@size))
144
- end
145
-
146
- def sum_of_squared_deviation
147
- (@vector.inject(0) { |a,x| x.square + a } - (sum.square.quo(@size))).to_f
148
- end
149
-
150
- def sum_of_squares(m=nil)
151
- m ||= mean
152
- @vector.inject(0) { |memo, val| memo + (val - m)**2 }
153
- end
154
-
155
- def sum
156
- @vector.inject(:+)
157
- end
158
-
159
- # Sample variance with denominator (N-1)
160
- def variance_sample m=nil
161
- m ||= self.mean
162
-
163
- sum_of_squares(m).quo(@size - 1)
164
- end
165
-
166
- # Population variance with denominator (N)
167
- def variance_population m=nil
168
- m ||= mean
169
-
170
- sum_of_squares(m).quo(@size).to_f
171
- end
172
- end # module Statistics
173
-
174
- include Statistics
175
5
  include Enumerable
176
6
 
177
7
  def each(&block)
178
- @vector.each(&block)
8
+ @data.each(&block)
179
9
  end
180
10
 
181
11
  def map!(&block)
182
- @vector.map!(&block)
12
+ @data.map!(&block)
183
13
  end
184
14
 
185
15
  attr_accessor :size
186
- attr_reader :vector
187
- attr_reader :has_missing_data
16
+ attr_reader :data
188
17
 
189
- def initialize vector, caller
190
- @vector = vector
191
- @caller = caller
18
+ def initialize vector, context
19
+ @data = vector.to_a
20
+ @context = context
192
21
 
193
22
  set_size
194
23
  end
195
24
 
196
25
  def [] index
197
- @vector[index]
26
+ @data[index]
198
27
  end
199
28
 
200
29
  def []= index, value
201
- has_missing_data = true if value.nil?
202
- @vector[index] = value
30
+ @data[index] = value
203
31
  set_size
204
32
  end
205
33
 
206
34
  def == other
207
- @vector == other
35
+ @data == other
208
36
  end
209
37
 
210
38
  def delete_at index
211
- @vector.delete_at index
39
+ @data.delete_at index
212
40
  set_size
213
41
  end
214
42
 
215
43
  def index key
216
- @vector.index key
44
+ @data.index key
217
45
  end
218
46
 
219
47
  def << element
220
- @vector << element
48
+ @data << element
221
49
  set_size
222
50
  end
223
51
 
224
52
  def uniq
225
- @vector.uniq
53
+ @data.uniq
226
54
  end
227
55
 
228
56
  def to_a
229
- @vector
57
+ @data
230
58
  end
231
59
 
232
60
  def dup
233
- ArrayWrapper.new @vector.dup, @caller
61
+ ArrayWrapper.new @data.dup, @context
62
+ end
63
+
64
+ def mean
65
+ sum.quo(@size - @context.nil_positions.size).to_f
66
+ end
67
+
68
+ def product
69
+ @data.inject(1) { |m,e| m*e unless e.nil? }
70
+ end
71
+
72
+ def max
73
+ @data.max
74
+ end
75
+
76
+ def min
77
+ @data.min
234
78
  end
235
79
 
236
- def coerce dtype
237
- case
238
- when dtype == Array
239
- self
240
- when dtype == NMatrix
241
- Daru::Accessors::NMatrixWrapper.new @vector, @caller
242
- when dtype == MDArray
243
- raise NotImplementedError
244
- else
245
- raise ArgumentError, "Cant coerce to dtype #{dtype}"
80
+ def sum
81
+ @data.inject(0) do |memo ,e|
82
+ memo += e unless e.nil? #TODO: Remove this conditional somehow!
83
+ memo
246
84
  end
247
85
  end
248
86
 
249
87
  private
250
88
 
251
89
  def set_size
252
- @size = @vector.size
90
+ @size = @data.size
253
91
  end
254
92
  end
255
93
  end
@@ -1,260 +1,111 @@
1
- require 'nmatrix'
1
+ begin
2
+ require 'nmatrix' unless jruby?
3
+ rescue LoadError => e
4
+ puts "Please install the nmatrix gem for fast and efficient data storage."
5
+ end
2
6
 
3
7
  module Daru
4
8
  module Accessors
5
-
6
9
  # Internal class for wrapping NMatrix
7
10
  class NMatrixWrapper
8
- module Statistics
9
- # def average_deviation_population m=nil
10
- # m ||= self.mean
11
- # (self.reduce(0){|memo, val| val + (val - m).abs})/self.length
12
- # end
13
-
14
- # def coefficient_of_variation
15
- # self.standard_deviation_sample/self.mean
16
- # end
17
-
18
- # def count x=false
19
- # if block_given?
20
- # self.reduce(0){|memo, val| memo += 1 if yield val; memo}
21
- # else
22
- # val = self.frequencies[x]
23
- # val.nil? ? 0 : val
24
- # end
25
- # end
26
-
27
- # def factors
28
- # index = @data.sorted_indices
29
- # index.reduce([]){|memo, val| memo.push(@data[val]) if memo.last != @data[val]; memo}
30
- # end
31
-
32
- # def frequencies
33
- # index = @data.sorted_indices
34
- # index.reduce({}){|memo, val| memo[@data[val]] ||= 0; memo[@data[val]] += 1; memo}
35
- # end
36
-
37
- # def has_missing_data?
38
- # @missing_data
39
- # end
40
-
41
- # def is_valid?
42
- # true
43
- # end
44
-
45
- # def kurtosis(m=nil)
46
- # m ||= self.mean
47
- # fo=self.reduce(0){|a, x| a+((x-m)**4)}
48
- # fo.quo(self.length*sd(m)**4)-3
49
- # end
50
-
51
- # def mean
52
- # @vector[0...@size].mean.first
53
- # end
54
-
55
- # def median
56
- # self.percentil(50)
57
- # end
58
-
59
- # def median_absolute_deviation
60
- # m = self.median
61
- # self.recode{|val| (val-m).abls}.median
62
- # end
63
-
64
- # def mode
65
- # self.frequencies.max
66
- # end
67
-
68
- # def ==(other)
69
- # @data==other
70
- # end
71
-
72
- # def n_valid
73
- # self.length
74
- # end
75
-
76
- # def percentil(percent)
77
- # index = @data.sorted_indices
78
- # pos = (self.length * percent)/100
79
- # if pos.to_i == pos
80
- # @data[index[pos.to_i]]
81
- # else
82
- # pos = (pos-0.5).to_i
83
- # (@data[index[pos]] + @data[index[pos+1]])/2
84
- # end
85
- # end
86
-
87
- # def product
88
- # @data.inject(1){|memo, val| memo*val}
89
- # end
90
-
91
- # def proportion(val=1)
92
- # self.frequencies[val]/self.n_valid
93
- # end
94
-
95
- # def proportion_confidence_interval_t
96
- # raise "NotImplementedError"
97
- # end
98
-
99
- # def proportion_confidence_interval_z
100
- # raise "NotImplementedError"
101
- # end
102
-
103
- # def proportions
104
- # len = self.n_valid
105
- # self.frequencies.reduce({}){|memo, arr| memo[arr[0]] = arr[1]/len}
106
- # end
107
-
108
- # def push(val)
109
- # self.expand(self.length+1)
110
- # self[self.length-1] = recode
111
- # end
112
-
113
- # def range
114
- # max - min
115
- # end
116
-
117
- # def ranked
118
- # sum = 0
119
- # r = self.frequencies.sort.reduce({}) do |memo, val|
120
- # memo[val[0]] = ((sum+1) + (sum+val[1]))/2
121
- # sum += val[1]
122
- # memo
123
- # end
124
- # Mikon::DArray.new(self.reduce{|val| r[val]})
125
- # end
126
-
127
- # def recode(&block)
128
- # Mikon::DArray.new(@data.map(&block))
129
- # end
130
-
131
- # def recode!(&block)
132
- # @data.map!(&block)
133
- # end
134
-
135
- # def skew(m=nil)
136
- # m ||= self.mean
137
- # th = self.reduce(0){|memo, val| memo + ((val - m)**3)}
138
- # th/((self.length)*self.sd(m)**3)
139
- # end
140
-
141
- # def standard_deviation_population(m=nil)
142
- # m ||= self.mean
143
- # Maths.sqrt(self.variance_population(m))
144
- # end
145
-
146
- # def standard_deviation_sample(m=nil)
147
- # if !m.nil?
148
- # Maths.sqrt(variance_sample(m))
149
- # else
150
- # @data.std.first
151
- # end
152
- # end
153
-
154
- # def standard_error
155
- # self.standard_deviation_sample/(Maths.sqrt(self.length))
156
- # end
157
-
158
- # def sum_of_squared_deviation
159
- # self.reduce(0){|memo, val| val**2 + memo}
160
- # end
161
-
162
- # def sum_of_squares(m=nil)
163
- # m ||= self.mean
164
- # self.reduce(0){|memo, val| memo + (val-m)**2}
165
- # end
11
+ include Enumerable
166
12
 
167
- # def sum
168
- # @data.sum.first
169
- # end
13
+ def each(&block)
14
+ @data[0...@size].each(&block)
15
+ end
170
16
 
171
- # def variance_sample(m=nil)
172
- # m ||= self.mean
173
- # self.sum_of_squares(m)/(self.length-1)
174
- # end
175
- end # module Statistics
17
+ def map(&block)
18
+ @data[0...@size].map(&block)
19
+ end
176
20
 
177
- include Statistics
178
- include Enumerable
21
+ def map!(&block)
22
+ @data = NMatrix.new [@size*2], map(&block).to_a, dtype: nm_dtype
23
+ end
179
24
 
180
- def each(&block)
181
- @vector.each(&block)
25
+ def inject(*args, &block)
26
+ @data[0...@size].inject(*args, &block)
182
27
  end
183
28
 
184
- attr_reader :size, :vector, :missing_data
29
+ alias_method :recode, :map
30
+ alias_method :recode!, :map!
185
31
 
186
- def initialize vector, caller
32
+ attr_reader :size, :data, :nm_dtype
33
+
34
+ def initialize vector, context, nm_dtype=:int32
187
35
  @size = vector.size
188
- @vector = NMatrix.new [@size*2], vector.to_a
189
- @missing_data = false
190
- @caller = caller
36
+ @data = NMatrix.new [@size*2], vector.to_a, dtype: nm_dtype
37
+ @context = context
38
+ @nm_dtype = @data.dtype
191
39
  # init with twice the storage for reducing the need to resize
192
40
  end
193
41
 
194
42
  def [] index
195
- @vector[index]
43
+ return @data[index] if index < @size
44
+ nil
196
45
  end
197
46
 
198
47
  def []= index, value
199
- resize if index >= @size
200
-
201
- if value.nil?
202
- @missing_data = true
203
- @vector = @vector.cast(dtype: :object)
204
- end
205
- @vector[index] = value
48
+ resize if index >= @data.size
49
+ @size += 1 if index == @size
50
+
51
+ @data = @data.cast(dtype: :object) if value.nil?
52
+ @data[index] = value
206
53
  end
207
54
 
208
55
  def == other
209
- @vector == other and @size == other.size
56
+ @data == other and @size == other.size
210
57
  end
211
58
 
212
59
  def delete_at index
213
- arry = @vector.to_a
60
+ arry = @data.to_a
214
61
  arry.delete_at index
215
- @vector = NMatrix.new [@size-1], arry
62
+ @data = NMatrix.new [(2*@size-1)], arry, dtype: @nm_dtype
216
63
  @size -= 1
217
64
  end
218
65
 
219
66
  def index key
220
- @vector.to_a.index key
67
+ @data.to_a.index key
221
68
  end
222
69
 
223
70
  def << element
224
- if @size >= @vector.size
225
- resize
226
- end
227
-
71
+ resize if @size >= @data.size
228
72
  self[@size] = element
229
73
 
230
74
  @size += 1
231
75
  end
232
76
 
233
77
  def to_a
234
- @vector.to_a
78
+ @data[0...@size].to_a
235
79
  end
236
80
 
237
81
  def dup
238
- NMatrixWrapper.new @vector.to_a
82
+ NMatrixWrapper.new @data.to_a, @context, @nm_dtype
239
83
  end
240
84
 
241
- def coerce dtype
242
- case
243
- when dtype == Array
244
- Daru::Accessors::ArrayWrapper.new @vector[0..(@size-1)].to_a, @caller
245
- when dtype == NMatrix
246
- self
247
- when dtype == MDArray
248
- raise NotImplementedError
249
- else
250
- raise ArgumentError, "Cant coerce to dtype #{dtype}"
251
- end
85
+ def resize size = @size*2
86
+ raise ArgumentError, "Size must be greater than current size" if size < @size
87
+
88
+ @data = NMatrix.new [size], @data.to_a
252
89
  end
253
90
 
254
- def resize size = @size*2
255
- raise "Size must be greater than current size" if size < @size
91
+ def mean
92
+ @data[0...@size].mean.first
93
+ end
94
+
95
+ def product
96
+ @data[0...@size].inject(1) { |m,e| m*e }
97
+ end
98
+
99
+ def sum
100
+ @data[0...@size].inject(:+)
101
+ end
102
+
103
+ def max
104
+ @data.max
105
+ end
256
106
 
257
- @vector = NMatrix.new [size], @vector.to_a
107
+ def min
108
+ @data.min
258
109
  end
259
110
  end
260
111
  end