daru 0.0.5 → 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (48) hide show
  1. checksums.yaml +4 -4
  2. data/.build.sh +14 -0
  3. data/.travis.yml +26 -4
  4. data/CONTRIBUTING.md +31 -0
  5. data/Gemfile +1 -2
  6. data/{History.txt → History.md} +110 -44
  7. data/README.md +21 -288
  8. data/Rakefile +1 -0
  9. data/daru.gemspec +12 -8
  10. data/lib/daru.rb +36 -1
  11. data/lib/daru/accessors/array_wrapper.rb +8 -3
  12. data/lib/daru/accessors/gsl_wrapper.rb +113 -0
  13. data/lib/daru/accessors/nmatrix_wrapper.rb +6 -17
  14. data/lib/daru/core/group_by.rb +0 -1
  15. data/lib/daru/dataframe.rb +1192 -83
  16. data/lib/daru/extensions/rserve.rb +21 -0
  17. data/lib/daru/index.rb +14 -0
  18. data/lib/daru/io/io.rb +170 -8
  19. data/lib/daru/maths/arithmetic/dataframe.rb +4 -3
  20. data/lib/daru/maths/arithmetic/vector.rb +4 -4
  21. data/lib/daru/maths/statistics/dataframe.rb +48 -27
  22. data/lib/daru/maths/statistics/vector.rb +215 -33
  23. data/lib/daru/monkeys.rb +53 -7
  24. data/lib/daru/multi_index.rb +21 -4
  25. data/lib/daru/plotting/dataframe.rb +83 -25
  26. data/lib/daru/plotting/vector.rb +9 -10
  27. data/lib/daru/vector.rb +596 -61
  28. data/lib/daru/version.rb +3 -0
  29. data/spec/accessors/wrappers_spec.rb +51 -0
  30. data/spec/core/group_by_spec.rb +0 -2
  31. data/spec/daru_spec.rb +58 -0
  32. data/spec/dataframe_spec.rb +768 -73
  33. data/spec/extensions/rserve_spec.rb +52 -0
  34. data/spec/fixtures/bank2.dat +200 -0
  35. data/spec/fixtures/repeated_fields.csv +7 -0
  36. data/spec/fixtures/scientific_notation.csv +4 -0
  37. data/spec/fixtures/test_xls.xls +0 -0
  38. data/spec/io/io_spec.rb +161 -24
  39. data/spec/math/arithmetic/dataframe_spec.rb +26 -7
  40. data/spec/math/arithmetic/vector_spec.rb +8 -0
  41. data/spec/math/statistics/dataframe_spec.rb +16 -1
  42. data/spec/math/statistics/vector_spec.rb +215 -47
  43. data/spec/spec_helper.rb +21 -2
  44. data/spec/vector_spec.rb +368 -12
  45. metadata +99 -16
  46. data/lib/version.rb +0 -3
  47. data/notebooks/grouping_splitting_pivots.ipynb +0 -529
  48. data/notebooks/intro_with_music_data_.ipynb +0 -303
@@ -0,0 +1,21 @@
1
+ # Support for converting data to R data structures to support rserve-client
2
+
3
+ module Daru
4
+ class DataFrame
5
+ def to_REXP
6
+ names = @vectors.to_a
7
+ data = names.map do |f|
8
+ Rserve::REXP::Wrapper.wrap(self[f].to_a)
9
+ end
10
+ l = Rserve::Rlist.new(data, names.map(&:to_s))
11
+
12
+ Rserve::REXP.create_data_frame(l)
13
+ end
14
+ end
15
+
16
+ class Vector
17
+ def to_REXP
18
+ Rserve::REXP::Wrapper.wrap(self.to_a)
19
+ end
20
+ end
21
+ end
@@ -101,8 +101,22 @@ module Daru
101
101
  @relation_hash.has_key? index
102
102
  end
103
103
 
104
+ def empty?
105
+ @relation_hash.empty?
106
+ end
107
+
104
108
  def dup
105
109
  Daru::Index.new @relation_hash.keys
106
110
  end
111
+
112
+ def _dump depth
113
+ Marshal.dump({relation_hash: @relation_hash})
114
+ end
115
+
116
+ def self._load data
117
+ h = Marshal.load data
118
+
119
+ Daru::Index.new(h[:relation_hash].keys, h[:relation_hash].values)
120
+ end
107
121
  end
108
122
  end
@@ -1,22 +1,184 @@
1
1
  module Daru
2
+ module IOHelpers
3
+ class << self
4
+ def process_row(row,empty)
5
+ row.to_a.map do |c|
6
+ if empty.include?(c)
7
+ nil
8
+ else
9
+ if c.is_a? String and c.is_number?
10
+ c =~ /^\d+$/ ? c.to_i : c.gsub(",",".").to_f
11
+ else
12
+ c
13
+ end
14
+ end
15
+ end
16
+ end
17
+ end
18
+ end
19
+
2
20
  module IO
3
21
  class << self
22
+ # Functions for loading/writing Excel files.
23
+
24
+ def from_excel path, opts={}
25
+ opts = {
26
+ :worksheet_id => 0,
27
+ }.merge opts
28
+
29
+ worksheet_id = opts[:worksheet_id]
30
+ book = Spreadsheet.open path
31
+ worksheet = book.worksheet worksheet_id
32
+ headers = worksheet.row(0).recode_repeated.map(&:to_sym)
33
+
34
+ df = Daru::DataFrame.new({})
35
+ headers.each_with_index do |h,i|
36
+ col = worksheet.column(i).to_a
37
+ col.delete_at 0
38
+ df[h] = col
39
+ end
40
+
41
+ df
42
+ end
43
+
44
+ def dataframe_write_excel dataframe, path, opts={}
45
+ book = Spreadsheet::Workbook.new
46
+ sheet = book.create_worksheet
47
+ format = Spreadsheet::Format.new :color => :blue, :weight => :bold
48
+
49
+ sheet.row(0).concat(dataframe.vectors.to_a.map(&:to_s)) # Unfreeze strings
50
+ sheet.row(0).default_format = format
51
+ i = 1
52
+ dataframe.each_row do |row|
53
+ sheet.row(i).concat(row.to_a)
54
+ i += 1
55
+ end
56
+
57
+ book.write(path)
58
+ end
59
+
60
+ # Functions for loading/writing CSV files
61
+
4
62
  def from_csv path, opts={}
5
63
  opts[:col_sep] ||= ','
6
- opts[:headers] ||= true
7
64
  opts[:converters] ||= :numeric
8
- opts[:header_converters] ||= :symbol
9
65
 
10
- csv = CSV.read(path, 'r', opts)
66
+ daru_options = opts.keys.inject({}) do |hash, k|
67
+ if [:clone, :order, :index, :name].include?(k)
68
+ hash[k] = opts[k]
69
+ opts.delete k
70
+ end
71
+
72
+ hash
73
+ end
74
+
75
+ # Preprocess headers for detecting and correcting repetition in
76
+ # case the :headers option is not specified.
77
+ unless opts[:headers]
78
+ csv = ::CSV.open(path, 'rb', opts)
79
+ yield csv if block_given?
80
+
81
+ csv_as_arrays = csv.to_a
82
+ headers = csv_as_arrays[0].recode_repeated.map(&:to_sym)
83
+ csv_as_arrays.delete_at 0
84
+ csv_as_arrays = csv_as_arrays.transpose
11
85
 
12
- yield csv if block_given?
86
+ hsh = {}
87
+ headers.each_with_index do |h, i|
88
+ hsh[h] = csv_as_arrays[i]
89
+ end
90
+ else
91
+ opts[:header_converters] ||= :symbol
92
+
93
+ csv = ::CSV.read(path, 'rb',opts)
94
+ yield csv if block_given?
13
95
 
14
- hsh = {}
15
- csv.by_col!.each do |col_name, values|
16
- hsh[col_name] = values
96
+ hsh = {}
97
+ csv.by_col.each do |col_name, values|
98
+ hsh[col_name] = values
99
+ end
17
100
  end
18
101
 
19
- Daru::DataFrame.new(hsh)
102
+ Daru::DataFrame.new(hsh,daru_options)
103
+ end
104
+
105
+ def dataframe_write_csv dataframe, path, opts={}
106
+ options = {
107
+ converters: :numeric
108
+ }.merge(opts)
109
+
110
+ writer = ::CSV.open(path, 'w', options)
111
+ writer << dataframe.vectors.to_a
112
+
113
+ dataframe.each_row do |row|
114
+ if options[:convert_comma]
115
+ writer << row.map { |v| v.to_s.gsub('.', ',') }
116
+ else
117
+ writer << row.to_a
118
+ end
119
+ end
120
+
121
+ writer.close
122
+ end
123
+
124
+ # Loading/writing from SQL databases
125
+
126
+ def from_sql dbh, query
127
+ require 'dbi'
128
+ sth = dbh.execute(query)
129
+ vectors = {}
130
+ fields = []
131
+ sth.column_info.each do |c|
132
+ vectors[c[:name]] = Daru::Vector.new([])
133
+ vectors[c[:name]].rename c[:name]
134
+ fields.push(c[:name].to_sym)
135
+ end
136
+ ds=Daru::DataFrame.new(vectors,order: fields)
137
+ sth.fetch do |row|
138
+ ds.add_row(row.to_a)
139
+ end
140
+ ds.update
141
+ ds
142
+ end
143
+
144
+ def dataframe_write_sql ds, dbh, table
145
+ require 'dbi'
146
+ query = "INSERT INTO #{table} ("+ds.vectors.to_a.join(",")+") VALUES ("+((["?"]*ds.vectors.size).join(","))+")"
147
+ sth = dbh.prepare(query)
148
+ ds.each_row { |c| sth.execute(*c.to_a) }
149
+ return true
150
+ end
151
+
152
+ # Loading data from plain text files
153
+
154
+ def from_plaintext filename, fields
155
+ ds = Daru::DataFrame.new({}, order: fields)
156
+ fp = File.open(filename,"r")
157
+ fp.each_line do |line|
158
+ row = Daru::IOHelpers.process_row(line.strip.split(/\s+/),[""])
159
+ next if row == ["\x1A"]
160
+ ds.add_row(row)
161
+ end
162
+ ds.update
163
+ fields.each { |f| ds[f].rename f }
164
+ ds
165
+ end
166
+
167
+ # Loading and writing Marshalled DataFrame/Vector
168
+ def save klass, filename
169
+ fp = File.open(filename, 'w')
170
+ Marshal.dump(klass, fp)
171
+ fp.close
172
+ end
173
+
174
+ def load filename
175
+ if File.exist? filename
176
+ o = false
177
+ File.open(filename, 'r') { |fp| o = Marshal.load(fp) }
178
+ o
179
+ else
180
+ false
181
+ end
20
182
  end
21
183
  end
22
184
  end
@@ -36,15 +36,16 @@ module Daru
36
36
 
37
37
  # Calculate exponential of all vectors with numeric values.
38
38
  def exp
39
- self.dup.map_vectors! { |v| v.exp if v.type == :numeric }
39
+ only_numerics(clone: false).recode { |v| v.exp }
40
40
  end
41
41
 
42
+ # Calculate square root of numeric vectors.
42
43
  def sqrt
43
- self.dup.map_vectors! { |v| v.sqrt if v.type == :numeric }
44
+ only_numerics(clone: false).recode { |v| v.sqrt }
44
45
  end
45
46
 
46
47
  def round precision=0
47
- self.dup.map_vectors! { |v| v.round(precision) if v.type == :numeric }
48
+ only_numerics(clone: false).recode { |v| v.round(precision) }
48
49
  end
49
50
  private
50
51
 
@@ -35,17 +35,17 @@ module Daru
35
35
  end
36
36
 
37
37
  def abs
38
- self.dup.map! { |e| e.abs unless e.nil? }
38
+ self.recode { |e| e.abs unless e.nil? }
39
39
  end
40
40
 
41
41
  def round precision=0
42
- self.dup.map! { |e| e.round(precision) unless e.nil? }
42
+ self.recode { |e| e.round(precision) unless e.nil? }
43
43
  end
44
44
 
45
45
  private
46
46
 
47
47
  def math_unary_op operation
48
- self.dup.map! { |e| Math.send(operation, e) unless e.nil? }
48
+ self.recode { |e| Math.send(operation, e) unless e.nil? }
49
49
  end
50
50
 
51
51
  def binary_op operation, other
@@ -65,7 +65,7 @@ module Daru
65
65
  def v2v_binary operation, other
66
66
  common_idxs = []
67
67
  elements = []
68
- index = (@index.to_a + other.index.to_a).uniq.sort
68
+ index = (@index.to_a | other.index.to_a).sort
69
69
 
70
70
  index.each do |idx|
71
71
  this = self[idx]
@@ -37,6 +37,15 @@ module Daru
37
37
  compute_stats :product
38
38
  end
39
39
 
40
+ def standardize
41
+ df = self.only_numerics clone: true
42
+ df.map! do |v|
43
+ v.standardize
44
+ end
45
+
46
+ df
47
+ end
48
+
40
49
  # Create a summary of mean, standard deviation, count, max and min of
41
50
  # each numeric vector in the dataframe in one shot.
42
51
  #
@@ -55,40 +64,39 @@ module Daru
55
64
  Daru::DataFrame.new(description_hash, index: methods)
56
65
  end
57
66
 
58
- # Calculate variance-covariance between the numeric vectors.
59
- #
60
- # == Arguments
61
- #
62
- # +for_sample_data+ - If set to false, will calculate the population
63
- # covariance (denominator N), otherwise calculates the sample covariance
64
- # matrix. Default to true.
65
- def covariance for_sample_data=true
66
- cov_arry =
67
- if defined? NMatrix and NMatrix.respond_to?(:cov)
68
- to_nmatrix.cov(for_sample_data).to_a
69
- else
70
- df_as_matrix = to_matrix
71
- denominator = for_sample_data ? rows - 1 : rows
72
- ones = Matrix.column_vector [1]*rows
73
- deviation_scores = df_as_matrix - (ones * ones.transpose * df_as_matrix) / rows
74
- ((deviation_scores.transpose * deviation_scores) / denominator).to_a
67
+ # Calculate sample variance-covariance between the numeric vectors.
68
+ def covariance
69
+ cache={}
70
+ vectors = self.numeric_vectors
71
+
72
+ mat_rows = vectors.collect do |row|
73
+ vectors.collect do |col|
74
+ if row == col
75
+ self[row].variance
76
+ else
77
+ if cache[[col,row]].nil?
78
+ cov = vector_cov(self[row],self[col])
79
+ cache[[row,col]] = cov
80
+ cov
81
+ else
82
+ cache[[col,row]]
83
+ end
84
+ end
85
+ end
75
86
  end
76
87
 
77
- Daru::DataFrame.rows(cov_arry, index: numeric_vectors, order: numeric_vectors)
88
+ Daru::DataFrame.rows(mat_rows, index: numeric_vectors, order: numeric_vectors)
78
89
  end
79
90
 
80
91
  alias :cov :covariance
81
92
 
82
93
  # Calculate the correlation between the numeric vectors.
83
94
  def correlation
84
- corr_arry =
85
- if defined? NMatrix and NMatrix.respond_to?(:corr)
86
- to_nmatrix.corr.to_a
87
- else
88
- standard_deviation = std.to_matrix
89
- (cov.to_matrix.elementwise_division(standard_deviation.transpose *
90
- standard_deviation)).to_a
91
- end
95
+ standard_deviation = std.to_matrix
96
+ corr_arry = (cov
97
+ .to_matrix
98
+ .elementwise_division(standard_deviation.transpose *
99
+ standard_deviation)).to_a
92
100
 
93
101
  Daru::DataFrame.rows(corr_arry, index: numeric_vectors, order: numeric_vectors)
94
102
  end
@@ -97,12 +105,25 @@ module Daru
97
105
 
98
106
  private
99
107
 
108
+ def vector_cov v1a, v2a
109
+ sum_of_squares(v1a,v2a) / (v1a.size - 1)
110
+ end
111
+
112
+ def sum_of_squares v1, v2
113
+ v1a,v2a = v1.only_valid ,v2.only_valid
114
+ v1a.reset_index!
115
+ v2a.reset_index!
116
+ m1 = v1a.mean
117
+ m2 = v2a.mean
118
+ (v1a.size).times.inject(0) {|ac,i| ac+(v1a[i]-m1)*(v2a[i]-m2)}
119
+ end
120
+
100
121
  def compute_stats method
101
122
  Daru::Vector.new(
102
123
  numeric_vectors.inject({}) do |hash, vec|
103
124
  hash[vec] = self[vec].send(method)
104
125
  hash
105
- end
126
+ end, name: method
106
127
  )
107
128
  end
108
129
  end
@@ -1,8 +1,8 @@
1
1
  module Daru
2
2
  module Maths
3
3
  # Encapsulates statistics methods for vectors. Most basic stuff like mean, etc.
4
- # is done inside the wrapper, so that native methods can be used for most of
5
- # the computationally intensive tasks.
4
+ # is done inside the wrapper, so that native methods can be used for most of
5
+ # the computationally intensive tasks.
6
6
  module Statistics
7
7
  module Vector
8
8
  def mean
@@ -26,7 +26,7 @@ module Daru
26
26
  end
27
27
 
28
28
  def median
29
- percentile 50
29
+ @data.respond_to?(:median) ? @data.median : percentile(50)
30
30
  end
31
31
 
32
32
  def mode
@@ -36,15 +36,21 @@ module Daru
36
36
 
37
37
  def median_absolute_deviation
38
38
  m = median
39
- map {|val| (val - m).abs }.median
39
+ recode {|val| (val - m).abs }.median
40
40
  end
41
+ alias :mad :median_absolute_deviation
41
42
 
42
43
  def standard_error
43
- standard_deviation_sample/(Math::sqrt((@size - @nil_positions.size)))
44
+ standard_deviation_sample/(Math::sqrt((n_valid)))
44
45
  end
45
46
 
46
47
  def sum_of_squared_deviation
47
- (@data.to_a.inject(0) { |a,x| x.square + a } - (sum.square.quo((@size - @nil_positions.size)))).to_f
48
+ (@data.inject(0) { |a,x| x.square + a } - (sum.square.quo(n_valid)).to_f).to_f
49
+ end
50
+
51
+ # Retrieve unique values of non-nil data
52
+ def factors
53
+ only_valid.uniq.reset_index!
48
54
  end
49
55
 
50
56
  # Maximum element of the vector.
@@ -69,12 +75,18 @@ module Daru
69
75
 
70
76
  def frequencies
71
77
  @data.inject({}) do |hash, element|
72
- hash[element] ||= 0
73
- hash[element] += 1
78
+ unless element.nil?
79
+ hash[element] ||= 0
80
+ hash[element] += 1
81
+ end
74
82
  hash
75
83
  end
76
84
  end
77
85
 
86
+ def freqs
87
+ Daru::Vector.new(frequencies)
88
+ end
89
+
78
90
  def proportions
79
91
  len = n_valid
80
92
  frequencies.inject({}) { |hash, arr| hash[arr[0]] = arr[1] / len; hash }
@@ -83,13 +95,12 @@ module Daru
83
95
  def ranked
84
96
  sum = 0
85
97
  r = frequencies.sort.inject( {} ) do |memo, val|
86
- memo[val[0]] = ((sum + 1) + (sum + val[1])) / 2
98
+ memo[val[0]] = ((sum + 1) + (sum + val[1])).quo(2)
87
99
  sum += val[1]
88
100
  memo
89
101
  end
90
102
 
91
- Daru::Vector.new @data.map { |e| r[e] }, index: self.index,
92
- name: self.name, dtype: self.dtype, nm_dtype: self.nm_dtype
103
+ recode { |e| r[e] }
93
104
  end
94
105
 
95
106
  def coefficient_of_variation
@@ -107,69 +118,239 @@ module Daru
107
118
  val = frequencies[value]
108
119
  val.nil? ? 0 : val
109
120
  else
110
- size - @nil_positions.size
121
+ size - @missing_positions.size
111
122
  end
112
123
  end
113
124
 
114
125
  def proportion value=1
115
- frequencies[value] / n_valid
126
+ frequencies[value].quo(n_valid).to_f
116
127
  end
117
128
 
118
129
  # Sample variance with denominator (N-1)
119
130
  def variance_sample m=nil
120
131
  m ||= self.mean
121
- sum_of_squares(m).quo((@size - @nil_positions.size) - 1)
132
+ if @data.respond_to? :variance_sample
133
+ @data.variance_sample m
134
+ else
135
+ sum_of_squares(m).quo((n_valid) - 1)
136
+ end
122
137
  end
123
138
 
124
139
  # Population variance with denominator (N)
125
140
  def variance_population m=nil
126
141
  m ||= mean
127
- sum_of_squares(m).quo((@size - @nil_positions.size)).to_f
142
+ if @data.respond_to? :variance_population
143
+ @data.variance_population m
144
+ else
145
+ sum_of_squares(m).quo((n_valid)).to_f
146
+ end
128
147
  end
129
148
 
130
149
  def sum_of_squares(m=nil)
131
150
  m ||= mean
132
- @data.inject(0) { |memo, val| memo + (val - m)**2 }
151
+ @data.inject(0) { |memo, val|
152
+ @missing_values.has_key?(val) ? memo : (memo + (val - m)**2)
153
+ }
133
154
  end
134
155
 
135
156
  def standard_deviation_population m=nil
136
157
  m ||= mean
137
- Math::sqrt(variance_population(m))
158
+ if @data.respond_to? :standard_deviation_population
159
+ @data.standard_deviation_population(m)
160
+ else
161
+ Math::sqrt(variance_population(m))
162
+ end
138
163
  end
139
164
 
140
165
  def standard_deviation_sample m=nil
141
- Math::sqrt(variance_sample(m))
166
+ m ||= mean
167
+ if @data.respond_to? :standard_deviation_sample
168
+ @data.standard_deviation_sample m
169
+ else
170
+ Math::sqrt(variance_sample(m))
171
+ end
142
172
  end
143
173
 
144
174
  # Calculate skewness using (sigma(xi - mean)^3)/((N)*std_dev_sample^3)
145
175
  def skew m=nil
146
- m ||= mean
147
- th = @data.inject(0) { |memo, val| memo + ((val - m)**3) }
148
- th.quo ((@size - @nil_positions.size) * (standard_deviation_sample(m)**3))
176
+ if @data.respond_to? :skew
177
+ @data.skew
178
+ else
179
+ m ||= mean
180
+ th = @data.inject(0) { |memo, val| memo + ((val - m)**3) }
181
+ th.quo ((@size - @missing_positions.size) * (standard_deviation_sample(m)**3))
182
+ end
149
183
  end
150
184
 
151
185
  def kurtosis m=nil
152
- m ||= mean
153
- fo = @data.inject(0){ |a, x| a + ((x - m) ** 4) }
154
- fo.quo((@size - @nil_positions.size) * standard_deviation_sample(m) ** 4) - 3
186
+ if @data.respond_to? :kurtosis
187
+ @data.kurtosis
188
+ else
189
+ m ||= mean
190
+ fo = @data.inject(0){ |a, x| a + ((x - m) ** 4) }
191
+ fo.quo((@size - @missing_positions.size) * standard_deviation_sample(m) ** 4) - 3
192
+ end
155
193
  end
156
194
 
157
195
  def average_deviation_population m=nil
196
+ type == :numeric or raise TypeError, "Vector must be numeric"
197
+ m ||= mean
198
+ (@data.inject( 0 ) { |memo, val|
199
+ @missing_values.has_key?(val) ? memo : ( val - m ).abs + memo
200
+ }).quo( n_valid )
201
+ end
202
+
203
+ # Returns the value of the percentile q
204
+ #
205
+ # Accepts an optional second argument specifying the strategy to interpolate
206
+ # when the requested percentile lies between two data points a and b
207
+ # Valid strategies are:
208
+ # * :midpoint (Default): (a + b) / 2
209
+ # * :linear : a + (b - a) * d where d is the decimal part of the index between a and b.
210
+ # == References
211
+ #
212
+ # This is the NIST recommended method (http://en.wikipedia.org/wiki/Percentile#NIST_method)
213
+ def percentile(q, strategy = :midpoint)
214
+ sorted = only_valid(:array).sort
215
+
216
+ case strategy
217
+ when :midpoint
218
+ v = (n_valid * q).quo(100)
219
+ if(v.to_i!=v)
220
+ sorted[v.to_i]
221
+ else
222
+ (sorted[(v-0.5).to_i].to_f + sorted[(v+0.5).to_i]).quo(2)
223
+ end
224
+ when :linear
225
+ index = (q / 100.0) * (n_valid + 1)
226
+
227
+ k = index.truncate
228
+ d = index % 1
229
+
230
+ if k == 0
231
+ sorted[0]
232
+ elsif k >= sorted.size
233
+ sorted[-1]
234
+ else
235
+ sorted[k - 1] + d * (sorted[k] - sorted[k - 1])
236
+ end
237
+ else
238
+ raise NotImplementedError.new "Unknown strategy #{strategy.to_s}"
239
+ end
240
+ end
241
+
242
+ # Dichotomize the vector with 0 and 1, based on lowest value.
243
+ # If parameter is defined, this value and lower will be 0
244
+ # and higher, 1.
245
+ def dichotomize(low = nil)
246
+ low ||= factors.min
247
+
248
+ self.recode do |x|
249
+ if x.nil?
250
+ nil
251
+ elsif x > low
252
+ 1
253
+ else
254
+ 0
255
+ end
256
+ end
257
+ end
258
+
259
+ # Center data by subtracting the mean from each non-nil value.
260
+ def center
261
+ self - mean
262
+ end
263
+
264
+ # Standardize data.
265
+ #
266
+ # == Arguments
267
+ #
268
+ # * use_population - Pass as *true* if you want to use population
269
+ # standard deviation instead of sample standard deviation.
270
+ def standardize use_population=false
158
271
  m ||= mean
159
- (@data.inject(0) {|memo, val| val + (val - m).abs }) / n_valid
272
+ sd = use_population ? sdp : sds
273
+ return Daru::Vector.new([nil]*@size) if m.nil? or sd == 0.0
274
+
275
+ vector_standardized_compute m, sd
160
276
  end
161
277
 
162
- def recode!(&block)
163
- @data.recode!(&block)
278
+ def box_cox_transformation lambda # :nodoc:
279
+ raise "Should be a numeric" unless @type == :numeric
280
+
281
+ self.recode do |x|
282
+ if !x.nil?
283
+ if(lambda == 0)
284
+ Math.log(x)
285
+ else
286
+ (x ** lambda - 1).quo(lambda)
287
+ end
288
+ else
289
+ nil
290
+ end
291
+ end
164
292
  end
165
293
 
166
- def percentile percent
167
- sorted = @data.sort
168
- v = (n_valid * percent).quo(100)
169
- if v.to_i != v
170
- sorted[v.round]
294
+ # Replace each non-nil value in the vector with its percentile.
295
+ def vector_percentile
296
+ c = size - missing_positions.size
297
+ ranked.recode! { |i| i.nil? ? nil : (i.quo(c)*100).to_f }
298
+ end
299
+
300
+ def vector_standardized_compute(m,sd)
301
+ if @data.respond_to? :vector_standardized_compute
302
+ @data.vector_standardized_compute(m,sd)
171
303
  else
172
- (sorted[(v - 0.5).round].to_f + sorted[(v + 0.5).round]).quo(2)
304
+ Daru::Vector.new @data.collect { |x| x.nil? ? nil : (x.to_f - m).quo(sd) },
305
+ index: index, name: name, dtype: dtype
306
+ end
307
+ end
308
+
309
+ def vector_centered_compute(m)
310
+ if @data.respond_to? :vector_centered_compute
311
+ @data.vector_centered_compute(m)
312
+ else
313
+ Daru::Vector.new @data.collect { |x| x.nil? ? nil : x.to_f-m },
314
+ index: index, name: name, dtype: dtype
315
+ end
316
+ end
317
+
318
+ # Returns a random sample of size n, with replacement,
319
+ # only with non-nil data.
320
+ #
321
+ # In all the trials, every item has the same probability
322
+ # of being selected.
323
+ def sample_with_replacement(sample=1)
324
+ if @data.respond_to? :sample_with_replacement
325
+ @data.sample_with_replacement sample
326
+ else
327
+ valid = missing_positions.empty? ? self : self.only_valid
328
+ vds = valid.size
329
+ (0...sample).collect{ valid[rand(vds)] }
330
+ end
331
+ end
332
+
333
+ # Returns a random sample of size n, without replacement,
334
+ # only with valid data.
335
+ #
336
+ # Every element could only be selected once.
337
+ #
338
+ # A sample of the same size of the vector is the vector itself.
339
+ def sample_without_replacement(sample=1)
340
+ if @data.respond_to? :sample_without_replacement
341
+ @data.sample_without_replacement sample
342
+ else
343
+ valid = missing_positions.empty? ? self : self.only_valid
344
+ raise ArgumentError, "Sample size couldn't be greater than n" if
345
+ sample > valid.size
346
+ out = []
347
+ size = valid.size
348
+ while out.size < sample
349
+ value = rand(size)
350
+ out.push(value) if !out.include?(value)
351
+ end
352
+
353
+ out.collect{|i| valid[i]}
173
354
  end
174
355
  end
175
356
 
@@ -182,6 +363,7 @@ module Daru
182
363
  alias :sd :standard_deviation_sample
183
364
  alias :ss :sum_of_squares
184
365
  alias :percentil :percentile
366
+ alias :se :standard_error
185
367
  end
186
368
  end
187
369
  end