daru 0.0.5 → 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (48) hide show
  1. checksums.yaml +4 -4
  2. data/.build.sh +14 -0
  3. data/.travis.yml +26 -4
  4. data/CONTRIBUTING.md +31 -0
  5. data/Gemfile +1 -2
  6. data/{History.txt → History.md} +110 -44
  7. data/README.md +21 -288
  8. data/Rakefile +1 -0
  9. data/daru.gemspec +12 -8
  10. data/lib/daru.rb +36 -1
  11. data/lib/daru/accessors/array_wrapper.rb +8 -3
  12. data/lib/daru/accessors/gsl_wrapper.rb +113 -0
  13. data/lib/daru/accessors/nmatrix_wrapper.rb +6 -17
  14. data/lib/daru/core/group_by.rb +0 -1
  15. data/lib/daru/dataframe.rb +1192 -83
  16. data/lib/daru/extensions/rserve.rb +21 -0
  17. data/lib/daru/index.rb +14 -0
  18. data/lib/daru/io/io.rb +170 -8
  19. data/lib/daru/maths/arithmetic/dataframe.rb +4 -3
  20. data/lib/daru/maths/arithmetic/vector.rb +4 -4
  21. data/lib/daru/maths/statistics/dataframe.rb +48 -27
  22. data/lib/daru/maths/statistics/vector.rb +215 -33
  23. data/lib/daru/monkeys.rb +53 -7
  24. data/lib/daru/multi_index.rb +21 -4
  25. data/lib/daru/plotting/dataframe.rb +83 -25
  26. data/lib/daru/plotting/vector.rb +9 -10
  27. data/lib/daru/vector.rb +596 -61
  28. data/lib/daru/version.rb +3 -0
  29. data/spec/accessors/wrappers_spec.rb +51 -0
  30. data/spec/core/group_by_spec.rb +0 -2
  31. data/spec/daru_spec.rb +58 -0
  32. data/spec/dataframe_spec.rb +768 -73
  33. data/spec/extensions/rserve_spec.rb +52 -0
  34. data/spec/fixtures/bank2.dat +200 -0
  35. data/spec/fixtures/repeated_fields.csv +7 -0
  36. data/spec/fixtures/scientific_notation.csv +4 -0
  37. data/spec/fixtures/test_xls.xls +0 -0
  38. data/spec/io/io_spec.rb +161 -24
  39. data/spec/math/arithmetic/dataframe_spec.rb +26 -7
  40. data/spec/math/arithmetic/vector_spec.rb +8 -0
  41. data/spec/math/statistics/dataframe_spec.rb +16 -1
  42. data/spec/math/statistics/vector_spec.rb +215 -47
  43. data/spec/spec_helper.rb +21 -2
  44. data/spec/vector_spec.rb +368 -12
  45. metadata +99 -16
  46. data/lib/version.rb +0 -3
  47. data/notebooks/grouping_splitting_pivots.ipynb +0 -529
  48. data/notebooks/intro_with_music_data_.ipynb +0 -303
@@ -0,0 +1,21 @@
1
+ # Support for converting data to R data structures to support rserve-client
2
+
3
+ module Daru
4
+ class DataFrame
5
+ def to_REXP
6
+ names = @vectors.to_a
7
+ data = names.map do |f|
8
+ Rserve::REXP::Wrapper.wrap(self[f].to_a)
9
+ end
10
+ l = Rserve::Rlist.new(data, names.map(&:to_s))
11
+
12
+ Rserve::REXP.create_data_frame(l)
13
+ end
14
+ end
15
+
16
+ class Vector
17
+ def to_REXP
18
+ Rserve::REXP::Wrapper.wrap(self.to_a)
19
+ end
20
+ end
21
+ end
@@ -101,8 +101,22 @@ module Daru
101
101
  @relation_hash.has_key? index
102
102
  end
103
103
 
104
+ def empty?
105
+ @relation_hash.empty?
106
+ end
107
+
104
108
  def dup
105
109
  Daru::Index.new @relation_hash.keys
106
110
  end
111
+
112
+ def _dump depth
113
+ Marshal.dump({relation_hash: @relation_hash})
114
+ end
115
+
116
+ def self._load data
117
+ h = Marshal.load data
118
+
119
+ Daru::Index.new(h[:relation_hash].keys, h[:relation_hash].values)
120
+ end
107
121
  end
108
122
  end
@@ -1,22 +1,184 @@
1
1
  module Daru
2
+ module IOHelpers
3
+ class << self
4
+ def process_row(row,empty)
5
+ row.to_a.map do |c|
6
+ if empty.include?(c)
7
+ nil
8
+ else
9
+ if c.is_a? String and c.is_number?
10
+ c =~ /^\d+$/ ? c.to_i : c.gsub(",",".").to_f
11
+ else
12
+ c
13
+ end
14
+ end
15
+ end
16
+ end
17
+ end
18
+ end
19
+
2
20
  module IO
3
21
  class << self
22
+ # Functions for loading/writing Excel files.
23
+
24
+ def from_excel path, opts={}
25
+ opts = {
26
+ :worksheet_id => 0,
27
+ }.merge opts
28
+
29
+ worksheet_id = opts[:worksheet_id]
30
+ book = Spreadsheet.open path
31
+ worksheet = book.worksheet worksheet_id
32
+ headers = worksheet.row(0).recode_repeated.map(&:to_sym)
33
+
34
+ df = Daru::DataFrame.new({})
35
+ headers.each_with_index do |h,i|
36
+ col = worksheet.column(i).to_a
37
+ col.delete_at 0
38
+ df[h] = col
39
+ end
40
+
41
+ df
42
+ end
43
+
44
+ def dataframe_write_excel dataframe, path, opts={}
45
+ book = Spreadsheet::Workbook.new
46
+ sheet = book.create_worksheet
47
+ format = Spreadsheet::Format.new :color => :blue, :weight => :bold
48
+
49
+ sheet.row(0).concat(dataframe.vectors.to_a.map(&:to_s)) # Unfreeze strings
50
+ sheet.row(0).default_format = format
51
+ i = 1
52
+ dataframe.each_row do |row|
53
+ sheet.row(i).concat(row.to_a)
54
+ i += 1
55
+ end
56
+
57
+ book.write(path)
58
+ end
59
+
60
+ # Functions for loading/writing CSV files
61
+
4
62
  def from_csv path, opts={}
5
63
  opts[:col_sep] ||= ','
6
- opts[:headers] ||= true
7
64
  opts[:converters] ||= :numeric
8
- opts[:header_converters] ||= :symbol
9
65
 
10
- csv = CSV.read(path, 'r', opts)
66
+ daru_options = opts.keys.inject({}) do |hash, k|
67
+ if [:clone, :order, :index, :name].include?(k)
68
+ hash[k] = opts[k]
69
+ opts.delete k
70
+ end
71
+
72
+ hash
73
+ end
74
+
75
+ # Preprocess headers for detecting and correcting repetition in
76
+ # case the :headers option is not specified.
77
+ unless opts[:headers]
78
+ csv = ::CSV.open(path, 'rb', opts)
79
+ yield csv if block_given?
80
+
81
+ csv_as_arrays = csv.to_a
82
+ headers = csv_as_arrays[0].recode_repeated.map(&:to_sym)
83
+ csv_as_arrays.delete_at 0
84
+ csv_as_arrays = csv_as_arrays.transpose
11
85
 
12
- yield csv if block_given?
86
+ hsh = {}
87
+ headers.each_with_index do |h, i|
88
+ hsh[h] = csv_as_arrays[i]
89
+ end
90
+ else
91
+ opts[:header_converters] ||= :symbol
92
+
93
+ csv = ::CSV.read(path, 'rb',opts)
94
+ yield csv if block_given?
13
95
 
14
- hsh = {}
15
- csv.by_col!.each do |col_name, values|
16
- hsh[col_name] = values
96
+ hsh = {}
97
+ csv.by_col.each do |col_name, values|
98
+ hsh[col_name] = values
99
+ end
17
100
  end
18
101
 
19
- Daru::DataFrame.new(hsh)
102
+ Daru::DataFrame.new(hsh,daru_options)
103
+ end
104
+
105
+ def dataframe_write_csv dataframe, path, opts={}
106
+ options = {
107
+ converters: :numeric
108
+ }.merge(opts)
109
+
110
+ writer = ::CSV.open(path, 'w', options)
111
+ writer << dataframe.vectors.to_a
112
+
113
+ dataframe.each_row do |row|
114
+ if options[:convert_comma]
115
+ writer << row.map { |v| v.to_s.gsub('.', ',') }
116
+ else
117
+ writer << row.to_a
118
+ end
119
+ end
120
+
121
+ writer.close
122
+ end
123
+
124
+ # Loading/writing from SQL databases
125
+
126
+ def from_sql dbh, query
127
+ require 'dbi'
128
+ sth = dbh.execute(query)
129
+ vectors = {}
130
+ fields = []
131
+ sth.column_info.each do |c|
132
+ vectors[c[:name]] = Daru::Vector.new([])
133
+ vectors[c[:name]].rename c[:name]
134
+ fields.push(c[:name].to_sym)
135
+ end
136
+ ds=Daru::DataFrame.new(vectors,order: fields)
137
+ sth.fetch do |row|
138
+ ds.add_row(row.to_a)
139
+ end
140
+ ds.update
141
+ ds
142
+ end
143
+
144
+ def dataframe_write_sql ds, dbh, table
145
+ require 'dbi'
146
+ query = "INSERT INTO #{table} ("+ds.vectors.to_a.join(",")+") VALUES ("+((["?"]*ds.vectors.size).join(","))+")"
147
+ sth = dbh.prepare(query)
148
+ ds.each_row { |c| sth.execute(*c.to_a) }
149
+ return true
150
+ end
151
+
152
+ # Loading data from plain text files
153
+
154
+ def from_plaintext filename, fields
155
+ ds = Daru::DataFrame.new({}, order: fields)
156
+ fp = File.open(filename,"r")
157
+ fp.each_line do |line|
158
+ row = Daru::IOHelpers.process_row(line.strip.split(/\s+/),[""])
159
+ next if row == ["\x1A"]
160
+ ds.add_row(row)
161
+ end
162
+ ds.update
163
+ fields.each { |f| ds[f].rename f }
164
+ ds
165
+ end
166
+
167
+ # Loading and writing Marshalled DataFrame/Vector
168
+ def save klass, filename
169
+ fp = File.open(filename, 'w')
170
+ Marshal.dump(klass, fp)
171
+ fp.close
172
+ end
173
+
174
+ def load filename
175
+ if File.exist? filename
176
+ o = false
177
+ File.open(filename, 'r') { |fp| o = Marshal.load(fp) }
178
+ o
179
+ else
180
+ false
181
+ end
20
182
  end
21
183
  end
22
184
  end
@@ -36,15 +36,16 @@ module Daru
36
36
 
37
37
  # Calculate exponential of all vectors with numeric values.
38
38
  def exp
39
- self.dup.map_vectors! { |v| v.exp if v.type == :numeric }
39
+ only_numerics(clone: false).recode { |v| v.exp }
40
40
  end
41
41
 
42
+ # Calculate square root of numeric vectors.
42
43
  def sqrt
43
- self.dup.map_vectors! { |v| v.sqrt if v.type == :numeric }
44
+ only_numerics(clone: false).recode { |v| v.sqrt }
44
45
  end
45
46
 
46
47
  def round precision=0
47
- self.dup.map_vectors! { |v| v.round(precision) if v.type == :numeric }
48
+ only_numerics(clone: false).recode { |v| v.round(precision) }
48
49
  end
49
50
  private
50
51
 
@@ -35,17 +35,17 @@ module Daru
35
35
  end
36
36
 
37
37
  def abs
38
- self.dup.map! { |e| e.abs unless e.nil? }
38
+ self.recode { |e| e.abs unless e.nil? }
39
39
  end
40
40
 
41
41
  def round precision=0
42
- self.dup.map! { |e| e.round(precision) unless e.nil? }
42
+ self.recode { |e| e.round(precision) unless e.nil? }
43
43
  end
44
44
 
45
45
  private
46
46
 
47
47
  def math_unary_op operation
48
- self.dup.map! { |e| Math.send(operation, e) unless e.nil? }
48
+ self.recode { |e| Math.send(operation, e) unless e.nil? }
49
49
  end
50
50
 
51
51
  def binary_op operation, other
@@ -65,7 +65,7 @@ module Daru
65
65
  def v2v_binary operation, other
66
66
  common_idxs = []
67
67
  elements = []
68
- index = (@index.to_a + other.index.to_a).uniq.sort
68
+ index = (@index.to_a | other.index.to_a).sort
69
69
 
70
70
  index.each do |idx|
71
71
  this = self[idx]
@@ -37,6 +37,15 @@ module Daru
37
37
  compute_stats :product
38
38
  end
39
39
 
40
+ def standardize
41
+ df = self.only_numerics clone: true
42
+ df.map! do |v|
43
+ v.standardize
44
+ end
45
+
46
+ df
47
+ end
48
+
40
49
  # Create a summary of mean, standard deviation, count, max and min of
41
50
  # each numeric vector in the dataframe in one shot.
42
51
  #
@@ -55,40 +64,39 @@ module Daru
55
64
  Daru::DataFrame.new(description_hash, index: methods)
56
65
  end
57
66
 
58
- # Calculate variance-covariance between the numeric vectors.
59
- #
60
- # == Arguments
61
- #
62
- # +for_sample_data+ - If set to false, will calculate the population
63
- # covariance (denominator N), otherwise calculates the sample covariance
64
- # matrix. Default to true.
65
- def covariance for_sample_data=true
66
- cov_arry =
67
- if defined? NMatrix and NMatrix.respond_to?(:cov)
68
- to_nmatrix.cov(for_sample_data).to_a
69
- else
70
- df_as_matrix = to_matrix
71
- denominator = for_sample_data ? rows - 1 : rows
72
- ones = Matrix.column_vector [1]*rows
73
- deviation_scores = df_as_matrix - (ones * ones.transpose * df_as_matrix) / rows
74
- ((deviation_scores.transpose * deviation_scores) / denominator).to_a
67
+ # Calculate sample variance-covariance between the numeric vectors.
68
+ def covariance
69
+ cache={}
70
+ vectors = self.numeric_vectors
71
+
72
+ mat_rows = vectors.collect do |row|
73
+ vectors.collect do |col|
74
+ if row == col
75
+ self[row].variance
76
+ else
77
+ if cache[[col,row]].nil?
78
+ cov = vector_cov(self[row],self[col])
79
+ cache[[row,col]] = cov
80
+ cov
81
+ else
82
+ cache[[col,row]]
83
+ end
84
+ end
85
+ end
75
86
  end
76
87
 
77
- Daru::DataFrame.rows(cov_arry, index: numeric_vectors, order: numeric_vectors)
88
+ Daru::DataFrame.rows(mat_rows, index: numeric_vectors, order: numeric_vectors)
78
89
  end
79
90
 
80
91
  alias :cov :covariance
81
92
 
82
93
  # Calculate the correlation between the numeric vectors.
83
94
  def correlation
84
- corr_arry =
85
- if defined? NMatrix and NMatrix.respond_to?(:corr)
86
- to_nmatrix.corr.to_a
87
- else
88
- standard_deviation = std.to_matrix
89
- (cov.to_matrix.elementwise_division(standard_deviation.transpose *
90
- standard_deviation)).to_a
91
- end
95
+ standard_deviation = std.to_matrix
96
+ corr_arry = (cov
97
+ .to_matrix
98
+ .elementwise_division(standard_deviation.transpose *
99
+ standard_deviation)).to_a
92
100
 
93
101
  Daru::DataFrame.rows(corr_arry, index: numeric_vectors, order: numeric_vectors)
94
102
  end
@@ -97,12 +105,25 @@ module Daru
97
105
 
98
106
  private
99
107
 
108
+ def vector_cov v1a, v2a
109
+ sum_of_squares(v1a,v2a) / (v1a.size - 1)
110
+ end
111
+
112
+ def sum_of_squares v1, v2
113
+ v1a,v2a = v1.only_valid ,v2.only_valid
114
+ v1a.reset_index!
115
+ v2a.reset_index!
116
+ m1 = v1a.mean
117
+ m2 = v2a.mean
118
+ (v1a.size).times.inject(0) {|ac,i| ac+(v1a[i]-m1)*(v2a[i]-m2)}
119
+ end
120
+
100
121
  def compute_stats method
101
122
  Daru::Vector.new(
102
123
  numeric_vectors.inject({}) do |hash, vec|
103
124
  hash[vec] = self[vec].send(method)
104
125
  hash
105
- end
126
+ end, name: method
106
127
  )
107
128
  end
108
129
  end
@@ -1,8 +1,8 @@
1
1
  module Daru
2
2
  module Maths
3
3
  # Encapsulates statistics methods for vectors. Most basic stuff like mean, etc.
4
- # is done inside the wrapper, so that native methods can be used for most of
5
- # the computationally intensive tasks.
4
+ # is done inside the wrapper, so that native methods can be used for most of
5
+ # the computationally intensive tasks.
6
6
  module Statistics
7
7
  module Vector
8
8
  def mean
@@ -26,7 +26,7 @@ module Daru
26
26
  end
27
27
 
28
28
  def median
29
- percentile 50
29
+ @data.respond_to?(:median) ? @data.median : percentile(50)
30
30
  end
31
31
 
32
32
  def mode
@@ -36,15 +36,21 @@ module Daru
36
36
 
37
37
  def median_absolute_deviation
38
38
  m = median
39
- map {|val| (val - m).abs }.median
39
+ recode {|val| (val - m).abs }.median
40
40
  end
41
+ alias :mad :median_absolute_deviation
41
42
 
42
43
  def standard_error
43
- standard_deviation_sample/(Math::sqrt((@size - @nil_positions.size)))
44
+ standard_deviation_sample/(Math::sqrt((n_valid)))
44
45
  end
45
46
 
46
47
  def sum_of_squared_deviation
47
- (@data.to_a.inject(0) { |a,x| x.square + a } - (sum.square.quo((@size - @nil_positions.size)))).to_f
48
+ (@data.inject(0) { |a,x| x.square + a } - (sum.square.quo(n_valid)).to_f).to_f
49
+ end
50
+
51
+ # Retrieve unique values of non-nil data
52
+ def factors
53
+ only_valid.uniq.reset_index!
48
54
  end
49
55
 
50
56
  # Maximum element of the vector.
@@ -69,12 +75,18 @@ module Daru
69
75
 
70
76
  def frequencies
71
77
  @data.inject({}) do |hash, element|
72
- hash[element] ||= 0
73
- hash[element] += 1
78
+ unless element.nil?
79
+ hash[element] ||= 0
80
+ hash[element] += 1
81
+ end
74
82
  hash
75
83
  end
76
84
  end
77
85
 
86
+ def freqs
87
+ Daru::Vector.new(frequencies)
88
+ end
89
+
78
90
  def proportions
79
91
  len = n_valid
80
92
  frequencies.inject({}) { |hash, arr| hash[arr[0]] = arr[1] / len; hash }
@@ -83,13 +95,12 @@ module Daru
83
95
  def ranked
84
96
  sum = 0
85
97
  r = frequencies.sort.inject( {} ) do |memo, val|
86
- memo[val[0]] = ((sum + 1) + (sum + val[1])) / 2
98
+ memo[val[0]] = ((sum + 1) + (sum + val[1])).quo(2)
87
99
  sum += val[1]
88
100
  memo
89
101
  end
90
102
 
91
- Daru::Vector.new @data.map { |e| r[e] }, index: self.index,
92
- name: self.name, dtype: self.dtype, nm_dtype: self.nm_dtype
103
+ recode { |e| r[e] }
93
104
  end
94
105
 
95
106
  def coefficient_of_variation
@@ -107,69 +118,239 @@ module Daru
107
118
  val = frequencies[value]
108
119
  val.nil? ? 0 : val
109
120
  else
110
- size - @nil_positions.size
121
+ size - @missing_positions.size
111
122
  end
112
123
  end
113
124
 
114
125
  def proportion value=1
115
- frequencies[value] / n_valid
126
+ frequencies[value].quo(n_valid).to_f
116
127
  end
117
128
 
118
129
  # Sample variance with denominator (N-1)
119
130
  def variance_sample m=nil
120
131
  m ||= self.mean
121
- sum_of_squares(m).quo((@size - @nil_positions.size) - 1)
132
+ if @data.respond_to? :variance_sample
133
+ @data.variance_sample m
134
+ else
135
+ sum_of_squares(m).quo((n_valid) - 1)
136
+ end
122
137
  end
123
138
 
124
139
  # Population variance with denominator (N)
125
140
  def variance_population m=nil
126
141
  m ||= mean
127
- sum_of_squares(m).quo((@size - @nil_positions.size)).to_f
142
+ if @data.respond_to? :variance_population
143
+ @data.variance_population m
144
+ else
145
+ sum_of_squares(m).quo((n_valid)).to_f
146
+ end
128
147
  end
129
148
 
130
149
  def sum_of_squares(m=nil)
131
150
  m ||= mean
132
- @data.inject(0) { |memo, val| memo + (val - m)**2 }
151
+ @data.inject(0) { |memo, val|
152
+ @missing_values.has_key?(val) ? memo : (memo + (val - m)**2)
153
+ }
133
154
  end
134
155
 
135
156
  def standard_deviation_population m=nil
136
157
  m ||= mean
137
- Math::sqrt(variance_population(m))
158
+ if @data.respond_to? :standard_deviation_population
159
+ @data.standard_deviation_population(m)
160
+ else
161
+ Math::sqrt(variance_population(m))
162
+ end
138
163
  end
139
164
 
140
165
  def standard_deviation_sample m=nil
141
- Math::sqrt(variance_sample(m))
166
+ m ||= mean
167
+ if @data.respond_to? :standard_deviation_sample
168
+ @data.standard_deviation_sample m
169
+ else
170
+ Math::sqrt(variance_sample(m))
171
+ end
142
172
  end
143
173
 
144
174
  # Calculate skewness using (sigma(xi - mean)^3)/((N)*std_dev_sample^3)
145
175
  def skew m=nil
146
- m ||= mean
147
- th = @data.inject(0) { |memo, val| memo + ((val - m)**3) }
148
- th.quo ((@size - @nil_positions.size) * (standard_deviation_sample(m)**3))
176
+ if @data.respond_to? :skew
177
+ @data.skew
178
+ else
179
+ m ||= mean
180
+ th = @data.inject(0) { |memo, val| memo + ((val - m)**3) }
181
+ th.quo ((@size - @missing_positions.size) * (standard_deviation_sample(m)**3))
182
+ end
149
183
  end
150
184
 
151
185
  def kurtosis m=nil
152
- m ||= mean
153
- fo = @data.inject(0){ |a, x| a + ((x - m) ** 4) }
154
- fo.quo((@size - @nil_positions.size) * standard_deviation_sample(m) ** 4) - 3
186
+ if @data.respond_to? :kurtosis
187
+ @data.kurtosis
188
+ else
189
+ m ||= mean
190
+ fo = @data.inject(0){ |a, x| a + ((x - m) ** 4) }
191
+ fo.quo((@size - @missing_positions.size) * standard_deviation_sample(m) ** 4) - 3
192
+ end
155
193
  end
156
194
 
157
195
  def average_deviation_population m=nil
196
+ type == :numeric or raise TypeError, "Vector must be numeric"
197
+ m ||= mean
198
+ (@data.inject( 0 ) { |memo, val|
199
+ @missing_values.has_key?(val) ? memo : ( val - m ).abs + memo
200
+ }).quo( n_valid )
201
+ end
202
+
203
+ # Returns the value of the percentile q
204
+ #
205
+ # Accepts an optional second argument specifying the strategy to interpolate
206
+ # when the requested percentile lies between two data points a and b
207
+ # Valid strategies are:
208
+ # * :midpoint (Default): (a + b) / 2
209
+ # * :linear : a + (b - a) * d where d is the decimal part of the index between a and b.
210
+ # == References
211
+ #
212
+ # This is the NIST recommended method (http://en.wikipedia.org/wiki/Percentile#NIST_method)
213
+ def percentile(q, strategy = :midpoint)
214
+ sorted = only_valid(:array).sort
215
+
216
+ case strategy
217
+ when :midpoint
218
+ v = (n_valid * q).quo(100)
219
+ if(v.to_i!=v)
220
+ sorted[v.to_i]
221
+ else
222
+ (sorted[(v-0.5).to_i].to_f + sorted[(v+0.5).to_i]).quo(2)
223
+ end
224
+ when :linear
225
+ index = (q / 100.0) * (n_valid + 1)
226
+
227
+ k = index.truncate
228
+ d = index % 1
229
+
230
+ if k == 0
231
+ sorted[0]
232
+ elsif k >= sorted.size
233
+ sorted[-1]
234
+ else
235
+ sorted[k - 1] + d * (sorted[k] - sorted[k - 1])
236
+ end
237
+ else
238
+ raise NotImplementedError.new "Unknown strategy #{strategy.to_s}"
239
+ end
240
+ end
241
+
242
+ # Dichotomize the vector with 0 and 1, based on lowest value.
243
+ # If parameter is defined, this value and lower will be 0
244
+ # and higher, 1.
245
+ def dichotomize(low = nil)
246
+ low ||= factors.min
247
+
248
+ self.recode do |x|
249
+ if x.nil?
250
+ nil
251
+ elsif x > low
252
+ 1
253
+ else
254
+ 0
255
+ end
256
+ end
257
+ end
258
+
259
+ # Center data by subtracting the mean from each non-nil value.
260
+ def center
261
+ self - mean
262
+ end
263
+
264
+ # Standardize data.
265
+ #
266
+ # == Arguments
267
+ #
268
+ # * use_population - Pass as *true* if you want to use population
269
+ # standard deviation instead of sample standard deviation.
270
+ def standardize use_population=false
158
271
  m ||= mean
159
- (@data.inject(0) {|memo, val| val + (val - m).abs }) / n_valid
272
+ sd = use_population ? sdp : sds
273
+ return Daru::Vector.new([nil]*@size) if m.nil? or sd == 0.0
274
+
275
+ vector_standardized_compute m, sd
160
276
  end
161
277
 
162
- def recode!(&block)
163
- @data.recode!(&block)
278
+ def box_cox_transformation lambda # :nodoc:
279
+ raise "Should be a numeric" unless @type == :numeric
280
+
281
+ self.recode do |x|
282
+ if !x.nil?
283
+ if(lambda == 0)
284
+ Math.log(x)
285
+ else
286
+ (x ** lambda - 1).quo(lambda)
287
+ end
288
+ else
289
+ nil
290
+ end
291
+ end
164
292
  end
165
293
 
166
- def percentile percent
167
- sorted = @data.sort
168
- v = (n_valid * percent).quo(100)
169
- if v.to_i != v
170
- sorted[v.round]
294
+ # Replace each non-nil value in the vector with its percentile.
295
+ def vector_percentile
296
+ c = size - missing_positions.size
297
+ ranked.recode! { |i| i.nil? ? nil : (i.quo(c)*100).to_f }
298
+ end
299
+
300
+ def vector_standardized_compute(m,sd)
301
+ if @data.respond_to? :vector_standardized_compute
302
+ @data.vector_standardized_compute(m,sd)
171
303
  else
172
- (sorted[(v - 0.5).round].to_f + sorted[(v + 0.5).round]).quo(2)
304
+ Daru::Vector.new @data.collect { |x| x.nil? ? nil : (x.to_f - m).quo(sd) },
305
+ index: index, name: name, dtype: dtype
306
+ end
307
+ end
308
+
309
+ def vector_centered_compute(m)
310
+ if @data.respond_to? :vector_centered_compute
311
+ @data.vector_centered_compute(m)
312
+ else
313
+ Daru::Vector.new @data.collect { |x| x.nil? ? nil : x.to_f-m },
314
+ index: index, name: name, dtype: dtype
315
+ end
316
+ end
317
+
318
+ # Returns a random sample of size n, with replacement,
319
+ # only with non-nil data.
320
+ #
321
+ # In all the trials, every item has the same probability
322
+ # of being selected.
323
+ def sample_with_replacement(sample=1)
324
+ if @data.respond_to? :sample_with_replacement
325
+ @data.sample_with_replacement sample
326
+ else
327
+ valid = missing_positions.empty? ? self : self.only_valid
328
+ vds = valid.size
329
+ (0...sample).collect{ valid[rand(vds)] }
330
+ end
331
+ end
332
+
333
+ # Returns a random sample of size n, without replacement,
334
+ # only with valid data.
335
+ #
336
+ # Every element could only be selected once.
337
+ #
338
+ # A sample of the same size of the vector is the vector itself.
339
+ def sample_without_replacement(sample=1)
340
+ if @data.respond_to? :sample_without_replacement
341
+ @data.sample_without_replacement sample
342
+ else
343
+ valid = missing_positions.empty? ? self : self.only_valid
344
+ raise ArgumentError, "Sample size couldn't be greater than n" if
345
+ sample > valid.size
346
+ out = []
347
+ size = valid.size
348
+ while out.size < sample
349
+ value = rand(size)
350
+ out.push(value) if !out.include?(value)
351
+ end
352
+
353
+ out.collect{|i| valid[i]}
173
354
  end
174
355
  end
175
356
 
@@ -182,6 +363,7 @@ module Daru
182
363
  alias :sd :standard_deviation_sample
183
364
  alias :ss :sum_of_squares
184
365
  alias :percentil :percentile
366
+ alias :se :standard_error
185
367
  end
186
368
  end
187
369
  end