daru 0.0.5 → 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (48) hide show
  1. checksums.yaml +4 -4
  2. data/.build.sh +14 -0
  3. data/.travis.yml +26 -4
  4. data/CONTRIBUTING.md +31 -0
  5. data/Gemfile +1 -2
  6. data/{History.txt → History.md} +110 -44
  7. data/README.md +21 -288
  8. data/Rakefile +1 -0
  9. data/daru.gemspec +12 -8
  10. data/lib/daru.rb +36 -1
  11. data/lib/daru/accessors/array_wrapper.rb +8 -3
  12. data/lib/daru/accessors/gsl_wrapper.rb +113 -0
  13. data/lib/daru/accessors/nmatrix_wrapper.rb +6 -17
  14. data/lib/daru/core/group_by.rb +0 -1
  15. data/lib/daru/dataframe.rb +1192 -83
  16. data/lib/daru/extensions/rserve.rb +21 -0
  17. data/lib/daru/index.rb +14 -0
  18. data/lib/daru/io/io.rb +170 -8
  19. data/lib/daru/maths/arithmetic/dataframe.rb +4 -3
  20. data/lib/daru/maths/arithmetic/vector.rb +4 -4
  21. data/lib/daru/maths/statistics/dataframe.rb +48 -27
  22. data/lib/daru/maths/statistics/vector.rb +215 -33
  23. data/lib/daru/monkeys.rb +53 -7
  24. data/lib/daru/multi_index.rb +21 -4
  25. data/lib/daru/plotting/dataframe.rb +83 -25
  26. data/lib/daru/plotting/vector.rb +9 -10
  27. data/lib/daru/vector.rb +596 -61
  28. data/lib/daru/version.rb +3 -0
  29. data/spec/accessors/wrappers_spec.rb +51 -0
  30. data/spec/core/group_by_spec.rb +0 -2
  31. data/spec/daru_spec.rb +58 -0
  32. data/spec/dataframe_spec.rb +768 -73
  33. data/spec/extensions/rserve_spec.rb +52 -0
  34. data/spec/fixtures/bank2.dat +200 -0
  35. data/spec/fixtures/repeated_fields.csv +7 -0
  36. data/spec/fixtures/scientific_notation.csv +4 -0
  37. data/spec/fixtures/test_xls.xls +0 -0
  38. data/spec/io/io_spec.rb +161 -24
  39. data/spec/math/arithmetic/dataframe_spec.rb +26 -7
  40. data/spec/math/arithmetic/vector_spec.rb +8 -0
  41. data/spec/math/statistics/dataframe_spec.rb +16 -1
  42. data/spec/math/statistics/vector_spec.rb +215 -47
  43. data/spec/spec_helper.rb +21 -2
  44. data/spec/vector_spec.rb +368 -12
  45. metadata +99 -16
  46. data/lib/version.rb +0 -3
  47. data/notebooks/grouping_splitting_pivots.ipynb +0 -529
  48. data/notebooks/intro_with_music_data_.ipynb +0 -303
@@ -1,5 +1,41 @@
1
1
  class Array
2
- def daru_vector name=nil, index=nil, dtype=Array
2
# Recode repeated values of an array by appending a running occurrence
# number to every value that appears more than once. Values that occur
# only once are left untouched.
#
# Returns +self+ (the very same object) when the array has no repeated
# values, otherwise a new Array.
#
# Example:
#   a = %w{a b c c d d d e}
#   a.recode_repeated
#   # => ["a","b","c_1","c_2","d_1","d_2","d_3","e"]
def recode_repeated
  return self if size == uniq.size

  # One pass to count occurrences; a Hash lookup replaces the former
  # Array#include? scan that made the method quadratic.
  counts = each_with_object(Hash.new(0)) { |v, acc| acc[v] += 1 }
  seen   = Hash.new(0)

  map do |v|
    next v unless counts[v] > 1

    seen[v] += 1
    format('%s_%d', v, seen[v])
  end
end
37
+
38
+ def daru_vector name=nil, index=nil, dtype=:array
3
39
  Daru::Vector.new self, name: name, index: index, dtype: dtype
4
40
  end
5
41
 
@@ -11,8 +47,8 @@ class Array
11
47
  end
12
48
 
13
49
  class Range
14
- def daru_vector name=nil, index=nil, dtype=Array
15
- Daru::Vector.new self, name: name, index: index, dtype: Array
50
+ def daru_vector name=nil, index=nil, dtype=:array
51
+ Daru::Vector.new self, name: name, index: index, dtype: dtype
16
52
  end
17
53
 
18
54
  alias_method :dv, :daru_vector
@@ -23,8 +59,8 @@ class Range
23
59
  end
24
60
 
25
61
  class Hash
26
- def daru_vector index=nil, dtype=Array
27
- Daru::Vector.new self.values[0], name: self.keys[0], index: index, dtype: Array
62
+ def daru_vector index=nil, dtype=:array
63
+ Daru::Vector.new self.values[0], name: self.keys[0], index: index, dtype: dtype
28
64
  end
29
65
 
30
66
  alias_method :dv, :daru_vector
@@ -32,7 +68,7 @@ end
32
68
 
33
69
  class NMatrix
34
70
  def daru_vector name=nil, index=nil, dtype=NMatrix
35
- Daru::Vector.new self, name: name, index: index, dtype: NMatrix
71
+ Daru::Vector.new self, name: name, index: index, dtype: :nmatrix
36
72
  end
37
73
 
38
74
  alias_method :dv, :daru_vector
@@ -40,7 +76,7 @@ end
40
76
 
41
77
  class MDArray
42
78
  def daru_vector name=nil, index=nil, dtype=MDArray
43
- Daru::Vector.new self, name: name, index: index, dtype: MDArray
79
+ Daru::Vector.new self, name: name, index: index, dtype: :mdarray
44
80
  end
45
81
 
46
82
  alias_method :dv, :daru_vector
@@ -58,4 +94,14 @@ class Matrix
58
94
  e / other.to_a.flatten[index]
59
95
  end
60
96
  end
97
+ end
98
+
99
+ class String
100
# Returns true when the whole string looks like a number: an optional
# leading minus, digits, an optional '.' or ',' decimal separator with
# further digits, and an optional exponent such as "e-3".
#
# The pattern is anchored with \A and \z so that the *entire* string must
# match; with ^ and $ a multi-line string such as "abc\n42" was reported
# as numeric because a single line of it matched.
def is_number?
  !(self =~ /\A-?\d+[,.]?\d*(e-?\d+)?\z/).nil?
end
61
107
  end
@@ -17,12 +17,12 @@ module Daru
17
17
  attr_reader :values
18
18
 
19
19
  # Initialize a MultiIndex by passing a tuple of indexes. The order assigned
20
- # to the multi index corresponds to the position of the tuple in the array
21
- # of tuples.
20
+ # to the multi index corresponds to the position of the tuple in the array
21
+ # of tuples.
22
22
  #
23
23
  # Although you can create your own hierarchially indexed Vectors and DataFrames,
24
- # this class currently contains minimal error checking and is mainly used
25
- # internally for summarizing, splitting and grouping of data.
24
+ # this class currently contains minimal error checking and is mainly used
25
+ # internally for summarizing, splitting and grouping of data.
26
26
  #
27
27
  # == Arguments
28
28
  #
@@ -68,6 +68,19 @@ module Daru
68
68
  end
69
69
  end
70
70
 
71
# Returns a new MultiIndex made of the tuples of this index followed by
# the tuple +other+ (nested arrays are flattened first).
#
# The argument is flattened into a copy, so the caller's array is never
# modified.
#
# Raises ArgumentError when the flattened tuple does not have the same
# number of levels as every tuple already present.
def + other
  tuple  = other.flatten
  tuples = to_a
  raise ArgumentError, "Incomplete tuple #{tuple}" unless
    tuples.all? { |t| t.size == tuple.size }

  Daru::MultiIndex.new(tuples << tuple)
end
79
+
80
+ def empty?
81
+ @relation_hash.empty?
82
+ end
83
+
71
84
  # Compare two MultiIndex objects for equality based on the contents of their
72
85
  # relation hashes. Does not take object_id into account.
73
86
  def == other
@@ -108,6 +121,10 @@ module Daru
108
121
  tuple.empty? ? nil : tuple
109
122
  end
110
123
 
124
+ def size
125
+ to_a.size
126
+ end
127
+
111
128
  private
112
129
 
113
130
  # Deep compare two hashes
@@ -1,9 +1,3 @@
1
- begin
2
- require 'nyaplot'
3
- rescue LoadError => e
4
- puts "#{e}"
5
- end
6
-
7
1
  module Daru
8
2
  module Plotting
9
3
  module DataFrame
@@ -12,36 +6,100 @@ module Daru
12
6
  # to the block, if it is specified. See the nyaplot docs for info on how to
13
7
  # further use these objects.
14
8
  #
9
+ # Detailed instructions on use of the plotting API can be found in the
10
+ # notebooks whose links you can find in the README.
11
+ #
15
12
  # == Options
16
- # +:type+ - Type of plot (scatter, bar, histogram)
17
- # +:legends+ - The names of the vectors that are to be used as X and Y axes.
18
- # The vectors names must be specified as symbols inside an Array. They
19
- # also should be specified in the right order. For example, passing [:a, :b]
20
- # will keep vector :a as the X axis and :b as the Y axis. Passing [:a]
21
- # keep :a as the X axis and plot the frequency with which :a appears
22
- # on the Y axis.
23
- # +:frame+ - Pass this as *true* to disable plotting the graph directly
24
- # and instead manually create Nyaplot::Frame object inside the block using
25
- # the Nyaplot::Plot object for plotting one or many graphs in a frame.
13
+ #
14
+ # * +:type+ - Type of plot. Can be :scatter, :bar, :histogram, :line or :box.
15
+ # * +:x+ - Vector to be used for X co-ordinates.
16
+ # * +:y+ - Vector to be used for Y co-ordinates.
26
17
  #
27
18
  # == Usage
28
- # df = Daru::DataFrame.new({a:[0,1,2,3,4], b:[10,20,30,40,50]})
29
- # df.plot legends: [:a, :b], type: :bar
19
+ # # Simple bar chart
20
+ # df = Daru::DataFrame.new({a:['A', 'B', 'C', 'D', 'E'], b:[10,20,30,40,50]})
21
+ # df.plot type: :bar, x: :a, y: :b
30
22
  def plot opts={}
31
23
  options = {
32
- type: :scatter,
33
- frame: false,
34
- legends: []
24
+ type: :scatter
35
25
  }.merge(opts)
36
26
 
37
27
  plot = Nyaplot::Plot.new
38
- diagram = plot.add_with_df(Nyaplot::DataFrame.new(self.to_a[0]),
39
- options[:type], *options[:legends])
28
+ types = extract_option :type, options
29
+
30
+ diagram =
31
+ case
32
+ when !([:scatter, :bar, :line, :histogram] & types).empty?
33
+ if single_diagram? options
34
+ add_single_diagram plot, options
35
+ else
36
+ add_multiple_diagrams plot, options
37
+ end
38
+ when types.include?(:box)
39
+ numeric = self.only_numerics(clone: false).dup_only_valid
40
+
41
+ plot.add_with_df(
42
+ numeric.to_nyaplotdf,
43
+ :box, *numeric.vectors.to_a)
44
+ end
40
45
 
41
46
  yield(plot, diagram) if block_given?
42
47
 
43
- plot.show unless options[:frame]
48
+ plot.show
44
49
  end
50
+
51
+ private
52
+
53
+ def single_diagram? options
54
+ options[:x] and options[:x].is_a?(Symbol)
55
+ end
56
+
57
+ def add_single_diagram plot, options
58
+ args = [
59
+ self.to_nyaplotdf,
60
+ options[:type],
61
+ options[:x]
62
+ ]
63
+
64
+ args << options[:y] if(options[:y])
65
+
66
+ plot.add_with_df(*args)
67
+ end
68
+
69
+ def add_multiple_diagrams plot, options
70
+ types = extract_option :type, options
71
+ x_vecs = extract_option :x, options
72
+ y_vecs = extract_option :y, options
73
+
74
+ diagrams = []
75
+ nyaplot_df = self.to_nyaplotdf
76
+ total = x_vecs.size
77
+ types = types.size < total ? types*total : types
78
+
79
+
80
+ (0...total).each do |i|
81
+ diagrams << plot.add_with_df(
82
+ nyaplot_df,
83
+ types[i],
84
+ x_vecs[i],
85
+ y_vecs[i]
86
+ )
87
+ end
88
+
89
+ diagrams
90
+ end
91
+
92
# Fetch the value(s) of the plot option +opt+ from +options+ as an Array.
#
# * If +options+ has a truthy value under +opt+ itself, that value is
#   returned (wrapped in an Array unless it already is one).
# * Otherwise every key whose name starts with +opt+ (e.g. :x1, :x2 for
#   :x) is collected and the matching values are returned ordered by key,
#   so that :x1/:y1, :x2/:y2 ... line up regardless of the order in which
#   they were written in the options hash.
def extract_option opt, options
  direct = options[opt]
  return(direct.is_a?(Array) ? direct : [direct]) if direct

  prefix = opt.to_s
  # The sorted keys must actually be used; previously the result of #sort
  # was thrown away and values came back in insertion order.
  options.keys
         .select { |k| k.to_s.start_with?(prefix) }
         .sort
         .map { |k| options[k] }
end
102
+
45
103
  end
46
104
  end
47
- end
105
+ end if Daru.has_nyaplot?
@@ -1,9 +1,3 @@
1
- begin
2
- require 'nyaplot'
3
- rescue LoadError => e
4
- puts "#{e}"
5
- end
6
-
7
1
  module Daru
8
2
  module Plotting
9
3
  module Vector
@@ -27,9 +21,14 @@ module Daru
27
21
  type: :scatter
28
22
  }.merge(opts)
29
23
 
30
- x_axis = options[:type] == :scatter ? Array.new(@size) { |i| i } : @index.to_a
31
- plot = Nyaplot::Plot.new
32
- diagram = plot.add( options[:type], x_axis, @data.to_a )
24
+ x_axis = options[:type] == :scatter ? Array.new(@size) { |i| i } : @index.to_a
25
+ plot = Nyaplot::Plot.new
26
+ diagram =
27
+ if [:box, :histogram].include? options[:type]
28
+ plot.add(options[:type], @data.to_a)
29
+ else
30
+ plot.add(options[:type], x_axis, @data.to_a)
31
+ end
33
32
 
34
33
  yield plot, diagram if block_given?
35
34
 
@@ -37,4 +36,4 @@ module Daru
37
36
  end
38
37
  end
39
38
  end
40
- end
39
+ end if Daru.has_nyaplot?
@@ -5,13 +5,14 @@ require 'maths/statistics/vector.rb'
5
5
  require 'plotting/vector.rb'
6
6
  require 'accessors/array_wrapper.rb'
7
7
  require 'accessors/nmatrix_wrapper.rb'
8
+ require 'accessors/gsl_wrapper.rb'
8
9
 
9
10
  module Daru
10
11
  class Vector
11
12
  include Enumerable
12
13
  include Daru::Maths::Arithmetic::Vector
13
14
  include Daru::Maths::Statistics::Vector
14
- include Daru::Plotting::Vector
15
+ include Daru::Plotting::Vector if Daru.has_nyaplot?
15
16
 
16
17
  def each(&block)
17
18
  return to_enum(:each) unless block_given?
@@ -20,37 +21,55 @@ module Daru
20
21
  self
21
22
  end
22
23
 
23
- def map!(&block)
24
- return to_enum(:map!) unless block_given?
24
+ def each_index(&block)
25
+ return to_enum(:each_index) unless block_given?
25
26
 
26
- @data.map!(&block)
27
+ @index.each(&block)
27
28
  self
28
29
  end
29
30
 
30
- def map(&block)
31
- return to_enum(:map) unless block_given?
31
+ def each_with_index(&block)
32
+ return to_enum(:each_with_index) unless block_given?
32
33
 
33
- Daru::Vector.new @data.map(&block), name: @name, index: @index, dtype: @dtype
34
+ @index.each { |i| yield(self[i], i) }
35
+ self
34
36
  end
35
37
 
36
- alias_method :recode, :map
38
+ def map!(&block)
39
+ return to_enum(:map!) unless block_given?
40
+ @data.map!(&block)
41
+ update
42
+ self
43
+ end
37
44
 
45
+ # The name of the Daru::Vector. String.
38
46
  attr_reader :name
47
+ # The row index. Can be either Daru::Index or Daru::MultiIndex.
39
48
  attr_reader :index
49
+ # The total number of elements of the vector.
40
50
  attr_reader :size
51
+ # The underlying dtype of the Vector. Can be either :array, :nmatrix or :gsl.
41
52
  attr_reader :dtype
53
+ # If the dtype is :nmatrix, this attribute represents the data type of the
54
+ # underlying NMatrix object. See NMatrix docs for more details on NMatrix
55
+ # data types.
42
56
  attr_reader :nm_dtype
43
- attr_reader :nil_positions
57
+ # An Array of the positions in the vector that are being treated as 'missing'.
58
+ attr_reader :missing_positions
59
+ # Store a hash of labels for values. Supplementary only. Recommend using index
60
+ # for proper usage.
61
+ attr_accessor :labels
44
62
 
45
63
  # Create a Vector object.
64
+ #
46
65
  # == Arguments
47
66
  #
48
- # @param source[Array,Hash] - Supply elements in the form of an Array or a Hash. If Array, a
49
- # numeric index will be created if not supplied in the options. Specifying more
50
- # index elements than actual values in *source* will insert *nil* into the
51
- # surplus index elements. When a Hash is specified, the keys of the Hash are
52
- # taken as the index elements and the corresponding values as the values that
53
- # populate the vector.
67
+ # @param source[Array,Hash] - Supply elements in the form of an Array or a
68
+ # Hash. If Array, a numeric index will be created if not supplied in the
69
+ # options. Specifying more index elements than actual values in *source*
70
+ # will insert *nil* into the surplus index elements. When a Hash is specified,
71
+ # the keys of the Hash are taken as the index elements and the corresponding
72
+ # values as the values that populate the vector.
54
73
  #
55
74
  # == Options
56
75
  #
@@ -58,10 +77,14 @@ module Daru
58
77
  #
59
78
  # * +:index+ - Index of the vector
60
79
  #
61
- # * +:dtype+ - The underlying data type. Can be :array or :nmatrix. Default :array.
80
+ # * +:dtype+ - The underlying data type. Can be :array, :nmatrix or :gsl.
81
+ # Default :array.
62
82
  #
63
83
  # * +:nm_dtype+ - For NMatrix, the data type of the numbers. See the NMatrix docs for
64
- # further information on supported data type.
84
+ # further information on supported data type.
85
+ #
86
+ # * +:missing_values+ - An Array of the values that are to be treated as 'missing'.
87
+ # nil is the default missing value.
65
88
  #
66
89
  # == Usage
67
90
  #
@@ -79,7 +102,7 @@ module Daru
79
102
  name = opts[:name]
80
103
  set_name name
81
104
 
82
- @data = cast_vector_to(opts[:dtype], source, opts[:nm_dtype])
105
+ @data = cast_vector_to(opts[:dtype] || :array, source, opts[:nm_dtype])
83
106
  @index = create_index(index || @data.size)
84
107
 
85
108
  if @index.size > @data.size
@@ -90,10 +113,81 @@ module Daru
90
113
  end
91
114
 
92
115
  @possibly_changed_type = true
93
- set_nil_positions
116
+ set_missing_values opts[:missing_values]
117
+ set_missing_positions
94
118
  set_size
95
119
  end
96
120
 
121
+ # Create a new vector by specifying the size and an optional value
122
+ # and block to generate values.
123
+ #
124
+ # == Description
125
+ #
126
+ # The *new_with_size* class method lets you create a Daru::Vector
127
+ # by specifying the size as the argument. The optional block, if
128
+ # supplied, is run once for populating each element in the Vector.
129
+ #
130
+ # The result of each run of the block is the value that is ultimately
131
+ # assigned to that position in the Vector.
132
+ #
133
+ # == Options
134
+ # :value
135
+ # All the rest like .new
136
# Builds a Daru::Vector of +n+ elements. Each element is the result of
# calling the block with its position, or the +:value+ option when no
# block is given. All remaining options are forwarded to Daru::Vector.new.
#
# The +:value+ key is removed from a copy of +opts+, so the hash passed by
# the caller is left untouched and can be reused.
def self.new_with_size n, opts={}, &block
  opts  = opts.dup
  value = opts.delete :value

  data = block ? Array.new(n) { |i| block.call(i) } : Array.new(n) { value }
  Daru::Vector.new data, opts
end
146
+
147
+ # Create a vector using (almost) any object
148
+ # * Array: flattened
149
+ # * Range: transformed using to_a
150
+ # * Daru::Vector
151
+ # * Numeric and string values
152
+ #
153
+ # == Description
154
+ #
155
+ # The `Vector.[]` class method creates a vector from almost any
156
+ # object that has a `#to_a` method defined on it. It is similar
157
+ # to R's `c` method.
158
+ #
159
+ # == Usage
160
+ #
161
+ # a = Daru::Vector[1,2,3,4,6..10]
162
+ # #=>
163
+ # # <Daru::Vector:99448510 @name = nil @size = 9 >
164
+ # # nil
165
+ # # 0 1
166
+ # # 1 2
167
+ # # 2 3
168
+ # # 3 4
169
+ # # 4 6
170
+ # # 5 7
171
+ # # 6 8
172
+ # # 7 9
173
+ # # 8 10
174
+ def self.[](*args)
175
+ values = []
176
+ args.each do |a|
177
+ case a
178
+ when Array
179
+ values.concat a.flatten
180
+ when Daru::Vector
181
+ values.concat a.to_a
182
+ when Range
183
+ values.concat a.to_a
184
+ else
185
+ values << a
186
+ end
187
+ end
188
+ Daru::Vector.new(values)
189
+ end
190
+
97
191
  # Get one or more elements with specified index or a range.
98
192
  #
99
193
  # == Usage
@@ -106,6 +200,7 @@ module Daru
106
200
  # # For vectors employing hierarchial multi index
107
201
  #
108
202
  def [](*indexes)
203
+ indexes.map! { |e| e.respond_to?(:to_sym) ? e.to_sym : e }
109
204
  location = indexes[0]
110
205
  if @index.is_a?(MultiIndex)
111
206
  result =
@@ -158,6 +253,19 @@ module Daru
158
253
  end
159
254
  end
160
255
 
256
+ # Just like in Hashes, you can specify the index label of the Daru::Vector
257
+ # and assign an element at that place in the Daru::Vector.
258
+ #
259
+ # == Usage
260
+ #
261
+ # v = Daru::Vector.new([1,2,3], index: [:a, :b, :c])
262
+ # v[:a] = 999
263
+ # #=>
264
+ # ##<Daru::Vector:90257920 @name = nil @size = 3 >
265
+ # # nil
266
+ # # a 999
267
+ # # b 2
268
+ # # c 3
161
269
  def []=(*location, value)
162
270
  cast(dtype: :array) if value.nil? and dtype != :array
163
271
 
@@ -182,7 +290,38 @@ module Daru
182
290
  end
183
291
 
184
292
  set_size
185
- set_nil_positions
293
+ set_missing_positions unless Daru.lazy_update
294
+ end
295
+
296
+ # The values to be treated as 'missing'. *nil* is the default missing
297
+ # type. To set missing values see the missing_values= method.
298
+ def missing_values
299
+ @missing_values.keys
300
+ end
301
+
302
+ # Assign an Array to treat certain values as 'missing'.
303
+ #
304
+ # == Usage
305
+ #
306
+ # v = Daru::Vector.new [1,2,3,4,5]
307
+ # v.missing_values = [3]
308
+ # v.update
309
+ # v.missing_positions
310
+ # #=> [2]
311
+ def missing_values= values
312
+ set_missing_values values
313
+ set_missing_positions unless Daru.lazy_update
314
+ end
315
+
316
+ # Method for updating the metadata (i.e. missing value positions) of the
317
+ # vector after assignment/deletion etc. are complete. This is provided so that
318
+ # time is not wasted in creating the metadata for the vector each time
319
+ # assignment/deletion of elements is done. Updating data this way is called
320
+ # lazy loading. To set or unset lazy loading, see the .lazy_update= method.
321
+ def update
322
+ if Daru.lazy_update
323
+ set_missing_positions
324
+ end
186
325
  end
187
326
 
188
327
  # Two vectors are equal if the have the exact same index values corresponding
@@ -199,21 +338,20 @@ module Daru
199
338
  end
200
339
  end
201
340
 
202
- def << element
203
- concat element
341
+ def head q=10
342
+ self[0..(q-1)]
204
343
  end
205
344
 
206
- def push element
207
- concat element
345
# Returns the last +q+ elements of the vector (10 by default), mirroring
# #head which returns the first +q+.
#
# The start position is clamped at 0 so that asking for more elements
# than the vector holds simply returns the whole vector.
def tail q=10
  first = [@size - q, 0].max
  self[first..(@size - 1)]
end
209
348
 
210
- def head q=10
211
- self[0..q]
349
+ # Reports whether missing data is present in the Vector.
350
+ def has_missing_data?
351
+ !missing_positions.empty?
212
352
  end
353
+ alias :flawed? :has_missing_data?
213
354
 
214
- def tail q=10
215
- self[-q..-1]
216
- end
217
355
 
218
356
  # Append an element to the vector by specifying the element and index
219
357
  def concat element, index=nil
@@ -231,8 +369,10 @@ module Daru
231
369
  end
232
370
  @data[@index[index]] = element
233
371
  set_size
234
- set_nil_positions
372
+ set_missing_positions unless Daru.lazy_update
235
373
  end
374
+ alias :push :concat
375
+ alias :<< :concat
236
376
 
237
377
  # Cast a vector to a new data type.
238
378
  #
@@ -240,11 +380,11 @@ module Daru
240
380
  #
241
381
  # * +:dtype+ - :array for Ruby Array. :nmatrix for NMatrix.
242
382
  def cast opts={}
243
- dtype = opts[:dtype]
383
+ dt = opts[:dtype]
244
384
  raise ArgumentError, "Unsupported dtype #{opts[:dtype]}" unless
245
- dtype == :array or dtype == :nmatrix
385
+ dt == :array or dt == :nmatrix or dt == :gsl
246
386
 
247
- @data = cast_vector_to dtype
387
+ @data = cast_vector_to dt unless @dtype == dt
248
388
  end
249
389
 
250
390
  # Delete an element by value
@@ -264,7 +404,7 @@ module Daru
264
404
  end
265
405
 
266
406
  set_size
267
- set_nil_positions
407
+ set_missing_positions unless Daru.lazy_update
268
408
  end
269
409
 
270
410
  # The type of data contained in the vector. Can be :object or :numeric. If
@@ -308,11 +448,19 @@ module Daru
308
448
  Daru::Vector.new uniq_vector, name: @name, index: new_index, dtype: @dtype
309
449
  end
310
450
 
451
+ def any? &block
452
+ @data.data.any?(&block)
453
+ end
454
+
455
+ def all? &block
456
+ @data.data.all?(&block)
457
+ end
458
+
311
459
  # Sorts a vector according to its values. If a block is specified, the contents
312
- # will be evaluated and data will be swapped whenever the block evaluates
313
- # to *true*. Defaults to ascending order sorting. Any missing values will be
314
- # put at the end of the vector. Preserves indexing. Default sort algorithm is
315
- # quick sort.
460
+ # will be evaluated and data will be swapped whenever the block evaluates
461
+ # to *true*. Defaults to ascending order sorting. Any missing values will be
462
+ # put at the end of the vector. Preserves indexing. Default sort algorithm is
463
+ # quick sort.
316
464
  #
317
465
  # == Options
318
466
  #
@@ -323,7 +471,7 @@ module Daru
323
471
  #
324
472
  # v = Daru::Vector.new ["My first guitar", "jazz", "guitar"]
325
473
  # # Say you want to sort these strings by length.
326
- # v.sort { |a,b| a.length <=> b.length }
474
+ # v.sort(ascending: false) { |a,b| a.length <=> b.length }
327
475
  def sort opts={}, &block
328
476
  opts = {
329
477
  ascending: true,
@@ -339,18 +487,138 @@ module Daru
339
487
  Daru::Vector.new(vector, index: create_index(index), name: @name, dtype: @dtype)
340
488
  end
341
489
 
342
- # Just sort the data and get an Array in return using Enumerable#sort. Non-destructive.
490
+ # Just sort the data and get an Array in return using Enumerable#sort.
491
+ # Non-destructive.
343
492
  def sorted_data &block
344
493
  @data.to_a.sort(&block)
345
494
  end
346
495
 
347
- # Returns *true* if the value passed actually exists in the vector.
496
+ # Returns *true* if the value passed actually exists or is not marked as
497
+ # a *missing value*.
348
498
  def exists? value
349
- !self[index_of(value)].nil?
499
+ !@missing_values.has_key?(self[index_of(value)])
500
+ end
501
+
502
+ # Like map, but returns a Daru::Vector with the returned values.
503
+ def recode dt=nil, &block
504
+ return to_enum(:recode) unless block_given?
505
+
506
+ dup.recode! dt, &block
507
+ end
508
+
509
+ # Destructive version of #recode.
510
+ def recode! dt=nil, &block
511
+ return to_enum(:recode!) unless block_given?
512
+
513
+ @data.map!(&block).data
514
+ @data = cast_vector_to(dt || @dtype)
515
+ self
516
+ end
517
+
518
# Deletes, in place, every element for which the block returns a truthy
# value (same contract as Array#delete_if). The index entries of the
# deleted elements are dropped as well; surviving elements keep their
# original index labels.
#
# Returns an Enumerator when called without a block, otherwise +self+.
#
# NOTE(review): the previous implementation kept the elements for which
# the block was true, i.e. it behaved like keep_if. Callers relying on
# that inverted behaviour need to negate their block.
def delete_if &block
  return to_enum(:delete_if) unless block_given?

  keep_e = []
  keep_i = []
  each_with_index do |n, i|
    # Skip (i.e. delete) whatever the block selects.
    next if yield(n)

    keep_e << n
    keep_i << i
  end

  @data  = cast_vector_to @dtype, keep_e
  @index = @index.is_a?(MultiIndex) ? MultiIndex.new(keep_i) : Index.new(keep_i)
  set_missing_positions unless Daru.lazy_update
  set_size

  self
end
537
+
538
+ # Reports all values that don't comply with a condition.
539
+ # Returns a hash with the index of data and the invalid data.
540
+ def verify &block
541
+ h = {}
542
+ (0...size).each do |i|
543
+ if !(yield @data[i])
544
+ h[i] = @data[i]
545
+ end
546
+ end
547
+
548
+ h
549
+ end
550
+
551
+ # Return an Array with the data split by a separator.
552
+ # a=Daru::Vector.new(["a,b","c,d","a,b","d"])
553
+ # a.splitted
554
+ # =>
555
+ # [["a","b"],["c","d"],["a","b"],["d"]]
556
+ def splitted sep=","
557
+ @data.map do |s|
558
+ if s.nil?
559
+ nil
560
+ elsif s.respond_to? :split
561
+ s.split sep
562
+ else
563
+ [s]
564
+ end
565
+ end
566
+ end
567
+
568
+ # Returns a hash of Vectors, defined by the different values
569
+ # defined on the fields
570
+ # Example:
571
+ #
572
+ # a=Daru::Vector.new(["a,b","c,d","a,b"])
573
+ # a.split_by_separator
574
+ # => {"a"=>#<Daru::Vector:0x7f2dbcc09d88
575
+ # @data=[1, 0, 1]>,
576
+ # "b"=>#<Daru::Vector:0x7f2dbcc09c48
577
+ # @data=[1, 0, 1]>,
578
+ # "c"=>#<Daru::Vector:0x7f2dbcc09b08
579
+ # @data=[0, 1, 0]>}
580
+ #
581
+ def split_by_separator sep=","
582
+ split_data = splitted sep
583
+ factors = split_data.flatten.uniq.compact
584
+
585
+ out = factors.inject({}) do |h,x|
586
+ h[x] = []
587
+ h
588
+ end
589
+
590
+ split_data.each do |r|
591
+ if r.nil?
592
+ factors.each do |f|
593
+ out[f].push(nil)
594
+ end
595
+ else
596
+ factors.each do |f|
597
+ out[f].push(r.include?(f) ? 1:0)
598
+ end
599
+ end
600
+ end
601
+
602
+ out.inject({}) do |s,v|
603
+ s[v[0]] = Daru::Vector.new v[1]
604
+ s
605
+ end
606
+ end
607
+
608
# Returns a Hash mapping every distinct token found by
# #split_by_separator to the number of elements that contain it.
#
# The sum is seeded with 0 and each entry goes through #to_i, so nil
# entries (produced for nil elements) count as zero instead of raising
# when one of them happens to come first.
def split_by_separator_freq(sep=",")
  split_by_separator(sep).each_with_object({}) do |(token, flags), freq|
    freq[token] = flags.inject(0) { |sum, flag| sum + flag.to_i }
  end
end
614
+
615
+ def reset_index!
616
+ @index = Daru::Index.new(Array.new(size) { |i| i })
617
+ self
350
618
  end
351
619
 
352
620
  # Returns a vector which has *true* in the position where the element in self
353
- # is nil, and false otherwise.
621
+ # is nil, and false otherwise.
354
622
  #
355
623
  # == Usage
356
624
  #
@@ -383,26 +651,34 @@ module Daru
383
651
  end
384
652
 
385
653
  # Replace all nils in the vector with the value passed as an argument. Destructive.
386
- # See #replace_nils for non-destructive version
654
+ # See #replace_nils for non-destructive version
387
655
  #
388
656
  # == Arguments
389
657
  #
390
658
  # * +replacement+ - The value which should replace all nils
391
659
  def replace_nils! replacement
392
- nil_positions.each do |idx|
660
+ missing_positions.each do |idx|
393
661
  self[idx] = replacement
394
662
  end
395
663
 
396
664
  self
397
665
  end
398
666
 
667
+ def detach_index
668
+ Daru::DataFrame.new({
669
+ index: @index.to_a.map(&:to_s),
670
+ vector: @data.to_a
671
+ })
672
+ end
673
+
399
674
  # Non-destructive version of #replace_nils!
400
675
  def replace_nils replacement
401
676
  self.dup.replace_nils!(replacement)
402
677
  end
403
678
 
679
+ # number of non-missing elements
404
680
  def n_valid
405
- @size
681
+ @size - missing_positions.size
406
682
  end
407
683
 
408
684
  # Returns *true* if an index exists
@@ -425,6 +701,20 @@ module Daru
425
701
  end
426
702
  end
427
703
 
704
+ # If dtype != gsl, will convert data to GSL::Vector with to_a. Otherwise returns
705
+ # the stored GSL::Vector object.
706
+ def to_gsl
707
+ if Daru.has_gsl?
708
+ if dtype == :gsl
709
+ return @data.data
710
+ else
711
+ GSL::Vector.alloc only_valid(:array).to_a
712
+ end
713
+ else
714
+ raise NoMethodError, "Install gsl-nmatrix for access to this functionality."
715
+ end
716
+ end
717
+
428
718
  # Convert to hash. Hash keys are indexes and values are the correspoding elements
429
719
  def to_hash
430
720
  @index.inject({}) do |hsh, index|
@@ -446,12 +736,24 @@ module Daru
446
736
  # Convert to html for iruby
447
737
  def to_html threshold=30
448
738
  name = @name || 'nil'
449
- html = '<table>' + '<tr><th> </th><th>' + name.to_s + '</th></tr>'
739
+ html = "<table>" +
740
+ "<tr>" +
741
+ "<th colspan=\"2\">" +
742
+ "Daru::Vector:#{self.object_id} " + " size: #{size}" +
743
+ "</th>" +
744
+ "</tr>"
745
+ html += '<tr><th> </th><th>' + name.to_s + '</th></tr>'
450
746
  @index.each_with_index do |index, num|
451
747
  html += '<tr><td>' + index.to_s + '</td>' + '<td>' + self[index].to_s + '</td></tr>'
452
748
 
453
749
  if num > threshold
454
750
  html += '<tr><td>...</td><td>...</td></tr>'
751
+
752
+ last_index = @index.to_a.last
753
+ html += '<tr>' +
754
+ '<td>' + last_index.to_s + '</td>' +
755
+ '<td>' + self[last_index].to_s + '</td>' +
756
+ '</tr>'
455
757
  break
456
758
  end
457
759
  end
@@ -464,11 +766,45 @@ module Daru
464
766
  to_html
465
767
  end
466
768
 
769
+ # Create a summary of the Vector using Report Builder.
770
+ def summary(method = :to_text)
771
+ ReportBuilder.new(no_title: true).add(self).send(method)
772
+ end
773
+
774
# Adds a section describing this vector to the report builder +b+.
#
# Always prints the size and the number of valid elements. For :object
# vectors it adds the factors, the mode and a frequency table; for
# :numeric vectors it adds the median, the mean and, when a standard
# deviation is available, the dispersion and shape statistics.
def report_building b
  b.section(:name => name) do |s|
    s.text "n :#{size}"
    s.text "n valid:#{n_valid}"

    if @type == :object
      s.text "factors: #{factors.to_a.join(',')}"
      s.text "mode: #{mode}"

      s.table(:name => "Distribution") do |t|
        frequencies.sort_by { |a| a.to_s }.each do |k,v|
          key = @index.include?(k) ? @index[k] : k
          t.row [key, v , ("%0.2f%%" % (v.quo(n_valid)*100))]
        end
      end
    end

    # The median used to be guarded by a condition that tested
    # @type == :numeric twice; it belongs to the numeric branch.
    if @type == :numeric
      s.text "median: #{median.to_s}"
      s.text "mean: %0.4f" % mean
      if sd
        s.text "std.dev.: %0.4f" % sd
        s.text "std.err.: %0.4f" % se
        s.text "skew: %0.4f" % skew
        s.text "kurtosis: %0.4f" % kurtosis
      end
    end
  end
end
802
+
467
803
  # Over rides original inspect for pretty printing in irb
468
804
  def inspect spacing=20, threshold=15
469
805
  longest = [@name.to_s.size,
470
- @index.to_a.map(&:to_s).map(&:size).max,
471
- @data .map(&:to_s).map(&:size).max,
806
+ (@index.to_a.map(&:to_s).map(&:size).max || 0),
807
+ (@data .map(&:to_s).map(&:size).max || 0),
472
808
  'nil'.size].max
473
809
 
474
810
  content = ""
@@ -503,6 +839,11 @@ module Daru
503
839
  #
504
840
  # @param new_name [Symbol] The new name.
505
841
  def rename new_name
842
+ if new_name.is_a?(Numeric)
843
+ @name = new_name
844
+ return
845
+ end
846
+
506
847
  @name = new_name.to_sym
507
848
  end
508
849
 
@@ -511,12 +852,176 @@ module Daru
511
852
  Daru::Vector.new @data.dup, name: @name, index: @index.dup
512
853
  end
513
854
 
855
# == Bootstrap
#
# Generate +nr+ resamples (with replacement) of size +s+ from the vector,
# computing each estimate from +estimators+ over each resample.
#
# +estimators+ could be
# a) Hash with variable names as keys and lambdas as values
#      a.bootstrap({:log_s2 => lambda { |v| Math.log(v.variance) }}, 1000)
# b) Array with names of methods to bootstrap
#      a.bootstrap([:mean, :sd], 1000)
# c) A single method to bootstrap
#      a.bootstrap(:mean, 1000)
#
# If +s+ is nil, it is set to the vector size.
#
# Returns a DataFrame where each vector is a vector of length +nr+
# containing the computed resample estimates.
def bootstrap(estimators, nr, s=nil)
  s ||= size
  h_est, es, bss = prepare_bootstrap(estimators)

  nr.times do
    resample = sample_with_replacement(s)
    es.each { |est| bss[est].push(h_est[est].call(resample)) }
  end

  # Replace each plain Array of estimates with a Vector before building
  # the DataFrame.
  es.each { |est| bss[est] = Daru::Vector.new(bss[est]) }

  Daru::DataFrame.new bss
end
887
+
888
# == Jackknife
#
# Returns a dataset with jackknife delete-+k+ +estimators+.
# +estimators+ could be:
# a) Hash with variable names as keys and lambdas as values
#      a.jackknife(:log_s2 => lambda { |v| Math.log(v.variance) })
# b) Array with method names to jackknife
#      a.jackknife([:mean, :sd])
# c) A single method to jackknife
#      a.jackknife(:mean)
#
# +k+ represents the block size for block jackknife. By default it is
# set to 1, for classic delete-one jackknife.
#
# Returns a dataset where each vector is a vector of length
# +cases+/+k+ containing the computed jackknife pseudovalues.
# Raises a RuntimeError when the vector size is not divisible by +k+.
#
# == Reference:
# * Sawyer, S. (2005). Resampling Data: Using a Statistical Jacknife.
def jackknife(estimators, k=1)
  raise "n should be divisible by k:#{k}" unless size % k == 0

  nb = (size / k).to_i
  h_est, es, ps = prepare_bootstrap(estimators)

  # Estimates computed over the complete vector.
  est_n = es.each_with_object({}) do |est, acc|
    acc[est] = h_est[est].call(self)
  end

  nb.times do |i|
    # Copy of the data with the i-th block of k cases removed.
    other = @data.dup
    other.slice!(i * k, k)
    other = Daru::Vector.new other

    es.each do |est|
      # Add pseudovalue
      ps[est].push(nb * est_n[est] - (nb - 1) * h_est[est].call(other))
    end
  end

  es.each { |est| ps[est] = Daru::Vector.new(ps[est]) }

  Daru::DataFrame.new ps
end
933
+
934
# Creates a new vector consisting only of non-missing data.
#
# == Arguments
#
# @param as_a [Symbol] Passing :vector (the default) returns a
#   Daru::Vector. Any other value returns only the elements as an Array.
#
# @param duplicate [Boolean] Only consulted when the vector has no
#   missing data and a vector is requested: false returns the receiver
#   itself, anything truthy returns a duplicate of it.
def only_valid as_a=:vector, duplicate=true
  unless has_missing_data?
    return to_a unless as_a == :vector
    return duplicate ? dup : self
  end

  new_index  = @index.to_a - missing_positions
  new_vector = new_index.map { |idx| self[idx] }

  return new_vector if as_a != :vector

  Daru::Vector.new new_vector, index: new_index, name: @name, dtype: dtype
end
959
+
960
# Returns only the missing data of the vector (preserves indexes).
#
# @param as_a [Symbol] :vector (the default) returns the result of
#   indexing the vector with its missing positions; any other value
#   returns that result converted with +to_a+. This mirrors only_valid;
#   values other than :vector and :array used to return nil.
def only_missing as_a=:vector
  missing = self[*missing_positions]
  as_a == :vector ? missing : missing.to_a
end
968
+
969
# Returns a Vector with only numerical data. Values registered in
# @missing_values are included, other non-Numeric objects are excluded.
# Preserves index.
def only_numerics
  numeric_indexes = []

  each_with_index do |value, idx|
    numeric_indexes << idx if value.is_a?(Numeric) || @missing_values.key?(value)
  end

  self[*numeric_indexes]
end
980
+
981
# Returns the database column type for the vector, according to the
# string form of its content.
#
# @param dbs [Symbol] target database. Currently unused: the same type
#   names are returned whatever is passed.
# @return [String] "DATE" if any value contains a dd-dd-dddd or
#   dddd-dd-dd pattern, "VARCHAR (255)" if any value contains a character
#   other than digits, 'e', '.' or '-', "DOUBLE" if any value contains a
#   '.', and "INTEGER" otherwise.
def db_type(dbs=:mysql)
  # any? is used instead of find: find returns the matching element, so a
  # matching but falsy value (false) would skip its branch.
  if @data.any? { |v| v.to_s =~ /\d{2,2}-\d{2,2}-\d{4,4}/ } ||
     @data.any? { |v| v.to_s =~ /\d{4,4}-\d{2,2}-\d{2,2}/ }
    "DATE"
  elsif @data.any? { |v| v.to_s =~ /[^0-9e.-]/ }
    "VARCHAR (255)"
  elsif @data.any? { |v| v.to_s =~ /\./ }
    "DOUBLE"
  else
    "INTEGER"
  end
end
994
+
514
995
  # Copies the structure of the vector (i.e. the index, size, etc.) and fills
515
- # all values with nils.
996
+ # all values with nils.
516
997
  def clone_structure
517
998
  Daru::Vector.new(([nil]*@size), name: @name, index: @index.dup)
518
999
  end
519
1000
 
1001
# Save the vector to a file by handing it to Daru::IO.save.
#
# == Arguments
#
# * filename - Path of file where the vector is to be saved
def save filename
  Daru::IO.save(self, filename)
end
1009
+
1010
+ def _dump(depth) # :nodoc:
1011
+ Marshal.dump({
1012
+ data: @data.to_a,
1013
+ dtype: @dtype,
1014
+ name: @name,
1015
+ index: @index,
1016
+ missing_values: @missing_values})
1017
+ end
1018
+
1019
# Rebuilds a vector from the string produced by _dump.
#
# NOTE(review): Marshal.load must only be given trusted data; it can
# instantiate arbitrary objects.
def self._load(data) # :nodoc:
  h = Marshal.load(data)

  # Dumps may carry the missing values as the internal lookup Hash.
  # Pass on just the values so they are not read as [value, 0] pairs.
  missing = h[:missing_values]
  missing = missing.keys if missing.is_a?(Hash)

  Daru::Vector.new(h[:data], index: h[:index],
    name: h[:name], dtype: h[:dtype], missing_values: missing)
end
1024
+
520
1025
  def daru_vector *name
521
1026
  self
522
1027
  end
@@ -535,6 +1040,26 @@ module Daru
535
1040
 
536
1041
  private
537
1042
 
1043
# For a single estimator name, an Array of names or a Hash of
# estimators, returns an array with three elements:
# 1.- A Hash with estimator names as keys and lambdas as values
#     (a Hash argument is returned as given)
# 2.- An Array with the estimator names
# 3.- A Hash with estimator names as keys and empty Arrays as values
def prepare_bootstrap(estimators)
  h_est = estimators
  h_est = [h_est] unless h_est.is_a?(Array) || h_est.is_a?(Hash)

  if h_est.is_a?(Array)
    # Each name becomes a lambda that wraps its argument in a Vector and
    # calls the method of that name on it.
    h_est = h_est.each_with_object({}) do |est, acc|
      acc[est] = lambda { |v| Daru::Vector.new(v).send(est) }
    end
  end

  bss = h_est.keys.each_with_object({}) { |est, acc| acc[est] = [] }

  [h_est, h_est.keys, bss]
end
1062
+
538
1063
  def quick_sort vector, index, order, &block
539
1064
  recursive_quick_sort vector, index, order, 0, @size-1, &block
540
1065
  [vector, index]
@@ -599,18 +1124,17 @@ module Daru
599
1124
  end
600
1125
 
601
1126
  # Note: To maintain sanity, this _MUST_ be the _ONLY_ place in daru where the
602
- # @dtype variable is set and the underlying data type of vector changed.
1127
+ # @dtype variable is set and the underlying data type of vector changed.
603
1128
  def cast_vector_to dtype, source=nil, nm_dtype=nil
604
- source = @data if source.nil?
605
- return @data if @dtype and @dtype == dtype
1129
+ source = @data.to_a if source.nil?
606
1130
 
607
1131
  new_vector =
608
1132
  case dtype
609
- when :array then Daru::Accessors::ArrayWrapper.new(source.to_a.dup, self)
610
- when :nmatrix then Daru::Accessors::NMatrixWrapper.new(source.to_a.dup,
611
- self, nm_dtype)
1133
+ when :array then Daru::Accessors::ArrayWrapper.new(source, self)
1134
+ when :nmatrix then Daru::Accessors::NMatrixWrapper.new(source, self, nm_dtype)
1135
+ when :gsl then Daru::Accessors::GSLWrapper.new(source, self)
612
1136
  when :mdarray then raise NotImplementedError, "MDArray not yet supported."
613
- else Daru::Accessors::ArrayWrapper.new(source.dup, self)
1137
+ else raise "Unknown dtype #{dtype}"
614
1138
  end
615
1139
 
616
1140
  @dtype = dtype || :array
@@ -649,12 +1173,11 @@ module Daru
649
1173
  end
650
1174
  end
651
1175
 
652
# Rebuilds @missing_positions: the index entries whose value is one of
# the keys of @missing_values.
def set_missing_positions
  @missing_positions = []
  @index.each do |label|
    @missing_positions << label if @missing_values.key?(self[label])
  end
end
659
1182
 
660
1183
  def create_index potential_index
@@ -669,5 +1192,17 @@ module Daru
669
1192
  pos = index_for location
670
1193
  pos ? @data[pos] : nil
671
1194
  end
1195
+
1196
# Setup missing_values. The missing_values instance variable is set
# as a Hash for faster lookup times; nil is always registered as missing.
#
# @param values_arry [Array, Hash, nil] additional values to treat as
#   missing. A Hash (such as an existing @missing_values lookup table) is
#   accepted too: its keys are used, instead of its [key, value] pairs
#   being registered as the missing values.
def set_missing_values values_arry
  @missing_values = { nil => 0 }
  return unless values_arry

  values = values_arry.is_a?(Hash) ? values_arry.keys : values_arry
  values.each { |value| @missing_values[value] = 0 }
end
672
1207
  end
673
1208
  end