daru 0.0.5 → 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (48) hide show
  1. checksums.yaml +4 -4
  2. data/.build.sh +14 -0
  3. data/.travis.yml +26 -4
  4. data/CONTRIBUTING.md +31 -0
  5. data/Gemfile +1 -2
  6. data/{History.txt → History.md} +110 -44
  7. data/README.md +21 -288
  8. data/Rakefile +1 -0
  9. data/daru.gemspec +12 -8
  10. data/lib/daru.rb +36 -1
  11. data/lib/daru/accessors/array_wrapper.rb +8 -3
  12. data/lib/daru/accessors/gsl_wrapper.rb +113 -0
  13. data/lib/daru/accessors/nmatrix_wrapper.rb +6 -17
  14. data/lib/daru/core/group_by.rb +0 -1
  15. data/lib/daru/dataframe.rb +1192 -83
  16. data/lib/daru/extensions/rserve.rb +21 -0
  17. data/lib/daru/index.rb +14 -0
  18. data/lib/daru/io/io.rb +170 -8
  19. data/lib/daru/maths/arithmetic/dataframe.rb +4 -3
  20. data/lib/daru/maths/arithmetic/vector.rb +4 -4
  21. data/lib/daru/maths/statistics/dataframe.rb +48 -27
  22. data/lib/daru/maths/statistics/vector.rb +215 -33
  23. data/lib/daru/monkeys.rb +53 -7
  24. data/lib/daru/multi_index.rb +21 -4
  25. data/lib/daru/plotting/dataframe.rb +83 -25
  26. data/lib/daru/plotting/vector.rb +9 -10
  27. data/lib/daru/vector.rb +596 -61
  28. data/lib/daru/version.rb +3 -0
  29. data/spec/accessors/wrappers_spec.rb +51 -0
  30. data/spec/core/group_by_spec.rb +0 -2
  31. data/spec/daru_spec.rb +58 -0
  32. data/spec/dataframe_spec.rb +768 -73
  33. data/spec/extensions/rserve_spec.rb +52 -0
  34. data/spec/fixtures/bank2.dat +200 -0
  35. data/spec/fixtures/repeated_fields.csv +7 -0
  36. data/spec/fixtures/scientific_notation.csv +4 -0
  37. data/spec/fixtures/test_xls.xls +0 -0
  38. data/spec/io/io_spec.rb +161 -24
  39. data/spec/math/arithmetic/dataframe_spec.rb +26 -7
  40. data/spec/math/arithmetic/vector_spec.rb +8 -0
  41. data/spec/math/statistics/dataframe_spec.rb +16 -1
  42. data/spec/math/statistics/vector_spec.rb +215 -47
  43. data/spec/spec_helper.rb +21 -2
  44. data/spec/vector_spec.rb +368 -12
  45. metadata +99 -16
  46. data/lib/version.rb +0 -3
  47. data/notebooks/grouping_splitting_pivots.ipynb +0 -529
  48. data/notebooks/intro_with_music_data_.ipynb +0 -303
@@ -1,5 +1,41 @@
1
1
  class Array
2
- def daru_vector name=nil, index=nil, dtype=Array
2
+ # Recode repeated values on an array, adding the number of repetition
3
+ # at the end
4
+ # Example:
5
+ # a=%w{a b c c d d d e}
6
+ # a.recode_repeated
7
+ # => ["a","b","c_1","c_2","d_1","d_2","d_3","e"]
8
+ def recode_repeated
9
+ if size != uniq.size
10
+ # Find repeated
11
+ repeated = inject({}) do |acc, v|
12
+ if acc[v].nil?
13
+ acc[v] = 1
14
+ else
15
+ acc[v] += 1
16
+ end
17
+ acc
18
+ end.select { |_k, v| v > 1 }.keys
19
+
20
+ ns = repeated.inject({}) do |acc, v|
21
+ acc[v] = 0
22
+ acc
23
+ end
24
+
25
+ collect do |f|
26
+ if repeated.include? f
27
+ ns[f] += 1
28
+ sprintf('%s_%d', f, ns[f])
29
+ else
30
+ f
31
+ end
32
+ end
33
+ else
34
+ self
35
+ end
36
+ end
37
+
38
+ def daru_vector name=nil, index=nil, dtype=:array
3
39
  Daru::Vector.new self, name: name, index: index, dtype: dtype
4
40
  end
5
41
 
@@ -11,8 +47,8 @@ class Array
11
47
  end
12
48
 
13
49
  class Range
14
- def daru_vector name=nil, index=nil, dtype=Array
15
- Daru::Vector.new self, name: name, index: index, dtype: Array
50
+ def daru_vector name=nil, index=nil, dtype=:array
51
+ Daru::Vector.new self, name: name, index: index, dtype: dtype
16
52
  end
17
53
 
18
54
  alias_method :dv, :daru_vector
@@ -23,8 +59,8 @@ class Range
23
59
  end
24
60
 
25
61
  class Hash
26
- def daru_vector index=nil, dtype=Array
27
- Daru::Vector.new self.values[0], name: self.keys[0], index: index, dtype: Array
62
+ def daru_vector index=nil, dtype=:array
63
+ Daru::Vector.new self.values[0], name: self.keys[0], index: index, dtype: dtype
28
64
  end
29
65
 
30
66
  alias_method :dv, :daru_vector
@@ -32,7 +68,7 @@ end
32
68
 
33
69
  class NMatrix
34
70
  def daru_vector name=nil, index=nil, dtype=NMatrix
35
- Daru::Vector.new self, name: name, index: index, dtype: NMatrix
71
+ Daru::Vector.new self, name: name, index: index, dtype: :nmatrix
36
72
  end
37
73
 
38
74
  alias_method :dv, :daru_vector
@@ -40,7 +76,7 @@ end
40
76
 
41
77
  class MDArray
42
78
  def daru_vector name=nil, index=nil, dtype=MDArray
43
- Daru::Vector.new self, name: name, index: index, dtype: MDArray
79
+ Daru::Vector.new self, name: name, index: index, dtype: :mdarray
44
80
  end
45
81
 
46
82
  alias_method :dv, :daru_vector
@@ -58,4 +94,14 @@ class Matrix
58
94
  e / other.to_a.flatten[index]
59
95
  end
60
96
  end
97
+ end
98
+
99
+ class String
100
+ def is_number?
101
+ if self =~ /^-?\d+[,.]?\d*(e-?\d+)?$/
102
+ true
103
+ else
104
+ false
105
+ end
106
+ end
61
107
  end
@@ -17,12 +17,12 @@ module Daru
17
17
  attr_reader :values
18
18
 
19
19
  # Initialize a MultiIndex by passing a tuple of indexes. The order assigned
20
- # to the multi index corresponds to the position of the tuple in the array
21
- # of tuples.
20
+ # to the multi index corresponds to the position of the tuple in the array
21
+ # of tuples.
22
22
  #
23
23
  # Although you can create your own hierarchically indexed Vectors and DataFrames,
24
- # this class currently contains minimal error checking and is mainly used
25
- # internally for summarizing, splitting and grouping of data.
24
+ # this class currently contains minimal error checking and is mainly used
25
+ # internally for summarizing, splitting and grouping of data.
26
26
  #
27
27
  # == Arguments
28
28
  #
@@ -68,6 +68,19 @@ module Daru
68
68
  end
69
69
  end
70
70
 
71
+ def + other
72
+ other.flatten!
73
+ tuples = to_a
74
+ raise ArgumentError, "Incomplete tuple #{other}" unless
75
+ tuples.all? { |t| t.size == other.size }
76
+
77
+ Daru::MultiIndex.new(tuples << (other))
78
+ end
79
+
80
+ def empty?
81
+ @relation_hash.empty?
82
+ end
83
+
71
84
  # Compare two MultiIndex objects for equality based on the contents of their
72
85
  # relation hashes. Does not take object_id into account.
73
86
  def == other
@@ -108,6 +121,10 @@ module Daru
108
121
  tuple.empty? ? nil : tuple
109
122
  end
110
123
 
124
+ def size
125
+ to_a.size
126
+ end
127
+
111
128
  private
112
129
 
113
130
  # Deep compare two hashes
@@ -1,9 +1,3 @@
1
- begin
2
- require 'nyaplot'
3
- rescue LoadError => e
4
- puts "#{e}"
5
- end
6
-
7
1
  module Daru
8
2
  module Plotting
9
3
  module DataFrame
@@ -12,36 +6,100 @@ module Daru
12
6
  # to the block, if it is specified. See the nyaplot docs for info on how to
13
7
  # further use these objects.
14
8
  #
9
+ # Detailed instructions on use of the plotting API can be found in the
10
+ # notebooks whose links you can find in the README.
11
+ #
15
12
  # == Options
16
- # +:type+ - Type of plot (scatter, bar, histogram)
17
- # +:legends+ - The names of the vectors that are to be used as X and Y axes.
18
- # The vectors names must be specified as symbols inside an Array. They
19
- # also should be specified in the right order. For example, passing [:a, :b]
20
- # will keep vector :a as the X axis and :b as the Y axis. Passing [:a]
21
- # keep :a as the X axis and plot the frequency with which :a appears
22
- # on the Y axis.
23
- # +:frame+ - Pass this as *true* to disable plotting the graph directly
24
- # and instead manually create Nyaplot::Frame object inside the block using
25
- # the Nyaplot::Plot object for plotting one or many graphs in a frame.
13
+ #
14
+ # * +:type+ - Type of plot. Can be :scatter, :bar, :histogram, :line or :box.
15
+ # * +:x+ - Vector to be used for X co-ordinates.
16
+ # * +:y+ - Vector to be used for Y co-ordinates.
26
17
  #
27
18
  # == Usage
28
- # df = Daru::DataFrame.new({a:[0,1,2,3,4], b:[10,20,30,40,50]})
29
- # df.plot legends: [:a, :b], type: :bar
19
+ # # Simple bar chart
20
+ # df = Daru::DataFrame.new({a:['A', 'B', 'C', 'D', 'E'], b:[10,20,30,40,50]})
21
+ # df.plot type: :bar, x: :a, y: :b
30
22
  def plot opts={}
31
23
  options = {
32
- type: :scatter,
33
- frame: false,
34
- legends: []
24
+ type: :scatter
35
25
  }.merge(opts)
36
26
 
37
27
  plot = Nyaplot::Plot.new
38
- diagram = plot.add_with_df(Nyaplot::DataFrame.new(self.to_a[0]),
39
- options[:type], *options[:legends])
28
+ types = extract_option :type, options
29
+
30
+ diagram =
31
+ case
32
+ when !([:scatter, :bar, :line, :histogram] & types).empty?
33
+ if single_diagram? options
34
+ add_single_diagram plot, options
35
+ else
36
+ add_multiple_diagrams plot, options
37
+ end
38
+ when types.include?(:box)
39
+ numeric = self.only_numerics(clone: false).dup_only_valid
40
+
41
+ plot.add_with_df(
42
+ numeric.to_nyaplotdf,
43
+ :box, *numeric.vectors.to_a)
44
+ end
40
45
 
41
46
  yield(plot, diagram) if block_given?
42
47
 
43
- plot.show unless options[:frame]
48
+ plot.show
44
49
  end
50
+
51
+ private
52
+
53
+ def single_diagram? options
54
+ options[:x] and options[:x].is_a?(Symbol)
55
+ end
56
+
57
+ def add_single_diagram plot, options
58
+ args = [
59
+ self.to_nyaplotdf,
60
+ options[:type],
61
+ options[:x]
62
+ ]
63
+
64
+ args << options[:y] if(options[:y])
65
+
66
+ plot.add_with_df(*args)
67
+ end
68
+
69
+ def add_multiple_diagrams plot, options
70
+ types = extract_option :type, options
71
+ x_vecs = extract_option :x, options
72
+ y_vecs = extract_option :y, options
73
+
74
+ diagrams = []
75
+ nyaplot_df = self.to_nyaplotdf
76
+ total = x_vecs.size
77
+ types = types.size < total ? types*total : types
78
+
79
+
80
+ (0...total).each do |i|
81
+ diagrams << plot.add_with_df(
82
+ nyaplot_df,
83
+ types[i],
84
+ x_vecs[i],
85
+ y_vecs[i]
86
+ )
87
+ end
88
+
89
+ diagrams
90
+ end
91
+
92
+ def extract_option opt, options
93
+ if options[opt]
94
+ o = options[opt]
95
+ o.is_a?(Array) ? o : [o]
96
+ else
97
+ arr = options.keys
98
+ arr.keep_if { |a| a =~ Regexp.new("\\A#{opt.to_s}") }.sort
99
+ arr.map { |a| options[a] }
100
+ end
101
+ end
102
+
45
103
  end
46
104
  end
47
- end
105
+ end if Daru.has_nyaplot?
@@ -1,9 +1,3 @@
1
- begin
2
- require 'nyaplot'
3
- rescue LoadError => e
4
- puts "#{e}"
5
- end
6
-
7
1
  module Daru
8
2
  module Plotting
9
3
  module Vector
@@ -27,9 +21,14 @@ module Daru
27
21
  type: :scatter
28
22
  }.merge(opts)
29
23
 
30
- x_axis = options[:type] == :scatter ? Array.new(@size) { |i| i } : @index.to_a
31
- plot = Nyaplot::Plot.new
32
- diagram = plot.add( options[:type], x_axis, @data.to_a )
24
+ x_axis = options[:type] == :scatter ? Array.new(@size) { |i| i } : @index.to_a
25
+ plot = Nyaplot::Plot.new
26
+ diagram =
27
+ if [:box, :histogram].include? options[:type]
28
+ plot.add(options[:type], @data.to_a)
29
+ else
30
+ plot.add(options[:type], x_axis, @data.to_a)
31
+ end
33
32
 
34
33
  yield plot, diagram if block_given?
35
34
 
@@ -37,4 +36,4 @@ module Daru
37
36
  end
38
37
  end
39
38
  end
40
- end
39
+ end if Daru.has_nyaplot?
@@ -5,13 +5,14 @@ require 'maths/statistics/vector.rb'
5
5
  require 'plotting/vector.rb'
6
6
  require 'accessors/array_wrapper.rb'
7
7
  require 'accessors/nmatrix_wrapper.rb'
8
+ require 'accessors/gsl_wrapper.rb'
8
9
 
9
10
  module Daru
10
11
  class Vector
11
12
  include Enumerable
12
13
  include Daru::Maths::Arithmetic::Vector
13
14
  include Daru::Maths::Statistics::Vector
14
- include Daru::Plotting::Vector
15
+ include Daru::Plotting::Vector if Daru.has_nyaplot?
15
16
 
16
17
  def each(&block)
17
18
  return to_enum(:each) unless block_given?
@@ -20,37 +21,55 @@ module Daru
20
21
  self
21
22
  end
22
23
 
23
- def map!(&block)
24
- return to_enum(:map!) unless block_given?
24
+ def each_index(&block)
25
+ return to_enum(:each_index) unless block_given?
25
26
 
26
- @data.map!(&block)
27
+ @index.each(&block)
27
28
  self
28
29
  end
29
30
 
30
- def map(&block)
31
- return to_enum(:map) unless block_given?
31
+ def each_with_index(&block)
32
+ return to_enum(:each_with_index) unless block_given?
32
33
 
33
- Daru::Vector.new @data.map(&block), name: @name, index: @index, dtype: @dtype
34
+ @index.each { |i| yield(self[i], i) }
35
+ self
34
36
  end
35
37
 
36
- alias_method :recode, :map
38
+ def map!(&block)
39
+ return to_enum(:map!) unless block_given?
40
+ @data.map!(&block)
41
+ update
42
+ self
43
+ end
37
44
 
45
+ # The name of the Daru::Vector. String.
38
46
  attr_reader :name
47
+ # The row index. Can be either Daru::Index or Daru::MultiIndex.
39
48
  attr_reader :index
49
+ # The total number of elements of the vector.
40
50
  attr_reader :size
51
+ # The underlying dtype of the Vector. Can be either :array, :nmatrix or :gsl.
41
52
  attr_reader :dtype
53
+ # If the dtype is :nmatrix, this attribute represents the data type of the
54
+ # underlying NMatrix object. See NMatrix docs for more details on NMatrix
55
+ # data types.
42
56
  attr_reader :nm_dtype
43
- attr_reader :nil_positions
57
+ # An Array of the positions in the vector that are being treated as 'missing'.
58
+ attr_reader :missing_positions
59
+ # Store a hash of labels for values. Supplementary only. Recommend using index
60
+ # for proper usage.
61
+ attr_accessor :labels
44
62
 
45
63
  # Create a Vector object.
64
+ #
46
65
  # == Arguments
47
66
  #
48
- # @param source[Array,Hash] - Supply elements in the form of an Array or a Hash. If Array, a
49
- # numeric index will be created if not supplied in the options. Specifying more
50
- # index elements than actual values in *source* will insert *nil* into the
51
- # surplus index elements. When a Hash is specified, the keys of the Hash are
52
- # taken as the index elements and the corresponding values as the values that
53
- # populate the vector.
67
+ # @param source[Array,Hash] - Supply elements in the form of an Array or a
68
+ # Hash. If Array, a numeric index will be created if not supplied in the
69
+ # options. Specifying more index elements than actual values in *source*
70
+ # will insert *nil* into the surplus index elements. When a Hash is specified,
71
+ # the keys of the Hash are taken as the index elements and the corresponding
72
+ # values as the values that populate the vector.
54
73
  #
55
74
  # == Options
56
75
  #
@@ -58,10 +77,14 @@ module Daru
58
77
  #
59
78
  # * +:index+ - Index of the vector
60
79
  #
61
- # * +:dtype+ - The underlying data type. Can be :array or :nmatrix. Default :array.
80
+ # * +:dtype+ - The underlying data type. Can be :array, :nmatrix or :gsl.
81
+ # Default :array.
62
82
  #
63
83
  # * +:nm_dtype+ - For NMatrix, the data type of the numbers. See the NMatrix docs for
64
- # further information on supported data type.
84
+ # further information on supported data type.
85
+ #
86
+ # * +:missing_values+ - An Array of the values that are to be treated as 'missing'.
87
+ # nil is the default missing value.
65
88
  #
66
89
  # == Usage
67
90
  #
@@ -79,7 +102,7 @@ module Daru
79
102
  name = opts[:name]
80
103
  set_name name
81
104
 
82
- @data = cast_vector_to(opts[:dtype], source, opts[:nm_dtype])
105
+ @data = cast_vector_to(opts[:dtype] || :array, source, opts[:nm_dtype])
83
106
  @index = create_index(index || @data.size)
84
107
 
85
108
  if @index.size > @data.size
@@ -90,10 +113,81 @@ module Daru
90
113
  end
91
114
 
92
115
  @possibly_changed_type = true
93
- set_nil_positions
116
+ set_missing_values opts[:missing_values]
117
+ set_missing_positions
94
118
  set_size
95
119
  end
96
120
 
121
+ # Create a new vector by specifying the size and an optional value
122
+ # and block to generate values.
123
+ #
124
+ # == Description
125
+ #
126
+ # The *new_with_size* class method lets you create a Daru::Vector
127
+ # by specifying the size as the argument. The optional block, if
128
+ # supplied, is run once for populating each element in the Vector.
129
+ #
130
+ # The result of each run of the block is the value that is ultimately
131
+ # assigned to that position in the Vector.
132
+ #
133
+ # == Options
134
+ # :value
135
+ # All the rest like .new
136
+ def self.new_with_size n, opts={}, &block
137
+ value = opts[:value]
138
+ opts.delete :value
139
+ if block
140
+ vector = Daru::Vector.new n.times.map { |i| block.call(i) }, opts
141
+ else
142
+ vector = Daru::Vector.new n.times.map { value }, opts
143
+ end
144
+ vector
145
+ end
146
+
147
+ # Create a vector using (almost) any object
148
+ # * Array: flattened
149
+ # * Range: transformed using to_a
150
+ # * Daru::Vector
151
+ # * Numeric and string values
152
+ #
153
+ # == Description
154
+ #
155
+ # The `Vector.[]` class method creates a vector from almost any
156
+ # object that has a `#to_a` method defined on it. It is similar
157
+ # to R's `c` method.
158
+ #
159
+ # == Usage
160
+ #
161
+ # a = Daru::Vector[1,2,3,4,6..10]
162
+ # #=>
163
+ # # <Daru::Vector:99448510 @name = nil @size = 9 >
164
+ # # nil
165
+ # # 0 1
166
+ # # 1 2
167
+ # # 2 3
168
+ # # 3 4
169
+ # # 4 6
170
+ # # 5 7
171
+ # # 6 8
172
+ # # 7 9
173
+ # # 8 10
174
+ def self.[](*args)
175
+ values = []
176
+ args.each do |a|
177
+ case a
178
+ when Array
179
+ values.concat a.flatten
180
+ when Daru::Vector
181
+ values.concat a.to_a
182
+ when Range
183
+ values.concat a.to_a
184
+ else
185
+ values << a
186
+ end
187
+ end
188
+ Daru::Vector.new(values)
189
+ end
190
+
97
191
  # Get one or more elements with specified index or a range.
98
192
  #
99
193
  # == Usage
@@ -106,6 +200,7 @@ module Daru
106
200
  # For vectors employing hierarchical multi index
107
201
  #
108
202
  def [](*indexes)
203
+ indexes.map! { |e| e.respond_to?(:to_sym) ? e.to_sym : e }
109
204
  location = indexes[0]
110
205
  if @index.is_a?(MultiIndex)
111
206
  result =
@@ -158,6 +253,19 @@ module Daru
158
253
  end
159
254
  end
160
255
 
256
+ # Just like in Hashes, you can specify the index label of the Daru::Vector
257
+ # and assign an element at that place in the Daru::Vector.
258
+ #
259
+ # == Usage
260
+ #
261
+ # v = Daru::Vector.new([1,2,3], index: [:a, :b, :c])
262
+ # v[:a] = 999
263
+ # #=>
264
+ # ##<Daru::Vector:90257920 @name = nil @size = 3 >
265
+ # # nil
266
+ # # a 999
267
+ # # b 2
268
+ # # c 3
161
269
  def []=(*location, value)
162
270
  cast(dtype: :array) if value.nil? and dtype != :array
163
271
 
@@ -182,7 +290,38 @@ module Daru
182
290
  end
183
291
 
184
292
  set_size
185
- set_nil_positions
293
+ set_missing_positions unless Daru.lazy_update
294
+ end
295
+
296
+ # The values to be treated as 'missing'. *nil* is the default missing
297
+ # type. To set missing values see the missing_values= method.
298
+ def missing_values
299
+ @missing_values.keys
300
+ end
301
+
302
+ # Assign an Array to treat certain values as 'missing'.
303
+ #
304
+ # == Usage
305
+ #
306
+ # v = Daru::Vector.new [1,2,3,4,5]
307
+ # v.missing_values = [3]
308
+ # v.update
309
+ # v.missing_positions
310
+ # #=> [2]
311
+ def missing_values= values
312
+ set_missing_values values
313
+ set_missing_positions unless Daru.lazy_update
314
+ end
315
+
316
+ # Method for updating the metadata (i.e. missing value positions) of the
317
+ # vector after assignment/deletion etc. are complete. This is provided so that
318
+ # time is not wasted in creating the metadata for the vector each time
319
+ # assignment/deletion of elements is done. Updating data this way is called
320
+ # lazy loading. To set or unset lazy loading, see the .lazy_update= method.
321
+ def update
322
+ if Daru.lazy_update
323
+ set_missing_positions
324
+ end
186
325
  end
187
326
 
188
327
  # Two vectors are equal if they have the exact same index values corresponding
@@ -199,21 +338,20 @@ module Daru
199
338
  end
200
339
  end
201
340
 
202
- def << element
203
- concat element
341
+ def head q=10
342
+ self[0..(q-1)]
204
343
  end
205
344
 
206
- def push element
207
- concat element
345
+ def tail q=10
346
+ self[(@size - q - 1)..(@size-1)]
208
347
  end
209
348
 
210
- def head q=10
211
- self[0..q]
349
+ # Reports whether missing data is present in the Vector.
350
+ def has_missing_data?
351
+ !missing_positions.empty?
212
352
  end
353
+ alias :flawed? :has_missing_data?
213
354
 
214
- def tail q=10
215
- self[-q..-1]
216
- end
217
355
 
218
356
  # Append an element to the vector by specifying the element and index
219
357
  def concat element, index=nil
@@ -231,8 +369,10 @@ module Daru
231
369
  end
232
370
  @data[@index[index]] = element
233
371
  set_size
234
- set_nil_positions
372
+ set_missing_positions unless Daru.lazy_update
235
373
  end
374
+ alias :push :concat
375
+ alias :<< :concat
236
376
 
237
377
  # Cast a vector to a new data type.
238
378
  #
@@ -240,11 +380,11 @@ module Daru
240
380
  #
241
381
  # * +:dtype+ - :array for Ruby Array. :nmatrix for NMatrix. :gsl for GSL::Vector.
242
382
  def cast opts={}
243
- dtype = opts[:dtype]
383
+ dt = opts[:dtype]
244
384
  raise ArgumentError, "Unsupported dtype #{opts[:dtype]}" unless
245
- dtype == :array or dtype == :nmatrix
385
+ dt == :array or dt == :nmatrix or dt == :gsl
246
386
 
247
- @data = cast_vector_to dtype
387
+ @data = cast_vector_to dt unless @dtype == dt
248
388
  end
249
389
 
250
390
  # Delete an element by value
@@ -264,7 +404,7 @@ module Daru
264
404
  end
265
405
 
266
406
  set_size
267
- set_nil_positions
407
+ set_missing_positions unless Daru.lazy_update
268
408
  end
269
409
 
270
410
  # The type of data contained in the vector. Can be :object or :numeric. If
@@ -308,11 +448,19 @@ module Daru
308
448
  Daru::Vector.new uniq_vector, name: @name, index: new_index, dtype: @dtype
309
449
  end
310
450
 
451
+ def any? &block
452
+ @data.data.any?(&block)
453
+ end
454
+
455
+ def all? &block
456
+ @data.data.all?(&block)
457
+ end
458
+
311
459
  # Sorts a vector according to its values. If a block is specified, the contents
312
- # will be evaluated and data will be swapped whenever the block evaluates
313
- # to *true*. Defaults to ascending order sorting. Any missing values will be
314
- # put at the end of the vector. Preserves indexing. Default sort algorithm is
315
- # quick sort.
460
+ # will be evaluated and data will be swapped whenever the block evaluates
461
+ # to *true*. Defaults to ascending order sorting. Any missing values will be
462
+ # put at the end of the vector. Preserves indexing. Default sort algorithm is
463
+ # quick sort.
316
464
  #
317
465
  # == Options
318
466
  #
@@ -323,7 +471,7 @@ module Daru
323
471
  #
324
472
  # v = Daru::Vector.new ["My first guitar", "jazz", "guitar"]
325
473
  # # Say you want to sort these strings by length.
326
- # v.sort { |a,b| a.length <=> b.length }
474
+ # v.sort(ascending: false) { |a,b| a.length <=> b.length }
327
475
  def sort opts={}, &block
328
476
  opts = {
329
477
  ascending: true,
@@ -339,18 +487,138 @@ module Daru
339
487
  Daru::Vector.new(vector, index: create_index(index), name: @name, dtype: @dtype)
340
488
  end
341
489
 
342
- # Just sort the data and get an Array in return using Enumerable#sort. Non-destructive.
490
+ # Just sort the data and get an Array in return using Enumerable#sort.
491
+ # Non-destructive.
343
492
  def sorted_data &block
344
493
  @data.to_a.sort(&block)
345
494
  end
346
495
 
347
- # Returns *true* if the value passed actually exists in the vector.
496
+ # Returns *true* if the value passed actually exists or is not marked as
497
+ # a *missing value*.
348
498
  def exists? value
349
- !self[index_of(value)].nil?
499
+ !@missing_values.has_key?(self[index_of(value)])
500
+ end
501
+
502
+ # Like map, but returns a Daru::Vector with the returned values.
503
+ def recode dt=nil, &block
504
+ return to_enum(:recode) unless block_given?
505
+
506
+ dup.recode! dt, &block
507
+ end
508
+
509
+ # Destructive version of #recode.
510
+ def recode! dt=nil, &block
511
+ return to_enum(:recode!) unless block_given?
512
+
513
+ @data.map!(&block).data
514
+ @data = cast_vector_to(dt || @dtype)
515
+ self
516
+ end
517
+
518
+ def delete_if &block
519
+ return to_enum(:delete_if) unless block_given?
520
+
521
+ keep_e = []
522
+ keep_i = []
523
+ each_with_index do |n, i|
524
+ if yield(n)
525
+ keep_e << n
526
+ keep_i << i
527
+ end
528
+ end
529
+
530
+ @data = cast_vector_to @dtype, keep_e
531
+ @index = @index.is_a?(MultiIndex) ? MultiIndex.new(keep_i) : Index.new(keep_i)
532
+ set_missing_positions unless Daru.lazy_update
533
+ set_size
534
+
535
+ self
536
+ end
537
+
538
+ # Reports all values that don't comply with a condition.
539
+ # Returns a hash with the index of data and the invalid data.
540
+ def verify &block
541
+ h = {}
542
+ (0...size).each do |i|
543
+ if !(yield @data[i])
544
+ h[i] = @data[i]
545
+ end
546
+ end
547
+
548
+ h
549
+ end
550
+
551
+ # Return an Array with the data split by a separator.
552
+ # a=Daru::Vector.new(["a,b","c,d","a,b","d"])
553
+ # a.splitted
554
+ # =>
555
+ # [["a","b"],["c","d"],["a","b"],["d"]]
556
+ def splitted sep=","
557
+ @data.map do |s|
558
+ if s.nil?
559
+ nil
560
+ elsif s.respond_to? :split
561
+ s.split sep
562
+ else
563
+ [s]
564
+ end
565
+ end
566
+ end
567
+
568
+ # Returns a hash of Vectors, defined by the different values
569
+ # defined on the fields
570
+ # Example:
571
+ #
572
+ # a=Daru::Vector.new(["a,b","c,d","a,b"])
573
+ # a.split_by_separator
574
+ # => {"a"=>#<Daru::Vector:0x7f2dbcc09d88
575
+ # @data=[1, 0, 1]>,
576
+ # "b"=>#<Daru::Vector:0x7f2dbcc09c48
577
+ # @data=[1, 0, 1]>,
578
+ # "c"=>#<Daru::Vector:0x7f2dbcc09b08
579
+ # @data=[0, 1, 0]>, "d"=>#<Daru::Vector @data=[0, 1, 0]>}
580
+ #
581
+ def split_by_separator sep=","
582
+ split_data = splitted sep
583
+ factors = split_data.flatten.uniq.compact
584
+
585
+ out = factors.inject({}) do |h,x|
586
+ h[x] = []
587
+ h
588
+ end
589
+
590
+ split_data.each do |r|
591
+ if r.nil?
592
+ factors.each do |f|
593
+ out[f].push(nil)
594
+ end
595
+ else
596
+ factors.each do |f|
597
+ out[f].push(r.include?(f) ? 1:0)
598
+ end
599
+ end
600
+ end
601
+
602
+ out.inject({}) do |s,v|
603
+ s[v[0]] = Daru::Vector.new v[1]
604
+ s
605
+ end
606
+ end
607
+
608
+ def split_by_separator_freq(sep=",")
609
+ split_by_separator(sep).inject({}) do |a,v|
610
+ a[v[0]] = v[1].inject { |s,x| s+x.to_i }
611
+ a
612
+ end
613
+ end
614
+
615
+ def reset_index!
616
+ @index = Daru::Index.new(Array.new(size) { |i| i })
617
+ self
350
618
  end
351
619
 
352
620
  # Returns a vector which has *true* in the position where the element in self
353
- # is nil, and false otherwise.
621
+ # is nil, and false otherwise.
354
622
  #
355
623
  # == Usage
356
624
  #
@@ -383,26 +651,34 @@ module Daru
383
651
  end
384
652
 
385
653
  # Replace all nils in the vector with the value passed as an argument. Destructive.
386
- # See #replace_nils for non-destructive version
654
+ # See #replace_nils for non-destructive version
387
655
  #
388
656
  # == Arguments
389
657
  #
390
658
  # * +replacement+ - The value which should replace all nils
391
659
  def replace_nils! replacement
392
- nil_positions.each do |idx|
660
+ missing_positions.each do |idx|
393
661
  self[idx] = replacement
394
662
  end
395
663
 
396
664
  self
397
665
  end
398
666
 
667
+ def detach_index
668
+ Daru::DataFrame.new({
669
+ index: @index.to_a.map(&:to_s),
670
+ vector: @data.to_a
671
+ })
672
+ end
673
+
399
674
  # Non-destructive version of #replace_nils!
400
675
  def replace_nils replacement
401
676
  self.dup.replace_nils!(replacement)
402
677
  end
403
678
 
679
+ # number of non-missing elements
404
680
  def n_valid
405
- @size
681
+ @size - missing_positions.size
406
682
  end
407
683
 
408
684
  # Returns *true* if an index exists
@@ -425,6 +701,20 @@ module Daru
425
701
  end
426
702
  end
427
703
 
704
+ # If dtype != gsl, will convert data to GSL::Vector with to_a. Otherwise returns
705
+ # the stored GSL::Vector object.
706
+ def to_gsl
707
+ if Daru.has_gsl?
708
+ if dtype == :gsl
709
+ return @data.data
710
+ else
711
+ GSL::Vector.alloc only_valid(:array).to_a
712
+ end
713
+ else
714
+ raise NoMethodError, "Install gsl-nmatrix for access to this functionality."
715
+ end
716
+ end
717
+
428
718
  # Convert to hash. Hash keys are indexes and values are the corresponding elements
429
719
  def to_hash
430
720
  @index.inject({}) do |hsh, index|
@@ -446,12 +736,24 @@ module Daru
446
736
  # Convert to html for iruby
447
737
  def to_html threshold=30
448
738
  name = @name || 'nil'
449
- html = '<table>' + '<tr><th> </th><th>' + name.to_s + '</th></tr>'
739
+ html = "<table>" +
740
+ "<tr>" +
741
+ "<th colspan=\"2\">" +
742
+ "Daru::Vector:#{self.object_id} " + " size: #{size}" +
743
+ "</th>" +
744
+ "</tr>"
745
+ html += '<tr><th> </th><th>' + name.to_s + '</th></tr>'
450
746
  @index.each_with_index do |index, num|
451
747
  html += '<tr><td>' + index.to_s + '</td>' + '<td>' + self[index].to_s + '</td></tr>'
452
748
 
453
749
  if num > threshold
454
750
  html += '<tr><td>...</td><td>...</td></tr>'
751
+
752
+ last_index = @index.to_a.last
753
+ html += '<tr>' +
754
+ '<td>' + last_index.to_s + '</td>' +
755
+ '<td>' + self[last_index].to_s + '</td>' +
756
+ '</tr>'
455
757
  break
456
758
  end
457
759
  end
@@ -464,11 +766,45 @@ module Daru
464
766
  to_html
465
767
  end
466
768
 
769
+ # Create a summary of the Vector using Report Builder.
770
+ def summary(method = :to_text)
771
+ ReportBuilder.new(no_title: true).add(self).send(method)
772
+ end
773
+
774
+ def report_building b
775
+ b.section(:name => name) do |s|
776
+ s.text "n :#{size}"
777
+ s.text "n valid:#{n_valid}"
778
+ if @type == :object
779
+ s.text "factors: #{factors.to_a.join(',')}"
780
+ s.text "mode: #{mode}"
781
+
782
+ s.table(:name => "Distribution") do |t|
783
+ frequencies.sort_by { |a| a.to_s }.each do |k,v|
784
+ key = @index.include?(k) ? @index[k] : k
785
+ t.row [key, v , ("%0.2f%%" % (v.quo(n_valid)*100))]
786
+ end
787
+ end
788
+ end
789
+
790
+ s.text "median: #{median.to_s}" if (@type==:numeric or @type==:numeric)
791
+ if @type==:numeric
792
+ s.text "mean: %0.4f" % mean
793
+ if sd
794
+ s.text "std.dev.: %0.4f" % sd
795
+ s.text "std.err.: %0.4f" % se
796
+ s.text "skew: %0.4f" % skew
797
+ s.text "kurtosis: %0.4f" % kurtosis
798
+ end
799
+ end
800
+ end
801
+ end
802
+
467
803
  # Overrides original inspect for pretty printing in irb
468
804
  def inspect spacing=20, threshold=15
469
805
  longest = [@name.to_s.size,
470
- @index.to_a.map(&:to_s).map(&:size).max,
471
- @data .map(&:to_s).map(&:size).max,
806
+ (@index.to_a.map(&:to_s).map(&:size).max || 0),
807
+ (@data .map(&:to_s).map(&:size).max || 0),
472
808
  'nil'.size].max
473
809
 
474
810
  content = ""
@@ -503,6 +839,11 @@ module Daru
503
839
  #
504
840
  # @param new_name [Symbol] The new name.
505
841
  def rename new_name
842
+ if new_name.is_a?(Numeric)
843
+ @name = new_name
844
+ return
845
+ end
846
+
506
847
  @name = new_name.to_sym
507
848
  end
508
849
 
@@ -511,12 +852,176 @@ module Daru
511
852
  Daru::Vector.new @data.dup, name: @name, index: @index.dup
512
853
  end
513
854
 
855
+ # == Bootstrap
856
+ # Generate +nr+ resamples (with replacement) of size +s+
857
+ # from vector, computing each estimate from +estimators+
858
+ # over each resample.
859
+ # +estimators+ could be
860
+ # a) Hash with variable names as keys and lambdas as values
861
+ # a.bootstrap(:log_s2=>lambda {|v| Math.log(v.variance)},1000)
862
+ # b) Array with names of methods to bootstrap
863
+ # a.bootstrap([:mean, :sd],1000)
864
+ # c) A single method to bootstrap
865
+ # a.bootstrap(:mean, 1000)
866
+ # If +s+ is nil, it is set to the vector size by default.
867
+ #
868
+ # Returns a DataFrame where each vector is a vector
869
+ # of length +nr+ containing the computed resample estimates.
870
+ def bootstrap(estimators, nr, s=nil)
871
+ s ||= size
872
+ h_est, es, bss = prepare_bootstrap(estimators)
873
+
874
+ nr.times do |i|
875
+ bs = sample_with_replacement(s)
876
+ es.each do |estimator|
877
+ bss[estimator].push(h_est[estimator].call(bs))
878
+ end
879
+ end
880
+
881
+ es.each do |est|
882
+ bss[est] = Daru::Vector.new bss[est]
883
+ end
884
+
885
+ Daru::DataFrame.new bss
886
+ end
887
+
888
+ # == Jackknife
889
+ # Returns a dataset with jackknife delete-+k+ +estimators+
890
+ # +estimators+ could be:
891
+ # a) Hash with variable names as keys and lambdas as values
892
+ # a.jackknife(:log_s2=>lambda {|v| Math.log(v.variance)})
893
+ # b) Array with method names to jacknife
894
+ # a.jackknife([:mean, :sd])
895
+ # c) A single method to jacknife
896
+ # a.jackknife(:mean)
897
+ # +k+ represents the block size for block jackknife. By default
898
+ # it is set to 1, for classic delete-one jackknife.
899
+ #
900
+ # Returns a dataset where each vector is a vector
901
+ # of length +cases+/+k+ containing the computed jackknife estimates.
902
+ #
903
+ # == Reference:
904
+ # * Sawyer, S. (2005). Resampling Data: Using a Statistical Jacknife.
905
+ def jackknife(estimators, k=1)
906
+ raise "n should be divisible by k:#{k}" unless size % k==0
907
+
908
+ nb = (size / k).to_i
909
+ h_est, es, ps = prepare_bootstrap(estimators)
910
+
911
+ est_n = es.inject({}) do |h,v|
912
+ h[v] = h_est[v].call(self)
913
+ h
914
+ end
915
+
916
+ nb.times do |i|
917
+ other = @data.dup
918
+ other.slice!(i*k, k)
919
+ other = Daru::Vector.new other
920
+
921
+ es.each do |estimator|
922
+ # Add pseudovalue
923
+ ps[estimator].push(
924
+ nb * est_n[estimator] - (nb-1) * h_est[estimator].call(other))
925
+ end
926
+ end
927
+
928
+ es.each do |est|
929
+ ps[est] = Daru::Vector.new ps[est]
930
+ end
931
+ Daru::DataFrame.new ps
932
+ end
933
+
934
+ # Creates a new vector consisting only of non-nil data
935
+ #
936
+ # == Arguments
937
+ #
938
+ # @as_a [Symbol] Passing :array will return only the elements
939
+ # as an Array. Otherwise will return a Daru::Vector.
940
+ #
941
+ # @duplicate [Boolean] In case no missing data is found in the
942
+ # vector, setting this to false will return the same vector.
943
+ # Otherwise, a duplicate will be returned irrespective of
944
+ # presence of missing data.
945
+ def only_valid as_a=:vector, duplicate=true
946
+ return self.dup if !has_missing_data? and as_a == :vector and duplicate
947
+ return self if !has_missing_data? and as_a == :vector and !duplicate
948
+ return self.to_a if !has_missing_data? and as_a != :vector
949
+
950
+ new_index = @index.to_a - missing_positions
951
+ new_vector = new_index.map do |idx|
952
+ self[idx]
953
+ end
954
+
955
+ return new_vector if as_a != :vector
956
+
957
+ Daru::Vector.new new_vector, index: new_index, name: @name, dtype: dtype
958
+ end
959
+
960
+ # Returns a Vector containing only missing data (preserves indexes).
961
+ def only_missing as_a=:vector
962
+ if as_a == :vector
963
+ self[*missing_positions]
964
+ elsif as_a == :array
965
+ self[*missing_positions].to_a
966
+ end
967
+ end
968
+
969
+ # Returns a Vector with only numerical data. Missing data is included
970
+ # but non-Numeric objects are excluded. Preserves index.
971
+ def only_numerics
972
+ numeric_indexes = []
973
+
974
+ each_with_index do |v, i|
975
+ numeric_indexes << i if(v.kind_of?(Numeric) or @missing_values.has_key?(v))
976
+ end
977
+
978
+ self[*numeric_indexes]
979
+ end
980
+
981
+ # Returns the database type for the vector, according to its content
982
+ def db_type(dbs=:mysql)
983
+ # First look for date-like values, then for any non-numeric character
984
+ if @data.find {|v| v.to_s=~/\d{2,2}-\d{2,2}-\d{4,4}/} or @data.find {|v| v.to_s=~/\d{4,4}-\d{2,2}-\d{2,2}/}
985
+ return "DATE"
986
+ elsif @data.find {|v| v.to_s=~/[^0-9e.-]/ }
987
+ return "VARCHAR (255)"
988
+ elsif @data.find {|v| v.to_s=~/\./}
989
+ return "DOUBLE"
990
+ else
991
+ return "INTEGER"
992
+ end
993
+ end
994
+
514
995
  # Copies the structure of the vector (i.e. the index, size, etc.) and fills all
515
- # all values with nils.
996
+ # values with nils.
516
997
  def clone_structure
517
998
  Daru::Vector.new(([nil]*@size), name: @name, index: @index.dup)
518
999
  end
519
1000
 
1001
+ # Save the vector to a file
1002
+ #
1003
+ # == Arguments
1004
+ #
1005
+ # * filename - Path of file where the vector is to be saved
1006
+ def save filename
1007
+ Daru::IO.save self, filename
1008
+ end
1009
+
1010
+ def _dump(depth) # :nodoc:
1011
+ Marshal.dump({
1012
+ data: @data.to_a,
1013
+ dtype: @dtype,
1014
+ name: @name,
1015
+ index: @index,
1016
+ missing_values: @missing_values})
1017
+ end
1018
+
1019
+ def self._load(data) # :nodoc:
1020
+ h = Marshal.load(data)
1021
+ Daru::Vector.new(h[:data], index: h[:index],
1022
+ name: h[:name], dtype: h[:dtype], missing_values: h[:missing_values])
1023
+ end
1024
+
520
1025
  def daru_vector *name
521
1026
  self
522
1027
  end
@@ -535,6 +1040,26 @@ module Daru
535
1040
 
536
1041
  private
537
1042
 
1043
+ # For a single estimator name, an array, or a hash of estimator methods, returns
1044
+ # an array with three elements
1045
+ # 1.- A hash with estimators names as keys and lambdas as values
1046
+ # 2.- An array with estimators names
1047
+ # 3.- A Hash with estimators names as keys and empty arrays as values
1048
+ def prepare_bootstrap(estimators)
1049
+ h_est = estimators
1050
+ h_est = [h_est] unless h_est.is_a?(Array) or h_est.is_a?(Hash)
1051
+
1052
+ if h_est.is_a? Array
1053
+ h_est = h_est.inject({}) do |h, est|
1054
+ h[est] = lambda { |v| Daru::Vector.new(v).send(est) }
1055
+ h
1056
+ end
1057
+ end
1058
+ bss = h_est.keys.inject({}) { |h,v| h[v] = []; h }
1059
+
1060
+ [h_est, h_est.keys, bss]
1061
+ end
1062
+
538
1063
  def quick_sort vector, index, order, &block
539
1064
  recursive_quick_sort vector, index, order, 0, @size-1, &block
540
1065
  [vector, index]
@@ -599,18 +1124,17 @@ module Daru
599
1124
  end
600
1125
 
601
1126
  # Note: To maintain sanity, this _MUST_ be the _ONLY_ place in daru where the
602
- # @dtype variable is set and the underlying data type of vector changed.
1127
+ # @dtype variable is set and the underlying data type of vector changed.
603
1128
  def cast_vector_to dtype, source=nil, nm_dtype=nil
604
- source = @data if source.nil?
605
- return @data if @dtype and @dtype == dtype
1129
+ source = @data.to_a if source.nil?
606
1130
 
607
1131
  new_vector =
608
1132
  case dtype
609
- when :array then Daru::Accessors::ArrayWrapper.new(source.to_a.dup, self)
610
- when :nmatrix then Daru::Accessors::NMatrixWrapper.new(source.to_a.dup,
611
- self, nm_dtype)
1133
+ when :array then Daru::Accessors::ArrayWrapper.new(source, self)
1134
+ when :nmatrix then Daru::Accessors::NMatrixWrapper.new(source, self, nm_dtype)
1135
+ when :gsl then Daru::Accessors::GSLWrapper.new(source, self)
612
1136
  when :mdarray then raise NotImplementedError, "MDArray not yet supported."
613
- else Daru::Accessors::ArrayWrapper.new(source.dup, self)
1137
+ else raise "Unknown dtype #{dtype}"
614
1138
  end
615
1139
 
616
1140
  @dtype = dtype || :array
@@ -649,12 +1173,11 @@ module Daru
649
1173
  end
650
1174
  end
651
1175
 
652
- def set_nil_positions
653
- @nil_positions = []
1176
+ def set_missing_positions
1177
+ @missing_positions = []
654
1178
  @index.each do |e|
655
- @nil_positions << e if(self[e].nil?)
1179
+ @missing_positions << e if (@missing_values.has_key?(self[e]))
656
1180
  end
657
- @nil_positions.uniq!
658
1181
  end
659
1182
 
660
1183
  def create_index potential_index
@@ -669,5 +1192,17 @@ module Daru
669
1192
  pos = index_for location
670
1193
  pos ? @data[pos] : nil
671
1194
  end
1195
+
1196
+ # Set up missing_values. The missing_values instance variable is set
1197
+ # as a Hash for faster lookup times.
1198
+ def set_missing_values values_arry
1199
+ @missing_values = {}
1200
+ @missing_values[nil] = 0
1201
+ if values_arry
1202
+ values_arry.each do |e|
1203
+ @missing_values[e] = 0
1204
+ end
1205
+ end
1206
+ end
672
1207
  end
673
1208
  end