daru 0.0.5 → 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (48) hide show
  1. checksums.yaml +4 -4
  2. data/.build.sh +14 -0
  3. data/.travis.yml +26 -4
  4. data/CONTRIBUTING.md +31 -0
  5. data/Gemfile +1 -2
  6. data/{History.txt → History.md} +110 -44
  7. data/README.md +21 -288
  8. data/Rakefile +1 -0
  9. data/daru.gemspec +12 -8
  10. data/lib/daru.rb +36 -1
  11. data/lib/daru/accessors/array_wrapper.rb +8 -3
  12. data/lib/daru/accessors/gsl_wrapper.rb +113 -0
  13. data/lib/daru/accessors/nmatrix_wrapper.rb +6 -17
  14. data/lib/daru/core/group_by.rb +0 -1
  15. data/lib/daru/dataframe.rb +1192 -83
  16. data/lib/daru/extensions/rserve.rb +21 -0
  17. data/lib/daru/index.rb +14 -0
  18. data/lib/daru/io/io.rb +170 -8
  19. data/lib/daru/maths/arithmetic/dataframe.rb +4 -3
  20. data/lib/daru/maths/arithmetic/vector.rb +4 -4
  21. data/lib/daru/maths/statistics/dataframe.rb +48 -27
  22. data/lib/daru/maths/statistics/vector.rb +215 -33
  23. data/lib/daru/monkeys.rb +53 -7
  24. data/lib/daru/multi_index.rb +21 -4
  25. data/lib/daru/plotting/dataframe.rb +83 -25
  26. data/lib/daru/plotting/vector.rb +9 -10
  27. data/lib/daru/vector.rb +596 -61
  28. data/lib/daru/version.rb +3 -0
  29. data/spec/accessors/wrappers_spec.rb +51 -0
  30. data/spec/core/group_by_spec.rb +0 -2
  31. data/spec/daru_spec.rb +58 -0
  32. data/spec/dataframe_spec.rb +768 -73
  33. data/spec/extensions/rserve_spec.rb +52 -0
  34. data/spec/fixtures/bank2.dat +200 -0
  35. data/spec/fixtures/repeated_fields.csv +7 -0
  36. data/spec/fixtures/scientific_notation.csv +4 -0
  37. data/spec/fixtures/test_xls.xls +0 -0
  38. data/spec/io/io_spec.rb +161 -24
  39. data/spec/math/arithmetic/dataframe_spec.rb +26 -7
  40. data/spec/math/arithmetic/vector_spec.rb +8 -0
  41. data/spec/math/statistics/dataframe_spec.rb +16 -1
  42. data/spec/math/statistics/vector_spec.rb +215 -47
  43. data/spec/spec_helper.rb +21 -2
  44. data/spec/vector_spec.rb +368 -12
  45. metadata +99 -16
  46. data/lib/version.rb +0 -3
  47. data/notebooks/grouping_splitting_pivots.ipynb +0 -529
  48. data/notebooks/intro_with_music_data_.ipynb +0 -303
data/Rakefile CHANGED
@@ -1,4 +1,5 @@
1
1
  require 'rspec/core/rake_task'
2
+ require 'bundler/gem_tasks'
2
3
 
3
4
  RSpec::Core::RakeTask.new(:spec)
4
5
 
@@ -1,9 +1,9 @@
1
1
  # coding: utf-8
2
2
  $:.unshift File.expand_path("../lib", __FILE__)
3
3
 
4
- require 'version.rb'
4
+ require 'daru/version.rb'
5
5
 
6
- DESCRIPTION = <<MSG
6
+ Daru::DESCRIPTION = <<MSG
7
7
  Daru (Data Analysis in RUby) is a library for analysis, manipulation and visualization
8
8
  of data.
9
9
 
@@ -18,7 +18,7 @@ Gem::Specification.new do |spec|
18
18
  spec.authors = ['Sameer Deshmukh']
19
19
  spec.email = ['sameer.deshmukh93@gmail.com']
20
20
  spec.summary = %q{Data Analysis in RUby}
21
- spec.description = DESCRIPTION
21
+ spec.description = Daru::DESCRIPTION
22
22
  spec.homepage = "http://github.com/v0dro/daru"
23
23
  spec.license = 'BSD-2'
24
24
 
@@ -27,12 +27,16 @@ Gem::Specification.new do |spec|
27
27
  spec.test_files = spec.files.grep(%r{^(test|spec|features)/})
28
28
  spec.require_paths = ["lib"]
29
29
 
30
- spec.add_development_dependency 'bundler'
30
+ spec.add_runtime_dependency 'reportbuilder', '~> 1.4'
31
+ spec.add_runtime_dependency 'spreadsheet', '~> 1.0.3'
32
+
33
+ spec.add_development_dependency 'bundler', '~> 1.10'
31
34
  spec.add_development_dependency 'rake'
35
+ spec.add_development_dependency 'rserve-client', '~> 0.3'
32
36
  spec.add_development_dependency 'rspec'
33
37
  spec.add_development_dependency 'awesome_print'
34
- spec.add_development_dependency 'nyaplot'
35
- if RUBY_ENGINE != 'jruby'
36
- spec.add_development_dependency 'nmatrix', '~> 0.1.0'
37
- end
38
+ spec.add_development_dependency 'nyaplot', '~> 0.1.5'
39
+ spec.add_development_dependency 'nmatrix', '~> 0.1.0'
40
+ spec.add_development_dependency 'distribution', '~> 0.7'
41
+ spec.add_development_dependency 'gsl-nmatrix', '~>1.17'
38
42
  end
@@ -2,10 +2,45 @@ def jruby?
2
2
  RUBY_ENGINE == 'jruby'
3
3
  end
4
4
 
5
- require 'csv'
5
+ module Daru
6
+ SPLIT_TOKEN = ','
7
+ class << self
8
+ @@lazy_update = false
9
+
10
+ # A variable which will set whether Vector metadata is updated immediately or lazily.
11
+ # Call the #update method every time values are set or removed in order to update
12
+ # metadata like positions of missing values.
13
+ attr_accessor :lazy_update
14
+
15
+ def create_has_library(library)
16
+ define_singleton_method("has_#{library}?") do
17
+ cv = "@@#{library}"
18
+ unless class_variable_defined? cv
19
+ begin
20
+ require library.to_s
21
+ class_variable_set(cv, true)
22
+ rescue LoadError
23
+ class_variable_set(cv, false)
24
+ end
25
+ end
26
+ class_variable_get(cv)
27
+ end
28
+ end
29
+ end
30
+
31
+ create_has_library :gsl
32
+ create_has_library :nmatrix
33
+ create_has_library :nyaplot
34
+ end
35
+
36
+ autoload :Spreadsheet, 'spreadsheet'
37
+ autoload :CSV, 'csv'
38
+
6
39
  require 'matrix'
7
40
  require 'securerandom'
41
+ require 'reportbuilder'
8
42
 
43
+ require 'daru/version.rb'
9
44
  require 'daru/index.rb'
10
45
  require 'daru/multi_index.rb'
11
46
  require 'daru/vector.rb'
@@ -3,13 +3,18 @@ module Daru
3
3
  # Internal class for wrapping ruby array
4
4
  class ArrayWrapper
5
5
  include Enumerable
6
+ extend Forwardable
6
7
 
8
+ def_delegators :@data, :slice!
9
+
7
10
  def each(&block)
8
11
  @data.each(&block)
12
+ self
9
13
  end
10
14
 
11
15
  def map!(&block)
12
16
  @data.map!(&block)
17
+ self
13
18
  end
14
19
 
15
20
  attr_accessor :size
@@ -22,8 +27,8 @@ module Daru
22
27
  set_size
23
28
  end
24
29
 
25
- def [] index
26
- @data[index]
30
+ def [] *index
31
+ @data[*index]
27
32
  end
28
33
 
29
34
  def []= index, value
@@ -62,7 +67,7 @@ module Daru
62
67
  end
63
68
 
64
69
  def mean
65
- sum.quo(@size - @context.nil_positions.size).to_f
70
+ sum.quo(@size - @context.missing_positions.size).to_f
66
71
  end
67
72
 
68
73
  def product
@@ -0,0 +1,113 @@
1
+ module Daru
2
+ module Accessors
3
+ module GSLStatistics
4
+ def vector_standardized_compute(m,sd)
5
+ Daru::Vector.new @data.collect { |x| (x.to_f - m).quo(sd) }, dtype: :gsl,
6
+ index: @context.index, name: @context.name
7
+ end
8
+
9
+ def vector_centered_compute(m)
10
+ Daru::Vector.new @data.collect {|x| (x.to_f - m)}, dtype: :gsl,
11
+ index: @context.index, name: @context.name
12
+ end
13
+
14
+ def sample_with_replacement(sample=1)
15
+ r = GSL::Rng.alloc(GSL::Rng::MT19937,rand(10000))
16
+ Daru::Vector.new(r.sample(@data, sample).to_a, dtype: :gsl,
17
+ index: @context.index, name: @context.name)
18
+ end
19
+
20
+ def sample_without_replacement(sample=1)
21
+ r = GSL::Rng.alloc(GSL::Rng::MT19937,rand(10000))
22
+ r.choose(@data, sample).to_a
23
+ end
24
+
25
+ def median
26
+ GSL::Stats::median_from_sorted_data(@data.sort)
27
+ end
28
+
29
+ def variance_sample(m)
30
+ @data.variance_m
31
+ end
32
+
33
+ def standard_deviation_sample(m)
34
+ @data.sd(m)
35
+ end
36
+
37
+ def variance_population(m)
38
+ @data.variance_with_fixed_mean(m)
39
+ end
40
+
41
+ def standard_deviation_population m
42
+ @data.sd_with_fixed_mean(m)
43
+ end
44
+
45
+ def skew
46
+ @data.skew
47
+ end
48
+
49
+ def kurtosis
50
+ @data.kurtosis
51
+ end
52
+ end
53
+
54
+ class GSLWrapper
55
+ include Enumerable
56
+ extend Forwardable
57
+ include Daru::Accessors::GSLStatistics
58
+
59
+ def_delegators :@data, :[], :size, :to_a, :each, :mean,
60
+ :sum, :prod, :max, :min
61
+
62
+ alias :product :prod
63
+
64
+ attr_reader :data
65
+
66
+ def each(&block)
67
+ @data.each(&block)
68
+ self
69
+ end
70
+
71
+ def map!(&block)
72
+ @data.map!(&block)
73
+ self
74
+ end
75
+
76
+ def initialize data, context
77
+ @data = ::GSL::Vector.alloc(data)
78
+ @context = context
79
+ end
80
+
81
+ def []= index, element
82
+ if index == size
83
+ push element
84
+ else
85
+ @data[index] = element
86
+ end
87
+ end
88
+
89
+ def delete_at index
90
+ @data.delete_at index
91
+ end
92
+
93
+ def index key
94
+ @data.to_a.index key
95
+ end
96
+
97
+ def push value
98
+ @data = @data.concat value
99
+ self
100
+ end
101
+ alias :<< :push
102
+ alias :concat :push
103
+
104
+ def dup
105
+ GSLWrapper.new(@data.to_a, @context)
106
+ end
107
+
108
+ def == other
109
+ @data == other.data
110
+ end
111
+ end
112
+ end
113
+ end if Daru.has_gsl?
@@ -1,9 +1,3 @@
1
- begin
2
- require 'nmatrix' unless jruby?
3
- rescue LoadError => e
4
- puts "Please install the nmatrix gem for fast and efficient data storage."
5
- end
6
-
7
1
  module Daru
8
2
  module Accessors
9
3
  # Internal class for wrapping NMatrix
@@ -12,23 +6,18 @@ module Daru
12
6
 
13
7
  def each(&block)
14
8
  @data[0...@size].each(&block)
15
- end
16
-
17
- def map(&block)
18
- @data[0...@size].map(&block)
9
+ self
19
10
  end
20
11
 
21
12
  def map!(&block)
22
13
  @data = NMatrix.new [@size*2], map(&block).to_a, dtype: nm_dtype
14
+ self
23
15
  end
24
16
 
25
17
  def inject(*args, &block)
26
18
  @data[0...@size].inject(*args, &block)
27
19
  end
28
20
 
29
- alias_method :recode, :map
30
- alias_method :recode!, :map!
31
-
32
21
  attr_reader :size, :data, :nm_dtype
33
22
 
34
23
  def initialize vector, context, nm_dtype=:int32
@@ -39,8 +28,8 @@ module Daru
39
28
  # init with twice the storage for reducing the need to resize
40
29
  end
41
30
 
42
- def [] index
43
- return @data[index] if index < @size
31
+ def [] *index
32
+ return @data[*index] if index[0] < @size
44
33
  nil
45
34
  end
46
35
 
@@ -79,7 +68,7 @@ module Daru
79
68
  end
80
69
 
81
70
  def dup
82
- NMatrixWrapper.new @data.to_a, @context, @nm_dtype
71
+ NMatrixWrapper.new @data[0...@size].to_a, @context, @nm_dtype
83
72
  end
84
73
 
85
74
  def resize size = @size*2
@@ -109,4 +98,4 @@ module Daru
109
98
  end
110
99
  end
111
100
  end
112
- end
101
+ end if Daru.has_nmatrix?
@@ -171,7 +171,6 @@ module Daru
171
171
  else
172
172
  arry.map { |e| e.is_a?(Numeric) ? e : e.to_sym }
173
173
  end
174
-
175
174
  symbolized_arry
176
175
  end
177
176
 
@@ -12,17 +12,82 @@ module Daru
12
12
 
13
13
  include Daru::Maths::Arithmetic::DataFrame
14
14
  include Daru::Maths::Statistics::DataFrame
15
- include Daru::Plotting::DataFrame
15
+ include Daru::Plotting::DataFrame if Daru.has_nyaplot?
16
16
 
17
17
  class << self
18
- # Load data from a CSV file.
19
- # Arguments - path, options, block(optional)
18
+ # Load data from a CSV file. Specify an optional block to grab the CSV
19
+ # object and pre-condition it (for example use the `convert` or
20
+ # `header_convert` methods).
20
21
  #
21
- # Accepts a block for pre-conditioning of CSV data if any.
22
+ # == Arguments
23
+ #
24
+ # * path - Path of the file to load specified as a String.
25
+ #
26
+ # == Options
27
+ #
28
+ # Accepts the same options as the Daru::DataFrame constructor and CSV.open()
29
+ # and uses those to eventually construct the resulting DataFrame.
30
+ #
31
+ # == Verbose Description
32
+ #
33
+ # You can specify all the options to the `.from_csv` function that you
34
+ # do to the Ruby `CSV.read()` function, since this is what is used internally.
35
+ #
36
+ # For example, if the columns in your CSV file are separated by something
37
+ # other than commas, you can use the `:col_sep` option. If you want to
38
+ # convert numeric values to numbers and not keep them as strings, you can
39
+ # use the `:converters` option and set it to `:numeric`.
40
+ #
41
+ # The `.from_csv` function uses the following defaults for reading CSV files
42
+ # (that are passed into the `CSV.read()` function):
43
+ #
44
+ # {
45
+ # :col_sep => ',',
46
+ # :converters => :numeric
47
+ # }
22
48
  def from_csv path, opts={}, &block
23
49
  Daru::IO.from_csv path, opts, &block
24
50
  end
25
51
 
52
+ # Read data from an Excel file into a DataFrame.
53
+ #
54
+ # == Arguments
55
+ #
56
+ # * path - Path of the file to be read.
57
+ #
58
+ # == Options
59
+ #
60
+ # * :worksheet_id - ID of the worksheet that is to be read.
61
+ def from_excel path, opts={}, &block
62
+ Daru::IO.from_excel path, opts, &block
63
+ end
64
+
65
+ # Read a database query and return a Dataset
66
+ #
67
+ # USE:
68
+ #
69
+ # dbh = DBI.connect("DBI:Mysql:database:localhost", "user", "password")
70
+ # Daru::DataFrame.from_sql(dbh, "SELECT * FROM test")
71
+ def from_sql dbh, query
72
+ Daru::IO.from_sql dbh, query
73
+ end
74
+
75
+ # Read the database from a plaintext file. For this method to work,
76
+ # the data should be present in a plain text file in columns. See
77
+ # spec/fixtures/bank2.dat for an example.
78
+ #
79
+ # == Arguments
80
+ #
81
+ # * path - Path of the file to be read.
82
+ # * fields - Vector names of the resulting database.
83
+ #
84
+ # == Usage
85
+ #
86
+ # df = Daru::DataFrame.from_plaintext 'spec/fixtures/bank2.dat', [:v1,:v2,:v3,:v4,:v5,:v6]
87
+ def from_plaintext path, fields
88
+ Daru::IO.from_plaintext path, fields
89
+ end
90
+
26
91
  # Create DataFrame by specifying rows as an Array of Arrays or Array of
27
92
  # Daru::Vector objects.
28
93
  def rows source, opts={}
@@ -52,6 +117,58 @@ module Daru
52
117
 
53
118
  df
54
119
  end
120
+
121
+ # Generates a new dataset, using three vectors
122
+ # - Rows
123
+ # - Columns
124
+ # - Values
125
+ #
126
+ # For example, you have these values
127
+ #
128
+ # x y v
129
+ # a a 0
130
+ # a b 1
131
+ # b a 1
132
+ # b b 0
133
+ #
134
+ # You obtain
135
+ # id a b
136
+ # a 0 1
137
+ # b 1 0
138
+ #
139
+ # Useful to process outputs from databases
140
+ def crosstab_by_assignation rows, columns, values
141
+ raise "Three vectors should be equal size" if
142
+ rows.size != columns.size or rows.size!=values.size
143
+
144
+ cols_values = columns.factors
145
+ cols_n = cols_values.size
146
+
147
+ h_rows = rows.factors.inject({}) do |a,v|
148
+ a[v] = cols_values.inject({}) do |a1,v1|
149
+ a1[v1]=nil
150
+ a1
151
+ end
152
+ a
153
+ end
154
+
155
+ values.each_index do |i|
156
+ h_rows[rows[i]][columns[i]] = values[i]
157
+ end
158
+ df = Daru::DataFrame.new({}, order: [:_id] + cols_values.to_a)
159
+
160
+ rows.factors.each do |row|
161
+ n_row = Array.new(cols_n+1)
162
+ n_row[0] = row
163
+ cols_values.each_index do |i|
164
+ n_row[i+1] = h_rows[row][cols_values[i]]
165
+ end
166
+
167
+ df.add_row(n_row)
168
+ end
169
+ df.update
170
+ df
171
+ end
55
172
  end
56
173
 
57
174
  # The vectors (columns) index of the DataFrame
@@ -67,8 +184,29 @@ module Daru
67
184
  attr_reader :size
68
185
 
69
186
  # DataFrame basically consists of an Array of Vector objects.
70
- # These objects are indexed by row and column by vectors and index Index objects.
71
- # Arguments - source, vectors, index, name.
187
+ # These objects are indexed by row and column by vectors and index Index objects.
188
+ #
189
+ # == Arguments
190
+ #
191
+ # * source - Source from which the DataFrame is to be initialized. Can be a Hash
192
+ # of names and vectors (array or Daru::Vector), an array of arrays or
193
+ # array of Daru::Vectors.
194
+ #
195
+ # == Options
196
+ #
197
+ # +:order+ - An *Array*/*Daru::Index*/*Daru::MultiIndex* containing the order in
198
+ # which Vectors should appear in the DataFrame.
199
+ #
200
+ # +:index+ - An *Array*/*Daru::Index*/*Daru::MultiIndex* containing the order
201
+ # in which rows of the DataFrame will be named.
202
+ #
203
+ # +:name+ - A name for the DataFrame.
204
+ #
205
+ # +:clone+ - Specify as *true* or *false*. When set to false, and Vector
206
+ # objects are passed for the source, the Vector objects will not duplicated
207
+ # when creating the DataFrame. Will have no effect if Array is passed in
208
+ # the source, or if the passed Daru::Vectors have different indexes.
209
+ # Default to *true*.
72
210
  #
73
211
  # == Usage
74
212
  # df = Daru::DataFrame.new({a: [1,2,3,4], b: [6,7,8,9]}, order: [:b, :a],
@@ -84,9 +222,12 @@ module Daru
84
222
  def initialize source, opts={}
85
223
  vectors = opts[:order]
86
224
  index = opts[:index]
87
- @name = (opts[:name] || SecureRandom.uuid).to_sym
225
+ clone = opts[:clone] == false ? false : true
88
226
  @data = []
89
227
 
228
+ temp_name = opts[:name]
229
+ @name = temp_name.is_a?(Numeric) ? temp_name : (temp_name || SecureRandom.uuid).to_sym
230
+
90
231
  if source.empty?
91
232
  @vectors = create_index vectors
92
233
  @index = create_index index
@@ -109,7 +250,7 @@ module Daru
109
250
  vectors.each_with_index do |name, idx|
110
251
  hsh[name] = source[idx]
111
252
  end
112
- initialize(hsh, index: index, order: vectors, name: @name)
253
+ initialize(hsh, index: index, order: vectors, name: @name, clone: clone)
113
254
  else # array of hashes
114
255
  if vectors.nil?
115
256
  @vectors = Daru::Index.new source[0].keys.map(&:to_sym)
@@ -143,13 +284,19 @@ module Daru
143
284
  all_indexes.flatten!.uniq!.sort!
144
285
 
145
286
  @index = Daru::Index.new all_indexes
287
+ clone = true
146
288
  end
147
- @vectors.each do |vector|
148
- @data << Daru::Vector.new([], name: vector, index: @index)
149
289
 
150
- @index.each do |idx|
151
- @data[@vectors[vector]][idx] = source[vector][idx]
290
+ if clone
291
+ @vectors.each do |vector|
292
+ @data << Daru::Vector.new([], name: vector, index: @index)
293
+
294
+ @index.each do |idx|
295
+ @data[@vectors[vector]][idx] = source[vector][idx]
296
+ end
152
297
  end
298
+ else
299
+ @data.concat source.values
153
300
  end
154
301
  else
155
302
  @index = create_index(index || source.values[0].size)
@@ -163,6 +310,7 @@ module Daru
163
310
 
164
311
  set_size
165
312
  validate
313
+ update
166
314
  end
167
315
 
168
316
  # Access row or vector. Specify name of row/vector followed by axis(:row, :vector).
@@ -176,6 +324,7 @@ module Daru
176
324
  else
177
325
  axis = :vector
178
326
  end
327
+ names.map! { |e| e.respond_to?(:to_sym) ? e.to_sym : e }
179
328
 
180
329
  if axis == :vector
181
330
  access_vector *names
@@ -194,11 +343,14 @@ module Daru
194
343
  # of the vector will be matched against the row/vector indexes of the DataFrame
195
344
  # before an insertion is performed. Unmatched indexes will be set to nil.
196
345
  def []=(*args)
197
- name = args[0]
198
- axis = args[1]
346
+ axis = args.include?(:row) ? :row : :vector
347
+ args.delete :vector
348
+ args.delete :row
349
+
350
+ name = args[0..-2]
199
351
  vector = args[-1]
352
+ name.map! { |e| e.respond_to?(:to_sym) ? e.to_sym : e }
200
353
 
201
- axis = (!axis.is_a?(Symbol) and (axis != :vector or axis != :row)) ? :vector : axis
202
354
  if axis == :vector
203
355
  insert_or_modify_vector name, vector
204
356
  elsif axis == :row
@@ -222,6 +374,14 @@ module Daru
222
374
  vector[name]
223
375
  end
224
376
 
377
+ def add_row row, index=nil
378
+ self.row[index || @size] = row
379
+ end
380
+
381
+ def add_vector n, vector
382
+ self[n] = vector
383
+ end
384
+
225
385
  # Access a row or set/create a row. Refer #[] and #[]= docs for details.
226
386
  #
227
387
  # == Usage
@@ -232,13 +392,77 @@ module Daru
232
392
  end
233
393
 
234
394
  # Duplicate the DataFrame entirely.
235
- def dup
236
- src = {}
237
- @vectors.each do |vector|
238
- src[vector] = @data[@vectors[vector]].dup
395
+ #
396
+ # == Arguments
397
+ #
398
+ # * +vectors_to_dup+ - An Array specifying the names of Vectors to
399
+ # be duplicated. Will duplicate the entire DataFrame if not specified.
400
+ def dup vectors_to_dup=nil
401
+ vectors_to_dup = @vectors unless vectors_to_dup
402
+
403
+ new_order =
404
+ if vectors.is_a?(MultiIndex)
405
+ src = []
406
+ vectors_to_dup.each do |vec|
407
+ src << @data[@vectors[vec]].dup
408
+ end
409
+
410
+ Daru::MultiIndex.new(vectors_to_dup)
411
+ else
412
+ src = {}
413
+ vectors_to_dup.each do |vector|
414
+ src[vector] = @data[@vectors[vector]].dup
415
+ end
416
+
417
+ Daru::Index.new(vectors_to_dup)
418
+ end
419
+
420
+ Daru::DataFrame.new src, order: new_order, index: @index.dup, name: @name, clone: true
421
+ end
422
+
423
+ # Only clone the structure of the DataFrame.
424
+ def clone_structure
425
+ Daru::DataFrame.new([], order: @vectors.dup, index: @index.dup, name: @name)
426
+ end
427
+
428
+ # Returns a 'view' of the DataFrame, i.e the object ID's of vectors are
429
+ # preserved.
430
+ #
431
+ # == Arguments
432
+ #
433
+ # +vectors_to_clone+ - Names of vectors to clone. Optional. Will return
434
+ # a view of the whole data frame otherwise.
435
+ def clone *vectors_to_clone
436
+ vectors_to_clone.flatten! unless vectors_to_clone.all? { |a| !a.is_a?(Array) }
437
+ return super if vectors_to_clone.empty?
438
+
439
+ h = vectors_to_clone.inject({}) do |hsh, vec|
440
+ hsh[vec] = self[vec]
441
+ hsh
239
442
  end
443
+ Daru::DataFrame.new(h, clone: false)
444
+ end
240
445
 
241
- Daru::DataFrame.new src, order: @vectors.dup, index: @index.dup, name: @name
446
+ # Returns a 'shallow' copy of DataFrame if missing data is not present,
447
+ # or a full copy of only valid data if missing data is present.
448
+ def clone_only_valid
449
+ if has_missing_data?
450
+ dup_only_valid
451
+ else
452
+ clone
453
+ end
454
+ end
455
+
456
+ # Creates a new duplicate dataframe containing only rows
457
+ # without a single missing value.
458
+ def dup_only_valid vecs=nil
459
+ rows_with_nil = @data.inject([]) do |memo, vector|
460
+ memo.concat vector.missing_positions
461
+ memo
462
+ end.uniq
463
+
464
+ row_indexes = @index.to_a
465
+ (vecs.nil? ? self : dup(vecs)).row[*(row_indexes - rows_with_nil)]
242
466
  end
243
467
 
244
468
  # Iterate over each vector
@@ -286,21 +510,205 @@ module Daru
286
510
  self
287
511
  end
288
512
 
289
- # Map each vector. Returns a DataFrame whose vectors are modified according
290
- # to the value returned by the block. As is the case with Enumerable#map,
291
- # the object returned by each block must be a Daru::Vector for the dataframe
292
- # to remain relevant.
513
+ # Iterate over each row or vector of the DataFrame. Specify axis
514
+ # by passing :vector or :row as the argument. Default to :vector.
515
+ #
516
+ # == Description
517
+ #
518
+ # `#each` works exactly like Array#each. The default mode for `each`
519
+ # is to iterate over the columns of the DataFrame. To iterate over
520
+ # rows you must pass the axis, i.e `:row` as an argument.
521
+ #
522
+ # == Arguments
523
+ #
524
+ # * +axis+ - The axis to iterate over. Can be :vector (or :column)
525
+ # or :row. Default to :vector.
526
+ def each axis=:vector, &block
527
+ if axis == :vector or axis == :column
528
+ each_vector(&block)
529
+ elsif axis == :row
530
+ each_row(&block)
531
+ else
532
+ raise ArgumentError, "Unknown axis #{axis}"
533
+ end
534
+ end
535
+
536
+ # Iterate over a row or vector and return results in a Daru::Vector.
537
+ # Specify axis with :vector or :row. Default to :vector.
538
+ #
539
+ # == Description
540
+ #
541
+ # The #collect iterator works similar to #map, the only difference
542
+ # being that it returns a Daru::Vector comprising of the results of
543
+ # each block run. The resultant Vector has the same index as that
544
+ # of the axis over which collect has iterated. It also accepts the
545
+ # optional axis argument.
546
+ #
547
+ # == Arguments
548
+ #
549
+ # * +axis+ - The axis to iterate over. Can be :vector (or :column)
550
+ # or :row. Default to :vector.
551
+ def collect axis=:vector, &block
552
+ if axis == :vector or axis == :column
553
+ collect_vectors(&block)
554
+ elsif axis == :row
555
+ collect_rows(&block)
556
+ else
557
+ raise ArgumentError, "Unknown axis #{axis}"
558
+ end
559
+ end
560
+
561
+ # Map over each vector or row of the data frame according to
562
+ # the argument specified. Will return an Array of the resulting
563
+ # elements. To map over each row/vector and get a DataFrame,
564
+ # see #recode.
565
+ #
566
+ # == Description
567
+ #
568
+ # The #map iterator works like Array#map. The value returned by
569
+ # each run of the block is added to an Array and the Array is
570
+ # returned. This method also accepts an axis argument, like #each.
571
+ # The default is :vector.
572
+ #
573
+ # == Arguments
574
+ #
575
+ # * +axis+ - The axis to map over. Can be :vector (or :column) or :row.
576
+ # Default to :vector.
577
+ def map axis=:vector, &block
578
+ if axis == :vector or axis == :column
579
+ map_vectors(&block)
580
+ elsif axis == :row
581
+ map_rows(&block)
582
+ else
583
+ raise ArgumentError, "Unknown axis #{axis}"
584
+ end
585
+ end
586
+
587
+ # Destructive map. Modifies the DataFrame. Each run of the block
588
+ # must return a Daru::Vector. You can specify the axis to map over
589
+ # as the argument. Default to :vector.
590
+ #
591
+ # == Arguments
592
+ #
593
+ # * +axis+ - The axis to map over. Can be :vector (or :column) or :row.
594
+ # Default to :vector.
595
+ def map! axis=:vector, &block
596
+ if axis == :vector or axis == :column
597
+ map_vectors!(&block)
598
+ elsif axis == :row
599
+ map_rows!(&block)
600
+ end
601
+ end
602
+
603
+ # Maps over the DataFrame and returns a DataFrame. Each run of the
604
+ # block must return a Daru::Vector object. You can specify the axis
605
+ # to map over. Default to :vector.
606
+ #
607
+ # == Description
608
+ #
609
+ # Recode works similarly to #map, but an important difference between
610
+ # the two is that recode returns a modified Daru::DataFrame instead
611
+ # of an Array. For this reason, #recode expects every run of the
612
+ # block to return a Daru::Vector.
613
+ #
614
+ # Just like map and each, recode also accepts an optional _axis_ argument.
615
+ #
616
+ # == Arguments
617
+ #
618
+ # * +axis+ - The axis to map over. Can be :vector (or :column) or :row.
619
+ # Default to :vector.
620
+ def recode axis=:vector, &block
621
+ if axis == :vector or axis == :column
622
+ recode_vectors(&block)
623
+ elsif axis == :row
624
+ recode_rows(&block)
625
+ end
626
+ end
627
+
628
+ # Retain vectors or rows if the block returns a truthy value.
629
+ #
630
+ # == Description
631
+ #
632
+ # For filtering out certain rows/vectors based on their values,
633
+ # use the #filter method. By default it iterates over vectors and
634
+ # keeps those vectors for which the block returns true. It accepts
635
+ # an optional axis argument which lets you specify whether you want
636
+ # to iterate over vectors or rows.
637
+ #
638
+ # == Arguments
639
+ #
640
+ # * +axis+ - The axis to map over. Can be :vector (or :column) or :row.
641
+ # Default to :vector.
642
+ #
643
+ # == Usage
644
+ #
645
+ # # Filter vectors
646
+ #
647
+ # df.filter do |vector|
648
+ # vector.type == :numeric and vector.median < 50
649
+ # end
650
+ #
651
+ # # Filter rows
652
+ #
653
+ # df.filter(:row) do |row|
654
+ # row[:a] + row[:d] < 100
655
+ # end
656
+ def filter axis=:vector, &block
657
+ if axis == :vector or axis == :column
658
+ filter_vectors(&block)
659
+ elsif axis == :row
660
+ filter_rows(&block)
661
+ end
662
+ end
663
+
664
+ def recode_vectors &block
665
+ block_given? or return to_enum(:recode_vectors)
666
+
667
+ df = self.dup
668
+ df.each_vector_with_index do |v, i|
669
+ ret = yield v
670
+ ret.is_a?(Daru::Vector) or raise TypeError, "Every iteration must return Daru::Vector not #{ret.class}"
671
+ df[*i] = ret
672
+ end
673
+
674
+ df
675
+ end
676
+
677
+ def recode_rows &block
678
+ block_given? or return to_enum(:recode_rows)
679
+
680
+ df = self.dup
681
+ df.each_row_with_index do |r, i|
682
+ ret = yield r
683
+ ret.is_a?(Daru::Vector) or raise TypeError, "Every iteration must return Daru::Vector not #{ret.class}"
684
+ df.row[i] = ret
685
+ end
686
+
687
+ df
688
+ end
689
+
690
+ # Map each vector and return an Array.
293
691
  def map_vectors(&block)
294
692
  return to_enum(:map_vectors) unless block_given?
295
693
 
296
- self.dup.map_vectors!(&block)
694
+ arry = []
695
+ @data.each do |vec|
696
+ arry << yield(vec)
697
+ end
698
+
699
+ arry
297
700
  end
298
701
 
299
702
  # Destructive form of #map_vectors
300
703
  def map_vectors!(&block)
301
704
  return to_enum(:map_vectors!) unless block_given?
302
705
 
303
- @data.map!(&block)
706
+ vectors.dup.each do |n|
707
+ v = yield self[n]
708
+ v.is_a?(Daru::Vector) or raise TypeError, "Must return a Daru::Vector not #{v.class}"
709
+ self[n] = v
710
+ end
711
+
304
712
  self
305
713
  end
306
714
 
@@ -308,37 +716,114 @@ module Daru
308
716
  def map_vectors_with_index(&block)
309
717
  return to_enum(:map_vectors_with_index) unless block_given?
310
718
 
311
- df = self.dup
312
- df.each_vector_with_index do |vector, name|
313
- df[name, :vector] = yield(vector, name)
719
+ dt = []
720
+ each_vector_with_index do |vector, name|
721
+ dt << yield(vector, name)
314
722
  end
315
723
 
316
- df
724
+ dt
317
725
  end
318
726
 
319
727
  # Map each row
320
728
  def map_rows(&block)
321
729
  return to_enum(:map_rows) unless block_given?
322
730
 
323
- df = self.dup
324
- df.each_row_with_index do |row, index|
325
- df[index, :row] = yield(row)
731
+ dt = []
732
+ each_row do |row|
733
+ dt << yield(row)
326
734
  end
327
735
 
328
- df
736
+ dt
329
737
  end
330
738
 
331
739
  def map_rows_with_index(&block)
332
740
  return to_enum(:map_rows_with_index) unless block_given?
333
741
 
334
- df = self.dup
335
- df.each_row_with_index do |row, index|
336
- df[index, :row] = yield(row, index)
742
+ dt = []
743
+ each_row_with_index do |row, index|
744
+ dt << yield(row, index)
337
745
  end
338
746
 
339
- df
747
+ dt
340
748
  end
341
749
 
750
+ def map_rows!(&block)
751
+ return to_enum(:map_rows!) unless block_given?
752
+
753
+ index.dup.each do |i|
754
+ r = yield self.row[i]
755
+ r.is_a?(Daru::Vector) or raise TypeError, "Returned object must be Daru::Vector not #{r.class}"
756
+ self.row[i] = r
757
+ end
758
+
759
+ self
760
+ end
761
+
762
+ # Retrieves a Daru::Vector, based on the result of calculation
763
+ # performed on each row.
764
+ def collect_rows &block
765
+ return to_enum(:collect_rows) unless block_given?
766
+
767
+ data = []
768
+ each_row do |row|
769
+ data.push yield(row)
770
+ end
771
+
772
+ Daru::Vector.new(data, index: @index)
773
+ end
774
+
775
+ def collect_row_with_index &block
776
+ return to_enum(:collect_row_with_index) unless block_given?
777
+
778
+ data = []
779
+ each_row_with_index do |row, i|
780
+ data.push yield(row, i)
781
+ end
782
+
783
+ Daru::Vector.new(data, index: @index)
784
+ end
785
+
786
+ # Retrieves a Daru::Vector, based on the result of calculation
787
+ # performed on each vector.
788
+ def collect_vectors &block
789
+ return to_enum(:collect_vectors) unless block_given?
790
+
791
+ data = []
792
+ each_vector do |vec|
793
+ data.push yield(vec)
794
+ end
795
+
796
+ Daru::Vector.new(data, index: @vectors)
797
+ end
798
+
799
+ def collect_vector_with_index &block
800
+ return to_enum(:collect_vector_with_index) unless block_given?
801
+
802
+ data = []
803
+ each_vector_with_index do |vec, i|
804
+ data.push yield(vec, i)
805
+ end
806
+
807
+ Daru::Vector.new(data, index: @vectors)
808
+ end
809
+
810
+ # Generate a matrix, based on vector names of the DataFrame.
811
+ #
812
+ # @return {::Matrix}
813
+ def collect_matrix
814
+ return to_enum(:collect_matrix) unless block_given?
815
+
816
+ vecs = vectors.to_a
817
+ rows = vecs.collect { |row|
818
+ vecs.collect { |col|
819
+ yield row,col
820
+ }
821
+ }
822
+
823
+ Matrix.rows(rows)
824
+ end
825
+
826
+
342
827
  # Delete a vector
343
828
  def delete_vector vector
344
829
  if @vectors.include? vector
@@ -367,6 +852,20 @@ module Daru
367
852
  set_size
368
853
  end
369
854
 
855
+ # Creates a DataFrame with the random data, of n size.
856
+ # If n not given, uses original number of rows.
857
+ #
858
+ # @return {Daru::DataFrame}
859
# Builds a new DataFrame of n rows sampled at random (with replacement)
# from this DataFrame.
#
# n - number of rows to draw; defaults to the number of rows present.
#
# Returns the new Daru::DataFrame.
def bootstrap(n=nil)
  available = nrows
  n ||= available
  ds_boot = Daru::DataFrame.new({}, order: @vectors)

  # Draw from the rows that actually exist. Using rand(n) here would only
  # sample the first n rows when n < nrows and would reach past the end of
  # the data when n > nrows.
  n.times do
    ds_boot.add_row(row[rand(available)])
  end

  ds_boot.update
  ds_boot
end
868
+
370
869
  def keep_row_if &block
371
870
  deletion = []
372
871
 
@@ -388,6 +887,16 @@ module Daru
388
887
  end
389
888
  end
390
889
 
890
+ # creates a new vector with the data of a given field which the block returns true
891
# Collects the value of vector +vec+ from every row for which the block
# returns a truthy value, and returns those values as a new Daru::Vector.
#
# Returns an Enumerator when no block is given, matching the other
# row/vector iterators of this class (previously this raised
# LocalJumpError on the first row).
def filter_vector vec
  return to_enum(:filter_vector, vec) unless block_given?

  selected = []
  each_row do |row|
    selected.push(row[vec]) if yield(row)
  end

  Daru::Vector.new(selected)
end
899
+
391
900
  # Iterates over each row and retains it in a new DataFrame if the block returns
392
901
  # true for that row.
393
902
  def filter_rows &block
@@ -419,18 +928,160 @@ module Daru
419
928
  df
420
929
  end
421
930
 
931
+ # Test each row with one or more tests. Each test is an Array of the form
932
+ # *[description, [vector names to report], Proc.new {|row| row[:age] > 0}]*
933
+ #
934
+ # The function returns an array with all errors.
935
# Runs every test against every row and returns an Array of message
# Strings, one per failed (row, test) pair.
#
# tests - each test is indexed as test[0] (text put in the message),
#         test[1] (collection of vector names whose values are appended
#         to the message) and test[2] (callable receiving the row; a
#         falsy result is a failure). If the first argument is a Symbol
#         it is removed from the tests and used as the vector whose value
#         identifies the row in each message; otherwise the first vector
#         of the DataFrame is used.
def verify(*tests)
  id = tests[0].is_a?(Symbol) ? tests.shift : @vectors.first

  failures = []
  row_number = 0
  each(:row) do |row|
    row_number += 1
    tests.each do |test|
      next if test[2].call(row)

      detail = ""
      if test[1].size > 0
        detail = " (" + test[1].collect { |k| "#{k}=#{row[k]}" }.join(", ") + ")"
      end
      failures.push("#{row_number} [#{row[id]}]: #{test[0]}#{detail}")
    end
  end

  failures
end
959
+
960
+ # DSL for yielding each row and returning a Daru::Vector based on the
961
+ # value each run of the block returns.
962
+ #
963
+ # == Usage
964
+ #
965
+ # a1 = Daru::Vector.new([1, 2, 3, 4, 5, 6, 7])
966
+ # a2 = Daru::Vector.new([10, 20, 30, 40, 50, 60, 70])
967
+ # a3 = Daru::Vector.new([100, 200, 300, 400, 500, 600, 700])
968
+ # ds = Daru::DataFrame.new({ :a => a1, :b => a2, :c => a3 })
969
+ # total = ds.vector_by_calculation { a + b + c }
970
+ # # <Daru::Vector:82314050 @name = nil @size = 7 >
971
+ # # nil
972
+ # # 0 111
973
+ # # 1 222
974
+ # # 2 333
975
+ # # 3 444
976
+ # # 4 555
977
+ # # 5 666
978
+ # # 6 777
979
# Evaluates the block in the context of each row (via instance_eval) and
# returns the results as a Daru::Vector carrying this DataFrame's index.
def vector_by_calculation(&block)
  computed = []
  each_row { |current| computed << current.instance_eval(&block) }

  Daru::Vector.new computed, index: @index
end
987
+
988
+ # Returns a vector, based on a string with a calculation based
989
+ # on vector.
990
+ #
991
+ # The calculation will be eval'ed, so you can put any variable
992
+ # or expression valid on ruby.
993
+ #
994
+ # For example:
995
+ # a = Daru::Vector.new [1,2]
996
+ # b = Daru::Vector.new [3,4]
997
+ # ds = Daru::DataFrame.new({:a => a,:b => b})
998
+ # ds.compute("a+b")
999
+ # => Vector [4,6]
1000
# Evaluates a calculation in the context of this DataFrame and returns
# its result.
#
# text  - String of Ruby code passed to instance_eval. Optional when a
#         block is supplied (it used to be mandatory even though it was
#         ignored in that case).
# block - when given, it is instance_eval'ed instead of +text+.
#
# SECURITY: +text+ is executed as arbitrary Ruby code. Never pass
# untrusted input to this method.
def compute text=nil, &block
  return instance_eval(&block) if block_given?

  instance_eval(text)
end
1004
+
1005
+ # Return a vector with the number of missing values in each row.
1006
+ #
1007
+ # == Arguments
1008
+ #
1009
+ # * +missing_values+ - An Array of the values that should be
1010
+ # treated as 'missing'. The default missing value is *nil*.
1011
# Counts the missing values of every row. Each row is first told which
# values count as missing (row.missing_values=), then the size of its
# missing_positions is recorded.
#
# Returns a Daru::Vector of the counts, indexed like this DataFrame and
# named "<name>_missing_rows".
def missing_values_rows missing_values=[nil]
  counts = []
  each_row do |current|
    current.missing_values = missing_values
    counts.push(current.missing_positions.size)
  end

  Daru::Vector.new counts, index: @index, name: :"#{@name}_missing_rows"
end
1020
+
1021
+ # TODO: remove next version
1022
+ alias :vector_missing_values :missing_values_rows
1023
+
1024
# True when at least one of the underlying vectors reports missing data.
def has_missing_data?
  @data.any?(&:has_missing_data?) ? true : false
end
1027
+
1028
+ alias :flawed? :has_missing_data?
1029
+
1030
+ # Return a nested hash using vector names as keys and an array constructed of
1031
+ # hashes with other values. If block provided, is used to provide the
1032
+ # values, with parameters +row+ of dataset, +current+ last hash on
1033
+ # hierarchy and +name+ of the key to include
1034
# Builds a nested Hash from the rows. The values of all but the last key
# form the nesting levels; the value of the last key is the leaf key.
#
# Without a block every leaf holds an Array of row hashes with the tree
# keys removed. With a block the leaf is set to the block's result, which
# is called with (row, hash at the current level, leaf key value).
#
# tree_keys may be given as separate arguments or as one Array.
def nest *tree_keys, &block
  tree_keys = tree_keys[0] if tree_keys[0].is_a? Array
  out = {}
  branch_keys = tree_keys[0, tree_keys.size-1]
  leaf_key = tree_keys.last

  each_row do |row|
    # Descend through the tree, creating intermediate levels on demand.
    level = branch_keys.inject(out) { |node, key| node[row[key]] ||= {} }
    name = row[leaf_key]

    if block
      level[name] = block.call(row, level, name)
    else
      level[name] ||= []
      level[name].push(row.to_hash.delete_if { |key, _value| tree_keys.include? key })
    end
  end

  out
end
1057
+
1058
# For every row, sums the length of the string form of the values in the
# given vectors (nil counts as 0). Uses all vectors when +vecs+ is nil.
# Returns whatever collect_row_with_index returns for those sums.
def vector_count_characters vecs=nil
  vecs ||= @vectors.to_a

  collect_row_with_index do |row, _i|
    vecs.inject(0) do |total, name|
      total + (row[name].nil? ? 0 : row[name].to_s.size)
    end
  end
end
1067
+
1068
# Splits vector +name+ with split_by_separator(sep) and stores each
# resulting entry in this DataFrame under the name
# "<name><join><entry key>" (as a Symbol).
def add_vectors_by_split(name,join='-',sep=Daru::SPLIT_TOKEN)
  pieces = self[name].split_by_separator(sep)
  pieces.each do |key, piece|
    target = (name.to_s + join + key.to_s).to_sym
    self[target] = piece
  end
end
1072
+
422
1073
  # Return the number of rows and columns of the DataFrame in an Array.
423
1074
  def shape
424
1075
  [@index.size, @vectors.size]
425
1076
  end
426
1077
 
427
1078
  # The number of rows
428
- def rows
1079
+ def nrows
429
1080
  shape[0]
430
1081
  end
431
1082
 
432
1083
  # The number of vectors
433
- def cols
1084
+ def ncols
434
1085
  shape[1]
435
1086
  end
436
1087
 
@@ -439,11 +1090,37 @@ module Daru
439
1090
  !!@vectors[*vector]
440
1091
  end
441
1092
 
1093
# True when the block is truthy for at least one element along +axis+.
#
# axis - :vector or :column tests each underlying vector; :row tests each
#        row. Any other value raises ArgumentError.
def any?(axis=:vector, &block)
  case axis
  when :vector, :column
    @data.any?(&block)
  when :row
    each_row { |current| return true if yield(current) }
    false
  else
    raise ArgumentError, "Unidentified axis #{axis}"
  end
end
1105
+
1106
# True when the block is truthy for every element along +axis+.
#
# axis - :vector or :column tests each underlying vector; :row tests each
#        row. Any other value raises ArgumentError.
def all?(axis=:vector, &block)
  case axis
  when :vector, :column
    @data.all?(&block)
  when :row
    each_row { |current| return false unless yield(current) }
    true
  else
    raise ArgumentError, "Unidentified axis #{axis}"
  end
end
1118
+
442
1119
  # The first ten elements of the DataFrame
443
1120
  #
444
1121
  # @param [Fixnum] quantity (10) The number of elements to display from the top.
445
1122
  def head quantity=10
446
- self[0..quantity, :row]
1123
+ self[0..(quantity-1), :row]
447
1124
  end
448
1125
 
449
1126
  # The last ten elements of the DataFrame
@@ -453,7 +1130,59 @@ module Daru
453
1130
  self[(@size - quantity)..(@size-1), :row]
454
1131
  end
455
1132
 
456
- # Group elements by vector to perform operations on them.
1133
+ # Returns a vector with sum of all vectors specified in the argument.
1134
+ # If the vecs parameter is not given, sums all numeric vectors.
1135
# Adds the named vectors together element-wise, starting from a vector of
# zeros that carries this DataFrame's index and name.
#
# vecs - names of the vectors to add; defaults to numeric_vectors.
def vector_sum vecs=nil
  vecs ||= numeric_vectors
  zeros = Daru::Vector.new [0]*@size, index: @index, name: @name, dtype: @dtype

  vecs.inject(zeros) { |running, vec_name| running + self[vec_name] }
end
1145
+
1146
+ # Calculate mean of the rows of the dataframe.
1147
+ #
1148
+ # == Arguments
1149
+ #
1150
+ # * +max_missing+ - The maximum number of elements in the row that can be
1151
+ # missing for the mean calculation to happen. Defaults to 0.
1152
# Computes the mean of every row. A row with more than +max_missing+
# missing positions gets nil instead of a mean.
#
# Returns a Daru::Vector named "mean_<name>" with this DataFrame's index.
def vector_mean max_missing=0
  means = Daru::Vector.new [0]*@size, index: @index, name: "mean_#{@name}"

  each_row_with_index do |current, label|
    too_sparse = current.missing_positions.size > max_missing
    means[label] = too_sparse ? nil : current.mean
  end

  means
end
1161
+
1162
+ # Group elements by vector to perform operations on them. Returns a
1163
+ # Daru::Core::GroupBy object. See the Daru::Core::GroupBy docs for a detailed
1164
+ # list of possible operations.
1165
+ #
1166
+ # == Arguments
1167
+ #
1168
+ # * vectors - An Array containing names of vectors to group by.
1169
+ #
1170
+ # == Usage
1171
+ #
1172
+ # df = Daru::DataFrame.new({
1173
+ # a: %w{foo bar foo bar foo bar foo foo},
1174
+ # b: %w{one one two three two two one three},
1175
+ # c: [1 ,2 ,3 ,1 ,3 ,6 ,3 ,8],
1176
+ # d: [11 ,22 ,33 ,44 ,55 ,66 ,77 ,88]
1177
+ # })
1178
+ # df.group_by([:a,:b,:c]).groups
1179
+ # #=> {["bar", "one", 2]=>[1],
1180
+ # # ["bar", "three", 1]=>[3],
1181
+ # # ["bar", "two", 6]=>[5],
1182
+ # # ["foo", "one", 1]=>[0],
1183
+ # # ["foo", "one", 3]=>[6],
1184
+ # # ["foo", "three", 8]=>[7],
1185
+ # # ["foo", "two", 3]=>[2, 4]}
457
1186
  def group_by vectors
458
1187
  vectors = [vectors] if vectors.is_a?(Symbol)
459
1188
  vectors.each { |v| raise(ArgumentError, "Vector #{v} does not exist") unless
@@ -462,6 +1191,13 @@ module Daru
462
1191
  Daru::Core::GroupBy.new(self, vectors)
463
1192
  end
464
1193
 
1194
# Replaces the vectors index with a Daru::Index built from +new_vectors+
# (converted to Symbols) together with the values the current index
# returns for each of those names.
#
# Raises ArgumentError unless +new_vectors+ has as many entries as the
# DataFrame has vectors. Returns the new index.
def reindex_vectors! new_vectors
  if @vectors.size != new_vectors.size
    raise ArgumentError, "Number of vectors passed into function (#{new_vectors.size}) should equal that present in the DataFrame (#{@vectors.size})"
  end

  names = new_vectors.map(&:to_sym)
  current_values = new_vectors.map { |e| @vectors[e] }
  @vectors = Daru::Index.new names, current_values
end
1200
+
465
1201
  # Change the index of the DataFrame and its underlying vectors. Destructive.
466
1202
  #
467
1203
  # @param [Symbol, Array] new_index Specify an Array if
@@ -481,19 +1217,58 @@ module Daru
481
1217
  self.dup.reindex! new_index
482
1218
  end
483
1219
 
484
- # Return the names of all the numeric vectors. Will include vectors with nils
1220
+ # Return the indexes of all the numeric vectors. Will include vectors with nils
485
1221
  # alongwith numbers.
486
1222
  def numeric_vectors
487
1223
  numerics = []
488
1224
 
489
- each_vector do |vec|
1225
+ each_vector_with_index do |vec, i|
1226
+ numerics << i if(vec.type == :numeric)
1227
+ end
1228
+ numerics
1229
+ end
1230
+
1231
# Returns an Array with the name of every vector whose type is :numeric.
def numeric_vector_names
  names = []
  each_vector do |vec, _position|
    next unless vec.type == :numeric

    names.push(vec.name)
  end

  names
end
494
1239
 
1240
+ # Return a DataFrame of only the numerical Vectors. If clone: false
1241
+ # is specified as option, only a *view* of the Vectors will be
1242
+ # returned. Defaults to clone: true.
1243
# Builds a DataFrame from the vectors listed by numeric_vectors, keeping
# this DataFrame's row index.
#
# opts - :clone is forwarded to the new DataFrame; it is true unless
#        explicitly given as false.
def only_numerics opts={}
  cln = opts[:clone] != false
  nv = numeric_vectors
  columns = nv.map { |v| self[v] }

  order =
    if @vectors.is_a?(MultiIndex)
      MultiIndex.new(nv)
    else
      Index.new(nv)
    end

  Daru::DataFrame.new(columns, clone: cln, order: order, index: @index)
end
1254
+
1255
+ # Generate a summary of this DataFrame with ReportBuilder.
1256
# Adds this DataFrame to a ReportBuilder created with no_title: true and
# returns the result of calling +method+ (default :to_text) on it.
def summary(method = :to_text)
  report = ReportBuilder.new(no_title: true).add(self)
  report.send(method)
end
1259
+
1260
# Writes this DataFrame into builder +b+: opens a section named after the
# DataFrame, emits the row count, then for every vector a heading line
# followed by the vector itself via parse_element.
def report_building(b) # :nodoc: #
  b.section(:name=>@name) do |section|
    section.text "Number of rows: #{nrows}"

    @vectors.each do |vec_name|
      section.text "Element:[#{vec_name}]"
      section.parse_element(self[vec_name])
    end
  end
end
1269
+
495
1270
  # Sorts a dataframe (ascending/descending)according to the given sequence of
496
- # vectors, using the attributes provided in the blocks. Works for 2 LEVELS ONLY.
1271
+ # vectors, using the attributes provided in the blocks.
497
1272
  #
498
1273
  # @param order [Array] The order of vector names in which the DataFrame
499
1274
  # should be sorted.
@@ -583,7 +1358,7 @@ module Daru
583
1358
  elsif opts[:values].is_a?(Array)
584
1359
  opts[:values]
585
1360
  else # nil
586
- (@vectors.to_a - (index | vectors)) & numeric_vectors
1361
+ (@vectors.to_a - (index | vectors)) & numeric_vector_names
587
1362
  end
588
1363
 
589
1364
  raise IndexError, "No numeric vectors to aggregate" if values.empty?
@@ -634,6 +1409,195 @@ module Daru
634
1409
  end
635
1410
  end
636
1411
 
1412
+ # Merge vectors from two DataFrames. In case of name collision,
1413
+ # the vectors names are changed to x_1, x_2 ....
1414
+ #
1415
+ # @return {Daru::DataFrame}
1416
# Places the vectors of +other_df+ next to the vectors of this DataFrame
# in a new DataFrame. The combined vector names are passed through
# recode_repeated before being converted to Symbols.
#
# Raises RuntimeError unless both DataFrames have the same number of rows.
def merge other_df
  unless nrows == other_df.nrows
    raise "Number of rows must be equal in this: #{nrows} and other: #{other_df.nrows}"
  end

  combined_names = @vectors.to_a + other_df.vectors.to_a
  new_fields = combined_names.recode_repeated.map(&:to_sym)
  df_new = DataFrame.new({}, order: new_fields)

  # Rows are looked up through the row accessor by their position number.
  nrows.times do |i|
    df_new.add_row(self.row[i].to_a + other_df.row[i].to_a)
  end

  df_new.update
  df_new
end
1432
+
1433
+ # Join 2 DataFrames by given fields
1434
+ # type is one of :left and :inner, default is :left
1435
+ #
1436
+ # Untested! Use at your own risk.
1437
+ #
1438
+ # @return {Daru::DataFrame}
1439
# Joins this DataFrame with +other_ds+ on matching key values.
#
# other_ds - the DataFrame to join with.
# fields_1 - key vector names in this DataFrame.
# fields_2 - key vector names in +other_ds+.
# type     - :left keeps rows without a match (new fields set to nil);
#            any other value drops them.
#
# Returns a new DataFrame with this DataFrame's vectors followed by the
# vectors of +other_ds+ that are not in fields_2.
def join(other_ds,fields_1=[],fields_2=[],type=:left)
  fields_new = other_ds.vectors.to_a - fields_2
  fields = self.vectors.to_a + fields_new

  # Index the other DataFrame's rows by their key values.
  lookup = {}
  other_ds.each_row do |row|
    key = row.to_hash.select { |k, _v| fields_2.include?(k) }.values
    value = row.to_hash.select { |k, _v| fields_new.include?(k) }
    (lookup[key] ||= []) << value
  end

  new_ds = DataFrame.new({}, order: fields)

  self.each_row do |row|
    key = row.to_hash.select { |k, _v| fields_1.include?(k) }.values
    new_case = row.to_hash
    matches = lookup[key]

    if matches.nil?
      next unless type == :left

      fields_new.each { |field| new_case[field] = nil }
      new_ds.add_row(Daru::Vector.new(new_case))
    else
      matches.each do |new_values|
        new_ds.add_row(Daru::Vector.new(new_case.merge(new_values)))
      end
    end
  end

  new_ds
end
1475
+
1476
+
1477
+ # Creates a new dataset for one to many relations
1478
+ # on a dataset, based on pattern of field names.
1479
+ #
1480
+ # for example, you have a survey for number of children
1481
+ # with this structure:
1482
+ # id, name, child_name_1, child_age_1, child_name_2, child_age_2
1483
+ # with
1484
+ # ds.one_to_many([:id], "child_%v_%n"
1485
+ # the field of first parameters will be copied verbatim
1486
+ # to new dataset, and fields which responds to second
1487
+ # pattern will be added one case for each different %n.
1488
+ #
1489
+ # == Usage
1490
+ # cases=[
1491
+ # ['1','george','red',10,'blue',20,nil,nil],
1492
+ # ['2','fred','green',15,'orange',30,'white',20],
1493
+ # ['3','alfred',nil,nil,nil,nil,nil,nil]
1494
+ # ]
1495
+ # ds=Daru::DataFrame.rows(cases, order: [:id, :name, :car_color1, :car_value1, :car_color2, :car_value2, :car_color3, :car_value3])
1496
+ # ds.one_to_many([:id],'car_%v%n').to_matrix
1497
+ # => Matrix[
1498
+ # ["red", "1", 10],
1499
+ # ["blue", "1", 20],
1500
+ # ["green", "2", 15],
1501
+ # ["orange", "2", 30],
1502
+ # ["white", "2", 20]
1503
+ # ]
1504
+ #
1505
# Creates a new DataFrame for a one-to-many relation, based on a pattern
# of vector names.
#
# parent_fields - Array of vector names copied verbatim into every output
#                 row. The array is not modified.
# pattern       - String in which "%v" stands for the variable name and
#                 "%n" for the numeric suffix, e.g. "car_%v%n".
#
# For every input row and every number 1..max found for %n, one output
# row is added (when at least one matching value is non-nil) holding the
# parent fields, :_col_id (the number) and one entry per variable.
def one_to_many(parent_fields, pattern)
  re = Regexp.new pattern.gsub("%v","(.+?)").gsub("%n","(\\d+?)")
  # Build a fresh array for the output order. Aliasing parent_fields and
  # pushing onto it would modify the caller's array and would also make
  # :_col_id be treated as a parent field in the per-row copy below.
  ds_vars = parent_fields + [:_col_id]
  vars = []
  max_n = 0
  h = parent_fields.inject({}) { |a,v|
    a[v] = Daru::Vector.new([])
    a
  }
  # Adding _col_id
  h[:_col_id] = Daru::Vector.new([])

  # Discover the variable names and the highest suffix among the vectors.
  @vectors.each do |f|
    if f =~ re
      if !vars.include? $1
        vars.push($1)
        h[$1] = Daru::Vector.new([])
      end
      max_n = $2.to_i if max_n < $2.to_i
    end
  end
  ds = DataFrame.new(h, order: ds_vars+vars)

  each_row do |row|
    row_out = {}
    parent_fields.each do |f|
      row_out[f]=row[f]
    end

    max_n.times do |n1|
      n = n1+1
      any_data = false
      vars.each do |v|
        data = row[pattern.gsub("%v",v.to_s).gsub("%n",n.to_s).to_sym]
        row_out[v] = data
        any_data = true if !data.nil?
      end

      # Only emit a row for this suffix when it carries some data.
      if any_data
        row_out[:_col_id] = n
        ds.add_row(row_out)
      end
    end
  end
  ds.update
  ds
end
1553
+
1554
# Splits vector +name_+ with split_by_separator(sep). Each resulting
# vector is renamed to "<name_>:<entry key>" and stored in this DataFrame
# under "<name_><join><counter>", with the counter starting at 1.
def add_vectors_by_split_recode(name_, join='-', sep=Daru::SPLIT_TOKEN)
  pieces = self[name_].split_by_separator(sep)
  counter = 0
  pieces.each do |key, piece|
    counter += 1
    target = name_.to_s + join + counter.to_s
    piece.rename name_.to_s + ":" + key.to_s
    self[target.to_sym] = piece
  end
end
1564
+
1565
+ # Create a SQL CREATE TABLE statement based on this DataFrame.
1566
+ #
1567
+ # == Arguments
1568
+ #
1569
+ # * table - String specifying name of the table that will created in SQL.
1570
+ # * charset - Character set. Default is "UTF8".
1571
+ #
1572
+ # == Usage
1573
+ #
1574
+ # ds = Daru::DataFrame.new({
1575
+ # :id => Daru::Vector.new([1,2,3,4,5]),
1576
+ # :name => Daru::Vector.new(%w{Alex Peter Susan Mary John})
1577
+ # })
1578
+ # ds.create_sql('names')
1579
+ # ==>"CREATE TABLE names (id INTEGER,\n name VARCHAR (255)) CHARACTER SET=UTF8;"
1580
+ #
1581
# Returns a "CREATE TABLE" statement String with one column per vector,
# typed by that vector's db_type.
#
# table   - name of the table.
# charset - character set named in the statement (default "UTF8").
def create_sql(table,charset="UTF8")
  columns = self.vectors.to_a.map do |field|
    field.to_s + " " + self[field].db_type
  end

  "CREATE TABLE #{table} (" + columns.join(",\n ") + ") CHARACTER SET=#{charset};"
end
1590
+
1591
+ # Convert all numeric vectors to GSL::Matrix
1592
# Converts the vectors listed by numeric_vectors into a GSL::Matrix with
# one matrix row per DataFrame row.
def to_gsl
  columns = numeric_vectors.map { |vec_name| self[vec_name].to_a }

  GSL::Matrix.alloc(*columns.transpose)
end
1600
+
637
1601
  # Convert all vectors of type *:numeric* into a Matrix.
638
1602
  def to_matrix
639
1603
  numerics_as_arrays = []
@@ -644,22 +1608,27 @@ module Daru
644
1608
  Matrix.columns numerics_as_arrays
645
1609
  end
646
1610
 
1611
+ # Return a Nyaplot::DataFrame from the data of this DataFrame.
1612
# Builds a Nyaplot::DataFrame from the first element of to_a.
def to_nyaplotdf
  records = to_a[0]
  Nyaplot::DataFrame.new(records)
end
1615
+
647
1616
  # Convert all vectors of type *:numeric* and not containing nils into an NMatrix.
648
1617
  def to_nmatrix
649
1618
  numerics_as_arrays = []
650
1619
  each_vector do |vector|
651
1620
  numerics_as_arrays << vector.to_a if(vector.type == :numeric and
652
- vector.nil_positions.size == 0)
1621
+ vector.missing_positions.size == 0)
653
1622
  end
654
1623
 
655
1624
  numerics_as_arrays.transpose.to_nm
656
1625
  end
657
1626
 
658
1627
  # Converts the DataFrame into an array of hashes where key is vector name
659
- # and value is the corresponding element. The 0th index of the array contains
660
- # the array of hashes while the 1th index contains the indexes of each row
661
- # of the dataframe. Each element in the index array corresponds to its row
662
- # in the array of hashes, which has the same index.
1628
+ # and value is the corresponding element. The 0th index of the array contains
1629
+ # the array of hashes while the 1th index contains the indexes of each row
1630
+ # of the dataframe. Each element in the index array corresponds to its row
1631
+ # in the array of hashes, which has the same index.
663
1632
  def to_a
664
1633
  arry = [[],[]]
665
1634
  self.each_row do |row|
@@ -678,9 +1647,26 @@ module Daru
678
1647
  end
679
1648
  end
680
1649
 
1650
+ # Converts DataFrame to a hash with keys as vector names and values as
1651
+ # the corresponding vectors.
1652
# Returns a Hash mapping each vector name to the vector stored at the
# same position in the underlying data.
def to_hash
  mapping = {}
  @vectors.each_with_index { |name, position| mapping[name] = @data[position] }
  mapping
end
1660
+
681
1661
  # Convert to html for IRuby.
682
1662
  def to_html threshold=30
683
- html = '<table><tr><th></th>'
1663
+ html = "<table>" +
1664
+ "<tr>" +
1665
+ "<th colspan=\"#{@vectors.size+1}\">" +
1666
+ "Daru::DataFrame:#{self.object_id} " + " rows: #{nrows} " + " cols: #{ncols}"
1667
+ "</th>" +
1668
+ "</tr>"
1669
+ html +='<tr><th></th>'
684
1670
  @vectors.each { |vector| html += '<th>' + vector.to_s + '</th>' }
685
1671
  html += '</tr>'
686
1672
 
@@ -697,6 +1683,15 @@ module Daru
697
1683
  html += '<tr>'
698
1684
  (@vectors + 1).size.times { html += '<td>...</td>' }
699
1685
  html += '</tr>'
1686
+
1687
+ last_index = @index.to_a.last
1688
+ last_row = self.row[last_index]
1689
+ html += '<tr>'
1690
+ html += "<td>" + last_index.to_s + "</td>"
1691
+ (0..(ncols - 1)).to_a.each do |i|
1692
+ html += '<td>' + last_row[i].to_s + '</td>'
1693
+ end
1694
+ html += '</tr>'
700
1695
  break
701
1696
  end
702
1697
  end
@@ -709,6 +1704,87 @@ module Daru
709
1704
  to_html
710
1705
  end
711
1706
 
1707
+ # Method for updating the metadata (i.e. missing value positions) of the
1708
+ # vectors after assignment/deletion etc. are complete. This is provided so that
1709
+ # time is not wasted in creating the metadata for the vector each time
1710
+ # assignment/deletion of elements is done. Updating data this way is called
1711
+ # lazy loading. To set or unset lazy loading, see the .lazy_update= method.
1712
# Calls update on every underlying vector, but only when Daru.lazy_update
# is enabled; otherwise does nothing.
def update
  return unless Daru.lazy_update

  @data.each(&:update)
end
1715
+
1716
# Sets the name of this DataFrame. Numeric names are stored unchanged
# (and nil is returned); anything else is converted with to_sym.
def rename new_name
  unless new_name.is_a?(Numeric)
    return @name = new_name.to_sym
  end

  @name = new_name
  nil
end
1723
+
1724
+ # Write this DataFrame to a CSV file.
1725
+ #
1726
+ # == Arguments
1727
+ #
1728
+ # * filename - Path of CSV file where the DataFrame is to be saved.
1729
+ #
1730
+ # == Options
1731
+ #
1732
+ # * convert_comma - If set to *true*, will convert any commas in any
1733
+ # of the data to full stops ('.').
1734
+ # All the options accepted by CSV.read() can also be passed into this
1735
+ # function.
1736
# Delegates to Daru::IO.dataframe_write_csv with this DataFrame, the
# target path and the options Hash.
def write_csv(filename, opts={})
  Daru::IO.dataframe_write_csv(self, filename, opts)
end
1739
+
1740
+ # Write this dataframe to an Excel Spreadsheet
1741
+ #
1742
+ # == Arguments
1743
+ #
1744
+ # * filename - The path of the file where the DataFrame should be written.
1745
# Delegates to Daru::IO.dataframe_write_excel with this DataFrame, the
# target path and the options Hash.
def write_excel(filename, opts={})
  Daru::IO.dataframe_write_excel(self, filename, opts)
end
1748
+
1749
+ # Insert each case of the Dataset on the selected table
1750
+ #
1751
+ # == Arguments
1752
+ #
1753
+ # * dbh - DBI database connection object.
1754
+ # * table - Name of the table to insert the rows into.
1755
+ #
1756
+ # == Usage
1757
+ #
1758
+ # ds = Daru::DataFrame.new({:id=>Daru::Vector.new([1,2,3]), :name=>Daru::Vector.new(["a","b","c"])})
1759
+ # dbh = DBI.connect("DBI:Mysql:database:localhost", "user", "password")
1760
+ # ds.write_sql(dbh,"test")
1761
# Delegates to Daru::IO.dataframe_write_sql with this DataFrame, the
# database handle and the table name.
def write_sql(dbh, table)
  Daru::IO.dataframe_write_sql(self, dbh, table)
end
1764
+
1765
+
1766
+ # Use marshalling to save dataframe to a file.
1767
# Delegates to Daru::IO.save with this DataFrame and the target path.
def save(filename)
  Daru::IO.save(self, filename)
end
1770
+
1771
+ def _dump depth
1772
+ Marshal.dump({
1773
+ data: @data,
1774
+ index: @index.to_a,
1775
+ order: @vectors.to_a,
1776
+ name: @name
1777
+ })
1778
+ end
1779
+
1780
# Marshal hook: rebuilds a DataFrame from the marshalled Hash with the
# keys :data, :index, :order and :name.
def self._load data
  attrs = Marshal.load data
  options = { index: attrs[:index], order: attrs[:order], name: attrs[:name] }

  Daru::DataFrame.new(attrs[:data], **options)
end
1787
+
712
1788
  # Change dtypes of vectors by supplying a hash of :vector_name => :new_dtype
713
1789
  #
714
1790
  # == Usage
@@ -733,9 +1809,9 @@ module Daru
733
1809
  # Pretty print in a nice table format for the command line (irb/pry/iruby)
734
1810
  def inspect spacing=10, threshold=15
735
1811
  longest = [@name.to_s.size,
736
- @vectors.map(&:to_s).map(&:size).max,
737
- @index .map(&:to_s).map(&:size).max,
738
- @data .map{ |v| v.map(&:to_s).map(&:size).max }.max].max
1812
+ (@vectors.map(&:to_s).map(&:size).max || 0),
1813
+ (@index .map(&:to_s).map(&:size).max || 0),
1814
+ (@data .map{ |v| v.map(&:to_s).map(&:size).max}.max || 0)].max
739
1815
 
740
1816
  name = @name || 'nil'
741
1817
  content = ""
@@ -901,6 +1977,8 @@ module Daru
901
1977
 
902
1978
  def access_vector *names
903
1979
  location = names[0]
1980
+
1981
+ return dup(@vectors[location]) if location.is_a?(Range)
904
1982
  if @vectors.is_a?(MultiIndex)
905
1983
  pos = vectors_index_for names
906
1984
 
@@ -996,41 +2074,68 @@ module Daru
996
2074
  end
997
2075
 
998
2076
  def insert_or_modify_vector name, vector
999
- @vectors = reassign_index_as(@vectors + name)
1000
- v = nil
2077
+ if vectors.is_a?(Index)
2078
+ name = name[0]
2079
+ end
1001
2080
 
1002
- if vector.is_a?(Daru::Vector)
1003
- v = Daru::Vector.new [], name: set_name(name), index: @index
1004
- @index.each do |idx|
1005
- v[idx] = vector[idx]
2081
+ @vectors = @vectors + name if !@vectors.include?(name)
2082
+ v = nil
2083
+
2084
+ if @index.empty?
2085
+ v = vector.is_a?(Daru::Vector) ? vector : Daru::Vector.new(vector.to_a)
2086
+ @index = v.index
2087
+ @data[@vectors[name]] = v
2088
+ set_size
2089
+
2090
+ @data.map! do |v|
2091
+ if v.size == 0
2092
+ Daru::Vector.new([nil]*@size, name: set_name(name), index: @index)
2093
+ else
2094
+ v
2095
+ end
1006
2096
  end
1007
2097
  else
1008
- raise Exception, "Specified vector of length #{vector.size} cannot be inserted in DataFrame of size #{@size}" if
1009
- @size != vector.size
2098
+ if vector.is_a?(Daru::Vector)
2099
+ v = Daru::Vector.new [], name: set_name(name), index: @index
2100
+ @index.each do |idx|
2101
+ v[idx] = vector[idx]
2102
+ end
2103
+ else
2104
+ raise Exception, "Specified vector of length #{vector.size} cannot be inserted in DataFrame of size #{@size}" if
2105
+ @size != vector.size
1010
2106
 
1011
- v = Daru::Vector.new(vector, name: set_name(name), index: @index)
1012
- end
2107
+ v = Daru::Vector.new(vector, name: set_name(name), index: @index)
2108
+ end
1013
2109
 
1014
- @data[@vectors[name]] = v
2110
+ @data[@vectors[name]] = v
2111
+ end
1015
2112
  end
1016
2113
 
1017
- def insert_or_modify_row name, vector
1018
- if @index.include? name
1019
- v = vector.dv(name, @vectors, @dtype)
1020
-
1021
- @vectors.each do |vector|
1022
- @data[@vectors[vector]][name] = v[vector]
1023
- end
2114
+ def insert_or_modify_row name, vector
2115
+ if index.is_a?(MultiIndex)
2116
+ # TODO
1024
2117
  else
1025
- @index = reassign_index_as(@index + name)
1026
- v = Daru::Vector.new(vector, name: set_name(name), index: @vectors)
2118
+ name = name[0]
2119
+ v =
2120
+ if vector.is_a?(Daru::Vector)
2121
+ vector
2122
+ else
2123
+ Daru::Vector.new(vector, name: set_name(name), index: @vectors)
2124
+ end
1027
2125
 
1028
- @vectors.each do |vector|
1029
- @data[@vectors[vector]].concat v[vector], name
2126
+ if @index.include? name
2127
+ @vectors.each do |vector|
2128
+ @data[@vectors[vector]][name] = v[vector]
2129
+ end
2130
+ else
2131
+ @index = reassign_index_as(@index + name)
2132
+ @vectors.each do |vector|
2133
+ @data[@vectors[vector]].concat v[vector], name
2134
+ end
1030
2135
  end
1031
- end
1032
2136
 
1033
- set_size
2137
+ set_size
2138
+ end
1034
2139
  end
1035
2140
 
1036
2141
  def create_empty_vectors
@@ -1081,18 +2186,22 @@ module Daru
1081
2186
  def create_vectors_index_with vectors, source
1082
2187
  vectors = source.keys.sort if vectors.nil?
1083
2188
 
2189
+ @vectors =
1084
2190
  unless vectors.is_a?(Index) or vectors.is_a?(MultiIndex)
1085
- @vectors = Daru::Index.new (vectors + (source.keys - vectors)).uniq.map(&:to_sym)
2191
+ Daru::Index.new((vectors + (source.keys - vectors))
2192
+ .uniq
2193
+ .map { |e| e.respond_to?(:to_sym) ? e.to_sym : e }
2194
+ )
1086
2195
  else
1087
- @vectors = vectors
2196
+ vectors
1088
2197
  end
1089
2198
  end
1090
2199
 
1091
2200
  def all_vectors_have_equal_indexes? source
1092
- index = source.values[0].index
2201
+ idx = source.values[0].index
1093
2202
 
1094
2203
  source.all? do |name, vector|
1095
- index == vector.index
2204
+ idx == vector.index
1096
2205
  end
1097
2206
  end
1098
2207