daru 0.0.5 → 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (48) hide show
  1. checksums.yaml +4 -4
  2. data/.build.sh +14 -0
  3. data/.travis.yml +26 -4
  4. data/CONTRIBUTING.md +31 -0
  5. data/Gemfile +1 -2
  6. data/{History.txt → History.md} +110 -44
  7. data/README.md +21 -288
  8. data/Rakefile +1 -0
  9. data/daru.gemspec +12 -8
  10. data/lib/daru.rb +36 -1
  11. data/lib/daru/accessors/array_wrapper.rb +8 -3
  12. data/lib/daru/accessors/gsl_wrapper.rb +113 -0
  13. data/lib/daru/accessors/nmatrix_wrapper.rb +6 -17
  14. data/lib/daru/core/group_by.rb +0 -1
  15. data/lib/daru/dataframe.rb +1192 -83
  16. data/lib/daru/extensions/rserve.rb +21 -0
  17. data/lib/daru/index.rb +14 -0
  18. data/lib/daru/io/io.rb +170 -8
  19. data/lib/daru/maths/arithmetic/dataframe.rb +4 -3
  20. data/lib/daru/maths/arithmetic/vector.rb +4 -4
  21. data/lib/daru/maths/statistics/dataframe.rb +48 -27
  22. data/lib/daru/maths/statistics/vector.rb +215 -33
  23. data/lib/daru/monkeys.rb +53 -7
  24. data/lib/daru/multi_index.rb +21 -4
  25. data/lib/daru/plotting/dataframe.rb +83 -25
  26. data/lib/daru/plotting/vector.rb +9 -10
  27. data/lib/daru/vector.rb +596 -61
  28. data/lib/daru/version.rb +3 -0
  29. data/spec/accessors/wrappers_spec.rb +51 -0
  30. data/spec/core/group_by_spec.rb +0 -2
  31. data/spec/daru_spec.rb +58 -0
  32. data/spec/dataframe_spec.rb +768 -73
  33. data/spec/extensions/rserve_spec.rb +52 -0
  34. data/spec/fixtures/bank2.dat +200 -0
  35. data/spec/fixtures/repeated_fields.csv +7 -0
  36. data/spec/fixtures/scientific_notation.csv +4 -0
  37. data/spec/fixtures/test_xls.xls +0 -0
  38. data/spec/io/io_spec.rb +161 -24
  39. data/spec/math/arithmetic/dataframe_spec.rb +26 -7
  40. data/spec/math/arithmetic/vector_spec.rb +8 -0
  41. data/spec/math/statistics/dataframe_spec.rb +16 -1
  42. data/spec/math/statistics/vector_spec.rb +215 -47
  43. data/spec/spec_helper.rb +21 -2
  44. data/spec/vector_spec.rb +368 -12
  45. metadata +99 -16
  46. data/lib/version.rb +0 -3
  47. data/notebooks/grouping_splitting_pivots.ipynb +0 -529
  48. data/notebooks/intro_with_music_data_.ipynb +0 -303
data/Rakefile CHANGED
@@ -1,4 +1,5 @@
1
1
  require 'rspec/core/rake_task'
2
+ require 'bundler/gem_tasks'
2
3
 
3
4
  RSpec::Core::RakeTask.new(:spec)
4
5
 
@@ -1,9 +1,9 @@
1
1
  # coding: utf-8
2
2
  $:.unshift File.expand_path("../lib", __FILE__)
3
3
 
4
- require 'version.rb'
4
+ require 'daru/version.rb'
5
5
 
6
- DESCRIPTION = <<MSG
6
+ Daru::DESCRIPTION = <<MSG
7
7
  Daru (Data Analysis in RUby) is a library for analysis, manipulation and visualization
8
8
  of data.
9
9
 
@@ -18,7 +18,7 @@ Gem::Specification.new do |spec|
18
18
  spec.authors = ['Sameer Deshmukh']
19
19
  spec.email = ['sameer.deshmukh93@gmail.com']
20
20
  spec.summary = %q{Data Analysis in RUby}
21
- spec.description = DESCRIPTION
21
+ spec.description = Daru::DESCRIPTION
22
22
  spec.homepage = "http://github.com/v0dro/daru"
23
23
  spec.license = 'BSD-2'
24
24
 
@@ -27,12 +27,16 @@ Gem::Specification.new do |spec|
27
27
  spec.test_files = spec.files.grep(%r{^(test|spec|features)/})
28
28
  spec.require_paths = ["lib"]
29
29
 
30
- spec.add_development_dependency 'bundler'
30
+ spec.add_runtime_dependency 'reportbuilder', '~> 1.4'
31
+ spec.add_runtime_dependency 'spreadsheet', '~> 1.0.3'
32
+
33
+ spec.add_development_dependency 'bundler', '~> 1.10'
31
34
  spec.add_development_dependency 'rake'
35
+ spec.add_development_dependency 'rserve-client', '~> 0.3'
32
36
  spec.add_development_dependency 'rspec'
33
37
  spec.add_development_dependency 'awesome_print'
34
- spec.add_development_dependency 'nyaplot'
35
- if RUBY_ENGINE != 'jruby'
36
- spec.add_development_dependency 'nmatrix', '~> 0.1.0'
37
- end
38
+ spec.add_development_dependency 'nyaplot', '~> 0.1.5'
39
+ spec.add_development_dependency 'nmatrix', '~> 0.1.0'
40
+ spec.add_development_dependency 'distribution', '~> 0.7'
41
+ spec.add_development_dependency 'gsl-nmatrix', '~>1.17'
38
42
  end
@@ -2,10 +2,45 @@ def jruby?
2
2
  RUBY_ENGINE == 'jruby'
3
3
  end
4
4
 
5
- require 'csv'
5
+ module Daru
6
+ SPLIT_TOKEN = ','
7
+ class << self
8
+ @@lazy_update = false
9
+
10
+ # A variable which will set whether Vector metadata is updated immediately or lazily.
11
+ # Call the #update method every time a values are set or removed in order to update
12
+ # metadata like positions of missing values.
13
+ attr_accessor :lazy_update
14
+
15
+ def create_has_library(library)
16
+ define_singleton_method("has_#{library}?") do
17
+ cv = "@@#{library}"
18
+ unless class_variable_defined? cv
19
+ begin
20
+ require library.to_s
21
+ class_variable_set(cv, true)
22
+ rescue LoadError
23
+ class_variable_set(cv, false)
24
+ end
25
+ end
26
+ class_variable_get(cv)
27
+ end
28
+ end
29
+ end
30
+
31
+ create_has_library :gsl
32
+ create_has_library :nmatrix
33
+ create_has_library :nyaplot
34
+ end
35
+
36
+ autoload :Spreadsheet, 'spreadsheet'
37
+ autoload :CSV, 'csv'
38
+
6
39
  require 'matrix'
7
40
  require 'securerandom'
41
+ require 'reportbuilder'
8
42
 
43
+ require 'daru/version.rb'
9
44
  require 'daru/index.rb'
10
45
  require 'daru/multi_index.rb'
11
46
  require 'daru/vector.rb'
@@ -3,13 +3,18 @@ module Daru
3
3
  # Internal class for wrapping ruby array
4
4
  class ArrayWrapper
5
5
  include Enumerable
6
+ extend Forwardable
6
7
 
8
+ def_delegators :@data, :slice!
9
+
7
10
  def each(&block)
8
11
  @data.each(&block)
12
+ self
9
13
  end
10
14
 
11
15
  def map!(&block)
12
16
  @data.map!(&block)
17
+ self
13
18
  end
14
19
 
15
20
  attr_accessor :size
@@ -22,8 +27,8 @@ module Daru
22
27
  set_size
23
28
  end
24
29
 
25
- def [] index
26
- @data[index]
30
+ def [] *index
31
+ @data[*index]
27
32
  end
28
33
 
29
34
  def []= index, value
@@ -62,7 +67,7 @@ module Daru
62
67
  end
63
68
 
64
69
  def mean
65
- sum.quo(@size - @context.nil_positions.size).to_f
70
+ sum.quo(@size - @context.missing_positions.size).to_f
66
71
  end
67
72
 
68
73
  def product
@@ -0,0 +1,113 @@
1
+ module Daru
2
+ module Accessors
3
+ module GSLStatistics
4
+ def vector_standardized_compute(m,sd)
5
+ Daru::Vector.new @data.collect { |x| (x.to_f - m).quo(sd) }, dtype: :gsl,
6
+ index: @context.index, name: @context.name
7
+ end
8
+
9
+ def vector_centered_compute(m)
10
+ Daru::Vector.new @data.collect {|x| (x.to_f - m)}, dtype: :gsl,
11
+ index: @context.index, name: @context.name
12
+ end
13
+
14
+ def sample_with_replacement(sample=1)
15
+ r = GSL::Rng.alloc(GSL::Rng::MT19937,rand(10000))
16
+ Daru::Vector.new(r.sample(@data, sample).to_a, dtype: :gsl,
17
+ index: @context.index, name: @context.name)
18
+ end
19
+
20
+ def sample_without_replacement(sample=1)
21
+ r = GSL::Rng.alloc(GSL::Rng::MT19937,rand(10000))
22
+ r.choose(@data, sample).to_a
23
+ end
24
+
25
+ def median
26
+ GSL::Stats::median_from_sorted_data(@data.sort)
27
+ end
28
+
29
+ def variance_sample(m)
30
+ @data.variance_m
31
+ end
32
+
33
+ def standard_deviation_sample(m)
34
+ @data.sd(m)
35
+ end
36
+
37
+ def variance_population(m)
38
+ @data.variance_with_fixed_mean(m)
39
+ end
40
+
41
+ def standard_deviation_population m
42
+ @data.sd_with_fixed_mean(m)
43
+ end
44
+
45
+ def skew
46
+ @data.skew
47
+ end
48
+
49
+ def kurtosis
50
+ @data.kurtosis
51
+ end
52
+ end
53
+
54
+ class GSLWrapper
55
+ include Enumerable
56
+ extend Forwardable
57
+ include Daru::Accessors::GSLStatistics
58
+
59
+ def_delegators :@data, :[], :size, :to_a, :each, :mean,
60
+ :sum, :prod, :max, :min
61
+
62
+ alias :product :prod
63
+
64
+ attr_reader :data
65
+
66
+ def each(&block)
67
+ @data.each(&block)
68
+ self
69
+ end
70
+
71
+ def map!(&block)
72
+ @data.map!(&block)
73
+ self
74
+ end
75
+
76
+ def initialize data, context
77
+ @data = ::GSL::Vector.alloc(data)
78
+ @context = context
79
+ end
80
+
81
+ def []= index, element
82
+ if index == size
83
+ push element
84
+ else
85
+ @data[index] = element
86
+ end
87
+ end
88
+
89
+ def delete_at index
90
+ @data.delete_at index
91
+ end
92
+
93
+ def index key
94
+ @data.to_a.index key
95
+ end
96
+
97
+ def push value
98
+ @data = @data.concat value
99
+ self
100
+ end
101
+ alias :<< :push
102
+ alias :concat :push
103
+
104
+ def dup
105
+ GSLWrapper.new(@data.to_a, @context)
106
+ end
107
+
108
+ def == other
109
+ @data == other.data
110
+ end
111
+ end
112
+ end
113
+ end if Daru.has_gsl?
@@ -1,9 +1,3 @@
1
- begin
2
- require 'nmatrix' unless jruby?
3
- rescue LoadError => e
4
- puts "Please install the nmatrix gem for fast and efficient data storage."
5
- end
6
-
7
1
  module Daru
8
2
  module Accessors
9
3
  # Internal class for wrapping NMatrix
@@ -12,23 +6,18 @@ module Daru
12
6
 
13
7
  def each(&block)
14
8
  @data[0...@size].each(&block)
15
- end
16
-
17
- def map(&block)
18
- @data[0...@size].map(&block)
9
+ self
19
10
  end
20
11
 
21
12
  def map!(&block)
22
13
  @data = NMatrix.new [@size*2], map(&block).to_a, dtype: nm_dtype
14
+ self
23
15
  end
24
16
 
25
17
  def inject(*args, &block)
26
18
  @data[0...@size].inject(*args, &block)
27
19
  end
28
20
 
29
- alias_method :recode, :map
30
- alias_method :recode!, :map!
31
-
32
21
  attr_reader :size, :data, :nm_dtype
33
22
 
34
23
  def initialize vector, context, nm_dtype=:int32
@@ -39,8 +28,8 @@ module Daru
39
28
  # init with twice the storage for reducing the need to resize
40
29
  end
41
30
 
42
- def [] index
43
- return @data[index] if index < @size
31
+ def [] *index
32
+ return @data[*index] if index[0] < @size
44
33
  nil
45
34
  end
46
35
 
@@ -79,7 +68,7 @@ module Daru
79
68
  end
80
69
 
81
70
  def dup
82
- NMatrixWrapper.new @data.to_a, @context, @nm_dtype
71
+ NMatrixWrapper.new @data[0...@size].to_a, @context, @nm_dtype
83
72
  end
84
73
 
85
74
  def resize size = @size*2
@@ -109,4 +98,4 @@ module Daru
109
98
  end
110
99
  end
111
100
  end
112
- end
101
+ end if Daru.has_nmatrix?
@@ -171,7 +171,6 @@ module Daru
171
171
  else
172
172
  arry.map { |e| e.is_a?(Numeric) ? e : e.to_sym }
173
173
  end
174
-
175
174
  symbolized_arry
176
175
  end
177
176
 
@@ -12,17 +12,82 @@ module Daru
12
12
 
13
13
  include Daru::Maths::Arithmetic::DataFrame
14
14
  include Daru::Maths::Statistics::DataFrame
15
- include Daru::Plotting::DataFrame
15
+ include Daru::Plotting::DataFrame if Daru.has_nyaplot?
16
16
 
17
17
  class << self
18
- # Load data from a CSV file.
19
- # Arguments - path, options, block(optional)
18
+ # Load data from a CSV file. Specify an optional block to grab the CSV
19
+ # object and pre-condition it (for example use the `convert` or
20
+ # `header_convert` methods).
20
21
  #
21
- # Accepts a block for pre-conditioning of CSV data if any.
22
+ # == Arguments
23
+ #
24
+ # * path - Path of the file to load specified as a String.
25
+ #
26
+ # == Options
27
+ #
28
+ # Accepts the same options as the Daru::DataFrame constructor and CSV.open()
29
+ # and uses those to eventually construct the resulting DataFrame.
30
+ #
31
+ # == Verbose Description
32
+ #
33
+ # You can specify all the options to the `.from_csv` function that you
34
+ # do to the Ruby `CSV.read()` function, since this is what is used internally.
35
+ #
36
+ # For example, if the columns in your CSV file are separated by something
37
+ # other that commas, you can use the `:col_sep` option. If you want to
38
+ # convert numeric values to numbers and not keep them as strings, you can
39
+ # use the `:converters` option and set it to `:numeric`.
40
+ #
41
+ # The `.from_csv` function uses the following defaults for reading CSV files
42
+ # (that are passed into the `CSV.read()` function):
43
+ #
44
+ # {
45
+ # :col_sep => ',',
46
+ # :converters => :numeric
47
+ # }
22
48
  def from_csv path, opts={}, &block
23
49
  Daru::IO.from_csv path, opts, &block
24
50
  end
25
51
 
52
+ # Read data from an Excel file into a DataFrame.
53
+ #
54
+ # == Arguments
55
+ #
56
+ # * path - Path of the file to be read.
57
+ #
58
+ # == Options
59
+ #
60
+ # * :worksheet_id - ID of the worksheet that is to be read.
61
+ def from_excel path, opts={}, &block
62
+ Daru::IO.from_excel path, opts, &block
63
+ end
64
+
65
+ # Read a database query and returns a Dataset
66
+ #
67
+ # USE:
68
+ #
69
+ # dbh = DBI.connect("DBI:Mysql:database:localhost", "user", "password")
70
+ # Daru::DataFrame.from_sql(dbh, "SELECT * FROM test")
71
+ def from_sql dbh, query
72
+ Daru::IO.from_sql dbh, query
73
+ end
74
+
75
+ # Read the database from a plaintext file. For this method to work,
76
+ # the data should be present in a plain text file in columns. See
77
+ # spec/fixtures/bank2.dat for an example.
78
+ #
79
+ # == Arguments
80
+ #
81
+ # * path - Path of the file to be read.
82
+ # * fields - Vector names of the resulting database.
83
+ #
84
+ # == Usage
85
+ #
86
+ # df = Daru::DataFrame.from_plaintext 'spec/fixtures/bank2.dat', [:v1,:v2,:v3,:v4,:v5,:v6]
87
+ def from_plaintext path, fields
88
+ Daru::IO.from_plaintext path, fields
89
+ end
90
+
26
91
  # Create DataFrame by specifying rows as an Array of Arrays or Array of
27
92
  # Daru::Vector objects.
28
93
  def rows source, opts={}
@@ -52,6 +117,58 @@ module Daru
52
117
 
53
118
  df
54
119
  end
120
+
121
+ # Generates a new dataset, using three vectors
122
+ # - Rows
123
+ # - Columns
124
+ # - Values
125
+ #
126
+ # For example, you have these values
127
+ #
128
+ # x y v
129
+ # a a 0
130
+ # a b 1
131
+ # b a 1
132
+ # b b 0
133
+ #
134
+ # You obtain
135
+ # id a b
136
+ # a 0 1
137
+ # b 1 0
138
+ #
139
+ # Useful to process outputs from databases
140
+ def crosstab_by_assignation rows, columns, values
141
+ raise "Three vectors should be equal size" if
142
+ rows.size != columns.size or rows.size!=values.size
143
+
144
+ cols_values = columns.factors
145
+ cols_n = cols_values.size
146
+
147
+ h_rows = rows.factors.inject({}) do |a,v|
148
+ a[v] = cols_values.inject({}) do |a1,v1|
149
+ a1[v1]=nil
150
+ a1
151
+ end
152
+ a
153
+ end
154
+
155
+ values.each_index do |i|
156
+ h_rows[rows[i]][columns[i]] = values[i]
157
+ end
158
+ df = Daru::DataFrame.new({}, order: [:_id] + cols_values.to_a)
159
+
160
+ rows.factors.each do |row|
161
+ n_row = Array.new(cols_n+1)
162
+ n_row[0] = row
163
+ cols_values.each_index do |i|
164
+ n_row[i+1] = h_rows[row][cols_values[i]]
165
+ end
166
+
167
+ df.add_row(n_row)
168
+ end
169
+ df.update
170
+ df
171
+ end
55
172
  end
56
173
 
57
174
  # The vectors (columns) index of the DataFrame
@@ -67,8 +184,29 @@ module Daru
67
184
  attr_reader :size
68
185
 
69
186
  # DataFrame basically consists of an Array of Vector objects.
70
- # These objects are indexed by row and column by vectors and index Index objects.
71
- # Arguments - source, vectors, index, name.
187
+ # These objects are indexed by row and column by vectors and index Index objects.
188
+ #
189
+ # == Arguments
190
+ #
191
+ # * source - Source from which the DataFrame is to be initialized. Can be a Hash
192
+ # of names and vectors (array or Daru::Vector), an array of arrays or
193
+ # array of Daru::Vectors.
194
+ #
195
+ # == Options
196
+ #
197
+ # +:order+ - An *Array*/*Daru::Index*/*Daru::MultiIndex* containing the order in
198
+ # which Vectors should appear in the DataFrame.
199
+ #
200
+ # +:index+ - An *Array*/*Daru::Index*/*Daru::MultiIndex* containing the order
201
+ # in which rows of the DataFrame will be named.
202
+ #
203
+ # +:name+ - A name for the DataFrame.
204
+ #
205
+ # +:clone+ - Specify as *true* or *false*. When set to false, and Vector
206
+ # objects are passed for the source, the Vector objects will not be duplicated
207
+ # when creating the DataFrame. Will have no effect if Array is passed in
208
+ # the source, or if the passed Daru::Vectors have different indexes.
209
+ # Default to *true*.
72
210
  #
73
211
  # == Usage
74
212
  # df = Daru::DataFrame.new({a: [1,2,3,4], b: [6,7,8,9]}, order: [:b, :a],
@@ -84,9 +222,12 @@ module Daru
84
222
  def initialize source, opts={}
85
223
  vectors = opts[:order]
86
224
  index = opts[:index]
87
- @name = (opts[:name] || SecureRandom.uuid).to_sym
225
+ clone = opts[:clone] == false ? false : true
88
226
  @data = []
89
227
 
228
+ temp_name = opts[:name]
229
+ @name = temp_name.is_a?(Numeric) ? temp_name : (temp_name || SecureRandom.uuid).to_sym
230
+
90
231
  if source.empty?
91
232
  @vectors = create_index vectors
92
233
  @index = create_index index
@@ -109,7 +250,7 @@ module Daru
109
250
  vectors.each_with_index do |name, idx|
110
251
  hsh[name] = source[idx]
111
252
  end
112
- initialize(hsh, index: index, order: vectors, name: @name)
253
+ initialize(hsh, index: index, order: vectors, name: @name, clone: clone)
113
254
  else # array of hashes
114
255
  if vectors.nil?
115
256
  @vectors = Daru::Index.new source[0].keys.map(&:to_sym)
@@ -143,13 +284,19 @@ module Daru
143
284
  all_indexes.flatten!.uniq!.sort!
144
285
 
145
286
  @index = Daru::Index.new all_indexes
287
+ clone = true
146
288
  end
147
- @vectors.each do |vector|
148
- @data << Daru::Vector.new([], name: vector, index: @index)
149
289
 
150
- @index.each do |idx|
151
- @data[@vectors[vector]][idx] = source[vector][idx]
290
+ if clone
291
+ @vectors.each do |vector|
292
+ @data << Daru::Vector.new([], name: vector, index: @index)
293
+
294
+ @index.each do |idx|
295
+ @data[@vectors[vector]][idx] = source[vector][idx]
296
+ end
152
297
  end
298
+ else
299
+ @data.concat source.values
153
300
  end
154
301
  else
155
302
  @index = create_index(index || source.values[0].size)
@@ -163,6 +310,7 @@ module Daru
163
310
 
164
311
  set_size
165
312
  validate
313
+ update
166
314
  end
167
315
 
168
316
  # Access row or vector. Specify name of row/vector followed by axis(:row, :vector).
@@ -176,6 +324,7 @@ module Daru
176
324
  else
177
325
  axis = :vector
178
326
  end
327
+ names.map! { |e| e.respond_to?(:to_sym) ? e.to_sym : e }
179
328
 
180
329
  if axis == :vector
181
330
  access_vector *names
@@ -194,11 +343,14 @@ module Daru
194
343
  # of the vector will be matched against the row/vector indexes of the DataFrame
195
344
  # before an insertion is performed. Unmatched indexes will be set to nil.
196
345
  def []=(*args)
197
- name = args[0]
198
- axis = args[1]
346
+ axis = args.include?(:row) ? :row : :vector
347
+ args.delete :vector
348
+ args.delete :row
349
+
350
+ name = args[0..-2]
199
351
  vector = args[-1]
352
+ name.map! { |e| e.respond_to?(:to_sym) ? e.to_sym : e }
200
353
 
201
- axis = (!axis.is_a?(Symbol) and (axis != :vector or axis != :row)) ? :vector : axis
202
354
  if axis == :vector
203
355
  insert_or_modify_vector name, vector
204
356
  elsif axis == :row
@@ -222,6 +374,14 @@ module Daru
222
374
  vector[name]
223
375
  end
224
376
 
377
+ def add_row row, index=nil
378
+ self.row[index || @size] = row
379
+ end
380
+
381
+ def add_vector n, vector
382
+ self[n] = vector
383
+ end
384
+
225
385
  # Access a row or set/create a row. Refer #[] and #[]= docs for details.
226
386
  #
227
387
  # == Usage
@@ -232,13 +392,77 @@ module Daru
232
392
  end
233
393
 
234
394
  # Duplicate the DataFrame entirely.
235
- def dup
236
- src = {}
237
- @vectors.each do |vector|
238
- src[vector] = @data[@vectors[vector]].dup
395
+ #
396
+ # == Arguments
397
+ #
398
+ # * +vectors_to_dup+ - An Array specifying the names of Vectors to
399
+ # be duplicated. Will duplicate the entire DataFrame if not specified.
400
+ def dup vectors_to_dup=nil
401
+ vectors_to_dup = @vectors unless vectors_to_dup
402
+
403
+ new_order =
404
+ if vectors.is_a?(MultiIndex)
405
+ src = []
406
+ vectors_to_dup.each do |vec|
407
+ src << @data[@vectors[vec]].dup
408
+ end
409
+
410
+ Daru::MultiIndex.new(vectors_to_dup)
411
+ else
412
+ src = {}
413
+ vectors_to_dup.each do |vector|
414
+ src[vector] = @data[@vectors[vector]].dup
415
+ end
416
+
417
+ Daru::Index.new(vectors_to_dup)
418
+ end
419
+
420
+ Daru::DataFrame.new src, order: new_order, index: @index.dup, name: @name, clone: true
421
+ end
422
+
423
+ # Only clone the structure of the DataFrame.
424
+ def clone_structure
425
+ Daru::DataFrame.new([], order: @vectors.dup, index: @index.dup, name: @name)
426
+ end
427
+
428
+ # Returns a 'view' of the DataFrame, i.e. the object IDs of vectors are
429
+ # preserved.
430
+ #
431
+ # == Arguments
432
+ #
433
+ # +vectors_to_clone+ - Names of vectors to clone. Optional. Will return
434
+ # a view of the whole data frame otherwise.
435
+ def clone *vectors_to_clone
436
+ vectors_to_clone.flatten! unless vectors_to_clone.all? { |a| !a.is_a?(Array) }
437
+ return super if vectors_to_clone.empty?
438
+
439
+ h = vectors_to_clone.inject({}) do |hsh, vec|
440
+ hsh[vec] = self[vec]
441
+ hsh
239
442
  end
443
+ Daru::DataFrame.new(h, clone: false)
444
+ end
240
445
 
241
- Daru::DataFrame.new src, order: @vectors.dup, index: @index.dup, name: @name
446
+ # Returns a 'shallow' copy of DataFrame if missing data is not present,
447
+ # or a full copy of only valid data if missing data is present.
448
+ def clone_only_valid
449
+ if has_missing_data?
450
+ dup_only_valid
451
+ else
452
+ clone
453
+ end
454
+ end
455
+
456
+ # Creates a new duplicate dataframe containing only rows
457
+ # without a single missing value.
458
+ def dup_only_valid vecs=nil
459
+ rows_with_nil = @data.inject([]) do |memo, vector|
460
+ memo.concat vector.missing_positions
461
+ memo
462
+ end.uniq
463
+
464
+ row_indexes = @index.to_a
465
+ (vecs.nil? ? self : dup(vecs)).row[*(row_indexes - rows_with_nil)]
242
466
  end
243
467
 
244
468
  # Iterate over each vector
@@ -286,21 +510,205 @@ module Daru
286
510
  self
287
511
  end
288
512
 
289
- # Map each vector. Returns a DataFrame whose vectors are modified according
290
- # to the value returned by the block. As is the case with Enumerable#map,
291
- # the object returned by each block must be a Daru::Vector for the dataframe
292
- # to remain relevant.
513
+ # Iterate over each row or vector of the DataFrame. Specify axis
514
+ # by passing :vector or :row as the argument. Default to :vector.
515
+ #
516
+ # == Description
517
+ #
518
+ # `#each` works exactly like Array#each. The default mode for `each`
519
+ # is to iterate over the columns of the DataFrame. To iterate over
520
+ # rows you must pass the axis, i.e `:row` as an argument.
521
+ #
522
+ # == Arguments
523
+ #
524
+ # * +axis+ - The axis to iterate over. Can be :vector (or :column)
525
+ # or :row. Default to :vector.
526
+ def each axis=:vector, &block
527
+ if axis == :vector or axis == :column
528
+ each_vector(&block)
529
+ elsif axis == :row
530
+ each_row(&block)
531
+ else
532
+ raise ArgumentError, "Unknown axis #{axis}"
533
+ end
534
+ end
535
+
536
+ # Iterate over a row or vector and return results in a Daru::Vector.
537
+ # Specify axis with :vector or :row. Default to :vector.
538
+ #
539
+ # == Description
540
+ #
541
+ # The #collect iterator works similarly to #map, the only difference
542
+ # being that it returns a Daru::Vector comprising of the results of
543
+ # each block run. The resultant Vector has the same index as that
544
+ # of the axis over which collect has iterated. It also accepts the
545
+ # optional axis argument.
546
+ #
547
+ # == Arguments
548
+ #
549
+ # * +axis+ - The axis to iterate over. Can be :vector (or :column)
550
+ # or :row. Default to :vector.
551
+ def collect axis=:vector, &block
552
+ if axis == :vector or axis == :column
553
+ collect_vectors(&block)
554
+ elsif axis == :row
555
+ collect_rows(&block)
556
+ else
557
+ raise ArgumentError, "Unknown axis #{axis}"
558
+ end
559
+ end
560
+
561
+ # Map over each vector or row of the data frame according to
562
+ # the argument specified. Will return an Array of the resulting
563
+ # elements. To map over each row/vector and get a DataFrame,
564
+ # see #recode.
565
+ #
566
+ # == Description
567
+ #
568
+ # The #map iterator works like Array#map. The value returned by
569
+ # each run of the block is added to an Array and the Array is
570
+ # returned. This method also accepts an axis argument, like #each.
571
+ # The default is :vector.
572
+ #
573
+ # == Arguments
574
+ #
575
+ # * +axis+ - The axis to map over. Can be :vector (or :column) or :row.
576
+ # Default to :vector.
577
+ def map axis=:vector, &block
578
+ if axis == :vector or axis == :column
579
+ map_vectors(&block)
580
+ elsif axis == :row
581
+ map_rows(&block)
582
+ else
583
+ raise ArgumentError, "Unknown axis #{axis}"
584
+ end
585
+ end
586
+
587
+ # Destructive map. Modifies the DataFrame. Each run of the block
588
+ # must return a Daru::Vector. You can specify the axis to map over
589
+ # as the argument. Default to :vector.
590
+ #
591
+ # == Arguments
592
+ #
593
+ # * +axis+ - The axis to map over. Can be :vector (or :column) or :row.
594
+ # Default to :vector.
595
+ def map! axis=:vector, &block
596
+ if axis == :vector or axis == :column
597
+ map_vectors!(&block)
598
+ elsif axis == :row
599
+ map_rows!(&block)
600
+ end
601
+ end
602
+
603
+ # Maps over the DataFrame and returns a DataFrame. Each run of the
604
+ # block must return a Daru::Vector object. You can specify the axis
605
+ # to map over. Default to :vector.
606
+ #
607
+ # == Description
608
+ #
609
+ # Recode works similarly to #map, but an important difference between
610
+ # the two is that recode returns a modified Daru::DataFrame instead
611
+ # of an Array. For this reason, #recode expects every run of the
612
+ # block to return a Daru::Vector.
613
+ #
614
+ # Just like map and each, recode also accepts an optional _axis_ argument.
615
+ #
616
+ # == Arguments
617
+ #
618
+ # * +axis+ - The axis to map over. Can be :vector (or :column) or :row.
619
+ # Default to :vector.
620
+ def recode axis=:vector, &block
621
+ if axis == :vector or axis == :column
622
+ recode_vectors(&block)
623
+ elsif axis == :row
624
+ recode_rows(&block)
625
+ end
626
+ end
627
+
628
+ # Retain vectors or rows if the block returns a truthy value.
629
+ #
630
+ # == Description
631
+ #
632
+ # For filtering out certain rows/vectors based on their values,
633
+ # use the #filter method. By default it iterates over vectors and
634
+ # keeps those vectors for which the block returns true. It accepts
635
+ # an optional axis argument which lets you specify whether you want
636
+ # to iterate over vectors or rows.
637
+ #
638
+ # == Arguments
639
+ #
640
+ # * +axis+ - The axis to map over. Can be :vector (or :column) or :row.
641
+ # Default to :vector.
642
+ #
643
+ # == Usage
644
+ #
645
+ # # Filter vectors
646
+ #
647
+ # df.filter do |vector|
648
+ # vector.type == :numeric and vector.median < 50
649
+ # end
650
+ #
651
+ # # Filter rows
652
+ #
653
+ # df.filter(:row) do |row|
654
+ # row[:a] + row[:d] < 100
655
+ # end
656
+ def filter axis=:vector, &block
657
+ if axis == :vector or axis == :column
658
+ filter_vectors(&block)
659
+ elsif axis == :row
660
+ filter_rows(&block)
661
+ end
662
+ end
663
+
664
+ def recode_vectors &block
665
+ block_given? or return to_enum(:recode_vectors)
666
+
667
+ df = self.dup
668
+ df.each_vector_with_index do |v, i|
669
+ ret = yield v
670
+ ret.is_a?(Daru::Vector) or raise TypeError, "Every iteration must return Daru::Vector not #{ret.class}"
671
+ df[*i] = ret
672
+ end
673
+
674
+ df
675
+ end
676
+
677
+ def recode_rows &block
678
+ block_given? or return to_enum(:recode_rows)
679
+
680
+ df = self.dup
681
+ df.each_row_with_index do |r, i|
682
+ ret = yield r
683
+ ret.is_a?(Daru::Vector) or raise TypeError, "Every iteration must return Daru::Vector not #{ret.class}"
684
+ df.row[i] = ret
685
+ end
686
+
687
+ df
688
+ end
689
+
690
+ # Map each vector and return an Array.
293
691
  def map_vectors(&block)
294
692
  return to_enum(:map_vectors) unless block_given?
295
693
 
296
- self.dup.map_vectors!(&block)
694
+ arry = []
695
+ @data.each do |vec|
696
+ arry << yield(vec)
697
+ end
698
+
699
+ arry
297
700
  end
298
701
 
299
702
  # Destructive form of #map_vectors
300
703
  def map_vectors!(&block)
301
704
  return to_enum(:map_vectors!) unless block_given?
302
705
 
303
- @data.map!(&block)
706
+ vectors.dup.each do |n|
707
+ v = yield self[n]
708
+ v.is_a?(Daru::Vector) or raise TypeError, "Must return a Daru::Vector not #{v.class}"
709
+ self[n] = v
710
+ end
711
+
304
712
  self
305
713
  end
306
714
 
@@ -308,37 +716,114 @@ module Daru
308
716
  def map_vectors_with_index(&block)
309
717
  return to_enum(:map_vectors_with_index) unless block_given?
310
718
 
311
- df = self.dup
312
- df.each_vector_with_index do |vector, name|
313
- df[name, :vector] = yield(vector, name)
719
+ dt = []
720
+ each_vector_with_index do |vector, name|
721
+ dt << yield(vector, name)
314
722
  end
315
723
 
316
- df
724
+ dt
317
725
  end
318
726
 
319
727
  # Map each row
320
728
  def map_rows(&block)
321
729
  return to_enum(:map_rows) unless block_given?
322
730
 
323
- df = self.dup
324
- df.each_row_with_index do |row, index|
325
- df[index, :row] = yield(row)
731
+ dt = []
732
+ each_row do |row|
733
+ dt << yield(row)
326
734
  end
327
735
 
328
- df
736
+ dt
329
737
  end
330
738
 
331
739
  def map_rows_with_index(&block)
332
740
  return to_enum(:map_rows_with_index) unless block_given?
333
741
 
334
- df = self.dup
335
- df.each_row_with_index do |row, index|
336
- df[index, :row] = yield(row, index)
742
+ dt = []
743
+ each_row_with_index do |row, index|
744
+ dt << yield(row, index)
337
745
  end
338
746
 
339
- df
747
+ dt
340
748
  end
341
749
 
750
+ def map_rows!(&block)
751
+ return to_enum(:map_rows!) unless block_given?
752
+
753
+ index.dup.each do |i|
754
+ r = yield self.row[i]
755
+ r.is_a?(Daru::Vector) or raise TypeError, "Returned object must be Daru::Vector not #{r.class}"
756
+ self.row[i] = r
757
+ end
758
+
759
+ self
760
+ end
761
+
762
+ # Retrieves a Daru::Vector, based on the result of calculation
763
+ # performed on each row.
764
+ def collect_rows &block
765
+ return to_enum(:collect_rows) unless block_given?
766
+
767
+ data = []
768
+ each_row do |row|
769
+ data.push yield(row)
770
+ end
771
+
772
+ Daru::Vector.new(data, index: @index)
773
+ end
774
+
775
+ def collect_row_with_index &block
776
+ return to_enum(:collect_row_with_index) unless block_given?
777
+
778
+ data = []
779
+ each_row_with_index do |row, i|
780
+ data.push yield(row, i)
781
+ end
782
+
783
+ Daru::Vector.new(data, index: @index)
784
+ end
785
+
786
+ # Retrieves a Daru::Vector, based on the result of calculation
787
+ # performed on each vector.
788
+ def collect_vectors &block
789
+ return to_enum(:collect_vectors) unless block_given?
790
+
791
+ data = []
792
+ each_vector do |vec|
793
+ data.push yield(vec)
794
+ end
795
+
796
+ Daru::Vector.new(data, index: @vectors)
797
+ end
798
+
799
+ def collect_vector_with_index &block
800
+ return to_enum(:collect_vector_with_index) unless block_given?
801
+
802
+ data = []
803
+ each_vector_with_index do |vec, i|
804
+ data.push yield(vec, i)
805
+ end
806
+
807
+ Daru::Vector.new(data, index: @vectors)
808
+ end
809
+
810
+ # Generate a matrix, based on vector names of the DataFrame.
811
+ #
812
+ # @return {::Matrix}
813
+ def collect_matrix
814
+ return to_enum(:collect_matrix) unless block_given?
815
+
816
+ vecs = vectors.to_a
817
+ rows = vecs.collect { |row|
818
+ vecs.collect { |col|
819
+ yield row,col
820
+ }
821
+ }
822
+
823
+ Matrix.rows(rows)
824
+ end
825
+
826
+
342
827
  # Delete a vector
343
828
  def delete_vector vector
344
829
  if @vectors.include? vector
@@ -367,6 +852,20 @@ module Daru
367
852
  set_size
368
853
  end
369
854
 
855
+ # Creates a DataFrame with the random data, of n size.
856
+ # If n not given, uses original number of rows.
857
+ #
858
+ # @return {Daru::DataFrame}
859
+ def bootstrap(n=nil)
860
+ n ||= nrows
861
+ ds_boot = Daru::DataFrame.new({}, order: @vectors)
862
+ n.times do
863
+ ds_boot.add_row(row[rand(n)])
864
+ end
865
+ ds_boot.update
866
+ ds_boot
867
+ end
868
+
370
869
  def keep_row_if &block
371
870
  deletion = []
372
871
 
@@ -388,6 +887,16 @@ module Daru
388
887
  end
389
888
  end
390
889
 
890
# Creates a new Daru::Vector containing the values of vector +vec+ taken
# from those rows for which the block returns a truthy value.
def filter_vector vec
  selected = []
  each_row { |row| selected << row[vec] if yield(row) }

  Daru::Vector.new(selected)
end
899
+
391
900
  # Iterates over each row and retains it in a new DataFrame if the block returns
392
901
  # true for that row.
393
902
  def filter_rows &block
@@ -419,18 +928,160 @@ module Daru
419
928
  df
420
929
  end
421
930
 
931
# Test each row with one or more tests and collect a report of the failures.
#
# Each test is indexed with [0], [1] and [2], i.e. it is a three element
# Array of the form
#
#   [message, names, callable]
#
# * message  - text included in the report when the test fails.
# * names    - collection of vector names whose values in the failing row
#   are appended to the report as "name=value" (may be empty).
# * callable - object responding to #call, for example
#   *Proc.new {|row| row[:age] > 0}*. The row fails when it returns a
#   falsy value.
#
# If the first argument is a Symbol it is removed from the tests and used as
# the name of the vector that identifies a row in the report; otherwise the
# first vector of the DataFrame is used.
#
# The function returns an Array of Strings with all errors, each of the form
# "<row number> [<id value>]: <message> (<name>=<value>, ...)". Row numbers
# start at 1.
def verify(*tests)
  id = tests[0].is_a?(Symbol) ? tests.shift : @vectors.first

  failures = []
  row_number = 0
  each(:row) do |row|
    row_number += 1
    tests.each do |test|
      message, names, check = test[0], test[1], test[2]
      next if check.call(row)

      values = ""
      if names.size > 0
        values = " (" + names.collect { |k| "#{k}=#{row[k]}" }.join(", ") + ")"
      end
      failures.push("#{row_number} [#{row[id]}]: #{message}#{values}")
    end
  end
  failures
end
959
+
960
# DSL for evaluating the block in the context of each row (via
# instance_eval) and returning a Daru::Vector, indexed by @index, made of
# the value each run of the block returns.
#
# == Usage
#
#   a1 = Daru::Vector.new([1, 2, 3, 4, 5, 6, 7])
#   a2 = Daru::Vector.new([10, 20, 30, 40, 50, 60, 70])
#   a3 = Daru::Vector.new([100, 200, 300, 400, 500, 600, 700])
#   ds = Daru::DataFrame.new({ :a => a1, :b => a2, :c => a3 })
#   total = ds.vector_by_calculation { a + b + c }
#   # <Daru::Vector:82314050 @name = nil @size = 7 >
#   #   nil
#   # 0 111
#   # 1 222
#   # 2 333
#   # 3 444
#   # 4 555
#   # 5 666
#   # 6 777
def vector_by_calculation &block
  results = []
  each_row { |current_row| results << current_row.instance_eval(&block) }

  Daru::Vector.new results, index: @index
end
987
+
988
# Returns the result of a calculation evaluated in the context of this
# DataFrame (via instance_eval), given either as a String or as a block.
# When both are given the block wins and +text+ is ignored.
#
# The calculation will be eval'ed, so you can put any variable
# or expression valid on ruby.
#
# For example:
#   a = Daru::Vector.new [1,2]
#   b = Daru::Vector.new [3,4]
#   ds = Daru::DataFrame.new({:a => a,:b => b})
#   ds.compute("a+b")
#   => Vector [4,6]
#
# @param text [String, nil] Ruby expression to evaluate; may be omitted when
#   a block is supplied.
# @raise [ArgumentError] if neither a String nor a block is given.
def compute text=nil, &block
  return instance_eval(&block) if block_given?

  raise ArgumentError, "compute requires a String expression or a block" if text.nil?

  # NOTE(security): +text+ is executed as arbitrary Ruby code with full
  # access to this object. Never pass strings that come from untrusted input.
  instance_eval(text)
end
1004
+
1005
# Return a vector with the number of missing values in each row.
#
# Every row has its missing_values set to the supplied list before its
# missing positions are counted. The resulting Vector is indexed by @index
# and named "<name of this DataFrame>_missing_rows".
#
# == Arguments
#
# * +missing_values+ - An Array of the values that should be
#   treated as 'missing'. The default missing value is *nil*.
def missing_values_rows missing_values=[nil]
  counts = []
  each_row do |row|
    row.missing_values = missing_values
    counts.push row.missing_positions.size
  end

  Daru::Vector.new(counts, index: @index, name: "#{@name}_missing_rows".to_sym)
end

# TODO: remove next version
alias :vector_missing_values :missing_values_rows
1023
+
1024
# Returns true when at least one of the objects stored in @data reports
# missing data through its own #has_missing_data?, false otherwise.
def has_missing_data?
  @data.each { |vector| return true if vector.has_missing_data? }
  false
end

alias :flawed? :has_missing_data?
1029
+
1030
# Return a nested hash keyed, level by level, by the values each row holds
# in the given vectors. Without a block, the innermost level holds an Array
# of hashes with the remaining values of each row (the row's hash minus the
# tree keys). If a block is provided, its result is stored instead; it is
# called with +row+ of the dataset, +current+ (the last hash on the
# hierarchy) and +name+ (the key about to be set).
#
# The keys may be passed either as separate arguments or as a single Array.
def nest *tree_keys, &block
  tree_keys = tree_keys[0] if tree_keys[0].is_a? Array
  result = {}

  each_row do |row|
    # Walk (and create on demand) one hash level per key except the last.
    node = tree_keys[0, tree_keys.size - 1].inject(result) do |level, key|
      level[row[key]] ||= {}
    end

    name = row[tree_keys.last]
    if block
      node[name] = block.call(row, node, name)
    else
      node[name] ||= []
      node[name].push(row.to_hash.delete_if { |key, _value| tree_keys.include? key })
    end
  end

  result
end
1057
+
1058
# For every row, adds up the length of the string form (to_s) of the values
# found in the vectors +vecs+; nil values count as 0. The per-row totals are
# collected through #collect_row_with_index.
#
# When +vecs+ is not given, all vectors of the DataFrame are used.
def vector_count_characters vecs=nil
  vecs ||= @vectors.to_a

  collect_row_with_index do |row, _i|
    vecs.inject(0) do |total, vec|
      cell = row[vec]
      total + (cell.nil? ? 0 : cell.to_s.size)
    end
  end
end
1067
+
1068
# Splits vector +name+ with split_by_separator(sep) and stores every
# resulting key/value pair in this DataFrame under the name
# "<name><join><key>" (as a Symbol).
def add_vectors_by_split(name,join='-',sep=Daru::SPLIT_TOKEN)
  parts = self[name].split_by_separator(sep)
  parts.each do |suffix, vector|
    new_name = (name.to_s + join + suffix.to_s).to_sym
    self[new_name] = vector
  end
end
1072
+
422
1073
  # Return the number of rows and columns of the DataFrame in an Array.
423
1074
  def shape
424
1075
  [@index.size, @vectors.size]
425
1076
  end
426
1077
 
427
1078
  # The number of rows
428
- def rows
1079
+ def nrows
429
1080
  shape[0]
430
1081
  end
431
1082
 
432
1083
  # The number of vectors
433
- def cols
1084
+ def ncols
434
1085
  shape[1]
435
1086
  end
436
1087
 
@@ -439,11 +1090,37 @@ module Daru
439
1090
  !!@vectors[*vector]
440
1091
  end
441
1092
 
1093
# Returns true if the block is truthy for at least one element along +axis+.
#
# * :vector / :column - the block is handed to @data.any?.
# * :row              - the block is called with each row in turn.
#
# Raises ArgumentError for any other axis.
def any? axis=:vector, &block
  case axis
  when :vector, :column
    @data.any?(&block)
  when :row
    each_row { |row| return true if yield(row) }
    false
  else
    raise ArgumentError, "Unidentified axis #{axis}"
  end
end
1105
+
1106
# Returns true if the block is truthy for every element along +axis+.
#
# * :vector / :column - the block is handed to @data.all?.
# * :row              - the block is called with each row in turn.
#
# Raises ArgumentError for any other axis.
def all? axis=:vector, &block
  case axis
  when :vector, :column
    @data.all?(&block)
  when :row
    each_row { |row| return false unless yield(row) }
    true
  else
    raise ArgumentError, "Unidentified axis #{axis}"
  end
end
1118
+
442
1119
  # The first ten elements of the DataFrame
443
1120
  #
444
1121
  # @param [Fixnum] quantity (10) The number of elements to display from the top.
445
1122
  def head quantity=10
446
- self[0..quantity, :row]
1123
+ self[0..(quantity-1), :row]
447
1124
  end
448
1125
 
449
1126
  # The last ten elements of the DataFrame
@@ -453,7 +1130,59 @@ module Daru
453
1130
  self[(@size - quantity)..(@size-1), :row]
454
1131
  end
455
1132
 
456
- # Group elements by vector to perform operations on them.
1133
# Returns a vector with the sum of all vectors specified in the argument.
# If the vecs parameter is not given, all numeric vectors (as reported by
# #numeric_vectors) are summed.
#
# The sum starts from a Vector of zeros of length @size indexed by @index.
def vector_sum vecs=nil
  vecs ||= numeric_vectors
  total = Daru::Vector.new [0]*@size, index: @index, name: @name, dtype: @dtype

  vecs.each { |vec_name| total += self[vec_name] }

  total
end
1145
+
1146
# Calculate mean of the rows of the dataframe.
#
# == Arguments
#
# * +max_missing+ - The maximum number of missing positions a row may have
#   for its mean to be calculated. Rows with more missing positions get
#   nil instead of a mean. Defaults to 0.
def vector_mean max_missing=0
  mean_vec = Daru::Vector.new [0]*@size, index: @index, name: "mean_#{@name}"

  each_row_with_index do |row, i|
    if row.missing_positions.size > max_missing
      mean_vec[i] = nil
    else
      mean_vec[i] = row.mean
    end
  end

  mean_vec
end
1161
+
1162
+ # Group elements by vector to perform operations on them. Returns a
1163
+ # Daru::Core::GroupBy object. See the Daru::Core::GroupBy docs for a detailed
1164
+ # list of possible operations.
1165
+ #
1166
+ # == Arguments
1167
+ #
1168
+ # * vectors - An Array containing names of vectors to group by.
1169
+ #
1170
+ # == Usage
1171
+ #
1172
+ # df = Daru::DataFrame.new({
1173
+ # a: %w{foo bar foo bar foo bar foo foo},
1174
+ # b: %w{one one two three two two one three},
1175
+ # c: [1 ,2 ,3 ,1 ,3 ,6 ,3 ,8],
1176
+ # d: [11 ,22 ,33 ,44 ,55 ,66 ,77 ,88]
1177
+ # })
1178
+ # df.group_by([:a,:b,:c]).groups
1179
+ # #=> {["bar", "one", 2]=>[1],
1180
+ # # ["bar", "three", 1]=>[3],
1181
+ # # ["bar", "two", 6]=>[5],
1182
+ # # ["foo", "one", 1]=>[0],
1183
+ # # ["foo", "one", 3]=>[6],
1184
+ # # ["foo", "three", 8]=>[7],
1185
+ # # ["foo", "two", 3]=>[2, 4]}
457
1186
  def group_by vectors
458
1187
  vectors = [vectors] if vectors.is_a?(Symbol)
459
1188
  vectors.each { |v| raise(ArgumentError, "Vector #{v} does not exist") unless
@@ -462,6 +1191,13 @@ module Daru
462
1191
  Daru::Core::GroupBy.new(self, vectors)
463
1192
  end
464
1193
 
1194
# Replaces @vectors with a new Daru::Index built from +new_vectors+
# (converted to Symbols) and, as second argument, the values the current
# @vectors returns for each of those names. Destructive.
#
# Raises ArgumentError unless +new_vectors+ has exactly as many entries as
# the DataFrame has vectors.
def reindex_vectors! new_vectors
  unless @vectors.size == new_vectors.size
    raise ArgumentError, "Number of vectors passed into function (#{new_vectors.size}) should equal that present in the DataFrame (#{@vectors.size})"
  end

  names = new_vectors.map(&:to_sym)
  positions = new_vectors.map { |e| @vectors[e] }
  @vectors = Daru::Index.new names, positions
end
1200
+
465
1201
  # Change the index of the DataFrame and its underlying vectors. Destructive.
466
1202
  #
467
1203
  # @param [Symbol, Array] new_index Specify an Array if
@@ -481,19 +1217,58 @@ module Daru
481
1217
  self.dup.reindex! new_index
482
1218
  end
483
1219
 
484
- # Return the names of all the numeric vectors. Will include vectors with nils
1220
+ # Return the indexes of all the numeric vectors. Will include vectors with nils
485
1221
  # alongwith numbers.
486
1222
  def numeric_vectors
487
1223
  numerics = []
488
1224
 
489
- each_vector do |vec|
1225
+ each_vector_with_index do |vec, i|
1226
+ numerics << i if(vec.type == :numeric)
1227
+ end
1228
+ numerics
1229
+ end
1230
+
1231
+ def numeric_vector_names
1232
+ numerics = []
1233
+
1234
+ each_vector do |vec, i|
490
1235
  numerics << vec.name if(vec.type == :numeric)
491
1236
  end
492
1237
  numerics
493
1238
  end
494
1239
 
1240
# Return a DataFrame of only the numerical Vectors. If clone: false
# is specified as option, only a *view* of the Vectors will be
# returned. Defaults to clone: true.
#
# The new DataFrame keeps @index and is ordered by an index of the same
# kind (MultiIndex or Index) as @vectors.
def only_numerics opts={}
  cln = opts[:clone] != false
  nv = numeric_vectors
  arry = nv.map { |v| self[v] }

  order =
    if @vectors.is_a?(MultiIndex)
      MultiIndex.new(nv)
    else
      Index.new(nv)
    end

  Daru::DataFrame.new(arry, clone: cln, order: order, index: @index)
end
1254
+
1255
# Generate a summary of this DataFrame with ReportBuilder: the DataFrame is
# added to a title-less ReportBuilder and +method+ (default :to_text) is
# then sent to the result to render it.
def summary(method = :to_text)
  builder = ReportBuilder.new(no_title: true)
  builder.add(self).send(method)
end
1259
+
1260
# Fills the builder +b+ with a section named after this DataFrame that
# contains the number of rows followed by, for every vector, a label line
# and the parsed vector itself.
def report_building(b) # :nodoc: #
  b.section(name: @name) do |section|
    section.text "Number of rows: #{nrows}"
    @vectors.each do |vec_name|
      section.text "Element:[#{vec_name}]"
      section.parse_element(self[vec_name])
    end
  end
end
1269
+
495
1270
  # Sorts a dataframe (ascending/descending)according to the given sequence of
496
- # vectors, using the attributes provided in the blocks. Works for 2 LEVELS ONLY.
1271
+ # vectors, using the attributes provided in the blocks.
497
1272
  #
498
1273
  # @param order [Array] The order of vector names in which the DataFrame
499
1274
  # should be sorted.
@@ -583,7 +1358,7 @@ module Daru
583
1358
  elsif opts[:values].is_a?(Array)
584
1359
  opts[:values]
585
1360
  else # nil
586
- (@vectors.to_a - (index | vectors)) & numeric_vectors
1361
+ (@vectors.to_a - (index | vectors)) & numeric_vector_names
587
1362
  end
588
1363
 
589
1364
  raise IndexError, "No numeric vectors to aggregate" if values.empty?
@@ -634,6 +1409,195 @@ module Daru
634
1409
  end
635
1410
  end
636
1411
 
1412
# Merge vectors from two DataFrames. In case of name collision,
# the vectors names are changed to x_1, x_2 ....
#
# Rows are combined position by position, so both DataFrames must have the
# same number of rows; a RuntimeError is raised otherwise.
#
# @return {Daru::DataFrame}
def merge other_df
  unless nrows == other_df.nrows
    raise "Number of rows must be equal in this: #{nrows} and other: #{other_df.nrows}"
  end

  combined_names = @vectors.to_a + other_df.vectors.to_a
  new_fields = combined_names.recode_repeated.map(&:to_sym)
  df_new = DataFrame.new({}, order: new_fields)

  nrows.times do |i|
    df_new.add_row(self.row[i].to_a + other_df.row[i].to_a)
  end

  df_new.update
  df_new
end
1432
+
1433
# Join 2 DataFrames by given fields
# type is one of :left and :inner, default is :left
#
# Rows of +other_ds+ are grouped by their values in +fields_2+; every row of
# this DataFrame is then matched on its values in +fields_1+. Each match
# yields one output row. A row without a match is kept, with nil for the
# added fields, only when type is :left.
#
# Untested! Use at your own risk.
#
# @return {Daru::DataFrame}
def join(other_ds,fields_1=[],fields_2=[],type=:left)
  fields_new = other_ds.vectors.to_a - fields_2
  fields = self.vectors.to_a + fields_new

  # key values in other_ds => list of hashes holding the fields to append
  lookup = {}
  other_ds.each_row do |other_row|
    key = other_row.to_hash.select { |k, _v| fields_2.include?(k) }.values
    value = other_row.to_hash.select { |k, _v| fields_new.include?(k) }
    (lookup[key] ||= []) << value
  end

  new_ds = DataFrame.new({}, order: fields)

  self.each_row do |row|
    key = row.to_hash.select { |k, _v| fields_1.include?(k) }.values
    new_case = row.to_hash
    matches = lookup[key]

    if matches
      matches.each do |new_values|
        new_ds.add_row(Daru::Vector.new(new_case.merge(new_values)))
      end
    elsif type == :left
      fields_new.each { |field| new_case[field] = nil }
      new_ds.add_row(Daru::Vector.new(new_case))
    end
  end

  new_ds
end
1475
+
1476
+
1477
# Creates a new dataset for one to many relations
# on a dataset, based on pattern of field names.
#
# for example, you have a survey for number of children
# with this structure:
#   id, name, child_name_1, child_age_1, child_name_2, child_age_2
# with
#   ds.one_to_many([:id], "child_%v_%n")
# the fields of the first parameter will be copied verbatim
# to the new dataset, and fields which respond to the second
# pattern will be added, one case for each different %n.
#
# In the pattern, %v stands for the variable name and %n for the (numeric)
# repetition. Every generated case also receives a :_col_id field holding
# the value of %n it was built from. A repetition whose variables are all
# nil produces no case.
#
# == Arguments
#
# * parent_fields - Array of vector names copied to every generated case.
#   The Array passed in is not modified.
# * pattern       - String containing %v and %n.
#
# == Usage
#   cases=[
#     ['1','george','red',10,'blue',20,nil,nil],
#     ['2','fred','green',15,'orange',30,'white',20],
#     ['3','alfred',nil,nil,nil,nil,nil,nil]
#   ]
#   ds=Daru::DataFrame.rows(cases, order: [:id, :name, :car_color1, :car_value1, :car_color2, :car_value2, :car_color3, :car_value3])
#   ds.one_to_many([:id],'car_%v%n').to_matrix
#   => Matrix[
#      ["red", "1", 10],
#      ["blue", "1", 20],
#      ["green", "2", 15],
#      ["orange", "2", 30],
#      ["white", "2", 20]
#      ]
#
def one_to_many(parent_fields, pattern)
  re = Regexp.new pattern.gsub("%v","(.+?)").gsub("%n","(\\d+?)")
  # Copy before appending :_col_id so the caller's Array is left untouched.
  ds_vars = parent_fields.dup
  vars = []
  max_n = 0
  h = parent_fields.inject({}) { |a, v|
    a[v] = Daru::Vector.new([])
    a
  }
  # Adding _col_id
  h[:_col_id] = Daru::Vector.new([])
  ds_vars.push(:_col_id)

  # Discover the variable names (%v) and the highest repetition (%n).
  @vectors.each do |f|
    match = re.match(f.to_s)
    next unless match

    var = match[1]
    n = match[2].to_i
    unless vars.include? var
      vars.push(var)
      h[var] = Daru::Vector.new([])
    end
    max_n = n if max_n < n
  end
  ds = DataFrame.new(h, order: ds_vars+vars)

  each_row do |row|
    row_out = {}
    parent_fields.each do |f|
      row_out[f] = row[f]
    end

    max_n.times do |n1|
      n = n1+1
      any_data = false
      vars.each do |v|
        data = row[pattern.gsub("%v",v.to_s).gsub("%n",n.to_s).to_sym]
        row_out[v] = data
        any_data = true if !data.nil?
      end

      # Only repetitions that carry at least one value become a case.
      if any_data
        row_out[:_col_id] = n
        ds.add_row(row_out)
      end
    end
  end
  ds.update
  ds
end
1553
+
1554
# Splits vector +name_+ with split_by_separator(sep). Each resulting vector
# is renamed to "<name_>:<key>" and stored in this DataFrame under
# "<name_><join><counter>", where the counter starts at 1 and follows the
# order in which the split yields its pairs.
def add_vectors_by_split_recode(name_, join='-', sep=Daru::SPLIT_TOKEN)
  parts = self[name_].split_by_separator(sep)
  counter = 1
  parts.each do |key, vector|
    new_field = name_.to_s + join + counter.to_s
    vector.rename name_.to_s + ":" + key.to_s
    self[new_field.to_sym] = vector
    counter += 1
  end
end
1564
+
1565
# Create a SQL CREATE TABLE statement based on this Dataset. Each vector
# becomes a column whose type is the vector's db_type.
#
# == Arguments
#
# * table - String specifying the name of the table that will be created in SQL.
# * charset - Character set. Default is "UTF8".
#
# == Usage
#
#   ds = Daru::DataFrame.new({
#     :id   => Daru::Vector.new([1,2,3,4,5]),
#     :name => Daru::Vector.new(%w{Alex Peter Susan Mary John})
#   })
#   ds.create_sql('names')
#    ==>"CREATE TABLE names (id INTEGER,\n name VARCHAR (255)) CHARACTER SET=UTF8;"
#
def create_sql(table,charset="UTF8")
  columns = self.vectors.to_a.map do |f|
    "#{f} " + self[f].db_type
  end

  "CREATE TABLE #{table} (#{columns.join(",\n ")}) CHARACTER SET=#{charset};"
end
1590
+
1591
# Convert all numeric vectors to GSL::Matrix. The vectors reported by
# #numeric_vectors are turned into Arrays and transposed, so that each
# argument handed to GSL::Matrix.alloc is one row of the DataFrame.
def to_gsl
  columns = numeric_vectors.map { |n| self[n].to_a }

  GSL::Matrix.alloc(*columns.transpose)
end
1600
+
637
1601
  # Convert all vectors of type *:numeric* into a Matrix.
638
1602
  def to_matrix
639
1603
  numerics_as_arrays = []
@@ -644,22 +1608,27 @@ module Daru
644
1608
  Matrix.columns numerics_as_arrays
645
1609
  end
646
1610
 
1611
# Return a Nyaplot::DataFrame from the data of this DataFrame, built from
# the first element of #to_a.
def to_nyaplotdf
  rows = to_a[0]
  Nyaplot::DataFrame.new(rows)
end
1615
+
647
1616
  # Convert all vectors of type *:numeric* and not containing nils into an NMatrix.
648
1617
  def to_nmatrix
649
1618
  numerics_as_arrays = []
650
1619
  each_vector do |vector|
651
1620
  numerics_as_arrays << vector.to_a if(vector.type == :numeric and
652
- vector.nil_positions.size == 0)
1621
+ vector.missing_positions.size == 0)
653
1622
  end
654
1623
 
655
1624
  numerics_as_arrays.transpose.to_nm
656
1625
  end
657
1626
 
658
1627
  # Converts the DataFrame into an array of hashes where key is vector name
659
- # and value is the corresponding element. The 0th index of the array contains
660
- # the array of hashes while the 1th index contains the indexes of each row
661
- # of the dataframe. Each element in the index array corresponds to its row
662
- # in the array of hashes, which has the same index.
1628
+ # and value is the corresponding element. The 0th index of the array contains
1629
+ # the array of hashes while the 1st index contains the indexes of each row
1630
+ # of the dataframe. Each element in the index array corresponds to its row
1631
+ # in the array of hashes, which has the same index.
663
1632
  def to_a
664
1633
  arry = [[],[]]
665
1634
  self.each_row do |row|
@@ -678,9 +1647,26 @@ module Daru
678
1647
  end
679
1648
  end
680
1649
 
1650
# Converts DataFrame to a hash with keys as vector names and values as
# the corresponding vectors (the entry of @data at the same position).
def to_hash
  result = {}
  @vectors.each_with_index { |name, position| result[name] = @data[position] }

  result
end
1660
+
681
1661
  # Convert to html for IRuby.
682
1662
  def to_html threshold=30
683
- html = '<table><tr><th></th>'
1663
+ html = "<table>" +
1664
+ "<tr>" +
1665
+ "<th colspan=\"#{@vectors.size+1}\">" +
1666
+ "Daru::DataFrame:#{self.object_id} " + " rows: #{nrows} " + " cols: #{ncols}"
1667
+ "</th>" +
1668
+ "</tr>"
1669
+ html +='<tr><th></th>'
684
1670
  @vectors.each { |vector| html += '<th>' + vector.to_s + '</th>' }
685
1671
  html += '</tr>'
686
1672
 
@@ -697,6 +1683,15 @@ module Daru
697
1683
  html += '<tr>'
698
1684
  (@vectors + 1).size.times { html += '<td>...</td>' }
699
1685
  html += '</tr>'
1686
+
1687
+ last_index = @index.to_a.last
1688
+ last_row = self.row[last_index]
1689
+ html += '<tr>'
1690
+ html += "<td>" + last_index.to_s + "</td>"
1691
+ (0..(ncols - 1)).to_a.each do |i|
1692
+ html += '<td>' + last_row[i].to_s + '</td>'
1693
+ end
1694
+ html += '</tr>'
700
1695
  break
701
1696
  end
702
1697
  end
@@ -709,6 +1704,87 @@ module Daru
709
1704
  to_html
710
1705
  end
711
1706
 
1707
# Method for updating the metadata (i.e. missing value positions) of the
# vectors after assignment/deletion etc. are complete. This is provided so
# that time is not wasted in creating the metadata for the vectors each time
# assignment/deletion of elements is done. Updating data this way is called
# lazy loading. To set or unset lazy loading, see the .lazy_update= method.
#
# Does nothing unless Daru.lazy_update is set.
def update
  return unless Daru.lazy_update

  @data.each { |vector| vector.update }
end
1715
+
1716
# Sets the name of this DataFrame.
#
# Numeric names are stored unchanged and nil clears the name; in both cases
# nil is returned. Any other value is converted with to_sym, stored, and the
# resulting Symbol is returned.
def rename new_name
  # nil has no to_sym; a nameless DataFrame is a valid state, so accept it
  # instead of failing with NoMethodError.
  if new_name.nil? || new_name.is_a?(Numeric)
    @name = new_name
    return
  end

  @name = new_name.to_sym
end
1723
+
1724
# Write this DataFrame to a CSV file. The work is delegated to
# Daru::IO.dataframe_write_csv.
#
# == Arguments
#
# * filename - Path of CSV file where the DataFrame is to be saved.
#
# == Options
#
# * convert_comma - If set to *true*, will convert any commas in any
#   of the data to full stops ('.').
# All the options accepted by CSV.read() can also be passed into this
# function.
def write_csv filename, opts={}
  Daru::IO.dataframe_write_csv(self, filename, opts)
end
1739
+
1740
# Write this DataFrame to an Excel spreadsheet. The work is delegated to
# Daru::IO.dataframe_write_excel, which also receives +opts+.
#
# == Arguments
#
# * filename - The path of the file where the DataFrame should be written.
def write_excel filename, opts={}
  Daru::IO.dataframe_write_excel(self, filename, opts)
end
1748
+
1749
# Insert each case of the Dataset on the selected table. The work is
# delegated to Daru::IO.dataframe_write_sql.
#
# == Arguments
#
# * dbh - DBI database connection object.
# * table - Passed on unchanged; the usage below supplies a table name.
#   NOTE(review): this argument was previously documented as a "Query
#   string" - confirm against Daru::IO.dataframe_write_sql.
#
# == Usage
#
#   ds = Daru::DataFrame.new({:id=>Daru::Vector.new([1,2,3]), :name=>Daru::Vector.new(["a","b","c"])})
#   dbh = DBI.connect("DBI:Mysql:database:localhost", "user", "password")
#   ds.write_sql(dbh,"test")
def write_sql dbh, table
  Daru::IO.dataframe_write_sql(self, dbh, table)
end
1764
+
1765
+
1766
# Use marshalling to save dataframe to a file. Delegates to Daru::IO.save.
def save filename
  Daru::IO.save(self, filename)
end
1770
+
1771
+ def _dump depth
1772
+ Marshal.dump({
1773
+ data: @data,
1774
+ index: @index.to_a,
1775
+ order: @vectors.to_a,
1776
+ name: @name
1777
+ })
1778
+ end
1779
+
1780
# Marshal hook: rebuilds a DataFrame from the Hash (keys :data, :index,
# :order and :name) contained in the marshalled String +data+.
#
# NOTE: Marshal.load can instantiate arbitrary objects; only load data from
# trusted sources.
def self._load data
  attrs = Marshal.load data
  options = { index: attrs[:index], order: attrs[:order], name: attrs[:name] }

  Daru::DataFrame.new(attrs[:data], **options)
end
1787
+
712
1788
  # Change dtypes of vectors by supplying a hash of :vector_name => :new_dtype
713
1789
  #
714
1790
  # == Usage
@@ -733,9 +1809,9 @@ module Daru
733
1809
  # Pretty print in a nice table format for the command line (irb/pry/iruby)
734
1810
  def inspect spacing=10, threshold=15
735
1811
  longest = [@name.to_s.size,
736
- @vectors.map(&:to_s).map(&:size).max,
737
- @index .map(&:to_s).map(&:size).max,
738
- @data .map{ |v| v.map(&:to_s).map(&:size).max }.max].max
1812
+ (@vectors.map(&:to_s).map(&:size).max || 0),
1813
+ (@index .map(&:to_s).map(&:size).max || 0),
1814
+ (@data .map{ |v| v.map(&:to_s).map(&:size).max}.max || 0)].max
739
1815
 
740
1816
  name = @name || 'nil'
741
1817
  content = ""
@@ -901,6 +1977,8 @@ module Daru
901
1977
 
902
1978
  def access_vector *names
903
1979
  location = names[0]
1980
+
1981
+ return dup(@vectors[location]) if location.is_a?(Range)
904
1982
  if @vectors.is_a?(MultiIndex)
905
1983
  pos = vectors_index_for names
906
1984
 
@@ -996,41 +2074,68 @@ module Daru
996
2074
  end
997
2075
 
998
2076
  def insert_or_modify_vector name, vector
999
- @vectors = reassign_index_as(@vectors + name)
1000
- v = nil
2077
+ if vectors.is_a?(Index)
2078
+ name = name[0]
2079
+ end
1001
2080
 
1002
- if vector.is_a?(Daru::Vector)
1003
- v = Daru::Vector.new [], name: set_name(name), index: @index
1004
- @index.each do |idx|
1005
- v[idx] = vector[idx]
2081
+ @vectors = @vectors + name if !@vectors.include?(name)
2082
+ v = nil
2083
+
2084
+ if @index.empty?
2085
+ v = vector.is_a?(Daru::Vector) ? vector : Daru::Vector.new(vector.to_a)
2086
+ @index = v.index
2087
+ @data[@vectors[name]] = v
2088
+ set_size
2089
+
2090
+ @data.map! do |v|
2091
+ if v.size == 0
2092
+ Daru::Vector.new([nil]*@size, name: set_name(name), index: @index)
2093
+ else
2094
+ v
2095
+ end
1006
2096
  end
1007
2097
  else
1008
- raise Exception, "Specified vector of length #{vector.size} cannot be inserted in DataFrame of size #{@size}" if
1009
- @size != vector.size
2098
+ if vector.is_a?(Daru::Vector)
2099
+ v = Daru::Vector.new [], name: set_name(name), index: @index
2100
+ @index.each do |idx|
2101
+ v[idx] = vector[idx]
2102
+ end
2103
+ else
2104
+ raise Exception, "Specified vector of length #{vector.size} cannot be inserted in DataFrame of size #{@size}" if
2105
+ @size != vector.size
1010
2106
 
1011
- v = Daru::Vector.new(vector, name: set_name(name), index: @index)
1012
- end
2107
+ v = Daru::Vector.new(vector, name: set_name(name), index: @index)
2108
+ end
1013
2109
 
1014
- @data[@vectors[name]] = v
2110
+ @data[@vectors[name]] = v
2111
+ end
1015
2112
  end
1016
2113
 
1017
- def insert_or_modify_row name, vector
1018
- if @index.include? name
1019
- v = vector.dv(name, @vectors, @dtype)
1020
-
1021
- @vectors.each do |vector|
1022
- @data[@vectors[vector]][name] = v[vector]
1023
- end
2114
+ def insert_or_modify_row name, vector
2115
+ if index.is_a?(MultiIndex)
2116
+ # TODO
1024
2117
  else
1025
- @index = reassign_index_as(@index + name)
1026
- v = Daru::Vector.new(vector, name: set_name(name), index: @vectors)
2118
+ name = name[0]
2119
+ v =
2120
+ if vector.is_a?(Daru::Vector)
2121
+ vector
2122
+ else
2123
+ Daru::Vector.new(vector, name: set_name(name), index: @vectors)
2124
+ end
1027
2125
 
1028
- @vectors.each do |vector|
1029
- @data[@vectors[vector]].concat v[vector], name
2126
+ if @index.include? name
2127
+ @vectors.each do |vector|
2128
+ @data[@vectors[vector]][name] = v[vector]
2129
+ end
2130
+ else
2131
+ @index = reassign_index_as(@index + name)
2132
+ @vectors.each do |vector|
2133
+ @data[@vectors[vector]].concat v[vector], name
2134
+ end
1030
2135
  end
1031
- end
1032
2136
 
1033
- set_size
2137
+ set_size
2138
+ end
1034
2139
  end
1035
2140
 
1036
2141
  def create_empty_vectors
@@ -1081,18 +2186,22 @@ module Daru
1081
2186
  def create_vectors_index_with vectors, source
1082
2187
  vectors = source.keys.sort if vectors.nil?
1083
2188
 
2189
+ @vectors =
1084
2190
  unless vectors.is_a?(Index) or vectors.is_a?(MultiIndex)
1085
- @vectors = Daru::Index.new (vectors + (source.keys - vectors)).uniq.map(&:to_sym)
2191
+ Daru::Index.new((vectors + (source.keys - vectors))
2192
+ .uniq
2193
+ .map { |e| e.respond_to?(:to_sym) ? e.to_sym : e }
2194
+ )
1086
2195
  else
1087
- @vectors = vectors
2196
+ vectors
1088
2197
  end
1089
2198
  end
1090
2199
 
1091
2200
  def all_vectors_have_equal_indexes? source
1092
- index = source.values[0].index
2201
+ idx = source.values[0].index
1093
2202
 
1094
2203
  source.all? do |name, vector|
1095
- index == vector.index
2204
+ idx == vector.index
1096
2205
  end
1097
2206
  end
1098
2207