daru 0.0.5 → 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.build.sh +14 -0
- data/.travis.yml +26 -4
- data/CONTRIBUTING.md +31 -0
- data/Gemfile +1 -2
- data/{History.txt → History.md} +110 -44
- data/README.md +21 -288
- data/Rakefile +1 -0
- data/daru.gemspec +12 -8
- data/lib/daru.rb +36 -1
- data/lib/daru/accessors/array_wrapper.rb +8 -3
- data/lib/daru/accessors/gsl_wrapper.rb +113 -0
- data/lib/daru/accessors/nmatrix_wrapper.rb +6 -17
- data/lib/daru/core/group_by.rb +0 -1
- data/lib/daru/dataframe.rb +1192 -83
- data/lib/daru/extensions/rserve.rb +21 -0
- data/lib/daru/index.rb +14 -0
- data/lib/daru/io/io.rb +170 -8
- data/lib/daru/maths/arithmetic/dataframe.rb +4 -3
- data/lib/daru/maths/arithmetic/vector.rb +4 -4
- data/lib/daru/maths/statistics/dataframe.rb +48 -27
- data/lib/daru/maths/statistics/vector.rb +215 -33
- data/lib/daru/monkeys.rb +53 -7
- data/lib/daru/multi_index.rb +21 -4
- data/lib/daru/plotting/dataframe.rb +83 -25
- data/lib/daru/plotting/vector.rb +9 -10
- data/lib/daru/vector.rb +596 -61
- data/lib/daru/version.rb +3 -0
- data/spec/accessors/wrappers_spec.rb +51 -0
- data/spec/core/group_by_spec.rb +0 -2
- data/spec/daru_spec.rb +58 -0
- data/spec/dataframe_spec.rb +768 -73
- data/spec/extensions/rserve_spec.rb +52 -0
- data/spec/fixtures/bank2.dat +200 -0
- data/spec/fixtures/repeated_fields.csv +7 -0
- data/spec/fixtures/scientific_notation.csv +4 -0
- data/spec/fixtures/test_xls.xls +0 -0
- data/spec/io/io_spec.rb +161 -24
- data/spec/math/arithmetic/dataframe_spec.rb +26 -7
- data/spec/math/arithmetic/vector_spec.rb +8 -0
- data/spec/math/statistics/dataframe_spec.rb +16 -1
- data/spec/math/statistics/vector_spec.rb +215 -47
- data/spec/spec_helper.rb +21 -2
- data/spec/vector_spec.rb +368 -12
- metadata +99 -16
- data/lib/version.rb +0 -3
- data/notebooks/grouping_splitting_pivots.ipynb +0 -529
- data/notebooks/intro_with_music_data_.ipynb +0 -303
data/Rakefile
CHANGED
data/daru.gemspec
CHANGED
@@ -1,9 +1,9 @@
|
|
1
1
|
# coding: utf-8
|
2
2
|
$:.unshift File.expand_path("../lib", __FILE__)
|
3
3
|
|
4
|
-
require 'version.rb'
|
4
|
+
require 'daru/version.rb'
|
5
5
|
|
6
|
-
DESCRIPTION = <<MSG
|
6
|
+
Daru::DESCRIPTION = <<MSG
|
7
7
|
Daru (Data Analysis in RUby) is a library for analysis, manipulation and visualization
|
8
8
|
of data.
|
9
9
|
|
@@ -18,7 +18,7 @@ Gem::Specification.new do |spec|
|
|
18
18
|
spec.authors = ['Sameer Deshmukh']
|
19
19
|
spec.email = ['sameer.deshmukh93@gmail.com']
|
20
20
|
spec.summary = %q{Data Analysis in RUby}
|
21
|
-
spec.description = DESCRIPTION
|
21
|
+
spec.description = Daru::DESCRIPTION
|
22
22
|
spec.homepage = "http://github.com/v0dro/daru"
|
23
23
|
spec.license = 'BSD-2'
|
24
24
|
|
@@ -27,12 +27,16 @@ Gem::Specification.new do |spec|
|
|
27
27
|
spec.test_files = spec.files.grep(%r{^(test|spec|features)/})
|
28
28
|
spec.require_paths = ["lib"]
|
29
29
|
|
30
|
-
spec.
|
30
|
+
spec.add_runtime_dependency 'reportbuilder', '~> 1.4'
|
31
|
+
spec.add_runtime_dependency 'spreadsheet', '~> 1.0.3'
|
32
|
+
|
33
|
+
spec.add_development_dependency 'bundler', '~> 1.10'
|
31
34
|
spec.add_development_dependency 'rake'
|
35
|
+
spec.add_development_dependency 'rserve-client', '~> 0.3'
|
32
36
|
spec.add_development_dependency 'rspec'
|
33
37
|
spec.add_development_dependency 'awesome_print'
|
34
|
-
spec.add_development_dependency 'nyaplot'
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
+
spec.add_development_dependency 'nyaplot', '~> 0.1.5'
|
39
|
+
spec.add_development_dependency 'nmatrix', '~> 0.1.0'
|
40
|
+
spec.add_development_dependency 'distribution', '~> 0.7'
|
41
|
+
spec.add_development_dependency 'gsl-nmatrix', '~>1.17'
|
38
42
|
end
|
data/lib/daru.rb
CHANGED
@@ -2,10 +2,45 @@ def jruby?
|
|
2
2
|
RUBY_ENGINE == 'jruby'
|
3
3
|
end
|
4
4
|
|
5
|
-
|
5
|
+
module Daru
|
6
|
+
SPLIT_TOKEN = ','
|
7
|
+
class << self
|
8
|
+
@@lazy_update = false
|
9
|
+
|
10
|
+
# A variable which will set whether Vector metadata is updated immediately or lazily.
|
11
|
+
# Call the #update method every time values are set or removed in order to update
|
12
|
+
# metadata like positions of missing values.
|
13
|
+
attr_accessor :lazy_update
|
14
|
+
|
15
|
+
def create_has_library(library)
|
16
|
+
define_singleton_method("has_#{library}?") do
|
17
|
+
cv = "@@#{library}"
|
18
|
+
unless class_variable_defined? cv
|
19
|
+
begin
|
20
|
+
require library.to_s
|
21
|
+
class_variable_set(cv, true)
|
22
|
+
rescue LoadError
|
23
|
+
class_variable_set(cv, false)
|
24
|
+
end
|
25
|
+
end
|
26
|
+
class_variable_get(cv)
|
27
|
+
end
|
28
|
+
end
|
29
|
+
end
|
30
|
+
|
31
|
+
create_has_library :gsl
|
32
|
+
create_has_library :nmatrix
|
33
|
+
create_has_library :nyaplot
|
34
|
+
end
|
35
|
+
|
36
|
+
autoload :Spreadsheet, 'spreadsheet'
|
37
|
+
autoload :CSV, 'csv'
|
38
|
+
|
6
39
|
require 'matrix'
|
7
40
|
require 'securerandom'
|
41
|
+
require 'reportbuilder'
|
8
42
|
|
43
|
+
require 'daru/version.rb'
|
9
44
|
require 'daru/index.rb'
|
10
45
|
require 'daru/multi_index.rb'
|
11
46
|
require 'daru/vector.rb'
|
@@ -3,13 +3,18 @@ module Daru
|
|
3
3
|
# Internal class for wrapping ruby array
|
4
4
|
class ArrayWrapper
|
5
5
|
include Enumerable
|
6
|
+
extend Forwardable
|
6
7
|
|
8
|
+
def_delegators :@data, :slice!
|
9
|
+
|
7
10
|
def each(&block)
|
8
11
|
@data.each(&block)
|
12
|
+
self
|
9
13
|
end
|
10
14
|
|
11
15
|
def map!(&block)
|
12
16
|
@data.map!(&block)
|
17
|
+
self
|
13
18
|
end
|
14
19
|
|
15
20
|
attr_accessor :size
|
@@ -22,8 +27,8 @@ module Daru
|
|
22
27
|
set_size
|
23
28
|
end
|
24
29
|
|
25
|
-
def [] index
|
26
|
-
@data[index]
|
30
|
+
def [] *index
|
31
|
+
@data[*index]
|
27
32
|
end
|
28
33
|
|
29
34
|
def []= index, value
|
@@ -62,7 +67,7 @@ module Daru
|
|
62
67
|
end
|
63
68
|
|
64
69
|
def mean
|
65
|
-
sum.quo(@size - @context.
|
70
|
+
sum.quo(@size - @context.missing_positions.size).to_f
|
66
71
|
end
|
67
72
|
|
68
73
|
def product
|
@@ -0,0 +1,113 @@
|
|
1
|
+
module Daru
|
2
|
+
module Accessors
|
3
|
+
module GSLStatistics
|
4
|
+
def vector_standardized_compute(m,sd)
|
5
|
+
Daru::Vector.new @data.collect { |x| (x.to_f - m).quo(sd) }, dtype: :gsl,
|
6
|
+
index: @context.index, name: @context.name
|
7
|
+
end
|
8
|
+
|
9
|
+
def vector_centered_compute(m)
|
10
|
+
Daru::Vector.new @data.collect {|x| (x.to_f - m)}, dtype: :gsl,
|
11
|
+
index: @context.index, name: @context.name
|
12
|
+
end
|
13
|
+
|
14
|
+
def sample_with_replacement(sample=1)
|
15
|
+
r = GSL::Rng.alloc(GSL::Rng::MT19937,rand(10000))
|
16
|
+
Daru::Vector.new(r.sample(@data, sample).to_a, dtype: :gsl,
|
17
|
+
index: @context.index, name: @context.name)
|
18
|
+
end
|
19
|
+
|
20
|
+
def sample_without_replacement(sample=1)
|
21
|
+
r = GSL::Rng.alloc(GSL::Rng::MT19937,rand(10000))
|
22
|
+
r.choose(@data, sample).to_a
|
23
|
+
end
|
24
|
+
|
25
|
+
def median
|
26
|
+
GSL::Stats::median_from_sorted_data(@data.sort)
|
27
|
+
end
|
28
|
+
|
29
|
+
def variance_sample(m)
|
30
|
+
@data.variance_m
|
31
|
+
end
|
32
|
+
|
33
|
+
def standard_deviation_sample(m)
|
34
|
+
@data.sd(m)
|
35
|
+
end
|
36
|
+
|
37
|
+
def variance_population(m)
|
38
|
+
@data.variance_with_fixed_mean(m)
|
39
|
+
end
|
40
|
+
|
41
|
+
def standard_deviation_population m
|
42
|
+
@data.sd_with_fixed_mean(m)
|
43
|
+
end
|
44
|
+
|
45
|
+
def skew
|
46
|
+
@data.skew
|
47
|
+
end
|
48
|
+
|
49
|
+
def kurtosis
|
50
|
+
@data.kurtosis
|
51
|
+
end
|
52
|
+
end
|
53
|
+
|
54
|
+
class GSLWrapper
|
55
|
+
include Enumerable
|
56
|
+
extend Forwardable
|
57
|
+
include Daru::Accessors::GSLStatistics
|
58
|
+
|
59
|
+
def_delegators :@data, :[], :size, :to_a, :each, :mean,
|
60
|
+
:sum, :prod, :max, :min
|
61
|
+
|
62
|
+
alias :product :prod
|
63
|
+
|
64
|
+
attr_reader :data
|
65
|
+
|
66
|
+
def each(&block)
|
67
|
+
@data.each(&block)
|
68
|
+
self
|
69
|
+
end
|
70
|
+
|
71
|
+
def map!(&block)
|
72
|
+
@data.map!(&block)
|
73
|
+
self
|
74
|
+
end
|
75
|
+
|
76
|
+
def initialize data, context
|
77
|
+
@data = ::GSL::Vector.alloc(data)
|
78
|
+
@context = context
|
79
|
+
end
|
80
|
+
|
81
|
+
def []= index, element
|
82
|
+
if index == size
|
83
|
+
push element
|
84
|
+
else
|
85
|
+
@data[index] = element
|
86
|
+
end
|
87
|
+
end
|
88
|
+
|
89
|
+
def delete_at index
|
90
|
+
@data.delete_at index
|
91
|
+
end
|
92
|
+
|
93
|
+
def index key
|
94
|
+
@data.to_a.index key
|
95
|
+
end
|
96
|
+
|
97
|
+
def push value
|
98
|
+
@data = @data.concat value
|
99
|
+
self
|
100
|
+
end
|
101
|
+
alias :<< :push
|
102
|
+
alias :concat :push
|
103
|
+
|
104
|
+
def dup
|
105
|
+
GSLWrapper.new(@data.to_a, @context)
|
106
|
+
end
|
107
|
+
|
108
|
+
def == other
|
109
|
+
@data == other.data
|
110
|
+
end
|
111
|
+
end
|
112
|
+
end
|
113
|
+
end if Daru.has_gsl?
|
@@ -1,9 +1,3 @@
|
|
1
|
-
begin
|
2
|
-
require 'nmatrix' unless jruby?
|
3
|
-
rescue LoadError => e
|
4
|
-
puts "Please install the nmatrix gem for fast and efficient data storage."
|
5
|
-
end
|
6
|
-
|
7
1
|
module Daru
|
8
2
|
module Accessors
|
9
3
|
# Internal class for wrapping NMatrix
|
@@ -12,23 +6,18 @@ module Daru
|
|
12
6
|
|
13
7
|
def each(&block)
|
14
8
|
@data[0...@size].each(&block)
|
15
|
-
|
16
|
-
|
17
|
-
def map(&block)
|
18
|
-
@data[0...@size].map(&block)
|
9
|
+
self
|
19
10
|
end
|
20
11
|
|
21
12
|
def map!(&block)
|
22
13
|
@data = NMatrix.new [@size*2], map(&block).to_a, dtype: nm_dtype
|
14
|
+
self
|
23
15
|
end
|
24
16
|
|
25
17
|
def inject(*args, &block)
|
26
18
|
@data[0...@size].inject(*args, &block)
|
27
19
|
end
|
28
20
|
|
29
|
-
alias_method :recode, :map
|
30
|
-
alias_method :recode!, :map!
|
31
|
-
|
32
21
|
attr_reader :size, :data, :nm_dtype
|
33
22
|
|
34
23
|
def initialize vector, context, nm_dtype=:int32
|
@@ -39,8 +28,8 @@ module Daru
|
|
39
28
|
# init with twice the storage for reducing the need to resize
|
40
29
|
end
|
41
30
|
|
42
|
-
def [] index
|
43
|
-
return @data[index] if index < @size
|
31
|
+
def [] *index
|
32
|
+
return @data[*index] if index[0] < @size
|
44
33
|
nil
|
45
34
|
end
|
46
35
|
|
@@ -79,7 +68,7 @@ module Daru
|
|
79
68
|
end
|
80
69
|
|
81
70
|
def dup
|
82
|
-
NMatrixWrapper.new @data.to_a, @context, @nm_dtype
|
71
|
+
NMatrixWrapper.new @data[0...@size].to_a, @context, @nm_dtype
|
83
72
|
end
|
84
73
|
|
85
74
|
def resize size = @size*2
|
@@ -109,4 +98,4 @@ module Daru
|
|
109
98
|
end
|
110
99
|
end
|
111
100
|
end
|
112
|
-
end
|
101
|
+
end if Daru.has_nmatrix?
|
data/lib/daru/core/group_by.rb
CHANGED
data/lib/daru/dataframe.rb
CHANGED
@@ -12,17 +12,82 @@ module Daru
|
|
12
12
|
|
13
13
|
include Daru::Maths::Arithmetic::DataFrame
|
14
14
|
include Daru::Maths::Statistics::DataFrame
|
15
|
-
include Daru::Plotting::DataFrame
|
15
|
+
include Daru::Plotting::DataFrame if Daru.has_nyaplot?
|
16
16
|
|
17
17
|
class << self
|
18
|
-
# Load data from a CSV file.
|
19
|
-
#
|
18
|
+
# Load data from a CSV file. Specify an optional block to grab the CSV
|
19
|
+
# object and pre-condition it (for example use the `convert` or
|
20
|
+
# `header_convert` methods).
|
20
21
|
#
|
21
|
-
#
|
22
|
+
# == Arguments
|
23
|
+
#
|
24
|
+
# * path - Path of the file to load specified as a String.
|
25
|
+
#
|
26
|
+
# == Options
|
27
|
+
#
|
28
|
+
# Accepts the same options as the Daru::DataFrame constructor and CSV.open()
|
29
|
+
# and uses those to eventually construct the resulting DataFrame.
|
30
|
+
#
|
31
|
+
# == Verbose Description
|
32
|
+
#
|
33
|
+
# You can specify all the options to the `.from_csv` function that you
|
34
|
+
# do to the Ruby `CSV.read()` function, since this is what is used internally.
|
35
|
+
#
|
36
|
+
# For example, if the columns in your CSV file are separated by something
|
37
|
+
# other than commas, you can use the `:col_sep` option. If you want to
|
38
|
+
# convert numeric values to numbers and not keep them as strings, you can
|
39
|
+
# use the `:converters` option and set it to `:numeric`.
|
40
|
+
#
|
41
|
+
# The `.from_csv` function uses the following defaults for reading CSV files
|
42
|
+
# (that are passed into the `CSV.read()` function):
|
43
|
+
#
|
44
|
+
# {
|
45
|
+
# :col_sep => ',',
|
46
|
+
# :converters => :numeric
|
47
|
+
# }
|
22
48
|
def from_csv path, opts={}, &block
|
23
49
|
Daru::IO.from_csv path, opts, &block
|
24
50
|
end
|
25
51
|
|
52
|
+
# Read data from an Excel file into a DataFrame.
|
53
|
+
#
|
54
|
+
# == Arguments
|
55
|
+
#
|
56
|
+
# * path - Path of the file to be read.
|
57
|
+
#
|
58
|
+
# == Options
|
59
|
+
#
|
60
|
+
# * :worksheet_id - ID of the worksheet that is to be read.
|
61
|
+
def from_excel path, opts={}, &block
|
62
|
+
Daru::IO.from_excel path, opts, &block
|
63
|
+
end
|
64
|
+
|
65
|
+
# Read a database query and returns a Dataset
|
66
|
+
#
|
67
|
+
# USE:
|
68
|
+
#
|
69
|
+
# dbh = DBI.connect("DBI:Mysql:database:localhost", "user", "password")
|
70
|
+
# Daru::DataFrame.from_sql(dbh, "SELECT * FROM test")
|
71
|
+
def from_sql dbh, query
|
72
|
+
Daru::IO.from_sql dbh, query
|
73
|
+
end
|
74
|
+
|
75
|
+
# Read the database from a plaintext file. For this method to work,
|
76
|
+
# the data should be present in a plain text file in columns. See
|
77
|
+
# spec/fixtures/bank2.dat for an example.
|
78
|
+
#
|
79
|
+
# == Arguments
|
80
|
+
#
|
81
|
+
# * path - Path of the file to be read.
|
82
|
+
# * fields - Vector names of the resulting database.
|
83
|
+
#
|
84
|
+
# == Usage
|
85
|
+
#
|
86
|
+
# df = Daru::DataFrame.from_plaintext 'spec/fixtures/bank2.dat', [:v1,:v2,:v3,:v4,:v5,:v6]
|
87
|
+
def from_plaintext path, fields
|
88
|
+
Daru::IO.from_plaintext path, fields
|
89
|
+
end
|
90
|
+
|
26
91
|
# Create DataFrame by specifying rows as an Array of Arrays or Array of
|
27
92
|
# Daru::Vector objects.
|
28
93
|
def rows source, opts={}
|
@@ -52,6 +117,58 @@ module Daru
|
|
52
117
|
|
53
118
|
df
|
54
119
|
end
|
120
|
+
|
121
|
+
# Generates a new dataset, using three vectors
|
122
|
+
# - Rows
|
123
|
+
# - Columns
|
124
|
+
# - Values
|
125
|
+
#
|
126
|
+
# For example, you have these values
|
127
|
+
#
|
128
|
+
# x y v
|
129
|
+
# a a 0
|
130
|
+
# a b 1
|
131
|
+
# b a 1
|
132
|
+
# b b 0
|
133
|
+
#
|
134
|
+
# You obtain
|
135
|
+
# id a b
|
136
|
+
# a 0 1
|
137
|
+
# b 1 0
|
138
|
+
#
|
139
|
+
# Useful to process outputs from databases
|
140
|
+
def crosstab_by_assignation rows, columns, values
|
141
|
+
raise "Three vectors should be equal size" if
|
142
|
+
rows.size != columns.size or rows.size!=values.size
|
143
|
+
|
144
|
+
cols_values = columns.factors
|
145
|
+
cols_n = cols_values.size
|
146
|
+
|
147
|
+
h_rows = rows.factors.inject({}) do |a,v|
|
148
|
+
a[v] = cols_values.inject({}) do |a1,v1|
|
149
|
+
a1[v1]=nil
|
150
|
+
a1
|
151
|
+
end
|
152
|
+
a
|
153
|
+
end
|
154
|
+
|
155
|
+
values.each_index do |i|
|
156
|
+
h_rows[rows[i]][columns[i]] = values[i]
|
157
|
+
end
|
158
|
+
df = Daru::DataFrame.new({}, order: [:_id] + cols_values.to_a)
|
159
|
+
|
160
|
+
rows.factors.each do |row|
|
161
|
+
n_row = Array.new(cols_n+1)
|
162
|
+
n_row[0] = row
|
163
|
+
cols_values.each_index do |i|
|
164
|
+
n_row[i+1] = h_rows[row][cols_values[i]]
|
165
|
+
end
|
166
|
+
|
167
|
+
df.add_row(n_row)
|
168
|
+
end
|
169
|
+
df.update
|
170
|
+
df
|
171
|
+
end
|
55
172
|
end
|
56
173
|
|
57
174
|
# The vectors (columns) index of the DataFrame
|
@@ -67,8 +184,29 @@ module Daru
|
|
67
184
|
attr_reader :size
|
68
185
|
|
69
186
|
# DataFrame basically consists of an Array of Vector objects.
|
70
|
-
#
|
71
|
-
#
|
187
|
+
# These objects are indexed by row and column by vectors and index Index objects.
|
188
|
+
#
|
189
|
+
# == Arguments
|
190
|
+
#
|
191
|
+
# * source - Source from which the DataFrame is to be initialized. Can be a Hash
|
192
|
+
# of names and vectors (array or Daru::Vector), an array of arrays or
|
193
|
+
# array of Daru::Vectors.
|
194
|
+
#
|
195
|
+
# == Options
|
196
|
+
#
|
197
|
+
# +:order+ - An *Array*/*Daru::Index*/*Daru::MultiIndex* containing the order in
|
198
|
+
# which Vectors should appear in the DataFrame.
|
199
|
+
#
|
200
|
+
# +:index+ - An *Array*/*Daru::Index*/*Daru::MultiIndex* containing the order
|
201
|
+
# in which rows of the DataFrame will be named.
|
202
|
+
#
|
203
|
+
# +:name+ - A name for the DataFrame.
|
204
|
+
#
|
205
|
+
# +:clone+ - Specify as *true* or *false*. When set to false, and Vector
|
206
|
+
# objects are passed for the source, the Vector objects will not be duplicated
|
207
|
+
# when creating the DataFrame. Will have no effect if Array is passed in
|
208
|
+
# the source, or if the passed Daru::Vectors have different indexes.
|
209
|
+
# Default to *true*.
|
72
210
|
#
|
73
211
|
# == Usage
|
74
212
|
# df = Daru::DataFrame.new({a: [1,2,3,4], b: [6,7,8,9]}, order: [:b, :a],
|
@@ -84,9 +222,12 @@ module Daru
|
|
84
222
|
def initialize source, opts={}
|
85
223
|
vectors = opts[:order]
|
86
224
|
index = opts[:index]
|
87
|
-
|
225
|
+
clone = opts[:clone] == false ? false : true
|
88
226
|
@data = []
|
89
227
|
|
228
|
+
temp_name = opts[:name]
|
229
|
+
@name = temp_name.is_a?(Numeric) ? temp_name : (temp_name || SecureRandom.uuid).to_sym
|
230
|
+
|
90
231
|
if source.empty?
|
91
232
|
@vectors = create_index vectors
|
92
233
|
@index = create_index index
|
@@ -109,7 +250,7 @@ module Daru
|
|
109
250
|
vectors.each_with_index do |name, idx|
|
110
251
|
hsh[name] = source[idx]
|
111
252
|
end
|
112
|
-
initialize(hsh, index: index, order: vectors, name: @name)
|
253
|
+
initialize(hsh, index: index, order: vectors, name: @name, clone: clone)
|
113
254
|
else # array of hashes
|
114
255
|
if vectors.nil?
|
115
256
|
@vectors = Daru::Index.new source[0].keys.map(&:to_sym)
|
@@ -143,13 +284,19 @@ module Daru
|
|
143
284
|
all_indexes.flatten!.uniq!.sort!
|
144
285
|
|
145
286
|
@index = Daru::Index.new all_indexes
|
287
|
+
clone = true
|
146
288
|
end
|
147
|
-
@vectors.each do |vector|
|
148
|
-
@data << Daru::Vector.new([], name: vector, index: @index)
|
149
289
|
|
150
|
-
|
151
|
-
|
290
|
+
if clone
|
291
|
+
@vectors.each do |vector|
|
292
|
+
@data << Daru::Vector.new([], name: vector, index: @index)
|
293
|
+
|
294
|
+
@index.each do |idx|
|
295
|
+
@data[@vectors[vector]][idx] = source[vector][idx]
|
296
|
+
end
|
152
297
|
end
|
298
|
+
else
|
299
|
+
@data.concat source.values
|
153
300
|
end
|
154
301
|
else
|
155
302
|
@index = create_index(index || source.values[0].size)
|
@@ -163,6 +310,7 @@ module Daru
|
|
163
310
|
|
164
311
|
set_size
|
165
312
|
validate
|
313
|
+
update
|
166
314
|
end
|
167
315
|
|
168
316
|
# Access row or vector. Specify name of row/vector followed by axis(:row, :vector).
|
@@ -176,6 +324,7 @@ module Daru
|
|
176
324
|
else
|
177
325
|
axis = :vector
|
178
326
|
end
|
327
|
+
names.map! { |e| e.respond_to?(:to_sym) ? e.to_sym : e }
|
179
328
|
|
180
329
|
if axis == :vector
|
181
330
|
access_vector *names
|
@@ -194,11 +343,14 @@ module Daru
|
|
194
343
|
# of the vector will be matched against the row/vector indexes of the DataFrame
|
195
344
|
# before an insertion is performed. Unmatched indexes will be set to nil.
|
196
345
|
def []=(*args)
|
197
|
-
|
198
|
-
|
346
|
+
axis = args.include?(:row) ? :row : :vector
|
347
|
+
args.delete :vector
|
348
|
+
args.delete :row
|
349
|
+
|
350
|
+
name = args[0..-2]
|
199
351
|
vector = args[-1]
|
352
|
+
name.map! { |e| e.respond_to?(:to_sym) ? e.to_sym : e }
|
200
353
|
|
201
|
-
axis = (!axis.is_a?(Symbol) and (axis != :vector or axis != :row)) ? :vector : axis
|
202
354
|
if axis == :vector
|
203
355
|
insert_or_modify_vector name, vector
|
204
356
|
elsif axis == :row
|
@@ -222,6 +374,14 @@ module Daru
|
|
222
374
|
vector[name]
|
223
375
|
end
|
224
376
|
|
377
|
+
def add_row row, index=nil
|
378
|
+
self.row[index || @size] = row
|
379
|
+
end
|
380
|
+
|
381
|
+
def add_vector n, vector
|
382
|
+
self[n] = vector
|
383
|
+
end
|
384
|
+
|
225
385
|
# Access a row or set/create a row. Refer #[] and #[]= docs for details.
|
226
386
|
#
|
227
387
|
# == Usage
|
@@ -232,13 +392,77 @@ module Daru
|
|
232
392
|
end
|
233
393
|
|
234
394
|
# Duplicate the DataFrame entirely.
|
235
|
-
|
236
|
-
|
237
|
-
|
238
|
-
|
395
|
+
#
|
396
|
+
# == Arguments
|
397
|
+
#
|
398
|
+
# * +vectors_to_dup+ - An Array specifying the names of Vectors to
|
399
|
+
# be duplicated. Will duplicate the entire DataFrame if not specified.
|
400
|
+
def dup vectors_to_dup=nil
|
401
|
+
vectors_to_dup = @vectors unless vectors_to_dup
|
402
|
+
|
403
|
+
new_order =
|
404
|
+
if vectors.is_a?(MultiIndex)
|
405
|
+
src = []
|
406
|
+
vectors_to_dup.each do |vec|
|
407
|
+
src << @data[@vectors[vec]].dup
|
408
|
+
end
|
409
|
+
|
410
|
+
Daru::MultiIndex.new(vectors_to_dup)
|
411
|
+
else
|
412
|
+
src = {}
|
413
|
+
vectors_to_dup.each do |vector|
|
414
|
+
src[vector] = @data[@vectors[vector]].dup
|
415
|
+
end
|
416
|
+
|
417
|
+
Daru::Index.new(vectors_to_dup)
|
418
|
+
end
|
419
|
+
|
420
|
+
Daru::DataFrame.new src, order: new_order, index: @index.dup, name: @name, clone: true
|
421
|
+
end
|
422
|
+
|
423
|
+
# Only clone the structure of the DataFrame.
|
424
|
+
def clone_structure
|
425
|
+
Daru::DataFrame.new([], order: @vectors.dup, index: @index.dup, name: @name)
|
426
|
+
end
|
427
|
+
|
428
|
+
# Returns a 'view' of the DataFrame, i.e the object ID's of vectors are
|
429
|
+
# preserved.
|
430
|
+
#
|
431
|
+
# == Arguments
|
432
|
+
#
|
433
|
+
# +vectors_to_clone+ - Names of vectors to clone. Optional. Will return
|
434
|
+
# a view of the whole data frame otherwise.
|
435
|
+
def clone *vectors_to_clone
|
436
|
+
vectors_to_clone.flatten! unless vectors_to_clone.all? { |a| !a.is_a?(Array) }
|
437
|
+
return super if vectors_to_clone.empty?
|
438
|
+
|
439
|
+
h = vectors_to_clone.inject({}) do |hsh, vec|
|
440
|
+
hsh[vec] = self[vec]
|
441
|
+
hsh
|
239
442
|
end
|
443
|
+
Daru::DataFrame.new(h, clone: false)
|
444
|
+
end
|
240
445
|
|
241
|
-
|
446
|
+
# Returns a 'shallow' copy of DataFrame if missing data is not present,
|
447
|
+
# or a full copy of only valid data if missing data is present.
|
448
|
+
def clone_only_valid
|
449
|
+
if has_missing_data?
|
450
|
+
dup_only_valid
|
451
|
+
else
|
452
|
+
clone
|
453
|
+
end
|
454
|
+
end
|
455
|
+
|
456
|
+
# Creates a new duplicate dataframe containing only rows
|
457
|
+
# without a single missing value.
|
458
|
+
def dup_only_valid vecs=nil
|
459
|
+
rows_with_nil = @data.inject([]) do |memo, vector|
|
460
|
+
memo.concat vector.missing_positions
|
461
|
+
memo
|
462
|
+
end.uniq
|
463
|
+
|
464
|
+
row_indexes = @index.to_a
|
465
|
+
(vecs.nil? ? self : dup(vecs)).row[*(row_indexes - rows_with_nil)]
|
242
466
|
end
|
243
467
|
|
244
468
|
# Iterate over each vector
|
@@ -286,21 +510,205 @@ module Daru
|
|
286
510
|
self
|
287
511
|
end
|
288
512
|
|
289
|
-
#
|
290
|
-
#
|
291
|
-
#
|
292
|
-
#
|
513
|
+
# Iterate over each row or vector of the DataFrame. Specify axis
|
514
|
+
# by passing :vector or :row as the argument. Default to :vector.
|
515
|
+
#
|
516
|
+
# == Description
|
517
|
+
#
|
518
|
+
# `#each` works exactly like Array#each. The default mode for `each`
|
519
|
+
# is to iterate over the columns of the DataFrame. To iterate over
|
520
|
+
# rows you must pass the axis, i.e `:row` as an argument.
|
521
|
+
#
|
522
|
+
# == Arguments
|
523
|
+
#
|
524
|
+
# * +axis+ - The axis to iterate over. Can be :vector (or :column)
|
525
|
+
# or :row. Default to :vector.
|
526
|
+
def each axis=:vector, &block
|
527
|
+
if axis == :vector or axis == :column
|
528
|
+
each_vector(&block)
|
529
|
+
elsif axis == :row
|
530
|
+
each_row(&block)
|
531
|
+
else
|
532
|
+
raise ArgumentError, "Unknown axis #{axis}"
|
533
|
+
end
|
534
|
+
end
|
535
|
+
|
536
|
+
# Iterate over a row or vector and return results in a Daru::Vector.
|
537
|
+
# Specify axis with :vector or :row. Default to :vector.
|
538
|
+
#
|
539
|
+
# == Description
|
540
|
+
#
|
541
|
+
# The #collect iterator works similar to #map, the only difference
|
542
|
+
# being that it returns a Daru::Vector comprising of the results of
|
543
|
+
# each block run. The resultant Vector has the same index as that
|
544
|
+
# of the axis over which collect has iterated. It also accepts the
|
545
|
+
# optional axis argument.
|
546
|
+
#
|
547
|
+
# == Arguments
|
548
|
+
#
|
549
|
+
# * +axis+ - The axis to iterate over. Can be :vector (or :column)
|
550
|
+
# or :row. Default to :vector.
|
551
|
+
def collect axis=:vector, &block
|
552
|
+
if axis == :vector or axis == :column
|
553
|
+
collect_vectors(&block)
|
554
|
+
elsif axis == :row
|
555
|
+
collect_rows(&block)
|
556
|
+
else
|
557
|
+
raise ArgumentError, "Unknown axis #{axis}"
|
558
|
+
end
|
559
|
+
end
|
560
|
+
|
561
|
+
# Map over each vector or row of the data frame according to
|
562
|
+
# the argument specified. Will return an Array of the resulting
|
563
|
+
# elements. To map over each row/vector and get a DataFrame,
|
564
|
+
# see #recode.
|
565
|
+
#
|
566
|
+
# == Description
|
567
|
+
#
|
568
|
+
# The #map iterator works like Array#map. The value returned by
|
569
|
+
# each run of the block is added to an Array and the Array is
|
570
|
+
# returned. This method also accepts an axis argument, like #each.
|
571
|
+
# The default is :vector.
|
572
|
+
#
|
573
|
+
# == Arguments
|
574
|
+
#
|
575
|
+
# * +axis+ - The axis to map over. Can be :vector (or :column) or :row.
|
576
|
+
# Default to :vector.
|
577
|
+
def map axis=:vector, &block
|
578
|
+
if axis == :vector or axis == :column
|
579
|
+
map_vectors(&block)
|
580
|
+
elsif axis == :row
|
581
|
+
map_rows(&block)
|
582
|
+
else
|
583
|
+
raise ArgumentError, "Unknown axis #{axis}"
|
584
|
+
end
|
585
|
+
end
|
586
|
+
|
587
|
+
# Destructive map. Modifies the DataFrame. Each run of the block
|
588
|
+
# must return a Daru::Vector. You can specify the axis to map over
|
589
|
+
# as the argument. Default to :vector.
|
590
|
+
#
|
591
|
+
# == Arguments
|
592
|
+
#
|
593
|
+
# * +axis+ - The axis to map over. Can be :vector (or :column) or :row.
|
594
|
+
# Default to :vector.
|
595
|
+
def map! axis=:vector, &block
|
596
|
+
if axis == :vector or axis == :column
|
597
|
+
map_vectors!(&block)
|
598
|
+
elsif axis == :row
|
599
|
+
map_rows!(&block)
|
600
|
+
end
|
601
|
+
end
|
602
|
+
|
603
|
+
# Maps over the DataFrame and returns a DataFrame. Each run of the
|
604
|
+
# block must return a Daru::Vector object. You can specify the axis
|
605
|
+
# to map over. Default to :vector.
|
606
|
+
#
|
607
|
+
# == Description
|
608
|
+
#
|
609
|
+
# Recode works similarly to #map, but an important difference between
|
610
|
+
# the two is that recode returns a modified Daru::DataFrame instead
|
611
|
+
# of an Array. For this reason, #recode expects every run of the
|
612
|
+
# block to return a Daru::Vector.
|
613
|
+
#
|
614
|
+
# Just like map and each, recode also accepts an optional _axis_ argument.
|
615
|
+
#
|
616
|
+
# == Arguments
|
617
|
+
#
|
618
|
+
# * +axis+ - The axis to map over. Can be :vector (or :column) or :row.
|
619
|
+
# Default to :vector.
|
620
|
+
def recode axis=:vector, &block
|
621
|
+
if axis == :vector or axis == :column
|
622
|
+
recode_vectors(&block)
|
623
|
+
elsif axis == :row
|
624
|
+
recode_rows(&block)
|
625
|
+
end
|
626
|
+
end
|
627
|
+
|
628
|
+
# Retain vectors or rows if the block returns a truthy value.
|
629
|
+
#
|
630
|
+
# == Description
|
631
|
+
#
|
632
|
+
# For filtering out certain rows/vectors based on their values,
|
633
|
+
# use the #filter method. By default it iterates over vectors and
|
634
|
+
# keeps those vectors for which the block returns true. It accepts
|
635
|
+
# an optional axis argument which lets you specify whether you want
|
636
|
+
# to iterate over vectors or rows.
|
637
|
+
#
|
638
|
+
# == Arguments
|
639
|
+
#
|
640
|
+
# * +axis+ - The axis to map over. Can be :vector (or :column) or :row.
|
641
|
+
# Default to :vector.
|
642
|
+
#
|
643
|
+
# == Usage
|
644
|
+
#
|
645
|
+
# # Filter vectors
|
646
|
+
#
|
647
|
+
# df.filter do |vector|
|
648
|
+
# vector.type == :numeric and vector.median < 50
|
649
|
+
# end
|
650
|
+
#
|
651
|
+
# # Filter rows
|
652
|
+
#
|
653
|
+
# df.filter(:row) do |row|
|
654
|
+
# row[:a] + row[:d] < 100
|
655
|
+
# end
|
656
|
+
def filter axis=:vector, &block
|
657
|
+
if axis == :vector or axis == :column
|
658
|
+
filter_vectors(&block)
|
659
|
+
elsif axis == :row
|
660
|
+
filter_rows(&block)
|
661
|
+
end
|
662
|
+
end
|
663
|
+
|
664
|
+
def recode_vectors &block
|
665
|
+
block_given? or return to_enum(:recode_vectors)
|
666
|
+
|
667
|
+
df = self.dup
|
668
|
+
df.each_vector_with_index do |v, i|
|
669
|
+
ret = yield v
|
670
|
+
ret.is_a?(Daru::Vector) or raise TypeError, "Every iteration must return Daru::Vector not #{ret.class}"
|
671
|
+
df[*i] = ret
|
672
|
+
end
|
673
|
+
|
674
|
+
df
|
675
|
+
end
|
676
|
+
|
677
|
+
def recode_rows &block
|
678
|
+
block_given? or return to_enum(:recode_rows)
|
679
|
+
|
680
|
+
df = self.dup
|
681
|
+
df.each_row_with_index do |r, i|
|
682
|
+
ret = yield r
|
683
|
+
ret.is_a?(Daru::Vector) or raise TypeError, "Every iteration must return Daru::Vector not #{ret.class}"
|
684
|
+
df.row[i] = ret
|
685
|
+
end
|
686
|
+
|
687
|
+
df
|
688
|
+
end
|
689
|
+
|
690
|
+
# Map each vector and return an Array.
|
293
691
|
def map_vectors(&block)
|
294
692
|
return to_enum(:map_vectors) unless block_given?
|
295
693
|
|
296
|
-
|
694
|
+
arry = []
|
695
|
+
@data.each do |vec|
|
696
|
+
arry << yield(vec)
|
697
|
+
end
|
698
|
+
|
699
|
+
arry
|
297
700
|
end
|
298
701
|
|
299
702
|
# Destructive form of #map_vectors
|
300
703
|
def map_vectors!(&block)
|
301
704
|
return to_enum(:map_vectors!) unless block_given?
|
302
705
|
|
303
|
-
|
706
|
+
vectors.dup.each do |n|
|
707
|
+
v = yield self[n]
|
708
|
+
v.is_a?(Daru::Vector) or raise TypeError, "Must return a Daru::Vector not #{v.class}"
|
709
|
+
self[n] = v
|
710
|
+
end
|
711
|
+
|
304
712
|
self
|
305
713
|
end
|
306
714
|
|
@@ -308,37 +716,114 @@ module Daru
|
|
308
716
|
def map_vectors_with_index(&block)
|
309
717
|
return to_enum(:map_vectors_with_index) unless block_given?
|
310
718
|
|
311
|
-
|
312
|
-
|
313
|
-
|
719
|
+
dt = []
|
720
|
+
each_vector_with_index do |vector, name|
|
721
|
+
dt << yield(vector, name)
|
314
722
|
end
|
315
723
|
|
316
|
-
|
724
|
+
dt
|
317
725
|
end
|
318
726
|
|
319
727
|
# Map each row
|
320
728
|
def map_rows(&block)
|
321
729
|
return to_enum(:map_rows) unless block_given?
|
322
730
|
|
323
|
-
|
324
|
-
|
325
|
-
|
731
|
+
dt = []
|
732
|
+
each_row do |row|
|
733
|
+
dt << yield(row)
|
326
734
|
end
|
327
735
|
|
328
|
-
|
736
|
+
dt
|
329
737
|
end
|
330
738
|
|
331
739
|
def map_rows_with_index(&block)
|
332
740
|
return to_enum(:map_rows_with_index) unless block_given?
|
333
741
|
|
334
|
-
|
335
|
-
|
336
|
-
|
742
|
+
dt = []
|
743
|
+
each_row_with_index do |row, index|
|
744
|
+
dt << yield(row, index)
|
337
745
|
end
|
338
746
|
|
339
|
-
|
747
|
+
dt
|
340
748
|
end
|
341
749
|
|
750
|
+
def map_rows!(&block)
|
751
|
+
return to_enum(:map_rows!) unless block_given?
|
752
|
+
|
753
|
+
index.dup.each do |i|
|
754
|
+
r = yield self.row[i]
|
755
|
+
r.is_a?(Daru::Vector) or raise TypeError, "Returned object must be Daru::Vector not #{r.class}"
|
756
|
+
self.row[i] = r
|
757
|
+
end
|
758
|
+
|
759
|
+
self
|
760
|
+
end
|
761
|
+
|
762
|
+
# Retrieves a Daru::Vector, based on the result of calculation
|
763
|
+
# performed on each row.
|
764
|
+
def collect_rows &block
|
765
|
+
return to_enum(:collect_rows) unless block_given?
|
766
|
+
|
767
|
+
data = []
|
768
|
+
each_row do |row|
|
769
|
+
data.push yield(row)
|
770
|
+
end
|
771
|
+
|
772
|
+
Daru::Vector.new(data, index: @index)
|
773
|
+
end
|
774
|
+
|
775
|
+
def collect_row_with_index &block
|
776
|
+
return to_enum(:collect_row_with_index) unless block_given?
|
777
|
+
|
778
|
+
data = []
|
779
|
+
each_row_with_index do |row, i|
|
780
|
+
data.push yield(row, i)
|
781
|
+
end
|
782
|
+
|
783
|
+
Daru::Vector.new(data, index: @index)
|
784
|
+
end
|
785
|
+
|
786
|
+
# Retrieves a Daru::Vector, based on the result of calculation
|
787
|
+
# performed on each vector.
|
788
|
+
def collect_vectors &block
|
789
|
+
return to_enum(:collect_vectors) unless block_given?
|
790
|
+
|
791
|
+
data = []
|
792
|
+
each_vector do |vec|
|
793
|
+
data.push yield(vec)
|
794
|
+
end
|
795
|
+
|
796
|
+
Daru::Vector.new(data, index: @vectors)
|
797
|
+
end
|
798
|
+
|
799
|
+
def collect_vector_with_index &block
|
800
|
+
return to_enum(:collect_vector_with_index) unless block_given?
|
801
|
+
|
802
|
+
data = []
|
803
|
+
each_vector_with_index do |vec, i|
|
804
|
+
data.push yield(vec, i)
|
805
|
+
end
|
806
|
+
|
807
|
+
Daru::Vector.new(data, index: @vectors)
|
808
|
+
end
|
809
|
+
|
810
|
+
# Generate a matrix, based on vector names of the DataFrame.
|
811
|
+
#
|
812
|
+
# @return {::Matrix}
|
813
|
+
def collect_matrix
|
814
|
+
return to_enum(:collect_matrix) unless block_given?
|
815
|
+
|
816
|
+
vecs = vectors.to_a
|
817
|
+
rows = vecs.collect { |row|
|
818
|
+
vecs.collect { |col|
|
819
|
+
yield row,col
|
820
|
+
}
|
821
|
+
}
|
822
|
+
|
823
|
+
Matrix.rows(rows)
|
824
|
+
end
|
825
|
+
|
826
|
+
|
342
827
|
# Delete a vector
|
343
828
|
def delete_vector vector
|
344
829
|
if @vectors.include? vector
|
@@ -367,6 +852,20 @@ module Daru
|
|
367
852
|
set_size
|
368
853
|
end
|
369
854
|
|
855
|
+
# Creates a DataFrame with the random data, of n size.
|
856
|
+
# If n not given, uses original number of rows.
|
857
|
+
#
|
858
|
+
# @return {Daru::DataFrame}
|
859
|
+
def bootstrap(n=nil)
|
860
|
+
n ||= nrows
|
861
|
+
ds_boot = Daru::DataFrame.new({}, order: @vectors)
|
862
|
+
n.times do
|
863
|
+
ds_boot.add_row(row[rand(n)])
|
864
|
+
end
|
865
|
+
ds_boot.update
|
866
|
+
ds_boot
|
867
|
+
end
|
868
|
+
|
370
869
|
def keep_row_if &block
|
371
870
|
deletion = []
|
372
871
|
|
@@ -388,6 +887,16 @@ module Daru
|
|
388
887
|
end
|
389
888
|
end
|
390
889
|
|
890
|
+
# Creates a new vector with the data of a given field for the rows on which the block returns true
|
891
|
+
def filter_vector vec
|
892
|
+
d = []
|
893
|
+
each_row do |row|
|
894
|
+
d.push(row[vec]) if yield row
|
895
|
+
end
|
896
|
+
|
897
|
+
Daru::Vector.new(d)
|
898
|
+
end
|
899
|
+
|
391
900
|
# Iterates over each row and retains it in a new DataFrame if the block returns
|
392
901
|
# true for that row.
|
393
902
|
def filter_rows &block
|
@@ -419,18 +928,160 @@ module Daru
|
|
419
928
|
df
|
420
929
|
end
|
421
930
|
|
931
|
+
# Test each row with one or more tests. Each test is a Proc with the form
|
932
|
+
# *Proc.new {|row| row[:age] > 0}*
|
933
|
+
#
|
934
|
+
# The function returns an array with all errors.
|
935
|
+
def verify(*tests)
|
936
|
+
if(tests[0].is_a? Symbol)
|
937
|
+
id = tests[0]
|
938
|
+
tests.shift
|
939
|
+
else
|
940
|
+
id = @vectors.first
|
941
|
+
end
|
942
|
+
|
943
|
+
vr = []
|
944
|
+
i = 0
|
945
|
+
each(:row) do |row|
|
946
|
+
i += 1
|
947
|
+
tests.each do |test|
|
948
|
+
if !test[2].call(row)
|
949
|
+
values = ""
|
950
|
+
if test[1].size>0
|
951
|
+
values = " (" + test[1].collect{ |k| "#{k}=#{row[k]}" }.join(", ") + ")"
|
952
|
+
end
|
953
|
+
vr.push("#{i} [#{row[id]}]: #{test[0]}#{values}")
|
954
|
+
end
|
955
|
+
end
|
956
|
+
end
|
957
|
+
vr
|
958
|
+
end
|
959
|
+
|
960
|
+
# DSL for yielding each row and returning a Daru::Vector based on the
|
961
|
+
# value each run of the block returns.
|
962
|
+
#
|
963
|
+
# == Usage
|
964
|
+
#
|
965
|
+
# a1 = Daru::Vector.new([1, 2, 3, 4, 5, 6, 7])
|
966
|
+
# a2 = Daru::Vector.new([10, 20, 30, 40, 50, 60, 70])
|
967
|
+
# a3 = Daru::Vector.new([100, 200, 300, 400, 500, 600, 700])
|
968
|
+
# ds = Daru::DataFrame.new({ :a => a1, :b => a2, :c => a3 })
|
969
|
+
# total = ds.vector_by_calculation { a + b + c }
|
970
|
+
# # <Daru::Vector:82314050 @name = nil @size = 7 >
|
971
|
+
# # nil
|
972
|
+
# # 0 111
|
973
|
+
# # 1 222
|
974
|
+
# # 2 333
|
975
|
+
# # 3 444
|
976
|
+
# # 4 555
|
977
|
+
# # 5 666
|
978
|
+
# # 6 777
|
979
|
+
def vector_by_calculation &block
|
980
|
+
a = []
|
981
|
+
each_row do |r|
|
982
|
+
a.push r.instance_eval(&block)
|
983
|
+
end
|
984
|
+
|
985
|
+
Daru::Vector.new a, index: @index
|
986
|
+
end
|
987
|
+
|
988
|
+
# Returns a vector, based on a string with a calculation based
|
989
|
+
# on vector.
|
990
|
+
#
|
991
|
+
# The calculation will be eval'ed, so you can put any variable
|
992
|
+
# or expression valid on ruby.
|
993
|
+
#
|
994
|
+
# For example:
|
995
|
+
# a = Daru::Vector.new [1,2]
|
996
|
+
# b = Daru::Vector.new [3,4]
|
997
|
+
# ds = Daru::DataFrame.new({:a => a,:b => b})
|
998
|
+
# ds.compute("a+b")
|
999
|
+
# => Vector [4,6]
|
1000
|
+
def compute text, &block
|
1001
|
+
return instance_eval(&block) if block_given?
|
1002
|
+
instance_eval(text)
|
1003
|
+
end
|
1004
|
+
|
1005
|
+
# Return a vector with the number of missing values in each row.
|
1006
|
+
#
|
1007
|
+
# == Arguments
|
1008
|
+
#
|
1009
|
+
# * +missing_values+ - An Array of the values that should be
|
1010
|
+
# treated as 'missing'. The default missing value is *nil*.
|
1011
|
+
def missing_values_rows missing_values=[nil]
|
1012
|
+
number_of_missing = []
|
1013
|
+
each_row do |row|
|
1014
|
+
row.missing_values = missing_values
|
1015
|
+
number_of_missing << row.missing_positions.size
|
1016
|
+
end
|
1017
|
+
|
1018
|
+
Daru::Vector.new number_of_missing, index: @index, name: "#{@name}_missing_rows".to_sym
|
1019
|
+
end
|
1020
|
+
|
1021
|
+
# TODO: remove next version
|
1022
|
+
alias :vector_missing_values :missing_values_rows
|
1023
|
+
|
1024
|
+
def has_missing_data?
|
1025
|
+
!!@data.any? { |v| v.has_missing_data? }
|
1026
|
+
end
|
1027
|
+
|
1028
|
+
alias :flawed? :has_missing_data?
|
1029
|
+
|
1030
|
+
# Return a nested hash using vector names as keys and an array constructed of
|
1031
|
+
# hashes with other values. If block provided, is used to provide the
|
1032
|
+
# values, with parameters +row+ of dataset, +current+ last hash on
|
1033
|
+
# hierarchy and +name+ of the key to include
|
1034
|
+
def nest *tree_keys, &block
|
1035
|
+
tree_keys = tree_keys[0] if tree_keys[0].is_a? Array
|
1036
|
+
out = {}
|
1037
|
+
|
1038
|
+
each_row do |row|
|
1039
|
+
current = out
|
1040
|
+
# Create tree
|
1041
|
+
tree_keys[0, tree_keys.size-1].each do |f|
|
1042
|
+
root = row[f]
|
1043
|
+
current[root] ||= {}
|
1044
|
+
current = current[root]
|
1045
|
+
end
|
1046
|
+
name = row[tree_keys.last]
|
1047
|
+
if !block
|
1048
|
+
current[name] ||= []
|
1049
|
+
current[name].push(row.to_hash.delete_if { |key,value| tree_keys.include? key})
|
1050
|
+
else
|
1051
|
+
current[name] = block.call(row, current,name)
|
1052
|
+
end
|
1053
|
+
end
|
1054
|
+
|
1055
|
+
out
|
1056
|
+
end
|
1057
|
+
|
1058
|
+
def vector_count_characters vecs=nil
|
1059
|
+
vecs ||= @vectors.to_a
|
1060
|
+
|
1061
|
+
collect_row_with_index do |row, i|
|
1062
|
+
vecs.inject(0) do |memo, vec|
|
1063
|
+
memo + (row[vec].nil? ? 0 : row[vec].to_s.size)
|
1064
|
+
end
|
1065
|
+
end
|
1066
|
+
end
|
1067
|
+
|
1068
|
+
def add_vectors_by_split(name,join='-',sep=Daru::SPLIT_TOKEN)
|
1069
|
+
split = self[name].split_by_separator(sep)
|
1070
|
+
split.each { |k,v| self[(name.to_s + join + k.to_s).to_sym] = v }
|
1071
|
+
end
|
1072
|
+
|
422
1073
|
# Return the number of rows and columns of the DataFrame in an Array.
|
423
1074
|
def shape
|
424
1075
|
[@index.size, @vectors.size]
|
425
1076
|
end
|
426
1077
|
|
427
1078
|
# The number of rows
|
428
|
-
def
|
1079
|
+
def nrows
|
429
1080
|
shape[0]
|
430
1081
|
end
|
431
1082
|
|
432
1083
|
# The number of vectors
|
433
|
-
def
|
1084
|
+
def ncols
|
434
1085
|
shape[1]
|
435
1086
|
end
|
436
1087
|
|
@@ -439,11 +1090,37 @@ module Daru
|
|
439
1090
|
!!@vectors[*vector]
|
440
1091
|
end
|
441
1092
|
|
1093
|
+
def any? axis=:vector, &block
|
1094
|
+
if axis == :vector or axis == :column
|
1095
|
+
@data.any?(&block)
|
1096
|
+
elsif axis == :row
|
1097
|
+
each_row do |row|
|
1098
|
+
return true if yield(row)
|
1099
|
+
end
|
1100
|
+
return false
|
1101
|
+
else
|
1102
|
+
raise ArgumentError, "Unidentified axis #{axis}"
|
1103
|
+
end
|
1104
|
+
end
|
1105
|
+
|
1106
|
+
def all? axis=:vector, &block
|
1107
|
+
if axis == :vector or axis == :column
|
1108
|
+
@data.all?(&block)
|
1109
|
+
elsif axis == :row
|
1110
|
+
each_row do |row|
|
1111
|
+
return false unless yield(row)
|
1112
|
+
end
|
1113
|
+
return true
|
1114
|
+
else
|
1115
|
+
raise ArgumentError, "Unidentified axis #{axis}"
|
1116
|
+
end
|
1117
|
+
end
|
1118
|
+
|
442
1119
|
# The first ten elements of the DataFrame
|
443
1120
|
#
|
444
1121
|
# @param [Fixnum] quantity (10) The number of elements to display from the top.
|
445
1122
|
def head quantity=10
|
446
|
-
self[0..quantity, :row]
|
1123
|
+
self[0..(quantity-1), :row]
|
447
1124
|
end
|
448
1125
|
|
449
1126
|
# The last ten elements of the DataFrame
|
@@ -453,7 +1130,59 @@ module Daru
|
|
453
1130
|
self[(@size - quantity)..(@size-1), :row]
|
454
1131
|
end
|
455
1132
|
|
456
|
-
#
|
1133
|
+
# Returns a vector with sum of all vectors specified in the argument.
|
1134
|
+
# If vecs parameter is empty, sum all numeric vectors.
|
1135
|
+
def vector_sum vecs=nil
|
1136
|
+
vecs ||= numeric_vectors
|
1137
|
+
sum = Daru::Vector.new [0]*@size, index: @index, name: @name, dtype: @dtype
|
1138
|
+
|
1139
|
+
vecs.each do |n|
|
1140
|
+
sum += self[n]
|
1141
|
+
end
|
1142
|
+
|
1143
|
+
sum
|
1144
|
+
end
|
1145
|
+
|
1146
|
+
# Calculate mean of the rows of the dataframe.
|
1147
|
+
#
|
1148
|
+
# == Arguments
|
1149
|
+
#
|
1150
|
+
# * +max_missing+ - The maximum number of elements in the row that can be
|
1151
|
+
# missing for the mean calculation to happen. Defaults to 0.
|
1152
|
+
def vector_mean max_missing=0
|
1153
|
+
mean_vec = Daru::Vector.new [0]*@size, index: @index, name: "mean_#{@name}"
|
1154
|
+
|
1155
|
+
each_row_with_index do |row, i|
|
1156
|
+
mean_vec[i] = row.missing_positions.size > max_missing ? nil : row.mean
|
1157
|
+
end
|
1158
|
+
|
1159
|
+
mean_vec
|
1160
|
+
end
|
1161
|
+
|
1162
|
+
# Group elements by vector to perform operations on them. Returns a
|
1163
|
+
# Daru::Core::GroupBy object. See the Daru::Core::GroupBy docs for a detailed
|
1164
|
+
# list of possible operations.
|
1165
|
+
#
|
1166
|
+
# == Arguments
|
1167
|
+
#
|
1168
|
+
# * vectors - An Array containing names of vectors to group by.
|
1169
|
+
#
|
1170
|
+
# == Usage
|
1171
|
+
#
|
1172
|
+
# df = Daru::DataFrame.new({
|
1173
|
+
# a: %w{foo bar foo bar foo bar foo foo},
|
1174
|
+
# b: %w{one one two three two two one three},
|
1175
|
+
# c: [1 ,2 ,3 ,1 ,3 ,6 ,3 ,8],
|
1176
|
+
# d: [11 ,22 ,33 ,44 ,55 ,66 ,77 ,88]
|
1177
|
+
# })
|
1178
|
+
# df.group_by([:a,:b,:c]).groups
|
1179
|
+
# #=> {["bar", "one", 2]=>[1],
|
1180
|
+
# # ["bar", "three", 1]=>[3],
|
1181
|
+
# # ["bar", "two", 6]=>[5],
|
1182
|
+
# # ["foo", "one", 1]=>[0],
|
1183
|
+
# # ["foo", "one", 3]=>[6],
|
1184
|
+
# # ["foo", "three", 8]=>[7],
|
1185
|
+
# # ["foo", "two", 3]=>[2, 4]}
|
457
1186
|
def group_by vectors
|
458
1187
|
vectors = [vectors] if vectors.is_a?(Symbol)
|
459
1188
|
vectors.each { |v| raise(ArgumentError, "Vector #{v} does not exist") unless
|
@@ -462,6 +1191,13 @@ module Daru
|
|
462
1191
|
Daru::Core::GroupBy.new(self, vectors)
|
463
1192
|
end
|
464
1193
|
|
1194
|
+
def reindex_vectors! new_vectors
|
1195
|
+
raise ArgumentError, "Number of vectors passed into function (#{new_vectors.size}) should equal that present in the DataFrame (#{@vectors.size})" if
|
1196
|
+
@vectors.size != new_vectors.size
|
1197
|
+
|
1198
|
+
@vectors = Daru::Index.new new_vectors.map(&:to_sym), new_vectors.map { |e| @vectors[e] }
|
1199
|
+
end
|
1200
|
+
|
465
1201
|
# Change the index of the DataFrame and its underlying vectors. Destructive.
|
466
1202
|
#
|
467
1203
|
# @param [Symbol, Array] new_index Specify an Array if
|
@@ -481,19 +1217,58 @@ module Daru
|
|
481
1217
|
self.dup.reindex! new_index
|
482
1218
|
end
|
483
1219
|
|
484
|
-
# Return the
|
1220
|
+
# Return the indexes of all the numeric vectors. Will include vectors with nils
|
485
1221
|
# along with numbers.
|
486
1222
|
def numeric_vectors
|
487
1223
|
numerics = []
|
488
1224
|
|
489
|
-
|
1225
|
+
each_vector_with_index do |vec, i|
|
1226
|
+
numerics << i if(vec.type == :numeric)
|
1227
|
+
end
|
1228
|
+
numerics
|
1229
|
+
end
|
1230
|
+
|
1231
|
+
def numeric_vector_names
|
1232
|
+
numerics = []
|
1233
|
+
|
1234
|
+
each_vector do |vec, i|
|
490
1235
|
numerics << vec.name if(vec.type == :numeric)
|
491
1236
|
end
|
492
1237
|
numerics
|
493
1238
|
end
|
494
1239
|
|
1240
|
+
# Return a DataFrame of only the numerical Vectors. If clone: false
|
1241
|
+
# is specified as option, only a *view* of the Vectors will be
|
1242
|
+
# returned. Defaults to clone: true.
|
1243
|
+
def only_numerics opts={}
|
1244
|
+
cln = opts[:clone] == false ? false : true
|
1245
|
+
nv = numeric_vectors
|
1246
|
+
arry = nv.inject([]) do |arr, v|
|
1247
|
+
arr << self[v]
|
1248
|
+
arr
|
1249
|
+
end
|
1250
|
+
|
1251
|
+
order = @vectors.is_a?(MultiIndex) ? MultiIndex.new(nv) : Index.new(nv)
|
1252
|
+
Daru::DataFrame.new(arry, clone: cln, order: order, index: @index)
|
1253
|
+
end
|
1254
|
+
|
1255
|
+
# Generate a summary of this DataFrame with ReportBuilder.
|
1256
|
+
def summary(method = :to_text)
|
1257
|
+
ReportBuilder.new(no_title: true).add(self).send(method)
|
1258
|
+
end
|
1259
|
+
|
1260
|
+
def report_building(b) # :nodoc: #
|
1261
|
+
b.section(:name=>@name) do |g|
|
1262
|
+
g.text "Number of rows: #{nrows}"
|
1263
|
+
@vectors.each do |v|
|
1264
|
+
g.text "Element:[#{v}]"
|
1265
|
+
g.parse_element(self[v])
|
1266
|
+
end
|
1267
|
+
end
|
1268
|
+
end
|
1269
|
+
|
495
1270
|
# Sorts a dataframe (ascending/descending) according to the given sequence of
|
496
|
-
#
|
1271
|
+
# vectors, using the attributes provided in the blocks.
|
497
1272
|
#
|
498
1273
|
# @param order [Array] The order of vector names in which the DataFrame
|
499
1274
|
# should be sorted.
|
@@ -583,7 +1358,7 @@ module Daru
|
|
583
1358
|
elsif opts[:values].is_a?(Array)
|
584
1359
|
opts[:values]
|
585
1360
|
else # nil
|
586
|
-
(@vectors.to_a - (index | vectors)) &
|
1361
|
+
(@vectors.to_a - (index | vectors)) & numeric_vector_names
|
587
1362
|
end
|
588
1363
|
|
589
1364
|
raise IndexError, "No numeric vectors to aggregate" if values.empty?
|
@@ -634,6 +1409,195 @@ module Daru
|
|
634
1409
|
end
|
635
1410
|
end
|
636
1411
|
|
1412
|
+
# Merge vectors from two DataFrames. In case of name collision,
|
1413
|
+
# the vectors names are changed to x_1, x_2 ....
|
1414
|
+
#
|
1415
|
+
# @return {Daru::DataFrame}
|
1416
|
+
def merge other_df
|
1417
|
+
raise "Number of rows must be equal in this: #{nrows} and other: #{other_df.nrows}" unless nrows == other_df.nrows
|
1418
|
+
|
1419
|
+
new_fields = (@vectors.to_a + other_df.vectors.to_a)
|
1420
|
+
.recode_repeated
|
1421
|
+
.map(&:to_sym)
|
1422
|
+
df_new = DataFrame.new({}, order: new_fields)
|
1423
|
+
|
1424
|
+
(0...nrows).to_a.each do |i|
|
1425
|
+
row = self.row[i].to_a + other_df.row[i].to_a
|
1426
|
+
df_new.add_row(row)
|
1427
|
+
end
|
1428
|
+
|
1429
|
+
df_new.update
|
1430
|
+
df_new
|
1431
|
+
end
|
1432
|
+
|
1433
|
+
# Join 2 DataFrames by given fields
|
1434
|
+
# type is one of :left and :inner, default is :left
|
1435
|
+
#
|
1436
|
+
# Untested! Use at your own risk.
|
1437
|
+
#
|
1438
|
+
# @return {Daru::DataFrame}
|
1439
|
+
def join(other_ds,fields_1=[],fields_2=[],type=:left)
|
1440
|
+
fields_new = other_ds.vectors.to_a - fields_2
|
1441
|
+
fields = self.vectors.to_a + fields_new
|
1442
|
+
|
1443
|
+
other_ds_hash = {}
|
1444
|
+
other_ds.each_row do |row|
|
1445
|
+
key = row.to_hash.select { |k,v| fields_2.include?(k) }.values
|
1446
|
+
value = row.to_hash.select { |k,v| fields_new.include?(k) }
|
1447
|
+
|
1448
|
+
if other_ds_hash[key].nil?
|
1449
|
+
other_ds_hash[key] = [value]
|
1450
|
+
else
|
1451
|
+
other_ds_hash[key] << value
|
1452
|
+
end
|
1453
|
+
end
|
1454
|
+
|
1455
|
+
new_ds = DataFrame.new({}, order: fields)
|
1456
|
+
|
1457
|
+
self.each_row do |row|
|
1458
|
+
key = row.to_hash.select{|k,v| fields_1.include?(k)}.values
|
1459
|
+
new_case = row.to_hash
|
1460
|
+
|
1461
|
+
if other_ds_hash[key].nil?
|
1462
|
+
if type == :left
|
1463
|
+
fields_new.each{|field| new_case[field] = nil}
|
1464
|
+
new_ds.add_row(Daru::Vector.new(new_case))
|
1465
|
+
end
|
1466
|
+
else
|
1467
|
+
other_ds_hash[key].each do |new_values|
|
1468
|
+
new_ds.add_row(Daru::Vector.new(new_case.merge(new_values)))
|
1469
|
+
end
|
1470
|
+
end
|
1471
|
+
end
|
1472
|
+
|
1473
|
+
new_ds
|
1474
|
+
end
|
1475
|
+
|
1476
|
+
|
1477
|
+
# Creates a new dataset for one to many relations
|
1478
|
+
# on a dataset, based on pattern of field names.
|
1479
|
+
#
|
1480
|
+
# for example, you have a survey for number of children
|
1481
|
+
# with this structure:
|
1482
|
+
# id, name, child_name_1, child_age_1, child_name_2, child_age_2
|
1483
|
+
# with
|
1484
|
+
# ds.one_to_many([:id], "child_%v_%n")
|
1485
|
+
# the field of first parameters will be copied verbatim
|
1486
|
+
# to new dataset, and fields which responds to second
|
1487
|
+
# pattern will be added one case for each different %n.
|
1488
|
+
#
|
1489
|
+
# == Usage
|
1490
|
+
# cases=[
|
1491
|
+
# ['1','george','red',10,'blue',20,nil,nil],
|
1492
|
+
# ['2','fred','green',15,'orange',30,'white',20],
|
1493
|
+
# ['3','alfred',nil,nil,nil,nil,nil,nil]
|
1494
|
+
# ]
|
1495
|
+
# ds=Daru::DataFrame.rows(cases, order: [:id, :name, :car_color1, :car_value1, :car_color2, :car_value2, :car_color3, :car_value3])
|
1496
|
+
# ds.one_to_many([:id],'car_%v%n').to_matrix
|
1497
|
+
# => Matrix[
|
1498
|
+
# ["red", "1", 10],
|
1499
|
+
# ["blue", "1", 20],
|
1500
|
+
# ["green", "2", 15],
|
1501
|
+
# ["orange", "2", 30],
|
1502
|
+
# ["white", "2", 20]
|
1503
|
+
# ]
|
1504
|
+
#
|
1505
|
+
def one_to_many(parent_fields, pattern)
|
1506
|
+
re = Regexp.new pattern.gsub("%v","(.+?)").gsub("%n","(\\d+?)")
|
1507
|
+
ds_vars = parent_fields
|
1508
|
+
vars = []
|
1509
|
+
max_n = 0
|
1510
|
+
h = parent_fields.inject({}) { |a,v|
|
1511
|
+
a[v] = Daru::Vector.new([])
|
1512
|
+
a
|
1513
|
+
}
|
1514
|
+
# Adding _col_id
|
1515
|
+
h[:_col_id] = Daru::Vector.new([])
|
1516
|
+
ds_vars.push(:_col_id)
|
1517
|
+
|
1518
|
+
@vectors.each do |f|
|
1519
|
+
if f =~ re
|
1520
|
+
if !vars.include? $1
|
1521
|
+
vars.push($1)
|
1522
|
+
h[$1] = Daru::Vector.new([])
|
1523
|
+
end
|
1524
|
+
max_n = $2.to_i if max_n < $2.to_i
|
1525
|
+
end
|
1526
|
+
end
|
1527
|
+
ds = DataFrame.new(h, order: ds_vars+vars)
|
1528
|
+
|
1529
|
+
each_row do |row|
|
1530
|
+
row_out = {}
|
1531
|
+
parent_fields.each do |f|
|
1532
|
+
row_out[f]=row[f]
|
1533
|
+
end
|
1534
|
+
|
1535
|
+
max_n.times do |n1|
|
1536
|
+
n = n1+1
|
1537
|
+
any_data = false
|
1538
|
+
vars.each do |v|
|
1539
|
+
data = row[pattern.gsub("%v",v.to_s).gsub("%n",n.to_s).to_sym]
|
1540
|
+
row_out[v] = data
|
1541
|
+
any_data = true if !data.nil?
|
1542
|
+
end
|
1543
|
+
|
1544
|
+
if any_data
|
1545
|
+
row_out[:_col_id] = n
|
1546
|
+
ds.add_row(row_out)
|
1547
|
+
end
|
1548
|
+
end
|
1549
|
+
end
|
1550
|
+
ds.update
|
1551
|
+
ds
|
1552
|
+
end
|
1553
|
+
|
1554
|
+
def add_vectors_by_split_recode(name_, join='-', sep=Daru::SPLIT_TOKEN)
|
1555
|
+
split = self[name_].split_by_separator(sep)
|
1556
|
+
i = 1
|
1557
|
+
split.each { |k,v|
|
1558
|
+
new_field = name_.to_s + join + i.to_s
|
1559
|
+
v.rename name_.to_s + ":" + k.to_s
|
1560
|
+
self[new_field.to_sym] = v
|
1561
|
+
i += 1
|
1562
|
+
}
|
1563
|
+
end
|
1564
|
+
|
1565
|
+
# Create a SQL statement, based on a given Dataset
|
1566
|
+
#
|
1567
|
+
# == Arguments
|
1568
|
+
#
|
1569
|
+
# * table - String specifying name of the table that will be created in SQL.
|
1570
|
+
# * charset - Character set. Default is "UTF8".
|
1571
|
+
#
|
1572
|
+
# == Usage
|
1573
|
+
#
|
1574
|
+
# ds = Daru::DataFrame.new({
|
1575
|
+
# :id => Daru::Vector.new([1,2,3,4,5]),
|
1576
|
+
# :name => Daru::Vector.new(%w{Alex Peter Susan Mary John})
|
1577
|
+
# })
|
1578
|
+
# ds.create_sql('names')
|
1579
|
+
# ==>"CREATE TABLE names (id INTEGER,\n name VARCHAR (255)) CHARACTER SET=UTF8;"
|
1580
|
+
#
|
1581
|
+
def create_sql(table,charset="UTF8")
|
1582
|
+
sql = "CREATE TABLE #{table} ("
|
1583
|
+
fields = self.vectors.to_a.collect do |f|
|
1584
|
+
v = self[f]
|
1585
|
+
f.to_s + " " + v.db_type
|
1586
|
+
end
|
1587
|
+
|
1588
|
+
sql + fields.join(",\n ")+") CHARACTER SET=#{charset};"
|
1589
|
+
end
|
1590
|
+
|
1591
|
+
# Convert all numeric vectors to GSL::Matrix
|
1592
|
+
def to_gsl
|
1593
|
+
numerics_as_arrays = []
|
1594
|
+
numeric_vectors.each do |n|
|
1595
|
+
numerics_as_arrays << self[n].to_a
|
1596
|
+
end
|
1597
|
+
|
1598
|
+
GSL::Matrix.alloc *numerics_as_arrays.transpose
|
1599
|
+
end
|
1600
|
+
|
637
1601
|
# Convert all vectors of type *:numeric* into a Matrix.
|
638
1602
|
def to_matrix
|
639
1603
|
numerics_as_arrays = []
|
@@ -644,22 +1608,27 @@ module Daru
|
|
644
1608
|
Matrix.columns numerics_as_arrays
|
645
1609
|
end
|
646
1610
|
|
1611
|
+
# Return a Nyaplot::DataFrame from the data of this DataFrame.
|
1612
|
+
def to_nyaplotdf
|
1613
|
+
Nyaplot::DataFrame.new(to_a[0])
|
1614
|
+
end
|
1615
|
+
|
647
1616
|
# Convert all vectors of type *:numeric* and not containing nils into an NMatrix.
|
648
1617
|
def to_nmatrix
|
649
1618
|
numerics_as_arrays = []
|
650
1619
|
each_vector do |vector|
|
651
1620
|
numerics_as_arrays << vector.to_a if(vector.type == :numeric and
|
652
|
-
vector.
|
1621
|
+
vector.missing_positions.size == 0)
|
653
1622
|
end
|
654
1623
|
|
655
1624
|
numerics_as_arrays.transpose.to_nm
|
656
1625
|
end
|
657
1626
|
|
658
1627
|
# Converts the DataFrame into an array of hashes where key is vector name
|
659
|
-
#
|
660
|
-
#
|
661
|
-
#
|
662
|
-
#
|
1628
|
+
# and value is the corresponding element. The 0th index of the array contains
|
1629
|
+
# the array of hashes while the 1st index contains the indexes of each row
|
1630
|
+
# of the dataframe. Each element in the index array corresponds to its row
|
1631
|
+
# in the array of hashes, which has the same index.
|
663
1632
|
def to_a
|
664
1633
|
arry = [[],[]]
|
665
1634
|
self.each_row do |row|
|
@@ -678,9 +1647,26 @@ module Daru
|
|
678
1647
|
end
|
679
1648
|
end
|
680
1649
|
|
1650
|
+
# Converts DataFrame to a hash with keys as vector names and values as
|
1651
|
+
# the corresponding vectors.
|
1652
|
+
def to_hash
|
1653
|
+
hsh = {}
|
1654
|
+
@vectors.each_with_index do |vec_name, idx|
|
1655
|
+
hsh[vec_name] = @data[idx]
|
1656
|
+
end
|
1657
|
+
|
1658
|
+
hsh
|
1659
|
+
end
|
1660
|
+
|
681
1661
|
# Convert to html for IRuby.
|
682
1662
|
def to_html threshold=30
|
683
|
-
html
|
1663
|
+
html = "<table>" +
|
1664
|
+
"<tr>" +
|
1665
|
+
"<th colspan=\"#{@vectors.size+1}\">" +
|
1666
|
+
"Daru::DataFrame:#{self.object_id} " + " rows: #{nrows} " + " cols: #{ncols}"
|
1667
|
+
"</th>" +
|
1668
|
+
"</tr>"
|
1669
|
+
html +='<tr><th></th>'
|
684
1670
|
@vectors.each { |vector| html += '<th>' + vector.to_s + '</th>' }
|
685
1671
|
html += '</tr>'
|
686
1672
|
|
@@ -697,6 +1683,15 @@ module Daru
|
|
697
1683
|
html += '<tr>'
|
698
1684
|
(@vectors + 1).size.times { html += '<td>...</td>' }
|
699
1685
|
html += '</tr>'
|
1686
|
+
|
1687
|
+
last_index = @index.to_a.last
|
1688
|
+
last_row = self.row[last_index]
|
1689
|
+
html += '<tr>'
|
1690
|
+
html += "<td>" + last_index.to_s + "</td>"
|
1691
|
+
(0..(ncols - 1)).to_a.each do |i|
|
1692
|
+
html += '<td>' + last_row[i].to_s + '</td>'
|
1693
|
+
end
|
1694
|
+
html += '</tr>'
|
700
1695
|
break
|
701
1696
|
end
|
702
1697
|
end
|
@@ -709,6 +1704,87 @@ module Daru
|
|
709
1704
|
to_html
|
710
1705
|
end
|
711
1706
|
|
1707
|
+
# Method for updating the metadata (i.e. missing value positions) of the
|
1708
|
+
# vectors after assignment/deletion etc. are complete. This is provided so that
|
1709
|
+
# time is not wasted in creating the metadata for the vector each time
|
1710
|
+
# assignment/deletion of elements is done. Updating data this way is called
|
1711
|
+
# lazy loading. To set or unset lazy loading, see the .lazy_update= method.
|
1712
|
+
def update
|
1713
|
+
@data.each { |v| v.update } if Daru.lazy_update
|
1714
|
+
end
|
1715
|
+
|
1716
|
+
def rename new_name
|
1717
|
+
if new_name.is_a?(Numeric)
|
1718
|
+
@name = new_name
|
1719
|
+
return
|
1720
|
+
end
|
1721
|
+
@name = new_name.to_sym
|
1722
|
+
end
|
1723
|
+
|
1724
|
+
# Write this DataFrame to a CSV file.
|
1725
|
+
#
|
1726
|
+
# == Arguments
|
1727
|
+
#
|
1728
|
+
# * filename - Path of CSV file where the DataFrame is to be saved.
|
1729
|
+
#
|
1730
|
+
# == Options
|
1731
|
+
#
|
1732
|
+
# * convert_comma - If set to *true*, will convert any commas in any
|
1733
|
+
# of the data to full stops ('.').
|
1734
|
+
# All the options accepted by CSV.read() can also be passed into this
|
1735
|
+
# function.
|
1736
|
+
def write_csv filename, opts={}
|
1737
|
+
Daru::IO.dataframe_write_csv self, filename, opts
|
1738
|
+
end
|
1739
|
+
|
1740
|
+
# Write this dataframe to an Excel Spreadsheet
|
1741
|
+
#
|
1742
|
+
# == Arguments
|
1743
|
+
#
|
1744
|
+
# * filename - The path of the file where the DataFrame should be written.
|
1745
|
+
def write_excel filename, opts={}
|
1746
|
+
Daru::IO.dataframe_write_excel self, filename, opts
|
1747
|
+
end
|
1748
|
+
|
1749
|
+
# Insert each case of the Dataset on the selected table
|
1750
|
+
#
|
1751
|
+
# == Arguments
|
1752
|
+
#
|
1753
|
+
# * dbh - DBI database connection object.
|
1754
|
+
# * query - Query string.
|
1755
|
+
#
|
1756
|
+
# == Usage
|
1757
|
+
#
|
1758
|
+
# ds = Daru::DataFrame.new({:id=>Daru::Vector.new([1,2,3]), :name=>Daru::Vector.new(["a","b","c"])})
|
1759
|
+
# dbh = DBI.connect("DBI:Mysql:database:localhost", "user", "password")
|
1760
|
+
# ds.write_sql(dbh,"test")
|
1761
|
+
def write_sql dbh, table
|
1762
|
+
Daru::IO.dataframe_write_sql self, dbh, table
|
1763
|
+
end
|
1764
|
+
|
1765
|
+
|
1766
|
+
# Use marshalling to save dataframe to a file.
|
1767
|
+
def save filename
|
1768
|
+
Daru::IO.save self, filename
|
1769
|
+
end
|
1770
|
+
|
1771
|
+
def _dump depth
|
1772
|
+
Marshal.dump({
|
1773
|
+
data: @data,
|
1774
|
+
index: @index.to_a,
|
1775
|
+
order: @vectors.to_a,
|
1776
|
+
name: @name
|
1777
|
+
})
|
1778
|
+
end
|
1779
|
+
|
1780
|
+
def self._load data
|
1781
|
+
h = Marshal.load data
|
1782
|
+
Daru::DataFrame.new(h[:data],
|
1783
|
+
index: h[:index],
|
1784
|
+
order: h[:order],
|
1785
|
+
name: h[:name])
|
1786
|
+
end
|
1787
|
+
|
712
1788
|
# Change dtypes of vectors by supplying a hash of :vector_name => :new_dtype
|
713
1789
|
#
|
714
1790
|
# == Usage
|
@@ -733,9 +1809,9 @@ module Daru
|
|
733
1809
|
# Pretty print in a nice table format for the command line (irb/pry/iruby)
|
734
1810
|
def inspect spacing=10, threshold=15
|
735
1811
|
longest = [@name.to_s.size,
|
736
|
-
@vectors.map(&:to_s).map(&:size).max,
|
737
|
-
@index .map(&:to_s).map(&:size).max,
|
738
|
-
@data .map{ |v|
|
1812
|
+
(@vectors.map(&:to_s).map(&:size).max || 0),
|
1813
|
+
(@index .map(&:to_s).map(&:size).max || 0),
|
1814
|
+
(@data .map{ |v| v.map(&:to_s).map(&:size).max}.max || 0)].max
|
739
1815
|
|
740
1816
|
name = @name || 'nil'
|
741
1817
|
content = ""
|
@@ -901,6 +1977,8 @@ module Daru
|
|
901
1977
|
|
902
1978
|
def access_vector *names
|
903
1979
|
location = names[0]
|
1980
|
+
|
1981
|
+
return dup(@vectors[location]) if location.is_a?(Range)
|
904
1982
|
if @vectors.is_a?(MultiIndex)
|
905
1983
|
pos = vectors_index_for names
|
906
1984
|
|
@@ -996,41 +2074,68 @@ module Daru
|
|
996
2074
|
end
|
997
2075
|
|
998
2076
|
def insert_or_modify_vector name, vector
|
999
|
-
|
1000
|
-
|
2077
|
+
if vectors.is_a?(Index)
|
2078
|
+
name = name[0]
|
2079
|
+
end
|
1001
2080
|
|
1002
|
-
if
|
1003
|
-
|
1004
|
-
|
1005
|
-
|
2081
|
+
@vectors = @vectors + name if !@vectors.include?(name)
|
2082
|
+
v = nil
|
2083
|
+
|
2084
|
+
if @index.empty?
|
2085
|
+
v = vector.is_a?(Daru::Vector) ? vector : Daru::Vector.new(vector.to_a)
|
2086
|
+
@index = v.index
|
2087
|
+
@data[@vectors[name]] = v
|
2088
|
+
set_size
|
2089
|
+
|
2090
|
+
@data.map! do |v|
|
2091
|
+
if v.size == 0
|
2092
|
+
Daru::Vector.new([nil]*@size, name: set_name(name), index: @index)
|
2093
|
+
else
|
2094
|
+
v
|
2095
|
+
end
|
1006
2096
|
end
|
1007
2097
|
else
|
1008
|
-
|
1009
|
-
|
2098
|
+
if vector.is_a?(Daru::Vector)
|
2099
|
+
v = Daru::Vector.new [], name: set_name(name), index: @index
|
2100
|
+
@index.each do |idx|
|
2101
|
+
v[idx] = vector[idx]
|
2102
|
+
end
|
2103
|
+
else
|
2104
|
+
raise Exception, "Specified vector of length #{vector.size} cannot be inserted in DataFrame of size #{@size}" if
|
2105
|
+
@size != vector.size
|
1010
2106
|
|
1011
|
-
|
1012
|
-
|
2107
|
+
v = Daru::Vector.new(vector, name: set_name(name), index: @index)
|
2108
|
+
end
|
1013
2109
|
|
1014
|
-
|
2110
|
+
@data[@vectors[name]] = v
|
2111
|
+
end
|
1015
2112
|
end
|
1016
2113
|
|
1017
|
-
def insert_or_modify_row name, vector
|
1018
|
-
if
|
1019
|
-
|
1020
|
-
|
1021
|
-
@vectors.each do |vector|
|
1022
|
-
@data[@vectors[vector]][name] = v[vector]
|
1023
|
-
end
|
2114
|
+
def insert_or_modify_row name, vector
|
2115
|
+
if index.is_a?(MultiIndex)
|
2116
|
+
# TODO
|
1024
2117
|
else
|
1025
|
-
|
1026
|
-
v
|
2118
|
+
name = name[0]
|
2119
|
+
v =
|
2120
|
+
if vector.is_a?(Daru::Vector)
|
2121
|
+
vector
|
2122
|
+
else
|
2123
|
+
Daru::Vector.new(vector, name: set_name(name), index: @vectors)
|
2124
|
+
end
|
1027
2125
|
|
1028
|
-
@
|
1029
|
-
@
|
2126
|
+
if @index.include? name
|
2127
|
+
@vectors.each do |vector|
|
2128
|
+
@data[@vectors[vector]][name] = v[vector]
|
2129
|
+
end
|
2130
|
+
else
|
2131
|
+
@index = reassign_index_as(@index + name)
|
2132
|
+
@vectors.each do |vector|
|
2133
|
+
@data[@vectors[vector]].concat v[vector], name
|
2134
|
+
end
|
1030
2135
|
end
|
1031
|
-
end
|
1032
2136
|
|
1033
|
-
|
2137
|
+
set_size
|
2138
|
+
end
|
1034
2139
|
end
|
1035
2140
|
|
1036
2141
|
def create_empty_vectors
|
@@ -1081,18 +2186,22 @@ module Daru
|
|
1081
2186
|
def create_vectors_index_with vectors, source
|
1082
2187
|
vectors = source.keys.sort if vectors.nil?
|
1083
2188
|
|
2189
|
+
@vectors =
|
1084
2190
|
unless vectors.is_a?(Index) or vectors.is_a?(MultiIndex)
|
1085
|
-
|
2191
|
+
Daru::Index.new((vectors + (source.keys - vectors))
|
2192
|
+
.uniq
|
2193
|
+
.map { |e| e.respond_to?(:to_sym) ? e.to_sym : e }
|
2194
|
+
)
|
1086
2195
|
else
|
1087
|
-
|
2196
|
+
vectors
|
1088
2197
|
end
|
1089
2198
|
end
|
1090
2199
|
|
1091
2200
|
def all_vectors_have_equal_indexes? source
|
1092
|
-
|
2201
|
+
idx = source.values[0].index
|
1093
2202
|
|
1094
2203
|
source.all? do |name, vector|
|
1095
|
-
|
2204
|
+
idx == vector.index
|
1096
2205
|
end
|
1097
2206
|
end
|
1098
2207
|
|