daru 0.0.5 → 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.build.sh +14 -0
- data/.travis.yml +26 -4
- data/CONTRIBUTING.md +31 -0
- data/Gemfile +1 -2
- data/{History.txt → History.md} +110 -44
- data/README.md +21 -288
- data/Rakefile +1 -0
- data/daru.gemspec +12 -8
- data/lib/daru.rb +36 -1
- data/lib/daru/accessors/array_wrapper.rb +8 -3
- data/lib/daru/accessors/gsl_wrapper.rb +113 -0
- data/lib/daru/accessors/nmatrix_wrapper.rb +6 -17
- data/lib/daru/core/group_by.rb +0 -1
- data/lib/daru/dataframe.rb +1192 -83
- data/lib/daru/extensions/rserve.rb +21 -0
- data/lib/daru/index.rb +14 -0
- data/lib/daru/io/io.rb +170 -8
- data/lib/daru/maths/arithmetic/dataframe.rb +4 -3
- data/lib/daru/maths/arithmetic/vector.rb +4 -4
- data/lib/daru/maths/statistics/dataframe.rb +48 -27
- data/lib/daru/maths/statistics/vector.rb +215 -33
- data/lib/daru/monkeys.rb +53 -7
- data/lib/daru/multi_index.rb +21 -4
- data/lib/daru/plotting/dataframe.rb +83 -25
- data/lib/daru/plotting/vector.rb +9 -10
- data/lib/daru/vector.rb +596 -61
- data/lib/daru/version.rb +3 -0
- data/spec/accessors/wrappers_spec.rb +51 -0
- data/spec/core/group_by_spec.rb +0 -2
- data/spec/daru_spec.rb +58 -0
- data/spec/dataframe_spec.rb +768 -73
- data/spec/extensions/rserve_spec.rb +52 -0
- data/spec/fixtures/bank2.dat +200 -0
- data/spec/fixtures/repeated_fields.csv +7 -0
- data/spec/fixtures/scientific_notation.csv +4 -0
- data/spec/fixtures/test_xls.xls +0 -0
- data/spec/io/io_spec.rb +161 -24
- data/spec/math/arithmetic/dataframe_spec.rb +26 -7
- data/spec/math/arithmetic/vector_spec.rb +8 -0
- data/spec/math/statistics/dataframe_spec.rb +16 -1
- data/spec/math/statistics/vector_spec.rb +215 -47
- data/spec/spec_helper.rb +21 -2
- data/spec/vector_spec.rb +368 -12
- metadata +99 -16
- data/lib/version.rb +0 -3
- data/notebooks/grouping_splitting_pivots.ipynb +0 -529
- data/notebooks/intro_with_music_data_.ipynb +0 -303
data/Rakefile
CHANGED
data/daru.gemspec
CHANGED
@@ -1,9 +1,9 @@
|
|
1
1
|
# coding: utf-8
|
2
2
|
$:.unshift File.expand_path("../lib", __FILE__)
|
3
3
|
|
4
|
-
require 'version.rb'
|
4
|
+
require 'daru/version.rb'
|
5
5
|
|
6
|
-
DESCRIPTION = <<MSG
|
6
|
+
Daru::DESCRIPTION = <<MSG
|
7
7
|
Daru (Data Analysis in RUby) is a library for analysis, manipulation and visualization
|
8
8
|
of data.
|
9
9
|
|
@@ -18,7 +18,7 @@ Gem::Specification.new do |spec|
|
|
18
18
|
spec.authors = ['Sameer Deshmukh']
|
19
19
|
spec.email = ['sameer.deshmukh93@gmail.com']
|
20
20
|
spec.summary = %q{Data Analysis in RUby}
|
21
|
-
spec.description = DESCRIPTION
|
21
|
+
spec.description = Daru::DESCRIPTION
|
22
22
|
spec.homepage = "http://github.com/v0dro/daru"
|
23
23
|
spec.license = 'BSD-2'
|
24
24
|
|
@@ -27,12 +27,16 @@ Gem::Specification.new do |spec|
|
|
27
27
|
spec.test_files = spec.files.grep(%r{^(test|spec|features)/})
|
28
28
|
spec.require_paths = ["lib"]
|
29
29
|
|
30
|
-
spec.
|
30
|
+
spec.add_runtime_dependency 'reportbuilder', '~> 1.4'
|
31
|
+
spec.add_runtime_dependency 'spreadsheet', '~> 1.0.3'
|
32
|
+
|
33
|
+
spec.add_development_dependency 'bundler', '~> 1.10'
|
31
34
|
spec.add_development_dependency 'rake'
|
35
|
+
spec.add_development_dependency 'rserve-client', '~> 0.3'
|
32
36
|
spec.add_development_dependency 'rspec'
|
33
37
|
spec.add_development_dependency 'awesome_print'
|
34
|
-
spec.add_development_dependency 'nyaplot'
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
+
spec.add_development_dependency 'nyaplot', '~> 0.1.5'
|
39
|
+
spec.add_development_dependency 'nmatrix', '~> 0.1.0'
|
40
|
+
spec.add_development_dependency 'distribution', '~> 0.7'
|
41
|
+
spec.add_development_dependency 'gsl-nmatrix', '~>1.17'
|
38
42
|
end
|
data/lib/daru.rb
CHANGED
@@ -2,10 +2,45 @@ def jruby?
|
|
2
2
|
RUBY_ENGINE == 'jruby'
|
3
3
|
end
|
4
4
|
|
5
|
-
|
5
|
+
module Daru
|
6
|
+
SPLIT_TOKEN = ','
|
7
|
+
class << self
|
8
|
+
@@lazy_update = false
|
9
|
+
|
10
|
+
# A variable which will set whether Vector metadata is updated immediately or lazily.
|
11
|
+
# Call the #update method every time values are set or removed in order to update
|
12
|
+
# metadata like positions of missing values.
|
13
|
+
attr_accessor :lazy_update
|
14
|
+
|
15
|
+
def create_has_library(library)
|
16
|
+
define_singleton_method("has_#{library}?") do
|
17
|
+
cv = "@@#{library}"
|
18
|
+
unless class_variable_defined? cv
|
19
|
+
begin
|
20
|
+
require library.to_s
|
21
|
+
class_variable_set(cv, true)
|
22
|
+
rescue LoadError
|
23
|
+
class_variable_set(cv, false)
|
24
|
+
end
|
25
|
+
end
|
26
|
+
class_variable_get(cv)
|
27
|
+
end
|
28
|
+
end
|
29
|
+
end
|
30
|
+
|
31
|
+
create_has_library :gsl
|
32
|
+
create_has_library :nmatrix
|
33
|
+
create_has_library :nyaplot
|
34
|
+
end
|
35
|
+
|
36
|
+
autoload :Spreadsheet, 'spreadsheet'
|
37
|
+
autoload :CSV, 'csv'
|
38
|
+
|
6
39
|
require 'matrix'
|
7
40
|
require 'securerandom'
|
41
|
+
require 'reportbuilder'
|
8
42
|
|
43
|
+
require 'daru/version.rb'
|
9
44
|
require 'daru/index.rb'
|
10
45
|
require 'daru/multi_index.rb'
|
11
46
|
require 'daru/vector.rb'
|
@@ -3,13 +3,18 @@ module Daru
|
|
3
3
|
# Internal class for wrapping ruby array
|
4
4
|
class ArrayWrapper
|
5
5
|
include Enumerable
|
6
|
+
extend Forwardable
|
6
7
|
|
8
|
+
def_delegators :@data, :slice!
|
9
|
+
|
7
10
|
def each(&block)
|
8
11
|
@data.each(&block)
|
12
|
+
self
|
9
13
|
end
|
10
14
|
|
11
15
|
def map!(&block)
|
12
16
|
@data.map!(&block)
|
17
|
+
self
|
13
18
|
end
|
14
19
|
|
15
20
|
attr_accessor :size
|
@@ -22,8 +27,8 @@ module Daru
|
|
22
27
|
set_size
|
23
28
|
end
|
24
29
|
|
25
|
-
def [] index
|
26
|
-
@data[index]
|
30
|
+
def [] *index
|
31
|
+
@data[*index]
|
27
32
|
end
|
28
33
|
|
29
34
|
def []= index, value
|
@@ -62,7 +67,7 @@ module Daru
|
|
62
67
|
end
|
63
68
|
|
64
69
|
def mean
|
65
|
-
sum.quo(@size - @context.
|
70
|
+
sum.quo(@size - @context.missing_positions.size).to_f
|
66
71
|
end
|
67
72
|
|
68
73
|
def product
|
@@ -0,0 +1,113 @@
|
|
1
|
+
module Daru
|
2
|
+
module Accessors
|
3
|
+
module GSLStatistics
|
4
|
+
def vector_standardized_compute(m,sd)
|
5
|
+
Daru::Vector.new @data.collect { |x| (x.to_f - m).quo(sd) }, dtype: :gsl,
|
6
|
+
index: @context.index, name: @context.name
|
7
|
+
end
|
8
|
+
|
9
|
+
def vector_centered_compute(m)
|
10
|
+
Daru::Vector.new @data.collect {|x| (x.to_f - m)}, dtype: :gsl,
|
11
|
+
index: @context.index, name: @context.name
|
12
|
+
end
|
13
|
+
|
14
|
+
def sample_with_replacement(sample=1)
|
15
|
+
r = GSL::Rng.alloc(GSL::Rng::MT19937,rand(10000))
|
16
|
+
Daru::Vector.new(r.sample(@data, sample).to_a, dtype: :gsl,
|
17
|
+
index: @context.index, name: @context.name)
|
18
|
+
end
|
19
|
+
|
20
|
+
def sample_without_replacement(sample=1)
|
21
|
+
r = GSL::Rng.alloc(GSL::Rng::MT19937,rand(10000))
|
22
|
+
r.choose(@data, sample).to_a
|
23
|
+
end
|
24
|
+
|
25
|
+
def median
|
26
|
+
GSL::Stats::median_from_sorted_data(@data.sort)
|
27
|
+
end
|
28
|
+
|
29
|
+
def variance_sample(m)
|
30
|
+
@data.variance_m
|
31
|
+
end
|
32
|
+
|
33
|
+
def standard_deviation_sample(m)
|
34
|
+
@data.sd(m)
|
35
|
+
end
|
36
|
+
|
37
|
+
def variance_population(m)
|
38
|
+
@data.variance_with_fixed_mean(m)
|
39
|
+
end
|
40
|
+
|
41
|
+
def standard_deviation_population m
|
42
|
+
@data.sd_with_fixed_mean(m)
|
43
|
+
end
|
44
|
+
|
45
|
+
def skew
|
46
|
+
@data.skew
|
47
|
+
end
|
48
|
+
|
49
|
+
def kurtosis
|
50
|
+
@data.kurtosis
|
51
|
+
end
|
52
|
+
end
|
53
|
+
|
54
|
+
class GSLWrapper
|
55
|
+
include Enumerable
|
56
|
+
extend Forwardable
|
57
|
+
include Daru::Accessors::GSLStatistics
|
58
|
+
|
59
|
+
def_delegators :@data, :[], :size, :to_a, :each, :mean,
|
60
|
+
:sum, :prod, :max, :min
|
61
|
+
|
62
|
+
alias :product :prod
|
63
|
+
|
64
|
+
attr_reader :data
|
65
|
+
|
66
|
+
def each(&block)
|
67
|
+
@data.each(&block)
|
68
|
+
self
|
69
|
+
end
|
70
|
+
|
71
|
+
def map!(&block)
|
72
|
+
@data.map!(&block)
|
73
|
+
self
|
74
|
+
end
|
75
|
+
|
76
|
+
def initialize data, context
|
77
|
+
@data = ::GSL::Vector.alloc(data)
|
78
|
+
@context = context
|
79
|
+
end
|
80
|
+
|
81
|
+
def []= index, element
|
82
|
+
if index == size
|
83
|
+
push element
|
84
|
+
else
|
85
|
+
@data[index] = element
|
86
|
+
end
|
87
|
+
end
|
88
|
+
|
89
|
+
def delete_at index
|
90
|
+
@data.delete_at index
|
91
|
+
end
|
92
|
+
|
93
|
+
def index key
|
94
|
+
@data.to_a.index key
|
95
|
+
end
|
96
|
+
|
97
|
+
def push value
|
98
|
+
@data = @data.concat value
|
99
|
+
self
|
100
|
+
end
|
101
|
+
alias :<< :push
|
102
|
+
alias :concat :push
|
103
|
+
|
104
|
+
def dup
|
105
|
+
GSLWrapper.new(@data.to_a, @context)
|
106
|
+
end
|
107
|
+
|
108
|
+
def == other
|
109
|
+
@data == other.data
|
110
|
+
end
|
111
|
+
end
|
112
|
+
end
|
113
|
+
end if Daru.has_gsl?
|
@@ -1,9 +1,3 @@
|
|
1
|
-
begin
|
2
|
-
require 'nmatrix' unless jruby?
|
3
|
-
rescue LoadError => e
|
4
|
-
puts "Please install the nmatrix gem for fast and efficient data storage."
|
5
|
-
end
|
6
|
-
|
7
1
|
module Daru
|
8
2
|
module Accessors
|
9
3
|
# Internal class for wrapping NMatrix
|
@@ -12,23 +6,18 @@ module Daru
|
|
12
6
|
|
13
7
|
def each(&block)
|
14
8
|
@data[0...@size].each(&block)
|
15
|
-
|
16
|
-
|
17
|
-
def map(&block)
|
18
|
-
@data[0...@size].map(&block)
|
9
|
+
self
|
19
10
|
end
|
20
11
|
|
21
12
|
def map!(&block)
|
22
13
|
@data = NMatrix.new [@size*2], map(&block).to_a, dtype: nm_dtype
|
14
|
+
self
|
23
15
|
end
|
24
16
|
|
25
17
|
def inject(*args, &block)
|
26
18
|
@data[0...@size].inject(*args, &block)
|
27
19
|
end
|
28
20
|
|
29
|
-
alias_method :recode, :map
|
30
|
-
alias_method :recode!, :map!
|
31
|
-
|
32
21
|
attr_reader :size, :data, :nm_dtype
|
33
22
|
|
34
23
|
def initialize vector, context, nm_dtype=:int32
|
@@ -39,8 +28,8 @@ module Daru
|
|
39
28
|
# init with twice the storage for reducing the need to resize
|
40
29
|
end
|
41
30
|
|
42
|
-
def [] index
|
43
|
-
return @data[index] if index < @size
|
31
|
+
def [] *index
|
32
|
+
return @data[*index] if index[0] < @size
|
44
33
|
nil
|
45
34
|
end
|
46
35
|
|
@@ -79,7 +68,7 @@ module Daru
|
|
79
68
|
end
|
80
69
|
|
81
70
|
def dup
|
82
|
-
NMatrixWrapper.new @data.to_a, @context, @nm_dtype
|
71
|
+
NMatrixWrapper.new @data[0...@size].to_a, @context, @nm_dtype
|
83
72
|
end
|
84
73
|
|
85
74
|
def resize size = @size*2
|
@@ -109,4 +98,4 @@ module Daru
|
|
109
98
|
end
|
110
99
|
end
|
111
100
|
end
|
112
|
-
end
|
101
|
+
end if Daru.has_nmatrix?
|
data/lib/daru/core/group_by.rb
CHANGED
data/lib/daru/dataframe.rb
CHANGED
@@ -12,17 +12,82 @@ module Daru
|
|
12
12
|
|
13
13
|
include Daru::Maths::Arithmetic::DataFrame
|
14
14
|
include Daru::Maths::Statistics::DataFrame
|
15
|
-
include Daru::Plotting::DataFrame
|
15
|
+
include Daru::Plotting::DataFrame if Daru.has_nyaplot?
|
16
16
|
|
17
17
|
class << self
|
18
|
-
# Load data from a CSV file.
|
19
|
-
#
|
18
|
+
# Load data from a CSV file. Specify an optional block to grab the CSV
|
19
|
+
# object and pre-condition it (for example use the `convert` or
|
20
|
+
# `header_convert` methods).
|
20
21
|
#
|
21
|
-
#
|
22
|
+
# == Arguments
|
23
|
+
#
|
24
|
+
# * path - Path of the file to load specified as a String.
|
25
|
+
#
|
26
|
+
# == Options
|
27
|
+
#
|
28
|
+
# Accepts the same options as the Daru::DataFrame constructor and CSV.open()
|
29
|
+
# and uses those to eventually construct the resulting DataFrame.
|
30
|
+
#
|
31
|
+
# == Verbose Description
|
32
|
+
#
|
33
|
+
# You can specify all the options to the `.from_csv` function that you
|
34
|
+
# do to the Ruby `CSV.read()` function, since this is what is used internally.
|
35
|
+
#
|
36
|
+
# For example, if the columns in your CSV file are separated by something
|
37
|
+
# other than commas, you can use the `:col_sep` option. If you want to
|
38
|
+
# convert numeric values to numbers and not keep them as strings, you can
|
39
|
+
# use the `:converters` option and set it to `:numeric`.
|
40
|
+
#
|
41
|
+
# The `.from_csv` function uses the following defaults for reading CSV files
|
42
|
+
# (that are passed into the `CSV.read()` function):
|
43
|
+
#
|
44
|
+
# {
|
45
|
+
# :col_sep => ',',
|
46
|
+
# :converters => :numeric
|
47
|
+
# }
|
22
48
|
def from_csv path, opts={}, &block
|
23
49
|
Daru::IO.from_csv path, opts, &block
|
24
50
|
end
|
25
51
|
|
52
|
+
# Read data from an Excel file into a DataFrame.
|
53
|
+
#
|
54
|
+
# == Arguments
|
55
|
+
#
|
56
|
+
# * path - Path of the file to be read.
|
57
|
+
#
|
58
|
+
# == Options
|
59
|
+
#
|
60
|
+
# * :worksheet_id - ID of the worksheet that is to be read.
|
61
|
+
def from_excel path, opts={}, &block
|
62
|
+
Daru::IO.from_excel path, opts, &block
|
63
|
+
end
|
64
|
+
|
65
|
+
# Read a database query and returns a Dataset
|
66
|
+
#
|
67
|
+
# USE:
|
68
|
+
#
|
69
|
+
# dbh = DBI.connect("DBI:Mysql:database:localhost", "user", "password")
|
70
|
+
# Daru::DataFrame.from_sql(dbh, "SELECT * FROM test")
|
71
|
+
def from_sql dbh, query
|
72
|
+
Daru::IO.from_sql dbh, query
|
73
|
+
end
|
74
|
+
|
75
|
+
# Read the database from a plaintext file. For this method to work,
|
76
|
+
# the data should be present in a plain text file in columns. See
|
77
|
+
# spec/fixtures/bank2.dat for an example.
|
78
|
+
#
|
79
|
+
# == Arguments
|
80
|
+
#
|
81
|
+
# * path - Path of the file to be read.
|
82
|
+
# * fields - Vector names of the resulting database.
|
83
|
+
#
|
84
|
+
# == Usage
|
85
|
+
#
|
86
|
+
# df = Daru::DataFrame.from_plaintext 'spec/fixtures/bank2.dat', [:v1,:v2,:v3,:v4,:v5,:v6]
|
87
|
+
def from_plaintext path, fields
|
88
|
+
Daru::IO.from_plaintext path, fields
|
89
|
+
end
|
90
|
+
|
26
91
|
# Create DataFrame by specifying rows as an Array of Arrays or Array of
|
27
92
|
# Daru::Vector objects.
|
28
93
|
def rows source, opts={}
|
@@ -52,6 +117,58 @@ module Daru
|
|
52
117
|
|
53
118
|
df
|
54
119
|
end
|
120
|
+
|
121
|
+
# Generates a new dataset, using three vectors
|
122
|
+
# - Rows
|
123
|
+
# - Columns
|
124
|
+
# - Values
|
125
|
+
#
|
126
|
+
# For example, you have these values
|
127
|
+
#
|
128
|
+
# x y v
|
129
|
+
# a a 0
|
130
|
+
# a b 1
|
131
|
+
# b a 1
|
132
|
+
# b b 0
|
133
|
+
#
|
134
|
+
# You obtain
|
135
|
+
# id a b
|
136
|
+
# a 0 1
|
137
|
+
# b 1 0
|
138
|
+
#
|
139
|
+
# Useful to process outputs from databases
|
140
|
+
def crosstab_by_assignation rows, columns, values
|
141
|
+
raise "Three vectors should be equal size" if
|
142
|
+
rows.size != columns.size or rows.size!=values.size
|
143
|
+
|
144
|
+
cols_values = columns.factors
|
145
|
+
cols_n = cols_values.size
|
146
|
+
|
147
|
+
h_rows = rows.factors.inject({}) do |a,v|
|
148
|
+
a[v] = cols_values.inject({}) do |a1,v1|
|
149
|
+
a1[v1]=nil
|
150
|
+
a1
|
151
|
+
end
|
152
|
+
a
|
153
|
+
end
|
154
|
+
|
155
|
+
values.each_index do |i|
|
156
|
+
h_rows[rows[i]][columns[i]] = values[i]
|
157
|
+
end
|
158
|
+
df = Daru::DataFrame.new({}, order: [:_id] + cols_values.to_a)
|
159
|
+
|
160
|
+
rows.factors.each do |row|
|
161
|
+
n_row = Array.new(cols_n+1)
|
162
|
+
n_row[0] = row
|
163
|
+
cols_values.each_index do |i|
|
164
|
+
n_row[i+1] = h_rows[row][cols_values[i]]
|
165
|
+
end
|
166
|
+
|
167
|
+
df.add_row(n_row)
|
168
|
+
end
|
169
|
+
df.update
|
170
|
+
df
|
171
|
+
end
|
55
172
|
end
|
56
173
|
|
57
174
|
# The vectors (columns) index of the DataFrame
|
@@ -67,8 +184,29 @@ module Daru
|
|
67
184
|
attr_reader :size
|
68
185
|
|
69
186
|
# DataFrame basically consists of an Array of Vector objects.
|
70
|
-
#
|
71
|
-
#
|
187
|
+
# These objects are indexed by row and by column through the index and vectors Index objects.
|
188
|
+
#
|
189
|
+
# == Arguments
|
190
|
+
#
|
191
|
+
# * source - Source from which the DataFrame is to be initialized. Can be a Hash
|
192
|
+
# of names and vectors (array or Daru::Vector), an array of arrays or
|
193
|
+
# array of Daru::Vectors.
|
194
|
+
#
|
195
|
+
# == Options
|
196
|
+
#
|
197
|
+
# +:order+ - An *Array*/*Daru::Index*/*Daru::MultiIndex* containing the order in
|
198
|
+
# which Vectors should appear in the DataFrame.
|
199
|
+
#
|
200
|
+
# +:index+ - An *Array*/*Daru::Index*/*Daru::MultiIndex* containing the order
|
201
|
+
# in which rows of the DataFrame will be named.
|
202
|
+
#
|
203
|
+
# +:name+ - A name for the DataFrame.
|
204
|
+
#
|
205
|
+
# +:clone+ - Specify as *true* or *false*. When set to false, and Vector
|
206
|
+
# objects are passed for the source, the Vector objects will not be duplicated
|
207
|
+
# when creating the DataFrame. Will have no effect if Array is passed in
|
208
|
+
# the source, or if the passed Daru::Vectors have different indexes.
|
209
|
+
# Default to *true*.
|
72
210
|
#
|
73
211
|
# == Usage
|
74
212
|
# df = Daru::DataFrame.new({a: [1,2,3,4], b: [6,7,8,9]}, order: [:b, :a],
|
@@ -84,9 +222,12 @@ module Daru
|
|
84
222
|
def initialize source, opts={}
|
85
223
|
vectors = opts[:order]
|
86
224
|
index = opts[:index]
|
87
|
-
|
225
|
+
clone = opts[:clone] == false ? false : true
|
88
226
|
@data = []
|
89
227
|
|
228
|
+
temp_name = opts[:name]
|
229
|
+
@name = temp_name.is_a?(Numeric) ? temp_name : (temp_name || SecureRandom.uuid).to_sym
|
230
|
+
|
90
231
|
if source.empty?
|
91
232
|
@vectors = create_index vectors
|
92
233
|
@index = create_index index
|
@@ -109,7 +250,7 @@ module Daru
|
|
109
250
|
vectors.each_with_index do |name, idx|
|
110
251
|
hsh[name] = source[idx]
|
111
252
|
end
|
112
|
-
initialize(hsh, index: index, order: vectors, name: @name)
|
253
|
+
initialize(hsh, index: index, order: vectors, name: @name, clone: clone)
|
113
254
|
else # array of hashes
|
114
255
|
if vectors.nil?
|
115
256
|
@vectors = Daru::Index.new source[0].keys.map(&:to_sym)
|
@@ -143,13 +284,19 @@ module Daru
|
|
143
284
|
all_indexes.flatten!.uniq!.sort!
|
144
285
|
|
145
286
|
@index = Daru::Index.new all_indexes
|
287
|
+
clone = true
|
146
288
|
end
|
147
|
-
@vectors.each do |vector|
|
148
|
-
@data << Daru::Vector.new([], name: vector, index: @index)
|
149
289
|
|
150
|
-
|
151
|
-
|
290
|
+
if clone
|
291
|
+
@vectors.each do |vector|
|
292
|
+
@data << Daru::Vector.new([], name: vector, index: @index)
|
293
|
+
|
294
|
+
@index.each do |idx|
|
295
|
+
@data[@vectors[vector]][idx] = source[vector][idx]
|
296
|
+
end
|
152
297
|
end
|
298
|
+
else
|
299
|
+
@data.concat source.values
|
153
300
|
end
|
154
301
|
else
|
155
302
|
@index = create_index(index || source.values[0].size)
|
@@ -163,6 +310,7 @@ module Daru
|
|
163
310
|
|
164
311
|
set_size
|
165
312
|
validate
|
313
|
+
update
|
166
314
|
end
|
167
315
|
|
168
316
|
# Access row or vector. Specify name of row/vector followed by axis(:row, :vector).
|
@@ -176,6 +324,7 @@ module Daru
|
|
176
324
|
else
|
177
325
|
axis = :vector
|
178
326
|
end
|
327
|
+
names.map! { |e| e.respond_to?(:to_sym) ? e.to_sym : e }
|
179
328
|
|
180
329
|
if axis == :vector
|
181
330
|
access_vector *names
|
@@ -194,11 +343,14 @@ module Daru
|
|
194
343
|
# of the vector will be matched against the row/vector indexes of the DataFrame
|
195
344
|
# before an insertion is performed. Unmatched indexes will be set to nil.
|
196
345
|
def []=(*args)
|
197
|
-
|
198
|
-
|
346
|
+
axis = args.include?(:row) ? :row : :vector
|
347
|
+
args.delete :vector
|
348
|
+
args.delete :row
|
349
|
+
|
350
|
+
name = args[0..-2]
|
199
351
|
vector = args[-1]
|
352
|
+
name.map! { |e| e.respond_to?(:to_sym) ? e.to_sym : e }
|
200
353
|
|
201
|
-
axis = (!axis.is_a?(Symbol) and (axis != :vector or axis != :row)) ? :vector : axis
|
202
354
|
if axis == :vector
|
203
355
|
insert_or_modify_vector name, vector
|
204
356
|
elsif axis == :row
|
@@ -222,6 +374,14 @@ module Daru
|
|
222
374
|
vector[name]
|
223
375
|
end
|
224
376
|
|
377
|
+
def add_row row, index=nil
|
378
|
+
self.row[index || @size] = row
|
379
|
+
end
|
380
|
+
|
381
|
+
def add_vector n, vector
|
382
|
+
self[n] = vector
|
383
|
+
end
|
384
|
+
|
225
385
|
# Access a row or set/create a row. Refer #[] and #[]= docs for details.
|
226
386
|
#
|
227
387
|
# == Usage
|
@@ -232,13 +392,77 @@ module Daru
|
|
232
392
|
end
|
233
393
|
|
234
394
|
# Duplicate the DataFrame entirely.
|
235
|
-
|
236
|
-
|
237
|
-
|
238
|
-
|
395
|
+
#
|
396
|
+
# == Arguments
|
397
|
+
#
|
398
|
+
# * +vectors_to_dup+ - An Array specifying the names of Vectors to
|
399
|
+
# be duplicated. Will duplicate the entire DataFrame if not specified.
|
400
|
+
def dup vectors_to_dup=nil
|
401
|
+
vectors_to_dup = @vectors unless vectors_to_dup
|
402
|
+
|
403
|
+
new_order =
|
404
|
+
if vectors.is_a?(MultiIndex)
|
405
|
+
src = []
|
406
|
+
vectors_to_dup.each do |vec|
|
407
|
+
src << @data[@vectors[vec]].dup
|
408
|
+
end
|
409
|
+
|
410
|
+
Daru::MultiIndex.new(vectors_to_dup)
|
411
|
+
else
|
412
|
+
src = {}
|
413
|
+
vectors_to_dup.each do |vector|
|
414
|
+
src[vector] = @data[@vectors[vector]].dup
|
415
|
+
end
|
416
|
+
|
417
|
+
Daru::Index.new(vectors_to_dup)
|
418
|
+
end
|
419
|
+
|
420
|
+
Daru::DataFrame.new src, order: new_order, index: @index.dup, name: @name, clone: true
|
421
|
+
end
|
422
|
+
|
423
|
+
# Only clone the structure of the DataFrame.
|
424
|
+
def clone_structure
|
425
|
+
Daru::DataFrame.new([], order: @vectors.dup, index: @index.dup, name: @name)
|
426
|
+
end
|
427
|
+
|
428
|
+
# Returns a 'view' of the DataFrame, i.e the object ID's of vectors are
|
429
|
+
# preserved.
|
430
|
+
#
|
431
|
+
# == Arguments
|
432
|
+
#
|
433
|
+
# +vectors_to_clone+ - Names of vectors to clone. Optional. Will return
|
434
|
+
# a view of the whole data frame otherwise.
|
435
|
+
def clone *vectors_to_clone
|
436
|
+
vectors_to_clone.flatten! unless vectors_to_clone.all? { |a| !a.is_a?(Array) }
|
437
|
+
return super if vectors_to_clone.empty?
|
438
|
+
|
439
|
+
h = vectors_to_clone.inject({}) do |hsh, vec|
|
440
|
+
hsh[vec] = self[vec]
|
441
|
+
hsh
|
239
442
|
end
|
443
|
+
Daru::DataFrame.new(h, clone: false)
|
444
|
+
end
|
240
445
|
|
241
|
-
|
446
|
+
# Returns a 'shallow' copy of DataFrame if missing data is not present,
|
447
|
+
# or a full copy of only valid data if missing data is present.
|
448
|
+
def clone_only_valid
|
449
|
+
if has_missing_data?
|
450
|
+
dup_only_valid
|
451
|
+
else
|
452
|
+
clone
|
453
|
+
end
|
454
|
+
end
|
455
|
+
|
456
|
+
# Creates a new duplicate dataframe containing only rows
|
457
|
+
# without a single missing value.
|
458
|
+
def dup_only_valid vecs=nil
|
459
|
+
rows_with_nil = @data.inject([]) do |memo, vector|
|
460
|
+
memo.concat vector.missing_positions
|
461
|
+
memo
|
462
|
+
end.uniq
|
463
|
+
|
464
|
+
row_indexes = @index.to_a
|
465
|
+
(vecs.nil? ? self : dup(vecs)).row[*(row_indexes - rows_with_nil)]
|
242
466
|
end
|
243
467
|
|
244
468
|
# Iterate over each vector
|
@@ -286,21 +510,205 @@ module Daru
|
|
286
510
|
self
|
287
511
|
end
|
288
512
|
|
289
|
-
#
|
290
|
-
#
|
291
|
-
#
|
292
|
-
#
|
513
|
+
# Iterate over each row or vector of the DataFrame. Specify axis
|
514
|
+
# by passing :vector or :row as the argument. Default to :vector.
|
515
|
+
#
|
516
|
+
# == Description
|
517
|
+
#
|
518
|
+
# `#each` works exactly like Array#each. The default mode for `each`
|
519
|
+
# is to iterate over the columns of the DataFrame. To iterate over
|
520
|
+
# rows you must pass the axis, i.e `:row` as an argument.
|
521
|
+
#
|
522
|
+
# == Arguments
|
523
|
+
#
|
524
|
+
# * +axis+ - The axis to iterate over. Can be :vector (or :column)
|
525
|
+
# or :row. Default to :vector.
|
526
|
+
def each axis=:vector, &block
|
527
|
+
if axis == :vector or axis == :column
|
528
|
+
each_vector(&block)
|
529
|
+
elsif axis == :row
|
530
|
+
each_row(&block)
|
531
|
+
else
|
532
|
+
raise ArgumentError, "Unknown axis #{axis}"
|
533
|
+
end
|
534
|
+
end
|
535
|
+
|
536
|
+
# Iterate over a row or vector and return results in a Daru::Vector.
|
537
|
+
# Specify axis with :vector or :row. Default to :vector.
|
538
|
+
#
|
539
|
+
# == Description
|
540
|
+
#
|
541
|
+
# The #collect iterator works similar to #map, the only difference
|
542
|
+
# being that it returns a Daru::Vector comprising of the results of
|
543
|
+
# each block run. The resultant Vector has the same index as that
|
544
|
+
# of the axis over which collect has iterated. It also accepts the
|
545
|
+
# optional axis argument.
|
546
|
+
#
|
547
|
+
# == Arguments
|
548
|
+
#
|
549
|
+
# * +axis+ - The axis to iterate over. Can be :vector (or :column)
|
550
|
+
# or :row. Default to :vector.
|
551
|
+
def collect axis=:vector, &block
|
552
|
+
if axis == :vector or axis == :column
|
553
|
+
collect_vectors(&block)
|
554
|
+
elsif axis == :row
|
555
|
+
collect_rows(&block)
|
556
|
+
else
|
557
|
+
raise ArgumentError, "Unknown axis #{axis}"
|
558
|
+
end
|
559
|
+
end
|
560
|
+
|
561
|
+
# Map over each vector or row of the data frame according to
|
562
|
+
# the argument specified. Will return an Array of the resulting
|
563
|
+
# elements. To map over each row/vector and get a DataFrame,
|
564
|
+
# see #recode.
|
565
|
+
#
|
566
|
+
# == Description
|
567
|
+
#
|
568
|
+
# The #map iterator works like Array#map. The value returned by
|
569
|
+
# each run of the block is added to an Array and the Array is
|
570
|
+
# returned. This method also accepts an axis argument, like #each.
|
571
|
+
# The default is :vector.
|
572
|
+
#
|
573
|
+
# == Arguments
|
574
|
+
#
|
575
|
+
# * +axis+ - The axis to map over. Can be :vector (or :column) or :row.
|
576
|
+
# Default to :vector.
|
577
|
+
def map axis=:vector, &block
|
578
|
+
if axis == :vector or axis == :column
|
579
|
+
map_vectors(&block)
|
580
|
+
elsif axis == :row
|
581
|
+
map_rows(&block)
|
582
|
+
else
|
583
|
+
raise ArgumentError, "Unknown axis #{axis}"
|
584
|
+
end
|
585
|
+
end
|
586
|
+
|
587
|
+
# Destructive map. Modifies the DataFrame. Each run of the block
|
588
|
+
# must return a Daru::Vector. You can specify the axis to map over
|
589
|
+
# as the argument. Default to :vector.
|
590
|
+
#
|
591
|
+
# == Arguments
|
592
|
+
#
|
593
|
+
# * +axis+ - The axis to map over. Can be :vector (or :column) or :row.
|
594
|
+
# Default to :vector.
|
595
|
+
def map! axis=:vector, &block
|
596
|
+
if axis == :vector or axis == :column
|
597
|
+
map_vectors!(&block)
|
598
|
+
elsif axis == :row
|
599
|
+
map_rows!(&block)
|
600
|
+
end
|
601
|
+
end
|
602
|
+
|
603
|
+
# Maps over the DataFrame and returns a DataFrame. Each run of the
|
604
|
+
# block must return a Daru::Vector object. You can specify the axis
|
605
|
+
# to map over. Default to :vector.
|
606
|
+
#
|
607
|
+
# == Description
|
608
|
+
#
|
609
|
+
# Recode works similarly to #map, but an important difference between
|
610
|
+
# the two is that recode returns a modified Daru::DataFrame instead
|
611
|
+
# of an Array. For this reason, #recode expects every run of the
|
612
|
+
# block to return a Daru::Vector.
|
613
|
+
#
|
614
|
+
# Just like map and each, recode also accepts an optional _axis_ argument.
|
615
|
+
#
|
616
|
+
# == Arguments
|
617
|
+
#
|
618
|
+
# * +axis+ - The axis to map over. Can be :vector (or :column) or :row.
|
619
|
+
# Default to :vector.
|
620
|
+
def recode axis=:vector, &block
|
621
|
+
if axis == :vector or axis == :column
|
622
|
+
recode_vectors(&block)
|
623
|
+
elsif axis == :row
|
624
|
+
recode_rows(&block)
|
625
|
+
end
|
626
|
+
end
|
627
|
+
|
628
|
+
# Retain vectors or rows if the block returns a truthy value.
|
629
|
+
#
|
630
|
+
# == Description
|
631
|
+
#
|
632
|
+
# For filtering out certain rows/vectors based on their values,
|
633
|
+
# use the #filter method. By default it iterates over vectors and
|
634
|
+
# keeps those vectors for which the block returns true. It accepts
|
635
|
+
# an optional axis argument which lets you specify whether you want
|
636
|
+
# to iterate over vectors or rows.
|
637
|
+
#
|
638
|
+
# == Arguments
|
639
|
+
#
|
640
|
+
# * +axis+ - The axis to map over. Can be :vector (or :column) or :row.
|
641
|
+
# Default to :vector.
|
642
|
+
#
|
643
|
+
# == Usage
|
644
|
+
#
|
645
|
+
# # Filter vectors
|
646
|
+
#
|
647
|
+
# df.filter do |vector|
|
648
|
+
# vector.type == :numeric and vector.median < 50
|
649
|
+
# end
|
650
|
+
#
|
651
|
+
# # Filter rows
|
652
|
+
#
|
653
|
+
# df.filter(:row) do |row|
|
654
|
+
# row[:a] + row[:d] < 100
|
655
|
+
# end
|
656
|
+
def filter axis=:vector, &block
|
657
|
+
if axis == :vector or axis == :column
|
658
|
+
filter_vectors(&block)
|
659
|
+
elsif axis == :row
|
660
|
+
filter_rows(&block)
|
661
|
+
end
|
662
|
+
end
|
663
|
+
|
664
|
+
def recode_vectors &block
|
665
|
+
block_given? or return to_enum(:recode_vectors)
|
666
|
+
|
667
|
+
df = self.dup
|
668
|
+
df.each_vector_with_index do |v, i|
|
669
|
+
ret = yield v
|
670
|
+
ret.is_a?(Daru::Vector) or raise TypeError, "Every iteration must return Daru::Vector not #{ret.class}"
|
671
|
+
df[*i] = ret
|
672
|
+
end
|
673
|
+
|
674
|
+
df
|
675
|
+
end
|
676
|
+
|
677
|
+
def recode_rows &block
|
678
|
+
block_given? or return to_enum(:recode_rows)
|
679
|
+
|
680
|
+
df = self.dup
|
681
|
+
df.each_row_with_index do |r, i|
|
682
|
+
ret = yield r
|
683
|
+
ret.is_a?(Daru::Vector) or raise TypeError, "Every iteration must return Daru::Vector not #{ret.class}"
|
684
|
+
df.row[i] = ret
|
685
|
+
end
|
686
|
+
|
687
|
+
df
|
688
|
+
end
|
689
|
+
|
690
|
+
# Map each vector and return an Array.
|
293
691
|
def map_vectors(&block)
|
294
692
|
return to_enum(:map_vectors) unless block_given?
|
295
693
|
|
296
|
-
|
694
|
+
arry = []
|
695
|
+
@data.each do |vec|
|
696
|
+
arry << yield(vec)
|
697
|
+
end
|
698
|
+
|
699
|
+
arry
|
297
700
|
end
|
298
701
|
|
299
702
|
# Destructive form of #map_vectors
|
300
703
|
def map_vectors!(&block)
|
301
704
|
return to_enum(:map_vectors!) unless block_given?
|
302
705
|
|
303
|
-
|
706
|
+
vectors.dup.each do |n|
|
707
|
+
v = yield self[n]
|
708
|
+
v.is_a?(Daru::Vector) or raise TypeError, "Must return a Daru::Vector not #{v.class}"
|
709
|
+
self[n] = v
|
710
|
+
end
|
711
|
+
|
304
712
|
self
|
305
713
|
end
|
306
714
|
|
@@ -308,37 +716,114 @@ module Daru
|
|
308
716
|
def map_vectors_with_index(&block)
|
309
717
|
return to_enum(:map_vectors_with_index) unless block_given?
|
310
718
|
|
311
|
-
|
312
|
-
|
313
|
-
|
719
|
+
dt = []
|
720
|
+
each_vector_with_index do |vector, name|
|
721
|
+
dt << yield(vector, name)
|
314
722
|
end
|
315
723
|
|
316
|
-
|
724
|
+
dt
|
317
725
|
end
|
318
726
|
|
319
727
|
# Map each row
|
320
728
|
def map_rows(&block)
|
321
729
|
return to_enum(:map_rows) unless block_given?
|
322
730
|
|
323
|
-
|
324
|
-
|
325
|
-
|
731
|
+
dt = []
|
732
|
+
each_row do |row|
|
733
|
+
dt << yield(row)
|
326
734
|
end
|
327
735
|
|
328
|
-
|
736
|
+
dt
|
329
737
|
end
|
330
738
|
|
331
739
|
def map_rows_with_index(&block)
|
332
740
|
return to_enum(:map_rows_with_index) unless block_given?
|
333
741
|
|
334
|
-
|
335
|
-
|
336
|
-
|
742
|
+
dt = []
|
743
|
+
each_row_with_index do |row, index|
|
744
|
+
dt << yield(row, index)
|
337
745
|
end
|
338
746
|
|
339
|
-
|
747
|
+
dt
|
340
748
|
end
|
341
749
|
|
750
|
+
def map_rows!(&block)
|
751
|
+
return to_enum(:map_rows!) unless block_given?
|
752
|
+
|
753
|
+
index.dup.each do |i|
|
754
|
+
r = yield self.row[i]
|
755
|
+
r.is_a?(Daru::Vector) or raise TypeError, "Returned object must be Daru::Vector not #{r.class}"
|
756
|
+
self.row[i] = r
|
757
|
+
end
|
758
|
+
|
759
|
+
self
|
760
|
+
end
|
761
|
+
|
762
|
+
# Retrieves a Daru::Vector, based on the result of calculation
|
763
|
+
# performed on each row.
|
764
|
+
def collect_rows &block
|
765
|
+
return to_enum(:collect_rows) unless block_given?
|
766
|
+
|
767
|
+
data = []
|
768
|
+
each_row do |row|
|
769
|
+
data.push yield(row)
|
770
|
+
end
|
771
|
+
|
772
|
+
Daru::Vector.new(data, index: @index)
|
773
|
+
end
|
774
|
+
|
775
|
+
def collect_row_with_index &block
|
776
|
+
return to_enum(:collect_row_with_index) unless block_given?
|
777
|
+
|
778
|
+
data = []
|
779
|
+
each_row_with_index do |row, i|
|
780
|
+
data.push yield(row, i)
|
781
|
+
end
|
782
|
+
|
783
|
+
Daru::Vector.new(data, index: @index)
|
784
|
+
end
|
785
|
+
|
786
|
+
# Retrieves a Daru::Vector, based on the result of calculation
|
787
|
+
# performed on each vector.
|
788
|
+
def collect_vectors &block
|
789
|
+
return to_enum(:collect_vectors) unless block_given?
|
790
|
+
|
791
|
+
data = []
|
792
|
+
each_vector do |vec|
|
793
|
+
data.push yield(vec)
|
794
|
+
end
|
795
|
+
|
796
|
+
Daru::Vector.new(data, index: @vectors)
|
797
|
+
end
|
798
|
+
|
799
|
+
def collect_vector_with_index &block
|
800
|
+
return to_enum(:collect_vector_with_index) unless block_given?
|
801
|
+
|
802
|
+
data = []
|
803
|
+
each_vector_with_index do |vec, i|
|
804
|
+
data.push yield(vec, i)
|
805
|
+
end
|
806
|
+
|
807
|
+
Daru::Vector.new(data, index: @vectors)
|
808
|
+
end
|
809
|
+
|
810
|
+
# Generate a matrix, based on vector names of the DataFrame.
|
811
|
+
#
|
812
|
+
# @return {::Matrix}
|
813
|
+
def collect_matrix
|
814
|
+
return to_enum(:collect_matrix) unless block_given?
|
815
|
+
|
816
|
+
vecs = vectors.to_a
|
817
|
+
rows = vecs.collect { |row|
|
818
|
+
vecs.collect { |col|
|
819
|
+
yield row,col
|
820
|
+
}
|
821
|
+
}
|
822
|
+
|
823
|
+
Matrix.rows(rows)
|
824
|
+
end
|
825
|
+
|
826
|
+
|
342
827
|
# Delete a vector
|
343
828
|
def delete_vector vector
|
344
829
|
if @vectors.include? vector
|
@@ -367,6 +852,20 @@ module Daru
|
|
367
852
|
set_size
|
368
853
|
end
|
369
854
|
|
855
|
+
# Creates a DataFrame of n rows, each drawn at random from the rows of this DataFrame.
|
856
|
+
# If n not given, uses original number of rows.
|
857
|
+
#
|
858
|
+
# @return {Daru::DataFrame}
|
859
|
+
def bootstrap(n=nil)
  n ||= nrows
  ds_boot = Daru::DataFrame.new({}, order: @vectors)
  # Row positions must be drawn from the rows that actually exist, not from
  # the requested sample size: n may be larger or smaller than nrows.
  total_rows = nrows
  n.times do
    ds_boot.add_row(row[rand(total_rows)])
  end
  # Refresh the metadata of the new DataFrame once all rows have been added.
  ds_boot.update
  ds_boot
end
|
868
|
+
|
370
869
|
def keep_row_if &block
|
371
870
|
deletion = []
|
372
871
|
|
@@ -388,6 +887,16 @@ module Daru
|
|
388
887
|
end
|
389
888
|
end
|
390
889
|
|
890
|
+
# Creates a new vector with the data of a given field, taken from the rows for which the block returns true
|
891
|
+
def filter_vector vec
|
892
|
+
d = []
|
893
|
+
each_row do |row|
|
894
|
+
d.push(row[vec]) if yield row
|
895
|
+
end
|
896
|
+
|
897
|
+
Daru::Vector.new(d)
|
898
|
+
end
|
899
|
+
|
391
900
|
# Iterates over each row and retains it in a new DataFrame if the block returns
|
392
901
|
# true for that row.
|
393
902
|
def filter_rows &block
|
@@ -419,18 +928,160 @@ module Daru
|
|
419
928
|
df
|
420
929
|
end
|
421
930
|
|
931
|
+
# Test each row with one or more tests. Each test is an Array of the form
|
932
|
+
# *[message, fields, Proc.new {|row| row[:age] > 0}]*
|
933
|
+
#
|
934
|
+
# The function returns an array with all errors.
|
935
|
+
def verify(*tests)
|
936
|
+
if(tests[0].is_a? Symbol)
|
937
|
+
id = tests[0]
|
938
|
+
tests.shift
|
939
|
+
else
|
940
|
+
id = @vectors.first
|
941
|
+
end
|
942
|
+
|
943
|
+
vr = []
|
944
|
+
i = 0
|
945
|
+
each(:row) do |row|
|
946
|
+
i += 1
|
947
|
+
tests.each do |test|
|
948
|
+
if !test[2].call(row)
|
949
|
+
values = ""
|
950
|
+
if test[1].size>0
|
951
|
+
values = " (" + test[1].collect{ |k| "#{k}=#{row[k]}" }.join(", ") + ")"
|
952
|
+
end
|
953
|
+
vr.push("#{i} [#{row[id]}]: #{test[0]}#{values}")
|
954
|
+
end
|
955
|
+
end
|
956
|
+
end
|
957
|
+
vr
|
958
|
+
end
|
959
|
+
|
960
|
+
# DSL for yielding each row and returning a Daru::Vector based on the
|
961
|
+
# value each run of the block returns.
|
962
|
+
#
|
963
|
+
# == Usage
|
964
|
+
#
|
965
|
+
# a1 = Daru::Vector.new([1, 2, 3, 4, 5, 6, 7])
|
966
|
+
# a2 = Daru::Vector.new([10, 20, 30, 40, 50, 60, 70])
|
967
|
+
# a3 = Daru::Vector.new([100, 200, 300, 400, 500, 600, 700])
|
968
|
+
# ds = Daru::DataFrame.new({ :a => a1, :b => a2, :c => a3 })
|
969
|
+
# total = ds.vector_by_calculation { a + b + c }
|
970
|
+
# # <Daru::Vector:82314050 @name = nil @size = 7 >
|
971
|
+
# # nil
|
972
|
+
# # 0 111
|
973
|
+
# # 1 222
|
974
|
+
# # 2 333
|
975
|
+
# # 3 444
|
976
|
+
# # 4 555
|
977
|
+
# # 5 666
|
978
|
+
# # 6 777
|
979
|
+
def vector_by_calculation &block
|
980
|
+
a = []
|
981
|
+
each_row do |r|
|
982
|
+
a.push r.instance_eval(&block)
|
983
|
+
end
|
984
|
+
|
985
|
+
Daru::Vector.new a, index: @index
|
986
|
+
end
|
987
|
+
|
988
|
+
# Returns a vector, based on a string with a calculation based
|
989
|
+
# on vector.
|
990
|
+
#
|
991
|
+
# The calculation will be eval'ed, so you can put any variable
|
992
|
+
# or expression valid in Ruby.
|
993
|
+
#
|
994
|
+
# For example:
|
995
|
+
# a = Daru::Vector.new [1,2]
|
996
|
+
# b = Daru::Vector.new [3,4]
|
997
|
+
# ds = Daru::DataFrame.new({:a => a,:b => b})
|
998
|
+
# ds.compute("a+b")
|
999
|
+
# => Vector [4,6]
|
1000
|
+
def compute text, &block
|
1001
|
+
return instance_eval(&block) if block_given?
|
1002
|
+
instance_eval(text)
|
1003
|
+
end
|
1004
|
+
|
1005
|
+
# Return a vector with the number of missing values in each row.
|
1006
|
+
#
|
1007
|
+
# == Arguments
|
1008
|
+
#
|
1009
|
+
# * +missing_values+ - An Array of the values that should be
|
1010
|
+
# treated as 'missing'. The default missing value is *nil*.
|
1011
|
+
def missing_values_rows missing_values=[nil]
|
1012
|
+
number_of_missing = []
|
1013
|
+
each_row do |row|
|
1014
|
+
row.missing_values = missing_values
|
1015
|
+
number_of_missing << row.missing_positions.size
|
1016
|
+
end
|
1017
|
+
|
1018
|
+
Daru::Vector.new number_of_missing, index: @index, name: "#{@name}_missing_rows".to_sym
|
1019
|
+
end
|
1020
|
+
|
1021
|
+
# TODO: remove next version
|
1022
|
+
alias :vector_missing_values :missing_values_rows
|
1023
|
+
|
1024
|
+
def has_missing_data?
|
1025
|
+
!!@data.any? { |v| v.has_missing_data? }
|
1026
|
+
end
|
1027
|
+
|
1028
|
+
alias :flawed? :has_missing_data?
|
1029
|
+
|
1030
|
+
# Return a nested hash using vector names as keys and an array constructed of
|
1031
|
+
# hashes with other values. If block provided, is used to provide the
|
1032
|
+
# values, with parameters +row+ of dataset, +current+ last hash on
|
1033
|
+
# hierarchy and +name+ of the key to include
|
1034
|
+
def nest *tree_keys, &block
|
1035
|
+
tree_keys = tree_keys[0] if tree_keys[0].is_a? Array
|
1036
|
+
out = {}
|
1037
|
+
|
1038
|
+
each_row do |row|
|
1039
|
+
current = out
|
1040
|
+
# Create tree
|
1041
|
+
tree_keys[0, tree_keys.size-1].each do |f|
|
1042
|
+
root = row[f]
|
1043
|
+
current[root] ||= {}
|
1044
|
+
current = current[root]
|
1045
|
+
end
|
1046
|
+
name = row[tree_keys.last]
|
1047
|
+
if !block
|
1048
|
+
current[name] ||= []
|
1049
|
+
current[name].push(row.to_hash.delete_if { |key,value| tree_keys.include? key})
|
1050
|
+
else
|
1051
|
+
current[name] = block.call(row, current,name)
|
1052
|
+
end
|
1053
|
+
end
|
1054
|
+
|
1055
|
+
out
|
1056
|
+
end
|
1057
|
+
|
1058
|
+
def vector_count_characters vecs=nil
|
1059
|
+
vecs ||= @vectors.to_a
|
1060
|
+
|
1061
|
+
collect_row_with_index do |row, i|
|
1062
|
+
vecs.inject(0) do |memo, vec|
|
1063
|
+
memo + (row[vec].nil? ? 0 : row[vec].to_s.size)
|
1064
|
+
end
|
1065
|
+
end
|
1066
|
+
end
|
1067
|
+
|
1068
|
+
def add_vectors_by_split(name,join='-',sep=Daru::SPLIT_TOKEN)
|
1069
|
+
split = self[name].split_by_separator(sep)
|
1070
|
+
split.each { |k,v| self[(name.to_s + join + k.to_s).to_sym] = v }
|
1071
|
+
end
|
1072
|
+
|
422
1073
|
# Return the number of rows and columns of the DataFrame in an Array.
|
423
1074
|
def shape
|
424
1075
|
[@index.size, @vectors.size]
|
425
1076
|
end
|
426
1077
|
|
427
1078
|
# The number of rows
|
428
|
-
def
|
1079
|
+
def nrows
|
429
1080
|
shape[0]
|
430
1081
|
end
|
431
1082
|
|
432
1083
|
# The number of vectors
|
433
|
-
def
|
1084
|
+
def ncols
|
434
1085
|
shape[1]
|
435
1086
|
end
|
436
1087
|
|
@@ -439,11 +1090,37 @@ module Daru
|
|
439
1090
|
!!@vectors[*vector]
|
440
1091
|
end
|
441
1092
|
|
1093
|
+
def any? axis=:vector, &block
|
1094
|
+
if axis == :vector or axis == :column
|
1095
|
+
@data.any?(&block)
|
1096
|
+
elsif axis == :row
|
1097
|
+
each_row do |row|
|
1098
|
+
return true if yield(row)
|
1099
|
+
end
|
1100
|
+
return false
|
1101
|
+
else
|
1102
|
+
raise ArgumentError, "Unidentified axis #{axis}"
|
1103
|
+
end
|
1104
|
+
end
|
1105
|
+
|
1106
|
+
def all? axis=:vector, &block
|
1107
|
+
if axis == :vector or axis == :column
|
1108
|
+
@data.all?(&block)
|
1109
|
+
elsif axis == :row
|
1110
|
+
each_row do |row|
|
1111
|
+
return false unless yield(row)
|
1112
|
+
end
|
1113
|
+
return true
|
1114
|
+
else
|
1115
|
+
raise ArgumentError, "Unidentified axis #{axis}"
|
1116
|
+
end
|
1117
|
+
end
|
1118
|
+
|
442
1119
|
# The first ten elements of the DataFrame
|
443
1120
|
#
|
444
1121
|
# @param [Fixnum] quantity (10) The number of elements to display from the top.
|
445
1122
|
def head quantity=10
|
446
|
-
self[0..quantity, :row]
|
1123
|
+
self[0..(quantity-1), :row]
|
447
1124
|
end
|
448
1125
|
|
449
1126
|
# The last ten elements of the DataFrame
|
@@ -453,7 +1130,59 @@ module Daru
|
|
453
1130
|
self[(@size - quantity)..(@size-1), :row]
|
454
1131
|
end
|
455
1132
|
|
456
|
-
#
|
1133
|
+
# Returns a vector with sum of all vectors specified in the argument.
|
1134
|
+
# If the vecs parameter is not given, sums all numeric vectors.
|
1135
|
+
def vector_sum vecs=nil
|
1136
|
+
vecs ||= numeric_vectors
|
1137
|
+
sum = Daru::Vector.new [0]*@size, index: @index, name: @name, dtype: @dtype
|
1138
|
+
|
1139
|
+
vecs.each do |n|
|
1140
|
+
sum += self[n]
|
1141
|
+
end
|
1142
|
+
|
1143
|
+
sum
|
1144
|
+
end
|
1145
|
+
|
1146
|
+
# Calculate mean of the rows of the dataframe.
|
1147
|
+
#
|
1148
|
+
# == Arguments
|
1149
|
+
#
|
1150
|
+
# * +max_missing+ - The maximum number of elements in the row that can be
|
1151
|
+
# missing for the mean calculation to happen. Defaults to 0.
|
1152
|
+
def vector_mean max_missing=0
|
1153
|
+
mean_vec = Daru::Vector.new [0]*@size, index: @index, name: "mean_#{@name}"
|
1154
|
+
|
1155
|
+
each_row_with_index do |row, i|
|
1156
|
+
mean_vec[i] = row.missing_positions.size > max_missing ? nil : row.mean
|
1157
|
+
end
|
1158
|
+
|
1159
|
+
mean_vec
|
1160
|
+
end
|
1161
|
+
|
1162
|
+
# Group elements by vector to perform operations on them. Returns a
|
1163
|
+
# Daru::Core::GroupBy object. See the Daru::Core::GroupBy docs for a detailed
|
1164
|
+
# list of possible operations.
|
1165
|
+
#
|
1166
|
+
# == Arguments
|
1167
|
+
#
|
1168
|
+
# * vectors - An Array containing names of vectors to group by.
|
1169
|
+
#
|
1170
|
+
# == Usage
|
1171
|
+
#
|
1172
|
+
# df = Daru::DataFrame.new({
|
1173
|
+
# a: %w{foo bar foo bar foo bar foo foo},
|
1174
|
+
# b: %w{one one two three two two one three},
|
1175
|
+
# c: [1 ,2 ,3 ,1 ,3 ,6 ,3 ,8],
|
1176
|
+
# d: [11 ,22 ,33 ,44 ,55 ,66 ,77 ,88]
|
1177
|
+
# })
|
1178
|
+
# df.group_by([:a,:b,:c]).groups
|
1179
|
+
# #=> {["bar", "one", 2]=>[1],
|
1180
|
+
# # ["bar", "three", 1]=>[3],
|
1181
|
+
# # ["bar", "two", 6]=>[5],
|
1182
|
+
# # ["foo", "one", 1]=>[0],
|
1183
|
+
# # ["foo", "one", 3]=>[6],
|
1184
|
+
# # ["foo", "three", 8]=>[7],
|
1185
|
+
# # ["foo", "two", 3]=>[2, 4]}
|
457
1186
|
def group_by vectors
|
458
1187
|
vectors = [vectors] if vectors.is_a?(Symbol)
|
459
1188
|
vectors.each { |v| raise(ArgumentError, "Vector #{v} does not exist") unless
|
@@ -462,6 +1191,13 @@ module Daru
|
|
462
1191
|
Daru::Core::GroupBy.new(self, vectors)
|
463
1192
|
end
|
464
1193
|
|
1194
|
+
def reindex_vectors! new_vectors
|
1195
|
+
raise ArgumentError, "Number of vectors passed into function (#{new_vectors.size}) should equal that present in the DataFrame (#{@vectors.size})" if
|
1196
|
+
@vectors.size != new_vectors.size
|
1197
|
+
|
1198
|
+
@vectors = Daru::Index.new new_vectors.map(&:to_sym), new_vectors.map { |e| @vectors[e] }
|
1199
|
+
end
|
1200
|
+
|
465
1201
|
# Change the index of the DataFrame and its underlying vectors. Destructive.
|
466
1202
|
#
|
467
1203
|
# @param [Symbol, Array] new_index Specify an Array if
|
@@ -481,19 +1217,58 @@ module Daru
|
|
481
1217
|
self.dup.reindex! new_index
|
482
1218
|
end
|
483
1219
|
|
484
|
-
# Return the
|
1220
|
+
# Return the indexes of all the numeric vectors. Will include vectors with nils
|
485
1221
|
# along with numbers.
|
486
1222
|
def numeric_vectors
|
487
1223
|
numerics = []
|
488
1224
|
|
489
|
-
|
1225
|
+
each_vector_with_index do |vec, i|
|
1226
|
+
numerics << i if(vec.type == :numeric)
|
1227
|
+
end
|
1228
|
+
numerics
|
1229
|
+
end
|
1230
|
+
|
1231
|
+
def numeric_vector_names
|
1232
|
+
numerics = []
|
1233
|
+
|
1234
|
+
each_vector do |vec, i|
|
490
1235
|
numerics << vec.name if(vec.type == :numeric)
|
491
1236
|
end
|
492
1237
|
numerics
|
493
1238
|
end
|
494
1239
|
|
1240
|
+
# Return a DataFrame of only the numerical Vectors. If clone: false
|
1241
|
+
# is specified as option, only a *view* of the Vectors will be
|
1242
|
+
# returned. Defaults to clone: true.
|
1243
|
+
def only_numerics opts={}
|
1244
|
+
cln = opts[:clone] == false ? false : true
|
1245
|
+
nv = numeric_vectors
|
1246
|
+
arry = nv.inject([]) do |arr, v|
|
1247
|
+
arr << self[v]
|
1248
|
+
arr
|
1249
|
+
end
|
1250
|
+
|
1251
|
+
order = @vectors.is_a?(MultiIndex) ? MultiIndex.new(nv) : Index.new(nv)
|
1252
|
+
Daru::DataFrame.new(arry, clone: cln, order: order, index: @index)
|
1253
|
+
end
|
1254
|
+
|
1255
|
+
# Generate a summary of this DataFrame with ReportBuilder.
|
1256
|
+
def summary(method = :to_text)
|
1257
|
+
ReportBuilder.new(no_title: true).add(self).send(method)
|
1258
|
+
end
|
1259
|
+
|
1260
|
+
def report_building(b) # :nodoc: #
|
1261
|
+
b.section(:name=>@name) do |g|
|
1262
|
+
g.text "Number of rows: #{nrows}"
|
1263
|
+
@vectors.each do |v|
|
1264
|
+
g.text "Element:[#{v}]"
|
1265
|
+
g.parse_element(self[v])
|
1266
|
+
end
|
1267
|
+
end
|
1268
|
+
end
|
1269
|
+
|
495
1270
|
# Sorts a dataframe (ascending/descending) according to the given sequence of
|
496
|
-
#
|
1271
|
+
# vectors, using the attributes provided in the blocks.
|
497
1272
|
#
|
498
1273
|
# @param order [Array] The order of vector names in which the DataFrame
|
499
1274
|
# should be sorted.
|
@@ -583,7 +1358,7 @@ module Daru
|
|
583
1358
|
elsif opts[:values].is_a?(Array)
|
584
1359
|
opts[:values]
|
585
1360
|
else # nil
|
586
|
-
(@vectors.to_a - (index | vectors)) &
|
1361
|
+
(@vectors.to_a - (index | vectors)) & numeric_vector_names
|
587
1362
|
end
|
588
1363
|
|
589
1364
|
raise IndexError, "No numeric vectors to aggregate" if values.empty?
|
@@ -634,6 +1409,195 @@ module Daru
|
|
634
1409
|
end
|
635
1410
|
end
|
636
1411
|
|
1412
|
+
# Merge vectors from two DataFrames. In case of name collision,
|
1413
|
+
# the vectors names are changed to x_1, x_2 ....
|
1414
|
+
#
|
1415
|
+
# @return {Daru::DataFrame}
|
1416
|
+
def merge other_df
|
1417
|
+
raise "Number of rows must be equal in this: #{nrows} and other: #{other_df.nrows}" unless nrows == other_df.nrows
|
1418
|
+
|
1419
|
+
new_fields = (@vectors.to_a + other_df.vectors.to_a)
|
1420
|
+
.recode_repeated
|
1421
|
+
.map(&:to_sym)
|
1422
|
+
df_new = DataFrame.new({}, order: new_fields)
|
1423
|
+
|
1424
|
+
(0...nrows).to_a.each do |i|
|
1425
|
+
row = self.row[i].to_a + other_df.row[i].to_a
|
1426
|
+
df_new.add_row(row)
|
1427
|
+
end
|
1428
|
+
|
1429
|
+
df_new.update
|
1430
|
+
df_new
|
1431
|
+
end
|
1432
|
+
|
1433
|
+
# Join 2 DataFrames by given fields
|
1434
|
+
# type is one of :left and :inner, default is :left
|
1435
|
+
#
|
1436
|
+
# Untested! Use at your own risk.
|
1437
|
+
#
|
1438
|
+
# @return {Daru::DataFrame}
|
1439
|
+
def join(other_ds,fields_1=[],fields_2=[],type=:left)
|
1440
|
+
fields_new = other_ds.vectors.to_a - fields_2
|
1441
|
+
fields = self.vectors.to_a + fields_new
|
1442
|
+
|
1443
|
+
other_ds_hash = {}
|
1444
|
+
other_ds.each_row do |row|
|
1445
|
+
key = row.to_hash.select { |k,v| fields_2.include?(k) }.values
|
1446
|
+
value = row.to_hash.select { |k,v| fields_new.include?(k) }
|
1447
|
+
|
1448
|
+
if other_ds_hash[key].nil?
|
1449
|
+
other_ds_hash[key] = [value]
|
1450
|
+
else
|
1451
|
+
other_ds_hash[key] << value
|
1452
|
+
end
|
1453
|
+
end
|
1454
|
+
|
1455
|
+
new_ds = DataFrame.new({}, order: fields)
|
1456
|
+
|
1457
|
+
self.each_row do |row|
|
1458
|
+
key = row.to_hash.select{|k,v| fields_1.include?(k)}.values
|
1459
|
+
new_case = row.to_hash
|
1460
|
+
|
1461
|
+
if other_ds_hash[key].nil?
|
1462
|
+
if type == :left
|
1463
|
+
fields_new.each{|field| new_case[field] = nil}
|
1464
|
+
new_ds.add_row(Daru::Vector.new(new_case))
|
1465
|
+
end
|
1466
|
+
else
|
1467
|
+
other_ds_hash[key].each do |new_values|
|
1468
|
+
new_ds.add_row(Daru::Vector.new(new_case.merge(new_values)))
|
1469
|
+
end
|
1470
|
+
end
|
1471
|
+
end
|
1472
|
+
|
1473
|
+
new_ds
|
1474
|
+
end
|
1475
|
+
|
1476
|
+
|
1477
|
+
# Creates a new dataset for one to many relations
|
1478
|
+
# on a dataset, based on pattern of field names.
|
1479
|
+
#
|
1480
|
+
# for example, you have a survey for number of children
|
1481
|
+
# with this structure:
|
1482
|
+
# id, name, child_name_1, child_age_1, child_name_2, child_age_2
|
1483
|
+
# with
|
1484
|
+
# ds.one_to_many([:id], "child_%v_%n")
|
1485
|
+
# the field of first parameters will be copied verbatim
|
1486
|
+
# to new dataset, and fields which responds to second
|
1487
|
+
# pattern will be added one case for each different %n.
|
1488
|
+
#
|
1489
|
+
# == Usage
|
1490
|
+
# cases=[
|
1491
|
+
# ['1','george','red',10,'blue',20,nil,nil],
|
1492
|
+
# ['2','fred','green',15,'orange',30,'white',20],
|
1493
|
+
# ['3','alfred',nil,nil,nil,nil,nil,nil]
|
1494
|
+
# ]
|
1495
|
+
# ds=Daru::DataFrame.rows(cases, order: [:id, :name, :car_color1, :car_value1, :car_color2, :car_value2, :car_color3, :car_value3])
|
1496
|
+
# ds.one_to_many([:id],'car_%v%n').to_matrix
|
1497
|
+
# => Matrix[
|
1498
|
+
# ["red", "1", 10],
|
1499
|
+
# ["blue", "1", 20],
|
1500
|
+
# ["green", "2", 15],
|
1501
|
+
# ["orange", "2", 30],
|
1502
|
+
# ["white", "2", 20]
|
1503
|
+
# ]
|
1504
|
+
#
|
1505
|
+
def one_to_many(parent_fields, pattern)
  # Build the matcher from the pattern: %v captures the variable name and %n
  # its number. The expression is anchored to the whole field name so that a
  # multi-digit number is captured in full; unanchored, the lazy \d+? stops
  # after the first digit.
  re = Regexp.new("\\A(?:" + pattern.gsub("%v","(.+?)").gsub("%n","(\\d+?)") + ")\\z")
  # Work on a copy so that the caller's parent_fields Array is left untouched.
  ds_vars = parent_fields.dup
  vars = []
  max_n = 0
  h = parent_fields.inject({}) { |a,v|
    a[v] = Daru::Vector.new([])
    a
  }
  # Adding _col_id
  h[:_col_id] = Daru::Vector.new([])
  ds_vars.push(:_col_id)

  # Collect the distinct variable names and the highest number in use.
  @vectors.each do |f|
    match = re.match(f.to_s)
    next unless match

    var = match[1]
    unless vars.include? var
      vars.push(var)
      h[var] = Daru::Vector.new([])
    end
    max_n = match[2].to_i if max_n < match[2].to_i
  end
  ds = DataFrame.new(h, order: ds_vars+vars)

  each_row do |row|
    row_out = {}
    parent_fields.each do |f|
      row_out[f]=row[f]
    end

    # Emit one output row for every number that has at least one non-nil value.
    max_n.times do |n1|
      n = n1+1
      any_data = false
      vars.each do |v|
        data = row[pattern.gsub("%v",v.to_s).gsub("%n",n.to_s).to_sym]
        row_out[v] = data
        any_data = true if !data.nil?
      end

      if any_data
        row_out[:_col_id] = n
        ds.add_row(row_out)
      end
    end
  end
  ds.update
  ds
end
|
1553
|
+
|
1554
|
+
def add_vectors_by_split_recode(name_, join='-', sep=Daru::SPLIT_TOKEN)
|
1555
|
+
split = self[name_].split_by_separator(sep)
|
1556
|
+
i = 1
|
1557
|
+
split.each { |k,v|
|
1558
|
+
new_field = name_.to_s + join + i.to_s
|
1559
|
+
v.rename name_.to_s + ":" + k.to_s
|
1560
|
+
self[new_field.to_sym] = v
|
1561
|
+
i += 1
|
1562
|
+
}
|
1563
|
+
end
|
1564
|
+
|
1565
|
+
# Create a SQL CREATE TABLE statement, based on this DataFrame
|
1566
|
+
#
|
1567
|
+
# == Arguments
|
1568
|
+
#
|
1569
|
+
# * table - String specifying name of the table that will be created in SQL.
|
1570
|
+
# * charset - Character set. Default is "UTF8".
|
1571
|
+
#
|
1572
|
+
# == Usage
|
1573
|
+
#
|
1574
|
+
# ds = Daru::DataFrame.new({
|
1575
|
+
# :id => Daru::Vector.new([1,2,3,4,5]),
|
1576
|
+
# :name => Daru::Vector.new(%w{Alex Peter Susan Mary John})
|
1577
|
+
# })
|
1578
|
+
# ds.create_sql('names')
|
1579
|
+
# ==>"CREATE TABLE names (id INTEGER,\n name VARCHAR (255)) CHARACTER SET=UTF8;"
|
1580
|
+
#
|
1581
|
+
def create_sql(table,charset="UTF8")
|
1582
|
+
sql = "CREATE TABLE #{table} ("
|
1583
|
+
fields = self.vectors.to_a.collect do |f|
|
1584
|
+
v = self[f]
|
1585
|
+
f.to_s + " " + v.db_type
|
1586
|
+
end
|
1587
|
+
|
1588
|
+
sql + fields.join(",\n ")+") CHARACTER SET=#{charset};"
|
1589
|
+
end
|
1590
|
+
|
1591
|
+
# Convert all numeric vectors to GSL::Matrix
|
1592
|
+
def to_gsl
|
1593
|
+
numerics_as_arrays = []
|
1594
|
+
numeric_vectors.each do |n|
|
1595
|
+
numerics_as_arrays << self[n].to_a
|
1596
|
+
end
|
1597
|
+
|
1598
|
+
GSL::Matrix.alloc *numerics_as_arrays.transpose
|
1599
|
+
end
|
1600
|
+
|
637
1601
|
# Convert all vectors of type *:numeric* into a Matrix.
|
638
1602
|
def to_matrix
|
639
1603
|
numerics_as_arrays = []
|
@@ -644,22 +1608,27 @@ module Daru
|
|
644
1608
|
Matrix.columns numerics_as_arrays
|
645
1609
|
end
|
646
1610
|
|
1611
|
+
# Return a Nyaplot::DataFrame from the data of this DataFrame.
|
1612
|
+
def to_nyaplotdf
|
1613
|
+
Nyaplot::DataFrame.new(to_a[0])
|
1614
|
+
end
|
1615
|
+
|
647
1616
|
# Convert all vectors of type *:numeric* and not containing nils into an NMatrix.
|
648
1617
|
def to_nmatrix
|
649
1618
|
numerics_as_arrays = []
|
650
1619
|
each_vector do |vector|
|
651
1620
|
numerics_as_arrays << vector.to_a if(vector.type == :numeric and
|
652
|
-
vector.
|
1621
|
+
vector.missing_positions.size == 0)
|
653
1622
|
end
|
654
1623
|
|
655
1624
|
numerics_as_arrays.transpose.to_nm
|
656
1625
|
end
|
657
1626
|
|
658
1627
|
# Converts the DataFrame into an array of hashes where key is vector name
|
659
|
-
#
|
660
|
-
#
|
661
|
-
#
|
662
|
-
#
|
1628
|
+
# and value is the corresponding element. The 0th index of the array contains
|
1629
|
+
# the array of hashes while the 1st index contains the indexes of each row
|
1630
|
+
# of the dataframe. Each element in the index array corresponds to its row
|
1631
|
+
# in the array of hashes, which has the same index.
|
663
1632
|
def to_a
|
664
1633
|
arry = [[],[]]
|
665
1634
|
self.each_row do |row|
|
@@ -678,9 +1647,26 @@ module Daru
|
|
678
1647
|
end
|
679
1648
|
end
|
680
1649
|
|
1650
|
+
# Converts DataFrame to a hash with keys as vector names and values as
|
1651
|
+
# the corresponding vectors.
|
1652
|
+
def to_hash
|
1653
|
+
hsh = {}
|
1654
|
+
@vectors.each_with_index do |vec_name, idx|
|
1655
|
+
hsh[vec_name] = @data[idx]
|
1656
|
+
end
|
1657
|
+
|
1658
|
+
hsh
|
1659
|
+
end
|
1660
|
+
|
681
1661
|
# Convert to html for IRuby.
|
682
1662
|
def to_html threshold=30
|
683
|
-
html
|
1663
|
+
html = "<table>" +
|
1664
|
+
"<tr>" +
|
1665
|
+
"<th colspan=\"#{@vectors.size+1}\">" +
|
1666
|
+
"Daru::DataFrame:#{self.object_id} " + " rows: #{nrows} " + " cols: #{ncols}"
|
1667
|
+
"</th>" +
|
1668
|
+
"</tr>"
|
1669
|
+
html +='<tr><th></th>'
|
684
1670
|
@vectors.each { |vector| html += '<th>' + vector.to_s + '</th>' }
|
685
1671
|
html += '</tr>'
|
686
1672
|
|
@@ -697,6 +1683,15 @@ module Daru
|
|
697
1683
|
html += '<tr>'
|
698
1684
|
(@vectors + 1).size.times { html += '<td>...</td>' }
|
699
1685
|
html += '</tr>'
|
1686
|
+
|
1687
|
+
last_index = @index.to_a.last
|
1688
|
+
last_row = self.row[last_index]
|
1689
|
+
html += '<tr>'
|
1690
|
+
html += "<td>" + last_index.to_s + "</td>"
|
1691
|
+
(0..(ncols - 1)).to_a.each do |i|
|
1692
|
+
html += '<td>' + last_row[i].to_s + '</td>'
|
1693
|
+
end
|
1694
|
+
html += '</tr>'
|
700
1695
|
break
|
701
1696
|
end
|
702
1697
|
end
|
@@ -709,6 +1704,87 @@ module Daru
|
|
709
1704
|
to_html
|
710
1705
|
end
|
711
1706
|
|
1707
|
+
# Method for updating the metadata (i.e. missing value positions) of the
|
1708
|
+
# vectors after assignment/deletion etc. are complete. This is provided so that
|
1709
|
+
# time is not wasted in creating the metadata for the vector each time
|
1710
|
+
# assignment/deletion of elements is done. Updating data this way is called
|
1711
|
+
# lazy loading. To set or unset lazy loading, see the .lazy_update= method.
|
1712
|
+
def update
|
1713
|
+
@data.each { |v| v.update } if Daru.lazy_update
|
1714
|
+
end
|
1715
|
+
|
1716
|
+
def rename new_name
|
1717
|
+
if new_name.is_a?(Numeric)
|
1718
|
+
@name = new_name
|
1719
|
+
return
|
1720
|
+
end
|
1721
|
+
@name = new_name.to_sym
|
1722
|
+
end
|
1723
|
+
|
1724
|
+
# Write this DataFrame to a CSV file.
|
1725
|
+
#
|
1726
|
+
# == Arguments
|
1727
|
+
#
|
1728
|
+
# * filename - Path of CSV file where the DataFrame is to be saved.
|
1729
|
+
#
|
1730
|
+
# == Options
|
1731
|
+
#
|
1732
|
+
# * convert_comma - If set to *true*, will convert any commas in any
|
1733
|
+
# of the data to full stops ('.').
|
1734
|
+
# All the options accepted by CSV.read() can also be passed into this
|
1735
|
+
# function.
|
1736
|
+
def write_csv filename, opts={}
|
1737
|
+
Daru::IO.dataframe_write_csv self, filename, opts
|
1738
|
+
end
|
1739
|
+
|
1740
|
+
# Write this dataframe to an Excel Spreadsheet
|
1741
|
+
#
|
1742
|
+
# == Arguments
|
1743
|
+
#
|
1744
|
+
# * filename - The path of the file where the DataFrame should be written.
|
1745
|
+
def write_excel filename, opts={}
|
1746
|
+
Daru::IO.dataframe_write_excel self, filename, opts
|
1747
|
+
end
|
1748
|
+
|
1749
|
+
# Insert each case of the Dataset on the selected table
|
1750
|
+
#
|
1751
|
+
# == Arguments
|
1752
|
+
#
|
1753
|
+
# * dbh - DBI database connection object.
|
1754
|
+
# * query - Query string.
|
1755
|
+
#
|
1756
|
+
# == Usage
|
1757
|
+
#
|
1758
|
+
# ds = Daru::DataFrame.new({:id=>Daru::Vector.new([1,2,3]), :name=>Daru::Vector.new(["a","b","c"])})
|
1759
|
+
# dbh = DBI.connect("DBI:Mysql:database:localhost", "user", "password")
|
1760
|
+
# ds.write_sql(dbh,"test")
|
1761
|
+
def write_sql dbh, table
|
1762
|
+
Daru::IO.dataframe_write_sql self, dbh, table
|
1763
|
+
end
|
1764
|
+
|
1765
|
+
|
1766
|
+
# Use marshalling to save dataframe to a file.
|
1767
|
+
def save filename
|
1768
|
+
Daru::IO.save self, filename
|
1769
|
+
end
|
1770
|
+
|
1771
|
+
def _dump depth
|
1772
|
+
Marshal.dump({
|
1773
|
+
data: @data,
|
1774
|
+
index: @index.to_a,
|
1775
|
+
order: @vectors.to_a,
|
1776
|
+
name: @name
|
1777
|
+
})
|
1778
|
+
end
|
1779
|
+
|
1780
|
+
def self._load data
|
1781
|
+
h = Marshal.load data
|
1782
|
+
Daru::DataFrame.new(h[:data],
|
1783
|
+
index: h[:index],
|
1784
|
+
order: h[:order],
|
1785
|
+
name: h[:name])
|
1786
|
+
end
|
1787
|
+
|
712
1788
|
# Change dtypes of vectors by supplying a hash of :vector_name => :new_dtype
|
713
1789
|
#
|
714
1790
|
# == Usage
|
@@ -733,9 +1809,9 @@ module Daru
|
|
733
1809
|
# Pretty print in a nice table format for the command line (irb/pry/iruby)
|
734
1810
|
def inspect spacing=10, threshold=15
|
735
1811
|
longest = [@name.to_s.size,
|
736
|
-
@vectors.map(&:to_s).map(&:size).max,
|
737
|
-
@index .map(&:to_s).map(&:size).max,
|
738
|
-
@data .map{ |v|
|
1812
|
+
(@vectors.map(&:to_s).map(&:size).max || 0),
|
1813
|
+
(@index .map(&:to_s).map(&:size).max || 0),
|
1814
|
+
(@data .map{ |v| v.map(&:to_s).map(&:size).max}.max || 0)].max
|
739
1815
|
|
740
1816
|
name = @name || 'nil'
|
741
1817
|
content = ""
|
@@ -901,6 +1977,8 @@ module Daru
|
|
901
1977
|
|
902
1978
|
def access_vector *names
|
903
1979
|
location = names[0]
|
1980
|
+
|
1981
|
+
return dup(@vectors[location]) if location.is_a?(Range)
|
904
1982
|
if @vectors.is_a?(MultiIndex)
|
905
1983
|
pos = vectors_index_for names
|
906
1984
|
|
@@ -996,41 +2074,68 @@ module Daru
|
|
996
2074
|
end
|
997
2075
|
|
998
2076
|
def insert_or_modify_vector name, vector
|
999
|
-
|
1000
|
-
|
2077
|
+
if vectors.is_a?(Index)
|
2078
|
+
name = name[0]
|
2079
|
+
end
|
1001
2080
|
|
1002
|
-
if
|
1003
|
-
|
1004
|
-
|
1005
|
-
|
2081
|
+
@vectors = @vectors + name if !@vectors.include?(name)
|
2082
|
+
v = nil
|
2083
|
+
|
2084
|
+
if @index.empty?
|
2085
|
+
v = vector.is_a?(Daru::Vector) ? vector : Daru::Vector.new(vector.to_a)
|
2086
|
+
@index = v.index
|
2087
|
+
@data[@vectors[name]] = v
|
2088
|
+
set_size
|
2089
|
+
|
2090
|
+
@data.map! do |v|
|
2091
|
+
if v.size == 0
|
2092
|
+
Daru::Vector.new([nil]*@size, name: set_name(name), index: @index)
|
2093
|
+
else
|
2094
|
+
v
|
2095
|
+
end
|
1006
2096
|
end
|
1007
2097
|
else
|
1008
|
-
|
1009
|
-
|
2098
|
+
if vector.is_a?(Daru::Vector)
|
2099
|
+
v = Daru::Vector.new [], name: set_name(name), index: @index
|
2100
|
+
@index.each do |idx|
|
2101
|
+
v[idx] = vector[idx]
|
2102
|
+
end
|
2103
|
+
else
|
2104
|
+
raise Exception, "Specified vector of length #{vector.size} cannot be inserted in DataFrame of size #{@size}" if
|
2105
|
+
@size != vector.size
|
1010
2106
|
|
1011
|
-
|
1012
|
-
|
2107
|
+
v = Daru::Vector.new(vector, name: set_name(name), index: @index)
|
2108
|
+
end
|
1013
2109
|
|
1014
|
-
|
2110
|
+
@data[@vectors[name]] = v
|
2111
|
+
end
|
1015
2112
|
end
|
1016
2113
|
|
1017
|
-
def insert_or_modify_row name, vector
|
1018
|
-
if
|
1019
|
-
|
1020
|
-
|
1021
|
-
@vectors.each do |vector|
|
1022
|
-
@data[@vectors[vector]][name] = v[vector]
|
1023
|
-
end
|
2114
|
+
def insert_or_modify_row name, vector
|
2115
|
+
if index.is_a?(MultiIndex)
|
2116
|
+
# TODO
|
1024
2117
|
else
|
1025
|
-
|
1026
|
-
v
|
2118
|
+
name = name[0]
|
2119
|
+
v =
|
2120
|
+
if vector.is_a?(Daru::Vector)
|
2121
|
+
vector
|
2122
|
+
else
|
2123
|
+
Daru::Vector.new(vector, name: set_name(name), index: @vectors)
|
2124
|
+
end
|
1027
2125
|
|
1028
|
-
@
|
1029
|
-
@
|
2126
|
+
if @index.include? name
|
2127
|
+
@vectors.each do |vector|
|
2128
|
+
@data[@vectors[vector]][name] = v[vector]
|
2129
|
+
end
|
2130
|
+
else
|
2131
|
+
@index = reassign_index_as(@index + name)
|
2132
|
+
@vectors.each do |vector|
|
2133
|
+
@data[@vectors[vector]].concat v[vector], name
|
2134
|
+
end
|
1030
2135
|
end
|
1031
|
-
end
|
1032
2136
|
|
1033
|
-
|
2137
|
+
set_size
|
2138
|
+
end
|
1034
2139
|
end
|
1035
2140
|
|
1036
2141
|
def create_empty_vectors
|
@@ -1081,18 +2186,22 @@ module Daru
|
|
1081
2186
|
def create_vectors_index_with vectors, source
|
1082
2187
|
vectors = source.keys.sort if vectors.nil?
|
1083
2188
|
|
2189
|
+
@vectors =
|
1084
2190
|
unless vectors.is_a?(Index) or vectors.is_a?(MultiIndex)
|
1085
|
-
|
2191
|
+
Daru::Index.new((vectors + (source.keys - vectors))
|
2192
|
+
.uniq
|
2193
|
+
.map { |e| e.respond_to?(:to_sym) ? e.to_sym : e }
|
2194
|
+
)
|
1086
2195
|
else
|
1087
|
-
|
2196
|
+
vectors
|
1088
2197
|
end
|
1089
2198
|
end
|
1090
2199
|
|
1091
2200
|
def all_vectors_have_equal_indexes? source
|
1092
|
-
|
2201
|
+
idx = source.values[0].index
|
1093
2202
|
|
1094
2203
|
source.all? do |name, vector|
|
1095
|
-
|
2204
|
+
idx == vector.index
|
1096
2205
|
end
|
1097
2206
|
end
|
1098
2207
|
|