daru 0.1.1 → 0.1.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.travis.yml +1 -5
- data/CONTRIBUTING.md +2 -11
- data/History.md +18 -0
- data/README.md +109 -11
- data/daru.gemspec +11 -6
- data/images/README.md +5 -0
- data/images/con0.png +0 -0
- data/images/con1.png +0 -0
- data/images/init0.png +0 -0
- data/images/init1.png +0 -0
- data/images/man0.png +0 -0
- data/images/man1.png +0 -0
- data/images/man2.png +0 -0
- data/images/man3.png +0 -0
- data/images/man4.png +0 -0
- data/images/man5.png +0 -0
- data/images/man6.png +0 -0
- data/images/plot0.png +0 -0
- data/lib/daru.rb +5 -2
- data/lib/daru/core/group_by.rb +45 -45
- data/lib/daru/core/merge.rb +59 -1
- data/lib/daru/dataframe.rb +255 -226
- data/lib/daru/exceptions.rb +2 -0
- data/lib/daru/io/io.rb +41 -19
- data/lib/daru/io/sql_data_source.rb +116 -0
- data/lib/daru/vector.rb +124 -104
- data/lib/daru/version.rb +1 -1
- data/spec/core/group_by_spec.rb +12 -2
- data/spec/core/merge_spec.rb +14 -1
- data/spec/dataframe_spec.rb +189 -158
- data/spec/io/io_spec.rb +80 -2
- data/spec/io/sql_data_source_spec.rb +67 -0
- data/spec/spec_helper.rb +4 -2
- data/spec/support/database_helper.rb +30 -0
- data/spec/vector_spec.rb +45 -46
- metadata +104 -16
- data/.build.sh +0 -14
data/lib/daru/core/merge.rb
CHANGED
@@ -33,6 +33,14 @@ module Daru
|
|
33
33
|
hsh.each { |k,v| hsh[k] = v.to_a }
|
34
34
|
hsh
|
35
35
|
end
|
36
|
+
|
37
|
+
def arrayify df
|
38
|
+
arr = df.to_a
|
39
|
+
col_names = arr[0][0].keys
|
40
|
+
values = arr[0].map{|h| h.values}
|
41
|
+
|
42
|
+
return col_names, values
|
43
|
+
end
|
36
44
|
|
37
45
|
def inner_join df1, df2, df_hash1, df_hash2, on
|
38
46
|
joined_hash = {}
|
@@ -53,6 +61,52 @@ module Daru
|
|
53
61
|
Daru::DataFrame.new(joined_hash, order: joined_hash.keys)
|
54
62
|
end
|
55
63
|
|
64
|
+
def bf_inner_join df1, df2, on
|
65
|
+
col_names1, table1 = arrayify df1
|
66
|
+
col_names2, table2 = arrayify df2
|
67
|
+
|
68
|
+
#resolve duplicates
|
69
|
+
indicies1 = on.map{|i| col_names1.index(i)}
|
70
|
+
indicies2 = on.map{|i| col_names2.index(i)}
|
71
|
+
col_names2.map! do |name|
|
72
|
+
if (col_names1.include?(name))
|
73
|
+
col_names1[col_names1.index(name)] = (name.to_s + "_1").to_sym unless on.include?(name)
|
74
|
+
(name.to_s + "_2").to_sym
|
75
|
+
else
|
76
|
+
name
|
77
|
+
end
|
78
|
+
end
|
79
|
+
|
80
|
+
#combine key columns to a single column value
|
81
|
+
on_cols1 = table1.flat_map{|x| indicies1.map{|i| x[i].to_s}.join("+")}
|
82
|
+
on_cols2 = table2.flat_map{|x| indicies2.map{|i| x[i].to_s}.join("+")}
|
83
|
+
|
84
|
+
#parameters for a BF with approx 0.1% false positives
|
85
|
+
m = on_cols2.size * 15
|
86
|
+
k = 11
|
87
|
+
|
88
|
+
bf = BloomFilter::Native.new({:size => m, :hashes => k, :bucket => 1})
|
89
|
+
on_cols2.each{|x| bf.insert(x)}
|
90
|
+
|
91
|
+
x_ind = -1
|
92
|
+
joined_new = on_cols1.map do |x|
|
93
|
+
x_ind+=1
|
94
|
+
if (bf.include?(x))
|
95
|
+
{x_ind => on_cols2.each_index.select{|y_ind| on_cols2[y_ind] == x}}
|
96
|
+
else
|
97
|
+
{x_ind => []}
|
98
|
+
end
|
99
|
+
end
|
100
|
+
.reduce({}) {|h,pairs| pairs.each {|k,v| (h[k] ||= []) << v}; h}
|
101
|
+
.flat_map{|ind1, inds2| inds2.flatten.map{|ind2| [table1[ind1], table2[ind2]].flatten} if inds2.flatten.size > 0}
|
102
|
+
|
103
|
+
joined_cols = [col_names1, col_names2].flatten
|
104
|
+
df = Daru::DataFrame.rows(joined_new.compact, order: joined_cols)
|
105
|
+
on.each{|x| df.delete_vector (x.to_s + "_2").to_sym}
|
106
|
+
|
107
|
+
df
|
108
|
+
end
|
109
|
+
|
56
110
|
def full_outer_join df1, df2, df_hash1, df_hash2, on
|
57
111
|
left = left_outer_join df1, df2, df_hash1, df_hash2, on, true
|
58
112
|
right = right_outer_join df1, df2, df_hash1, df_hash2, on, true
|
@@ -153,7 +207,11 @@ module Daru
|
|
153
207
|
|
154
208
|
case opts[:how]
|
155
209
|
when :inner
|
156
|
-
|
210
|
+
if Daru.has_bloomfilter_rb?
|
211
|
+
helper.bf_inner_join df1, df2, on
|
212
|
+
else
|
213
|
+
helper.inner_join df1, df2, df_hash1, df_hash2, on
|
214
|
+
end
|
157
215
|
when :outer
|
158
216
|
helper.full_outer_join df1, df2, df_hash1, df_hash2, on
|
159
217
|
when :left
|
data/lib/daru/dataframe.rb
CHANGED
@@ -14,30 +14,30 @@ module Daru
|
|
14
14
|
include Daru::Plotting::DataFrame if Daru.has_nyaplot?
|
15
15
|
|
16
16
|
class << self
|
17
|
-
# Load data from a CSV file. Specify an optional block to grab the CSV
|
18
|
-
# object and pre-condition it (for example use the `convert` or
|
17
|
+
# Load data from a CSV file. Specify an optional block to grab the CSV
|
18
|
+
# object and pre-condition it (for example use the `convert` or
|
19
19
|
# `header_convert` methods).
|
20
|
-
#
|
20
|
+
#
|
21
21
|
# == Arguments
|
22
|
-
#
|
22
|
+
#
|
23
23
|
# * path - Path of the file to load specified as a String.
|
24
|
-
#
|
24
|
+
#
|
25
25
|
# == Options
|
26
|
-
#
|
26
|
+
#
|
27
27
|
# Accepts the same options as the Daru::DataFrame constructor and CSV.open()
|
28
28
|
# and uses those to eventually construct the resulting DataFrame.
|
29
29
|
#
|
30
30
|
# == Verbose Description
|
31
31
|
#
|
32
|
-
# You can specify all the options to the `.from_csv` function that you
|
32
|
+
# You can specify all the options to the `.from_csv` function that you
|
33
33
|
# do to the Ruby `CSV.read()` function, since this is what is used internally.
|
34
34
|
#
|
35
|
-
# For example, if the columns in your CSV file are separated by something
|
36
|
-
# other that commas, you can use the `:col_sep` option. If you want to
|
37
|
-
# convert numeric values to numbers and not keep them as strings, you can
|
35
|
+
# For example, if the columns in your CSV file are separated by something
|
36
|
+
# other that commas, you can use the `:col_sep` option. If you want to
|
37
|
+
# convert numeric values to numbers and not keep them as strings, you can
|
38
38
|
# use the `:converters` option and set it to `:numeric`.
|
39
39
|
#
|
40
|
-
# The `.from_csv` function uses the following defaults for reading CSV files
|
40
|
+
# The `.from_csv` function uses the following defaults for reading CSV files
|
41
41
|
# (that are passed into the `CSV.read()` function):
|
42
42
|
#
|
43
43
|
# {
|
@@ -45,24 +45,29 @@ module Daru
|
|
45
45
|
# :converters => :numeric
|
46
46
|
# }
|
47
47
|
def from_csv path, opts={}, &block
|
48
|
-
Daru::IO.from_csv path, opts, &block
|
48
|
+
Daru::IO.from_csv path, opts, &block
|
49
49
|
end
|
50
50
|
|
51
51
|
# Read data from an Excel file into a DataFrame.
|
52
|
-
#
|
52
|
+
#
|
53
53
|
# == Arguments
|
54
|
-
#
|
54
|
+
#
|
55
55
|
# * path - Path of the file to be read.
|
56
|
-
#
|
56
|
+
#
|
57
57
|
# == Options
|
58
|
-
#
|
58
|
+
#
|
59
59
|
# *:worksheet_id - ID of the worksheet that is to be read.
|
60
|
-
def from_excel path, opts={}, &block
|
60
|
+
def from_excel path, opts={}, &block
|
61
61
|
Daru::IO.from_excel path, opts, &block
|
62
62
|
end
|
63
63
|
|
64
64
|
# Execute a database query and return the result as a DataFrame
|
65
65
|
#
|
66
|
+
# @param dbh [DBI::DatabaseHandle] A DBI connection to be used to run the query
|
67
|
+
# @param query [String] The query to be executed
|
68
|
+
#
|
69
|
+
# @return A dataframe containing the data resulting from the query
|
70
|
+
#
|
66
71
|
# USE:
|
67
72
|
#
|
68
73
|
# dbh = DBI.connect("DBI:Mysql:database:localhost", "user", "password")
|
@@ -71,17 +76,37 @@ module Daru
|
|
71
76
|
Daru::IO.from_sql dbh, query
|
72
77
|
end
|
73
78
|
|
79
|
+
# Read a dataframe from AR::Relation
|
80
|
+
#
|
81
|
+
# @param relation [ActiveRecord::Relation] An AR::Relation object from which data is loaded
|
82
|
+
# @param fields [Array] Field names to be loaded (optional)
|
83
|
+
#
|
84
|
+
# @return A dataframe containing the data loaded from the relation
|
85
|
+
#
|
86
|
+
# USE:
|
87
|
+
#
|
88
|
+
# # When Post model is defined as:
|
89
|
+
# class Post < ActiveRecord::Base
|
90
|
+
# scope :active, -> { where.not(published_at: nil) }
|
91
|
+
# end
|
92
|
+
#
|
93
|
+
# # You can load active posts into a dataframe by:
|
94
|
+
# Daru::DataFrame.from_activerecord(Post.active, :title, :published_at)
|
95
|
+
def from_activerecord relation, *fields
|
96
|
+
Daru::IO.from_activerecord relation, *fields
|
97
|
+
end
|
98
|
+
|
74
99
|
# Read the database from a plaintext file. For this method to work,
|
75
100
|
# the data should be present in a plain text file in columns. See
|
76
101
|
# spec/fixtures/bank2.dat for an example.
|
77
|
-
#
|
102
|
+
#
|
78
103
|
# == Arguments
|
79
|
-
#
|
104
|
+
#
|
80
105
|
# * path - Path of the file to be read.
|
81
106
|
# * fields - Vector names of the resulting database.
|
82
|
-
#
|
107
|
+
#
|
83
108
|
# == Usage
|
84
|
-
#
|
109
|
+
#
|
85
110
|
# df = Daru::DataFrame.from_plaintext 'spec/fixtures/bank2.dat', [:v1,:v2,:v3,:v4,:v5,:v6]
|
86
111
|
def from_plaintext path, fields
|
87
112
|
Daru::IO.from_plaintext path, fields
|
@@ -137,15 +162,15 @@ module Daru
|
|
137
162
|
#
|
138
163
|
# Useful to process outputs from databases
|
139
164
|
def crosstab_by_assignation rows, columns, values
|
140
|
-
raise "Three vectors should be equal size" if
|
165
|
+
raise "Three vectors should be equal size" if
|
141
166
|
rows.size != columns.size or rows.size!=values.size
|
142
167
|
|
143
168
|
cols_values = columns.factors
|
144
169
|
cols_n = cols_values.size
|
145
170
|
|
146
|
-
h_rows = rows.factors.inject({}) do |a,v|
|
147
|
-
a[v] = cols_values.inject({}) do |a1,v1|
|
148
|
-
a1[v1]=nil
|
171
|
+
h_rows = rows.factors.inject({}) do |a,v|
|
172
|
+
a[v] = cols_values.inject({}) do |a1,v1|
|
173
|
+
a1[v1]=nil
|
149
174
|
a1
|
150
175
|
end
|
151
176
|
a
|
@@ -186,38 +211,38 @@ module Daru
|
|
186
211
|
# These objects are indexed by row and column by vectors and index Index objects.
|
187
212
|
#
|
188
213
|
# == Arguments
|
189
|
-
#
|
214
|
+
#
|
190
215
|
# * source - Source from the DataFrame is to be initialized. Can be a Hash
|
191
216
|
# of names and vectors (array or Daru::Vector), an array of arrays or
|
192
217
|
# array of Daru::Vectors.
|
193
|
-
#
|
218
|
+
#
|
194
219
|
# == Options
|
195
|
-
#
|
196
|
-
# +:order+ - An *Array*/*Daru::Index*/*Daru::MultiIndex* containing the order in
|
220
|
+
#
|
221
|
+
# +:order+ - An *Array*/*Daru::Index*/*Daru::MultiIndex* containing the order in
|
197
222
|
# which Vectors should appear in the DataFrame.
|
198
|
-
#
|
223
|
+
#
|
199
224
|
# +:index+ - An *Array*/*Daru::Index*/*Daru::MultiIndex* containing the order
|
200
225
|
# in which rows of the DataFrame will be named.
|
201
|
-
#
|
226
|
+
#
|
202
227
|
# +:name+ - A name for the DataFrame.
|
203
228
|
#
|
204
229
|
# +:clone+ - Specify as *true* or *false*. When set to false, and Vector
|
205
230
|
# objects are passed for the source, the Vector objects will not be duplicated
|
206
|
-
# when creating the DataFrame. Will have no effect if Array is passed in
|
207
|
-
# the source, or if the passed Daru::Vectors have different indexes.
|
231
|
+
# when creating the DataFrame. Will have no effect if Array is passed in
|
232
|
+
# the source, or if the passed Daru::Vectors have different indexes.
|
208
233
|
# Default to *true*.
|
209
|
-
#
|
234
|
+
#
|
210
235
|
# == Usage
|
211
|
-
# df = Daru::DataFrame.new({a: [1,2,3,4], b: [6,7,8,9]}, order: [:b, :a],
|
236
|
+
# df = Daru::DataFrame.new({a: [1,2,3,4], b: [6,7,8,9]}, order: [:b, :a],
|
212
237
|
# index: [:a, :b, :c, :d], name: :spider_man)
|
213
|
-
#
|
214
|
-
# # =>
|
238
|
+
#
|
239
|
+
# # =>
|
215
240
|
# # <Daru::DataFrame:80766980 @name = spider_man @size = 4>
|
216
|
-
# # b a
|
217
|
-
# # a 6 1
|
218
|
-
# # b 7 2
|
219
|
-
# # c 8 3
|
220
|
-
# # d 9 4
|
241
|
+
# # b a
|
242
|
+
# # a 6 1
|
243
|
+
# # b 7 2
|
244
|
+
# # c 8 3
|
245
|
+
# # d 9 4
|
221
246
|
def initialize source, opts={}
|
222
247
|
vectors = opts[:order]
|
223
248
|
index = opts[:index]
|
@@ -292,7 +317,7 @@ module Daru
|
|
292
317
|
@vectors.each do |vector|
|
293
318
|
# avoids matching indexes of vectors if all the supplied vectors
|
294
319
|
# have the same index.
|
295
|
-
if vectors_have_same_index
|
320
|
+
if vectors_have_same_index
|
296
321
|
v = source[vector].dup
|
297
322
|
else
|
298
323
|
v = Daru::Vector.new([], name: vector, index: @index)
|
@@ -331,8 +356,8 @@ module Daru
|
|
331
356
|
end
|
332
357
|
|
333
358
|
# Access row or vector. Specify name of row/vector followed by axis(:row, :vector).
|
334
|
-
# Defaults to *:vector*. Use of this method is not recommended for accessing
|
335
|
-
# rows or vectors. Use df.row[:a] for accessing row with index ':a' or
|
359
|
+
# Defaults to *:vector*. Use of this method is not recommended for accessing
|
360
|
+
# rows or vectors. Use df.row[:a] for accessing row with index ':a' or
|
336
361
|
# df.vector[:vec] for accessing vector with index *:vec*.
|
337
362
|
def [](*names)
|
338
363
|
if names[-1] == :vector or names[-1] == :row
|
@@ -354,7 +379,7 @@ module Daru
|
|
354
379
|
# Insert a new row/vector of the specified name or modify a previous row.
|
355
380
|
# Instead of using this method directly, use df.row[:a] = [1,2,3] to set/create
|
356
381
|
# a row ':a' to [1,2,3], or df.vector[:vec] = [1,2,3] for vectors.
|
357
|
-
#
|
382
|
+
#
|
358
383
|
# In case a Daru::Vector is specified after the equality the sign, the indexes
|
359
384
|
# of the vector will be matched against the row/vector indexes of the DataFrame
|
360
385
|
# before an insertion is performed. Unmatched indexes will be set to nil.
|
@@ -368,7 +393,7 @@ module Daru
|
|
368
393
|
|
369
394
|
if axis == :vector
|
370
395
|
insert_or_modify_vector name, vector
|
371
|
-
elsif axis == :row
|
396
|
+
elsif axis == :row
|
372
397
|
insert_or_modify_row name, vector
|
373
398
|
else
|
374
399
|
raise IndexError, "Expected axis to be row or vector, not #{axis}."
|
@@ -389,7 +414,7 @@ module Daru
|
|
389
414
|
end
|
390
415
|
|
391
416
|
# Access a row or set/create a row. Refer #[] and #[]= docs for details.
|
392
|
-
#
|
417
|
+
#
|
393
418
|
# == Usage
|
394
419
|
# df.row[:a] # access row named ':a'
|
395
420
|
# df.row[:b] = [1,2,3] # set row ':b' to [1,2,3]
|
@@ -398,17 +423,17 @@ module Daru
|
|
398
423
|
end
|
399
424
|
|
400
425
|
# Duplicate the DataFrame entirely.
|
401
|
-
#
|
426
|
+
#
|
402
427
|
# == Arguments
|
403
|
-
#
|
404
|
-
# * +vectors_to_dup+ - An Array specifying the names of Vectors to
|
428
|
+
#
|
429
|
+
# * +vectors_to_dup+ - An Array specifying the names of Vectors to
|
405
430
|
# be duplicated. Will duplicate the entire DataFrame if not specified.
|
406
431
|
def dup vectors_to_dup=nil
|
407
432
|
vectors_to_dup = @vectors.to_a unless vectors_to_dup
|
408
433
|
|
409
434
|
src = []
|
410
435
|
vectors_to_dup.each do |vec|
|
411
|
-
src << @data[@vectors[vec]].to_a
|
436
|
+
src << @data[@vectors[vec]].to_a.dup
|
412
437
|
end
|
413
438
|
new_order = Daru::Index.new(vectors_to_dup)
|
414
439
|
|
@@ -422,9 +447,9 @@ module Daru
|
|
422
447
|
|
423
448
|
# Returns a 'view' of the DataFrame, i.e the object ID's of vectors are
|
424
449
|
# preserved.
|
425
|
-
#
|
450
|
+
#
|
426
451
|
# == Arguments
|
427
|
-
#
|
452
|
+
#
|
428
453
|
# +vectors_to_clone+ - Names of vectors to clone. Optional. Will return
|
429
454
|
# a view of the whole data frame otherwise.
|
430
455
|
def clone *vectors_to_clone
|
@@ -438,7 +463,7 @@ module Daru
|
|
438
463
|
Daru::DataFrame.new(h, clone: false)
|
439
464
|
end
|
440
465
|
|
441
|
-
# Returns a 'shallow' copy of DataFrame if missing data is not present,
|
466
|
+
# Returns a 'shallow' copy of DataFrame if missing data is not present,
|
442
467
|
# or a full copy of only valid data if missing data is present.
|
443
468
|
def clone_only_valid
|
444
469
|
if has_missing_data?
|
@@ -448,7 +473,7 @@ module Daru
|
|
448
473
|
end
|
449
474
|
end
|
450
475
|
|
451
|
-
# Creates a new duplicate dataframe containing only rows
|
476
|
+
# Creates a new duplicate dataframe containing only rows
|
452
477
|
# without a single missing value.
|
453
478
|
def dup_only_valid vecs=nil
|
454
479
|
rows_with_nil = @data.inject([]) do |memo, vector|
|
@@ -485,7 +510,7 @@ module Daru
|
|
485
510
|
|
486
511
|
@vectors.each do |vector|
|
487
512
|
yield @data[@vectors[vector]], vector
|
488
|
-
end
|
513
|
+
end
|
489
514
|
|
490
515
|
self
|
491
516
|
end
|
@@ -518,12 +543,12 @@ module Daru
|
|
518
543
|
#
|
519
544
|
# == Description
|
520
545
|
#
|
521
|
-
# `#each` works exactly like Array#each. The default mode for `each`
|
522
|
-
# is to iterate over the columns of the DataFrame. To iterate over
|
546
|
+
# `#each` works exactly like Array#each. The default mode for `each`
|
547
|
+
# is to iterate over the columns of the DataFrame. To iterate over
|
523
548
|
# rows you must pass the axis, i.e `:row` as an argument.
|
524
|
-
#
|
549
|
+
#
|
525
550
|
# == Arguments
|
526
|
-
#
|
551
|
+
#
|
527
552
|
# * +axis+ - The axis to iterate over. Can be :vector (or :column)
|
528
553
|
# or :row. Default to :vector.
|
529
554
|
def each axis=:vector, &block
|
@@ -541,14 +566,14 @@ module Daru
|
|
541
566
|
#
|
542
567
|
# == Description
|
543
568
|
#
|
544
|
-
# The #collect iterator works similar to #map, the only difference
|
545
|
-
# being that it returns a Daru::Vector comprising of the results of
|
546
|
-
# each block run. The resultant Vector has the same index as that
|
547
|
-
# of the axis over which collect has iterated. It also accepts the
|
569
|
+
# The #collect iterator works similar to #map, the only difference
|
570
|
+
# being that it returns a Daru::Vector comprising of the results of
|
571
|
+
# each block run. The resultant Vector has the same index as that
|
572
|
+
# of the axis over which collect has iterated. It also accepts the
|
548
573
|
# optional axis argument.
|
549
574
|
#
|
550
575
|
# == Arguments
|
551
|
-
#
|
576
|
+
#
|
552
577
|
# * +axis+ - The axis to iterate over. Can be :vector (or :column)
|
553
578
|
# or :row. Default to :vector.
|
554
579
|
def collect axis=:vector, &block
|
@@ -565,16 +590,16 @@ module Daru
|
|
565
590
|
# the argument specified. Will return an Array of the resulting
|
566
591
|
# elements. To map over each row/vector and get a DataFrame,
|
567
592
|
# see #recode.
|
568
|
-
#
|
593
|
+
#
|
569
594
|
# == Description
|
570
|
-
#
|
571
|
-
# The #map iterator works like Array#map. The value returned by
|
572
|
-
# each run of the block is added to an Array and the Array is
|
573
|
-
# returned. This method also accepts an axis argument, like #each.
|
595
|
+
#
|
596
|
+
# The #map iterator works like Array#map. The value returned by
|
597
|
+
# each run of the block is added to an Array and the Array is
|
598
|
+
# returned. This method also accepts an axis argument, like #each.
|
574
599
|
# The default is :vector.
|
575
|
-
#
|
600
|
+
#
|
576
601
|
# == Arguments
|
577
|
-
#
|
602
|
+
#
|
578
603
|
# * +axis+ - The axis to map over. Can be :vector (or :column) or :row.
|
579
604
|
# Default to :vector.
|
580
605
|
def map axis=:vector, &block
|
@@ -590,9 +615,9 @@ module Daru
|
|
590
615
|
# Destructive map. Modifies the DataFrame. Each run of the block
|
591
616
|
# must return a Daru::Vector. You can specify the axis to map over
|
592
617
|
# as the argument. Default to :vector.
|
593
|
-
#
|
618
|
+
#
|
594
619
|
# == Arguments
|
595
|
-
#
|
620
|
+
#
|
596
621
|
# * +axis+ - The axis to map over. Can be :vector (or :column) or :row.
|
597
622
|
# Default to :vector.
|
598
623
|
def map! axis=:vector, &block
|
@@ -609,15 +634,15 @@ module Daru
|
|
609
634
|
#
|
610
635
|
# == Description
|
611
636
|
#
|
612
|
-
# Recode works similarly to #map, but an important difference between
|
613
|
-
# the two is that recode returns a modified Daru::DataFrame instead
|
614
|
-
# of an Array. For this reason, #recode expects that every run of the
|
637
|
+
# Recode works similarly to #map, but an important difference between
|
638
|
+
# the two is that recode returns a modified Daru::DataFrame instead
|
639
|
+
# of an Array. For this reason, #recode expects that every run of the
|
615
640
|
# block to return a Daru::Vector.
|
616
641
|
#
|
617
642
|
# Just like map and each, recode also accepts an optional _axis_ argument.
|
618
|
-
#
|
643
|
+
#
|
619
644
|
# == Arguments
|
620
|
-
#
|
645
|
+
#
|
621
646
|
# * +axis+ - The axis to map over. Can be :vector (or :column) or :row.
|
622
647
|
# Default to :vector.
|
623
648
|
def recode axis=:vector, &block
|
@@ -629,22 +654,22 @@ module Daru
|
|
629
654
|
end
|
630
655
|
|
631
656
|
# Retain vectors or rows if the block returns a truthy value.
|
632
|
-
#
|
657
|
+
#
|
633
658
|
# == Description
|
634
|
-
#
|
635
|
-
# For filtering out certain rows/vectors based on their values,
|
636
|
-
# use the #filter method. By default it iterates over vectors and
|
637
|
-
# keeps those vectors for which the block returns true. It accepts
|
638
|
-
# an optional axis argument which lets you specify whether you want
|
659
|
+
#
|
660
|
+
# For filtering out certain rows/vectors based on their values,
|
661
|
+
# use the #filter method. By default it iterates over vectors and
|
662
|
+
# keeps those vectors for which the block returns true. It accepts
|
663
|
+
# an optional axis argument which lets you specify whether you want
|
639
664
|
# to iterate over vectors or rows.
|
640
|
-
#
|
665
|
+
#
|
641
666
|
# == Arguments
|
642
|
-
#
|
667
|
+
#
|
643
668
|
# * +axis+ - The axis to map over. Can be :vector (or :column) or :row.
|
644
669
|
# Default to :vector.
|
645
|
-
#
|
670
|
+
#
|
646
671
|
# == Usage
|
647
|
-
#
|
672
|
+
#
|
648
673
|
# # Filter vectors
|
649
674
|
#
|
650
675
|
# df.filter do |vector|
|
@@ -665,12 +690,12 @@ module Daru
|
|
665
690
|
end
|
666
691
|
|
667
692
|
def recode_vectors &block
|
668
|
-
block_given? or return to_enum(:recode_vectors)
|
693
|
+
block_given? or return to_enum(:recode_vectors)
|
669
694
|
|
670
695
|
df = self.dup
|
671
696
|
df.each_vector_with_index do |v, i|
|
672
697
|
ret = yield v
|
673
|
-
ret.is_a?(Daru::Vector) or
|
698
|
+
ret.is_a?(Daru::Vector) or
|
674
699
|
raise TypeError, "Every iteration must return Daru::Vector not #{ret.class}"
|
675
700
|
df[*i] = ret
|
676
701
|
end
|
@@ -763,7 +788,7 @@ module Daru
|
|
763
788
|
self
|
764
789
|
end
|
765
790
|
|
766
|
-
# Retrieves a Daru::Vector, based on the result of calculation
|
791
|
+
# Retrieves a Daru::Vector, based on the result of calculation
|
767
792
|
# performed on each row.
|
768
793
|
def collect_rows &block
|
769
794
|
return to_enum(:collect_rows) unless block_given?
|
@@ -878,15 +903,15 @@ module Daru
|
|
878
903
|
|
879
904
|
deletion << index unless keep_row
|
880
905
|
end
|
881
|
-
deletion.each { |idx|
|
882
|
-
delete_row idx
|
906
|
+
deletion.each { |idx|
|
907
|
+
delete_row idx
|
883
908
|
}
|
884
909
|
end
|
885
910
|
|
886
911
|
def keep_vector_if &block
|
887
912
|
@vectors.each do |vector|
|
888
913
|
keep_vector = yield @data[@vectors[vector]], vector
|
889
|
-
|
914
|
+
|
890
915
|
delete_vector vector unless keep_vector
|
891
916
|
end
|
892
917
|
end
|
@@ -925,7 +950,7 @@ module Daru
|
|
925
950
|
# true for that vector.
|
926
951
|
def filter_vectors &block
|
927
952
|
return to_enum(:filter_vectors) unless block_given?
|
928
|
-
|
953
|
+
|
929
954
|
df = self.dup
|
930
955
|
df.keep_vector_if &block
|
931
956
|
|
@@ -934,7 +959,7 @@ module Daru
|
|
934
959
|
|
935
960
|
# Test each row with one or more tests. Each test is a Proc with the form
|
936
961
|
# *Proc.new {|row| row[:age] > 0}*
|
937
|
-
#
|
962
|
+
#
|
938
963
|
# The function returns an array with all errors.
|
939
964
|
def verify(*tests)
|
940
965
|
if(tests[0].is_a? Symbol)
|
@@ -963,9 +988,9 @@ module Daru
|
|
963
988
|
|
964
989
|
# DSL for yielding each row and returning a Daru::Vector based on the
|
965
990
|
# value each run of the block returns.
|
966
|
-
#
|
991
|
+
#
|
967
992
|
# == Usage
|
968
|
-
#
|
993
|
+
#
|
969
994
|
# a1 = Daru::Vector.new([1, 2, 3, 4, 5, 6, 7])
|
970
995
|
# a2 = Daru::Vector.new([10, 20, 30, 40, 50, 60, 70])
|
971
996
|
# a3 = Daru::Vector.new([100, 200, 300, 400, 500, 600, 700])
|
@@ -991,10 +1016,10 @@ module Daru
|
|
991
1016
|
|
992
1017
|
# Returns a vector, based on a string with a calculation based
|
993
1018
|
# on vector.
|
994
|
-
#
|
1019
|
+
#
|
995
1020
|
# The calculation will be eval'ed, so you can put any variable
|
996
1021
|
# or expression valid on ruby.
|
997
|
-
#
|
1022
|
+
#
|
998
1023
|
# For example:
|
999
1024
|
# a = Daru::Vector.new [1,2]
|
1000
1025
|
# b = Daru::Vector.new [3,4]
|
@@ -1003,14 +1028,14 @@ module Daru
|
|
1003
1028
|
# => Vector [4,6]
|
1004
1029
|
def compute text, &block
|
1005
1030
|
return instance_eval(&block) if block_given?
|
1006
|
-
instance_eval(text)
|
1031
|
+
instance_eval(text)
|
1007
1032
|
end
|
1008
1033
|
|
1009
1034
|
# Return a vector with the number of missing values in each row.
|
1010
|
-
#
|
1035
|
+
#
|
1011
1036
|
# == Arguments
|
1012
|
-
#
|
1013
|
-
# * +missing_values+ - An Array of the values that should be
|
1037
|
+
#
|
1038
|
+
# * +missing_values+ - An Array of the values that should be
|
1014
1039
|
# treated as 'missing'. The default missing value is *nil*.
|
1015
1040
|
def missing_values_rows missing_values=[nil]
|
1016
1041
|
number_of_missing = []
|
@@ -1031,9 +1056,9 @@ module Daru
|
|
1031
1056
|
|
1032
1057
|
alias :flawed? :has_missing_data?
|
1033
1058
|
|
1034
|
-
# Return a nested hash using vector names as keys and an array constructed of
|
1059
|
+
# Return a nested hash using vector names as keys and an array constructed of
|
1035
1060
|
# hashes with other values. If block provided, is used to provide the
|
1036
|
-
# values, with parameters +row+ of dataset, +current+ last hash on
|
1061
|
+
# values, with parameters +row+ of dataset, +current+ last hash on
|
1037
1062
|
# hierarchy and +name+ of the key to include
|
1038
1063
|
def nest *tree_keys, &block
|
1039
1064
|
tree_keys = tree_keys[0] if tree_keys[0].is_a? Array
|
@@ -1101,7 +1126,7 @@ module Daru
|
|
1101
1126
|
# @example Using any?
|
1102
1127
|
# df = Daru::DataFrame.new({a: [1,2,3,4,5], b: ['a', 'b', 'c', 'd', 'e']})
|
1103
1128
|
# df.any?(:row) do |row|
|
1104
|
-
# row[:a] < 3 and row[:b] == 'b'
|
1129
|
+
# row[:a] < 3 and row[:b] == 'b'
|
1105
1130
|
# end #=> true
|
1106
1131
|
def any? axis=:vector, &block
|
1107
1132
|
if axis == :vector or axis == :column
|
@@ -1123,7 +1148,7 @@ module Daru
|
|
1123
1148
|
# @example Using all?
|
1124
1149
|
# df = Daru::DataFrame.new({a: [1,2,3,4,5], b: ['a', 'b', 'c', 'd', 'e']})
|
1125
1150
|
# df.all?(:row) do |row|
|
1126
|
-
# row[:a] < 10
|
1151
|
+
# row[:a] < 10
|
1127
1152
|
# end #=> true
|
1128
1153
|
def all? axis=:vector, &block
|
1129
1154
|
if axis == :vector or axis == :column
|
@@ -1145,14 +1170,18 @@ module Daru
|
|
1145
1170
|
self[0..(quantity-1), :row]
|
1146
1171
|
end
|
1147
1172
|
|
1173
|
+
alias :first :head
|
1174
|
+
|
1148
1175
|
# The last ten elements of the DataFrame
|
1149
|
-
#
|
1176
|
+
#
|
1150
1177
|
# @param [Fixnum] quantity (10) The number of elements to display from the bottom.
|
1151
1178
|
def tail quantity=10
|
1152
1179
|
self[(@size - quantity)..(@size-1), :row]
|
1153
1180
|
end
|
1154
1181
|
|
1155
|
-
|
1182
|
+
alias :last :tail
|
1183
|
+
|
1184
|
+
# Returns a vector with sum of all vectors specified in the argument.
|
1156
1185
|
# If vecs parameter is empty, sum all numeric vectors.
|
1157
1186
|
def vector_sum vecs=nil
|
1158
1187
|
vecs ||= numeric_vectors
|
@@ -1166,9 +1195,9 @@ module Daru
|
|
1166
1195
|
end
|
1167
1196
|
|
1168
1197
|
# Calculate mean of the rows of the dataframe.
|
1169
|
-
#
|
1198
|
+
#
|
1170
1199
|
# == Arguments
|
1171
|
-
#
|
1200
|
+
#
|
1172
1201
|
# * +max_missing+ - The maximum number of elements in the row that can be
|
1173
1202
|
# zero for the mean calculation to happen. Default to 0.
|
1174
1203
|
def vector_mean max_missing=0
|
@@ -1181,16 +1210,16 @@ module Daru
|
|
1181
1210
|
mean_vec
|
1182
1211
|
end
|
1183
1212
|
|
1184
|
-
# Group elements by vector to perform operations on them. Returns a
|
1213
|
+
# Group elements by vector to perform operations on them. Returns a
|
1185
1214
|
# Daru::Core::GroupBy object. See the Daru::Core::GroupBy docs for a detailed
|
1186
1215
|
# list of possible operations.
|
1187
|
-
#
|
1216
|
+
#
|
1188
1217
|
# == Arguments
|
1189
|
-
#
|
1218
|
+
#
|
1190
1219
|
# * vectors - An Array containing names of vectors to group by.
|
1191
|
-
#
|
1220
|
+
#
|
1192
1221
|
# == Usage
|
1193
|
-
#
|
1222
|
+
#
|
1194
1223
|
# df = Daru::DataFrame.new({
|
1195
1224
|
# a: %w{foo bar foo bar foo bar foo foo},
|
1196
1225
|
# b: %w{one one two three two two one three},
|
@@ -1209,7 +1238,7 @@ module Daru
|
|
1209
1238
|
vectors.flatten!
|
1210
1239
|
vectors.each { |v| raise(ArgumentError, "Vector #{v} does not exist") unless
|
1211
1240
|
has_vector?(v) }
|
1212
|
-
|
1241
|
+
|
1213
1242
|
Daru::Core::GroupBy.new(self, vectors)
|
1214
1243
|
end
|
1215
1244
|
|
@@ -1234,7 +1263,7 @@ module Daru
|
|
1234
1263
|
def concat other_df
|
1235
1264
|
vectors = []
|
1236
1265
|
@vectors.each do |v|
|
1237
|
-
vectors << self[v].to_a.concat(other_df[v].to_a)
|
1266
|
+
vectors << self[v].to_a.dup.concat(other_df[v].to_a)
|
1238
1267
|
end
|
1239
1268
|
|
1240
1269
|
Daru::DataFrame.new(vectors, order: @vectors)
|
@@ -1242,9 +1271,9 @@ module Daru
|
|
1242
1271
|
|
1243
1272
|
# Set a particular column as the new index of the DataFrame
|
1244
1273
|
def set_index new_index, opts={}
|
1245
|
-
raise ArgumentError, "All elements in new index must be unique." if
|
1274
|
+
raise ArgumentError, "All elements in new index must be unique." if
|
1246
1275
|
@size != self[new_index].uniq.size
|
1247
|
-
|
1276
|
+
|
1248
1277
|
self.index = Daru::Index.new(self[new_index].to_a)
|
1249
1278
|
self.delete_vector(new_index) unless opts[:keep]
|
1250
1279
|
|
@@ -1253,25 +1282,25 @@ module Daru
|
|
1253
1282
|
|
1254
1283
|
# Change the index of the DataFrame and preserve the labels of the previous
|
1255
1284
|
# indexing. New index can be Daru::Index or any of its subclasses.
|
1256
|
-
#
|
1285
|
+
#
|
1257
1286
|
# @param [Daru::Index] new_index The new Index for reindexing the DataFrame.
|
1258
1287
|
# @example Reindexing DataFrame
|
1259
|
-
# df = Daru::DataFrame.new({a: [1,2,3,4], b: [11,22,33,44]},
|
1288
|
+
# df = Daru::DataFrame.new({a: [1,2,3,4], b: [11,22,33,44]},
|
1260
1289
|
# index: ['a','b','c','d'])
|
1261
|
-
# #=>
|
1290
|
+
# #=>
|
1262
1291
|
# ##<Daru::DataFrame:83278130 @name = b19277b8-c548-41da-ad9a-2ad8c060e273 @size = 4>
|
1263
|
-
# # a b
|
1264
|
-
# # a 1 11
|
1265
|
-
# # b 2 22
|
1266
|
-
# # c 3 33
|
1267
|
-
# # d 4 44
|
1292
|
+
# # a b
|
1293
|
+
# # a 1 11
|
1294
|
+
# # b 2 22
|
1295
|
+
# # c 3 33
|
1296
|
+
# # d 4 44
|
1268
1297
|
# df.reindex Daru::Index.new(['b', 0, 'a', 'g'])
|
1269
|
-
# #=>
|
1298
|
+
# #=>
|
1270
1299
|
# ##<Daru::DataFrame:83177070 @name = b19277b8-c548-41da-ad9a-2ad8c060e273 @size = 4>
|
1271
|
-
# # a b
|
1272
|
-
# # b 2 22
|
1273
|
-
# # 0 nil nil
|
1274
|
-
# # a 1 11
|
1300
|
+
# # a b
|
1301
|
+
# # b 2 22
|
1302
|
+
# # 0 nil nil
|
1303
|
+
# # a 1 11
|
1275
1304
|
# # g nil nil
|
1276
1305
|
def reindex new_index
|
1277
1306
|
raise ArgumentError, "Must pass the new index of type Index or its "\
|
@@ -1296,10 +1325,10 @@ module Daru
|
|
1296
1325
|
# @example Reassigning index of a DataFrame
|
1297
1326
|
# df = Daru::DataFrame.new({a: [1,2,3,4], b: [11,22,33,44]})
|
1298
1327
|
# df.index.to_a #=> [0,1,2,3]
|
1299
|
-
#
|
1328
|
+
#
|
1300
1329
|
# df.index = Daru::Index.new(['a','b','c','d'])
|
1301
1330
|
# df.index.to_a #=> ['a','b','c','d']
|
1302
|
-
# df.row['a'].to_a #=> [1,11]
|
1331
|
+
# df.row['a'].to_a #=> [1,11]
|
1303
1332
|
def index= idx
|
1304
1333
|
@data.each { |vec| vec.index = idx}
|
1305
1334
|
@index = idx
|
@@ -1308,17 +1337,17 @@ module Daru
|
|
1308
1337
|
end
|
1309
1338
|
|
1310
1339
|
# Reassign vectors with a new index of type Daru::Index or any of its subclasses.
|
1311
|
-
#
|
1340
|
+
#
|
1312
1341
|
# @param [Daru::Index] idx The new index object on which the vectors are to
|
1313
1342
|
# be indexed. Must be of the same size as ncols.
|
1314
1343
|
# @example Reassigning vectors of a DataFrame
|
1315
1344
|
# df = Daru::DataFrame.new({a: [1,2,3,4], b: [:a,:b,:c,:d], c: [11,22,33,44]})
|
1316
1345
|
# df.vectors.to_a #=> [:a, :b, :c]
|
1317
|
-
#
|
1346
|
+
#
|
1318
1347
|
# df.vectors = Daru::Index.new([:foo, :bar, :baz])
|
1319
1348
|
# df.vectors.to_a #=> [:foo, :bar, :baz]
|
1320
1349
|
def vectors= idx
|
1321
|
-
raise ArgumentError, "Can only reindex with Index and its subclasses" unless
|
1350
|
+
raise ArgumentError, "Can only reindex with Index and its subclasses" unless
|
1322
1351
|
index.kind_of?(Daru::Index)
|
1323
1352
|
raise ArgumentError, "Specified index length #{idx.size} not equal to"\
|
1324
1353
|
"dataframe size #{ncols}" if idx.size != ncols
|
@@ -1377,9 +1406,9 @@ module Daru
|
|
1377
1406
|
end
|
1378
1407
|
end
|
1379
1408
|
|
1380
|
-
# Sorts a dataframe (ascending/descending)according to the given sequence of
|
1409
|
+
# Sorts a dataframe (ascending/descending)according to the given sequence of
|
1381
1410
|
# vectors, using the attributes provided in the blocks.
|
1382
|
-
#
|
1411
|
+
#
|
1383
1412
|
# @param order [Array] The order of vector names in which the DataFrame
|
1384
1413
|
# should be sorted.
|
1385
1414
|
# @param [Hash] opts The options to sort with.
|
@@ -1387,21 +1416,21 @@ module Daru
|
|
1387
1416
|
# or descending order. Specify Array corresponding to *order* for multiple
|
1388
1417
|
# sort orders.
|
1389
1418
|
# @option opts [Hash] :by ({|a,b| a <=> b}) Specify attributes of objects
|
1390
|
-
# to be used for sorting, for each vector name in *order* as a hash of
|
1419
|
+
# to be used for sorting, for each vector name in *order* as a hash of
|
1391
1420
|
# vector name and lambda pairs. In case a lambda for a vector is not
|
1392
1421
|
# specified, the default will be used.
|
1393
|
-
#
|
1422
|
+
#
|
1394
1423
|
# == Usage
|
1395
|
-
#
|
1424
|
+
#
|
1396
1425
|
# df = Daru::DataFrame.new({a: [-3,2,-1,4], b: [4,3,2,1]})
|
1397
|
-
#
|
1426
|
+
#
|
1398
1427
|
# #<Daru::DataFrame:140630680 @name = 04e00197-f8d5-4161-bca2-93266bfabc6f @size = 4>
|
1399
|
-
# # a b
|
1400
|
-
# # 0 -3 4
|
1401
|
-
# # 1 2 3
|
1402
|
-
# # 2 -1 2
|
1403
|
-
# # 3 4 1
|
1404
|
-
# df.sort([:a], by: { a: lambda { |a,b| a.abs <=> b.abs } })
|
1428
|
+
# # a b
|
1429
|
+
# # 0 -3 4
|
1430
|
+
# # 1 2 3
|
1431
|
+
# # 2 -1 2
|
1432
|
+
# # 3 4 1
|
1433
|
+
# df.sort([:a], by: { a: lambda { |a,b| a.abs <=> b.abs } })
|
1405
1434
|
def sort! vector_order, opts={}
|
1406
1435
|
raise ArgumentError, "Required atleast one vector name" if vector_order.size < 1
|
1407
1436
|
opts = {
|
@@ -1426,46 +1455,46 @@ module Daru
|
|
1426
1455
|
|
1427
1456
|
# Pivots a data frame on specified vectors and applies an aggregate function
|
1428
1457
|
# to quickly generate a summary.
|
1429
|
-
#
|
1458
|
+
#
|
1430
1459
|
# == Options
|
1431
|
-
#
|
1460
|
+
#
|
1432
1461
|
# +:index+ - Keys to group by on the pivot table row index. Pass vector names
|
1433
1462
|
# contained in an Array.
|
1434
|
-
#
|
1463
|
+
#
|
1435
1464
|
# +:vectors+ - Keys to group by on the pivot table column index. Pass vector
|
1436
1465
|
# names contained in an Array.
|
1437
|
-
#
|
1466
|
+
#
|
1438
1467
|
# +:agg+ - Function to aggregate the grouped values. Default to *:mean*. Can
|
1439
|
-
# use any of the statistics functions applicable on Vectors that can be found in
|
1468
|
+
# use any of the statistics functions applicable on Vectors that can be found in
|
1440
1469
|
# the Daru::Statistics::Vector module.
|
1441
|
-
#
|
1442
|
-
# +:values+ - Columns to aggregate. Will consider all numeric columns not
|
1470
|
+
#
|
1471
|
+
# +:values+ - Columns to aggregate. Will consider all numeric columns not
|
1443
1472
|
# specified in *:index* or *:vectors*. Optional.
|
1444
|
-
#
|
1473
|
+
#
|
1445
1474
|
# == Usage
|
1446
|
-
#
|
1475
|
+
#
|
1447
1476
|
# df = Daru::DataFrame.new({
|
1448
|
-
# a: ['foo' , 'foo', 'foo', 'foo', 'foo', 'bar', 'bar', 'bar', 'bar'],
|
1477
|
+
# a: ['foo' , 'foo', 'foo', 'foo', 'foo', 'bar', 'bar', 'bar', 'bar'],
|
1449
1478
|
# b: ['one' , 'one', 'one', 'two', 'two', 'one', 'one', 'two', 'two'],
|
1450
1479
|
# c: ['small','large','large','small','small','large','small','large','small'],
|
1451
1480
|
# d: [1,2,2,3,3,4,5,6,7],
|
1452
1481
|
# e: [2,4,4,6,6,8,10,12,14]
|
1453
1482
|
# })
|
1454
1483
|
# df.pivot_table(index: [:a], vectors: [:b], agg: :sum, values: :e)
|
1455
|
-
#
|
1456
|
-
# #=>
|
1484
|
+
#
|
1485
|
+
# #=>
|
1457
1486
|
# # #<Daru::DataFrame:88342020 @name = 08cdaf4e-b154-4186-9084-e76dd191b2c9 @size = 2>
|
1458
|
-
# # [:e, :one] [:e, :two]
|
1459
|
-
# # [:bar] 18 26
|
1460
|
-
# # [:foo] 10 12
|
1487
|
+
# # [:e, :one] [:e, :two]
|
1488
|
+
# # [:bar] 18 26
|
1489
|
+
# # [:foo] 10 12
|
1461
1490
|
def pivot_table opts={}
|
1462
|
-
raise ArgumentError,
|
1491
|
+
raise ArgumentError,
|
1463
1492
|
"Specify grouping index" if !opts[:index] or opts[:index].empty?
|
1464
1493
|
|
1465
1494
|
index = opts[:index]
|
1466
1495
|
vectors = opts[:vectors] || []
|
1467
1496
|
aggregate_function = opts[:agg] || :mean
|
1468
|
-
values =
|
1497
|
+
values =
|
1469
1498
|
if opts[:values].is_a?(Symbol)
|
1470
1499
|
[opts[:values]]
|
1471
1500
|
elsif opts[:values].is_a?(Array)
|
@@ -1473,7 +1502,7 @@ module Daru
|
|
1473
1502
|
else # nil
|
1474
1503
|
(@vectors.to_a - (index | vectors)) & numeric_vector_names
|
1475
1504
|
end
|
1476
|
-
|
1505
|
+
|
1477
1506
|
raise IndexError, "No numeric vectors to aggregate" if values.empty?
|
1478
1507
|
|
1479
1508
|
grouped = group_by(index)
|
@@ -1524,7 +1553,7 @@ module Daru
|
|
1524
1553
|
end
|
1525
1554
|
end
|
1526
1555
|
|
1527
|
-
# Merge vectors from two DataFrames. In case of name collision,
|
1556
|
+
# Merge vectors from two DataFrames. In case of name collision,
|
1528
1557
|
# the vectors names are changed to x_1, x_2 ....
|
1529
1558
|
#
|
1530
1559
|
# @return {Daru::DataFrame}
|
@@ -1545,9 +1574,9 @@ module Daru
|
|
1545
1574
|
df_new
|
1546
1575
|
end
|
1547
1576
|
|
1548
|
-
# Join 2 DataFrames with SQL style joins. Currently supports inner, left
|
1577
|
+
# Join 2 DataFrames with SQL style joins. Currently supports inner, left
|
1549
1578
|
# outer, right outer and full outer joins.
|
1550
|
-
#
|
1579
|
+
#
|
1551
1580
|
# @param [Daru::DataFrame] other_df Another DataFrame on which the join is
|
1552
1581
|
# to be performed.
|
1553
1582
|
# @param [Hash] opts Options Hash
|
@@ -1565,11 +1594,11 @@ module Daru
|
|
1565
1594
|
# :name => ['Rutabaga', 'Pirate', 'Darth Vader', 'Ninja']
|
1566
1595
|
# })
|
1567
1596
|
# left.join(right, how: :inner, on: [:name])
|
1568
|
-
# #=>
|
1597
|
+
# #=>
|
1569
1598
|
# ##<Daru::DataFrame:82416700 @name = 74c0811b-76c6-4c42-ac93-e6458e82afb0 @size = 2>
|
1570
|
-
# # id_1 name id_2
|
1571
|
-
# # 0 1 Pirate 2
|
1572
|
-
# # 1 3 Ninja 4
|
1599
|
+
# # id_1 name id_2
|
1600
|
+
# # 0 1 Pirate 2
|
1601
|
+
# # 1 3 Ninja 4
|
1573
1602
|
def join(other_df,opts={})
|
1574
1603
|
Daru::Core::Merge.join(self, other_df, opts)
|
1575
1604
|
end
|
@@ -1586,7 +1615,7 @@ module Daru
|
|
1586
1615
|
# the field of first parameters will be copied verbatim
|
1587
1616
|
# to new dataset, and fields which respond to the second
|
1588
1617
|
# pattern will be added one case for each different %n.
|
1589
|
-
#
|
1618
|
+
#
|
1590
1619
|
# @example
|
1591
1620
|
# cases=[
|
1592
1621
|
# ['1','george','red',10,'blue',20,nil,nil],
|
@@ -1607,9 +1636,9 @@ module Daru
|
|
1607
1636
|
ds_vars = parent_fields.dup
|
1608
1637
|
vars = []
|
1609
1638
|
max_n = 0
|
1610
|
-
h = parent_fields.inject({}) { |a,v|
|
1639
|
+
h = parent_fields.inject({}) { |a,v|
|
1611
1640
|
a[v] = Daru::Vector.new([])
|
1612
|
-
a
|
1641
|
+
a
|
1613
1642
|
}
|
1614
1643
|
# Adding _row_id
|
1615
1644
|
h['_col_id'] = Daru::Vector.new([])
|
@@ -1663,12 +1692,12 @@ module Daru
|
|
1663
1692
|
end
|
1664
1693
|
|
1665
1694
|
# Create a sql, based on a given Dataset
|
1666
|
-
#
|
1695
|
+
#
|
1667
1696
|
# == Arguments
|
1668
|
-
#
|
1697
|
+
#
|
1669
1698
|
# * table - String specifying name of the table that will be created in SQL.
|
1670
1699
|
# * charset - Character set. Default is "UTF8".
|
1671
|
-
#
|
1700
|
+
#
|
1672
1701
|
# @example
|
1673
1702
|
#
|
1674
1703
|
# ds = Daru::DataFrame.new({
|
@@ -1717,17 +1746,17 @@ module Daru
|
|
1717
1746
|
def to_nmatrix
|
1718
1747
|
numerics_as_arrays = []
|
1719
1748
|
each_vector do |vector|
|
1720
|
-
numerics_as_arrays << vector.to_a if(vector.type == :numeric and
|
1749
|
+
numerics_as_arrays << vector.to_a if(vector.type == :numeric and
|
1721
1750
|
vector.missing_positions.size == 0)
|
1722
1751
|
end
|
1723
1752
|
|
1724
1753
|
numerics_as_arrays.transpose.to_nm
|
1725
1754
|
end
|
1726
|
-
|
1755
|
+
|
1727
1756
|
# Converts the DataFrame into an array of hashes where key is vector name
|
1728
|
-
# and value is the corresponding element. The 0th index of the array contains
|
1729
|
-
# the array of hashes while the 1th index contains the indexes of each row
|
1730
|
-
# of the dataframe. Each element in the index array corresponds to its row
|
1757
|
+
# and value is the corresponding element. The 0th index of the array contains
|
1758
|
+
# the array of hashes while the 1th index contains the indexes of each row
|
1759
|
+
# of the dataframe. Each element in the index array corresponds to its row
|
1731
1760
|
# in the array of hashes, which has the same index.
|
1732
1761
|
def to_a
|
1733
1762
|
arry = [[],[]]
|
@@ -1762,10 +1791,10 @@ module Daru
|
|
1762
1791
|
|
1763
1792
|
# Convert to html for IRuby.
|
1764
1793
|
def to_html threshold=30
|
1765
|
-
html = "<table>" +
|
1794
|
+
html = "<table>" +
|
1766
1795
|
"<tr>" +
|
1767
|
-
"<th colspan=\"#{@vectors.size+1}\">" +
|
1768
|
-
"Daru::DataFrame:#{self.object_id} " + " rows: #{nrows} " + " cols: #{ncols}"
|
1796
|
+
"<th colspan=\"#{@vectors.size+1}\">" +
|
1797
|
+
"Daru::DataFrame:#{self.object_id} " + " rows: #{nrows} " + " cols: #{ncols}"
|
1769
1798
|
"</th>" +
|
1770
1799
|
"</tr>"
|
1771
1800
|
html +='<tr><th></th>'
|
@@ -1791,7 +1820,7 @@ module Daru
|
|
1791
1820
|
html += '<tr>'
|
1792
1821
|
html += "<td>" + last_index.to_s + "</td>"
|
1793
1822
|
(0..(ncols - 1)).to_a.each do |i|
|
1794
|
-
html += '<td>' + last_row[i].to_s + '</td>'
|
1823
|
+
html += '<td>' + last_row[i].to_s + '</td>'
|
1795
1824
|
end
|
1796
1825
|
html += '</tr>'
|
1797
1826
|
break
|
@@ -1825,21 +1854,21 @@ module Daru
|
|
1825
1854
|
# == Arguments
|
1826
1855
|
#
|
1827
1856
|
# * filename - Path of CSV file where the DataFrame is to be saved.
|
1828
|
-
#
|
1857
|
+
#
|
1829
1858
|
# == Options
|
1830
|
-
#
|
1859
|
+
#
|
1831
1860
|
# * convert_comma - If set to *true*, will convert any commas in any
|
1832
1861
|
# of the data to full stops ('.').
|
1833
|
-
# All the options accepted by CSV.read() can also be passed into this
|
1862
|
+
# All the options accepted by CSV.read() can also be passed into this
|
1834
1863
|
# function.
|
1835
1864
|
def write_csv filename, opts={}
|
1836
1865
|
Daru::IO.dataframe_write_csv self, filename, opts
|
1837
1866
|
end
|
1838
1867
|
|
1839
1868
|
# Write this dataframe to an Excel Spreadsheet
|
1840
|
-
#
|
1869
|
+
#
|
1841
1870
|
# == Arguments
|
1842
|
-
#
|
1871
|
+
#
|
1843
1872
|
# * filename - The path of the file where the DataFrame should be written.
|
1844
1873
|
def write_excel filename, opts={}
|
1845
1874
|
Daru::IO.dataframe_write_excel self, filename, opts
|
@@ -1848,10 +1877,10 @@ module Daru
|
|
1848
1877
|
# Insert each case of the Dataset on the selected table
|
1849
1878
|
#
|
1850
1879
|
# == Arguments
|
1851
|
-
#
|
1880
|
+
#
|
1852
1881
|
# * dbh - DBI database connection object.
|
1853
1882
|
# * query - Query string.
|
1854
|
-
#
|
1883
|
+
#
|
1855
1884
|
# == Usage
|
1856
1885
|
#
|
1857
1886
|
# ds = Daru::DataFrame.new({:id=>Daru::Vector.new([1,2,3]), :name=>Daru::Vector.new(["a","b","c"])})
|
@@ -1869,8 +1898,8 @@ module Daru
|
|
1869
1898
|
|
1870
1899
|
def _dump depth
|
1871
1900
|
Marshal.dump({
|
1872
|
-
data: @data,
|
1873
|
-
index: @index.to_a,
|
1901
|
+
data: @data,
|
1902
|
+
index: @index.to_a,
|
1874
1903
|
order: @vectors.to_a,
|
1875
1904
|
name: @name
|
1876
1905
|
})
|
@@ -1878,14 +1907,14 @@ module Daru
|
|
1878
1907
|
|
1879
1908
|
def self._load data
|
1880
1909
|
h = Marshal.load data
|
1881
|
-
Daru::DataFrame.new(h[:data],
|
1882
|
-
index: h[:index],
|
1910
|
+
Daru::DataFrame.new(h[:data],
|
1911
|
+
index: h[:index],
|
1883
1912
|
order: h[:order],
|
1884
1913
|
name: h[:name])
|
1885
1914
|
end
|
1886
1915
|
|
1887
1916
|
# Change dtypes of vectors by supplying a hash of :vector_name => :new_dtype
|
1888
|
-
#
|
1917
|
+
#
|
1889
1918
|
# == Usage
|
1890
1919
|
# df = Daru::DataFrame.new({a: [1,2,3], b: [1,2,3], c: [1,2,3]})
|
1891
1920
|
# df.recast a: :nmatrix, c: :nmatrix
|
@@ -1908,7 +1937,7 @@ module Daru
|
|
1908
1937
|
# Pretty print in a nice table format for the command line (irb/pry/iruby)
|
1909
1938
|
def inspect spacing=10, threshold=15
|
1910
1939
|
longest = [@name.to_s.size,
|
1911
|
-
(@vectors.map(&:to_s).map(&:size).max || 0),
|
1940
|
+
(@vectors.map(&:to_s).map(&:size).max || 0),
|
1912
1941
|
(@index .map(&:to_s).map(&:size).max || 0),
|
1913
1942
|
(@data .map{ |v| v.map(&:to_s).map(&:size).max}.max || 0)].max
|
1914
1943
|
|
@@ -1918,7 +1947,7 @@ module Daru
|
|
1918
1947
|
formatter = "\n"
|
1919
1948
|
|
1920
1949
|
(@vectors.size + 1).times { formatter += "%#{longest}.#{longest}s " }
|
1921
|
-
content += "\n#<" + self.class.to_s + ":" + self.object_id.to_s + " @name = " +
|
1950
|
+
content += "\n#<" + self.class.to_s + ":" + self.object_id.to_s + " @name = " +
|
1922
1951
|
name.to_s + " @size = " + @size.to_s + ">"
|
1923
1952
|
content += sprintf formatter, "" , *@vectors.map(&:to_s)
|
1924
1953
|
row_num = 1
|
@@ -1945,10 +1974,10 @@ module Daru
|
|
1945
1974
|
end
|
1946
1975
|
|
1947
1976
|
def == other
|
1948
|
-
self.class == other.class and
|
1949
|
-
@size == other.size and
|
1977
|
+
self.class == other.class and
|
1978
|
+
@size == other.size and
|
1950
1979
|
@index == other.index and
|
1951
|
-
@vectors == other.vectors and
|
1980
|
+
@vectors == other.vectors and
|
1952
1981
|
@vectors.to_a.all? { |v| self[v] == other[v] }
|
1953
1982
|
end
|
1954
1983
|
|
@@ -1977,9 +2006,9 @@ module Daru
|
|
1977
2006
|
end
|
1978
2007
|
|
1979
2008
|
# == Arguments
|
1980
|
-
#
|
1981
|
-
# vector_order -
|
1982
|
-
# index -
|
2009
|
+
#
|
2010
|
+
# vector_order -
|
2011
|
+
# index -
|
1983
2012
|
# by -
|
1984
2013
|
# ascending -
|
1985
2014
|
# left_lower -
|
@@ -2120,7 +2149,7 @@ module Daru
|
|
2120
2149
|
end
|
2121
2150
|
|
2122
2151
|
order = names.is_a?(Array) ? Daru::Index.new(names) : names
|
2123
|
-
Daru::DataFrame.new(new_vcs, order: order,
|
2152
|
+
Daru::DataFrame.new(new_vcs, order: order,
|
2124
2153
|
index: @index, name: @name)
|
2125
2154
|
end
|
2126
2155
|
end
|
@@ -2134,7 +2163,7 @@ module Daru
|
|
2134
2163
|
return Daru::Vector.new(populate_row_for(pos), index: @vectors, name: pos)
|
2135
2164
|
else
|
2136
2165
|
new_rows = pos.map { |tuple| populate_row_for(tuple) }
|
2137
|
-
|
2166
|
+
|
2138
2167
|
if !location.is_a?(Range) and names.size < @index.width
|
2139
2168
|
pos = pos.drop_left_level names.size
|
2140
2169
|
end
|
@@ -2143,7 +2172,7 @@ module Daru
|
|
2143
2172
|
new_rows, order: @vectors, name: @name, index: pos)
|
2144
2173
|
end
|
2145
2174
|
else
|
2146
|
-
if names[1].nil?
|
2175
|
+
if names[1].nil?
|
2147
2176
|
names = @index[location]
|
2148
2177
|
if names.is_a?(Numeric)
|
2149
2178
|
row = []
|
@@ -2159,8 +2188,8 @@ module Daru
|
|
2159
2188
|
names.each do |name|
|
2160
2189
|
rows << self.row[name].to_a
|
2161
2190
|
end
|
2162
|
-
|
2163
|
-
Daru::DataFrame.rows rows, index: names ,name: @name, order: @vectors
|
2191
|
+
|
2192
|
+
Daru::DataFrame.rows rows, index: names ,name: @name, order: @vectors
|
2164
2193
|
end
|
2165
2194
|
end
|
2166
2195
|
|
@@ -2171,11 +2200,11 @@ module Daru
|
|
2171
2200
|
end
|
2172
2201
|
|
2173
2202
|
def insert_or_modify_vector name, vector
|
2174
|
-
name = name[0] unless @vectors.is_a?(MultiIndex)
|
2203
|
+
name = name[0] unless @vectors.is_a?(MultiIndex)
|
2175
2204
|
v = nil
|
2176
2205
|
|
2177
2206
|
if @index.empty?
|
2178
|
-
v = vector.is_a?(Daru::Vector) ? vector : Daru::Vector.new(vector.to_a)
|
2207
|
+
v = vector.is_a?(Daru::Vector) ? vector : Daru::Vector.new(vector.to_a)
|
2179
2208
|
@index = v.index
|
2180
2209
|
assign_or_add_vector name, v
|
2181
2210
|
set_size
|
@@ -2217,7 +2246,7 @@ module Daru
|
|
2217
2246
|
#FIXME: fix this jugaad. need to make changes in Indexing itself.
|
2218
2247
|
pos = @vectors[name]
|
2219
2248
|
|
2220
|
-
if !pos.kind_of?(Daru::Index) and pos == name and
|
2249
|
+
if !pos.kind_of?(Daru::Index) and pos == name and
|
2221
2250
|
(@vectors.include?(name) or (pos.is_a?(Integer) and pos < @data.size))
|
2222
2251
|
@data[pos] = v
|
2223
2252
|
elsif pos.kind_of?(Daru::Index)
|
@@ -2227,10 +2256,10 @@ module Daru
|
|
2227
2256
|
else
|
2228
2257
|
@vectors = @vectors | [name] if !@vectors.include?(name)
|
2229
2258
|
@data[@vectors[name]] = v
|
2230
|
-
end
|
2259
|
+
end
|
2231
2260
|
end
|
2232
2261
|
|
2233
|
-
def insert_or_modify_row name, vector
|
2262
|
+
def insert_or_modify_row name, vector
|
2234
2263
|
if index.is_a?(MultiIndex)
|
2235
2264
|
# TODO
|
2236
2265
|
else
|
@@ -2264,7 +2293,7 @@ module Daru
|
|
2264
2293
|
end
|
2265
2294
|
|
2266
2295
|
def validate_labels
|
2267
|
-
raise IndexError, "Expected equal number of vector names (#{@vectors.size}) for number of vectors (#{@data.size})." if
|
2296
|
+
raise IndexError, "Expected equal number of vector names (#{@vectors.size}) for number of vectors (#{@data.size})." if
|
2268
2297
|
@vectors and @vectors.size != @data.size
|
2269
2298
|
|
2270
2299
|
raise IndexError, "Expected number of indexes same as number of rows" if
|
@@ -2330,7 +2359,7 @@ module Daru
|
|
2330
2359
|
end
|
2331
2360
|
|
2332
2361
|
def symbolize arry
|
2333
|
-
symbolized_arry =
|
2362
|
+
symbolized_arry =
|
2334
2363
|
if arry.all? { |e| e.is_a?(Array) }
|
2335
2364
|
arry.map do |sub_arry|
|
2336
2365
|
sub_arry.map do |e|
|
@@ -2344,4 +2373,4 @@ module Daru
|
|
2344
2373
|
symbolized_arry
|
2345
2374
|
end
|
2346
2375
|
end
|
2347
|
-
end
|
2376
|
+
end
|