daru 0.1.1 → 0.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.travis.yml +1 -5
- data/CONTRIBUTING.md +2 -11
- data/History.md +18 -0
- data/README.md +109 -11
- data/daru.gemspec +11 -6
- data/images/README.md +5 -0
- data/images/con0.png +0 -0
- data/images/con1.png +0 -0
- data/images/init0.png +0 -0
- data/images/init1.png +0 -0
- data/images/man0.png +0 -0
- data/images/man1.png +0 -0
- data/images/man2.png +0 -0
- data/images/man3.png +0 -0
- data/images/man4.png +0 -0
- data/images/man5.png +0 -0
- data/images/man6.png +0 -0
- data/images/plot0.png +0 -0
- data/lib/daru.rb +5 -2
- data/lib/daru/core/group_by.rb +45 -45
- data/lib/daru/core/merge.rb +59 -1
- data/lib/daru/dataframe.rb +255 -226
- data/lib/daru/exceptions.rb +2 -0
- data/lib/daru/io/io.rb +41 -19
- data/lib/daru/io/sql_data_source.rb +116 -0
- data/lib/daru/vector.rb +124 -104
- data/lib/daru/version.rb +1 -1
- data/spec/core/group_by_spec.rb +12 -2
- data/spec/core/merge_spec.rb +14 -1
- data/spec/dataframe_spec.rb +189 -158
- data/spec/io/io_spec.rb +80 -2
- data/spec/io/sql_data_source_spec.rb +67 -0
- data/spec/spec_helper.rb +4 -2
- data/spec/support/database_helper.rb +30 -0
- data/spec/vector_spec.rb +45 -46
- metadata +104 -16
- data/.build.sh +0 -14
data/lib/daru/core/merge.rb
CHANGED
@@ -33,6 +33,14 @@ module Daru
|
|
33
33
|
hsh.each { |k,v| hsh[k] = v.to_a }
|
34
34
|
hsh
|
35
35
|
end
|
36
|
+
|
37
|
+
def arrayify df
|
38
|
+
arr = df.to_a
|
39
|
+
col_names = arr[0][0].keys
|
40
|
+
values = arr[0].map{|h| h.values}
|
41
|
+
|
42
|
+
return col_names, values
|
43
|
+
end
|
36
44
|
|
37
45
|
def inner_join df1, df2, df_hash1, df_hash2, on
|
38
46
|
joined_hash = {}
|
@@ -53,6 +61,52 @@ module Daru
|
|
53
61
|
Daru::DataFrame.new(joined_hash, order: joined_hash.keys)
|
54
62
|
end
|
55
63
|
|
64
|
+
def bf_inner_join df1, df2, on
|
65
|
+
col_names1, table1 = arrayify df1
|
66
|
+
col_names2, table2 = arrayify df2
|
67
|
+
|
68
|
+
#resolve duplicates
|
69
|
+
indicies1 = on.map{|i| col_names1.index(i)}
|
70
|
+
indicies2 = on.map{|i| col_names2.index(i)}
|
71
|
+
col_names2.map! do |name|
|
72
|
+
if (col_names1.include?(name))
|
73
|
+
col_names1[col_names1.index(name)] = (name.to_s + "_1").to_sym unless on.include?(name)
|
74
|
+
(name.to_s + "_2").to_sym
|
75
|
+
else
|
76
|
+
name
|
77
|
+
end
|
78
|
+
end
|
79
|
+
|
80
|
+
#combine key columns to a single column value
|
81
|
+
on_cols1 = table1.flat_map{|x| indicies1.map{|i| x[i].to_s}.join("+")}
|
82
|
+
on_cols2 = table2.flat_map{|x| indicies2.map{|i| x[i].to_s}.join("+")}
|
83
|
+
|
84
|
+
#parameters for a BF with approx 0.1% false positives
|
85
|
+
m = on_cols2.size * 15
|
86
|
+
k = 11
|
87
|
+
|
88
|
+
bf = BloomFilter::Native.new({:size => m, :hashes => k, :bucket => 1})
|
89
|
+
on_cols2.each{|x| bf.insert(x)}
|
90
|
+
|
91
|
+
x_ind = -1
|
92
|
+
joined_new = on_cols1.map do |x|
|
93
|
+
x_ind+=1
|
94
|
+
if (bf.include?(x))
|
95
|
+
{x_ind => on_cols2.each_index.select{|y_ind| on_cols2[y_ind] == x}}
|
96
|
+
else
|
97
|
+
{x_ind => []}
|
98
|
+
end
|
99
|
+
end
|
100
|
+
.reduce({}) {|h,pairs| pairs.each {|k,v| (h[k] ||= []) << v}; h}
|
101
|
+
.flat_map{|ind1, inds2| inds2.flatten.map{|ind2| [table1[ind1], table2[ind2]].flatten} if inds2.flatten.size > 0}
|
102
|
+
|
103
|
+
joined_cols = [col_names1, col_names2].flatten
|
104
|
+
df = Daru::DataFrame.rows(joined_new.compact, order: joined_cols)
|
105
|
+
on.each{|x| df.delete_vector (x.to_s + "_2").to_sym}
|
106
|
+
|
107
|
+
df
|
108
|
+
end
|
109
|
+
|
56
110
|
def full_outer_join df1, df2, df_hash1, df_hash2, on
|
57
111
|
left = left_outer_join df1, df2, df_hash1, df_hash2, on, true
|
58
112
|
right = right_outer_join df1, df2, df_hash1, df_hash2, on, true
|
@@ -153,7 +207,11 @@ module Daru
|
|
153
207
|
|
154
208
|
case opts[:how]
|
155
209
|
when :inner
|
156
|
-
|
210
|
+
if Daru.has_bloomfilter_rb?
|
211
|
+
helper.bf_inner_join df1, df2, on
|
212
|
+
else
|
213
|
+
helper.inner_join df1, df2, df_hash1, df_hash2, on
|
214
|
+
end
|
157
215
|
when :outer
|
158
216
|
helper.full_outer_join df1, df2, df_hash1, df_hash2, on
|
159
217
|
when :left
|
data/lib/daru/dataframe.rb
CHANGED
@@ -14,30 +14,30 @@ module Daru
|
|
14
14
|
include Daru::Plotting::DataFrame if Daru.has_nyaplot?
|
15
15
|
|
16
16
|
class << self
|
17
|
-
# Load data from a CSV file. Specify an optional block to grab the CSV
|
18
|
-
# object and pre-condition it (for example use the `convert` or
|
17
|
+
# Load data from a CSV file. Specify an optional block to grab the CSV
|
18
|
+
# object and pre-condition it (for example use the `convert` or
|
19
19
|
# `header_convert` methods).
|
20
|
-
#
|
20
|
+
#
|
21
21
|
# == Arguments
|
22
|
-
#
|
22
|
+
#
|
23
23
|
# * path - Path of the file to load specified as a String.
|
24
|
-
#
|
24
|
+
#
|
25
25
|
# == Options
|
26
|
-
#
|
26
|
+
#
|
27
27
|
# Accepts the same options as the Daru::DataFrame constructor and CSV.open()
|
28
28
|
# and uses those to eventually construct the resulting DataFrame.
|
29
29
|
#
|
30
30
|
# == Verbose Description
|
31
31
|
#
|
32
|
-
# You can specify all the options to the `.from_csv` function that you
|
32
|
+
# You can specify all the options to the `.from_csv` function that you
|
33
33
|
# do to the Ruby `CSV.read()` function, since this is what is used internally.
|
34
34
|
#
|
35
|
-
# For example, if the columns in your CSV file are separated by something
|
36
|
-
# other than commas, you can use the `:col_sep` option. If you want to
|
37
|
-
# convert numeric values to numbers and not keep them as strings, you can
|
35
|
+
# For example, if the columns in your CSV file are separated by something
|
36
|
+
# other than commas, you can use the `:col_sep` option. If you want to
|
37
|
+
# convert numeric values to numbers and not keep them as strings, you can
|
38
38
|
# use the `:converters` option and set it to `:numeric`.
|
39
39
|
#
|
40
|
-
# The `.from_csv` function uses the following defaults for reading CSV files
|
40
|
+
# The `.from_csv` function uses the following defaults for reading CSV files
|
41
41
|
# (that are passed into the `CSV.read()` function):
|
42
42
|
#
|
43
43
|
# {
|
@@ -45,24 +45,29 @@ module Daru
|
|
45
45
|
# :converters => :numeric
|
46
46
|
# }
|
47
47
|
def from_csv path, opts={}, &block
|
48
|
-
Daru::IO.from_csv path, opts, &block
|
48
|
+
Daru::IO.from_csv path, opts, &block
|
49
49
|
end
|
50
50
|
|
51
51
|
# Read data from an Excel file into a DataFrame.
|
52
|
-
#
|
52
|
+
#
|
53
53
|
# == Arguments
|
54
|
-
#
|
54
|
+
#
|
55
55
|
# * path - Path of the file to be read.
|
56
|
-
#
|
56
|
+
#
|
57
57
|
# == Options
|
58
|
-
#
|
58
|
+
#
|
59
59
|
# *:worksheet_id - ID of the worksheet that is to be read.
|
60
|
-
def from_excel path, opts={}, &block
|
60
|
+
def from_excel path, opts={}, &block
|
61
61
|
Daru::IO.from_excel path, opts, &block
|
62
62
|
end
|
63
63
|
|
64
64
|
# Execute a database query and return the result as a DataFrame
|
65
65
|
#
|
66
|
+
# @param dbh [DBI::DatabaseHandle] A DBI connection to be used to run the query
|
67
|
+
# @param query [String] The query to be executed
|
68
|
+
#
|
69
|
+
# @return A dataframe containing the data resulting from the query
|
70
|
+
#
|
66
71
|
# USE:
|
67
72
|
#
|
68
73
|
# dbh = DBI.connect("DBI:Mysql:database:localhost", "user", "password")
|
@@ -71,17 +76,37 @@ module Daru
|
|
71
76
|
Daru::IO.from_sql dbh, query
|
72
77
|
end
|
73
78
|
|
79
|
+
# Read a dataframe from AR::Relation
|
80
|
+
#
|
81
|
+
# @param relation [ActiveRecord::Relation] An AR::Relation object from which data is loaded
|
82
|
+
# @param fields [Array] Field names to be loaded (optional)
|
83
|
+
#
|
84
|
+
# @return A dataframe containing the data loaded from the relation
|
85
|
+
#
|
86
|
+
# USE:
|
87
|
+
#
|
88
|
+
# # When Post model is defined as:
|
89
|
+
# class Post < ActiveRecord::Base
|
90
|
+
# scope :active, -> { where.not(published_at: nil) }
|
91
|
+
# end
|
92
|
+
#
|
93
|
+
# # You can load active posts into a dataframe by:
|
94
|
+
# Daru::DataFrame.from_activerecord(Post.active, :title, :published_at)
|
95
|
+
def from_activerecord relation, *fields
|
96
|
+
Daru::IO.from_activerecord relation, *fields
|
97
|
+
end
|
98
|
+
|
74
99
|
# Read the database from a plaintext file. For this method to work,
|
75
100
|
# the data should be present in a plain text file in columns. See
|
76
101
|
# spec/fixtures/bank2.dat for an example.
|
77
|
-
#
|
102
|
+
#
|
78
103
|
# == Arguments
|
79
|
-
#
|
104
|
+
#
|
80
105
|
# * path - Path of the file to be read.
|
81
106
|
# * fields - Vector names of the resulting database.
|
82
|
-
#
|
107
|
+
#
|
83
108
|
# == Usage
|
84
|
-
#
|
109
|
+
#
|
85
110
|
# df = Daru::DataFrame.from_plaintext 'spec/fixtures/bank2.dat', [:v1,:v2,:v3,:v4,:v5,:v6]
|
86
111
|
def from_plaintext path, fields
|
87
112
|
Daru::IO.from_plaintext path, fields
|
@@ -137,15 +162,15 @@ module Daru
|
|
137
162
|
#
|
138
163
|
# Useful to process outputs from databases
|
139
164
|
def crosstab_by_assignation rows, columns, values
|
140
|
-
raise "Three vectors should be equal size" if
|
165
|
+
raise "Three vectors should be equal size" if
|
141
166
|
rows.size != columns.size or rows.size!=values.size
|
142
167
|
|
143
168
|
cols_values = columns.factors
|
144
169
|
cols_n = cols_values.size
|
145
170
|
|
146
|
-
h_rows = rows.factors.inject({}) do |a,v|
|
147
|
-
a[v] = cols_values.inject({}) do |a1,v1|
|
148
|
-
a1[v1]=nil
|
171
|
+
h_rows = rows.factors.inject({}) do |a,v|
|
172
|
+
a[v] = cols_values.inject({}) do |a1,v1|
|
173
|
+
a1[v1]=nil
|
149
174
|
a1
|
150
175
|
end
|
151
176
|
a
|
@@ -186,38 +211,38 @@ module Daru
|
|
186
211
|
# These objects are indexed by row and column by vectors and index Index objects.
|
187
212
|
#
|
188
213
|
# == Arguments
|
189
|
-
#
|
214
|
+
#
|
190
215
|
# * source - Source from which the DataFrame is to be initialized. Can be a Hash
|
191
216
|
# of names and vectors (array or Daru::Vector), an array of arrays or
|
192
217
|
# array of Daru::Vectors.
|
193
|
-
#
|
218
|
+
#
|
194
219
|
# == Options
|
195
|
-
#
|
196
|
-
# +:order+ - An *Array*/*Daru::Index*/*Daru::MultiIndex* containing the order in
|
220
|
+
#
|
221
|
+
# +:order+ - An *Array*/*Daru::Index*/*Daru::MultiIndex* containing the order in
|
197
222
|
# which Vectors should appear in the DataFrame.
|
198
|
-
#
|
223
|
+
#
|
199
224
|
# +:index+ - An *Array*/*Daru::Index*/*Daru::MultiIndex* containing the order
|
200
225
|
# in which rows of the DataFrame will be named.
|
201
|
-
#
|
226
|
+
#
|
202
227
|
# +:name+ - A name for the DataFrame.
|
203
228
|
#
|
204
229
|
# +:clone+ - Specify as *true* or *false*. When set to false, and Vector
|
205
230
|
# objects are passed for the source, the Vector objects will not be duplicated
|
206
|
-
# when creating the DataFrame. Will have no effect if Array is passed in
|
207
|
-
# the source, or if the passed Daru::Vectors have different indexes.
|
231
|
+
# when creating the DataFrame. Will have no effect if Array is passed in
|
232
|
+
# the source, or if the passed Daru::Vectors have different indexes.
|
208
233
|
# Default to *true*.
|
209
|
-
#
|
234
|
+
#
|
210
235
|
# == Usage
|
211
|
-
# df = Daru::DataFrame.new({a: [1,2,3,4], b: [6,7,8,9]}, order: [:b, :a],
|
236
|
+
# df = Daru::DataFrame.new({a: [1,2,3,4], b: [6,7,8,9]}, order: [:b, :a],
|
212
237
|
# index: [:a, :b, :c, :d], name: :spider_man)
|
213
|
-
#
|
214
|
-
# # =>
|
238
|
+
#
|
239
|
+
# # =>
|
215
240
|
# # <Daru::DataFrame:80766980 @name = spider_man @size = 4>
|
216
|
-
# # b a
|
217
|
-
# # a 6 1
|
218
|
-
# # b 7 2
|
219
|
-
# # c 8 3
|
220
|
-
# # d 9 4
|
241
|
+
# # b a
|
242
|
+
# # a 6 1
|
243
|
+
# # b 7 2
|
244
|
+
# # c 8 3
|
245
|
+
# # d 9 4
|
221
246
|
def initialize source, opts={}
|
222
247
|
vectors = opts[:order]
|
223
248
|
index = opts[:index]
|
@@ -292,7 +317,7 @@ module Daru
|
|
292
317
|
@vectors.each do |vector|
|
293
318
|
# avoids matching indexes of vectors if all the supplied vectors
|
294
319
|
# have the same index.
|
295
|
-
if vectors_have_same_index
|
320
|
+
if vectors_have_same_index
|
296
321
|
v = source[vector].dup
|
297
322
|
else
|
298
323
|
v = Daru::Vector.new([], name: vector, index: @index)
|
@@ -331,8 +356,8 @@ module Daru
|
|
331
356
|
end
|
332
357
|
|
333
358
|
# Access row or vector. Specify name of row/vector followed by axis(:row, :vector).
|
334
|
-
# Defaults to *:vector*. Use of this method is not recommended for accessing
|
335
|
-
# rows or vectors. Use df.row[:a] for accessing row with index ':a' or
|
359
|
+
# Defaults to *:vector*. Use of this method is not recommended for accessing
|
360
|
+
# rows or vectors. Use df.row[:a] for accessing row with index ':a' or
|
336
361
|
# df.vector[:vec] for accessing vector with index *:vec*.
|
337
362
|
def [](*names)
|
338
363
|
if names[-1] == :vector or names[-1] == :row
|
@@ -354,7 +379,7 @@ module Daru
|
|
354
379
|
# Insert a new row/vector of the specified name or modify a previous row.
|
355
380
|
# Instead of using this method directly, use df.row[:a] = [1,2,3] to set/create
|
356
381
|
# a row ':a' to [1,2,3], or df.vector[:vec] = [1,2,3] for vectors.
|
357
|
-
#
|
382
|
+
#
|
358
383
|
# In case a Daru::Vector is specified after the equality sign, the indexes
|
359
384
|
# of the vector will be matched against the row/vector indexes of the DataFrame
|
360
385
|
# before an insertion is performed. Unmatched indexes will be set to nil.
|
@@ -368,7 +393,7 @@ module Daru
|
|
368
393
|
|
369
394
|
if axis == :vector
|
370
395
|
insert_or_modify_vector name, vector
|
371
|
-
elsif axis == :row
|
396
|
+
elsif axis == :row
|
372
397
|
insert_or_modify_row name, vector
|
373
398
|
else
|
374
399
|
raise IndexError, "Expected axis to be row or vector, not #{axis}."
|
@@ -389,7 +414,7 @@ module Daru
|
|
389
414
|
end
|
390
415
|
|
391
416
|
# Access a row or set/create a row. Refer #[] and #[]= docs for details.
|
392
|
-
#
|
417
|
+
#
|
393
418
|
# == Usage
|
394
419
|
# df.row[:a] # access row named ':a'
|
395
420
|
# df.row[:b] = [1,2,3] # set row ':b' to [1,2,3]
|
@@ -398,17 +423,17 @@ module Daru
|
|
398
423
|
end
|
399
424
|
|
400
425
|
# Duplicate the DataFrame entirely.
|
401
|
-
#
|
426
|
+
#
|
402
427
|
# == Arguments
|
403
|
-
#
|
404
|
-
# * +vectors_to_dup+ - An Array specifying the names of Vectors to
|
428
|
+
#
|
429
|
+
# * +vectors_to_dup+ - An Array specifying the names of Vectors to
|
405
430
|
# be duplicated. Will duplicate the entire DataFrame if not specified.
|
406
431
|
def dup vectors_to_dup=nil
|
407
432
|
vectors_to_dup = @vectors.to_a unless vectors_to_dup
|
408
433
|
|
409
434
|
src = []
|
410
435
|
vectors_to_dup.each do |vec|
|
411
|
-
src << @data[@vectors[vec]].to_a
|
436
|
+
src << @data[@vectors[vec]].to_a.dup
|
412
437
|
end
|
413
438
|
new_order = Daru::Index.new(vectors_to_dup)
|
414
439
|
|
@@ -422,9 +447,9 @@ module Daru
|
|
422
447
|
|
423
448
|
# Returns a 'view' of the DataFrame, i.e the object ID's of vectors are
|
424
449
|
# preserved.
|
425
|
-
#
|
450
|
+
#
|
426
451
|
# == Arguments
|
427
|
-
#
|
452
|
+
#
|
428
453
|
# +vectors_to_clone+ - Names of vectors to clone. Optional. Will return
|
429
454
|
# a view of the whole data frame otherwise.
|
430
455
|
def clone *vectors_to_clone
|
@@ -438,7 +463,7 @@ module Daru
|
|
438
463
|
Daru::DataFrame.new(h, clone: false)
|
439
464
|
end
|
440
465
|
|
441
|
-
# Returns a 'shallow' copy of DataFrame if missing data is not present,
|
466
|
+
# Returns a 'shallow' copy of DataFrame if missing data is not present,
|
442
467
|
# or a full copy of only valid data if missing data is present.
|
443
468
|
def clone_only_valid
|
444
469
|
if has_missing_data?
|
@@ -448,7 +473,7 @@ module Daru
|
|
448
473
|
end
|
449
474
|
end
|
450
475
|
|
451
|
-
# Creates a new duplicate dataframe containing only rows
|
476
|
+
# Creates a new duplicate dataframe containing only rows
|
452
477
|
# without a single missing value.
|
453
478
|
def dup_only_valid vecs=nil
|
454
479
|
rows_with_nil = @data.inject([]) do |memo, vector|
|
@@ -485,7 +510,7 @@ module Daru
|
|
485
510
|
|
486
511
|
@vectors.each do |vector|
|
487
512
|
yield @data[@vectors[vector]], vector
|
488
|
-
end
|
513
|
+
end
|
489
514
|
|
490
515
|
self
|
491
516
|
end
|
@@ -518,12 +543,12 @@ module Daru
|
|
518
543
|
#
|
519
544
|
# == Description
|
520
545
|
#
|
521
|
-
# `#each` works exactly like Array#each. The default mode for `each`
|
522
|
-
# is to iterate over the columns of the DataFrame. To iterate over
|
546
|
+
# `#each` works exactly like Array#each. The default mode for `each`
|
547
|
+
# is to iterate over the columns of the DataFrame. To iterate over
|
523
548
|
# rows you must pass the axis, i.e `:row` as an argument.
|
524
|
-
#
|
549
|
+
#
|
525
550
|
# == Arguments
|
526
|
-
#
|
551
|
+
#
|
527
552
|
# * +axis+ - The axis to iterate over. Can be :vector (or :column)
|
528
553
|
# or :row. Default to :vector.
|
529
554
|
def each axis=:vector, &block
|
@@ -541,14 +566,14 @@ module Daru
|
|
541
566
|
#
|
542
567
|
# == Description
|
543
568
|
#
|
544
|
-
# The #collect iterator works similar to #map, the only difference
|
545
|
-
# being that it returns a Daru::Vector comprising of the results of
|
546
|
-
# each block run. The resultant Vector has the same index as that
|
547
|
-
# of the axis over which collect has iterated. It also accepts the
|
569
|
+
# The #collect iterator works similar to #map, the only difference
|
570
|
+
# being that it returns a Daru::Vector comprising of the results of
|
571
|
+
# each block run. The resultant Vector has the same index as that
|
572
|
+
# of the axis over which collect has iterated. It also accepts the
|
548
573
|
# optional axis argument.
|
549
574
|
#
|
550
575
|
# == Arguments
|
551
|
-
#
|
576
|
+
#
|
552
577
|
# * +axis+ - The axis to iterate over. Can be :vector (or :column)
|
553
578
|
# or :row. Default to :vector.
|
554
579
|
def collect axis=:vector, &block
|
@@ -565,16 +590,16 @@ module Daru
|
|
565
590
|
# the argument specified. Will return an Array of the resulting
|
566
591
|
# elements. To map over each row/vector and get a DataFrame,
|
567
592
|
# see #recode.
|
568
|
-
#
|
593
|
+
#
|
569
594
|
# == Description
|
570
|
-
#
|
571
|
-
# The #map iterator works like Array#map. The value returned by
|
572
|
-
# each run of the block is added to an Array and the Array is
|
573
|
-
# returned. This method also accepts an axis argument, like #each.
|
595
|
+
#
|
596
|
+
# The #map iterator works like Array#map. The value returned by
|
597
|
+
# each run of the block is added to an Array and the Array is
|
598
|
+
# returned. This method also accepts an axis argument, like #each.
|
574
599
|
# The default is :vector.
|
575
|
-
#
|
600
|
+
#
|
576
601
|
# == Arguments
|
577
|
-
#
|
602
|
+
#
|
578
603
|
# * +axis+ - The axis to map over. Can be :vector (or :column) or :row.
|
579
604
|
# Default to :vector.
|
580
605
|
def map axis=:vector, &block
|
@@ -590,9 +615,9 @@ module Daru
|
|
590
615
|
# Destructive map. Modifies the DataFrame. Each run of the block
|
591
616
|
# must return a Daru::Vector. You can specify the axis to map over
|
592
617
|
# as the argument. Default to :vector.
|
593
|
-
#
|
618
|
+
#
|
594
619
|
# == Arguments
|
595
|
-
#
|
620
|
+
#
|
596
621
|
# * +axis+ - The axis to map over. Can be :vector (or :column) or :row.
|
597
622
|
# Default to :vector.
|
598
623
|
def map! axis=:vector, &block
|
@@ -609,15 +634,15 @@ module Daru
|
|
609
634
|
#
|
610
635
|
# == Description
|
611
636
|
#
|
612
|
-
# Recode works similarly to #map, but an important difference between
|
613
|
-
# the two is that recode returns a modified Daru::DataFrame instead
|
614
|
-
# of an Array. For this reason, #recode expects that every run of the
|
637
|
+
# Recode works similarly to #map, but an important difference between
|
638
|
+
# the two is that recode returns a modified Daru::DataFrame instead
|
639
|
+
# of an Array. For this reason, #recode expects that every run of the
|
615
640
|
# block to return a Daru::Vector.
|
616
641
|
#
|
617
642
|
# Just like map and each, recode also accepts an optional _axis_ argument.
|
618
|
-
#
|
643
|
+
#
|
619
644
|
# == Arguments
|
620
|
-
#
|
645
|
+
#
|
621
646
|
# * +axis+ - The axis to map over. Can be :vector (or :column) or :row.
|
622
647
|
# Default to :vector.
|
623
648
|
def recode axis=:vector, &block
|
@@ -629,22 +654,22 @@ module Daru
|
|
629
654
|
end
|
630
655
|
|
631
656
|
# Retain vectors or rows if the block returns a truthy value.
|
632
|
-
#
|
657
|
+
#
|
633
658
|
# == Description
|
634
|
-
#
|
635
|
-
# For filtering out certain rows/vectors based on their values,
|
636
|
-
# use the #filter method. By default it iterates over vectors and
|
637
|
-
# keeps those vectors for which the block returns true. It accepts
|
638
|
-
# an optional axis argument which lets you specify whether you want
|
659
|
+
#
|
660
|
+
# For filtering out certain rows/vectors based on their values,
|
661
|
+
# use the #filter method. By default it iterates over vectors and
|
662
|
+
# keeps those vectors for which the block returns true. It accepts
|
663
|
+
# an optional axis argument which lets you specify whether you want
|
639
664
|
# to iterate over vectors or rows.
|
640
|
-
#
|
665
|
+
#
|
641
666
|
# == Arguments
|
642
|
-
#
|
667
|
+
#
|
643
668
|
# * +axis+ - The axis to map over. Can be :vector (or :column) or :row.
|
644
669
|
# Default to :vector.
|
645
|
-
#
|
670
|
+
#
|
646
671
|
# == Usage
|
647
|
-
#
|
672
|
+
#
|
648
673
|
# # Filter vectors
|
649
674
|
#
|
650
675
|
# df.filter do |vector|
|
@@ -665,12 +690,12 @@ module Daru
|
|
665
690
|
end
|
666
691
|
|
667
692
|
def recode_vectors &block
|
668
|
-
block_given? or return to_enum(:recode_vectors)
|
693
|
+
block_given? or return to_enum(:recode_vectors)
|
669
694
|
|
670
695
|
df = self.dup
|
671
696
|
df.each_vector_with_index do |v, i|
|
672
697
|
ret = yield v
|
673
|
-
ret.is_a?(Daru::Vector) or
|
698
|
+
ret.is_a?(Daru::Vector) or
|
674
699
|
raise TypeError, "Every iteration must return Daru::Vector not #{ret.class}"
|
675
700
|
df[*i] = ret
|
676
701
|
end
|
@@ -763,7 +788,7 @@ module Daru
|
|
763
788
|
self
|
764
789
|
end
|
765
790
|
|
766
|
-
# Retrieves a Daru::Vector, based on the result of calculation
|
791
|
+
# Retrieves a Daru::Vector, based on the result of calculation
|
767
792
|
# performed on each row.
|
768
793
|
def collect_rows &block
|
769
794
|
return to_enum(:collect_rows) unless block_given?
|
@@ -878,15 +903,15 @@ module Daru
|
|
878
903
|
|
879
904
|
deletion << index unless keep_row
|
880
905
|
end
|
881
|
-
deletion.each { |idx|
|
882
|
-
delete_row idx
|
906
|
+
deletion.each { |idx|
|
907
|
+
delete_row idx
|
883
908
|
}
|
884
909
|
end
|
885
910
|
|
886
911
|
def keep_vector_if &block
|
887
912
|
@vectors.each do |vector|
|
888
913
|
keep_vector = yield @data[@vectors[vector]], vector
|
889
|
-
|
914
|
+
|
890
915
|
delete_vector vector unless keep_vector
|
891
916
|
end
|
892
917
|
end
|
@@ -925,7 +950,7 @@ module Daru
|
|
925
950
|
# true for that vector.
|
926
951
|
def filter_vectors &block
|
927
952
|
return to_enum(:filter_vectors) unless block_given?
|
928
|
-
|
953
|
+
|
929
954
|
df = self.dup
|
930
955
|
df.keep_vector_if &block
|
931
956
|
|
@@ -934,7 +959,7 @@ module Daru
|
|
934
959
|
|
935
960
|
# Test each row with one or more tests. Each test is a Proc with the form
|
936
961
|
# *Proc.new {|row| row[:age] > 0}*
|
937
|
-
#
|
962
|
+
#
|
938
963
|
# The function returns an array with all errors.
|
939
964
|
def verify(*tests)
|
940
965
|
if(tests[0].is_a? Symbol)
|
@@ -963,9 +988,9 @@ module Daru
|
|
963
988
|
|
964
989
|
# DSL for yielding each row and returning a Daru::Vector based on the
|
965
990
|
# value each run of the block returns.
|
966
|
-
#
|
991
|
+
#
|
967
992
|
# == Usage
|
968
|
-
#
|
993
|
+
#
|
969
994
|
# a1 = Daru::Vector.new([1, 2, 3, 4, 5, 6, 7])
|
970
995
|
# a2 = Daru::Vector.new([10, 20, 30, 40, 50, 60, 70])
|
971
996
|
# a3 = Daru::Vector.new([100, 200, 300, 400, 500, 600, 700])
|
@@ -991,10 +1016,10 @@ module Daru
|
|
991
1016
|
|
992
1017
|
# Returns a vector, based on a string with a calculation based
|
993
1018
|
# on vector.
|
994
|
-
#
|
1019
|
+
#
|
995
1020
|
# The calculation will be eval'ed, so you can put any variable
|
996
1021
|
# or expression valid on ruby.
|
997
|
-
#
|
1022
|
+
#
|
998
1023
|
# For example:
|
999
1024
|
# a = Daru::Vector.new [1,2]
|
1000
1025
|
# b = Daru::Vector.new [3,4]
|
@@ -1003,14 +1028,14 @@ module Daru
|
|
1003
1028
|
# => Vector [4,6]
|
1004
1029
|
def compute text, &block
|
1005
1030
|
return instance_eval(&block) if block_given?
|
1006
|
-
instance_eval(text)
|
1031
|
+
instance_eval(text)
|
1007
1032
|
end
|
1008
1033
|
|
1009
1034
|
# Return a vector with the number of missing values in each row.
|
1010
|
-
#
|
1035
|
+
#
|
1011
1036
|
# == Arguments
|
1012
|
-
#
|
1013
|
-
# * +missing_values+ - An Array of the values that should be
|
1037
|
+
#
|
1038
|
+
# * +missing_values+ - An Array of the values that should be
|
1014
1039
|
# treated as 'missing'. The default missing value is *nil*.
|
1015
1040
|
def missing_values_rows missing_values=[nil]
|
1016
1041
|
number_of_missing = []
|
@@ -1031,9 +1056,9 @@ module Daru
|
|
1031
1056
|
|
1032
1057
|
alias :flawed? :has_missing_data?
|
1033
1058
|
|
1034
|
-
# Return a nested hash using vector names as keys and an array constructed of
|
1059
|
+
# Return a nested hash using vector names as keys and an array constructed of
|
1035
1060
|
# hashes with other values. If block provided, is used to provide the
|
1036
|
-
# values, with parameters +row+ of dataset, +current+ last hash on
|
1061
|
+
# values, with parameters +row+ of dataset, +current+ last hash on
|
1037
1062
|
# hierarchy and +name+ of the key to include
|
1038
1063
|
def nest *tree_keys, &block
|
1039
1064
|
tree_keys = tree_keys[0] if tree_keys[0].is_a? Array
|
@@ -1101,7 +1126,7 @@ module Daru
|
|
1101
1126
|
# @example Using any?
|
1102
1127
|
# df = Daru::DataFrame.new({a: [1,2,3,4,5], b: ['a', 'b', 'c', 'd', 'e']})
|
1103
1128
|
# df.any?(:row) do |row|
|
1104
|
-
# row[:a] < 3 and row[:b] == 'b'
|
1129
|
+
# row[:a] < 3 and row[:b] == 'b'
|
1105
1130
|
# end #=> true
|
1106
1131
|
def any? axis=:vector, &block
|
1107
1132
|
if axis == :vector or axis == :column
|
@@ -1123,7 +1148,7 @@ module Daru
|
|
1123
1148
|
# @example Using all?
|
1124
1149
|
# df = Daru::DataFrame.new({a: [1,2,3,4,5], b: ['a', 'b', 'c', 'd', 'e']})
|
1125
1150
|
# df.all?(:row) do |row|
|
1126
|
-
# row[:a] < 10
|
1151
|
+
# row[:a] < 10
|
1127
1152
|
# end #=> true
|
1128
1153
|
def all? axis=:vector, &block
|
1129
1154
|
if axis == :vector or axis == :column
|
@@ -1145,14 +1170,18 @@ module Daru
|
|
1145
1170
|
self[0..(quantity-1), :row]
|
1146
1171
|
end
|
1147
1172
|
|
1173
|
+
alias :first :head
|
1174
|
+
|
1148
1175
|
# The last ten elements of the DataFrame
|
1149
|
-
#
|
1176
|
+
#
|
1150
1177
|
# @param [Fixnum] quantity (10) The number of elements to display from the bottom.
|
1151
1178
|
def tail quantity=10
|
1152
1179
|
self[(@size - quantity)..(@size-1), :row]
|
1153
1180
|
end
|
1154
1181
|
|
1155
|
-
|
1182
|
+
alias :last :tail
|
1183
|
+
|
1184
|
+
# Returns a vector with sum of all vectors specified in the argument.
|
1156
1185
|
# If vecs parameter is empty, sum all numeric vectors.
|
1157
1186
|
def vector_sum vecs=nil
|
1158
1187
|
vecs ||= numeric_vectors
|
@@ -1166,9 +1195,9 @@ module Daru
|
|
1166
1195
|
end
|
1167
1196
|
|
1168
1197
|
# Calculate mean of the rows of the dataframe.
|
1169
|
-
#
|
1198
|
+
#
|
1170
1199
|
# == Arguments
|
1171
|
-
#
|
1200
|
+
#
|
1172
1201
|
# * +max_missing+ - The maximum number of elements in the row that can be
|
1173
1202
|
# zero for the mean calculation to happen. Default to 0.
|
1174
1203
|
def vector_mean max_missing=0
|
@@ -1181,16 +1210,16 @@ module Daru
|
|
1181
1210
|
mean_vec
|
1182
1211
|
end
|
1183
1212
|
|
1184
|
-
# Group elements by vector to perform operations on them. Returns a
|
1213
|
+
# Group elements by vector to perform operations on them. Returns a
|
1185
1214
|
# Daru::Core::GroupBy object. See the Daru::Core::GroupBy docs for a detailed
|
1186
1215
|
# list of possible operations.
|
1187
|
-
#
|
1216
|
+
#
|
1188
1217
|
# == Arguments
|
1189
|
-
#
|
1218
|
+
#
|
1190
1219
|
# * vectors - An Array containing names of vectors to group by.
|
1191
|
-
#
|
1220
|
+
#
|
1192
1221
|
# == Usage
|
1193
|
-
#
|
1222
|
+
#
|
1194
1223
|
# df = Daru::DataFrame.new({
|
1195
1224
|
# a: %w{foo bar foo bar foo bar foo foo},
|
1196
1225
|
# b: %w{one one two three two two one three},
|
@@ -1209,7 +1238,7 @@ module Daru
|
|
1209
1238
|
vectors.flatten!
|
1210
1239
|
vectors.each { |v| raise(ArgumentError, "Vector #{v} does not exist") unless
|
1211
1240
|
has_vector?(v) }
|
1212
|
-
|
1241
|
+
|
1213
1242
|
Daru::Core::GroupBy.new(self, vectors)
|
1214
1243
|
end
|
1215
1244
|
|
@@ -1234,7 +1263,7 @@ module Daru
|
|
1234
1263
|
def concat other_df
|
1235
1264
|
vectors = []
|
1236
1265
|
@vectors.each do |v|
|
1237
|
-
vectors << self[v].to_a.concat(other_df[v].to_a)
|
1266
|
+
vectors << self[v].to_a.dup.concat(other_df[v].to_a)
|
1238
1267
|
end
|
1239
1268
|
|
1240
1269
|
Daru::DataFrame.new(vectors, order: @vectors)
|
@@ -1242,9 +1271,9 @@ module Daru
|
|
1242
1271
|
|
1243
1272
|
# Set a particular column as the new index of the DataFrame
|
1244
1273
|
def set_index new_index, opts={}
|
1245
|
-
raise ArgumentError, "All elements in new index must be unique." if
|
1274
|
+
raise ArgumentError, "All elements in new index must be unique." if
|
1246
1275
|
@size != self[new_index].uniq.size
|
1247
|
-
|
1276
|
+
|
1248
1277
|
self.index = Daru::Index.new(self[new_index].to_a)
|
1249
1278
|
self.delete_vector(new_index) unless opts[:keep]
|
1250
1279
|
|
@@ -1253,25 +1282,25 @@ module Daru
|
|
1253
1282
|
|
1254
1283
|
# Change the index of the DataFrame and preserve the labels of the previous
|
1255
1284
|
# indexing. New index can be Daru::Index or any of its subclasses.
|
1256
|
-
#
|
1285
|
+
#
|
1257
1286
|
# @param [Daru::Index] new_index The new Index for reindexing the DataFrame.
|
1258
1287
|
# @example Reindexing DataFrame
|
1259
|
-
# df = Daru::DataFrame.new({a: [1,2,3,4], b: [11,22,33,44]},
|
1288
|
+
# df = Daru::DataFrame.new({a: [1,2,3,4], b: [11,22,33,44]},
|
1260
1289
|
# index: ['a','b','c','d'])
|
1261
|
-
# #=>
|
1290
|
+
# #=>
|
1262
1291
|
# ##<Daru::DataFrame:83278130 @name = b19277b8-c548-41da-ad9a-2ad8c060e273 @size = 4>
|
1263
|
-
# # a b
|
1264
|
-
# # a 1 11
|
1265
|
-
# # b 2 22
|
1266
|
-
# # c 3 33
|
1267
|
-
# # d 4 44
|
1292
|
+
# # a b
|
1293
|
+
# # a 1 11
|
1294
|
+
# # b 2 22
|
1295
|
+
# # c 3 33
|
1296
|
+
# # d 4 44
|
1268
1297
|
# df.reindex Daru::Index.new(['b', 0, 'a', 'g'])
|
1269
|
-
# #=>
|
1298
|
+
# #=>
|
1270
1299
|
# ##<Daru::DataFrame:83177070 @name = b19277b8-c548-41da-ad9a-2ad8c060e273 @size = 4>
|
1271
|
-
# # a b
|
1272
|
-
# # b 2 22
|
1273
|
-
# # 0 nil nil
|
1274
|
-
# # a 1 11
|
1300
|
+
# # a b
|
1301
|
+
# # b 2 22
|
1302
|
+
# # 0 nil nil
|
1303
|
+
# # a 1 11
|
1275
1304
|
# # g nil nil
|
1276
1305
|
def reindex new_index
|
1277
1306
|
raise ArgumentError, "Must pass the new index of type Index or its "\
|
@@ -1296,10 +1325,10 @@ module Daru
|
|
1296
1325
|
# @example Reassigning index of a DataFrame
|
1297
1326
|
# df = Daru::DataFrame.new({a: [1,2,3,4], b: [11,22,33,44]})
|
1298
1327
|
# df.index.to_a #=> [0,1,2,3]
|
1299
|
-
#
|
1328
|
+
#
|
1300
1329
|
# df.index = Daru::Index.new(['a','b','c','d'])
|
1301
1330
|
# df.index.to_a #=> ['a','b','c','d']
|
1302
|
-
# df.row['a'].to_a #=> [1,11]
|
1331
|
+
# df.row['a'].to_a #=> [1,11]
|
1303
1332
|
def index= idx
|
1304
1333
|
@data.each { |vec| vec.index = idx}
|
1305
1334
|
@index = idx
|
@@ -1308,17 +1337,17 @@ module Daru
|
|
1308
1337
|
end
|
1309
1338
|
|
1310
1339
|
# Reassign vectors with a new index of type Daru::Index or any of its subclasses.
|
1311
|
-
#
|
1340
|
+
#
|
1312
1341
|
# @param [Daru::Index] idx The new index object on which the vectors are to
|
1313
1342
|
# be indexed. Must be of the same size as ncols.
|
1314
1343
|
# @example Reassigning vectors of a DataFrame
|
1315
1344
|
# df = Daru::DataFrame.new({a: [1,2,3,4], b: [:a,:b,:c,:d], c: [11,22,33,44]})
|
1316
1345
|
# df.vectors.to_a #=> [:a, :b, :c]
|
1317
|
-
#
|
1346
|
+
#
|
1318
1347
|
# df.vectors = Daru::Index.new([:foo, :bar, :baz])
|
1319
1348
|
# df.vectors.to_a #=> [:foo, :bar, :baz]
|
1320
1349
|
def vectors= idx
|
1321
|
-
raise ArgumentError, "Can only reindex with Index and its subclasses" unless
|
1350
|
+
raise ArgumentError, "Can only reindex with Index and its subclasses" unless
|
1322
1351
|
index.kind_of?(Daru::Index)
|
1323
1352
|
raise ArgumentError, "Specified index length #{idx.size} not equal to"\
|
1324
1353
|
"dataframe size #{ncols}" if idx.size != ncols
|
@@ -1377,9 +1406,9 @@ module Daru
|
|
1377
1406
|
end
|
1378
1407
|
end
|
1379
1408
|
|
1380
|
-
# Sorts a dataframe (ascending/descending)according to the given sequence of
|
1409
|
+
# Sorts a dataframe (ascending/descending)according to the given sequence of
|
1381
1410
|
# vectors, using the attributes provided in the blocks.
|
1382
|
-
#
|
1411
|
+
#
|
1383
1412
|
# @param order [Array] The order of vector names in which the DataFrame
|
1384
1413
|
# should be sorted.
|
1385
1414
|
# @param [Hash] opts The options to sort with.
|
@@ -1387,21 +1416,21 @@ module Daru
|
|
1387
1416
|
# or descending order. Specify Array corresponding to *order* for multiple
|
1388
1417
|
# sort orders.
|
1389
1418
|
# @option opts [Hash] :by ({|a,b| a <=> b}) Specify attributes of objects
|
1390
|
-
# to be used for sorting, for each vector name in *order* as a hash of
|
1419
|
+
# to be used for sorting, for each vector name in *order* as a hash of
|
1391
1420
|
# vector name and lambda pairs. In case a lambda for a vector is not
|
1392
1421
|
# specified, the default will be used.
|
1393
|
-
#
|
1422
|
+
#
|
1394
1423
|
# == Usage
|
1395
|
-
#
|
1424
|
+
#
|
1396
1425
|
# df = Daru::DataFrame.new({a: [-3,2,-1,4], b: [4,3,2,1]})
|
1397
|
-
#
|
1426
|
+
#
|
1398
1427
|
# #<Daru::DataFrame:140630680 @name = 04e00197-f8d5-4161-bca2-93266bfabc6f @size = 4>
|
1399
|
-
# # a b
|
1400
|
-
# # 0 -3 4
|
1401
|
-
# # 1 2 3
|
1402
|
-
# # 2 -1 2
|
1403
|
-
# # 3 4 1
|
1404
|
-
# df.sort([:a], by: { a: lambda { |a,b| a.abs <=> b.abs } })
|
1428
|
+
# # a b
|
1429
|
+
# # 0 -3 4
|
1430
|
+
# # 1 2 3
|
1431
|
+
# # 2 -1 2
|
1432
|
+
# # 3 4 1
|
1433
|
+
# df.sort([:a], by: { a: lambda { |a,b| a.abs <=> b.abs } })
|
1405
1434
|
def sort! vector_order, opts={}
|
1406
1435
|
raise ArgumentError, "Required atleast one vector name" if vector_order.size < 1
|
1407
1436
|
opts = {
|
@@ -1426,46 +1455,46 @@ module Daru
|
|
1426
1455
|
|
1427
1456
|
# Pivots a data frame on specified vectors and applies an aggregate function
|
1428
1457
|
# to quickly generate a summary.
|
1429
|
-
#
|
1458
|
+
#
|
1430
1459
|
# == Options
|
1431
|
-
#
|
1460
|
+
#
|
1432
1461
|
# +:index+ - Keys to group by on the pivot table row index. Pass vector names
|
1433
1462
|
# contained in an Array.
|
1434
|
-
#
|
1463
|
+
#
|
1435
1464
|
# +:vectors+ - Keys to group by on the pivot table column index. Pass vector
|
1436
1465
|
# names contained in an Array.
|
1437
|
-
#
|
1466
|
+
#
|
1438
1467
|
# +:agg+ - Function to aggregate the grouped values. Defaults to *:mean*. Can
|
1439
|
-
# use any of the statistics functions applicable on Vectors that can be found in
|
1468
|
+
# use any of the statistics functions applicable on Vectors that can be found in
|
1440
1469
|
# the Daru::Statistics::Vector module.
|
1441
|
-
#
|
1442
|
-
# +:values+ - Columns to aggregate. Will consider all numeric columns not
|
1470
|
+
#
|
1471
|
+
# +:values+ - Columns to aggregate. Will consider all numeric columns not
|
1443
1472
|
# specified in *:index* or *:vectors*. Optional.
|
1444
|
-
#
|
1473
|
+
#
|
1445
1474
|
# == Usage
|
1446
|
-
#
|
1475
|
+
#
|
1447
1476
|
# df = Daru::DataFrame.new({
|
1448
|
-
# a: ['foo' , 'foo', 'foo', 'foo', 'foo', 'bar', 'bar', 'bar', 'bar'],
|
1477
|
+
# a: ['foo' , 'foo', 'foo', 'foo', 'foo', 'bar', 'bar', 'bar', 'bar'],
|
1449
1478
|
# b: ['one' , 'one', 'one', 'two', 'two', 'one', 'one', 'two', 'two'],
|
1450
1479
|
# c: ['small','large','large','small','small','large','small','large','small'],
|
1451
1480
|
# d: [1,2,2,3,3,4,5,6,7],
|
1452
1481
|
# e: [2,4,4,6,6,8,10,12,14]
|
1453
1482
|
# })
|
1454
1483
|
# df.pivot_table(index: [:a], vectors: [:b], agg: :sum, values: :e)
|
1455
|
-
#
|
1456
|
-
# #=>
|
1484
|
+
#
|
1485
|
+
# #=>
|
1457
1486
|
# # #<Daru::DataFrame:88342020 @name = 08cdaf4e-b154-4186-9084-e76dd191b2c9 @size = 2>
|
1458
|
-
# # [:e, :one] [:e, :two]
|
1459
|
-
# # [:bar] 18 26
|
1460
|
-
# # [:foo] 10 12
|
1487
|
+
# # [:e, :one] [:e, :two]
|
1488
|
+
# # [:bar] 18 26
|
1489
|
+
# # [:foo] 10 12
|
1461
1490
|
def pivot_table opts={}
|
1462
|
-
raise ArgumentError,
|
1491
|
+
raise ArgumentError,
|
1463
1492
|
"Specify grouping index" if !opts[:index] or opts[:index].empty?
|
1464
1493
|
|
1465
1494
|
index = opts[:index]
|
1466
1495
|
vectors = opts[:vectors] || []
|
1467
1496
|
aggregate_function = opts[:agg] || :mean
|
1468
|
-
values =
|
1497
|
+
values =
|
1469
1498
|
if opts[:values].is_a?(Symbol)
|
1470
1499
|
[opts[:values]]
|
1471
1500
|
elsif opts[:values].is_a?(Array)
|
@@ -1473,7 +1502,7 @@ module Daru
|
|
1473
1502
|
else # nil
|
1474
1503
|
(@vectors.to_a - (index | vectors)) & numeric_vector_names
|
1475
1504
|
end
|
1476
|
-
|
1505
|
+
|
1477
1506
|
raise IndexError, "No numeric vectors to aggregate" if values.empty?
|
1478
1507
|
|
1479
1508
|
grouped = group_by(index)
|
@@ -1524,7 +1553,7 @@ module Daru
|
|
1524
1553
|
end
|
1525
1554
|
end
|
1526
1555
|
|
1527
|
-
# Merge vectors from two DataFrames. In case of name collision,
|
1556
|
+
# Merge vectors from two DataFrames. In case of name collision,
|
1528
1557
|
# the vectors names are changed to x_1, x_2 ....
|
1529
1558
|
#
|
1530
1559
|
# @return [Daru::DataFrame]
|
@@ -1545,9 +1574,9 @@ module Daru
|
|
1545
1574
|
df_new
|
1546
1575
|
end
|
1547
1576
|
|
1548
|
-
# Join 2 DataFrames with SQL style joins. Currently supports inner, left
|
1577
|
+
# Join 2 DataFrames with SQL style joins. Currently supports inner, left
|
1549
1578
|
# outer, right outer and full outer joins.
|
1550
|
-
#
|
1579
|
+
#
|
1551
1580
|
# @param [Daru::DataFrame] other_df Another DataFrame on which the join is
|
1552
1581
|
# to be performed.
|
1553
1582
|
# @param [Hash] opts Options Hash
|
@@ -1565,11 +1594,11 @@ module Daru
|
|
1565
1594
|
# :name => ['Rutabaga', 'Pirate', 'Darth Vader', 'Ninja']
|
1566
1595
|
# })
|
1567
1596
|
# left.join(right, how: :inner, on: [:name])
|
1568
|
-
# #=>
|
1597
|
+
# #=>
|
1569
1598
|
# ##<Daru::DataFrame:82416700 @name = 74c0811b-76c6-4c42-ac93-e6458e82afb0 @size = 2>
|
1570
|
-
# # id_1 name id_2
|
1571
|
-
# # 0 1 Pirate 2
|
1572
|
-
# # 1 3 Ninja 4
|
1599
|
+
# # id_1 name id_2
|
1600
|
+
# # 0 1 Pirate 2
|
1601
|
+
# # 1 3 Ninja 4
|
1573
1602
|
def join(other_df,opts={})
|
1574
1603
|
Daru::Core::Merge.join(self, other_df, opts)
|
1575
1604
|
end
|
@@ -1586,7 +1615,7 @@ module Daru
|
|
1586
1615
|
# the field of first parameters will be copied verbatim
|
1587
1616
|
# to new dataset, and fields which respond to second
|
1588
1617
|
# pattern will be added one case for each different %n.
|
1589
|
-
#
|
1618
|
+
#
|
1590
1619
|
# @example
|
1591
1620
|
# cases=[
|
1592
1621
|
# ['1','george','red',10,'blue',20,nil,nil],
|
@@ -1607,9 +1636,9 @@ module Daru
|
|
1607
1636
|
ds_vars = parent_fields.dup
|
1608
1637
|
vars = []
|
1609
1638
|
max_n = 0
|
1610
|
-
h = parent_fields.inject({}) { |a,v|
|
1639
|
+
h = parent_fields.inject({}) { |a,v|
|
1611
1640
|
a[v] = Daru::Vector.new([])
|
1612
|
-
a
|
1641
|
+
a
|
1613
1642
|
}
|
1614
1643
|
# Adding _row_id
|
1615
1644
|
h['_col_id'] = Daru::Vector.new([])
|
@@ -1663,12 +1692,12 @@ module Daru
|
|
1663
1692
|
end
|
1664
1693
|
|
1665
1694
|
# Create a sql, based on a given Dataset
|
1666
|
-
#
|
1695
|
+
#
|
1667
1696
|
# == Arguments
|
1668
|
-
#
|
1697
|
+
#
|
1669
1698
|
# * table - String specifying name of the table that will be created in SQL.
|
1670
1699
|
# * charset - Character set. Default is "UTF8".
|
1671
|
-
#
|
1700
|
+
#
|
1672
1701
|
# @example
|
1673
1702
|
#
|
1674
1703
|
# ds = Daru::DataFrame.new({
|
@@ -1717,17 +1746,17 @@ module Daru
|
|
1717
1746
|
def to_nmatrix
|
1718
1747
|
numerics_as_arrays = []
|
1719
1748
|
each_vector do |vector|
|
1720
|
-
numerics_as_arrays << vector.to_a if(vector.type == :numeric and
|
1749
|
+
numerics_as_arrays << vector.to_a if(vector.type == :numeric and
|
1721
1750
|
vector.missing_positions.size == 0)
|
1722
1751
|
end
|
1723
1752
|
|
1724
1753
|
numerics_as_arrays.transpose.to_nm
|
1725
1754
|
end
|
1726
|
-
|
1755
|
+
|
1727
1756
|
# Converts the DataFrame into an array of hashes where key is vector name
|
1728
|
-
# and value is the corresponding element. The 0th index of the array contains
|
1729
|
-
# the array of hashes while the 1th index contains the indexes of each row
|
1730
|
-
# of the dataframe. Each element in the index array corresponds to its row
|
1757
|
+
# and value is the corresponding element. The 0th index of the array contains
|
1758
|
+
# the array of hashes while the 1th index contains the indexes of each row
|
1759
|
+
# of the dataframe. Each element in the index array corresponds to its row
|
1731
1760
|
# in the array of hashes, which has the same index.
|
1732
1761
|
def to_a
|
1733
1762
|
arry = [[],[]]
|
@@ -1762,10 +1791,10 @@ module Daru
|
|
1762
1791
|
|
1763
1792
|
# Convert to html for IRuby.
|
1764
1793
|
def to_html threshold=30
|
1765
|
-
html = "<table>" +
|
1794
|
+
html = "<table>" +
|
1766
1795
|
"<tr>" +
|
1767
|
-
"<th colspan=\"#{@vectors.size+1}\">" +
|
1768
|
-
"Daru::DataFrame:#{self.object_id} " + " rows: #{nrows} " + " cols: #{ncols}"
|
1796
|
+
"<th colspan=\"#{@vectors.size+1}\">" +
|
1797
|
+
"Daru::DataFrame:#{self.object_id} " + " rows: #{nrows} " + " cols: #{ncols}"
|
1769
1798
|
"</th>" +
|
1770
1799
|
"</tr>"
|
1771
1800
|
html +='<tr><th></th>'
|
@@ -1791,7 +1820,7 @@ module Daru
|
|
1791
1820
|
html += '<tr>'
|
1792
1821
|
html += "<td>" + last_index.to_s + "</td>"
|
1793
1822
|
(0..(ncols - 1)).to_a.each do |i|
|
1794
|
-
html += '<td>' + last_row[i].to_s + '</td>'
|
1823
|
+
html += '<td>' + last_row[i].to_s + '</td>'
|
1795
1824
|
end
|
1796
1825
|
html += '</tr>'
|
1797
1826
|
break
|
@@ -1825,21 +1854,21 @@ module Daru
|
|
1825
1854
|
# == Arguments
|
1826
1855
|
#
|
1827
1856
|
# * filename - Path of CSV file where the DataFrame is to be saved.
|
1828
|
-
#
|
1857
|
+
#
|
1829
1858
|
# == Options
|
1830
|
-
#
|
1859
|
+
#
|
1831
1860
|
# * convert_comma - If set to *true*, will convert any commas in any
|
1832
1861
|
# of the data to full stops ('.').
|
1833
|
-
# All the options accepted by CSV.read() can also be passed into this
|
1862
|
+
# All the options accepted by CSV.read() can also be passed into this
|
1834
1863
|
# function.
|
1835
1864
|
def write_csv filename, opts={}
|
1836
1865
|
Daru::IO.dataframe_write_csv self, filename, opts
|
1837
1866
|
end
|
1838
1867
|
|
1839
1868
|
# Write this dataframe to an Excel Spreadsheet
|
1840
|
-
#
|
1869
|
+
#
|
1841
1870
|
# == Arguments
|
1842
|
-
#
|
1871
|
+
#
|
1843
1872
|
# * filename - The path of the file where the DataFrame should be written.
|
1844
1873
|
def write_excel filename, opts={}
|
1845
1874
|
Daru::IO.dataframe_write_excel self, filename, opts
|
@@ -1848,10 +1877,10 @@ module Daru
|
|
1848
1877
|
# Insert each case of the Dataset on the selected table
|
1849
1878
|
#
|
1850
1879
|
# == Arguments
|
1851
|
-
#
|
1880
|
+
#
|
1852
1881
|
# * dbh - DBI database connection object.
|
1853
1882
|
# * query - Query string.
|
1854
|
-
#
|
1883
|
+
#
|
1855
1884
|
# == Usage
|
1856
1885
|
#
|
1857
1886
|
# ds = Daru::DataFrame.new({:id=>Daru::Vector.new([1,2,3]), :name=>Daru::Vector.new(["a","b","c"])})
|
@@ -1869,8 +1898,8 @@ module Daru
|
|
1869
1898
|
|
1870
1899
|
def _dump depth
|
1871
1900
|
Marshal.dump({
|
1872
|
-
data: @data,
|
1873
|
-
index: @index.to_a,
|
1901
|
+
data: @data,
|
1902
|
+
index: @index.to_a,
|
1874
1903
|
order: @vectors.to_a,
|
1875
1904
|
name: @name
|
1876
1905
|
})
|
@@ -1878,14 +1907,14 @@ module Daru
|
|
1878
1907
|
|
1879
1908
|
def self._load data
|
1880
1909
|
h = Marshal.load data
|
1881
|
-
Daru::DataFrame.new(h[:data],
|
1882
|
-
index: h[:index],
|
1910
|
+
Daru::DataFrame.new(h[:data],
|
1911
|
+
index: h[:index],
|
1883
1912
|
order: h[:order],
|
1884
1913
|
name: h[:name])
|
1885
1914
|
end
|
1886
1915
|
|
1887
1916
|
# Change dtypes of vectors by supplying a hash of :vector_name => :new_dtype
|
1888
|
-
#
|
1917
|
+
#
|
1889
1918
|
# == Usage
|
1890
1919
|
# df = Daru::DataFrame.new({a: [1,2,3], b: [1,2,3], c: [1,2,3]})
|
1891
1920
|
# df.recast a: :nmatrix, c: :nmatrix
|
@@ -1908,7 +1937,7 @@ module Daru
|
|
1908
1937
|
# Pretty print in a nice table format for the command line (irb/pry/iruby)
|
1909
1938
|
def inspect spacing=10, threshold=15
|
1910
1939
|
longest = [@name.to_s.size,
|
1911
|
-
(@vectors.map(&:to_s).map(&:size).max || 0),
|
1940
|
+
(@vectors.map(&:to_s).map(&:size).max || 0),
|
1912
1941
|
(@index .map(&:to_s).map(&:size).max || 0),
|
1913
1942
|
(@data .map{ |v| v.map(&:to_s).map(&:size).max}.max || 0)].max
|
1914
1943
|
|
@@ -1918,7 +1947,7 @@ module Daru
|
|
1918
1947
|
formatter = "\n"
|
1919
1948
|
|
1920
1949
|
(@vectors.size + 1).times { formatter += "%#{longest}.#{longest}s " }
|
1921
|
-
content += "\n#<" + self.class.to_s + ":" + self.object_id.to_s + " @name = " +
|
1950
|
+
content += "\n#<" + self.class.to_s + ":" + self.object_id.to_s + " @name = " +
|
1922
1951
|
name.to_s + " @size = " + @size.to_s + ">"
|
1923
1952
|
content += sprintf formatter, "" , *@vectors.map(&:to_s)
|
1924
1953
|
row_num = 1
|
@@ -1945,10 +1974,10 @@ module Daru
|
|
1945
1974
|
end
|
1946
1975
|
|
1947
1976
|
def == other
|
1948
|
-
self.class == other.class and
|
1949
|
-
@size == other.size and
|
1977
|
+
self.class == other.class and
|
1978
|
+
@size == other.size and
|
1950
1979
|
@index == other.index and
|
1951
|
-
@vectors == other.vectors and
|
1980
|
+
@vectors == other.vectors and
|
1952
1981
|
@vectors.to_a.all? { |v| self[v] == other[v] }
|
1953
1982
|
end
|
1954
1983
|
|
@@ -1977,9 +2006,9 @@ module Daru
|
|
1977
2006
|
end
|
1978
2007
|
|
1979
2008
|
# == Arguments
|
1980
|
-
#
|
1981
|
-
# vector_order -
|
1982
|
-
# index -
|
2009
|
+
#
|
2010
|
+
# vector_order -
|
2011
|
+
# index -
|
1983
2012
|
# by -
|
1984
2013
|
# ascending -
|
1985
2014
|
# left_lower -
|
@@ -2120,7 +2149,7 @@ module Daru
|
|
2120
2149
|
end
|
2121
2150
|
|
2122
2151
|
order = names.is_a?(Array) ? Daru::Index.new(names) : names
|
2123
|
-
Daru::DataFrame.new(new_vcs, order: order,
|
2152
|
+
Daru::DataFrame.new(new_vcs, order: order,
|
2124
2153
|
index: @index, name: @name)
|
2125
2154
|
end
|
2126
2155
|
end
|
@@ -2134,7 +2163,7 @@ module Daru
|
|
2134
2163
|
return Daru::Vector.new(populate_row_for(pos), index: @vectors, name: pos)
|
2135
2164
|
else
|
2136
2165
|
new_rows = pos.map { |tuple| populate_row_for(tuple) }
|
2137
|
-
|
2166
|
+
|
2138
2167
|
if !location.is_a?(Range) and names.size < @index.width
|
2139
2168
|
pos = pos.drop_left_level names.size
|
2140
2169
|
end
|
@@ -2143,7 +2172,7 @@ module Daru
|
|
2143
2172
|
new_rows, order: @vectors, name: @name, index: pos)
|
2144
2173
|
end
|
2145
2174
|
else
|
2146
|
-
if names[1].nil?
|
2175
|
+
if names[1].nil?
|
2147
2176
|
names = @index[location]
|
2148
2177
|
if names.is_a?(Numeric)
|
2149
2178
|
row = []
|
@@ -2159,8 +2188,8 @@ module Daru
|
|
2159
2188
|
names.each do |name|
|
2160
2189
|
rows << self.row[name].to_a
|
2161
2190
|
end
|
2162
|
-
|
2163
|
-
Daru::DataFrame.rows rows, index: names ,name: @name, order: @vectors
|
2191
|
+
|
2192
|
+
Daru::DataFrame.rows rows, index: names ,name: @name, order: @vectors
|
2164
2193
|
end
|
2165
2194
|
end
|
2166
2195
|
|
@@ -2171,11 +2200,11 @@ module Daru
|
|
2171
2200
|
end
|
2172
2201
|
|
2173
2202
|
def insert_or_modify_vector name, vector
|
2174
|
-
name = name[0] unless @vectors.is_a?(MultiIndex)
|
2203
|
+
name = name[0] unless @vectors.is_a?(MultiIndex)
|
2175
2204
|
v = nil
|
2176
2205
|
|
2177
2206
|
if @index.empty?
|
2178
|
-
v = vector.is_a?(Daru::Vector) ? vector : Daru::Vector.new(vector.to_a)
|
2207
|
+
v = vector.is_a?(Daru::Vector) ? vector : Daru::Vector.new(vector.to_a)
|
2179
2208
|
@index = v.index
|
2180
2209
|
assign_or_add_vector name, v
|
2181
2210
|
set_size
|
@@ -2217,7 +2246,7 @@ module Daru
|
|
2217
2246
|
#FIXME: fix this jugaad. need to make changes in Indexing itself.
|
2218
2247
|
pos = @vectors[name]
|
2219
2248
|
|
2220
|
-
if !pos.kind_of?(Daru::Index) and pos == name and
|
2249
|
+
if !pos.kind_of?(Daru::Index) and pos == name and
|
2221
2250
|
(@vectors.include?(name) or (pos.is_a?(Integer) and pos < @data.size))
|
2222
2251
|
@data[pos] = v
|
2223
2252
|
elsif pos.kind_of?(Daru::Index)
|
@@ -2227,10 +2256,10 @@ module Daru
|
|
2227
2256
|
else
|
2228
2257
|
@vectors = @vectors | [name] if !@vectors.include?(name)
|
2229
2258
|
@data[@vectors[name]] = v
|
2230
|
-
end
|
2259
|
+
end
|
2231
2260
|
end
|
2232
2261
|
|
2233
|
-
def insert_or_modify_row name, vector
|
2262
|
+
def insert_or_modify_row name, vector
|
2234
2263
|
if index.is_a?(MultiIndex)
|
2235
2264
|
# TODO
|
2236
2265
|
else
|
@@ -2264,7 +2293,7 @@ module Daru
|
|
2264
2293
|
end
|
2265
2294
|
|
2266
2295
|
def validate_labels
|
2267
|
-
raise IndexError, "Expected equal number of vector names (#{@vectors.size}) for number of vectors (#{@data.size})." if
|
2296
|
+
raise IndexError, "Expected equal number of vector names (#{@vectors.size}) for number of vectors (#{@data.size})." if
|
2268
2297
|
@vectors and @vectors.size != @data.size
|
2269
2298
|
|
2270
2299
|
raise IndexError, "Expected number of indexes same as number of rows" if
|
@@ -2330,7 +2359,7 @@ module Daru
|
|
2330
2359
|
end
|
2331
2360
|
|
2332
2361
|
def symbolize arry
|
2333
|
-
symbolized_arry =
|
2362
|
+
symbolized_arry =
|
2334
2363
|
if arry.all? { |e| e.is_a?(Array) }
|
2335
2364
|
arry.map do |sub_arry|
|
2336
2365
|
sub_arry.map do |e|
|
@@ -2344,4 +2373,4 @@ module Daru
|
|
2344
2373
|
symbolized_arry
|
2345
2374
|
end
|
2346
2375
|
end
|
2347
|
-
end
|
2376
|
+
end
|