daru_lite 0.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.github/ISSUE_TEMPLATE.md +18 -0
- data/.github/workflows/ci.yml +33 -0
- data/.gitignore +10 -0
- data/.rspec +2 -0
- data/.rubocop.yml +27 -0
- data/.rubocop_todo.yml +137 -0
- data/CONTRIBUTING.md +47 -0
- data/Gemfile +2 -0
- data/History.md +4 -0
- data/LICENSE +24 -0
- data/README.md +218 -0
- data/Rakefile +69 -0
- data/ReleasePolicy.md +20 -0
- data/benchmarks/TradeoffData.csv +65 -0
- data/benchmarks/csv_reading.rb +22 -0
- data/benchmarks/dataframe_creation.rb +39 -0
- data/benchmarks/db_loading.rb +34 -0
- data/benchmarks/duplicating.rb +45 -0
- data/benchmarks/group_by.rb +32 -0
- data/benchmarks/joining.rb +52 -0
- data/benchmarks/row_access.rb +41 -0
- data/benchmarks/row_assign.rb +36 -0
- data/benchmarks/sorting.rb +51 -0
- data/benchmarks/statistics.rb +28 -0
- data/benchmarks/vector_access.rb +31 -0
- data/benchmarks/vector_assign.rb +42 -0
- data/benchmarks/where_clause.rb +48 -0
- data/benchmarks/where_vs_filter.rb +28 -0
- data/daru_lite.gemspec +55 -0
- data/images/README.md +5 -0
- data/images/con0.png +0 -0
- data/images/con1.png +0 -0
- data/images/init0.png +0 -0
- data/images/init1.png +0 -0
- data/images/man0.png +0 -0
- data/images/man1.png +0 -0
- data/images/man2.png +0 -0
- data/images/man3.png +0 -0
- data/images/man4.png +0 -0
- data/images/man5.png +0 -0
- data/images/man6.png +0 -0
- data/lib/daru_lite/accessors/array_wrapper.rb +109 -0
- data/lib/daru_lite/accessors/dataframe_by_row.rb +25 -0
- data/lib/daru_lite/accessors/mdarray_wrapper.rb +7 -0
- data/lib/daru_lite/category.rb +929 -0
- data/lib/daru_lite/configuration.rb +34 -0
- data/lib/daru_lite/core/group_by.rb +403 -0
- data/lib/daru_lite/core/merge.rb +270 -0
- data/lib/daru_lite/core/query.rb +109 -0
- data/lib/daru_lite/dataframe.rb +3080 -0
- data/lib/daru_lite/date_time/index.rb +569 -0
- data/lib/daru_lite/date_time/offsets.rb +397 -0
- data/lib/daru_lite/exceptions.rb +2 -0
- data/lib/daru_lite/extensions/which_dsl.rb +53 -0
- data/lib/daru_lite/formatters/table.rb +52 -0
- data/lib/daru_lite/helpers/array.rb +53 -0
- data/lib/daru_lite/index/categorical_index.rb +201 -0
- data/lib/daru_lite/index/index.rb +374 -0
- data/lib/daru_lite/index/multi_index.rb +374 -0
- data/lib/daru_lite/io/csv/converters.rb +21 -0
- data/lib/daru_lite/io/io.rb +294 -0
- data/lib/daru_lite/io/sql_data_source.rb +97 -0
- data/lib/daru_lite/iruby/helpers.rb +38 -0
- data/lib/daru_lite/iruby/templates/dataframe.html.erb +5 -0
- data/lib/daru_lite/iruby/templates/dataframe_mi.html.erb +5 -0
- data/lib/daru_lite/iruby/templates/dataframe_mi_tbody.html.erb +35 -0
- data/lib/daru_lite/iruby/templates/dataframe_mi_thead.html.erb +21 -0
- data/lib/daru_lite/iruby/templates/dataframe_tbody.html.erb +28 -0
- data/lib/daru_lite/iruby/templates/dataframe_thead.html.erb +21 -0
- data/lib/daru_lite/iruby/templates/multi_index.html.erb +12 -0
- data/lib/daru_lite/iruby/templates/vector.html.erb +5 -0
- data/lib/daru_lite/iruby/templates/vector_mi.html.erb +5 -0
- data/lib/daru_lite/iruby/templates/vector_mi_tbody.html.erb +26 -0
- data/lib/daru_lite/iruby/templates/vector_mi_thead.html.erb +8 -0
- data/lib/daru_lite/iruby/templates/vector_tbody.html.erb +17 -0
- data/lib/daru_lite/iruby/templates/vector_thead.html.erb +8 -0
- data/lib/daru_lite/maths/arithmetic/dataframe.rb +91 -0
- data/lib/daru_lite/maths/arithmetic/vector.rb +117 -0
- data/lib/daru_lite/maths/statistics/dataframe.rb +202 -0
- data/lib/daru_lite/maths/statistics/vector.rb +1019 -0
- data/lib/daru_lite/monkeys.rb +56 -0
- data/lib/daru_lite/vector.rb +1678 -0
- data/lib/daru_lite/version.rb +3 -0
- data/lib/daru_lite.rb +99 -0
- data/profile/_base.rb +23 -0
- data/profile/df_to_a.rb +10 -0
- data/profile/filter.rb +13 -0
- data/profile/joining.rb +13 -0
- data/profile/sorting.rb +12 -0
- data/profile/vector_each_with_index.rb +9 -0
- data/profile/vector_new.rb +9 -0
- data/spec/accessors/array_wrapper_spec.rb +3 -0
- data/spec/category_spec.rb +1741 -0
- data/spec/core/group_by_spec.rb +655 -0
- data/spec/core/merge_spec.rb +179 -0
- data/spec/core/query_spec.rb +347 -0
- data/spec/daru_lite_spec.rb +22 -0
- data/spec/dataframe_spec.rb +4330 -0
- data/spec/date_time/data_spec.rb +197 -0
- data/spec/date_time/date_time_index_helper_spec.rb +72 -0
- data/spec/date_time/index_spec.rb +588 -0
- data/spec/date_time/offsets_spec.rb +465 -0
- data/spec/extensions/which_dsl_spec.rb +38 -0
- data/spec/fixtures/bank2.dat +200 -0
- data/spec/fixtures/boolean_converter_test.csv +5 -0
- data/spec/fixtures/countries.json +7794 -0
- data/spec/fixtures/duplicates.csv +32 -0
- data/spec/fixtures/eciresults.html +394 -0
- data/spec/fixtures/empties.dat +2 -0
- data/spec/fixtures/empty_rows_test.csv +17 -0
- data/spec/fixtures/macau.html +3691 -0
- data/spec/fixtures/macd_data.csv +150 -0
- data/spec/fixtures/matrix_test.csv +100 -0
- data/spec/fixtures/moneycontrol.html +6812 -0
- data/spec/fixtures/music_data.tsv +2501 -0
- data/spec/fixtures/repeated_fields.csv +7 -0
- data/spec/fixtures/sales-funnel.csv +18 -0
- data/spec/fixtures/scientific_notation.csv +4 -0
- data/spec/fixtures/string_converter_test.csv +5 -0
- data/spec/fixtures/strings.dat +2 -0
- data/spec/fixtures/test_xls.xls +0 -0
- data/spec/fixtures/test_xls_2.xls +0 -0
- data/spec/fixtures/url_test.txt~ +0 -0
- data/spec/fixtures/valid_markup.html +62 -0
- data/spec/fixtures/wiki_climate.html +1243 -0
- data/spec/fixtures/wiki_table_info.html +631 -0
- data/spec/formatters/table_formatter_spec.rb +137 -0
- data/spec/helpers_spec.rb +8 -0
- data/spec/index/categorical_index_spec.rb +170 -0
- data/spec/index/index_spec.rb +417 -0
- data/spec/index/multi_index_spec.rb +680 -0
- data/spec/io/io_spec.rb +373 -0
- data/spec/io/sql_data_source_spec.rb +56 -0
- data/spec/iruby/dataframe_spec.rb +170 -0
- data/spec/iruby/helpers_spec.rb +49 -0
- data/spec/iruby/multi_index_spec.rb +37 -0
- data/spec/iruby/vector_spec.rb +105 -0
- data/spec/maths/arithmetic/dataframe_spec.rb +148 -0
- data/spec/maths/arithmetic/vector_spec.rb +165 -0
- data/spec/maths/statistics/dataframe_spec.rb +178 -0
- data/spec/maths/statistics/vector_spec.rb +756 -0
- data/spec/monkeys_spec.rb +42 -0
- data/spec/shared/vector_display_spec.rb +213 -0
- data/spec/spec_helper.rb +87 -0
- data/spec/support/database_helper.rb +30 -0
- data/spec/support/matchers.rb +5 -0
- data/spec/vector_spec.rb +2293 -0
- metadata +571 -0
@@ -0,0 +1,3080 @@
|
|
1
|
+
require 'daru_lite/accessors/dataframe_by_row'
|
2
|
+
require 'daru_lite/maths/arithmetic/dataframe'
|
3
|
+
require 'daru_lite/maths/statistics/dataframe'
|
4
|
+
require 'daru_lite/io/io'
|
5
|
+
|
6
|
+
module DaruLite
|
7
|
+
class DataFrame # rubocop:disable Metrics/ClassLength
|
8
|
+
include DaruLite::Maths::Arithmetic::DataFrame
|
9
|
+
include DaruLite::Maths::Statistics::DataFrame
|
10
|
+
|
11
|
+
attr_accessor(*Configuration::INSPECT_OPTIONS_KEYS)
|
12
|
+
|
13
|
+
extend Gem::Deprecate
|
14
|
+
|
15
|
+
class << self
|
16
|
+
# Load data from a CSV file. Specify an optional block to grab the CSV
|
17
|
+
# object and pre-condition it (for example use the `convert` or
|
18
|
+
# `header_convert` methods).
|
19
|
+
#
|
20
|
+
# == Arguments
|
21
|
+
#
|
22
|
+
# * path - Local path / Remote URL of the file to load specified as a String.
|
23
|
+
#
|
24
|
+
# == Options
|
25
|
+
#
|
26
|
+
# Accepts the same options as the DaruLite::DataFrame constructor and CSV.open()
|
27
|
+
# and uses those to eventually construct the resulting DataFrame.
|
28
|
+
#
|
29
|
+
# == Verbose Description
|
30
|
+
#
|
31
|
+
# You can specify all the options to the `.from_csv` function that you
|
32
|
+
# do to the Ruby `CSV.read()` function, since this is what is used internally.
|
33
|
+
#
|
34
|
+
# For example, if the columns in your CSV file are separated by something
|
35
|
+
# other than commas, you can use the `:col_sep` option. If you want to
|
36
|
+
# convert numeric values to numbers and not keep them as strings, you can
|
37
|
+
# use the `:converters` option and set it to `:numeric`.
|
38
|
+
#
|
39
|
+
# The `.from_csv` function uses the following defaults for reading CSV files
|
40
|
+
# (that are passed into the `CSV.read()` function):
|
41
|
+
#
|
42
|
+
# {
|
43
|
+
# :col_sep => ',',
|
44
|
+
# :converters => :numeric
|
45
|
+
# }
|
46
|
+
def from_csv(path, opts = {}, &block)
  # Thin facade: the arguments and the optional block are handed unchanged
  # to DaruLite::IO.from_csv, whose result is returned.
  DaruLite::IO.from_csv(path, opts, &block)
end
|
49
|
+
|
50
|
+
# Read data from an Excel file into a DataFrame.
|
51
|
+
#
|
52
|
+
# == Arguments
|
53
|
+
#
|
54
|
+
# * path - Path of the file to be read.
|
55
|
+
#
|
56
|
+
# == Options
|
57
|
+
#
|
58
|
+
# *:worksheet_id - ID of the worksheet that is to be read.
|
59
|
+
def from_excel(path, opts = {}, &block)
  # Thin facade: the arguments and the optional block are handed unchanged
  # to DaruLite::IO.from_excel, whose result is returned.
  DaruLite::IO.from_excel(path, opts, &block)
end
|
62
|
+
|
63
|
+
# Execute a database query and return the result as a DataFrame
|
64
|
+
#
|
65
|
+
# @param dbh [DBI::DatabaseHandle, String] A DBI connection OR Path to a SQlite3 database.
|
66
|
+
# @param query [String] The query to be executed
|
67
|
+
#
|
68
|
+
# @return A dataframe containing the data resulting from the query
|
69
|
+
#
|
70
|
+
# USE:
|
71
|
+
#
|
72
|
+
# dbh = DBI.connect("DBI:Mysql:database:localhost", "user", "password")
|
73
|
+
# DaruLite::DataFrame.from_sql(dbh, "SELECT * FROM test")
|
74
|
+
#
|
75
|
+
# #Alternatively
|
76
|
+
#
|
77
|
+
# require 'dbi'
|
78
|
+
# DaruLite::DataFrame.from_sql("path/to/sqlite.db", "SELECT * FROM test")
|
79
|
+
def from_sql(dbh, query)
  # Thin facade over DaruLite::IO.from_sql; its result is returned as-is.
  DaruLite::IO.from_sql(dbh, query)
end
|
82
|
+
|
83
|
+
# Read a dataframe from AR::Relation
|
84
|
+
#
|
85
|
+
# @param relation [ActiveRecord::Relation] An AR::Relation object from which data is loaded
|
86
|
+
# @param fields [Array] Field names to be loaded (optional)
|
87
|
+
#
|
88
|
+
# @return A dataframe containing the data loaded from the relation
|
89
|
+
#
|
90
|
+
# USE:
|
91
|
+
#
|
92
|
+
# # When Post model is defined as:
|
93
|
+
# class Post < ActiveRecord::Base
|
94
|
+
# scope :active, -> { where.not(published_at: nil) }
|
95
|
+
# end
|
96
|
+
#
|
97
|
+
# # You can load active posts into a dataframe by:
|
98
|
+
# DaruLite::DataFrame.from_activerecord(Post.active, :title, :published_at)
|
99
|
+
def from_activerecord(relation, *fields)
  # Thin facade over DaruLite::IO.from_activerecord; the field names are
  # forwarded as individual arguments.
  DaruLite::IO.from_activerecord(relation, *fields)
end
|
102
|
+
|
103
|
+
# Read the database from a plaintext file. For this method to work,
|
104
|
+
# the data should be present in a plain text file in columns. See
|
105
|
+
# spec/fixtures/bank2.dat for an example.
|
106
|
+
#
|
107
|
+
# == Arguments
|
108
|
+
#
|
109
|
+
# * path - Path of the file to be read.
|
110
|
+
# * fields - Vector names of the resulting database.
|
111
|
+
#
|
112
|
+
# == Usage
|
113
|
+
#
|
114
|
+
# df = DaruLite::DataFrame.from_plaintext 'spec/fixtures/bank2.dat', [:v1,:v2,:v3,:v4,:v5,:v6]
|
115
|
+
def from_plaintext(path, fields)
  # Thin facade over DaruLite::IO.from_plaintext; its result is returned as-is.
  DaruLite::IO.from_plaintext(path, fields)
end
|
118
|
+
|
119
|
+
# Create DataFrame by specifying rows as an Array of Arrays or Array of
|
120
|
+
# DaruLite::Vector objects.
|
121
|
+
def rows(source, opts = {})
  # source - Array of Arrays or Array of Vectors, one element per row.
  # opts   - options forwarded to the DataFrame constructor; :order is
  #          filled in from guess_order(source) when it is absent.
  #
  # Raises SizeError when the rows differ in length and ArgumentError when
  # the source is neither all Arrays nor all Vectors.
  unless source.all? { |v| v.size == source.first.size }
    raise SizeError, 'All vectors must have same length'
  end

  # Build a fresh options hash instead of writing into the caller's one:
  # the previous `opts[:order] ||= ...` leaked :order back to the caller
  # and failed on a frozen hash.
  opts = opts.merge(order: opts[:order] || guess_order(source))

  if ArrayHelper.array_of?(source, Array) || source.empty?
    # Rows become columns by transposing.
    DataFrame.new(source.transpose, opts)
  elsif ArrayHelper.array_of?(source, Vector)
    from_vector_rows(source, opts)
  else
    raise ArgumentError, "Can't create DataFrame from #{source}"
  end
end
|
135
|
+
|
136
|
+
# Generates a new dataset, using three vectors
|
137
|
+
# - Rows
|
138
|
+
# - Columns
|
139
|
+
# - Values
|
140
|
+
#
|
141
|
+
# For example, you have these values
|
142
|
+
#
|
143
|
+
# x y v
|
144
|
+
# a a 0
|
145
|
+
# a b 1
|
146
|
+
# b a 1
|
147
|
+
# b b 0
|
148
|
+
#
|
149
|
+
# You obtain
|
150
|
+
# id a b
|
151
|
+
# a 0 1
|
152
|
+
# b 1 0
|
153
|
+
#
|
154
|
+
# Useful to process outputs from databases
|
155
|
+
def crosstab_by_assignation(rows, columns, values)
  same_size = rows.size == columns.size && rows.size == values.size
  raise 'Three vectors should be equal size' unless same_size

  # On first access each column gets a { row_factor => nil } hash, so every
  # row factor has a slot even when no value is assigned to it.
  data = Hash.new do |table, col|
    table[col] = rows.factors.map { |factor| [factor, nil] }.to_h
  end

  columns.zip(rows, values).each do |col, row, value|
    data[col][row] = value
  end

  # FIXME: in fact, WITHOUT this line you'll obtain more "right"
  # data: with vectors having "rows" as an index...
  data = data.transform_values { |cells| cells.values }
  data[:_id] = rows.factors

  DataFrame.new(data)
end
|
171
|
+
|
172
|
+
private
|
173
|
+
|
174
|
+
# Derive vector names from the first row: a Vector contributes its index
# labels, an Array contributes its positions as strings ("0", "1", ...).
# Returns nil for any other kind of first element (including an empty source).
def guess_order(source)
  head = source.first

  case head
  when Vector then head.index.to_a # assume that all are Vectors
  when Array  then (0...head.size).map(&:to_s)
  end
end
|
182
|
+
|
183
|
+
# Build a DataFrame by inserting each Vector of +source+ as a row.
def from_vector_rows(source, opts)
  # A row is labelled with its vector's name, or with its position when
  # the vector has no name; repeated labels are recoded by ArrayHelper.
  labels = source.each_with_index.map { |vec, pos| vec.name || pos }
  labels = ArrayHelper.recode_repeated(labels)

  frame = DataFrame.new({}, opts)
  source.each_with_index do |row, pos|
    frame[labels[pos] || pos, :row] = row
  end
  frame
end
|
194
|
+
end
|
195
|
+
|
196
|
+
# The vectors (columns) index of the DataFrame
|
197
|
+
attr_reader :vectors
|
198
|
+
# The underlying Array of column Vectors. TODO: remove this public reader.
|
199
|
+
attr_reader :data
|
200
|
+
|
201
|
+
# The index of the rows of the DataFrame
|
202
|
+
attr_reader :index
|
203
|
+
|
204
|
+
# The name of the DataFrame
|
205
|
+
attr_reader :name
|
206
|
+
|
207
|
+
# The number of rows present in the DataFrame
|
208
|
+
attr_reader :size
|
209
|
+
|
210
|
+
# DataFrame basically consists of an Array of Vector objects.
|
211
|
+
# These objects are indexed by row and column by vectors and index Index objects.
|
212
|
+
#
|
213
|
+
# == Arguments
|
214
|
+
#
|
215
|
+
# * source - Source from which the DataFrame is to be initialized. Can be a Hash
|
216
|
+
# of names and vectors (array or DaruLite::Vector), an array of arrays or
|
217
|
+
# array of DaruLite::Vectors.
|
218
|
+
#
|
219
|
+
# == Options
|
220
|
+
#
|
221
|
+
# +:order+ - An *Array*/*DaruLite::Index*/*DaruLite::MultiIndex* containing the order in
|
222
|
+
# which Vectors should appear in the DataFrame.
|
223
|
+
#
|
224
|
+
# +:index+ - An *Array*/*DaruLite::Index*/*DaruLite::MultiIndex* containing the order
|
225
|
+
# in which rows of the DataFrame will be named.
|
226
|
+
#
|
227
|
+
# +:name+ - A name for the DataFrame.
|
228
|
+
#
|
229
|
+
# +:clone+ - Specify as *true* or *false*. When set to false, and Vector
|
230
|
+
# objects are passed for the source, the Vector objects will not be duplicated
|
231
|
+
# when creating the DataFrame. Will have no effect if Array is passed in
|
232
|
+
# the source, or if the passed DaruLite::Vectors have different indexes.
|
233
|
+
# Default to *true*.
|
234
|
+
#
|
235
|
+
# == Usage
|
236
|
+
#
|
237
|
+
# df = DaruLite::DataFrame.new
|
238
|
+
# # =>
|
239
|
+
# # <DaruLite::DataFrame(0x0)>
|
240
|
+
# # Creates an empty DataFrame with no rows or columns.
|
241
|
+
#
|
242
|
+
# df = DaruLite::DataFrame.new({}, order: [:a, :b])
|
243
|
+
# #<DaruLite::DataFrame(0x2)>
|
244
|
+
# a b
|
245
|
+
# # Creates a DataFrame with no rows and columns :a and :b
|
246
|
+
#
|
247
|
+
# df = DaruLite::DataFrame.new({a: [1,2,3,4], b: [6,7,8,9]}, order: [:b, :a],
|
248
|
+
# index: [:a, :b, :c, :d], name: :spider_man)
|
249
|
+
#
|
250
|
+
# # =>
|
251
|
+
# # <DaruLite::DataFrame:80766980 @name = spider_man @size = 4>
|
252
|
+
# # b a
|
253
|
+
# # a 6 1
|
254
|
+
# # b 7 2
|
255
|
+
# # c 8 3
|
256
|
+
# # d 9 4
|
257
|
+
#
|
258
|
+
# df = DaruLite::DataFrame.new([[1,2,3,4],[6,7,8,9]], name: :bat_man)
|
259
|
+
#
|
260
|
+
# # =>
|
261
|
+
# # #<DaruLite::DataFrame: bat_man (4x2)>
|
262
|
+
# # 0 1
|
263
|
+
# # 0 1 6
|
264
|
+
# # 1 2 7
|
265
|
+
# # 2 3 8
|
266
|
+
# # 3 4 9
|
267
|
+
#
|
268
|
+
# # Dataframe having Index name
|
269
|
+
#
|
270
|
+
# df = DaruLite::DataFrame.new({a: [1,2,3,4], b: [6,7,8,9]}, order: [:b, :a],
|
271
|
+
# index: DaruLite::Index.new([:a, :b, :c, :d], name: 'idx_name'),
|
272
|
+
# name: :spider_man)
|
273
|
+
#
|
274
|
+
# # =>
|
275
|
+
# # <DaruLite::DataFrame:80766980 @name = spider_man @size = 4>
|
276
|
+
# # idx_name b a
|
277
|
+
# # a 6 1
|
278
|
+
# # b 7 2
|
279
|
+
# # c 8 3
|
280
|
+
# # d 9 4
|
281
|
+
#
|
282
|
+
#
|
283
|
+
# idx = DaruLite::Index.new [100, 99, 101, 1, 2], name: "s1"
|
284
|
+
# => #<DaruLite::Index(5): s1 {100, 99, 101, 1, 2}>
|
285
|
+
#
|
286
|
+
# df = DaruLite::DataFrame.new({b: [11,12,13,14,15], a: [1,2,3,4,5],
|
287
|
+
# c: [11,22,33,44,55]},
|
288
|
+
# order: [:a, :b, :c],
|
289
|
+
# index: idx)
|
290
|
+
# # =>
|
291
|
+
# #<DaruLite::DataFrame(5x3)>
|
292
|
+
# # s1 a b c
|
293
|
+
# # 100 1 11 11
|
294
|
+
# # 99 2 12 22
|
295
|
+
# # 101 3 13 33
|
296
|
+
# # 1 4 14 44
|
297
|
+
# # 2 5 15 55
|
298
|
+
|
299
|
+
def initialize(source = {}, opts = {})
  # FIXME: use keyword arguments instead of an options hash.
  order     = opts[:order]
  row_index = opts[:index]
  @data = []
  @name = opts[:name]

  # Empty literals are matched first so they never reach the Array/Hash
  # initializers; any other kind of source falls through untouched.
  case source
  when [], {} then create_empty_vectors(order, row_index)
  when Array  then initialize_from_array(source, order, row_index, opts)
  when Hash   then initialize_from_hash(source, order, row_index, opts)
  end

  set_size
  validate
  update
end
|
318
|
+
|
319
|
+
# Access row or vector. Specify name of row/vector followed by axis(:row, :vector).
|
320
|
+
# Defaults to *:vector*. Use of this method is not recommended for accessing
|
321
|
+
# rows. Use df.row[:a] for accessing row with index ':a'.
|
322
|
+
def [](*names)
  # The axis is resolved first, with :vector as the default; the names are
  # then passed to the :access handler for that axis.
  axis = extract_axis(names, :vector)
  dispatch_to_axis(axis, :access, *names)
end
|
326
|
+
|
327
|
+
# Retrieve rows by positions
|
328
|
+
# @param [Array<Integer>] positions of rows to retrieve
|
329
|
+
# @return [DaruLite::Vector, DaruLite::DataFrame] vector for single position and dataframe for multiple positions
|
330
|
+
# @example
|
331
|
+
# df = DaruLite::DataFrame.new({
|
332
|
+
# a: [1, 2, 3],
|
333
|
+
# b: ['a', 'b', 'c']
|
334
|
+
# })
|
335
|
+
# df.row_at 1, 2
|
336
|
+
# # => #<DaruLite::DataFrame(2x2)>
|
337
|
+
# # a b
|
338
|
+
# # 1 2 b
|
339
|
+
# # 2 3 c
|
340
|
+
def row_at(*positions)
  requested = positions
  positions = coerce_positions(*positions, nrows)
  validate_positions(*positions, nrows)

  # Anything other than a single Integer position produces a DataFrame,
  # built from the positions exactly as the caller passed them.
  unless positions.is_a?(Integer)
    selected = get_rows_for(requested)
    return DaruLite::DataFrame.new(selected, index: @index.at(*requested), order: @vectors)
  end

  # A single position produces a Vector indexed by the vector names.
  DaruLite::Vector.new(get_rows_for([positions]), index: @vectors)
end
|
353
|
+
|
354
|
+
# Set rows by positions
|
355
|
+
# @param [Array<Integer>] positions positions of rows to set
|
356
|
+
# @param [Array, DaruLite::Vector] vector vector to be assigned
|
357
|
+
# @example
|
358
|
+
# df = DaruLite::DataFrame.new({
|
359
|
+
# a: [1, 2, 3],
|
360
|
+
# b: ['a', 'b', 'c']
|
361
|
+
# })
|
362
|
+
# df.set_row_at [0, 1], ['x', 'x']
|
363
|
+
# df
|
364
|
+
# #=> #<DaruLite::DataFrame(3x2)>
|
365
|
+
# # a b
|
366
|
+
# # 0 x x
|
367
|
+
# # 1 x x
|
368
|
+
# # 2 3 c
|
369
|
+
def set_row_at(positions, vector)
  validate_positions(*positions, nrows)

  # A Vector is aligned to the vector names; anything else is wrapped as-is.
  vector = vector.is_a?(DaruLite::Vector) ? vector.reindex(@vectors) : DaruLite::Vector.new(vector)

  unless vector.size == @vectors.size
    raise SizeError, 'Vector length should match row length'
  end

  # The value at position +pos+ of the row goes into the pos-th column.
  @data.each_with_index do |column, pos|
    column.set_at(positions, vector.at(pos))
  end

  @index = @data[0].index
  set_size
end
|
387
|
+
|
388
|
+
# Retrieve vectors by positions
|
389
|
+
# @param [Array<Integer>] positions of vectors to retrieve
|
390
|
+
# @return [DaruLite::Vector, DaruLite::DataFrame] vector for single position and dataframe for multiple positions
|
391
|
+
# @example
|
392
|
+
# df = DaruLite::DataFrame.new({
|
393
|
+
# a: [1, 2, 3],
|
394
|
+
# b: ['a', 'b', 'c']
|
395
|
+
# })
|
396
|
+
# df.at 0
|
397
|
+
# # => #<DaruLite::Vector(3)>
|
398
|
+
# # a
|
399
|
+
# # 0 1
|
400
|
+
# # 1 2
|
401
|
+
# # 2 3
|
402
|
+
def at(*positions)
  # A trailing axis marker is stripped; :row hands the call over to row_at.
  if AXES.include?(positions.last)
    axis = positions.pop
    return row_at(*positions) if axis == :row
  end

  requested = positions
  positions = coerce_positions(*positions, ncols)
  validate_positions(*positions, ncols)

  # Single position: a copy of that vector.
  return @data[positions].dup if positions.is_a?(Integer)

  # Several positions: a new DataFrame built from copies of the vectors.
  columns = positions.map { |pos| @data[pos].dup }
  DaruLite::DataFrame.new(columns,
                          index: @index,
                          order: @vectors.at(*requested),
                          name: @name)
end
|
421
|
+
|
422
|
+
# Set vectors by positions
|
423
|
+
# @param [Array<Integer>] positions positions of vectors to set
|
424
|
+
# @param [Array, DaruLite::Vector] vector vector to be assigned
|
425
|
+
# @example
|
426
|
+
# df = DaruLite::DataFrame.new({
|
427
|
+
# a: [1, 2, 3],
|
428
|
+
# b: ['a', 'b', 'c']
|
429
|
+
# })
|
430
|
+
# df.set_at [0], ['x', 'y', 'z']
|
431
|
+
# df
|
432
|
+
# #=> #<DaruLite::DataFrame(3x2)>
|
433
|
+
# # a b
|
434
|
+
# # 0 x a
|
435
|
+
# # 1 y b
|
436
|
+
# # 2 z c
|
437
|
+
def set_at(positions, vector)
  # positions - Array of vector positions; a trailing :row switches the call
  #             to set_row_at with the remaining positions.
  # vector    - Array or DaruLite::Vector to assign.
  #
  # Raises SizeError when the vector length differs from the index length.
  if positions.last == :row
    # Slice instead of pop: the caller's array must not lose its axis marker
    # (and may be frozen).
    return set_row_at(positions[0...-1], vector)
  end

  validate_positions(*positions, ncols)

  # A Vector is aligned to the row index; anything else is wrapped as-is.
  vector =
    if vector.is_a?(DaruLite::Vector)
      vector.reindex(@index)
    else
      DaruLite::Vector.new(vector)
    end

  raise SizeError, 'Vector length should match index length' if vector.size != @index.size

  # Note: the same Vector object is stored at every listed position.
  positions.each { |pos| @data[pos] = vector }
end
|
456
|
+
|
457
|
+
# Insert a new row/vector of the specified name or modify a previous row.
|
458
|
+
# Instead of using this method directly, use df.row[:a] = [1,2,3] to set/create
|
459
|
+
# a row ':a' to [1,2,3], or df.vector[:vec] = [1,2,3] for vectors.
|
460
|
+
#
|
461
|
+
# In case a DaruLite::Vector is specified after the equals sign, the indexes
|
462
|
+
# of the vector will be matched against the row/vector indexes of the DataFrame
|
463
|
+
# before an insertion is performed. Unmatched indexes will be set to nil.
|
464
|
+
def []=(*args)
  # The last argument is the value being assigned; what remains is the
  # names plus an optional axis, which extract_axis resolves.
  value = args.pop
  axis = extract_axis(args)

  dispatch_to_axis(axis, :insert_or_modify, args, value)
end
|
471
|
+
|
472
|
+
# Assign +row+ through the row accessor under +index+, or under the current
# row count (@size) when no index is given.
def add_row(row, index = nil)
  key = index || @size
  self.row[*key] = row
end
|
475
|
+
|
476
|
+
# Assign +vector+ under the name +n+; equivalent to `df[n] = vector`.
def add_vector(n, vector)
  self[n] = vector
end
|
479
|
+
|
480
|
+
# Insert a new vector built from +source+ at position +n+ of the vector order.
#
# n      - Integer position the new vector should occupy.
# name   - Name of the new vector.
# source - Array of values; wrapped in a DaruLite::Vector that uses the
#          DataFrame's row index.
#
# Raises ArgumentError when +source+ is not an Array.
def insert_vector(n, name, source)
  raise ArgumentError, 'source must be an Array' unless source.is_a?(Array)

  # The vector is named after its own column. It was previously given @name,
  # which is the name of the DataFrame.
  vector = DaruLite::Vector.new(source, index: @index, name: name)
  @data << vector
  @vectors = @vectors.add(name)

  # The new name was appended last: move it to position n and reorder.
  ordr = @vectors.dup.to_a
  elmnt = ordr.pop
  ordr.insert(n, elmnt)
  self.order = ordr
end
|
491
|
+
|
492
|
+
# Access a row or set/create a row. Refer #[] and #[]= docs for details.
|
493
|
+
#
|
494
|
+
# == Usage
|
495
|
+
# df.row[:a] # access row named ':a'
|
496
|
+
# df.row[:b] = [1,2,3] # set row ':b' to [1,2,3]
|
497
|
+
def row
  # A new by-row accessor wrapping this DataFrame is built on every call.
  accessor_class = DaruLite::Accessors::DataFrameByRow
  accessor_class.new(self)
end
|
500
|
+
|
501
|
+
# Extract a dataframe given row indexes or positions
|
502
|
+
# @param keys [Array] can be positions (if by_position is true) or indexes (if by_position is false)
|
503
|
+
# @return [DaruLite::Dataframe]
|
504
|
+
def get_sub_dataframe(keys, by_position: true)
  return DaruLite::DataFrame.new({}) if keys == []

  # Index labels are translated to positions unless positions were given.
  positions = by_position ? keys : @index.pos(*keys)
  selection = row_at(*positions)

  # row_at returns a Vector for a single row; turn it back into a DataFrame.
  selection.is_a?(DaruLite::Vector) ? selection.to_df.transpose : selection
end
|
514
|
+
|
515
|
+
# Duplicate the DataFrame entirely.
|
516
|
+
#
|
517
|
+
# == Arguments
|
518
|
+
#
|
519
|
+
# * +vectors_to_dup+ - An Array specifying the names of Vectors to
|
520
|
+
# be duplicated. Will duplicate the entire DataFrame if not specified.
|
521
|
+
def dup(vectors_to_dup = nil)
  vectors_to_dup ||= @vectors.to_a

  # Copy each requested vector, looked up by its position in @vectors.
  copies = vectors_to_dup.map do |name|
    @data[@vectors.pos(name)].dup
  end

  DaruLite::DataFrame.new(
    copies,
    order: DaruLite::Index.new(vectors_to_dup),
    index: @index.dup,
    name: @name,
    clone: true
  )
end
|
529
|
+
|
530
|
+
# Only clone the structure of the DataFrame.
|
531
|
+
def clone_structure
  # No data is passed: only copies of the vector order and row index,
  # together with the name, go to the constructor.
  DaruLite::DataFrame.new(
    [],
    order: @vectors.dup,
    index: @index.dup,
    name: @name
  )
end
|
534
|
+
|
535
|
+
# Returns a 'view' of the DataFrame, i.e the object ID's of vectors are
|
536
|
+
# preserved.
|
537
|
+
#
|
538
|
+
# == Arguments
|
539
|
+
#
|
540
|
+
# +vectors_to_clone+ - Names of vectors to clone. Optional. Will return
|
541
|
+
# a view of the whole data frame otherwise.
|
542
|
+
def clone(*vectors_to_clone)
  # Accept both clone(:a, :b) and clone([:a, :b]); no names means all vectors.
  vectors_to_clone.flatten! if ArrayHelper.array_of?(vectors_to_clone, Array)
  vectors_to_clone = @vectors.to_a if vectors_to_clone.empty?

  shared = vectors_to_clone.each_with_object({}) do |name, acc|
    acc[name] = self[name]
  end

  # clone: false asks the constructor not to duplicate the vectors.
  DaruLite::DataFrame.new(shared, clone: false, order: vectors_to_clone, name: @name)
end
|
549
|
+
|
550
|
+
# Returns a 'shallow' copy of DataFrame if missing data is not present,
|
551
|
+
# or a full copy of only valid data if missing data is present.
|
552
|
+
def clone_only_valid
  # Without missing values a plain clone is returned.
  return clone unless include_values?(*DaruLite::MISSING_VALUES)

  reject_values(*DaruLite::MISSING_VALUES)
end
|
559
|
+
|
560
|
+
# Creates a new duplicate dataframe containing only rows
|
561
|
+
# without a single missing value.
|
562
|
+
def dup_only_valid(vecs = nil)
  # vecs - optional Array of vector names; when given, the rows are taken
  #        from dup(vecs) instead of from this DataFrame.
  #
  # flat_map builds a fresh array, so the arrays returned by each vector's
  # #indexes are never mutated (inject(&:concat) appended to the first one),
  # and a DataFrame without vectors yields [] instead of nil.
  rows_with_nil = @data.flat_map { |vec| vec.indexes(*DaruLite::MISSING_VALUES) }.uniq

  base = vecs.nil? ? self : dup(vecs)
  base.row[*(@index.to_a - rows_with_nil)]
end
|
570
|
+
deprecate :dup_only_valid, :reject_values, 2016, 10
|
571
|
+
|
572
|
+
# Returns a dataframe in which rows with any of the mentioned values
|
573
|
+
# are ignored.
|
574
|
+
# @param [Array] values to reject to form the new dataframe
|
575
|
+
# @return [DaruLite::DataFrame] Data Frame with only rows which doesn't
|
576
|
+
# contain the mentioned values
|
577
|
+
# @example
|
578
|
+
# df = DaruLite::DataFrame.new({
|
579
|
+
# a: [1, 2, 3, nil, Float::NAN, nil, 1, 7],
|
580
|
+
# b: [:a, :b, nil, Float::NAN, nil, 3, 5, 8],
|
581
|
+
# c: ['a', Float::NAN, 3, 4, 3, 5, nil, 7]
|
582
|
+
# }, index: 11..18)
|
583
|
+
# df.reject_values nil, Float::NAN
|
584
|
+
# # => #<DaruLite::DataFrame(2x3)>
|
585
|
+
# # a b c
|
586
|
+
# # 11 1 a a
|
587
|
+
# # 18 7 8 7
|
588
|
+
def reject_values(*values)
  every_position = size.times.to_a
  kept = every_position - @data.flat_map { |vec| vec.positions(*values) }

  # A lone position would make #row_at return something other than a
  # DataFrame, so it is passed as a one-element range instead.
  return row_at(*kept) unless kept.size == 1

  only = kept.first
  row_at(only..only)
end
|
599
|
+
|
600
|
+
# Replace specified values with given value
|
601
|
+
# @param [Array] old_values values to replace with new value
|
602
|
+
# @param [object] new_value new value to replace with
|
603
|
+
# @return [DaruLite::DataFrame] Data Frame itself with old values replace
|
604
|
+
# with new value
|
605
|
+
# @example
|
606
|
+
# df = DaruLite::DataFrame.new({
|
607
|
+
# a: [1, 2, 3, nil, Float::NAN, nil, 1, 7],
|
608
|
+
# b: [:a, :b, nil, Float::NAN, nil, 3, 5, 8],
|
609
|
+
# c: ['a', Float::NAN, 3, 4, 3, 5, nil, 7]
|
610
|
+
# }, index: 11..18)
|
611
|
+
# df.replace_values nil, Float::NAN
|
612
|
+
# # => #<DaruLite::DataFrame(8x3)>
|
613
|
+
# # a b c
|
614
|
+
# # 11 1 a a
|
615
|
+
# # 12 2 b NaN
|
616
|
+
# # 13 3 NaN 3
|
617
|
+
# # 14 NaN NaN 4
|
618
|
+
# # 15 NaN NaN 3
|
619
|
+
# # 16 NaN 3 5
|
620
|
+
# # 17 1 5 NaN
|
621
|
+
# # 18 7 8 7
|
622
|
+
def replace_values(old_values, new_value)
  # Each vector performs the replacement itself; the DataFrame is returned
  # so that calls can be chained.
  @data.each do |column|
    column.replace_values(old_values, new_value)
  end
  self
end
|
626
|
+
|
627
|
+
# Rolling fillna
|
628
|
+
# replace all Float::NAN and NIL values with the preceding or following value
|
629
|
+
#
|
630
|
+
# @param direction [Symbol] (:forward, :backward) whether replacement value is preceding or following
|
631
|
+
#
|
632
|
+
# @example
|
633
|
+
# df = DaruLite::DataFrame.new({
|
634
|
+
# a: [1, 2, 3, nil, Float::NAN, nil, 1, 7],
|
635
|
+
# b: [:a, :b, nil, Float::NAN, nil, 3, 5, nil],
|
636
|
+
# c: ['a', Float::NAN, 3, 4, 3, 5, nil, 7]
|
637
|
+
# })
|
638
|
+
#
|
639
|
+
# => #<DaruLite::DataFrame(8x3)>
|
640
|
+
# a b c
|
641
|
+
# 0 1 a a
|
642
|
+
# 1 2 b NaN
|
643
|
+
# 2 3 nil 3
|
644
|
+
# 3 nil NaN 4
|
645
|
+
# 4 NaN nil 3
|
646
|
+
# 5 nil 3 5
|
647
|
+
# 6 1 5 nil
|
648
|
+
# 7 7 nil 7
|
649
|
+
#
|
650
|
+
# 2.3.3 :068 > df.rolling_fillna(:forward)
|
651
|
+
# => #<DaruLite::DataFrame(8x3)>
|
652
|
+
# a b c
|
653
|
+
# 0 1 a a
|
654
|
+
# 1 2 b a
|
655
|
+
# 2 3 b 3
|
656
|
+
# 3 3 b 4
|
657
|
+
# 4 3 b 3
|
658
|
+
# 5 3 3 5
|
659
|
+
# 6 1 5 5
|
660
|
+
# 7 7 5 7
|
661
|
+
#
|
662
|
+
def rolling_fillna!(direction = :forward)
  # Each vector is filled in place; the DataFrame itself is returned.
  @data.each do |column|
    column.rolling_fillna!(direction)
  end
  self
end
|
666
|
+
|
667
|
+
# Non-destructive variant of #rolling_fillna!: the fill is applied to a
# duplicate of this DataFrame, which is returned.
def rolling_fillna(direction = :forward)
  copy = dup
  copy.rolling_fillna!(direction)
end
|
670
|
+
|
671
|
+
# Return unique rows by vector specified or all vectors
|
672
|
+
#
|
673
|
+
# @param vtrs [String][Symbol] vector names(s) that should be considered
|
674
|
+
#
|
675
|
+
# @example
|
676
|
+
#
|
677
|
+
# => #<DaruLite::DataFrame(6x2)>
|
678
|
+
# a b
|
679
|
+
# 0 1 a
|
680
|
+
# 1 2 b
|
681
|
+
# 2 3 c
|
682
|
+
# 3 4 d
|
683
|
+
# 2 3 c
|
684
|
+
# 3 4 f
|
685
|
+
#
|
686
|
+
# 2.3.3 :> df.uniq
|
687
|
+
# => #<DaruLite::DataFrame(5x2)>
|
688
|
+
# a b
|
689
|
+
# 0 1 a
|
690
|
+
# 1 2 b
|
691
|
+
# 2 3 c
|
692
|
+
# 3 4 d
|
693
|
+
# 3 4 f
|
694
|
+
#
|
695
|
+
# 2.3.3 :> df.uniq(:a)
|
696
|
+
# => #<DaruLite::DataFrame(4x2)>
|
697
|
+
# a b
|
698
|
+
# 0 1 a
|
699
|
+
# 1 2 b
|
700
|
+
# 2 3 c
|
701
|
+
# 3 4 d
|
702
|
+
#
|
703
|
+
def uniq(*vtrs)
  # With no names given, every vector takes part in the comparison.
  keys = vtrs.empty? ? vectors.to_a : Array(vtrs)

  # The first row index of each group is kept, in sorted order.
  first_of_each_group = group_by(keys).groups.values.map { |members| members[0] }.sort
  row[*first_of_each_group]
end
|
709
|
+
|
710
|
+
# Iterate over each index of the DataFrame.
|
711
|
+
def each_index(&block)
  # Without a block an Enumerator is returned; with one, the DataFrame is.
  if block
    @index.each(&block)
    self
  else
    to_enum(:each_index)
  end
end
|
718
|
+
|
719
|
+
# Iterate over each vector
|
720
|
+
def each_vector(&block)
  # Without a block an Enumerator is returned; with one, the DataFrame is.
  if block
    @data.each(&block)
    self
  else
    to_enum(:each_vector)
  end
end
|
727
|
+
|
728
|
+
alias each_column each_vector
|
729
|
+
|
730
|
+
# Iterate over each vector along with the name of the vector
|
731
|
+
def each_vector_with_index
|
732
|
+
return to_enum(:each_vector_with_index) unless block_given?
|
733
|
+
|
734
|
+
@vectors.each do |vector|
|
735
|
+
yield @data[@vectors[vector]], vector
|
736
|
+
end
|
737
|
+
|
738
|
+
self
|
739
|
+
end
|
740
|
+
|
741
|
+
alias each_column_with_index each_vector_with_index
|
742
|
+
|
743
|
+
# Iterate over each row
|
744
|
+
def each_row
|
745
|
+
return to_enum(:each_row) unless block_given?
|
746
|
+
|
747
|
+
@index.size.times do |pos|
|
748
|
+
yield row_at(pos)
|
749
|
+
end
|
750
|
+
|
751
|
+
self
|
752
|
+
end
|
753
|
+
|
754
|
+
def each_row_with_index
|
755
|
+
return to_enum(:each_row_with_index) unless block_given?
|
756
|
+
|
757
|
+
@index.each do |index|
|
758
|
+
yield access_row(index), index
|
759
|
+
end
|
760
|
+
|
761
|
+
self
|
762
|
+
end
|
763
|
+
|
764
|
+
# Iterate over each row or vector of the DataFrame. Specify axis
|
765
|
+
# by passing :vector or :row as the argument. Default to :vector.
|
766
|
+
#
|
767
|
+
# == Description
|
768
|
+
#
|
769
|
+
# `#each` works exactly like Array#each. The default mode for `each`
|
770
|
+
# is to iterate over the columns of the DataFrame. To iterate over
|
771
|
+
# rows you must pass the axis, i.e `:row` as an argument.
|
772
|
+
#
|
773
|
+
# == Arguments
|
774
|
+
#
|
775
|
+
# * +axis+ - The axis to iterate over. Can be :vector (or :column)
|
776
|
+
# or :row. Default to :vector.
|
777
|
+
def each(axis = :vector, &block)
|
778
|
+
dispatch_to_axis axis, :each, &block
|
779
|
+
end
|
780
|
+
|
781
|
+
# Iterate over a row or vector and return results in a DaruLite::Vector.
|
782
|
+
# Specify axis with :vector or :row. Default to :vector.
|
783
|
+
#
|
784
|
+
# == Description
|
785
|
+
#
|
786
|
+
# The #collect iterator works similar to #map, the only difference
|
787
|
+
# being that it returns a DaruLite::Vector comprising of the results of
|
788
|
+
# each block run. The resultant Vector has the same index as that
|
789
|
+
# of the axis over which collect has iterated. It also accepts the
|
790
|
+
# optional axis argument.
|
791
|
+
#
|
792
|
+
# == Arguments
|
793
|
+
#
|
794
|
+
# * +axis+ - The axis to iterate over. Can be :vector (or :column)
|
795
|
+
# or :row. Default to :vector.
|
796
|
+
def collect(axis = :vector, &block)
|
797
|
+
dispatch_to_axis_pl axis, :collect, &block
|
798
|
+
end
|
799
|
+
|
800
|
+
# Map over each vector or row of the data frame according to
|
801
|
+
# the argument specified. Will return an Array of the resulting
|
802
|
+
# elements. To map over each row/vector and get a DataFrame,
|
803
|
+
# see #recode.
|
804
|
+
#
|
805
|
+
# == Description
|
806
|
+
#
|
807
|
+
# The #map iterator works like Array#map. The value returned by
|
808
|
+
# each run of the block is added to an Array and the Array is
|
809
|
+
# returned. This method also accepts an axis argument, like #each.
|
810
|
+
# The default is :vector.
|
811
|
+
#
|
812
|
+
# == Arguments
|
813
|
+
#
|
814
|
+
# * +axis+ - The axis to map over. Can be :vector (or :column) or :row.
|
815
|
+
# Default to :vector.
|
816
|
+
def map(axis = :vector, &block)
|
817
|
+
dispatch_to_axis_pl axis, :map, &block
|
818
|
+
end
|
819
|
+
|
820
|
+
# Destructive map. Modifies the DataFrame. Each run of the block
# must return a DaruLite::Vector. You can specify the axis to map over
# as the argument. Default to :vector.
#
# == Arguments
#
# * +axis+ - The axis to map over. Can be :vector (or :column) or :row.
#   Default to :vector.
#
# Raises ArgumentError for any other axis, the same way #any? and #all? do,
# instead of silently doing nothing and returning nil.
def map!(axis = :vector, &block)
  if %i[vector column].include?(axis)
    map_vectors!(&block)
  elsif axis == :row
    map_rows!(&block)
  else
    raise ArgumentError, "Unidentified axis #{axis}"
  end
end
|
835
|
+
|
836
|
+
# Maps over the DataFrame and returns a DataFrame. Each run of the
|
837
|
+
# block must return a DaruLite::Vector object. You can specify the axis
|
838
|
+
# to map over. Default to :vector.
|
839
|
+
#
|
840
|
+
# == Description
|
841
|
+
#
|
842
|
+
# Recode works similarly to #map, but an important difference between
|
843
|
+
# the two is that recode returns a modified DaruLite::DataFrame instead
|
844
|
+
# of an Array. For this reason, #recode expects that every run of the
|
845
|
+
# block to return a DaruLite::Vector.
|
846
|
+
#
|
847
|
+
# Just like map and each, recode also accepts an optional _axis_ argument.
|
848
|
+
#
|
849
|
+
# == Arguments
|
850
|
+
#
|
851
|
+
# * +axis+ - The axis to map over. Can be :vector (or :column) or :row.
|
852
|
+
# Default to :vector.
|
853
|
+
def recode(axis = :vector, &block)
|
854
|
+
dispatch_to_axis_pl axis, :recode, &block
|
855
|
+
end
|
856
|
+
|
857
|
+
# Retain vectors or rows if the block returns a truthy value.
|
858
|
+
#
|
859
|
+
# == Description
|
860
|
+
#
|
861
|
+
# For filtering out certain rows/vectors based on their values,
|
862
|
+
# use the #filter method. By default it iterates over vectors and
|
863
|
+
# keeps those vectors for which the block returns true. It accepts
|
864
|
+
# an optional axis argument which lets you specify whether you want
|
865
|
+
# to iterate over vectors or rows.
|
866
|
+
#
|
867
|
+
# == Arguments
|
868
|
+
#
|
869
|
+
# * +axis+ - The axis to map over. Can be :vector (or :column) or :row.
|
870
|
+
# Default to :vector.
|
871
|
+
#
|
872
|
+
# == Usage
|
873
|
+
#
|
874
|
+
# # Filter vectors
|
875
|
+
#
|
876
|
+
# df.filter do |vector|
|
877
|
+
# vector.type == :numeric and vector.median < 50
|
878
|
+
# end
|
879
|
+
#
|
880
|
+
# # Filter rows
|
881
|
+
#
|
882
|
+
# df.filter(:row) do |row|
|
883
|
+
# row[:a] + row[:d] < 100
|
884
|
+
# end
|
885
|
+
def filter(axis = :vector, &block)
|
886
|
+
dispatch_to_axis_pl axis, :filter, &block
|
887
|
+
end
|
888
|
+
|
889
|
+
def recode_vectors
|
890
|
+
block_given? or return to_enum(:recode_vectors)
|
891
|
+
|
892
|
+
dup.tap do |df|
|
893
|
+
df.each_vector_with_index do |v, i|
|
894
|
+
df[*i] = should_be_vector!(yield(v))
|
895
|
+
end
|
896
|
+
end
|
897
|
+
end
|
898
|
+
|
899
|
+
def recode_rows
|
900
|
+
block_given? or return to_enum(:recode_rows)
|
901
|
+
|
902
|
+
dup.tap do |df|
|
903
|
+
df.each_row_with_index do |r, i|
|
904
|
+
df.row[i] = should_be_vector!(yield(r))
|
905
|
+
end
|
906
|
+
end
|
907
|
+
end
|
908
|
+
|
909
|
+
# Map each vector and return an Array.
|
910
|
+
def map_vectors(&block)
|
911
|
+
return to_enum(:map_vectors) unless block
|
912
|
+
|
913
|
+
@data.map(&block)
|
914
|
+
end
|
915
|
+
|
916
|
+
# Destructive form of #map_vectors
|
917
|
+
def map_vectors!
|
918
|
+
return to_enum(:map_vectors!) unless block_given?
|
919
|
+
|
920
|
+
vectors.dup.each do |n|
|
921
|
+
self[n] = should_be_vector!(yield(self[n]))
|
922
|
+
end
|
923
|
+
|
924
|
+
self
|
925
|
+
end
|
926
|
+
|
927
|
+
# Map vectors along with the index.
|
928
|
+
def map_vectors_with_index(&block)
|
929
|
+
return to_enum(:map_vectors_with_index) unless block
|
930
|
+
|
931
|
+
each_vector_with_index.map(&block)
|
932
|
+
end
|
933
|
+
|
934
|
+
# Map each row
|
935
|
+
def map_rows(&block)
|
936
|
+
return to_enum(:map_rows) unless block
|
937
|
+
|
938
|
+
each_row.map(&block)
|
939
|
+
end
|
940
|
+
|
941
|
+
def map_rows_with_index(&block)
|
942
|
+
return to_enum(:map_rows_with_index) unless block
|
943
|
+
|
944
|
+
each_row_with_index.map(&block)
|
945
|
+
end
|
946
|
+
|
947
|
+
def map_rows!
|
948
|
+
return to_enum(:map_rows!) unless block_given?
|
949
|
+
|
950
|
+
index.dup.each do |i|
|
951
|
+
row[i] = should_be_vector!(yield(row[i]))
|
952
|
+
end
|
953
|
+
|
954
|
+
self
|
955
|
+
end
|
956
|
+
|
957
|
+
# Applies +method+ to this DataFrame, or to a sub-DataFrame when +keys+ is given.
#
# @param method [Symbol, Proc, Array<Symbol, Proc>] a Symbol is sent to the
#   DataFrame, a Proc is called with the DataFrame, and an Array applies each
#   of its elements in turn
# @param keys [Object, nil] when truthy, forwarded to get_sub_dataframe to
#   obtain the DataFrame to operate on; otherwise self is used
# @param by_position [Boolean] forwarded to get_sub_dataframe
# @return the result of the call, or an Array of results for an Array +method+
# @raise [RuntimeError] if +method+ is not a Symbol, Proc or Array
def apply_method(method, keys: nil, by_position: true)
  df = keys ? get_sub_dataframe(keys, by_position: by_position) : self

  case method
  when Symbol then df.send(method)
  when Proc then method.call(df)
  when Array then method.map(&:to_proc).map { |proc| proc.call(df) } # works with Array of both Symbol and/or Proc
  else
    # Still a RuntimeError, as with the former bare `raise`, but now it says why.
    raise "Unsupported method #{method.inspect} (#{method.class}): expected a Symbol, Proc or Array"
  end
end
|
967
|
+
alias apply_method_on_sub_df apply_method
|
968
|
+
|
969
|
+
# Retrieves a DaruLite::Vector, based on the result of calculation
|
970
|
+
# performed on each row.
|
971
|
+
def collect_rows(&block)
|
972
|
+
return to_enum(:collect_rows) unless block
|
973
|
+
|
974
|
+
DaruLite::Vector.new(each_row.map(&block), index: @index)
|
975
|
+
end
|
976
|
+
|
977
|
+
def collect_row_with_index(&block)
|
978
|
+
return to_enum(:collect_row_with_index) unless block
|
979
|
+
|
980
|
+
DaruLite::Vector.new(each_row_with_index.map(&block), index: @index)
|
981
|
+
end
|
982
|
+
|
983
|
+
# Retrieves a DaruLite::Vector, based on the result of calculation
|
984
|
+
# performed on each vector.
|
985
|
+
def collect_vectors(&block)
|
986
|
+
return to_enum(:collect_vectors) unless block
|
987
|
+
|
988
|
+
DaruLite::Vector.new(each_vector.map(&block), index: @vectors)
|
989
|
+
end
|
990
|
+
|
991
|
+
def collect_vector_with_index(&block)
|
992
|
+
return to_enum(:collect_vector_with_index) unless block
|
993
|
+
|
994
|
+
DaruLite::Vector.new(each_vector_with_index.map(&block), index: @vectors)
|
995
|
+
end
|
996
|
+
|
997
|
+
# Generate a matrix, based on vector names of the DataFrame.
|
998
|
+
#
|
999
|
+
# @return {::Matrix}
|
1000
|
+
# :nocov:
|
1001
|
+
# FIXME: Even not trying to cover this: I can't get, how it is expected
|
1002
|
+
# to work.... -- zverok
|
1003
|
+
def collect_matrix
|
1004
|
+
return to_enum(:collect_matrix) unless block_given?
|
1005
|
+
|
1006
|
+
vecs = vectors.to_a
|
1007
|
+
rows = vecs.collect do |row|
|
1008
|
+
vecs.collect do |col|
|
1009
|
+
yield row, col
|
1010
|
+
end
|
1011
|
+
end
|
1012
|
+
|
1013
|
+
Matrix.rows(rows)
|
1014
|
+
end
|
1015
|
+
# :nocov:
|
1016
|
+
|
1017
|
+
# Delete a vector
|
1018
|
+
def delete_vector(vector)
|
1019
|
+
raise IndexError, "Vector #{vector} does not exist." unless @vectors.include?(vector)
|
1020
|
+
|
1021
|
+
@data.delete_at @vectors[vector]
|
1022
|
+
@vectors = DaruLite::Index.new @vectors.to_a - [vector]
|
1023
|
+
|
1024
|
+
self
|
1025
|
+
end
|
1026
|
+
|
1027
|
+
# Deletes a list of vectors
|
1028
|
+
def delete_vectors(*vectors)
|
1029
|
+
Array(vectors).each { |vec| delete_vector vec }
|
1030
|
+
|
1031
|
+
self
|
1032
|
+
end
|
1033
|
+
|
1034
|
+
# Delete a row
|
1035
|
+
def delete_row(index)
|
1036
|
+
idx = named_index_for index
|
1037
|
+
|
1038
|
+
raise IndexError, "Index #{index} does not exist." unless @index.include? idx
|
1039
|
+
|
1040
|
+
@index = DaruLite::Index.new(@index.to_a - [idx])
|
1041
|
+
each_vector do |vector|
|
1042
|
+
vector.delete_at idx
|
1043
|
+
end
|
1044
|
+
|
1045
|
+
set_size
|
1046
|
+
end
|
1047
|
+
|
1048
|
+
# Creates a DataFrame of n rows drawn at random, with replacement, from the
# rows of this DataFrame.
# If n is not given, uses the original number of rows.
#
# @param n [Integer, nil] number of rows to draw
# @return {DaruLite::DataFrame}
# @raise [ArgumentError] if rows are requested from a DataFrame with no rows
def bootstrap(n = nil)
  total = nrows
  n ||= total
  raise ArgumentError, 'Cannot bootstrap rows from an empty DataFrame' if total.zero? && n.positive?

  DaruLite::DataFrame.new({}, order: @vectors).tap do |df_boot|
    n.times do
      # Draw the position from the rows that actually exist (0...nrows), not
      # from 0...n: otherwise n > nrows asks for rows that are not there and
      # n < nrows can never pick the later rows.
      df_boot.add_row(row_at(rand(total)))
    end
    df_boot.update
  end
end
|
1061
|
+
|
1062
|
+
def keep_row_if
|
1063
|
+
@index
|
1064
|
+
.reject { |idx| yield access_row(idx) }
|
1065
|
+
.each { |idx| delete_row idx }
|
1066
|
+
end
|
1067
|
+
|
1068
|
+
def keep_vector_if
|
1069
|
+
@vectors.each do |vector|
|
1070
|
+
delete_vector(vector) unless yield(@data[@vectors[vector]], vector)
|
1071
|
+
end
|
1072
|
+
end
|
1073
|
+
|
1074
|
+
# Creates a new vector with the data of a given field for the rows for which the block returns true
|
1075
|
+
def filter_vector(vec, &block)
|
1076
|
+
DaruLite::Vector.new(each_row.select(&block).map { |row| row[vec] })
|
1077
|
+
end
|
1078
|
+
|
1079
|
+
# Iterates over each row and retains it in a new DataFrame if the block returns
|
1080
|
+
# true for that row.
|
1081
|
+
def filter_rows
|
1082
|
+
return to_enum(:filter_rows) unless block_given?
|
1083
|
+
|
1084
|
+
keep_rows = @index.map { |index| yield access_row(index) }
|
1085
|
+
|
1086
|
+
where keep_rows
|
1087
|
+
end
|
1088
|
+
|
1089
|
+
# Iterates over each vector and retains it in a new DataFrame if the block returns
|
1090
|
+
# true for that vector.
|
1091
|
+
def filter_vectors(&block)
|
1092
|
+
return to_enum(:filter_vectors) unless block
|
1093
|
+
|
1094
|
+
dup.tap { |df| df.keep_vector_if(&block) }
|
1095
|
+
end
|
1096
|
+
|
1097
|
+
# Test each row with one or more tests.
|
1098
|
+
# @param tests [Proc] Each test is a Proc with the form
|
1099
|
+
# *Proc.new {|row| row[:age] > 0}*
|
1100
|
+
# The function returns an array with all errors.
|
1101
|
+
#
|
1102
|
+
# FIXME: description here is too sparse. As far as I can get,
|
1103
|
+
# it should tell something about that each test is [descr, fields, block],
|
1104
|
+
# and that first value may be column name to output. - zverok, 2016-05-18
|
1105
|
+
def verify(*tests)
|
1106
|
+
id = tests.first.is_a?(Symbol) ? tests.shift : @vectors.first
|
1107
|
+
|
1108
|
+
each_row_with_index.map do |row, i|
|
1109
|
+
tests.reject { |*_, block| block.call(row) }
|
1110
|
+
.map { |test| verify_error_message row, test, id, i }
|
1111
|
+
end.flatten
|
1112
|
+
end
|
1113
|
+
|
1114
|
+
# DSL for yielding each row and returning a DaruLite::Vector based on the
|
1115
|
+
# value each run of the block returns.
|
1116
|
+
#
|
1117
|
+
# == Usage
|
1118
|
+
#
|
1119
|
+
# a1 = DaruLite::Vector.new([1, 2, 3, 4, 5, 6, 7])
|
1120
|
+
# a2 = DaruLite::Vector.new([10, 20, 30, 40, 50, 60, 70])
|
1121
|
+
# a3 = DaruLite::Vector.new([100, 200, 300, 400, 500, 600, 700])
|
1122
|
+
# ds = DaruLite::DataFrame.new({ :a => a1, :b => a2, :c => a3 })
|
1123
|
+
# total = ds.vector_by_calculation { a + b + c }
|
1124
|
+
# # <DaruLite::Vector:82314050 @name = nil @size = 7 >
|
1125
|
+
# # nil
|
1126
|
+
# # 0 111
|
1127
|
+
# # 1 222
|
1128
|
+
# # 2 333
|
1129
|
+
# # 3 444
|
1130
|
+
# # 4 555
|
1131
|
+
# # 5 666
|
1132
|
+
# # 6 777
|
1133
|
+
def vector_by_calculation(&block)
|
1134
|
+
a = each_row.map { |r| r.instance_eval(&block) }
|
1135
|
+
|
1136
|
+
DaruLite::Vector.new a, index: @index
|
1137
|
+
end
|
1138
|
+
|
1139
|
+
# Reorder the vectors in a dataframe
|
1140
|
+
# @param [Array] order_array new order of the vectors
|
1141
|
+
# @example
|
1142
|
+
# df = DaruLite::DataFrame.new({
|
1143
|
+
# a: [1, 2, 3],
|
1144
|
+
# b: [4, 5, 6]
|
1145
|
+
# }, order: [:a, :b])
|
1146
|
+
# df.order = [:b, :a]
|
1147
|
+
# df
|
1148
|
+
# # => #<DaruLite::DataFrame(3x2)>
|
1149
|
+
# # b a
|
1150
|
+
# # 0 4 1
|
1151
|
+
# # 1 5 2
|
1152
|
+
# # 2 6 3
|
1153
|
+
def order=(order_array)
|
1154
|
+
raise ArgumentError, 'Invalid order' unless
|
1155
|
+
order_array.sort == vectors.to_a.sort
|
1156
|
+
|
1157
|
+
initialize(to_h, order: order_array)
|
1158
|
+
end
|
1159
|
+
|
1160
|
+
# Rotates the order of the vectors in place and returns the dataframe.
# The new order is vectors.to_a.rotate(count): with a positive count the
# vector at position count becomes the first vector of the dataframe; with a
# negative count vectors are moved from the end to the front.
# If there are fewer than two vectors, the dataframe is returned without any change.
#
# @param count [Integer] rotation passed to Array#rotate. Default -1, which
#   moves the last vector to the front.
# @return self
# @example
#   df = DaruLite::DataFrame.new({
#     a: [1, 2, 3],
#     b: [4, 5, 6],
#     total: [5, 7, 9],
#   })
#   df.rotate_vectors(-1)
#   df
#   # => #<DaruLite::DataFrame(3x3)>
#   #       total     a     b
#   #     0     5     1     4
#   #     1     7     2     5
#   #     2     9     3     6
def rotate_vectors(count = -1)
  # Plain size check: Enumerable#many? comes from ActiveSupport, not core Ruby.
  return self unless vectors.size > 1

  self.order = vectors.to_a.rotate(count)
  self
end
|
1183
|
+
|
1184
|
+
# Returns the result of evaluating a calculation in the context of this
# DataFrame, typically a vector computed from other vectors.
#
# The calculation is either a String of Ruby code or a block; both are run
# with instance_eval, so you can put any variable or expression valid in
# Ruby. When a block is given it takes precedence and +text+ is ignored.
#
# NOTE(security): a String is evaluated as arbitrary Ruby code. Never pass
# text that comes from an untrusted source.
#
# For example:
#   a = DaruLite::Vector.new [1,2]
#   b = DaruLite::Vector.new [3,4]
#   ds = DaruLite::DataFrame.new({:a => a,:b => b})
#   ds.compute("a+b")
#   => Vector [4,6]
#
# @param text [String, nil] Ruby source to evaluate; may be omitted when a
#   block is given
# @return the value of the evaluated String or block
# @raise [ArgumentError] if neither +text+ nor a block is given
def compute(text = nil, &block)
  return instance_eval(&block) if block
  raise ArgumentError, 'Either a calculation string or a block is required' if text.nil?

  instance_eval(text)
end
|
1201
|
+
|
1202
|
+
# Return a vector with the number of missing values in each row.
|
1203
|
+
#
|
1204
|
+
# == Arguments
|
1205
|
+
#
|
1206
|
+
# * +missing_values+ - An Array of the values that should be
|
1207
|
+
# treated as 'missing'. The default missing value is *nil*.
|
1208
|
+
def missing_values_rows(missing_values = [nil])
|
1209
|
+
number_of_missing = each_row.map do |row|
|
1210
|
+
row.indexes(*missing_values).size
|
1211
|
+
end
|
1212
|
+
|
1213
|
+
DaruLite::Vector.new number_of_missing, index: @index, name: "#{@name}_missing_rows"
|
1214
|
+
end
|
1215
|
+
|
1216
|
+
# TODO: remove next version
|
1217
|
+
alias vector_missing_values missing_values_rows
|
1218
|
+
|
1219
|
+
def has_missing_data?
|
1220
|
+
@data.any? { |vec| vec.include_values?(*DaruLite::MISSING_VALUES) }
|
1221
|
+
end
|
1222
|
+
alias flawed? has_missing_data?
|
1223
|
+
deprecate :has_missing_data?, :include_values?, 2016, 10
|
1224
|
+
deprecate :flawed?, :include_values?, 2016, 10
|
1225
|
+
|
1226
|
+
# Check if any of given values occur in the data frame
|
1227
|
+
# @param [Array] values to check for
|
1228
|
+
# @return [true, false] true if any of the given values occur in the
|
1229
|
+
# dataframe, false otherwise
|
1230
|
+
# @example
|
1231
|
+
# df = DaruLite::DataFrame.new({
|
1232
|
+
# a: [1, 2, 3, nil, Float::NAN, nil, 1, 7],
|
1233
|
+
# b: [:a, :b, nil, Float::NAN, nil, 3, 5, 8],
|
1234
|
+
# c: ['a', Float::NAN, 3, 4, 3, 5, nil, 7]
|
1235
|
+
# }, index: 11..18)
|
1236
|
+
# df.include_values? nil
|
1237
|
+
# # => true
|
1238
|
+
def include_values?(*values)
|
1239
|
+
@data.any? { |vec| vec.include_values?(*values) }
|
1240
|
+
end
|
1241
|
+
|
1242
|
+
# Return a nested hash using vector names as keys and an array constructed of
|
1243
|
+
# hashes with other values. If block provided, is used to provide the
|
1244
|
+
# values, with parameters +row+ of dataset, +current+ last hash on
|
1245
|
+
# hierarchy and +name+ of the key to include
|
1246
|
+
def nest(*tree_keys, &block)
|
1247
|
+
tree_keys = tree_keys[0] if tree_keys[0].is_a? Array
|
1248
|
+
|
1249
|
+
each_row.with_object({}) do |row, current|
|
1250
|
+
# Create tree
|
1251
|
+
*keys, last = tree_keys
|
1252
|
+
current = keys.inject(current) { |c, f| c[row[f]] ||= {} }
|
1253
|
+
name = row[last]
|
1254
|
+
|
1255
|
+
if block
|
1256
|
+
current[name] = yield(row, current, name)
|
1257
|
+
else
|
1258
|
+
current[name] ||= []
|
1259
|
+
current[name].push(row.to_h.delete_if { |key, _value| tree_keys.include? key })
|
1260
|
+
end
|
1261
|
+
end
|
1262
|
+
end
|
1263
|
+
|
1264
|
+
def vector_count_characters(vecs = nil)
|
1265
|
+
vecs ||= @vectors.to_a
|
1266
|
+
|
1267
|
+
collect_rows do |row|
|
1268
|
+
vecs.sum { |v| row[v].to_s.size }
|
1269
|
+
end
|
1270
|
+
end
|
1271
|
+
|
1272
|
+
def add_vectors_by_split(name, join = '-', sep = DaruLite::SPLIT_TOKEN)
|
1273
|
+
self[name]
|
1274
|
+
.split_by_separator(sep)
|
1275
|
+
.each { |k, v| self[:"#{name}#{join}#{k}"] = v }
|
1276
|
+
end
|
1277
|
+
|
1278
|
+
# Return the number of rows and columns of the DataFrame in an Array.
|
1279
|
+
def shape
|
1280
|
+
[nrows, ncols]
|
1281
|
+
end
|
1282
|
+
|
1283
|
+
# The number of rows
|
1284
|
+
def nrows
|
1285
|
+
@index.size
|
1286
|
+
end
|
1287
|
+
|
1288
|
+
# The number of vectors
|
1289
|
+
def ncols
|
1290
|
+
@vectors.size
|
1291
|
+
end
|
1292
|
+
|
1293
|
+
# Check if a vector is present
|
1294
|
+
def has_vector?(vector)
|
1295
|
+
@vectors.include? vector
|
1296
|
+
end
|
1297
|
+
|
1298
|
+
# Works like Array#any?.
#
# @param [Symbol] axis (:vector) The axis to iterate over. Can be :vector
#   (or :column) or :row. A DaruLite::Vector object is yielded in the block.
# @return [true, false] whether the block returned a truthy value for at
#   least one vector/row
# @raise [ArgumentError] if axis is not :vector, :column or :row
# @example Using any?
#   df = DaruLite::DataFrame.new({a: [1,2,3,4,5], b: ['a', 'b', 'c', 'd', 'e']})
#   df.any?(:row) do |row|
#     row[:a] < 3 and row[:b] == 'b'
#   end #=> true
def any?(axis = :vector, &block)
  if %i[vector column].include?(axis)
    @data.any?(&block)
  elsif axis == :row
    # Same shape as #all?: let Enumerable short-circuit over the row enumerator.
    each_row.any?(&block)
  else
    raise ArgumentError, "Unidentified axis #{axis}"
  end
end
|
1319
|
+
|
1320
|
+
# Works like Array#all?
|
1321
|
+
#
|
1322
|
+
# @param [Symbol] axis (:vector) The axis to iterate over. Can be :vector or
|
1323
|
+
# :row. A DaruLite::Vector object is yielded in the block.
|
1324
|
+
# @example Using all?
|
1325
|
+
# df = DaruLite::DataFrame.new({a: [1,2,3,4,5], b: ['a', 'b', 'c', 'd', 'e']})
|
1326
|
+
# df.all?(:row) do |row|
|
1327
|
+
# row[:a] < 10
|
1328
|
+
# end #=> true
|
1329
|
+
def all?(axis = :vector, &block)
|
1330
|
+
if %i[vector column].include?(axis)
|
1331
|
+
@data.all?(&block)
|
1332
|
+
elsif axis == :row
|
1333
|
+
each_row.all?(&block)
|
1334
|
+
else
|
1335
|
+
raise ArgumentError, "Unidentified axis #{axis}"
|
1336
|
+
end
|
1337
|
+
end
|
1338
|
+
|
1339
|
+
# The first ten elements of the DataFrame
|
1340
|
+
#
|
1341
|
+
# @param [Integer] quantity (10) The number of elements to display from the top.
|
1342
|
+
def head(quantity = 10)
|
1343
|
+
row.at 0..(quantity - 1)
|
1344
|
+
end
|
1345
|
+
|
1346
|
+
alias first head
|
1347
|
+
|
1348
|
+
# The last ten elements of the DataFrame
|
1349
|
+
#
|
1350
|
+
# @param [Integer] quantity (10) The number of elements to display from the bottom.
|
1351
|
+
def tail(quantity = 10)
|
1352
|
+
start = [-quantity, -size].max
|
1353
|
+
row.at start..-1
|
1354
|
+
end
|
1355
|
+
|
1356
|
+
alias last tail
|
1357
|
+
|
1358
|
+
# Sum all numeric/specified vectors in the DataFrame.
|
1359
|
+
#
|
1360
|
+
# Returns a new vector containing the sum of all numeric
|
1361
|
+
# or specified vectors of the DataFrame. By default, if the vector
|
1362
|
+
# contains a nil, the sum is nil.
|
1363
|
+
# With :skipnil argument set to true, nil values are assumed to be
|
1364
|
+
# 0 (zero) and the sum vector is returned.
|
1365
|
+
#
|
1366
|
+
# @param args [Array] List of vectors to sum. Default is nil in which case
|
1367
|
+
# all numeric vectors are summed.
|
1368
|
+
#
|
1369
|
+
# @option opts [Boolean] :skipnil Consider nils as 0. Default is false.
|
1370
|
+
#
|
1371
|
+
# @return Vector with sum of all vectors specified in the argument.
|
1372
|
+
# If vecs parameter is empty, sum all numeric vector.
|
1373
|
+
#
|
1374
|
+
# @example
|
1375
|
+
# df = DaruLite::DataFrame.new({
|
1376
|
+
# a: [1, 2, nil],
|
1377
|
+
# b: [2, 1, 3],
|
1378
|
+
# c: [1, 1, 1]
|
1379
|
+
# })
|
1380
|
+
# => #<DaruLite::DataFrame(3x3)>
|
1381
|
+
# a b c
|
1382
|
+
# 0 1 2 1
|
1383
|
+
# 1 2 1 1
|
1384
|
+
# 2 nil 3 1
|
1385
|
+
# df.vector_sum [:a, :c]
|
1386
|
+
# => #<DaruLite::Vector(3)>
|
1387
|
+
# 0 2
|
1388
|
+
# 1 3
|
1389
|
+
# 2 nil
|
1390
|
+
# df.vector_sum
|
1391
|
+
# => #<DaruLite::Vector(3)>
|
1392
|
+
# 0 4
|
1393
|
+
# 1 4
|
1394
|
+
# 2 nil
|
1395
|
+
# df.vector_sum skipnil: true
|
1396
|
+
# => #<DaruLite::Vector(3)>
|
1397
|
+
# c
|
1398
|
+
# 0 4
|
1399
|
+
# 1 4
|
1400
|
+
# 2 4
|
1401
|
+
#
|
1402
|
+
def vector_sum(*args)
|
1403
|
+
defaults = { vecs: nil, skipnil: false }
|
1404
|
+
options = args.last.is_a?(::Hash) ? args.pop : {}
|
1405
|
+
options = defaults.merge(options)
|
1406
|
+
vecs = args[0] || options[:vecs]
|
1407
|
+
skipnil = args[1] || options[:skipnil]
|
1408
|
+
|
1409
|
+
vecs ||= numeric_vectors
|
1410
|
+
sum = DaruLite::Vector.new [0] * @size, index: @index, name: @name, dtype: @dtype
|
1411
|
+
vecs.inject(sum) { |memo, n| self[n].add(memo, skipnil: skipnil) }
|
1412
|
+
end
|
1413
|
+
|
1414
|
+
# Calculate mean of the rows of the dataframe.
|
1415
|
+
#
|
1416
|
+
# == Arguments
|
1417
|
+
#
|
1418
|
+
# * +max_missing+ - The maximum number of elements in the row that can be
|
1419
|
+
# missing for the mean calculation to happen. Default to 0.
|
1420
|
+
def vector_mean(max_missing = 0)
|
1421
|
+
# FIXME: in vector_sum we preserve created vector dtype, but
|
1422
|
+
# here we are not. Is this by design or ...? - zverok, 2016-05-18
|
1423
|
+
mean_vec = DaruLite::Vector.new [0] * @size, index: @index, name: "mean_#{@name}"
|
1424
|
+
|
1425
|
+
each_row_with_index.with_object(mean_vec) do |(row, i), memo|
|
1426
|
+
memo[i] = row.indexes(*DaruLite::MISSING_VALUES).size > max_missing ? nil : row.mean
|
1427
|
+
end
|
1428
|
+
end
|
1429
|
+
|
1430
|
+
# Group elements by vector to perform operations on them. Returns a
|
1431
|
+
# DaruLite::Core::GroupBy object. See the DaruLite::Core::GroupBy docs for a detailed
|
1432
|
+
# list of possible operations.
|
1433
|
+
#
|
1434
|
+
# == Arguments
|
1435
|
+
#
|
1436
|
+
# * vectors - An Array containing names of vectors to group by.
|
1437
|
+
#
|
1438
|
+
# == Usage
|
1439
|
+
#
|
1440
|
+
# df = DaruLite::DataFrame.new({
|
1441
|
+
# a: %w{foo bar foo bar foo bar foo foo},
|
1442
|
+
# b: %w{one one two three two two one three},
|
1443
|
+
# c: [1 ,2 ,3 ,1 ,3 ,6 ,3 ,8],
|
1444
|
+
# d: [11 ,22 ,33 ,44 ,55 ,66 ,77 ,88]
|
1445
|
+
# })
|
1446
|
+
# df.group_by([:a,:b,:c]).groups
|
1447
|
+
# #=> {["bar", "one", 2]=>[1],
|
1448
|
+
# # ["bar", "three", 1]=>[3],
|
1449
|
+
# # ["bar", "two", 6]=>[5],
|
1450
|
+
# # ["foo", "one", 1]=>[0],
|
1451
|
+
# # ["foo", "one", 3]=>[6],
|
1452
|
+
# # ["foo", "three", 8]=>[7],
|
1453
|
+
# # ["foo", "two", 3]=>[2, 4]}
|
1454
|
+
def group_by(*vectors)
|
1455
|
+
vectors.flatten!
|
1456
|
+
missing = vectors - @vectors.to_a
|
1457
|
+
raise(ArgumentError, "Vector(s) missing: #{missing.join(', ')}") unless missing.empty?
|
1458
|
+
|
1459
|
+
vectors = [@vectors.first] if vectors.empty?
|
1460
|
+
|
1461
|
+
DaruLite::Core::GroupBy.new(self, vectors)
|
1462
|
+
end
|
1463
|
+
|
1464
|
+
def reindex_vectors(new_vectors)
|
1465
|
+
unless new_vectors.is_a?(DaruLite::Index)
|
1466
|
+
raise ArgumentError, 'Must pass the new index of type Index or its ' \
|
1467
|
+
"subclasses, not #{new_vectors.class}"
|
1468
|
+
end
|
1469
|
+
|
1470
|
+
cl = DaruLite::DataFrame.new({}, order: new_vectors, index: @index, name: @name)
|
1471
|
+
new_vectors.each_with_object(cl) do |vec, memo|
|
1472
|
+
memo[vec] = @vectors.include?(vec) ? self[vec] : Array.new(nrows)
|
1473
|
+
end
|
1474
|
+
end
|
1475
|
+
|
1476
|
+
def get_vector_anyways(v)
|
1477
|
+
@vectors.include?(v) ? self[v].to_a : Array.new(size)
|
1478
|
+
end
|
1479
|
+
|
1480
|
+
# Concatenate another DataFrame along corresponding columns.
|
1481
|
+
# If columns do not exist in both dataframes, they are filled with nils
|
1482
|
+
def concat(other_df)
|
1483
|
+
vectors = (@vectors.to_a + other_df.vectors.to_a).uniq
|
1484
|
+
|
1485
|
+
data = vectors.map do |v|
|
1486
|
+
get_vector_anyways(v).dup.concat(other_df.get_vector_anyways(v))
|
1487
|
+
end
|
1488
|
+
|
1489
|
+
DaruLite::DataFrame.new(data, order: vectors)
|
1490
|
+
end
|
1491
|
+
|
1492
|
+
# Concatenates another DataFrame as #concat.
|
1493
|
+
# Additionally it tries to preserve the index. If the indices contain
|
1494
|
+
# common elements, #union will overwrite the according rows in the
|
1495
|
+
# first dataframe.
|
1496
|
+
def union(other_df)
|
1497
|
+
index = (@index.to_a + other_df.index.to_a).uniq
|
1498
|
+
df = row[*(@index.to_a - other_df.index.to_a)]
|
1499
|
+
|
1500
|
+
df = df.concat(other_df)
|
1501
|
+
df.index = DaruLite::Index.new(index)
|
1502
|
+
df
|
1503
|
+
end
|
1504
|
+
|
1505
|
+
module SetSingleIndexStrategy
|
1506
|
+
def self.uniq_size(df, col)
|
1507
|
+
df[col].uniq.size
|
1508
|
+
end
|
1509
|
+
|
1510
|
+
def self.new_index(df, col)
|
1511
|
+
DaruLite::Index.new(df[col].to_a)
|
1512
|
+
end
|
1513
|
+
|
1514
|
+
def self.delete_vector(df, col)
|
1515
|
+
df.delete_vector(col)
|
1516
|
+
end
|
1517
|
+
end
|
1518
|
+
|
1519
|
+
module SetCategoricalIndexStrategy
|
1520
|
+
def self.new_index(df, col)
|
1521
|
+
DaruLite::CategoricalIndex.new(df[col].to_a)
|
1522
|
+
end
|
1523
|
+
|
1524
|
+
def self.delete_vector(df, col)
|
1525
|
+
df.delete_vector(col)
|
1526
|
+
end
|
1527
|
+
end
|
1528
|
+
|
1529
|
+
module SetMultiIndexStrategy
|
1530
|
+
def self.uniq_size(df, cols)
|
1531
|
+
df[*cols].uniq.size
|
1532
|
+
end
|
1533
|
+
|
1534
|
+
def self.new_index(df, cols)
|
1535
|
+
DaruLite::MultiIndex.from_arrays(df[*cols].map_vectors(&:to_a)).tap do |mi|
|
1536
|
+
mi.name = cols
|
1537
|
+
end
|
1538
|
+
end
|
1539
|
+
|
1540
|
+
def self.delete_vector(df, cols)
|
1541
|
+
df.delete_vectors(*cols)
|
1542
|
+
end
|
1543
|
+
end
|
1544
|
+
|
1545
|
+
# Set a particular column (or columns) as the new index of the DataFrame
|
1546
|
+
def set_index(new_index_col, keep: false, categorical: false)
|
1547
|
+
if categorical
|
1548
|
+
strategy = SetCategoricalIndexStrategy
|
1549
|
+
elsif new_index_col.respond_to?(:to_a)
|
1550
|
+
strategy = SetMultiIndexStrategy
|
1551
|
+
new_index_col = new_index_col.to_a
|
1552
|
+
else
|
1553
|
+
strategy = SetSingleIndexStrategy
|
1554
|
+
end
|
1555
|
+
|
1556
|
+
unless categorical
|
1557
|
+
uniq_size = strategy.uniq_size(self, new_index_col)
|
1558
|
+
raise ArgumentError, 'All elements in new index must be unique.' if @size != uniq_size
|
1559
|
+
end
|
1560
|
+
|
1561
|
+
self.index = strategy.new_index(self, new_index_col)
|
1562
|
+
strategy.delete_vector(self, new_index_col) unless keep
|
1563
|
+
self
|
1564
|
+
end
|
1565
|
+
|
1566
|
+
# Change the index of the DataFrame and preserve the labels of the previous
|
1567
|
+
# indexing. New index can be DaruLite::Index or any of its subclasses.
|
1568
|
+
#
|
1569
|
+
# @param [DaruLite::Index] new_index The new Index for reindexing the DataFrame.
|
1570
|
+
# @example Reindexing DataFrame
|
1571
|
+
# df = DaruLite::DataFrame.new({a: [1,2,3,4], b: [11,22,33,44]},
|
1572
|
+
# index: ['a','b','c','d'])
|
1573
|
+
# #=>
|
1574
|
+
# ##<DaruLite::DataFrame:83278130 @name = b19277b8-c548-41da-ad9a-2ad8c060e273 @size = 4>
|
1575
|
+
# # a b
|
1576
|
+
# # a 1 11
|
1577
|
+
# # b 2 22
|
1578
|
+
# # c 3 33
|
1579
|
+
# # d 4 44
|
1580
|
+
# df.reindex DaruLite::Index.new(['b', 0, 'a', 'g'])
|
1581
|
+
# #=>
|
1582
|
+
# ##<DaruLite::DataFrame:83177070 @name = b19277b8-c548-41da-ad9a-2ad8c060e273 @size = 4>
|
1583
|
+
# # a b
|
1584
|
+
# # b 2 22
|
1585
|
+
# # 0 nil nil
|
1586
|
+
# # a 1 11
|
1587
|
+
# # g nil nil
|
1588
|
+
def reindex(new_index)
|
1589
|
+
unless new_index.is_a?(DaruLite::Index)
|
1590
|
+
raise ArgumentError, 'Must pass the new index of type Index or its ' \
|
1591
|
+
"subclasses, not #{new_index.class}"
|
1592
|
+
end
|
1593
|
+
|
1594
|
+
cl = DaruLite::DataFrame.new({}, order: @vectors, index: new_index, name: @name)
|
1595
|
+
new_index.each_with_object(cl) do |idx, memo|
|
1596
|
+
memo.row[idx] = @index.include?(idx) ? row[idx] : Array.new(ncols)
|
1597
|
+
end
|
1598
|
+
end
|
1599
|
+
|
1600
|
+
def reset_index
|
1601
|
+
index_df = index.to_df
|
1602
|
+
names = index.name
|
1603
|
+
names = [names] unless names.instance_of?(Array)
|
1604
|
+
new_vectors = names + vectors.to_a
|
1605
|
+
self.index = index_df.index
|
1606
|
+
names.each do |name|
|
1607
|
+
self[name] = index_df[name]
|
1608
|
+
end
|
1609
|
+
self.order = new_vectors
|
1610
|
+
self
|
1611
|
+
end
|
1612
|
+
|
1613
|
+
# Reassign index with a new index of type DaruLite::Index or any of its subclasses.
|
1614
|
+
#
|
1615
|
+
# @param [DaruLite::Index] idx New index object on which the rows of the dataframe
|
1616
|
+
# are to be indexed.
|
1617
|
+
# @example Reassigning index of a DataFrame
|
1618
|
+
# df = DaruLite::DataFrame.new({a: [1,2,3,4], b: [11,22,33,44]})
|
1619
|
+
# df.index.to_a #=> [0,1,2,3]
|
1620
|
+
#
|
1621
|
+
# df.index = DaruLite::Index.new(['a','b','c','d'])
|
1622
|
+
# df.index.to_a #=> ['a','b','c','d']
|
1623
|
+
# df.row['a'].to_a #=> [1,11]
|
1624
|
+
# Coerces +idx+ into an Index, stores it, and points every vector at it.
def index=(idx)
  @index = Index.coerce(idx)
  @data.each do |vector|
    vector.index = @index
  end

  self
end
|
1630
|
+
|
1631
|
+
# Reassign vectors with a new index of type DaruLite::Index or any of its subclasses.
|
1632
|
+
#
|
1633
|
+
# @param new_index [DaruLite::Index] idx The new index object on which the vectors are to
|
1634
|
+
# be indexed. Must be of the same size as ncols.
|
1635
|
+
# @example Reassigning vectors of a DataFrame
|
1636
|
+
# df = DaruLite::DataFrame.new({a: [1,2,3,4], b: [:a,:b,:c,:d], c: [11,22,33,44]})
|
1637
|
+
# df.vectors.to_a #=> [:a, :b, :c]
|
1638
|
+
#
|
1639
|
+
# df.vectors = DaruLite::Index.new([:foo, :bar, :baz])
|
1640
|
+
# df.vectors.to_a #=> [:foo, :bar, :baz]
|
1641
|
+
# Replaces the vector (column) labels with +new_index+ and renames each
# underlying vector to its new label.
#
# @param new_index [DaruLite::Index] one label per column
# @raise [ArgumentError] if +new_index+ is not an Index, or its size differs from ncols
# @return [self]
def vectors=(new_index)
  raise ArgumentError, 'Can only reindex with Index and its subclasses' unless new_index.is_a?(DaruLite::Index)

  if new_index.size != ncols
    # Adjacent literals are concatenated verbatim, so the separating space
    # must be part of the first literal.
    raise ArgumentError, "Specified index length #{new_index.size} not equal to " \
                         "dataframe size #{ncols}"
  end

  @vectors = new_index
  @data.zip(new_index.to_a).each do |vect, name|
    vect.name = name
  end
  self
end
|
1655
|
+
|
1656
|
+
# Renames the vectors
|
1657
|
+
#
|
1658
|
+
# == Arguments
|
1659
|
+
#
|
1660
|
+
# * name_map - A hash where the keys are the existing vector names and
|
1661
|
+
# the values are the new names. If a vector is renamed
|
1662
|
+
# to a vector name that is already in use, the existing
|
1663
|
+
# one is overwritten.
|
1664
|
+
#
|
1665
|
+
# == Usage
|
1666
|
+
#
|
1667
|
+
# df = DaruLite::DataFrame.new({ a: [1,2,3,4], b: [:a,:b,:c,:d], c: [11,22,33,44] })
|
1668
|
+
# df.rename_vectors :a => :alpha, :c => :gamma
|
1669
|
+
# df.vectors.to_a #=> [:alpha, :b, :gamma]
|
1670
|
+
# Renames vectors according to +name_map+. Vectors whose name is the target
# of a (non-identity) rename are deleted first so they get overwritten.
def rename_vectors(name_map)
  effective_renames = name_map.reject { |from, to| from == to }
  clobbered = effective_renames.values & vectors.to_a
  delete_vectors(*clobbered)

  renamed = vectors.to_a.map do |vec_name|
    name_map[vec_name] || vec_name
  end
  self.vectors = DaruLite::Index.new(renamed)
end
|
1677
|
+
|
1678
|
+
# Renames the vectors and returns itself
|
1679
|
+
#
|
1680
|
+
# == Arguments
|
1681
|
+
#
|
1682
|
+
# * name_map - A hash where the keys are the existing vector names and
|
1683
|
+
# the values are the new names. If a vector is renamed
|
1684
|
+
# to a vector name that is already in use, the existing
|
1685
|
+
# one is overwritten.
|
1686
|
+
#
|
1687
|
+
# == Usage
|
1688
|
+
#
|
1689
|
+
# df = DaruLite::DataFrame.new({ a: [1,2,3,4], b: [:a,:b,:c,:d], c: [11,22,33,44] })
|
1690
|
+
# df.rename_vectors! :a => :alpha, :c => :gamma # df
|
1691
|
+
# Same as #rename_vectors, but always returns the dataframe itself.
def rename_vectors!(name_map)
  tap { rename_vectors(name_map) }
end
|
1695
|
+
|
1696
|
+
# Converts the vectors to a DaruLite::MultiIndex.
|
1697
|
+
# The argument passed is used as the MultiIndex's top level
|
1698
|
+
# Prefixes every vector label with +top_level_label+ and installs the result
# as a MultiIndex on the vectors.
def add_level_to_vectors(top_level_label)
  prefixed = vectors.map do |label|
    [top_level_label] + [*label]
  end
  self.vectors = DaruLite::MultiIndex.from_tuples(prefixed)
end
|
1702
|
+
|
1703
|
+
# Return the indexes of all the numeric vectors. Will include vectors with nils
|
1704
|
+
# along with numbers.
|
1705
|
+
# Collects the label of every vector that reports itself as numeric.
def numeric_vectors
  # FIXME: Why _with_index ?..
  each_vector_with_index.each_with_object([]) do |(vec, label), found|
    found << label if vec.numeric?
  end
end
|
1711
|
+
|
1712
|
+
# Names of the vectors whose data is numeric.
def numeric_vector_names
  @vectors.select do |vec_name|
    self[vec_name].numeric?
  end
end
|
1715
|
+
|
1716
|
+
# Return a DataFrame of only the numerical Vectors. If clone: false
|
1717
|
+
# is specified as option, only a *view* of the Vectors will be
|
1718
|
+
# returned. Defaults to clone: true.
|
1719
|
+
# Builds a DataFrame containing only the numeric vectors.
#
# @param opts [Hash] pass clone: false to hand the vectors to the new
#   DataFrame without cloning; any other value (or none) clones
# @return [DaruLite::DataFrame]
def only_numerics(opts = {})
  cln = opts[:clone] != false
  # numeric_vectors scans every vector, so evaluate it once and reuse it.
  names = numeric_vectors
  arry = names.map { |v| self[v] }

  DaruLite::DataFrame.new(arry, clone: cln, order: Index.new(names), index: @index)
end
|
1726
|
+
|
1727
|
+
# Generate a summary of this DataFrame based on individual vectors in the DataFrame
|
1728
|
+
# @return [String] String containing the summary of the DataFrame
|
1729
|
+
# Assembles a text report: the name, the row count, then each vector's own
# summary.
def summary
  report = "= #{name}"
  report << "\n Number of rows: #{nrows}"
  @vectors.each_with_object(report) do |vec_name, text|
    text << "\n Element:[#{vec_name}]\n" << self[vec_name].summary(1)
  end
end
|
1738
|
+
|
1739
|
+
# Sorts a dataframe (ascending/descending) in the given priority sequence of
|
1740
|
+
# vectors, with or without a block.
|
1741
|
+
#
|
1742
|
+
# @param vector_order [Array] The order of vector names in which the DataFrame
|
1743
|
+
# should be sorted.
|
1744
|
+
# @param opts [Hash] opts The options to sort with.
|
1745
|
+
# @option opts [TrueClass,FalseClass,Array] :ascending (true) Sort in ascending
|
1746
|
+
# or descending order. Specify Array corresponding to *order* for multiple
|
1747
|
+
# sort orders.
|
1748
|
+
# @option opts [Hash] :by (lambda{|a| a }) Specify attributes of objects to
|
1749
|
+
# to be used for sorting, for each vector name in *order* as a hash of
|
1750
|
+
# vector name and lambda expressions. In case a lambda for a vector is not
|
1751
|
+
# specified, the default will be used.
|
1752
|
+
# @option opts [TrueClass,FalseClass,Array] :handle_nils (false) Handle nils
|
1753
|
+
# automatically or not when a block is provided.
|
1754
|
+
# If set to True, nils will appear at top after sorting.
|
1755
|
+
#
|
1756
|
+
# @example Sort a dataframe with a vector sequence.
|
1757
|
+
#
|
1758
|
+
#
|
1759
|
+
# df = DaruLite::DataFrame.new({a: [1,2,1,2,3], b: [5,4,3,2,1]})
|
1760
|
+
#
|
1761
|
+
# df.sort [:a, :b]
|
1762
|
+
# # =>
|
1763
|
+
# # <DaruLite::DataFrame:30604000 @name = d6a9294e-2c09-418f-b646-aa9244653444 @size = 5>
|
1764
|
+
# # a b
|
1765
|
+
# # 2 1 3
|
1766
|
+
# # 0 1 5
|
1767
|
+
# # 3 2 2
|
1768
|
+
# # 1 2 4
|
1769
|
+
# # 4 3 1
|
1770
|
+
#
|
1771
|
+
# @example Sort a dataframe without a block. Here nils will be handled automatically.
|
1772
|
+
#
|
1773
|
+
# df = DaruLite::DataFrame.new({a: [-3,nil,-1,nil,5], b: [4,3,2,1,4]})
|
1774
|
+
#
|
1775
|
+
# df.sort([:a])
|
1776
|
+
# # =>
|
1777
|
+
# # <DaruLite::DataFrame:14810920 @name = c07fb5c7-2201-458d-b679-6a1f7ebfe49f @size = 5>
|
1778
|
+
# # a b
|
1779
|
+
# # 1 nil 3
|
1780
|
+
# # 3 nil 1
|
1781
|
+
# # 0 -3 4
|
1782
|
+
# # 2 -1 2
|
1783
|
+
# # 4 5 4
|
1784
|
+
#
|
1785
|
+
# @example Sort a dataframe with a block with nils handled automatically.
|
1786
|
+
#
|
1787
|
+
# df = DaruLite::DataFrame.new({a: [nil,-1,1,nil,-1,1], b: ['aaa','aa',nil,'baaa','x',nil] })
|
1788
|
+
#
|
1789
|
+
# df.sort [:b], by: {b: lambda { |a| a.length } }
|
1790
|
+
# # NoMethodError: undefined method `length' for nil:NilClass
|
1791
|
+
# # from (pry):8:in `block in __pry__'
|
1792
|
+
#
|
1793
|
+
# df.sort [:b], by: {b: lambda { |a| a.length } }, handle_nils: true
|
1794
|
+
#
|
1795
|
+
# # =>
|
1796
|
+
# # <DaruLite::DataFrame:28469540 @name = 5f986508-556f-468b-be0c-88cc3534445c @size = 6>
|
1797
|
+
# # a b
|
1798
|
+
# # 2 1 nil
|
1799
|
+
# # 5 1 nil
|
1800
|
+
# # 4 -1 x
|
1801
|
+
# # 1 -1 aa
|
1802
|
+
# # 0 nil aaa
|
1803
|
+
# # 3 nil baaa
|
1804
|
+
#
|
1805
|
+
# @example Sort a dataframe with a block with nils handled manually.
|
1806
|
+
#
|
1807
|
+
# df = DaruLite::DataFrame.new({a: [nil,-1,1,nil,-1,1], b: ['aaa','aa',nil,'baaa','x',nil] })
|
1808
|
+
#
|
1809
|
+
# # To print nils at the bottom one can use lambda { |a| (a.nil?)?[1]:[0,a.length] }
|
1810
|
+
# df.sort [:b], by: {b: lambda { |a| (a.nil?)?[1]:[0,a.length] } }, handle_nils: true
|
1811
|
+
#
|
1812
|
+
# # =>
|
1813
|
+
# #<DaruLite::DataFrame:22214180 @name = cd7703c7-1dca-4560-840b-5ea51a852ef9 @size = 6>
|
1814
|
+
# # a b
|
1815
|
+
# # 4 -1 x
|
1816
|
+
# # 1 -1 aa
|
1817
|
+
# # 0 nil aaa
|
1818
|
+
# # 3 nil baaa
|
1819
|
+
# # 2 1 nil
|
1820
|
+
# # 5 1 nil
|
1821
|
+
|
1822
|
+
# Sorts the dataframe in place by the given vectors.
#
# @param vector_order [Array] vector names, in sort priority order
# @param opts [Hash] sort options, forwarded to sort_prepare_block
# @raise [ArgumentError] if +vector_order+ is empty
# @return [self]
def sort!(vector_order, opts = {})
  raise ArgumentError, 'Required atleast one vector name' if vector_order.empty?

  # To enable sorting with categorical data,
  # map categories to integers preserving their order
  old = convert_categorical_vectors vector_order
  begin
    block = sort_prepare_block vector_order, opts
    order = @index.size.times.sort(&block)
    new_index = @index.reorder order
  ensure
    # Undo the integer mapping even when the comparison raises (e.g. a :by
    # lambda hitting nil), so a failed sort leaves the data untouched.
    restore_categorical_vectors old
  end

  @data.each do |vector|
    vector.reorder! order
  end

  self.index = new_index

  self
end
|
1844
|
+
|
1845
|
+
# Non-destructive version of #sort!
|
1846
|
+
# Sorts a duplicate of the dataframe, leaving the receiver as is.
def sort(vector_order, opts = {})
  copy = dup
  copy.sort!(vector_order, opts)
end
|
1849
|
+
|
1850
|
+
# Pivots a data frame on specified vectors and applies an aggregate function
|
1851
|
+
# to quickly generate a summary.
|
1852
|
+
#
|
1853
|
+
# == Options
|
1854
|
+
#
|
1855
|
+
# +:index+ - Keys to group by on the pivot table row index. Pass vector names
|
1856
|
+
# contained in an Array.
|
1857
|
+
#
|
1858
|
+
# +:vectors+ - Keys to group by on the pivot table column index. Pass vector
|
1859
|
+
# names contained in an Array.
|
1860
|
+
#
|
1861
|
+
# +:agg+ - Function to aggregate the grouped values. Default to *:mean*. Can
|
1862
|
+
# use any of the statistics functions applicable on Vectors that can be found in
|
1863
|
+
# the DaruLite::Statistics::Vector module.
|
1864
|
+
#
|
1865
|
+
# +:values+ - Columns to aggregate. Will consider all numeric columns not
|
1866
|
+
# specified in *:index* or *:vectors*. Optional.
|
1867
|
+
#
|
1868
|
+
# == Usage
|
1869
|
+
#
|
1870
|
+
# df = DaruLite::DataFrame.new({
|
1871
|
+
# a: ['foo' , 'foo', 'foo', 'foo', 'foo', 'bar', 'bar', 'bar', 'bar'],
|
1872
|
+
# b: ['one' , 'one', 'one', 'two', 'two', 'one', 'one', 'two', 'two'],
|
1873
|
+
# c: ['small','large','large','small','small','large','small','large','small'],
|
1874
|
+
# d: [1,2,2,3,3,4,5,6,7],
|
1875
|
+
# e: [2,4,4,6,6,8,10,12,14]
|
1876
|
+
# })
|
1877
|
+
# df.pivot_table(index: [:a], vectors: [:b], agg: :sum, values: :e)
|
1878
|
+
#
|
1879
|
+
# #=>
|
1880
|
+
# # #<DaruLite::DataFrame:88342020 @name = 08cdaf4e-b154-4186-9084-e76dd191b2c9 @size = 2>
|
1881
|
+
# # [:e, :one] [:e, :two]
|
1882
|
+
# # [:bar] 18 26
|
1883
|
+
# # [:foo] 10 12
|
1884
|
+
# Groups by opts[:index] and aggregates; when opts[:vectors] is given the
# result is additionally pivoted on those vectors.
def pivot_table(opts = {})
  raise ArgumentError, 'Specify grouping index' if Array(opts[:index]).empty?

  row_keys = opts[:index]
  column_keys = opts[:vectors] || []
  agg = opts[:agg] || :mean
  value_names = prepare_pivot_values(row_keys, column_keys, opts)
  raise IndexError, 'No numeric vectors to aggregate' if value_names.empty?

  groups = group_by(row_keys)
  # Without column keys a plain grouped aggregation is the whole answer.
  return groups.send(agg) if column_keys.empty?

  pivot_dataframe(make_pivot_hash(groups, column_keys, value_names, agg))
end
|
1900
|
+
|
1901
|
+
# Merge vectors from two DataFrames. In case of name collision,
|
1902
|
+
# the vectors names are changed to x_1, x_2 ....
|
1903
|
+
#
|
1904
|
+
# @return {DaruLite::DataFrame}
|
1905
|
+
# Places the vectors of +other_df+ next to this dataframe's vectors, row by
# row. Repeated vector names are recoded by ArrayHelper.recode_repeated.
def merge(other_df)
  if nrows != other_df.nrows
    raise ArgumentError,
          "Number of rows must be equal in this: #{nrows} and other: #{other_df.nrows}"
  end

  combined_names = ArrayHelper.recode_repeated(@vectors.to_a + other_df.vectors.to_a)
  merged = DataFrame.new({}, order: combined_names)
  nrows.times do |pos|
    merged.add_row(row[pos].to_a + other_df.row[pos].to_a)
  end
  # The index is carried over only when both sides agree on it.
  merged.index = @index if @index == other_df.index
  merged.update
  merged
end
|
1921
|
+
|
1922
|
+
# Join 2 DataFrames with SQL style joins. Currently supports inner, left
|
1923
|
+
# outer, right outer and full outer joins.
|
1924
|
+
#
|
1925
|
+
# @param [DaruLite::DataFrame] other_df Another DataFrame on which the join is
|
1926
|
+
# to be performed.
|
1927
|
+
# @param [Hash] opts Options Hash
|
1928
|
+
# @option :how [Symbol] Can be one of :inner, :left, :right or :outer.
|
1929
|
+
# @option :on [Array] The columns on which the join is to be performed.
|
1930
|
+
# Column names specified here must be common to both DataFrames.
|
1931
|
+
# @option :indicator [Symbol] The name of a vector to add to the resultant
|
1932
|
+
# dataframe that indicates whether the record was in the left (:left_only),
|
1933
|
+
# right (:right_only), or both (:both) joining dataframes.
|
1934
|
+
# @return [DaruLite::DataFrame]
|
1935
|
+
# @example Inner Join
|
1936
|
+
# left = DaruLite::DataFrame.new({
|
1937
|
+
# :id => [1,2,3,4],
|
1938
|
+
# :name => ['Pirate', 'Monkey', 'Ninja', 'Spaghetti']
|
1939
|
+
# })
|
1940
|
+
# right = DaruLite::DataFrame.new({
|
1941
|
+
# :id => [1,2,3,4],
|
1942
|
+
# :name => ['Rutabaga', 'Pirate', 'Darth Vader', 'Ninja']
|
1943
|
+
# })
|
1944
|
+
# left.join(right, how: :inner, on: [:name])
|
1945
|
+
# #=>
|
1946
|
+
# ##<DaruLite::DataFrame:82416700 @name = 74c0811b-76c6-4c42-ac93-e6458e82afb0 @size = 2>
|
1947
|
+
# # id_1 name id_2
|
1948
|
+
# # 0 1 Pirate 2
|
1949
|
+
# # 1 3 Ninja 4
|
1950
|
+
# Delegates the join to DaruLite::Core::Merge, with self as the left side.
def join(other_df, opts = {})
  left = self
  DaruLite::Core::Merge.join(left, other_df, opts)
end
|
1953
|
+
|
1954
|
+
# Creates a new dataset for one to many relations
|
1955
|
+
# on a dataset, based on pattern of field names.
|
1956
|
+
#
|
1957
|
+
# for example, you have a survey for number of children
|
1958
|
+
# with this structure:
|
1959
|
+
# id, name, child_name_1, child_age_1, child_name_2, child_age_2
|
1960
|
+
# with
|
1961
|
+
# ds.one_to_many([:id], "child_%v_%n"
|
1962
|
+
# the field of first parameters will be copied verbatim
|
1963
|
+
# to new dataset, and fields which responds to second
|
1964
|
+
# pattern will be added one case for each different %n.
|
1965
|
+
#
|
1966
|
+
# @example
|
1967
|
+
# cases=[
|
1968
|
+
# ['1','george','red',10,'blue',20,nil,nil],
|
1969
|
+
# ['2','fred','green',15,'orange',30,'white',20],
|
1970
|
+
# ['3','alfred',nil,nil,nil,nil,nil,nil]
|
1971
|
+
# ]
|
1972
|
+
# ds=DaruLite::DataFrame.rows(cases, order:
|
1973
|
+
# [:id, :name,
|
1974
|
+
# :car_color1, :car_value1,
|
1975
|
+
# :car_color2, :car_value2,
|
1976
|
+
# :car_color3, :car_value3])
|
1977
|
+
# ds.one_to_many([:id],'car_%v%n').to_matrix
|
1978
|
+
# #=> Matrix[
|
1979
|
+
# # ["red", "1", 10],
|
1980
|
+
# # ["blue", "1", 20],
|
1981
|
+
# # ["green", "2", 15],
|
1982
|
+
# # ["orange", "2", 30],
|
1983
|
+
# # ["white", "2", 20]
|
1984
|
+
# # ]
|
1985
|
+
# Expands numbered field groups matching +pattern+ into one row per number,
# copying +parent_fields+ onto every generated row. Groups whose generated
# values are all nil are skipped.
def one_to_many(parent_fields, pattern)
  vars, numbers = one_to_many_components(pattern)

  result = DataFrame.new([], order: [*parent_fields, '_col_id', *vars])
  each_row do |row|
    copied = parent_fields.to_h { |field| [field, row[field]] }
    numbers.each do |number|
      expanded = one_to_many_row(row, number, vars, pattern)
      next if expanded.values.all?(&:nil?)

      result.add_row(copied.merge(expanded).merge('_col_id' => number))
    end
  end
  result.update
  result
end
|
2001
|
+
|
2002
|
+
# Splits vector +nm+ by +sep+ and stores each resulting vector under the
# name "<nm><join><position>", with positions counted from 1.
def add_vectors_by_split_recode(nm, join = '-', sep = DaruLite::SPLIT_TOKEN)
  pieces = self[nm].split_by_separator(sep)
  pieces.each_with_index do |(key, vector), position|
    vector.rename "#{nm}:#{key}"
    self[:"#{nm}#{join}#{position + 1}"] = vector
  end
end
|
2010
|
+
|
2011
|
+
# Create an SQL statement, based on a given Dataset
|
2012
|
+
#
|
2013
|
+
# == Arguments
|
2014
|
+
#
|
2015
|
+
# * table - String specifying name of the table that will be created in SQL.
|
2016
|
+
# * charset - Character set. Default is "UTF8".
|
2017
|
+
#
|
2018
|
+
# @example
|
2019
|
+
#
|
2020
|
+
# ds = DaruLite::DataFrame.new({
|
2021
|
+
# :id => DaruLite::Vector.new([1,2,3,4,5]),
|
2022
|
+
# :name => DaruLite::Vector.new(%w{Alex Peter Susan Mary John})
|
2023
|
+
# })
|
2024
|
+
# ds.create_sql('names')
|
2025
|
+
# #=>"CREATE TABLE names (id INTEGER,\n name VARCHAR (255)) CHARACTER SET=UTF8;"
|
2026
|
+
#
|
2027
|
+
# Builds a CREATE TABLE statement with one column per vector, typed by each
# vector's db_type. +table+ and +charset+ are interpolated verbatim.
def create_sql(table, charset = 'UTF8')
  column_defs = vectors.to_a.map { |field| "#{field} #{self[field].db_type}" }
  "CREATE TABLE #{table} (#{column_defs.join(",\n ")}) CHARACTER SET=#{charset};"
end
|
2036
|
+
|
2037
|
+
# Returns the dataframe. This can be convenient when the user does not
|
2038
|
+
# know whether the object is a vector or a dataframe.
|
2039
|
+
# @return [self] the dataframe
|
2040
|
+
# Identity conversion: a DataFrame is already a DataFrame.
def to_df
  self
end
|
2043
|
+
|
2044
|
+
# Convert all vectors of type *:numeric* into a Matrix.
|
2045
|
+
# Builds a Matrix whose columns are the numeric vectors.
def to_matrix
  numeric_columns = each_vector.select(&:numeric?).map(&:to_a)
  Matrix.columns(numeric_columns)
end
|
2048
|
+
|
2049
|
+
# Converts the DataFrame into an array of hashes where key is vector name
|
2050
|
+
# and value is the corresponding element. The 0th index of the array contains
|
2051
|
+
# the array of hashes while the 1th index contains the indexes of each row
|
2052
|
+
# of the dataframe. Each element in the index array corresponds to its row
|
2053
|
+
# in the array of hashes, which has the same index.
|
2054
|
+
# Returns a two-element array: the rows as hashes, then the index labels.
def to_a
  row_hashes = each_row.map(&:to_h)
  [row_hashes, @index.to_a]
end
|
2057
|
+
|
2058
|
+
# Convert to json. If no_index is true (the default) then the index will NOT be included
|
2059
|
+
# in the JSON thus created.
|
2060
|
+
# Serializes to JSON. With no_index truthy only the row hashes are emitted;
# otherwise the whole #to_a pair (rows and index labels) is emitted.
def to_json(no_index = true)
  payload = no_index ? to_a[0] : to_a
  payload.to_json
end
|
2067
|
+
|
2068
|
+
# Converts DataFrame to a hash (explicit) with keys as vector names and values as
|
2069
|
+
# the corresponding vectors.
|
2070
|
+
# Maps each vector name to the vector stored at the same position in @data.
def to_h
  @vectors.each_with_index.to_h do |vec_name, position|
    [vec_name, @data[position]]
  end
end
|
2075
|
+
|
2076
|
+
# Convert to html for IRuby.
|
2077
|
+
# Renders the DataFrame as HTML by evaluating an ERB template against this
# method's binding. A MultiIndex index selects the "_mi" template variant.
def to_html(threshold = DaruLite.max_rows)
  # These locals stay in scope for the template evaluated via `binding`
  # below; presumably the template interpolates them (confirm in the .erb).
  table_thead = to_html_thead
  table_tbody = to_html_tbody(threshold)
  path = if index.is_a?(MultiIndex)
           File.expand_path('iruby/templates/dataframe_mi.html.erb', __dir__)
         else
           File.expand_path('iruby/templates/dataframe.html.erb', __dir__)
         end
  ERB.new(File.read(path).strip).result(binding)
end
|
2087
|
+
|
2088
|
+
# Renders the table header from an ERB template evaluated against this
# method's binding. A MultiIndex index selects the "_mi" template variant.
def to_html_thead
  table_thead_path =
    if index.is_a?(MultiIndex)
      File.expand_path('iruby/templates/dataframe_mi_thead.html.erb', __dir__)
    else
      File.expand_path('iruby/templates/dataframe_thead.html.erb', __dir__)
    end
  ERB.new(File.read(table_thead_path).strip).result(binding)
end
|
2097
|
+
|
2098
|
+
# Renders the table body from an ERB template evaluated against this
# method's binding. A MultiIndex index selects the "_mi" template variant.
def to_html_tbody(threshold = DaruLite.max_rows)
  # A nil threshold falls back to @size; the local stays visible to the
  # template through `binding` (presumably as the row limit; confirm in the .erb).
  threshold ||= @size
  table_tbody_path =
    if index.is_a?(MultiIndex)
      File.expand_path('iruby/templates/dataframe_mi_tbody.html.erb', __dir__)
    else
      File.expand_path('iruby/templates/dataframe_tbody.html.erb', __dir__)
    end
  ERB.new(File.read(table_tbody_path).strip).result(binding)
end
|
2108
|
+
|
2109
|
+
def to_s
|
2110
|
+
"#<#{self.class}#{": #{@name}" if @name}(#{nrows}x#{ncols})>"
|
2111
|
+
end
|
2112
|
+
|
2113
|
+
# Method for updating the metadata (i.e. missing value positions) of the
|
2114
|
+
# vectors after assignment/deletion etc. are complete. This is provided so that
|
2115
|
+
# time is not wasted in creating the metadata for the vector each time
|
2116
|
+
# assignment/deletion of elements is done. Updating data this way is called
|
2117
|
+
# lazy loading. To set or unset lazy loading, see the .lazy_update= method.
|
2118
|
+
# Calls #update on every vector, but only when DaruLite.lazy_update is set.
def update
  return unless DaruLite.lazy_update

  @data.each(&:update)
end
|
2121
|
+
|
2122
|
+
# Rename the DataFrame.
|
2123
|
+
# Sets the dataframe's name and returns the dataframe for chaining.
def rename(new_name)
  tap { @name = new_name }
end
|
2127
|
+
|
2128
|
+
alias name= rename
|
2129
|
+
|
2130
|
+
# Write this DataFrame to a CSV file.
|
2131
|
+
#
|
2132
|
+
# == Arguments
|
2133
|
+
#
|
2134
|
+
# * filename - Path of CSV file where the DataFrame is to be saved.
|
2135
|
+
#
|
2136
|
+
# == Options
|
2137
|
+
#
|
2138
|
+
# * convert_comma - If set to *true*, will convert any commas in any
|
2139
|
+
# of the data to full stops ('.').
|
2140
|
+
# All the options accepted by CSV.read() can also be passed into this
|
2141
|
+
# function.
|
2142
|
+
# Hands this dataframe to DaruLite::IO for CSV output.
def write_csv(filename, opts = {})
  DaruLite::IO.dataframe_write_csv(self, filename, opts)
end
|
2145
|
+
|
2146
|
+
# Write this dataframe to an Excel Spreadsheet
|
2147
|
+
#
|
2148
|
+
# == Arguments
|
2149
|
+
#
|
2150
|
+
# * filename - The path of the file where the DataFrame should be written.
|
2151
|
+
# Hands this dataframe to DaruLite::IO for Excel output.
def write_excel(filename, opts = {})
  DaruLite::IO.dataframe_write_excel(self, filename, opts)
end
|
2154
|
+
|
2155
|
+
# Insert each case of the Dataset on the selected table
|
2156
|
+
#
|
2157
|
+
# == Arguments
|
2158
|
+
#
|
2159
|
+
# * dbh - DBI database connection object.
|
2160
|
+
# * query - Query string.
|
2161
|
+
#
|
2162
|
+
# == Usage
|
2163
|
+
#
|
2164
|
+
# ds = DaruLite::DataFrame.new({:id=>DaruLite::Vector.new([1,2,3]), :name=>DaruLite::Vector.new(["a","b","c"])})
|
2165
|
+
# dbh = DBI.connect("DBI:Mysql:database:localhost", "user", "password")
|
2166
|
+
# ds.write_sql(dbh,"test")
|
2167
|
+
# Hands this dataframe to DaruLite::IO for insertion into +table+ via +dbh+.
def write_sql(dbh, table)
  DaruLite::IO.dataframe_write_sql(self, dbh, table)
end
|
2170
|
+
|
2171
|
+
# Use marshalling to save dataframe to a file.
|
2172
|
+
# Persists the dataframe to +filename+ through DaruLite::IO.save.
def save(filename)
  DaruLite::IO.save(self, filename)
end
|
2175
|
+
|
2176
|
+
def _dump(_depth)
|
2177
|
+
Marshal.dump(
|
2178
|
+
data: @data,
|
2179
|
+
index: @index.to_a,
|
2180
|
+
order: @vectors.to_a,
|
2181
|
+
name: @name
|
2182
|
+
)
|
2183
|
+
end
|
2184
|
+
|
2185
|
+
# Marshal hook: rebuilds a DataFrame from the hash written by #_dump.
# NOTE(review): Marshal.load must only ever be fed trusted data.
def self._load(data)
  state = Marshal.load(data)
  DaruLite::DataFrame.new(
    state[:data],
    index: state[:index],
    order: state[:order],
    name: state[:name]
  )
end
|
2192
|
+
|
2193
|
+
# Transpose a DataFrame, transposing elements and row, column indexing.
|
2194
|
+
# Returns a new DataFrame with rows and vectors swapped: the vector labels
# become the index and the index becomes the vector order.
def transpose
  flipped = each_vector.map(&:to_a).transpose
  DaruLite::DataFrame.new(flipped, index: @vectors, order: @index, dtype: @dtype, name: @name)
end
|
2203
|
+
|
2204
|
+
# Pretty print in a nice table format for the command line (irb/pry/iruby)
|
2205
|
+
# Console representation: a title line followed by the formatted table.
# Column spacing is widened to fit the longest header.
def inspect(spacing = DaruLite.spacing, threshold = DaruLite.max_rows)
  name_part = @name ? ": #{@name} " : ''
  widest_header = headers.to_a.map(&:length).max
  column_width = [widest_header, spacing].max
  title = "#<#{self.class}#{name_part}(#{nrows}x#{ncols})>#{$INPUT_RECORD_SEPARATOR}"

  title + Formatters::Table.format(
    each_row.lazy,
    row_headers: row_headers,
    headers: headers,
    threshold: threshold,
    spacing: column_width
  )
end
|
2218
|
+
|
2219
|
+
# Query a DataFrame by passing a DaruLite::Core::Query::BoolArray object.
|
2220
|
+
# Filters the dataframe through DaruLite::Core::Query.df_where using +bool_array+.
def where(bool_array)
  DaruLite::Core::Query.df_where(self, bool_array)
end
|
2223
|
+
|
2224
|
+
# Two dataframes are equal when class, size, index, vector labels and every
# vector match. Checks short-circuit in that order.
def ==(other)
  same_shape = self.class == other.class && @size == other.size
  same_labels = same_shape && @index == other.index && @vectors == other.vectors
  same_labels && @vectors.to_a.all? { |vec_name| self[vec_name] == other[vec_name] }
end
|
2231
|
+
|
2232
|
+
# Converts the specified non category type vectors to category type vectors
|
2233
|
+
# @param [Array] names of non category type vectors to be converted
|
2234
|
+
# @return [DaruLite::DataFrame] data frame in which specified vectors have been
|
2235
|
+
# converted to category type
|
2236
|
+
# @example
|
2237
|
+
# df = DaruLite::DataFrame.new({
|
2238
|
+
# a: [1, 2, 3],
|
2239
|
+
# b: ['a', 'a', 'b']
|
2240
|
+
# })
|
2241
|
+
# df.to_category :b
|
2242
|
+
# df[:b].type
|
2243
|
+
# # => :category
|
2244
|
+
# Replaces each named vector with its category-typed conversion, in place.
def to_category(*names)
  names.each do |vec_name|
    self[vec_name] = self[vec_name].to_category
  end
  self
end
|
2248
|
+
|
2249
|
+
# Dynamic vector access: `df.foo = v` inserts or modifies vector foo, and
# `df.foo` reads it (by symbol first, then by string name). Anything else
# goes to super.
def method_missing(name, *args, &block)
  if /(.+)=/.match?(name)
    target = name[/(.+)=/].delete('=')
    # Prefer an existing string-named vector; otherwise use a symbol.
    target = target.to_sym unless has_vector?(target)
    return insert_or_modify_vector([target], args[0])
  end

  return self[name] if has_vector?(name)
  return self[name.to_s] if has_vector?(name.to_s)

  super
end
|
2262
|
+
|
2263
|
+
# Mirrors #method_missing: setter-style names are accepted, as are names
# matching a vector either as given or as a string.
#
# @param name [Symbol] the method name being probed
# @param include_private [Boolean] forwarded to super
# @return [Boolean]
def respond_to_missing?(name, include_private = false)
  # method_missing also resolves string-named vectors, so respond_to? must
  # report them too.
  name.to_s.end_with?('=') || has_vector?(name) || has_vector?(name.to_s) || super
end
|
2266
|
+
|
2267
|
+
# Contrast-codes each named vector (pairing it with the matching +full+
# flag) and builds a DataFrame from the products across those codings.
def interact_code(vector_names, full)
  codings = vector_names.zip(full).map do |vec_name, use_full|
    self[vec_name].contrast_code(full: use_full).each.to_a
  end

  products = recursive_product(codings)
  DaruLite::DataFrame.new(products, order: products.map(&:name))
end
|
2276
|
+
|
2277
|
+
# Split the dataframe into many dataframes based on category vector
|
2278
|
+
# @param [object] cat_name name of category vector to split the dataframe
|
2279
|
+
# @return [Array] array of dataframes split by category with category vector
|
2280
|
+
# used to split not included
|
2281
|
+
# @example
|
2282
|
+
# df = DaruLite::DataFrame.new({
|
2283
|
+
# a: [1, 2, 3],
|
2284
|
+
# b: ['a', 'a', 'b']
|
2285
|
+
# })
|
2286
|
+
# df.to_category :b
|
2287
|
+
# df.split_by_category :b
|
2288
|
+
# # => [#<DaruLite::DataFrame: a (2x1)>
|
2289
|
+
# # a
|
2290
|
+
# # 0 1
|
2291
|
+
# # 1 2,
|
2292
|
+
# # #<DaruLite::DataFrame: b (1x1)>
|
2293
|
+
# # a
|
2294
|
+
# # 2 3]
|
2295
|
+
# Produces one dataframe per category of +cat_name+, each named after its
# category and with the category vector itself removed.
def split_by_category(cat_name)
  cat_dv = self[cat_name]
  unless cat_dv.category?
    raise ArgumentError, "#{cat_name} is not a category vector"
  end

  cat_dv.categories.map do |cat|
    subset = where(cat_dv.eq(cat))
    subset.rename(cat).delete_vector(cat_name)
  end
end
|
2306
|
+
|
2307
|
+
# @param indexes [Array] index(s) at which row tuples are retrieved
|
2308
|
+
# @return [Array] returns array of row tuples at given index(s)
|
2309
|
+
# @example Using DaruLite::Index
|
2310
|
+
# df = DaruLite::DataFrame.new({
|
2311
|
+
# a: [1, 2, 3],
|
2312
|
+
# b: ['a', 'a', 'b']
|
2313
|
+
# })
|
2314
|
+
#
|
2315
|
+
# df.access_row_tuples_by_indexs(1,2)
|
2316
|
+
# # => [[2, "a"], [3, "b"]]
|
2317
|
+
#
|
2318
|
+
# df.index = DaruLite::Index.new([:one,:two,:three])
|
2319
|
+
# df.access_row_tuples_by_indexs(:one,:three)
|
2320
|
+
# # => [[1, "a"], [3, "b"]]
|
2321
|
+
#
|
2322
|
+
# @example Using DaruLite::MultiIndex
|
2323
|
+
# mi_idx = DaruLite::MultiIndex.from_tuples [
|
2324
|
+
# [:a,:one,:bar],
|
2325
|
+
# [:a,:one,:baz],
|
2326
|
+
# [:b,:two,:bar],
|
2327
|
+
# [:a,:two,:baz],
|
2328
|
+
# ]
|
2329
|
+
# df_mi = DaruLite::DataFrame.new({
|
2330
|
+
# a: 1..4,
|
2331
|
+
# b: 'a'..'d'
|
2332
|
+
# }, index: mi_idx )
|
2333
|
+
#
|
2334
|
+
# df_mi.access_row_tuples_by_indexs(:b, :two, :bar)
|
2335
|
+
# # => [[3, "c"]]
|
2336
|
+
# df_mi.access_row_tuples_by_indexs(:a)
|
2337
|
+
# # => [[1, "a"], [2, "b"], [4, "d"]]
|
2338
|
+
# Returns row tuples for the given index label(s). A MultiIndex goes through
# a sub-dataframe lookup; otherwise the rows are fetched by position.
def access_row_tuples_by_indexs(*indexes)
  if @index.is_a?(DaruLite::MultiIndex)
    return get_sub_dataframe(indexes, by_position: false).map_rows(&:to_a)
  end

  positions = @index.pos(*indexes)
  unless positions.is_a? Numeric
    new_rows = get_rows_for(indexes, by_position: false)
    return indexes.map { |index| new_rows.map { |r| r[index] } }
  end

  # Single position: make sure the result is an array of tuples.
  found = get_rows_for([positions])
  found.first.is_a?(Array) ? found : [found]
end
|
2351
|
+
|
2352
|
+
# Function to use for aggregating the data.
|
2353
|
+
#
|
2354
|
+
# @param options [Hash] options for column, you want in resultant dataframe
|
2355
|
+
#
|
2356
|
+
# @return [DaruLite::DataFrame]
|
2357
|
+
#
|
2358
|
+
# @example
|
2359
|
+
# df = DaruLite::DataFrame.new(
|
2360
|
+
# {col: [:a, :b, :c, :d, :e], num: [52,12,07,17,01]})
|
2361
|
+
# => #<DaruLite::DataFrame(5x2)>
|
2362
|
+
# col num
|
2363
|
+
# 0 a 52
|
2364
|
+
# 1 b 12
|
2365
|
+
# 2 c 7
|
2366
|
+
# 3 d 17
|
2367
|
+
# 4 e 1
|
2368
|
+
#
|
2369
|
+
# df.aggregate(num_100_times: ->(df) { (df.num*100).first })
|
2370
|
+
# => #<DaruLite::DataFrame(5x1)>
|
2371
|
+
# num_100_ti
|
2372
|
+
# 0 5200
|
2373
|
+
# 1 1200
|
2374
|
+
# 2 700
|
2375
|
+
# 3 1700
|
2376
|
+
# 4 100
|
2377
|
+
#
|
2378
|
+
# When we have duplicate index :
|
2379
|
+
#
|
2380
|
+
# idx = DaruLite::CategoricalIndex.new [:a, :b, :a, :a, :c]
|
2381
|
+
# df = DaruLite::DataFrame.new({num: [52,12,07,17,01]}, index: idx)
|
2382
|
+
# => #<DaruLite::DataFrame(5x1)>
|
2383
|
+
# num
|
2384
|
+
# a 52
|
2385
|
+
# b 12
|
2386
|
+
# a 7
|
2387
|
+
# a 17
|
2388
|
+
# c 1
|
2389
|
+
#
|
2390
|
+
# df.aggregate(num: :mean)
|
2391
|
+
# => #<DaruLite::DataFrame(3x1)>
|
2392
|
+
# num
|
2393
|
+
# a 25.3333333
|
2394
|
+
# b 12
|
2395
|
+
# c 1
|
2396
|
+
#
|
2397
|
+
# Note: `GroupBy` class `aggregate` method uses this `aggregate` method
|
2398
|
+
# internally.
|
2399
|
+
# Aggregates columns per +options+. The row grouping comes from the block
# when one is given (private use), else from group_index_for_aggregation.
def aggregate(options = {}, multi_index_level = -1)
  positions_tuples, new_index =
    if block_given?
      yield(@index) # NOTE: use of yield is private for now
    else
      group_index_for_aggregation(@index, multi_index_level)
    end

  aggregated = aggregate_by_positions_tuples(options, positions_tuples)
  DaruLite::DataFrame.new(aggregated, index: new_index, order: options.keys)
end
|
2410
|
+
|
2411
|
+
# Shortcut for group_by(...) followed by aggregate(...) on the groups.
def group_by_and_aggregate(*group_by_keys, **aggregation_map)
  grouped = group_by(*group_by_keys)
  grouped.aggregate(aggregation_map)
end
|
2414
|
+
|
2415
|
+
private
|
2416
|
+
|
2417
|
+
# Column headers: the index name(s) followed by the vector names.
def headers
  labels = Array(index.name) + @vectors.to_a
  DaruLite::Index.new(labels)
end
|
2420
|
+
|
2421
|
+
# Row labels: sparse tuples for a MultiIndex, plain labels otherwise.
def row_headers
  return index.sparse_tuples if index.is_a?(MultiIndex)

  index.to_a
end
|
2424
|
+
|
2425
|
+
# Swaps each categorical vector in +names+ for a plain vector of its integer
# codes and returns [name, original_vector] pairs for later restoration.
def convert_categorical_vectors(names)
  names.each_with_object([]) do |vec_name, saved|
    next unless self[vec_name].category?

    saved << [vec_name, self[vec_name]]
    self[vec_name] = DaruLite::Vector.new(self[vec_name].to_ints)
  end
end
|
2434
|
+
|
2435
|
+
# Puts back the vectors saved by convert_categorical_vectors.
def restore_categorical_vectors(old)
  old.each do |(vec_name, original)|
    self[vec_name] = original
  end
end
|
2438
|
+
|
2439
|
+
# Multiplies every combination of vectors drawn one from each list in +dfs+,
# naming each product "<left name>:<right name>".
#
# @param dfs [Array<Array>] lists of vector-like objects (respond to *, name, rename)
# @return [Array] the product vectors; for a single list, that list itself
def recursive_product(dfs)
  return dfs.first if dfs.size == 1

  # Destructure rather than shift, so the caller's array is left unmodified.
  left, *rest = dfs
  right = recursive_product(rest)
  left.product(right).map do |dv1, dv2|
    (dv1 * dv2).rename "#{dv1.name}:#{dv2.name}"
  end
end
|
2449
|
+
|
2450
|
+
# Passes +val+ through if it is a DaruLite::Vector; raises TypeError otherwise.
def should_be_vector!(val)
  unless val.is_a?(DaruLite::Vector)
    raise TypeError, "Every iteration must return DaruLite::Vector not #{val.class}"
  end

  val
end
|
2455
|
+
|
2456
|
+
# Forwards to the singular axis-specific variant of +method+:
# "<method>_vector" for :vector / :column, "<method>_row" for :row.
#
# @raise [ArgumentError] for any other axis
def dispatch_to_axis(axis, method, *args, &block)
  suffix =
    case axis
    when :vector, :column then 'vector'
    when :row then 'row'
    else raise ArgumentError, "Unknown axis #{axis}"
    end

  send(:"#{method}_#{suffix}", *args, &block)
end
|
2465
|
+
|
2466
|
+
# Forwards to the plural axis-specific variant of +method+:
# "<method>_vectors" for :vector / :column, "<method>_rows" for :row.
#
# @raise [ArgumentError] for any other axis
def dispatch_to_axis_pl(axis, method, *args, &block)
  return send(:"#{method}_vectors", *args, &block) if %i[vector column].include?(axis)
  return send(:"#{method}_rows", *args, &block) if axis == :row

  raise ArgumentError, "Unknown axis #{axis}"
end
|
2475
|
+
|
2476
|
+
# Axis names recognised as a trailing argument by #extract_axis.
AXES = %i[row vector].freeze

# Removes and returns a trailing axis symbol from +names+; when the last
# element is not an axis, +names+ is left alone and +default+ is returned.
def extract_axis(names, default = :vector)
  return default unless AXES.include?(names.last)

  names.pop
end
|
2485
|
+
|
2486
|
+
# Looks up one or more vectors. A leading Range selects a subset of the
# vector index; otherwise the lookup is delegated according to whether the
# vector index is a MultiIndex.
def access_vector(*names)
  first = names.first
  return dup(@vectors.subset(first)) if first.is_a?(Range)
  return access_vector_multi_index(*names) if @vectors.is_a?(MultiIndex)

  access_vector_single_index(*names)
end
|
2495
|
+
|
2496
|
+
# Vector lookup for a MultiIndex of vectors. A lookup that resolves to a
# single Integer position returns that vector; otherwise a DataFrame of the
# matching vectors is built, dropping the levels already fixed by +names+
# when only part of the tuple was given.
def access_vector_multi_index(*names)
  located = @vectors[names]
  return @data[located] if located.is_a?(Integer)

  selected = located.map { |tuple| @data[@vectors[tuple]] }
  order = names.size < @vectors.width ? located.drop_left_level(names.size) : located

  DaruLite::DataFrame.new(selected, index: @index, order: order)
end
|
2507
|
+
|
2508
|
+
# Vector lookup for a single-level vector index. With fewer than two names
# the name is resolved to a position (a numeric position returns that
# vector directly); a non-numeric result, or several names, produces a
# DataFrame holding the selected vectors.
#
# @raise [IndexError] when a single requested vector does not exist
def access_vector_single_index(*names)
  if names.count < 2
    requested = names.first
    begin
      pos =
        if @vectors.is_a?(DaruLite::DateTimeIndex)
          @vectors[requested]
        else
          @vectors.pos(requested)
        end
    rescue IndexError
      raise IndexError, "Specified vector #{requested} does not exist"
    end
    return @data[pos] if pos.is_a?(Numeric)

    # The lookup matched several vectors: carry on with what it returned.
    names = pos
  end

  new_vectors = names.map { |name| [name, @data[@vectors.pos(name)]] }.to_h
  order = names.is_a?(Array) ? DaruLite::Index.new(names) : names

  DaruLite::DataFrame.new(new_vectors, order: order, index: @index, name: @name)
end
|
2525
|
+
|
2526
|
+
# Row lookup by index label(s). A lookup that resolves to one numeric
# position returns the row as a Vector named after the first label;
# otherwise a DataFrame of the matching rows is returned.
def access_row(*indexes)
  positions = @index.pos(*indexes)

  unless positions.is_a?(Numeric)
    new_rows = get_rows_for(indexes, by_position: false)
    return DaruLite::DataFrame.new(new_rows, index: @index.subset(*indexes), order: @vectors)
  end

  row = get_rows_for([positions])
  DaruLite::Vector.new(row, index: @vectors, name: indexes.first)
end
|
2537
|
+
|
2538
|
+
# Fetches the requested rows from every vector in @data.
#
# @param keys [Array] an array of positions (if by_position is true) or of
#   indexes (if by_position is false)
# @param by_position [Boolean] whether +keys+ holds positions or indexes
# @return [Array] because of coercion by DaruLite::Vector#at and
#   DaruLite::Vector#[], either an Array of values (representing a row) or
#   an Array of Vectors (that can be seen as rows)
# @raise [RuntimeError] when +keys+ is not an Array
def get_rows_for(keys, by_position: true)
  # Same exception class as the former bare `raise`, but with a message
  # that says what went wrong.
  raise "keys must be an Array, got #{keys.class}" unless keys.is_a?(Array)

  if by_position
    @data.map { |vector| vector.at(*keys) }
  else
    # TODO: for now (2018-07-27), it is different than using
    # get_rows_for(@index.pos(*keys))
    # because DaruLite::Vector#at and DaruLite::Vector#[] don't handle DaruLite::MultiIndex the same way
    @data.map { |vec| vec[*keys] }
  end
end
|
2555
|
+
|
2556
|
+
# Inserts or replaces the vector called +name+. Unless the vector index is
# a MultiIndex only the first element of +name+ is used. An empty row
# index takes the dedicated empty-frame path; otherwise the value is
# aligned to the frame before being stored.
def insert_or_modify_vector(name, vector)
  name = name[0] unless @vectors.is_a?(MultiIndex)
  return insert_vector_in_empty(name, vector) if @index.empty?

  assign_or_add_vector(name, prepare_for_insert(name, vector))
end
|
2567
|
+
|
2568
|
+
# Stores +v+ under +name+, overwriting one or several existing vectors or
# appending a new one.
def assign_or_add_vector(name, v)
  # FIXME: fix this workaround. need to make changes in Indexing itself.
  pos =
    begin
      @vectors[name]
    rescue IndexError
      # Unknown label: fall back to the name itself.
      name
    end

  return assign_multiple_vectors(pos, v) if pos.is_a?(DaruLite::Index)

  direct_slot = pos == name &&
                (@vectors.include?(name) || (pos.is_a?(Integer) && pos < @data.size))
  return assign_or_add_vector_rough(name, v) unless direct_slot

  @data[pos] = v
end
|
2586
|
+
|
2587
|
+
# Stores the same vector +v+ at the position of every label in +pos+.
def assign_multiple_vectors(pos, v)
  pos.each { |label| @data[@vectors[label]] = v }
end
|
2592
|
+
|
2593
|
+
# Registers +name+ in the vector index when it is missing, then stores +v+
# at the position the index reports for it.
def assign_or_add_vector_rough(name, v)
  @vectors = @vectors | [name] unless @vectors.include?(name)

  position = @vectors[name]
  @data[position] = v
end
|
2597
|
+
|
2598
|
+
# Inserts the first vector into a frame whose row index is empty: the new
# vector's index becomes the frame index, the size is refreshed and every
# still-empty vector is reindexed to it.
def insert_vector_in_empty(name, vector)
  vec = Vector.coerce(vector.to_a, name: coerce_name(name))

  @index = vec.index
  assign_or_add_vector(name, vec)
  set_size

  @data.map! do |existing|
    next existing unless existing.empty?

    existing.reindex(@index)
  end
end
|
2607
|
+
|
2608
|
+
# Picks the preparation routine for +arg+: vectors first, then anything
# that responds to +to_a+, and finally single values.
def prepare_for_insert(name, arg)
  return prepare_vector_for_insert(name, arg) if arg.is_a?(DaruLite::Vector)
  return prepare_enum_for_insert(name, arg) if arg.respond_to?(:to_a)

  prepare_value_for_insert(name, arg)
end
|
2617
|
+
|
2618
|
+
# Aligns +vector+ with the frame's row index. A vector that already shares
# the index is simply duplicated, so that index-by-index assignment is
# avoided when possible; otherwise a new vector is filled label by label,
# with nil for labels missing from +vector+.
def prepare_vector_for_insert(name, vector)
  return vector.dup if vector.index == @index

  aligned = DaruLite::Vector.new([], name: coerce_name(name), index: @index)
  @index.each do |idx|
    aligned[idx] = vector.index.include?(idx) ? vector[idx] : nil
  end
  aligned
end
|
2628
|
+
|
2629
|
+
# Wraps +enum+ in a vector on the frame's row index.
#
# @raise [RuntimeError] when +enum+ does not have exactly @size elements
def prepare_enum_for_insert(name, enum)
  unless @size == enum.size
    raise "Specified vector of length #{enum.size} cannot be inserted in DataFrame of size #{@size}"
  end

  DaruLite::Vector.new(enum, name: coerce_name(name), index: @index)
end
|
2636
|
+
|
2637
|
+
# Builds a vector on the frame's row index by repeating +value+ @size times.
def prepare_value_for_insert(name, value)
  repeated = Array(value) * @size
  DaruLite::Vector.new(repeated, name: coerce_name(name), index: @index)
end
|
2640
|
+
|
2641
|
+
# Writes the values of +vector+ into the row(s) +indexes+ of every stored
# vector, then refreshes the frame index and size from the first vector.
#
# @raise [SizeError] when the coerced vector does not have one value per
#   frame vector
def insert_or_modify_row(indexes, vector)
  vector = coerce_vector vector

  if vector.size != @vectors.size
    raise SizeError, 'Vector length should match row length'
  end

  @data.each_with_index { |vec, pos| vec.send(:set, indexes, vector.at(pos)) }
  @index = @data[0].index

  set_size
end
|
2654
|
+
|
2655
|
+
# Coerces +vectors+ and +index+ into indexes and creates one empty vector
# per vector name.
def create_empty_vectors(vectors, index)
  @vectors = Index.coerce vectors
  @index = Index.coerce index

  @data = @vectors.map { |name| DaruLite::Vector.new([], name: coerce_name(name), index: @index) }
end
|
2663
|
+
|
2664
|
+
# Checks that there is one vector name per stored vector and one row label
# per row of the first vector.
#
# @raise [IndexError] when either count disagrees
def validate_labels
  names_mismatch = @vectors && @vectors.size != @data.size
  if names_mismatch
    raise IndexError, "Expected equal number of vector names (#{@vectors.size}) " \
                      "for number of vectors (#{@data.size})."
  end

  rows_mismatch = @index && @data[0] && @index.size != @data[0].size
  raise IndexError, 'Expected number of indexes same as number of rows' if rows_mismatch
end
|
2674
|
+
|
2675
|
+
# Ensures every stored vector has exactly @size elements.
#
# @raise [IndexError] on the first vector with a different size
def validate_vector_sizes
  @data.each do |vector|
    next if vector.size == @size

    raise IndexError, 'Expected vectors with equal length'
  end
end
|
2680
|
+
|
2681
|
+
# Runs the label checks first, then the vector-size checks.
def validate
  validate_labels
  validate_vector_sizes
end
|
2685
|
+
|
2686
|
+
# Caches the number of rows, taken from the row index.
def set_size
  row_count = @index.size
  @size = row_count
end
|
2689
|
+
|
2690
|
+
# Resolves +index+ to a row label: labels already in the row index are
# returned as-is, anything else is looked up through the index's +key+.
#
# @raise [IndexError] when neither lookup succeeds
def named_index_for(index)
  return index if @index.include?(index)

  label = @index.key(index)
  return label if label

  raise IndexError, "Specified index #{index} does not exist."
end
|
2699
|
+
|
2700
|
+
# Sets @vectors from the requested order and the keys of +source+. A ready
# Index / MultiIndex is used as given; otherwise the requested names come
# first, followed by the remaining source keys, without duplicates.
def create_vectors_index_with(vectors, source)
  vectors = source.keys if vectors.nil?

  prebuilt = vectors.is_a?(Index) || vectors.is_a?(MultiIndex)
  @vectors = prebuilt ? vectors : DaruLite::Index.new((vectors + (source.keys - vectors)).uniq)
end
|
2710
|
+
|
2711
|
+
# True when every vector in +source+ has an index equal to the first
# vector's index.
def all_vectors_have_equal_indexes?(source)
  vectors = source.values
  reference = vectors[0].index

  vectors.all? { |vector| reference == vector.index }
end
|
2716
|
+
|
2717
|
+
# Joins an Array name into a single string; any other name is returned
# unchanged.
def coerce_name(potential_name)
  return potential_name unless potential_name.is_a?(Array)

  potential_name.join
end
|
2720
|
+
|
2721
|
+
# Builds the frame from an Array source, dispatching on the class of its
# elements (Array, Vector or Hash). Vector names default to 0..n-1 for
# Array and Vector elements.
#
# @raise [ArgumentError] when the elements are not all of the same class
#   or are of an unsupported class
def initialize_from_array(source, vectors, index, opts)
  unless source.map(&:class).uniq.size == 1
    raise ArgumentError, 'All objects in data source should be same class'
  end

  positional_names = -> { (0...source.size).to_a }

  case source.first
  when Array
    initialize_from_array_of_arrays(source, vectors || positional_names.call, index, opts)
  when Vector
    initialize_from_array_of_vectors(source, vectors || positional_names.call, index, opts)
  when Hash
    initialize_from_array_of_hashes(source, vectors, index, opts)
  else
    raise ArgumentError, "Can't create DataFrame from #{source}"
  end
end
|
2738
|
+
|
2739
|
+
# Builds the frame from an Array of Arrays, one inner array per vector.
# The row index defaults to the size of the first inner array.
#
# @raise [ArgumentError] when the number of names differs from the number
#   of inner arrays
def initialize_from_array_of_arrays(source, vectors, index, _opts)
  unless source.size == vectors.size
    raise ArgumentError, "Number of vectors (#{vectors.size}) should equal order size (#{source.size})"
  end

  @index = Index.coerce(index || source.first.size)
  @vectors = Index.coerce(vectors)

  update_data source, vectors
end
|
2750
|
+
|
2751
|
+
# Pairs each name in +vectors+ with the vector at the same position in
# +source+ and re-runs #initialize with the resulting Hash. Cloning stays
# enabled unless opts[:clone] is explicitly false.
def initialize_from_array_of_vectors(source, vectors, index, opts)
  should_clone = opts[:clone] != false
  named = vectors.each_with_index.map { |name, position| [name, source[position]] }.to_h

  initialize(named, index: index, order: vectors, name: @name, clone: should_clone)
end
|
2758
|
+
|
2759
|
+
# Builds the frame from an Array of Hashes, one Hash per row. Vector names
# are the requested names followed by the keys of the first Hash; a value
# missing under a name is looked up under the name's string form.
def initialize_from_array_of_hashes(source, vectors, index, _opts)
  first_keys = source[0].keys
  names = vectors.nil? ? first_keys : (vectors + first_keys).uniq

  @vectors = DaruLite::Index.new(names)
  @index = DaruLite::Index.new(index || source.size)

  @data = @vectors.map do |name|
    column = source.map { |row| row.fetch(name) { row[name.to_s] } }
    DaruLite::Vector.new(column, name: coerce_name(name), index: @index)
  end
end
|
2774
|
+
|
2775
|
+
# Builds the frame from a Hash source: sets up the vector index, then
# delegates depending on whether every value is a Vector.
def initialize_from_hash(source, vectors, index, opts)
  create_vectors_index_with vectors, source

  unless ArrayHelper.array_of?(source.values, Vector)
    return initialize_from_hash_with_arrays(source, index, opts)
  end

  initialize_from_hash_with_vectors(source, index, opts)
end
|
2784
|
+
|
2785
|
+
# Builds the frame from a Hash of Vectors. The vectors are cloned unless
# opts[:clone] is false; cloning is forced when no index was given and the
# vectors do not share one index.
def initialize_from_hash_with_vectors(source, index, opts)
  vectors_have_same_index = all_vectors_have_equal_indexes?(source)
  must_clone = opts[:clone] != false || !(index || vectors_have_same_index)

  @index = deduce_index index, source, vectors_have_same_index

  return @data.concat(source.values) unless must_clone

  @data = clone_vectors source, vectors_have_same_index
end
|
2799
|
+
|
2800
|
+
# Chooses the row index for a Hash-of-Vectors source: the coerced +index+
# when given, a copy of the shared index when all vectors agree, otherwise
# the sorted union of all vector labels.
def deduce_index(index, source, vectors_have_same_index)
  return Index.coerce(index) unless index.nil?
  return source.values[0].index.dup if vectors_have_same_index

  # sort only if missing indexes detected
  all_indexes = source.values.map { |v| v.index.to_a }.flatten.uniq.sort

  DaruLite::Index.new(all_indexes)
end
|
2813
|
+
|
2814
|
+
# Produces one vector per name in @vectors from +source+.
def clone_vectors(source, vectors_have_same_index)
  @vectors.map do |vector|
    # avoids matching indexes of vectors if all the supplied vectors
    # have the same index.
    next source[vector].dup if vectors_have_same_index

    aligned = DaruLite::Vector.new([], name: vector, index: @index)
    @index.each do |idx|
      aligned[idx] = source[vector].index.include?(idx) ? source[vector][idx] : nil
    end
    aligned
  end
end
|
2829
|
+
|
2830
|
+
# Builds the frame from a Hash of plain enumerables: the row index defaults
# to the size of the first value, and one vector is appended to @data per
# name in @vectors from a copy of the matching value.
def initialize_from_hash_with_arrays(source, index, _opts)
  @index = Index.coerce(index || source.values[0].size)

  @vectors.each do |name|
    values = source[name].dup
    @data << DaruLite::Vector.new(values, name: coerce_name(name), index: @index)
  end
end
|
2837
|
+
|
2838
|
+
# Create an array to be used for comparison of two rows in sorting.
# Each sort column reads from row r1 when ascending and from row r2 when
# descending; a failing +by+ block turns the value into nil.
def sort_build_row(vector_locs, by_blocks, ascending, handle_nils, r1, r2) # rubocop:disable Metrics/ParameterLists
  settings = vector_locs.zip(by_blocks, ascending, handle_nils)

  settings.map do |vector_loc, by, asc, handle_nil|
    row = asc ? r1 : r2
    value = @data[vector_loc].data[row]

    if by
      value =
        begin
          by.call(value)
        rescue StandardError
          nil
        end
    end

    sort_handle_nils(value, asc, handle_nil || !by)
  end
end
|
2856
|
+
|
2857
|
+
# With nil handling off the value is returned untouched. Otherwise it is
# wrapped as [rank, value], where the rank places nils first for an
# ascending sort (0 for nil, 1 otherwise) and is flipped for descending.
def sort_handle_nils(value, asc, handle_nil)
  return value unless handle_nil

  rank = value.nil? ? 0 : 1
  rank = 1 - rank unless asc
  [rank, value]
end
|
2866
|
+
|
2867
|
+
# Expands the sort option opts[symbol] to one boolean per sort vector:
# true/false is repeated +size+ times, nil becomes +size+ copies of
# +default+, and an Array is returned as-is.
#
# @raise [ArgumentError] when an Array has the wrong length or the option
#   is of any other type
def sort_coerce_boolean(opts, symbol, default, size)
  val = opts[symbol]

  return Array.new(size, val) if [true, false].include?(val)
  return Array.new(size, default) if val.nil?

  unless val.is_a?(Array)
    raise ArgumentError, "Can't coerce #{symbol} from #{val.class} to boolean option"
  end
  raise ArgumentError, "Specify same number of vector names and #{symbol}" if size != val.size

  val
end
|
2883
|
+
|
2884
|
+
# Builds the comparator lambda used for sorting rows by +vector_order+,
# honouring the :ascending, :handle_nils and :by options.
def sort_prepare_block(vector_order, opts)
  column_count = vector_order.size
  ascending = sort_coerce_boolean(opts, :ascending, true, column_count)
  handle_nils = sort_coerce_boolean(opts, :handle_nils, false, column_count)

  by_option = opts[:by] || {}
  by_blocks = vector_order.map { |v| by_option[v] }
  vector_locs = vector_order.map { |v| @vectors[v] }

  lambda { |index1, index2|
    # Build left and right array to compare two rows
    left = sort_build_row(vector_locs, by_blocks, ascending, handle_nils, index1, index2)
    right = sort_build_row(vector_locs, by_blocks, ascending, handle_nils, index2, index1)

    # Resolve conflict by Index if all attributes are same
    left.push(index1)
    right.push(index2)
    left <=> right
  }
end
|
2902
|
+
|
2903
|
+
# Formats one verification failure as
# "<row number> [<row id>]: <description> (<field>=<value>, ...)".
# The parenthesised part is omitted when the test lists no fields.
def verify_error_message(row, test, id, i)
  description, fields = test

  details = ''
  unless fields.empty?
    pairs = fields.collect { |field| "#{field}=#{row[field]}" }
    details = " (#{pairs.join(', ')})"
  end

  "#{i + 1} [#{row[id]}]: #{description}#{details}"
end
|
2908
|
+
|
2909
|
+
# Resolves the value vectors of a pivot. When opts[:values] is missing,
# every numeric vector not used in +index+ or +vectors+ is taken; an Array
# is used as given and a single name is wrapped in an Array.
def prepare_pivot_values(index, vectors, opts)
  requested = opts[:values]

  # values not specified at all.
  return (@vectors.to_a - (index | vectors)) & numeric_vector_names if requested.nil?

  # multiple values specified, or a single one to wrap.
  requested.is_a?(Array) ? requested : [requested]
end
|
2919
|
+
|
2920
|
+
# Collects, for every group and every value vector, the raw values keyed
# by [value name, *pivot vector values], then reduces each bucket with
# +aggregate_function+.
#
# @return [Hash] group name => { key tuple => aggregated value }
def make_pivot_hash(grouped, vectors, values, aggregate_function)
  super_hash = grouped.groups.transform_values { |_| {} }

  values.each do |value|
    grouped.groups.each do |group_name, row_numbers|
      bucket = super_hash[group_name]

      row_numbers.each do |num|
        key = [value, *vectors.map { |v| self[v][num] }]
        bucket[key] ||= []
        bucket[key] << self[value][num]
      end
    end
  end

  setup_pivot_aggregates super_hash, aggregate_function
  super_hash
end
|
2937
|
+
|
2938
|
+
# Replaces, in place, every collected bucket of values with the result of
# calling +aggregate_function+ on a vector built from it.
def setup_pivot_aggregates(super_hash, aggregate_function)
  super_hash.each_value do |sub_hash|
    sub_hash.transform_values! do |aggregates|
      DaruLite::Vector.new(aggregates).send(aggregate_function)
    end
  end
end
|
2945
|
+
|
2946
|
+
# Turns the nested pivot Hash into a DataFrame whose rows are the outer
# keys and whose vectors are the distinct inner keys, both as MultiIndexes.
def pivot_dataframe(super_hash)
  df_index = DaruLite::MultiIndex.from_tuples super_hash.keys
  df_vectors = DaruLite::MultiIndex.from_tuples super_hash.values.flat_map(&:keys).uniq

  pivoted_dataframe = DaruLite::DataFrame.new({}, index: df_index, order: df_vectors)
  super_hash.each do |row_index, sub_h|
    sub_h.each { |vector_index, val| pivoted_dataframe[vector_index][row_index] = val }
  end
  pivoted_dataframe
end
|
2958
|
+
|
2959
|
+
# Extracts the variable names and case numbers encoded in the vector names
# according to +pattern+, where %v stands for the variable name and %n for
# the number.
#
# The number is matched greedily so that multi-digit numbers at the end of
# a name are captured whole (a trailing lazy `\d+?` only ever captured the
# first digit). When no vector name matches, two empty lists are returned
# instead of failing on nil.
#
# @param pattern [String] e.g. '%v_%n'
# @return [Array(Array<String>, Array<Integer>)] distinct variable names
#   and the sorted distinct numbers
def one_to_many_components(pattern)
  re = Regexp.new pattern.gsub('%v', '(.+?)').gsub('%n', '(\\d+)')

  matches =
    @vectors
    .map { |v| v.scan(re) }
    .reject(&:empty?).flatten(1)
  return [[], []] if matches.empty?

  vars, numbers = matches.transpose

  [vars.uniq, numbers.map(&:to_i).sort.uniq]
end
|
2969
|
+
|
2970
|
+
# Maps every variable in +vars+ to the value +row+ holds under the name
# obtained by substituting the variable and +number+ into +pattern+.
def one_to_many_row(row, number, vars, pattern)
  case_number = number.to_s

  vars.each_with_object({}) do |var, values|
    column = pattern.sub('%v', var).sub('%n', case_number)
    values[var] = row[column]
  end
end
|
2977
|
+
|
2978
|
+
# Raises IndexError when one of the positions is not a valid position.
# Valid positions run from -size (counting from the end) to size - 1; the
# lower bound was previously unchecked.
#
# @param positions [Array<Integer>] positions to check
# @param size [Integer] number of elements along the axis
# @return [Array<Integer>] the positions, when all are valid
# @raise [IndexError] for the first position outside -size...size
def validate_positions(*positions, size)
  positions.each do |pos|
    raise IndexError, "#{pos} is not a valid position." if pos >= size || pos < -size
  end
end
|
2984
|
+
|
2985
|
+
# Accepts hash, enumerable and vector and align it properly so it can be added.
# Vectors and Hashes are reindexed on the frame's vector index; any other
# input is wrapped in a vector as-is.
def coerce_vector(vector)
  return vector.reindex(@vectors) if vector.is_a?(DaruLite::Vector)

  wrapped = DaruLite::Vector.new(vector)
  vector.is_a?(Hash) ? wrapped.reindex(@vectors) : wrapped
end
|
2996
|
+
|
2997
|
+
# Rebuilds @data with one vector per entry of @vectors, taking the values
# from +source+ and the name from +vectors+ at the same position.
def update_data(source, vectors)
  @data = @vectors.each_with_index.map { |_name, position|
    DaruLite::Vector.new(source[position], index: @index, name: vectors[position])
  }
end
|
3002
|
+
|
3003
|
+
# Computes the aggregated values, one array per aggregation option, for
# the groups described by +positions_tuples+.
def aggregate_by_positions_tuples(options, positions_tuples)
  agg_over_vectors_only, options = cast_aggregation_options(options)

  unless agg_over_vectors_only
    methods = options.values

    # NOTE: because we aggregate over rows, we don't have to re-get sub-dfs for each method (which is expensive)
    rows = positions_tuples.map { |positions| apply_method_on_sub_df(methods, keys: positions) }
    return rows.transpose
  end

  options.map do |vect_name, method|
    vect = self[vect_name]
    positions_tuples.map { |positions| vect.apply_method_on_sub_vector(method, keys: positions) }
  end
end
|
3025
|
+
|
3026
|
+
# convert operations over sub-vectors to operations over sub-dfs when it improves perf
# note: we don't always "cast" because aggregation over a single vector / a few vectors is faster
# than aggregation over (sub-)dfs
#
# @return [Array(Boolean, Hash)] whether every option targets an existing
#   vector, and the (possibly converted) options
def cast_aggregation_options(options)
  vects, non_vects = options.keys.partition { |k| @vectors.include?(k) }
  return [true, options] unless non_vects.any?

  # Work on a copy so the caller's options keep their original methods.
  options = options.clone
  vects.each do |name|
    proc_on_vect = options[name].to_proc
    options[name] = ->(sub_df) { proc_on_vect.call(sub_df[name]) }
  end

  [false, options]
end
|
3047
|
+
|
3048
|
+
# Groups the labels of +index+ for aggregation.
#
# For a MultiIndex the grouping is delegated to
# DaruLite::Core::GroupBy.get_positions_group_for_aggregation with
# +multi_index_level+; for a plain or categorical index each distinct
# label forms one group.
#
# @return [Array(Array<Array<Integer>>, Object)] the positions of every
#   group and the index of the aggregated result
# @raise [RuntimeError] when +index+ is of an unsupported type
def group_index_for_aggregation(index, multi_index_level = -1)
  case index
  when DaruLite::MultiIndex
    groups_by_pos = DaruLite::Core::GroupBy.get_positions_group_for_aggregation(index, multi_index_level)

    new_index = DaruLite::MultiIndex.from_tuples(groups_by_pos.keys).coerce_index
    pos_tuples = groups_by_pos.values
  when DaruLite::Index, DaruLite::CategoricalIndex
    new_index = Array(index).uniq
    pos_tuples = new_index.map { |idx| [*index.pos(idx)] }
  else
    # Same exception class as the former bare `raise`, with a usable message.
    raise "Cannot group an index of type #{index.class} for aggregation"
  end

  [pos_tuples, new_index]
end
|
3063
|
+
|
3064
|
+
# coerce ranges, integers and array in appropriate ways
#
# Several positions are returned as the Array they arrived in. A single
# Integer is returned as-is and a single Range is expanded against
# 0...size.
#
# @raise [ArgumentError] when the single position is neither
def coerce_positions(*positions, size)
  return positions unless positions.size == 1

  position = positions.first
  if position.is_a?(Integer)
    position
  elsif position.is_a?(Range)
    size.times.to_a[position]
  else
    raise ArgumentError, 'Unknown position type.'
  end
end
|
3079
|
+
end
|
3080
|
+
end
|