daru_lite 0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.github/ISSUE_TEMPLATE.md +18 -0
- data/.github/workflows/ci.yml +33 -0
- data/.gitignore +10 -0
- data/.rspec +2 -0
- data/.rubocop.yml +27 -0
- data/.rubocop_todo.yml +137 -0
- data/CONTRIBUTING.md +47 -0
- data/Gemfile +2 -0
- data/History.md +4 -0
- data/LICENSE +24 -0
- data/README.md +218 -0
- data/Rakefile +69 -0
- data/ReleasePolicy.md +20 -0
- data/benchmarks/TradeoffData.csv +65 -0
- data/benchmarks/csv_reading.rb +22 -0
- data/benchmarks/dataframe_creation.rb +39 -0
- data/benchmarks/db_loading.rb +34 -0
- data/benchmarks/duplicating.rb +45 -0
- data/benchmarks/group_by.rb +32 -0
- data/benchmarks/joining.rb +52 -0
- data/benchmarks/row_access.rb +41 -0
- data/benchmarks/row_assign.rb +36 -0
- data/benchmarks/sorting.rb +51 -0
- data/benchmarks/statistics.rb +28 -0
- data/benchmarks/vector_access.rb +31 -0
- data/benchmarks/vector_assign.rb +42 -0
- data/benchmarks/where_clause.rb +48 -0
- data/benchmarks/where_vs_filter.rb +28 -0
- data/daru_lite.gemspec +55 -0
- data/images/README.md +5 -0
- data/images/con0.png +0 -0
- data/images/con1.png +0 -0
- data/images/init0.png +0 -0
- data/images/init1.png +0 -0
- data/images/man0.png +0 -0
- data/images/man1.png +0 -0
- data/images/man2.png +0 -0
- data/images/man3.png +0 -0
- data/images/man4.png +0 -0
- data/images/man5.png +0 -0
- data/images/man6.png +0 -0
- data/lib/daru_lite/accessors/array_wrapper.rb +109 -0
- data/lib/daru_lite/accessors/dataframe_by_row.rb +25 -0
- data/lib/daru_lite/accessors/mdarray_wrapper.rb +7 -0
- data/lib/daru_lite/category.rb +929 -0
- data/lib/daru_lite/configuration.rb +34 -0
- data/lib/daru_lite/core/group_by.rb +403 -0
- data/lib/daru_lite/core/merge.rb +270 -0
- data/lib/daru_lite/core/query.rb +109 -0
- data/lib/daru_lite/dataframe.rb +3080 -0
- data/lib/daru_lite/date_time/index.rb +569 -0
- data/lib/daru_lite/date_time/offsets.rb +397 -0
- data/lib/daru_lite/exceptions.rb +2 -0
- data/lib/daru_lite/extensions/which_dsl.rb +53 -0
- data/lib/daru_lite/formatters/table.rb +52 -0
- data/lib/daru_lite/helpers/array.rb +53 -0
- data/lib/daru_lite/index/categorical_index.rb +201 -0
- data/lib/daru_lite/index/index.rb +374 -0
- data/lib/daru_lite/index/multi_index.rb +374 -0
- data/lib/daru_lite/io/csv/converters.rb +21 -0
- data/lib/daru_lite/io/io.rb +294 -0
- data/lib/daru_lite/io/sql_data_source.rb +97 -0
- data/lib/daru_lite/iruby/helpers.rb +38 -0
- data/lib/daru_lite/iruby/templates/dataframe.html.erb +5 -0
- data/lib/daru_lite/iruby/templates/dataframe_mi.html.erb +5 -0
- data/lib/daru_lite/iruby/templates/dataframe_mi_tbody.html.erb +35 -0
- data/lib/daru_lite/iruby/templates/dataframe_mi_thead.html.erb +21 -0
- data/lib/daru_lite/iruby/templates/dataframe_tbody.html.erb +28 -0
- data/lib/daru_lite/iruby/templates/dataframe_thead.html.erb +21 -0
- data/lib/daru_lite/iruby/templates/multi_index.html.erb +12 -0
- data/lib/daru_lite/iruby/templates/vector.html.erb +5 -0
- data/lib/daru_lite/iruby/templates/vector_mi.html.erb +5 -0
- data/lib/daru_lite/iruby/templates/vector_mi_tbody.html.erb +26 -0
- data/lib/daru_lite/iruby/templates/vector_mi_thead.html.erb +8 -0
- data/lib/daru_lite/iruby/templates/vector_tbody.html.erb +17 -0
- data/lib/daru_lite/iruby/templates/vector_thead.html.erb +8 -0
- data/lib/daru_lite/maths/arithmetic/dataframe.rb +91 -0
- data/lib/daru_lite/maths/arithmetic/vector.rb +117 -0
- data/lib/daru_lite/maths/statistics/dataframe.rb +202 -0
- data/lib/daru_lite/maths/statistics/vector.rb +1019 -0
- data/lib/daru_lite/monkeys.rb +56 -0
- data/lib/daru_lite/vector.rb +1678 -0
- data/lib/daru_lite/version.rb +3 -0
- data/lib/daru_lite.rb +99 -0
- data/profile/_base.rb +23 -0
- data/profile/df_to_a.rb +10 -0
- data/profile/filter.rb +13 -0
- data/profile/joining.rb +13 -0
- data/profile/sorting.rb +12 -0
- data/profile/vector_each_with_index.rb +9 -0
- data/profile/vector_new.rb +9 -0
- data/spec/accessors/array_wrapper_spec.rb +3 -0
- data/spec/category_spec.rb +1741 -0
- data/spec/core/group_by_spec.rb +655 -0
- data/spec/core/merge_spec.rb +179 -0
- data/spec/core/query_spec.rb +347 -0
- data/spec/daru_lite_spec.rb +22 -0
- data/spec/dataframe_spec.rb +4330 -0
- data/spec/date_time/data_spec.rb +197 -0
- data/spec/date_time/date_time_index_helper_spec.rb +72 -0
- data/spec/date_time/index_spec.rb +588 -0
- data/spec/date_time/offsets_spec.rb +465 -0
- data/spec/extensions/which_dsl_spec.rb +38 -0
- data/spec/fixtures/bank2.dat +200 -0
- data/spec/fixtures/boolean_converter_test.csv +5 -0
- data/spec/fixtures/countries.json +7794 -0
- data/spec/fixtures/duplicates.csv +32 -0
- data/spec/fixtures/eciresults.html +394 -0
- data/spec/fixtures/empties.dat +2 -0
- data/spec/fixtures/empty_rows_test.csv +17 -0
- data/spec/fixtures/macau.html +3691 -0
- data/spec/fixtures/macd_data.csv +150 -0
- data/spec/fixtures/matrix_test.csv +100 -0
- data/spec/fixtures/moneycontrol.html +6812 -0
- data/spec/fixtures/music_data.tsv +2501 -0
- data/spec/fixtures/repeated_fields.csv +7 -0
- data/spec/fixtures/sales-funnel.csv +18 -0
- data/spec/fixtures/scientific_notation.csv +4 -0
- data/spec/fixtures/string_converter_test.csv +5 -0
- data/spec/fixtures/strings.dat +2 -0
- data/spec/fixtures/test_xls.xls +0 -0
- data/spec/fixtures/test_xls_2.xls +0 -0
- data/spec/fixtures/url_test.txt~ +0 -0
- data/spec/fixtures/valid_markup.html +62 -0
- data/spec/fixtures/wiki_climate.html +1243 -0
- data/spec/fixtures/wiki_table_info.html +631 -0
- data/spec/formatters/table_formatter_spec.rb +137 -0
- data/spec/helpers_spec.rb +8 -0
- data/spec/index/categorical_index_spec.rb +170 -0
- data/spec/index/index_spec.rb +417 -0
- data/spec/index/multi_index_spec.rb +680 -0
- data/spec/io/io_spec.rb +373 -0
- data/spec/io/sql_data_source_spec.rb +56 -0
- data/spec/iruby/dataframe_spec.rb +170 -0
- data/spec/iruby/helpers_spec.rb +49 -0
- data/spec/iruby/multi_index_spec.rb +37 -0
- data/spec/iruby/vector_spec.rb +105 -0
- data/spec/maths/arithmetic/dataframe_spec.rb +148 -0
- data/spec/maths/arithmetic/vector_spec.rb +165 -0
- data/spec/maths/statistics/dataframe_spec.rb +178 -0
- data/spec/maths/statistics/vector_spec.rb +756 -0
- data/spec/monkeys_spec.rb +42 -0
- data/spec/shared/vector_display_spec.rb +213 -0
- data/spec/spec_helper.rb +87 -0
- data/spec/support/database_helper.rb +30 -0
- data/spec/support/matchers.rb +5 -0
- data/spec/vector_spec.rb +2293 -0
- metadata +571 -0
|
@@ -0,0 +1,3080 @@
|
|
|
1
|
+
require 'daru_lite/accessors/dataframe_by_row'
|
|
2
|
+
require 'daru_lite/maths/arithmetic/dataframe'
|
|
3
|
+
require 'daru_lite/maths/statistics/dataframe'
|
|
4
|
+
require 'daru_lite/io/io'
|
|
5
|
+
|
|
6
|
+
module DaruLite
|
|
7
|
+
class DataFrame # rubocop:disable Metrics/ClassLength
|
|
8
|
+
include DaruLite::Maths::Arithmetic::DataFrame
|
|
9
|
+
include DaruLite::Maths::Statistics::DataFrame
|
|
10
|
+
|
|
11
|
+
attr_accessor(*Configuration::INSPECT_OPTIONS_KEYS)
|
|
12
|
+
|
|
13
|
+
extend Gem::Deprecate
|
|
14
|
+
|
|
15
|
+
class << self
|
|
16
|
+
# Load data from a CSV file. Specify an optional block to grab the CSV
|
|
17
|
+
# object and pre-condition it (for example use the `convert` or
|
|
18
|
+
# `header_convert` methods).
|
|
19
|
+
#
|
|
20
|
+
# == Arguments
|
|
21
|
+
#
|
|
22
|
+
# * path - Local path / Remote URL of the file to load specified as a String.
|
|
23
|
+
#
|
|
24
|
+
# == Options
|
|
25
|
+
#
|
|
26
|
+
# Accepts the same options as the DaruLite::DataFrame constructor and CSV.open()
|
|
27
|
+
# and uses those to eventually construct the resulting DataFrame.
|
|
28
|
+
#
|
|
29
|
+
# == Verbose Description
|
|
30
|
+
#
|
|
31
|
+
# You can specify all the options to the `.from_csv` function that you
|
|
32
|
+
# do to the Ruby `CSV.read()` function, since this is what is used internally.
|
|
33
|
+
#
|
|
34
|
+
# For example, if the columns in your CSV file are separated by something
|
|
35
|
+
# other than commas, you can use the `:col_sep` option. If you want to
|
|
36
|
+
# convert numeric values to numbers and not keep them as strings, you can
|
|
37
|
+
# use the `:converters` option and set it to `:numeric`.
|
|
38
|
+
#
|
|
39
|
+
# The `.from_csv` function uses the following defaults for reading CSV files
|
|
40
|
+
# (that are passed into the `CSV.read()` function):
|
|
41
|
+
#
|
|
42
|
+
# {
|
|
43
|
+
# :col_sep => ',',
|
|
44
|
+
# :converters => :numeric
|
|
45
|
+
# }
|
|
46
|
+
# Thin entry point: hands the path, the options hash and the optional block
# straight to DaruLite::IO.from_csv and returns whatever it returns.
def from_csv(path, opts = {}, &block)
  DaruLite::IO.from_csv(path, opts, &block)
end
|
|
49
|
+
|
|
50
|
+
# Read data from an Excel file into a DataFrame.
|
|
51
|
+
#
|
|
52
|
+
# == Arguments
|
|
53
|
+
#
|
|
54
|
+
# * path - Path of the file to be read.
|
|
55
|
+
#
|
|
56
|
+
# == Options
|
|
57
|
+
#
|
|
58
|
+
# *:worksheet_id - ID of the worksheet that is to be read.
|
|
59
|
+
# Thin entry point: forwards path, options and the optional block to
# DaruLite::IO.from_excel and returns its result.
def from_excel(path, opts = {}, &block)
  DaruLite::IO.from_excel(path, opts, &block)
end
|
|
62
|
+
|
|
63
|
+
# Reads a database query and returns a DataFrame
|
|
64
|
+
#
|
|
65
|
+
# @param dbh [DBI::DatabaseHandle, String] A DBI connection OR Path to a SQlite3 database.
|
|
66
|
+
# @param query [String] The query to be executed
|
|
67
|
+
#
|
|
68
|
+
# @return A dataframe containing the data resulting from the query
|
|
69
|
+
#
|
|
70
|
+
# USE:
|
|
71
|
+
#
|
|
72
|
+
# dbh = DBI.connect("DBI:Mysql:database:localhost", "user", "password")
|
|
73
|
+
# DaruLite::DataFrame.from_sql(dbh, "SELECT * FROM test")
|
|
74
|
+
#
|
|
75
|
+
# #Alternatively
|
|
76
|
+
#
|
|
77
|
+
# require 'dbi'
|
|
78
|
+
# DaruLite::DataFrame.from_sql("path/to/sqlite.db", "SELECT * FROM test")
|
|
79
|
+
# Thin entry point: passes the handle (or database path) and the query
# unchanged to DaruLite::IO.from_sql and returns its result.
def from_sql(dbh, query)
  DaruLite::IO.from_sql(dbh, query)
end
|
|
82
|
+
|
|
83
|
+
# Read a dataframe from AR::Relation
|
|
84
|
+
#
|
|
85
|
+
# @param relation [ActiveRecord::Relation] An AR::Relation object from which data is loaded
|
|
86
|
+
# @param fields [Array] Field names to be loaded (optional)
|
|
87
|
+
#
|
|
88
|
+
# @return A dataframe containing the data loaded from the relation
|
|
89
|
+
#
|
|
90
|
+
# USE:
|
|
91
|
+
#
|
|
92
|
+
# # When Post model is defined as:
|
|
93
|
+
# class Post < ActiveRecord::Base
|
|
94
|
+
# scope :active, -> { where.not(published_at: nil) }
|
|
95
|
+
# end
|
|
96
|
+
#
|
|
97
|
+
# # You can load active posts into a dataframe by:
|
|
98
|
+
# DaruLite::DataFrame.from_activerecord(Post.active, :title, :published_at)
|
|
99
|
+
# Thin entry point: forwards the relation and the (possibly empty) list of
# field names to DaruLite::IO.from_activerecord and returns its result.
def from_activerecord(relation, *fields)
  DaruLite::IO.from_activerecord(relation, *fields)
end
|
|
102
|
+
|
|
103
|
+
# Read the database from a plaintext file. For this method to work,
|
|
104
|
+
# the data should be present in a plain text file in columns. See
|
|
105
|
+
# spec/fixtures/bank2.dat for an example.
|
|
106
|
+
#
|
|
107
|
+
# == Arguments
|
|
108
|
+
#
|
|
109
|
+
# * path - Path of the file to be read.
|
|
110
|
+
# * fields - Vector names of the resulting database.
|
|
111
|
+
#
|
|
112
|
+
# == Usage
|
|
113
|
+
#
|
|
114
|
+
# df = DaruLite::DataFrame.from_plaintext 'spec/fixtures/bank2.dat', [:v1,:v2,:v3,:v4,:v5,:v6]
|
|
115
|
+
# Thin entry point: forwards the file path and the vector names to
# DaruLite::IO.from_plaintext and returns its result.
def from_plaintext(path, fields)
  DaruLite::IO.from_plaintext(path, fields)
end
|
|
118
|
+
|
|
119
|
+
# Create DataFrame by specifying rows as an Array of Arrays or Array of
|
|
120
|
+
# DaruLite::Vector objects.
|
|
121
|
+
# Builds a DataFrame from +source+, an Array whose elements are the rows.
#
# @param source [Array<Array>, Array<Vector>] the rows; all must have the
#   same size
# @param opts [Hash] options handed on to the DataFrame constructor. When
#   :order is missing (or nil) it is filled in from guess_order(source).
#   The hash passed by the caller is never modified.
# @return [DataFrame]
# @raise [SizeError] if the rows differ in size
# @raise [ArgumentError] if the rows are neither all Arrays nor all Vectors
def rows(source, opts = {})
  raise SizeError, 'All vectors must have same length' \
    unless source.all? { |v| v.size == source.first.size }

  # Merge into a copy: the guessed order must not leak into the caller's
  # hash (which may be frozen or reused for another call).
  opts = opts.merge(order: guess_order(source)) unless opts[:order]

  if ArrayHelper.array_of?(source, Array) || source.empty?
    DataFrame.new(source.transpose, opts)
  elsif ArrayHelper.array_of?(source, Vector)
    from_vector_rows(source, opts)
  else
    raise ArgumentError, "Can't create DataFrame from #{source}"
  end
end
|
|
135
|
+
|
|
136
|
+
# Generates a new dataset, using three vectors
|
|
137
|
+
# - Rows
|
|
138
|
+
# - Columns
|
|
139
|
+
# - Values
|
|
140
|
+
#
|
|
141
|
+
# For example, you have these values
|
|
142
|
+
#
|
|
143
|
+
# x y v
|
|
144
|
+
# a a 0
|
|
145
|
+
# a b 1
|
|
146
|
+
# b a 1
|
|
147
|
+
# b b 0
|
|
148
|
+
#
|
|
149
|
+
# You obtain
|
|
150
|
+
# id a b
|
|
151
|
+
# a 0 1
|
|
152
|
+
# b 1 0
|
|
153
|
+
#
|
|
154
|
+
# Useful to process outputs from databases
|
|
155
|
+
# Pivots three parallel vectors (row label, column label, value) into a
# DataFrame with one vector per distinct column label plus an :_id vector
# holding the distinct row labels.
#
# @param rows    [#size, #factors] row label of each observation
# @param columns [#size, #zip] column label of each observation
# @param values  [#size] value of each observation
# @return [DataFrame]
# @raise [RuntimeError] if the three inputs differ in size
def crosstab_by_assignation(rows, columns, values)
  raise 'Three vectors should be equal size' if
    rows.size != columns.size || rows.size != values.size

  # Distinct row labels, computed once instead of once per new column.
  row_labels = rows.factors

  # Each column starts as { row_label => nil } so cells without an
  # observation stay nil and every column has the same row ordering.
  data = Hash.new do |h, col|
    h[col] = row_labels.map { |r| [r, nil] }.to_h
  end
  columns.zip(rows, values).each { |c, r, v| data[c][r] = v }

  # FIXME: in fact, WITHOUT this line you'll obtain more "right"
  # data: with vectors having "rows" as an index...
  data = data.transform_values(&:values)
  data[:_id] = row_labels

  DataFrame.new(data)
end
|
|
171
|
+
|
|
172
|
+
private
|
|
173
|
+
|
|
174
|
+
# Derives a default vector order from the first row of +source+:
# the index of a Vector row, or "0", "1", ... for an Array row.
# Returns nil for any other kind of first element (including none).
def guess_order(source)
  head = source.first

  if head.is_a?(Vector) # assume that all are Vectors
    head.index.to_a
  elsif head.is_a?(Array)
    (0...head.size).map(&:to_s)
  end
end
|
|
182
|
+
|
|
183
|
+
# Builds a DataFrame whose rows are the given Vectors. Each row is labelled
# with the vector's name, or its position when the name is nil/false;
# repeated labels are passed through ArrayHelper.recode_repeated.
def from_vector_rows(source, opts)
  labels = source.each_with_index.map { |vec, pos| vec.name || pos }
  labels = ArrayHelper.recode_repeated(labels)

  frame = DataFrame.new({}, opts)
  source.each_with_index do |row, pos|
    frame[labels[pos] || pos, :row] = row
  end
  frame
end
|
|
194
|
+
end
|
|
195
|
+
|
|
196
|
+
# The vectors (columns) index of the DataFrame
|
|
197
|
+
attr_reader :vectors
|
|
198
|
+
# TOREMOVE
|
|
199
|
+
attr_reader :data
|
|
200
|
+
|
|
201
|
+
# The index of the rows of the DataFrame
|
|
202
|
+
attr_reader :index
|
|
203
|
+
|
|
204
|
+
# The name of the DataFrame
|
|
205
|
+
attr_reader :name
|
|
206
|
+
|
|
207
|
+
# The number of rows present in the DataFrame
|
|
208
|
+
attr_reader :size
|
|
209
|
+
|
|
210
|
+
# DataFrame basically consists of an Array of Vector objects.
|
|
211
|
+
# These objects are indexed by row and column by vectors and index Index objects.
|
|
212
|
+
#
|
|
213
|
+
# == Arguments
|
|
214
|
+
#
|
|
215
|
+
# * source - Source from which the DataFrame is to be initialized. Can be a Hash
|
|
216
|
+
# of names and vectors (array or DaruLite::Vector), an array of arrays or
|
|
217
|
+
# array of DaruLite::Vectors.
|
|
218
|
+
#
|
|
219
|
+
# == Options
|
|
220
|
+
#
|
|
221
|
+
# +:order+ - An *Array*/*DaruLite::Index*/*DaruLite::MultiIndex* containing the order in
|
|
222
|
+
# which Vectors should appear in the DataFrame.
|
|
223
|
+
#
|
|
224
|
+
# +:index+ - An *Array*/*DaruLite::Index*/*DaruLite::MultiIndex* containing the order
|
|
225
|
+
# in which rows of the DataFrame will be named.
|
|
226
|
+
#
|
|
227
|
+
# +:name+ - A name for the DataFrame.
|
|
228
|
+
#
|
|
229
|
+
# +:clone+ - Specify as *true* or *false*. When set to false, and Vector
|
|
230
|
+
# objects are passed for the source, the Vector objects will not be duplicated
|
|
231
|
+
# when creating the DataFrame. Will have no effect if Array is passed in
|
|
232
|
+
# the source, or if the passed DaruLite::Vectors have different indexes.
|
|
233
|
+
# Default to *true*.
|
|
234
|
+
#
|
|
235
|
+
# == Usage
|
|
236
|
+
#
|
|
237
|
+
# df = DaruLite::DataFrame.new
|
|
238
|
+
# # =>
|
|
239
|
+
# # <DaruLite::DataFrame(0x0)>
|
|
240
|
+
# # Creates an empty DataFrame with no rows or columns.
|
|
241
|
+
#
|
|
242
|
+
# df = DaruLite::DataFrame.new({}, order: [:a, :b])
|
|
243
|
+
# #<DaruLite::DataFrame(0x2)>
|
|
244
|
+
# a b
|
|
245
|
+
# # Creates a DataFrame with no rows and columns :a and :b
|
|
246
|
+
#
|
|
247
|
+
# df = DaruLite::DataFrame.new({a: [1,2,3,4], b: [6,7,8,9]}, order: [:b, :a],
|
|
248
|
+
# index: [:a, :b, :c, :d], name: :spider_man)
|
|
249
|
+
#
|
|
250
|
+
# # =>
|
|
251
|
+
# # <DaruLite::DataFrame:80766980 @name = spider_man @size = 4>
|
|
252
|
+
# # b a
|
|
253
|
+
# # a 6 1
|
|
254
|
+
# # b 7 2
|
|
255
|
+
# # c 8 3
|
|
256
|
+
# # d 9 4
|
|
257
|
+
#
|
|
258
|
+
# df = DaruLite::DataFrame.new([[1,2,3,4],[6,7,8,9]], name: :bat_man)
|
|
259
|
+
#
|
|
260
|
+
# # =>
|
|
261
|
+
# # #<DaruLite::DataFrame: bat_man (4x2)>
|
|
262
|
+
# # 0 1
|
|
263
|
+
# # 0 1 6
|
|
264
|
+
# # 1 2 7
|
|
265
|
+
# # 2 3 8
|
|
266
|
+
# # 3 4 9
|
|
267
|
+
#
|
|
268
|
+
# # Dataframe having Index name
|
|
269
|
+
#
|
|
270
|
+
# df = DaruLite::DataFrame.new({a: [1,2,3,4], b: [6,7,8,9]}, order: [:b, :a],
|
|
271
|
+
# index: DaruLite::Index.new([:a, :b, :c, :d], name: 'idx_name'),
|
|
272
|
+
# name: :spider_man)
|
|
273
|
+
#
|
|
274
|
+
# # =>
|
|
275
|
+
# # <DaruLite::DataFrame:80766980 @name = spider_man @size = 4>
|
|
276
|
+
# # idx_name b a
|
|
277
|
+
# # a 6 1
|
|
278
|
+
# # b 7 2
|
|
279
|
+
# # c 8 3
|
|
280
|
+
# # d 9 4
|
|
281
|
+
#
|
|
282
|
+
#
|
|
283
|
+
# idx = DaruLite::Index.new [100, 99, 101, 1, 2], name: "s1"
|
|
284
|
+
# => #<DaruLite::Index(5): s1 {100, 99, 101, 1, 2}>
|
|
285
|
+
#
|
|
286
|
+
# df = DaruLite::DataFrame.new({b: [11,12,13,14,15], a: [1,2,3,4,5],
|
|
287
|
+
# c: [11,22,33,44,55]},
|
|
288
|
+
# order: [:a, :b, :c],
|
|
289
|
+
# index: idx)
|
|
290
|
+
# # =>
|
|
291
|
+
# #<DaruLite::DataFrame(5x3)>
|
|
292
|
+
# # s1 a b c
|
|
293
|
+
# # 100 1 11 11
|
|
294
|
+
# # 99 2 12 22
|
|
295
|
+
# # 101 3 13 33
|
|
296
|
+
# # 1 4 14 44
|
|
297
|
+
# # 2 5 15 55
|
|
298
|
+
|
|
299
|
+
def initialize(source = {}, opts = {})
  @data = []
  @name = opts[:name]
  order = opts[:order]
  row_index = opts[:index]

  # An empty Array/Hash only sets up the (possibly empty) vectors; any other
  # Array or Hash goes to its dedicated builder. Other sources add no data.
  if [] == source || {} == source
    create_empty_vectors(order, row_index)
  elsif source.is_a?(Array)
    initialize_from_array(source, order, row_index, opts)
  elsif source.is_a?(Hash)
    initialize_from_hash(source, order, row_index, opts)
  end

  set_size
  validate
  update
end
|
|
318
|
+
|
|
319
|
+
# Access row or vector. Specify name of row/vector followed by axis(:row, :vector).
|
|
320
|
+
# Defaults to *:vector*. Use of this method is not recommended for accessing
|
|
321
|
+
# rows. Use df.row[:a] for accessing row with index ':a'.
|
|
322
|
+
def [](*names)
  # The axis is resolved first (defaulting to :vector); the names are then
  # dispatched to the matching access routine.
  target_axis = extract_axis(names, :vector)
  dispatch_to_axis(target_axis, :access, *names)
end
|
|
326
|
+
|
|
327
|
+
# Retrieve rows by positions
|
|
328
|
+
# @param [Array<Integer>] positions of rows to retrieve
|
|
329
|
+
# @return [DaruLite::Vector, DaruLite::DataFrame] vector for single position and dataframe for multiple positions
|
|
330
|
+
# @example
|
|
331
|
+
# df = DaruLite::DataFrame.new({
|
|
332
|
+
# a: [1, 2, 3],
|
|
333
|
+
# b: ['a', 'b', 'c']
|
|
334
|
+
# })
|
|
335
|
+
# df.row_at 1, 2
|
|
336
|
+
# # => #<DaruLite::DataFrame(2x2)>
|
|
337
|
+
# # a b
|
|
338
|
+
# # 1 2 b
|
|
339
|
+
# # 2 3 c
|
|
340
|
+
def row_at(*positions)
  requested = positions
  positions = coerce_positions(*positions, nrows)
  validate_positions(*positions, nrows)

  # Anything but a single Integer position yields a DataFrame built from the
  # positions exactly as the caller supplied them.
  unless positions.is_a?(Integer)
    picked = get_rows_for(requested)
    return DaruLite::DataFrame.new(picked, index: @index.at(*requested), order: @vectors)
  end

  # Single position: the row comes back as a Vector indexed by vector names.
  DaruLite::Vector.new(get_rows_for([positions]), index: @vectors)
end
|
|
353
|
+
|
|
354
|
+
# Set rows by positions
|
|
355
|
+
# @param [Array<Integer>] positions positions of rows to set
|
|
356
|
+
# @param [Array, DaruLite::Vector] vector vector to be assigned
|
|
357
|
+
# @example
|
|
358
|
+
# df = DaruLite::DataFrame.new({
|
|
359
|
+
# a: [1, 2, 3],
|
|
360
|
+
# b: ['a', 'b', 'c']
|
|
361
|
+
# })
|
|
362
|
+
# df.set_row_at [0, 1], ['x', 'x']
|
|
363
|
+
# df
|
|
364
|
+
# #=> #<DaruLite::DataFrame(3x2)>
|
|
365
|
+
# # a b
|
|
366
|
+
# # 0 x x
|
|
367
|
+
# # 1 x x
|
|
368
|
+
# # 2 3 c
|
|
369
|
+
def set_row_at(positions, vector)
  validate_positions(*positions, nrows)

  # A Vector is aligned to the vector names; anything else is wrapped as-is.
  vector = vector.is_a?(DaruLite::Vector) ? vector.reindex(@vectors) : DaruLite::Vector.new(vector)

  if vector.size != @vectors.size
    raise SizeError, 'Vector length should match row length'
  end

  # Write the n-th element of the row into the n-th column at every position.
  @data.each_with_index do |column, col_pos|
    column.set_at(positions, vector.at(col_pos))
  end
  @index = @data[0].index
  set_size
end
|
|
387
|
+
|
|
388
|
+
# Retrieve vectors by positions
|
|
389
|
+
# @param [Array<Integer>] positions of vectors to retrieve
|
|
390
|
+
# @return [DaruLite::Vector, DaruLite::DataFrame] vector for single position and dataframe for multiple positions
|
|
391
|
+
# @example
|
|
392
|
+
# df = DaruLite::DataFrame.new({
|
|
393
|
+
# a: [1, 2, 3],
|
|
394
|
+
# b: ['a', 'b', 'c']
|
|
395
|
+
# })
|
|
396
|
+
# df.at 0
|
|
397
|
+
# # => #<DaruLite::Vector(3)>
|
|
398
|
+
# # a
|
|
399
|
+
# # 0 1
|
|
400
|
+
# # 1 2
|
|
401
|
+
# # 2 3
|
|
402
|
+
def at(*positions)
  # A trailing axis marker is stripped; :row hands the call over to #row_at.
  if AXES.include?(positions.last)
    return row_at(*positions) if positions.pop == :row
  end

  requested = positions
  positions = coerce_positions(*positions, ncols)
  validate_positions(*positions, ncols)

  # Single position: a copy of that vector.
  return @data[positions].dup if positions.is_a?(Integer)

  # Several positions: a new frame made of copies of the chosen vectors.
  columns = positions.map { |pos| @data[pos].dup }
  DaruLite::DataFrame.new(columns,
                          index: @index,
                          order: @vectors.at(*requested),
                          name: @name)
end
|
|
421
|
+
|
|
422
|
+
# Set vectors by positions
|
|
423
|
+
# @param [Array<Integer>] positions positions of vectors to set
|
|
424
|
+
# @param [Array, DaruLite::Vector] vector vector to be assigned
|
|
425
|
+
# @example
|
|
426
|
+
# df = DaruLite::DataFrame.new({
|
|
427
|
+
# a: [1, 2, 3],
|
|
428
|
+
# b: ['a', 'b', 'c']
|
|
429
|
+
# })
|
|
430
|
+
# df.set_at [0], ['x', 'y', 'z']
|
|
431
|
+
# df
|
|
432
|
+
# #=> #<DaruLite::DataFrame(3x2)>
|
|
433
|
+
# # a b
|
|
434
|
+
# # 0 x a
|
|
435
|
+
# # 1 y b
|
|
436
|
+
# # 2 z c
|
|
437
|
+
# Assigns +vector+ to the vectors at +positions+. When the last element of
# +positions+ is :row, the remaining positions are treated as row positions
# and the call is handed to #set_row_at.
#
# @param positions [Array<Integer>] positions to set, optionally followed
#   by :row; the array is not modified
# @param vector [Array, DaruLite::Vector] values to assign
# @raise [SizeError] if the vector size differs from the index size
def set_at(positions, vector)
  # Slice rather than pop: popping removed :row from the caller's own array
  # and raised on frozen input.
  return set_row_at(positions[0...-1], vector) if positions.last == :row

  validate_positions(*positions, ncols)
  vector =
    if vector.is_a? DaruLite::Vector
      vector.reindex @index
    else
      DaruLite::Vector.new vector
    end

  raise SizeError, 'Vector length should match index length' if
    vector.size != @index.size

  positions.each { |pos| @data[pos] = vector }
end
|
|
456
|
+
|
|
457
|
+
# Insert a new row/vector of the specified name or modify a previous row.
|
|
458
|
+
# Instead of using this method directly, use df.row[:a] = [1,2,3] to set/create
|
|
459
|
+
# a row ':a' to [1,2,3], or df.vector[:vec] = [1,2,3] for vectors.
|
|
460
|
+
#
|
|
461
|
+
# In case a DaruLite::Vector is specified after the equality sign, the indexes
|
|
462
|
+
# of the vector will be matched against the row/vector indexes of the DataFrame
|
|
463
|
+
# before an insertion is performed. Unmatched indexes will be set to nil.
|
|
464
|
+
def []=(*args)
  # The last argument is the value being assigned; what remains are the
  # names, possibly followed by an axis that extract_axis resolves.
  value = args.pop
  target_axis = extract_axis(args)

  dispatch_to_axis(target_axis, :insert_or_modify, args, value)
end
|
|
471
|
+
|
|
472
|
+
# Sets the row labelled +index+ to +row+ through the row accessor; when no
# index is given the current @size is used as the label.
def add_row(row, index = nil)
  label = index || @size
  self.row[*label] = row
end
|
|
475
|
+
|
|
476
|
+
# Named alternative to `df[n] = vector`: assigns +vector+ under name +n+.
def add_vector(n, vector)
  self[n] = vector
end
|
|
479
|
+
|
|
480
|
+
# Inserts a new vector built from +source+ at position +n+ of the vector
# order.
#
# @param n [Integer] position in the vector order for the new vector
# @param name [Object] name of the new vector
# @param source [Array] values of the new vector
# @raise [ArgumentError] if +source+ is not an Array
def insert_vector(n, name, source)
  raise ArgumentError, 'source must be an Array' unless source.is_a? Array

  # Name the vector after its own column; the original passed @name, which
  # is the DataFrame's name.
  vector = DaruLite::Vector.new(source, index: @index, name: name)
  @data << vector
  @vectors = @vectors.add name

  # The new name was appended last; move it to slot n and apply the order.
  ordr = @vectors.dup.to_a
  elmnt = ordr.pop
  ordr.insert n, elmnt
  self.order = ordr
end
|
|
491
|
+
|
|
492
|
+
# Access a row or set/create a row. Refer #[] and #[]= docs for details.
|
|
493
|
+
#
|
|
494
|
+
# == Usage
|
|
495
|
+
# df.row[:a] # access row named ':a'
|
|
496
|
+
# df.row[:b] = [1,2,3] # set row ':b' to [1,2,3]
|
|
497
|
+
def row
  # A fresh by-row accessor wrapping this frame is built on every call.
  DaruLite::Accessors::DataFrameByRow.new self
end
|
|
500
|
+
|
|
501
|
+
# Extract a dataframe given row indexes or positions
|
|
502
|
+
# @param keys [Array] can be positions (if by_position is true) or indexes (if by_position is false)
|
|
503
|
+
# @return [DaruLite::Dataframe]
|
|
504
|
+
def get_sub_dataframe(keys, by_position: true)
  return DaruLite::DataFrame.new({}) if keys == []

  # Index labels are translated to positions first when requested.
  positions = by_position ? keys : @index.pos(*keys)
  result = row_at(*positions)

  # row_at may return a Vector; convert and transpose it so the caller
  # always receives a DataFrame.
  result.is_a?(DaruLite::Vector) ? result.to_df.transpose : result
end
|
|
514
|
+
|
|
515
|
+
# Duplicate the DataFrame entirely.
|
|
516
|
+
#
|
|
517
|
+
# == Arguments
|
|
518
|
+
#
|
|
519
|
+
# * +vectors_to_dup+ - An Array specifying the names of Vectors to
|
|
520
|
+
# be duplicated. Will duplicate the entire DataFrame if not specified.
|
|
521
|
+
def dup(vectors_to_dup = nil)
  names = vectors_to_dup || @vectors.to_a

  # Copy each requested vector, looked up by its position in @vectors.
  copies = names.map { |name| @data[@vectors.pos(name)].dup }

  DaruLite::DataFrame.new(
    copies,
    order: DaruLite::Index.new(names),
    index: @index.dup,
    name: @name,
    clone: true
  )
end
|
|
529
|
+
|
|
530
|
+
# Only clone the structure of the DataFrame.
|
|
531
|
+
def clone_structure
  # No data is passed: only copies of the vector order and the row index,
  # plus the name, are handed to the new frame.
  DaruLite::DataFrame.new [], order: @vectors.dup, index: @index.dup, name: @name
end
|
|
534
|
+
|
|
535
|
+
# Returns a 'view' of the DataFrame, i.e the object ID's of vectors are
|
|
536
|
+
# preserved.
|
|
537
|
+
#
|
|
538
|
+
# == Arguments
|
|
539
|
+
#
|
|
540
|
+
# +vectors_to_clone+ - Names of vectors to clone. Optional. Will return
|
|
541
|
+
# a view of the whole data frame otherwise.
|
|
542
|
+
def clone(*vectors_to_clone)
  # Accept both clone(:a, :b) and clone([:a, :b]); no names means all vectors.
  vectors_to_clone.flatten! if ArrayHelper.array_of?(vectors_to_clone, Array)
  vectors_to_clone = @vectors.to_a if vectors_to_clone.empty?

  views = vectors_to_clone.each_with_object({}) do |vec, acc|
    acc[vec] = self[vec]
  end
  # clone: false asks the constructor not to duplicate the vectors.
  DaruLite::DataFrame.new(views, clone: false, order: vectors_to_clone, name: @name)
end
|
|
549
|
+
|
|
550
|
+
# Returns a 'shallow' copy of DataFrame if missing data is not present,
|
|
551
|
+
# or a full copy of only valid data if missing data is present.
|
|
552
|
+
def clone_only_valid
  missing = DaruLite::MISSING_VALUES

  # Nothing missing: a plain #clone is enough.
  return clone unless include_values?(*missing)

  reject_values(*missing)
end
|
|
559
|
+
|
|
560
|
+
# Creates a new duplicate dataframe containing only rows
|
|
561
|
+
# without a single missing value.
|
|
562
|
+
# Returns the rows that hold none of DaruLite::MISSING_VALUES in any vector.
#
# @param vecs [Array, nil] names of the vectors to keep in the result; nil
#   selects from this frame directly instead of from a #dup of it
# @return the result of the row accessor for the remaining row labels
def dup_only_valid(vecs = nil)
  # flat_map yields [] when the frame has no vectors, where the former
  # inject(&:concat) returned nil and then failed on #uniq. It also leaves
  # the arrays returned by Vector#indexes untouched (concat mutated the
  # first one).
  rows_with_nil = @data.flat_map { |vec| vec.indexes(*DaruLite::MISSING_VALUES) }.uniq

  row_indexes = @index.to_a
  (vecs.nil? ? self : dup(vecs)).row[*(row_indexes - rows_with_nil)]
end
|
|
570
|
+
deprecate :dup_only_valid, :reject_values, 2016, 10
|
|
571
|
+
|
|
572
|
+
# Returns a dataframe in which rows with any of the mentioned values
|
|
573
|
+
# are ignored.
|
|
574
|
+
# @param [Array] values to reject to form the new dataframe
|
|
575
|
+
# @return [DaruLite::DataFrame] Data Frame with only rows which don't
|
|
576
|
+
# contain the mentioned values
|
|
577
|
+
# @example
|
|
578
|
+
# df = DaruLite::DataFrame.new({
|
|
579
|
+
# a: [1, 2, 3, nil, Float::NAN, nil, 1, 7],
|
|
580
|
+
# b: [:a, :b, nil, Float::NAN, nil, 3, 5, 8],
|
|
581
|
+
# c: ['a', Float::NAN, 3, 4, 3, 5, nil, 7]
|
|
582
|
+
# }, index: 11..18)
|
|
583
|
+
# df.reject_values nil, Float::NAN
|
|
584
|
+
# # => #<DaruLite::DataFrame(2x3)>
|
|
585
|
+
# # a b c
|
|
586
|
+
# # 11 1 a a
|
|
587
|
+
# # 18 7 8 7
|
|
588
|
+
def reject_values(*values)
  rejected = @data.flat_map { |vec| vec.positions(*values) }
  kept = size.times.to_a - rejected

  return row_at(*kept) unless kept.size == 1

  # With exactly one surviving row, a one-element range is passed so that
  # #row_at returns a DataFrame rather than a Vector.
  only = kept.first
  row_at(only..only)
end
|
|
599
|
+
|
|
600
|
+
# Replace specified values with given value
|
|
601
|
+
# @param [Array] old_values values to replace with new value
|
|
602
|
+
# @param [object] new_value new value to replace with
|
|
603
|
+
# @return [DaruLite::DataFrame] Data Frame itself with old values replaced
|
|
604
|
+
# with new value
|
|
605
|
+
# @example
|
|
606
|
+
# df = DaruLite::DataFrame.new({
|
|
607
|
+
# a: [1, 2, 3, nil, Float::NAN, nil, 1, 7],
|
|
608
|
+
# b: [:a, :b, nil, Float::NAN, nil, 3, 5, 8],
|
|
609
|
+
# c: ['a', Float::NAN, 3, 4, 3, 5, nil, 7]
|
|
610
|
+
# }, index: 11..18)
|
|
611
|
+
# df.replace_values nil, Float::NAN
|
|
612
|
+
# # => #<DaruLite::DataFrame(8x3)>
|
|
613
|
+
# # a b c
|
|
614
|
+
# # 11 1 a a
|
|
615
|
+
# # 12 2 b NaN
|
|
616
|
+
# # 13 3 NaN 3
|
|
617
|
+
# # 14 NaN NaN 4
|
|
618
|
+
# # 15 NaN NaN 3
|
|
619
|
+
# # 16 NaN 3 5
|
|
620
|
+
# # 17 1 5 NaN
|
|
621
|
+
# # 18 7 8 7
|
|
622
|
+
def replace_values(old_values, new_value)
  # Every vector performs the replacement itself; the frame is returned so
  # calls can be chained.
  @data.each do |column|
    column.replace_values(old_values, new_value)
  end
  self
end
|
|
626
|
+
|
|
627
|
+
# Rolling fillna
|
|
628
|
+
# replace all Float::NAN and NIL values with the preceding or following value
|
|
629
|
+
#
|
|
630
|
+
# @param direction [Symbol] (:forward, :backward) whether replacement value is preceding or following
|
|
631
|
+
#
|
|
632
|
+
# @example
|
|
633
|
+
# df = DaruLite::DataFrame.new({
|
|
634
|
+
# a: [1, 2, 3, nil, Float::NAN, nil, 1, 7],
|
|
635
|
+
# b: [:a, :b, nil, Float::NAN, nil, 3, 5, nil],
|
|
636
|
+
# c: ['a', Float::NAN, 3, 4, 3, 5, nil, 7]
|
|
637
|
+
# })
|
|
638
|
+
#
|
|
639
|
+
# => #<DaruLite::DataFrame(8x3)>
|
|
640
|
+
# a b c
|
|
641
|
+
# 0 1 a a
|
|
642
|
+
# 1 2 b NaN
|
|
643
|
+
# 2 3 nil 3
|
|
644
|
+
# 3 nil NaN 4
|
|
645
|
+
# 4 NaN nil 3
|
|
646
|
+
# 5 nil 3 5
|
|
647
|
+
# 6 1 5 nil
|
|
648
|
+
# 7 7 nil 7
|
|
649
|
+
#
|
|
650
|
+
# 2.3.3 :068 > df.rolling_fillna(:forward)
|
|
651
|
+
# => #<DaruLite::DataFrame(8x3)>
|
|
652
|
+
# a b c
|
|
653
|
+
# 0 1 a a
|
|
654
|
+
# 1 2 b a
|
|
655
|
+
# 2 3 b 3
|
|
656
|
+
# 3 3 b 4
|
|
657
|
+
# 4 3 b 3
|
|
658
|
+
# 5 3 3 5
|
|
659
|
+
# 6 1 5 5
|
|
660
|
+
# 7 7 5 7
|
|
661
|
+
#
|
|
662
|
+
def rolling_fillna!(direction = :forward)
  # Delegates to Vector#rolling_fillna! on each vector of this frame and
  # returns the frame itself.
  @data.each do |column|
    column.rolling_fillna!(direction)
  end
  self
end
|
|
666
|
+
|
|
667
|
+
def rolling_fillna(direction = :forward)
  # Non-destructive variant: fill a duplicate and return it.
  copy = dup
  copy.rolling_fillna!(direction)
end
|
|
670
|
+
|
|
671
|
+
# Return unique rows by vector specified or all vectors
|
|
672
|
+
#
|
|
673
|
+
# @param vtrs [String][Symbol] vector names(s) that should be considered
|
|
674
|
+
#
|
|
675
|
+
# @example
|
|
676
|
+
#
|
|
677
|
+
# => #<DaruLite::DataFrame(6x2)>
|
|
678
|
+
# a b
|
|
679
|
+
# 0 1 a
|
|
680
|
+
# 1 2 b
|
|
681
|
+
# 2 3 c
|
|
682
|
+
# 3 4 d
|
|
683
|
+
# 2 3 c
|
|
684
|
+
# 3 4 f
|
|
685
|
+
#
|
|
686
|
+
# 2.3.3 :> df.uniq
|
|
687
|
+
# => #<DaruLite::DataFrame(5x2)>
|
|
688
|
+
# a b
|
|
689
|
+
# 0 1 a
|
|
690
|
+
# 1 2 b
|
|
691
|
+
# 2 3 c
|
|
692
|
+
# 3 4 d
|
|
693
|
+
# 3 4 f
|
|
694
|
+
#
|
|
695
|
+
# 2.3.3 :> df.uniq(:a)
|
|
696
|
+
# => #<DaruLite::DataFrame(4x2)>
|
|
697
|
+
# a b
|
|
698
|
+
# 0 1 a
|
|
699
|
+
# 1 2 b
|
|
700
|
+
# 2 3 c
|
|
701
|
+
# 3 4 d
|
|
702
|
+
#
|
|
703
|
+
def uniq(*vtrs)
  # With no names given, every vector takes part in the comparison.
  keys = vtrs.empty? ? vectors.to_a : vtrs

  # Keep the first member of each group and restore ascending order.
  representatives = group_by(keys).groups.values.map { |members| members[0] }.sort
  row[*representatives]
end
|
|
709
|
+
|
|
710
|
+
# Iterate over each entry of the DataFrame's row index.
#
# @return [Enumerator] when no block is given
# @return [self] after iterating when a block is given
def each_index(&block)
  if block
    @index.each(&block)
    self
  else
    to_enum(:each_index)
  end
end
|
|
718
|
+
|
|
719
|
+
# Iterate over each vector (column) held in @data.
#
# @return [Enumerator] when no block is given
# @return [self] after iterating when a block is given
def each_vector(&block)
  if block
    @data.each(&block)
    self
  else
    to_enum(:each_vector)
  end
end
|
|
727
|
+
|
|
728
|
+
alias each_column each_vector
|
|
729
|
+
|
|
730
|
+
# Iterate over each vector along with the name of the vector.
# The block receives (vector, name).
#
# @return [Enumerator] when no block is given
# @return [self] after iterating when a block is given
def each_vector_with_index
  if block_given?
    @vectors.each do |name|
      # @vectors maps a vector name to its position inside @data.
      position = @vectors[name]
      yield @data[position], name
    end
    self
  else
    to_enum(:each_vector_with_index)
  end
end
|
|
740
|
+
|
|
741
|
+
alias each_column_with_index each_vector_with_index
|
|
742
|
+
|
|
743
|
+
# Iterate over each row, addressing rows by position (0 up to the
# size of the index) through #row_at.
#
# @return [Enumerator] when no block is given
# @return [self] after iterating when a block is given
def each_row
  return to_enum(:each_row) unless block_given?

  (0...@index.size).each do |position|
    yield row_at(position)
  end
  self
end
|
|
753
|
+
|
|
754
|
+
# Iterate over each row together with its index label.
# The block receives (row, label); rows are fetched with #access_row.
#
# @return [Enumerator] when no block is given
# @return [self] after iterating when a block is given
def each_row_with_index
  if block_given?
    @index.each do |label|
      yield access_row(label), label
    end
    self
  else
    to_enum(:each_row_with_index)
  end
end
|
|
763
|
+
|
|
764
|
+
# Iterate over each row or vector of the DataFrame. Specify axis
|
|
765
|
+
# by passing :vector or :row as the argument. Default to :vector.
|
|
766
|
+
#
|
|
767
|
+
# == Description
|
|
768
|
+
#
|
|
769
|
+
# `#each` works exactly like Array#each. The default mode for `each`
|
|
770
|
+
# is to iterate over the columns of the DataFrame. To iterate over
|
|
771
|
+
# rows you must pass the axis, i.e `:row` as an argument.
|
|
772
|
+
#
|
|
773
|
+
# == Arguments
|
|
774
|
+
#
|
|
775
|
+
# * +axis+ - The axis to iterate over. Can be :vector (or :column)
|
|
776
|
+
# or :row. Default to :vector.
|
|
777
|
+
def each(axis = :vector, &block)
  # Axis handling lives in the shared dispatcher.
  dispatch_to_axis(axis, :each, &block)
end
|
|
780
|
+
|
|
781
|
+
# Iterate over a row or vector and return results in a DaruLite::Vector.
|
|
782
|
+
# Specify axis with :vector or :row. Default to :vector.
|
|
783
|
+
#
|
|
784
|
+
# == Description
|
|
785
|
+
#
|
|
786
|
+
# The #collect iterator works similar to #map, the only difference
|
|
787
|
+
# being that it returns a DaruLite::Vector comprising of the results of
|
|
788
|
+
# each block run. The resultant Vector has the same index as that
|
|
789
|
+
# of the axis over which collect has iterated. It also accepts the
|
|
790
|
+
# optional axis argument.
|
|
791
|
+
#
|
|
792
|
+
# == Arguments
|
|
793
|
+
#
|
|
794
|
+
# * +axis+ - The axis to iterate over. Can be :vector (or :column)
|
|
795
|
+
# or :row. Default to :vector.
|
|
796
|
+
def collect(axis = :vector, &block)
  # Axis handling lives in the shared dispatcher.
  dispatch_to_axis_pl(axis, :collect, &block)
end
|
|
799
|
+
|
|
800
|
+
# Map over each vector or row of the data frame according to
|
|
801
|
+
# the argument specified. Will return an Array of the resulting
|
|
802
|
+
# elements. To map over each row/vector and get a DataFrame,
|
|
803
|
+
# see #recode.
|
|
804
|
+
#
|
|
805
|
+
# == Description
|
|
806
|
+
#
|
|
807
|
+
# The #map iterator works like Array#map. The value returned by
|
|
808
|
+
# each run of the block is added to an Array and the Array is
|
|
809
|
+
# returned. This method also accepts an axis argument, like #each.
|
|
810
|
+
# The default is :vector.
|
|
811
|
+
#
|
|
812
|
+
# == Arguments
|
|
813
|
+
#
|
|
814
|
+
# * +axis+ - The axis to map over. Can be :vector (or :column) or :row.
|
|
815
|
+
# Default to :vector.
|
|
816
|
+
def map(axis = :vector, &block)
  # Axis handling lives in the shared dispatcher.
  dispatch_to_axis_pl(axis, :map, &block)
end
|
|
819
|
+
|
|
820
|
+
# Destructive map. Modifies the DataFrame. Each run of the block
|
|
821
|
+
# must return a DaruLite::Vector. You can specify the axis to map over
|
|
822
|
+
# as the argument. Default to :vector.
|
|
823
|
+
#
|
|
824
|
+
# == Arguments
|
|
825
|
+
#
|
|
826
|
+
# * +axis+ - The axis to map over. Can be :vector (or :column) or :row.
|
|
827
|
+
# Default to :vector.
|
|
828
|
+
def map!(axis = :vector, &block)
  case axis
  when :vector, :column
    map_vectors!(&block)
  when :row
    map_rows!(&block)
  else
    # Fail loudly instead of silently doing nothing, matching the
    # behaviour of #any? and #all? for an unrecognised axis.
    raise ArgumentError, "Unidentified axis #{axis}"
  end
end
|
|
835
|
+
|
|
836
|
+
# Maps over the DataFrame and returns a DataFrame. Each run of the
|
|
837
|
+
# block must return a DaruLite::Vector object. You can specify the axis
|
|
838
|
+
# to map over. Default to :vector.
|
|
839
|
+
#
|
|
840
|
+
# == Description
|
|
841
|
+
#
|
|
842
|
+
# Recode works similarly to #map, but an important difference between
|
|
843
|
+
# the two is that recode returns a modified DaruLite::DataFrame instead
|
|
844
|
+
# of an Array. For this reason, #recode expects that every run of the
|
|
845
|
+
# block to return a DaruLite::Vector.
|
|
846
|
+
#
|
|
847
|
+
# Just like map and each, recode also accepts an optional _axis_ argument.
|
|
848
|
+
#
|
|
849
|
+
# == Arguments
|
|
850
|
+
#
|
|
851
|
+
# * +axis+ - The axis to map over. Can be :vector (or :column) or :row.
|
|
852
|
+
# Default to :vector.
|
|
853
|
+
def recode(axis = :vector, &block)
  # Axis handling lives in the shared dispatcher.
  dispatch_to_axis_pl(axis, :recode, &block)
end
|
|
856
|
+
|
|
857
|
+
# Retain vectors or rows if the block returns a truthy value.
|
|
858
|
+
#
|
|
859
|
+
# == Description
|
|
860
|
+
#
|
|
861
|
+
# For filtering out certain rows/vectors based on their values,
|
|
862
|
+
# use the #filter method. By default it iterates over vectors and
|
|
863
|
+
# keeps those vectors for which the block returns true. It accepts
|
|
864
|
+
# an optional axis argument which lets you specify whether you want
|
|
865
|
+
# to iterate over vectors or rows.
|
|
866
|
+
#
|
|
867
|
+
# == Arguments
|
|
868
|
+
#
|
|
869
|
+
# * +axis+ - The axis to map over. Can be :vector (or :column) or :row.
|
|
870
|
+
# Default to :vector.
|
|
871
|
+
#
|
|
872
|
+
# == Usage
|
|
873
|
+
#
|
|
874
|
+
# # Filter vectors
|
|
875
|
+
#
|
|
876
|
+
# df.filter do |vector|
|
|
877
|
+
# vector.type == :numeric and vector.median < 50
|
|
878
|
+
# end
|
|
879
|
+
#
|
|
880
|
+
# # Filter rows
|
|
881
|
+
#
|
|
882
|
+
# df.filter(:row) do |row|
|
|
883
|
+
# row[:a] + row[:d] < 100
|
|
884
|
+
# end
|
|
885
|
+
def filter(axis = :vector, &block)
  # Axis handling lives in the shared dispatcher.
  dispatch_to_axis_pl(axis, :filter, &block)
end
|
|
888
|
+
|
|
889
|
+
# Builds a duplicate of the frame in which every vector is replaced
# by the block's result for it. The block result is passed through
# should_be_vector! before being assigned.
#
# @return [Enumerator] when no block is given
def recode_vectors
  return to_enum(:recode_vectors) unless block_given?

  copy = dup
  copy.each_vector_with_index do |vector, name|
    copy[*name] = should_be_vector!(yield(vector))
  end
  copy
end
|
|
898
|
+
|
|
899
|
+
# Builds a duplicate of the frame in which every row is replaced by
# the block's result for it. The block result is passed through
# should_be_vector! before being assigned.
#
# @return [Enumerator] when no block is given
def recode_rows
  return to_enum(:recode_rows) unless block_given?

  copy = dup
  copy.each_row_with_index do |current_row, label|
    copy.row[label] = should_be_vector!(yield(current_row))
  end
  copy
end
|
|
908
|
+
|
|
909
|
+
# Map each vector through the block and return the results as an Array.
#
# @return [Enumerator] when no block is given
def map_vectors(&block)
  block ? @data.map(&block) : to_enum(:map_vectors)
end
|
|
915
|
+
|
|
916
|
+
# Destructive form of #map_vectors: each vector is replaced by the
# block's result (checked with should_be_vector!).
#
# @return [Enumerator] when no block is given
# @return [self] otherwise
def map_vectors!
  return to_enum(:map_vectors!) unless block_given?

  # Iterate over a copy of the names since the frame is assigned to
  # while looping.
  names = vectors.dup
  names.each do |name|
    replacement = yield(self[name])
    self[name] = should_be_vector!(replacement)
  end
  self
end
|
|
926
|
+
|
|
927
|
+
# Map vectors along with their names; the block receives what
# #each_vector_with_index yields.
#
# @return [Enumerator] when no block is given
def map_vectors_with_index(&block)
  if block
    each_vector_with_index.map(&block)
  else
    to_enum(:map_vectors_with_index)
  end
end
|
|
933
|
+
|
|
934
|
+
# Map each row through the block and return the results as an Array.
#
# @return [Enumerator] when no block is given
def map_rows(&block)
  if block
    each_row.map(&block)
  else
    to_enum(:map_rows)
  end
end
|
|
940
|
+
|
|
941
|
+
# Map rows along with their index labels; the block receives what
# #each_row_with_index yields.
#
# @return [Enumerator] when no block is given
def map_rows_with_index(&block)
  if block
    each_row_with_index.map(&block)
  else
    to_enum(:map_rows_with_index)
  end
end
|
|
946
|
+
|
|
947
|
+
# Destructive row map: each row is replaced by the block's result
# (checked with should_be_vector!).
#
# @return [Enumerator] when no block is given
# @return [self] otherwise
def map_rows!
  return to_enum(:map_rows!) unless block_given?

  # Iterate over a copy of the index since rows are assigned while looping.
  labels = index.dup
  labels.each do |label|
    replacement = yield(row[label])
    row[label] = should_be_vector!(replacement)
  end
  self
end
|
|
956
|
+
|
|
957
|
+
# Applies +method+ to the whole frame, or to the sub-frame selected by
# +keys+ when they are given.
#
# @param method [Symbol, Proc, Array<Symbol, Proc>] a Symbol is sent to
#   the frame, a Proc is called with it, and an Array yields one
#   result per element
# @param keys passed to get_sub_dataframe when not nil
# @param by_position [Boolean] forwarded to get_sub_dataframe
# @raise [RuntimeError] when +method+ is none of the supported kinds
def apply_method(method, keys: nil, by_position: true)
  df = keys ? get_sub_dataframe(keys, by_position: by_position) : self

  case method
  when Symbol then df.send(method)
  when Proc then method.call(df)
  when Array then method.map(&:to_proc).map { |callable| callable.call(df) } # works with Array of both Symbol and/or Proc
  else
    # A bare `raise` would re-raise any exception currently being
    # handled, or give a message-less RuntimeError; be explicit.
    raise "Unsupported method #{method.inspect}: expected a Symbol, a Proc or an Array of them"
  end
end
|
|
967
|
+
alias apply_method_on_sub_df apply_method
|
|
968
|
+
|
|
969
|
+
# Builds a DaruLite::Vector from the block's result for each row.
# The resulting vector shares the frame's row index.
#
# @return [Enumerator] when no block is given
def collect_rows(&block)
  return to_enum(:collect_rows) unless block

  results = each_row.map(&block)
  DaruLite::Vector.new(results, index: @index)
end
|
|
976
|
+
|
|
977
|
+
# Builds a DaruLite::Vector from the block's result for each
# (row, index label) pair. The vector shares the frame's row index.
#
# @return [Enumerator] when no block is given
def collect_row_with_index(&block)
  return to_enum(:collect_row_with_index) unless block

  results = each_row_with_index.map(&block)
  DaruLite::Vector.new(results, index: @index)
end
|
|
982
|
+
|
|
983
|
+
# Retrieves a DaruLite::Vector built from the block's result for each
# vector. The resulting vector is indexed by the frame's vector names.
#
# @return [Enumerator] when no block is given
def collect_vectors(&block)
  return to_enum(:collect_vectors) unless block

  results = each_vector.map(&block)
  DaruLite::Vector.new(results, index: @vectors)
end
|
|
990
|
+
|
|
991
|
+
# Builds a DaruLite::Vector from the block's result for each
# (vector, name) pair. The vector is indexed by the frame's vector names.
#
# @return [Enumerator] when no block is given
def collect_vector_with_index(&block)
  return to_enum(:collect_vector_with_index) unless block

  results = each_vector_with_index.map(&block)
  DaruLite::Vector.new(results, index: @vectors)
end
|
|
996
|
+
|
|
997
|
+
# Generate a matrix, based on vector names of the DataFrame.
|
|
998
|
+
#
|
|
999
|
+
# @return {::Matrix}
|
|
1000
|
+
# :nocov:
|
|
1001
|
+
# FIXME: Even not trying to cover this: I can't get, how it is expected
|
|
1002
|
+
# to work.... -- zverok
|
|
1003
|
+
def collect_matrix
  return to_enum(:collect_matrix) unless block_given?

  names = vectors.to_a
  # One cell per ordered pair of vector names: (row name, column name).
  cells = names.map do |row_name|
    names.map { |col_name| yield row_name, col_name }
  end

  Matrix.rows(cells)
end
|
|
1015
|
+
# :nocov:
|
|
1016
|
+
|
|
1017
|
+
# Delete a vector by name, removing both its data and its name.
#
# @raise [IndexError] when no vector with that name exists
# @return [self]
def delete_vector(vector)
  unless @vectors.include?(vector)
    raise IndexError, "Vector #{vector} does not exist."
  end

  position = @vectors[vector]
  @data.delete_at(position)
  remaining = @vectors.to_a - [vector]
  @vectors = DaruLite::Index.new(remaining)

  self
end
|
|
1026
|
+
|
|
1027
|
+
# Deletes each of the named vectors in turn via #delete_vector.
#
# @return [self]
def delete_vectors(*vectors)
  vectors.each do |name|
    delete_vector(name)
  end

  self
end
|
|
1033
|
+
|
|
1034
|
+
# Delete a row. The argument is resolved with named_index_for, removed
# from the frame's index and from every vector, and the cached size
# is refreshed with set_size.
#
# @raise [IndexError] when the resolved label is not in the index
def delete_row(index)
  label = named_index_for(index)

  unless @index.include?(label)
    raise IndexError, "Index #{index} does not exist."
  end

  @index = DaruLite::Index.new(@index.to_a - [label])
  each_vector { |vector| vector.delete_at(label) }

  set_size
end
|
|
1047
|
+
|
|
1048
|
+
# Creates a DataFrame of n rows, each drawn at random (with
# replacement) from the rows of this DataFrame.
# If n is not given, the original number of rows is used.
#
# @param n [Integer, nil] number of rows to draw
# @return {DaruLite::DataFrame}
def bootstrap(n = nil)
  n ||= nrows
  source_rows = nrows
  DaruLite::DataFrame.new({}, order: @vectors).tap do |df_boot|
    n.times do
      # Sample a position among the rows that actually exist; sampling
      # from 0...n would run past the end when n > nrows and would
      # never pick the trailing rows when n < nrows.
      df_boot.add_row(row_at(rand(source_rows)))
    end
    df_boot.update
  end
end
|
|
1061
|
+
|
|
1062
|
+
# Destructively removes every row for which the block returns a
# falsy value. Rows are fetched with #access_row and removed with
# #delete_row.
def keep_row_if
  rejected = @index.reject { |label| yield access_row(label) }
  rejected.each { |label| delete_row(label) }
end
|
|
1067
|
+
|
|
1068
|
+
# Destructively removes every vector for which the block returns a
# falsy value. The block receives (vector, name).
def keep_vector_if
  @vectors.each do |name|
    column = @data[@vectors[name]]
    delete_vector(name) unless yield(column, name)
  end
end
|
|
1073
|
+
|
|
1074
|
+
# Creates a new vector holding the value of field +vec+ from every
# row for which the block returns true.
def filter_vector(vec, &block)
  matching_rows = each_row.select(&block)
  values = matching_rows.map { |row| row[vec] }
  DaruLite::Vector.new(values)
end
|
|
1078
|
+
|
|
1079
|
+
# Iterates over each row and retains it in a new DataFrame if the block
# returns true for that row. The block results are collected into a
# mask which is handed to #where.
#
# @return [Enumerator] when no block is given
def filter_rows
  return to_enum(:filter_rows) unless block_given?

  mask = @index.map do |label|
    yield access_row(label)
  end

  where(mask)
end
|
|
1088
|
+
|
|
1089
|
+
# Iterates over each vector and retains it in a new DataFrame if the
# block returns true for that vector. Works on a duplicate, so the
# receiver's own vectors are not removed here.
#
# @return [Enumerator] when no block is given
def filter_vectors(&block)
  return to_enum(:filter_vectors) unless block

  copy = dup
  copy.keep_vector_if(&block)
  copy
end
|
|
1096
|
+
|
|
1097
|
+
# Test each row with one or more tests.
|
|
1098
|
+
# @param tests [Proc] Each test is a Proc with the form
|
|
1099
|
+
# *Proc.new {|row| row[:age] > 0}*
|
|
1100
|
+
# The function returns an array with all errors.
|
|
1101
|
+
#
|
|
1102
|
+
# FIXME: description here is too sparse. As far as I can get,
|
|
1103
|
+
# it should tell something about that each test is [descr, fields, block],
|
|
1104
|
+
# and that first value may be column name to output. - zverok, 2016-05-18
|
|
1105
|
+
def verify(*tests)
  # An optional leading Symbol names the vector used to identify rows
  # in the messages; otherwise the first vector is used.
  id = tests.first.is_a?(Symbol) ? tests.shift : @vectors.first

  messages = each_row_with_index.map do |row, i|
    # The checking block is the last element of each test.
    failed = tests.reject { |*_, check| check.call(row) }
    failed.map { |test| verify_error_message(row, test, id, i) }
  end
  messages.flatten
end
|
|
1113
|
+
|
|
1114
|
+
# DSL for yielding each row and returning a DaruLite::Vector based on the
|
|
1115
|
+
# value each run of the block returns.
|
|
1116
|
+
#
|
|
1117
|
+
# == Usage
|
|
1118
|
+
#
|
|
1119
|
+
# a1 = DaruLite::Vector.new([1, 2, 3, 4, 5, 6, 7])
|
|
1120
|
+
# a2 = DaruLite::Vector.new([10, 20, 30, 40, 50, 60, 70])
|
|
1121
|
+
# a3 = DaruLite::Vector.new([100, 200, 300, 400, 500, 600, 700])
|
|
1122
|
+
# ds = DaruLite::DataFrame.new({ :a => a1, :b => a2, :c => a3 })
|
|
1123
|
+
# total = ds.vector_by_calculation { a + b + c }
|
|
1124
|
+
# # <DaruLite::Vector:82314050 @name = nil @size = 7 >
|
|
1125
|
+
# # nil
|
|
1126
|
+
# # 0 111
|
|
1127
|
+
# # 1 222
|
|
1128
|
+
# # 2 333
|
|
1129
|
+
# # 3 444
|
|
1130
|
+
# # 4 555
|
|
1131
|
+
# # 5 666
|
|
1132
|
+
# # 6 777
|
|
1133
|
+
def vector_by_calculation(&block)
  # The block runs with each row as self, so fields can be referenced
  # by bare name inside it.
  results = each_row.map do |current_row|
    current_row.instance_eval(&block)
  end

  DaruLite::Vector.new(results, index: @index)
end
|
|
1138
|
+
|
|
1139
|
+
# Reorder the vectors in a dataframe
|
|
1140
|
+
# @param [Array] order_array new order of the vectors
|
|
1141
|
+
# @example
|
|
1142
|
+
# df = DaruLite::DataFrame.new({
|
|
1143
|
+
# a: [1, 2, 3],
|
|
1144
|
+
# b: [4, 5, 6]
|
|
1145
|
+
# }, order: [:a, :b])
|
|
1146
|
+
# df.order = [:b, :a]
|
|
1147
|
+
# df
|
|
1148
|
+
# # => #<DaruLite::DataFrame(3x2)>
|
|
1149
|
+
# # b a
|
|
1150
|
+
# # 0 4 1
|
|
1151
|
+
# # 1 5 2
|
|
1152
|
+
# # 2 6 3
|
|
1153
|
+
def order=(order_array)
  # The new order must contain exactly the existing vector names.
  unless order_array.sort == vectors.to_a.sort
    raise ArgumentError, 'Invalid order'
  end

  # Rebuild the frame in place from its own data in the new order.
  initialize(to_h, order: order_array)
end
|
|
1159
|
+
|
|
1160
|
+
# Return the dataframe with rotate vectors positions, the vector at position count is now
|
|
1161
|
+
# the first vector of the dataframe.
|
|
1162
|
+
# If only one vector in the dataframe, the dataframe is return without any change.
|
|
1163
|
+
# @param count => Integer, the vector at position count will be the first vector of the dataframe.
|
|
1164
|
+
# @example
|
|
1165
|
+
# df = DaruLite::DataFrame.new({
|
|
1166
|
+
# a: [1, 2, 3],
|
|
1167
|
+
# b: [4, 5, 6],
|
|
1168
|
+
# total: [5, 7, 9],
|
|
1169
|
+
# })
|
|
1170
|
+
# df.rotate_vectors(-1)
|
|
1171
|
+
# df
|
|
1172
|
+
# # => #<DaruLite::DataFrame(3x3)>
|
|
1173
|
+
# # total a b
# # 0 5 1 4
# # 1 7 2 5
# # 2 9 3 6
|
|
1177
|
+
def rotate_vectors(count = -1)
  # Rotating one or zero vectors would change nothing.
  if vectors.many?
    rotated = vectors.to_a.rotate(count)
    self.order = rotated
  end
  self
end
|
|
1183
|
+
|
|
1184
|
+
# Returns a vector, based on a string with a calculation based
|
|
1185
|
+
# on vector.
|
|
1186
|
+
#
|
|
1187
|
+
# The calculation will be eval'ed, so you can put any variable
|
|
1188
|
+
# or expression valid on ruby.
|
|
1189
|
+
#
|
|
1190
|
+
# For example:
|
|
1191
|
+
# a = DaruLite::Vector.new [1,2]
|
|
1192
|
+
# b = DaruLite::Vector.new [3,4]
|
|
1193
|
+
# ds = DaruLite::DataFrame.new({:a => a,:b => b})
|
|
1194
|
+
# ds.compute("a+b")
|
|
1195
|
+
# => Vector [4,6]
|
|
1196
|
+
def compute(text = nil, &block)
  # A block takes precedence; +text+ is optional so that callers using
  # a block do not have to pass a dummy string.
  return instance_eval(&block) if block
  raise ArgumentError, 'compute needs a String expression or a block' if text.nil?

  # NOTE(review): +text+ is evaluated as Ruby code with full access to
  # this object. Never pass untrusted input here.
  instance_eval(text)
end
|
|
1201
|
+
|
|
1202
|
+
# Return a vector with the number of missing values in each row.
|
|
1203
|
+
#
|
|
1204
|
+
# == Arguments
|
|
1205
|
+
#
|
|
1206
|
+
# * +missing_values+ - An Array of the values that should be
|
|
1207
|
+
# treated as 'missing'. The default missing value is *nil*.
|
|
1208
|
+
def missing_values_rows(missing_values = [nil])
  # Count, per row, the positions holding one of the missing values.
  counts = each_row.map do |current_row|
    current_row.indexes(*missing_values).size
  end

  DaruLite::Vector.new(counts, index: @index, name: "#{@name}_missing_rows")
end
|
|
1215
|
+
|
|
1216
|
+
# TODO: remove next version
|
|
1217
|
+
alias vector_missing_values missing_values_rows
|
|
1218
|
+
|
|
1219
|
+
# True when any vector contains one of DaruLite::MISSING_VALUES.
def has_missing_data?
  @data.any? do |column|
    column.include_values?(*DaruLite::MISSING_VALUES)
  end
end
|
|
1222
|
+
alias flawed? has_missing_data?
|
|
1223
|
+
deprecate :has_missing_data?, :include_values?, 2016, 10
|
|
1224
|
+
deprecate :flawed?, :include_values?, 2016, 10
|
|
1225
|
+
|
|
1226
|
+
# Check if any of given values occur in the data frame
|
|
1227
|
+
# @param [Array] values to check for
|
|
1228
|
+
# @return [true, false] true if any of the given values occur in the
|
|
1229
|
+
# dataframe, false otherwise
|
|
1230
|
+
# @example
|
|
1231
|
+
# df = DaruLite::DataFrame.new({
|
|
1232
|
+
# a: [1, 2, 3, nil, Float::NAN, nil, 1, 7],
|
|
1233
|
+
# b: [:a, :b, nil, Float::NAN, nil, 3, 5, 8],
|
|
1234
|
+
# c: ['a', Float::NAN, 3, 4, 3, 5, nil, 7]
|
|
1235
|
+
# }, index: 11..18)
|
|
1236
|
+
# df.include_values? nil
|
|
1237
|
+
# # => true
|
|
1238
|
+
def include_values?(*values)
  # Ask each column in turn; stop at the first one that matches.
  @data.any? do |column|
    column.include_values?(*values)
  end
end
|
|
1241
|
+
|
|
1242
|
+
# Return a nested hash using vector names as keys and an array constructed of
|
|
1243
|
+
# hashes with other values. If block provided, is used to provide the
|
|
1244
|
+
# values, with parameters +row+ of dataset, +current+ last hash on
|
|
1245
|
+
# hierarchy and +name+ of the key to include
|
|
1246
|
+
def nest(*tree_keys, &block)
  # Accept either a list of keys or a single Array of keys.
  tree_keys = tree_keys[0] if tree_keys[0].is_a?(Array)
  *branch_keys, leaf_key = tree_keys

  each_row.with_object({}) do |row, root|
    # Walk (creating as needed) one hash level per branch key.
    node = branch_keys.inject(root) { |level, key| level[row[key]] ||= {} }
    leaf = row[leaf_key]

    if block
      node[leaf] = yield(row, node, leaf)
    else
      bucket = (node[leaf] ||= [])
      # Store the row without the fields already used as tree keys.
      bucket.push(row.to_h.delete_if { |key, _value| tree_keys.include?(key) })
    end
  end
end
|
|
1263
|
+
|
|
1264
|
+
# For each row, sums the length of the string form of the values in
# the given vectors (all vectors when +vecs+ is nil). The result is
# whatever #collect_rows returns.
def vector_count_characters(vecs = nil)
  names = vecs || @vectors.to_a

  collect_rows do |current_row|
    names.sum { |name| current_row[name].to_s.size }
  end
end
|
|
1271
|
+
|
|
1272
|
+
# Splits vector +name+ with split_by_separator and adds one new vector
# per resulting key, named "<name><join><key>" (as a Symbol).
def add_vectors_by_split(name, join = '-', sep = DaruLite::SPLIT_TOKEN)
  pieces = self[name].split_by_separator(sep)
  pieces.each do |key, vector|
    self[:"#{name}#{join}#{key}"] = vector
  end
end
|
|
1277
|
+
|
|
1278
|
+
# Return the dimensions of the DataFrame as [rows, columns].
def shape
  [nrows, ncols]
end
|
|
1282
|
+
|
|
1283
|
+
# The number of rows, i.e. the size of the row index.
def nrows
  @index.size
end
|
|
1287
|
+
|
|
1288
|
+
# The number of vectors, i.e. the size of the vector index.
def ncols
  @vectors.size
end
|
|
1292
|
+
|
|
1293
|
+
# Check whether a vector with the given name is present.
def has_vector?(vector)
  @vectors.include?(vector)
end
|
|
1297
|
+
|
|
1298
|
+
# Works like Array#any?.
|
|
1299
|
+
#
|
|
1300
|
+
# @param [Symbol] axis (:vector) The axis to iterate over. Can be :vector or
|
|
1301
|
+
# :row. A DaruLite::Vector object is yielded in the block.
|
|
1302
|
+
# @example Using any?
|
|
1303
|
+
# df = DaruLite::DataFrame.new({a: [1,2,3,4,5], b: ['a', 'b', 'c', 'd', 'e']})
|
|
1304
|
+
# df.any?(:row) do |row|
|
|
1305
|
+
# row[:a] < 3 and row[:b] == 'b'
|
|
1306
|
+
# end #=> true
|
|
1307
|
+
def any?(axis = :vector, &block)
  case axis
  when :vector, :column
    @data.any?(&block)
  when :row
    # Stops at the first row for which the block is truthy.
    each_row.any? { |row| yield(row) }
  else
    raise ArgumentError, "Unidentified axis #{axis}"
  end
end
|
|
1319
|
+
|
|
1320
|
+
# Works like Array#all?
|
|
1321
|
+
#
|
|
1322
|
+
# @param [Symbol] axis (:vector) The axis to iterate over. Can be :vector or
|
|
1323
|
+
# :row. A DaruLite::Vector object is yielded in the block.
|
|
1324
|
+
# @example Using all?
|
|
1325
|
+
# df = DaruLite::DataFrame.new({a: [1,2,3,4,5], b: ['a', 'b', 'c', 'd', 'e']})
|
|
1326
|
+
# df.all?(:row) do |row|
|
|
1327
|
+
# row[:a] < 10
|
|
1328
|
+
# end #=> true
|
|
1329
|
+
def all?(axis = :vector, &block)
  case axis
  when :vector, :column
    @data.all?(&block)
  when :row
    each_row.all?(&block)
  else
    raise ArgumentError, "Unidentified axis #{axis}"
  end
end
|
|
1338
|
+
|
|
1339
|
+
# The first ten elements of the DataFrame
|
|
1340
|
+
#
|
|
1341
|
+
# @param [Fixnum] quantity (10) The number of elements to display from the top.
|
|
1342
|
+
def head(quantity = 10)
  # Positional, inclusive range starting at the first row.
  last_position = quantity - 1
  row.at(0..last_position)
end
|
|
1345
|
+
|
|
1346
|
+
alias first head
|
|
1347
|
+
|
|
1348
|
+
# The last ten elements of the DataFrame
|
|
1349
|
+
#
|
|
1350
|
+
# @param [Fixnum] quantity (10) The number of elements to display from the bottom.
|
|
1351
|
+
def tail(quantity = 10)
  # Never reach back further than the number of rows available.
  first_position = [-quantity, -size].max
  row.at(first_position..-1)
end
|
|
1355
|
+
|
|
1356
|
+
alias last tail
|
|
1357
|
+
|
|
1358
|
+
# Sum all numeric/specified vectors in the DataFrame.
|
|
1359
|
+
#
|
|
1360
|
+
# Returns a new vector containing the sum of all numeric
|
|
1361
|
+
# or specified vectors of the DataFrame. By default, if the vector
|
|
1362
|
+
# contains a nil, the sum is nil.
|
|
1363
|
+
# With :skipnil argument set to true, nil values are assumed to be
|
|
1364
|
+
# 0 (zero) and the sum vector is returned.
|
|
1365
|
+
#
|
|
1366
|
+
# @param args [Array] List of vectors to sum. Default is nil in which case
|
|
1367
|
+
# all numeric vectors are summed.
|
|
1368
|
+
#
|
|
1369
|
+
# @option opts [Boolean] :skipnil Consider nils as 0. Default is false.
|
|
1370
|
+
#
|
|
1371
|
+
# @return Vector with sum of all vectors specified in the argument.
|
|
1372
|
+
# If vecs parameter is empty, sum all numeric vector.
|
|
1373
|
+
#
|
|
1374
|
+
# @example
|
|
1375
|
+
# df = DaruLite::DataFrame.new({
|
|
1376
|
+
# a: [1, 2, nil],
|
|
1377
|
+
# b: [2, 1, 3],
|
|
1378
|
+
# c: [1, 1, 1]
|
|
1379
|
+
# })
|
|
1380
|
+
# => #<DaruLite::DataFrame(3x3)>
|
|
1381
|
+
# a b c
|
|
1382
|
+
# 0 1 2 1
|
|
1383
|
+
# 1 2 1 1
|
|
1384
|
+
# 2 nil 3 1
|
|
1385
|
+
# df.vector_sum [:a, :c]
|
|
1386
|
+
# => #<DaruLite::Vector(3)>
|
|
1387
|
+
# 0 2
|
|
1388
|
+
# 1 3
|
|
1389
|
+
# 2 nil
|
|
1390
|
+
# df.vector_sum
|
|
1391
|
+
# => #<DaruLite::Vector(3)>
|
|
1392
|
+
# 0 4
|
|
1393
|
+
# 1 4
|
|
1394
|
+
# 2 nil
|
|
1395
|
+
# df.vector_sum skipnil: true
|
|
1396
|
+
# => #<DaruLite::Vector(3)>
|
|
1397
|
+
# c
|
|
1398
|
+
# 0 4
|
|
1399
|
+
# 1 4
|
|
1400
|
+
# 2 4
|
|
1401
|
+
#
|
|
1402
|
+
def vector_sum(*args)
  # A trailing Hash carries options; positional arguments win over it.
  given = args.last.is_a?(::Hash) ? args.pop : {}
  options = { vecs: nil, skipnil: false }.merge(given)
  names = args[0] || options[:vecs]
  skipnil = args[1] || options[:skipnil]

  names ||= numeric_vectors
  # Start from a zero vector and fold every requested vector into it.
  zeros = DaruLite::Vector.new([0] * @size, index: @index, name: @name, dtype: @dtype)
  names.inject(zeros) { |running, name| self[name].add(running, skipnil: skipnil) }
end
|
|
1413
|
+
|
|
1414
|
+
# Calculate mean of the rows of the dataframe.
|
|
1415
|
+
#
|
|
1416
|
+
# == Arguments
|
|
1417
|
+
#
|
|
1418
|
+
# * +max_missing+ - The maximum number of elements in the row that can be
# missing for the mean calculation to happen. Default to 0.
|
|
1420
|
+
def vector_mean(max_missing = 0)
  # FIXME: in vector_sum we preserve created vector dtype, but
  # here we are not. Is this by design or ...? - zverok, 2016-05-18
  means = DaruLite::Vector.new([0] * @size, index: @index, name: "mean_#{@name}")

  each_row_with_index.each do |current_row, label|
    missing_count = current_row.indexes(*DaruLite::MISSING_VALUES).size
    # Rows with too many missing values get nil instead of a mean.
    means[label] = missing_count > max_missing ? nil : current_row.mean
  end
  means
end
|
|
1429
|
+
|
|
1430
|
+
# Group elements by vector to perform operations on them. Returns a
|
|
1431
|
+
# DaruLite::Core::GroupBy object. See the DaruLite::Core::GroupBy docs for a detailed
|
|
1432
|
+
# list of possible operations.
|
|
1433
|
+
#
|
|
1434
|
+
# == Arguments
|
|
1435
|
+
#
|
|
1436
|
+
# * vectors - An Array containing names of vectors to group by.
|
|
1437
|
+
#
|
|
1438
|
+
# == Usage
|
|
1439
|
+
#
|
|
1440
|
+
# df = DaruLite::DataFrame.new({
|
|
1441
|
+
# a: %w{foo bar foo bar foo bar foo foo},
|
|
1442
|
+
# b: %w{one one two three two two one three},
|
|
1443
|
+
# c: [1 ,2 ,3 ,1 ,3 ,6 ,3 ,8],
|
|
1444
|
+
# d: [11 ,22 ,33 ,44 ,55 ,66 ,77 ,88]
|
|
1445
|
+
# })
|
|
1446
|
+
# df.group_by([:a,:b,:c]).groups
|
|
1447
|
+
# #=> {["bar", "one", 2]=>[1],
|
|
1448
|
+
# # ["bar", "three", 1]=>[3],
|
|
1449
|
+
# # ["bar", "two", 6]=>[5],
|
|
1450
|
+
# # ["foo", "one", 1]=>[0],
|
|
1451
|
+
# # ["foo", "one", 3]=>[6],
|
|
1452
|
+
# # ["foo", "three", 8]=>[7],
|
|
1453
|
+
# # ["foo", "two", 3]=>[2, 4]}
|
|
1454
|
+
def group_by(*vectors)
  # Accept both group_by(:a, :b) and group_by([:a, :b]).
  keys = vectors.flatten
  unknown = keys - @vectors.to_a
  unless unknown.empty?
    raise(ArgumentError, "Vector(s) missing: #{unknown.join(', ')}")
  end

  # With no names given, group by the first vector.
  keys = [@vectors.first] if keys.empty?

  DaruLite::Core::GroupBy.new(self, keys)
end
|
|
1463
|
+
|
|
1464
|
+
# Builds a new DataFrame whose vectors follow +new_vectors+. Names that
# already exist keep their data; unknown names get an all-nil column
# of nrows entries.
#
# @param new_vectors [DaruLite::Index]
# @raise [ArgumentError] when +new_vectors+ is not a DaruLite::Index
def reindex_vectors(new_vectors)
  unless new_vectors.is_a?(DaruLite::Index)
    raise ArgumentError, 'Must pass the new index of type Index or its ' \
                         "subclasses, not #{new_vectors.class}"
  end

  reindexed = DaruLite::DataFrame.new({}, order: new_vectors, index: @index, name: @name)
  new_vectors.each_with_object(reindexed) do |name, frame|
    frame[name] = @vectors.include?(name) ? self[name] : Array.new(nrows)
  end
end
|
|
1475
|
+
|
|
1476
|
+
# Returns the data of vector +v+ as an Array, or an Array of nils of
# length +size+ when the frame has no such vector.
def get_vector_anyways(v)
  return self[v].to_a if @vectors.include?(v)

  Array.new(size)
end
|
|
1479
|
+
|
|
1480
|
+
# Concatenate another DataFrame along corresponding columns.
# Columns that exist in only one of the two frames are padded with nils
# for the rows of the other frame.
def concat(other_df)
  # Own names first, then names only the other frame has.
  names = @vectors.to_a | other_df.vectors.to_a

  columns = names.map do |name|
    own_part = get_vector_anyways(name).dup
    own_part.concat(other_df.get_vector_anyways(name))
  end

  DaruLite::DataFrame.new(columns, order: names)
end
|
|
1491
|
+
|
|
1492
|
+
# Concatenates another DataFrame as #concat.
|
|
1493
|
+
# Additionally it tries to preserve the index. If the indices contain
|
|
1494
|
+
# common elements, #union will overwrite the according rows in the
|
|
1495
|
+
# first dataframe.
|
|
1496
|
+
def union(other_df)
  other_labels = other_df.index.to_a
  # Rows of this frame that the other frame does not overwrite.
  kept_labels = @index.to_a - other_labels

  df = row[*kept_labels].concat(other_df)
  # The rows are laid out as "kept own rows, then the other frame's
  # rows", so the labels must follow that same order. Using
  # (own + other).uniq mislabels rows whenever a shared label is not
  # at the end of this frame's index.
  df.index = DaruLite::Index.new(kept_labels + other_labels)
  df
end
|
|
1504
|
+
|
|
1505
|
+
# Strategy used by #set_index when a single column becomes a plain
# DaruLite::Index.
module SetSingleIndexStrategy
  class << self
    # Number of distinct values in the column.
    def uniq_size(df, col)
      df[col].uniq.size
    end

    # Index built from the column's values.
    def new_index(df, col)
      DaruLite::Index.new(df[col].to_a)
    end

    # Removes the column from the frame.
    def delete_vector(df, col)
      df.delete_vector(col)
    end
  end
end
|
|
1518
|
+
|
|
1519
|
+
# Strategy used by #set_index when a column becomes a
# DaruLite::CategoricalIndex. It defines no uniq_size.
module SetCategoricalIndexStrategy
  class << self
    # Categorical index built from the column's values.
    def new_index(df, col)
      DaruLite::CategoricalIndex.new(df[col].to_a)
    end

    # Removes the column from the frame.
    def delete_vector(df, col)
      df.delete_vector(col)
    end
  end
end
|
|
1528
|
+
|
|
1529
|
+
# Strategy used by #set_index when several columns together become a
# DaruLite::MultiIndex.
module SetMultiIndexStrategy
  class << self
    # Number of distinct entries in the sub-frame made of the columns.
    def uniq_size(df, cols)
      df[*cols].uniq.size
    end

    # MultiIndex built from the columns' values, named after the columns.
    def new_index(df, cols)
      arrays = df[*cols].map_vectors(&:to_a)
      multi_index = DaruLite::MultiIndex.from_arrays(arrays)
      multi_index.name = cols
      multi_index
    end

    # Removes all the columns from the frame.
    def delete_vector(df, cols)
      df.delete_vectors(*cols)
    end
  end
end
|
|
1544
|
+
|
|
1545
|
+
# Set a particular column (or list of columns) as the new index of the DataFrame.
|
|
1546
|
+
def set_index(new_index_col, keep: false, categorical: false)
  # Pick how the index is built: categorical wins, then anything list-like
  # (responds to to_a) becomes a MultiIndex, otherwise a single-column Index.
  strategy =
    if categorical
      SetCategoricalIndexStrategy
    elsif new_index_col.respond_to?(:to_a)
      new_index_col = new_index_col.to_a
      SetMultiIndexStrategy
    else
      SetSingleIndexStrategy
    end

  # Categorical indexes may repeat labels; every other kind must be unique.
  if !categorical && @size != strategy.uniq_size(self, new_index_col)
    raise ArgumentError, 'All elements in new index must be unique.'
  end

  self.index = strategy.new_index(self, new_index_col)
  # The source column(s) are removed unless the caller asked to keep them.
  strategy.delete_vector(self, new_index_col) unless keep
  self
end
|
|
1565
|
+
|
|
1566
|
+
# Change the index of the DataFrame and preserve the labels of the previous
|
|
1567
|
+
# indexing. New index can be DaruLite::Index or any of its subclasses.
|
|
1568
|
+
#
|
|
1569
|
+
# @param [DaruLite::Index] new_index The new Index for reindexing the DataFrame.
|
|
1570
|
+
# @example Reindexing DataFrame
|
|
1571
|
+
# df = DaruLite::DataFrame.new({a: [1,2,3,4], b: [11,22,33,44]},
|
|
1572
|
+
# index: ['a','b','c','d'])
|
|
1573
|
+
# #=>
|
|
1574
|
+
# ##<DaruLite::DataFrame:83278130 @name = b19277b8-c548-41da-ad9a-2ad8c060e273 @size = 4>
|
|
1575
|
+
# # a b
|
|
1576
|
+
# # a 1 11
|
|
1577
|
+
# # b 2 22
|
|
1578
|
+
# # c 3 33
|
|
1579
|
+
# # d 4 44
|
|
1580
|
+
# df.reindex DaruLite::Index.new(['b', 0, 'a', 'g'])
|
|
1581
|
+
# #=>
|
|
1582
|
+
# ##<DaruLite::DataFrame:83177070 @name = b19277b8-c548-41da-ad9a-2ad8c060e273 @size = 4>
|
|
1583
|
+
# # a b
|
|
1584
|
+
# # b 2 22
|
|
1585
|
+
# # 0 nil nil
|
|
1586
|
+
# # a 1 11
|
|
1587
|
+
# # g nil nil
|
|
1588
|
+
def reindex(new_index)
  unless new_index.is_a?(DaruLite::Index)
    message = 'Must pass the new index of type Index or its ' \
              "subclasses, not #{new_index.class}"
    raise ArgumentError, message
  end

  # Start from an empty frame that already carries the new index, then copy
  # each row across; labels unknown to the current index become all-nil rows.
  blank = DaruLite::DataFrame.new({}, order: @vectors, index: new_index, name: @name)
  new_index.each_with_object(blank) do |label, target|
    target.row[label] =
      if @index.include?(label)
        row[label]
      else
        Array.new(ncols)
      end
  end
end
|
|
1599
|
+
|
|
1600
|
+
# Moves the current index into ordinary vectors (one per index name, placed
# before the existing vectors) and replaces the index with the index of
# +index.to_df+. Modifies and returns self.
def reset_index
  index_frame = index.to_df
  level_names = index.name
  level_names = [level_names] unless level_names.instance_of?(Array)

  # Capture the desired column order before the new vectors are inserted.
  final_order = level_names + vectors.to_a

  self.index = index_frame.index
  level_names.each { |level| self[level] = index_frame[level] }
  self.order = final_order
  self
end
|
|
1612
|
+
|
|
1613
|
+
# Reassign index with a new index of type DaruLite::Index or any of its subclasses.
|
|
1614
|
+
#
|
|
1615
|
+
# @param [DaruLite::Index] idx New index object on which the rows of the dataframe
|
|
1616
|
+
# are to be indexed.
|
|
1617
|
+
# @example Reassigning index of a DataFrame
|
|
1618
|
+
# df = DaruLite::DataFrame.new({a: [1,2,3,4], b: [11,22,33,44]})
|
|
1619
|
+
# df.index.to_a #=> [0,1,2,3]
|
|
1620
|
+
#
|
|
1621
|
+
# df.index = DaruLite::Index.new(['a','b','c','d'])
|
|
1622
|
+
# df.index.to_a #=> ['a','b','c','d']
|
|
1623
|
+
# df.row['a'].to_a #=> [1,11]
|
|
1624
|
+
def index=(idx)
  @index = Index.coerce(idx)

  # Every stored vector is pointed at the same index object.
  @data.each do |column|
    column.index = @index
  end

  self
end
|
|
1630
|
+
|
|
1631
|
+
# Reassign vectors with a new index of type DaruLite::Index or any of its subclasses.
|
|
1632
|
+
#
|
|
1633
|
+
# @param new_index [DaruLite::Index] idx The new index object on which the vectors are to
|
|
1634
|
+
# be indexed. Must be of the same size as ncols.
|
|
1635
|
+
# @example Reassigning vectors of a DataFrame
|
|
1636
|
+
# df = DaruLite::DataFrame.new({a: [1,2,3,4], b: [:a,:b,:c,:d], c: [11,22,33,44]})
|
|
1637
|
+
# df.vectors.to_a #=> [:a, :b, :c]
|
|
1638
|
+
#
|
|
1639
|
+
# df.vectors = DaruLite::Index.new([:foo, :bar, :baz])
|
|
1640
|
+
# df.vectors.to_a #=> [:foo, :bar, :baz]
|
|
1641
|
+
def vectors=(new_index)
  unless new_index.is_a?(DaruLite::Index)
    raise ArgumentError, 'Can only reindex with Index and its subclasses'
  end

  if new_index.size != ncols
    # The trailing space after "to" matters: the two literals are joined
    # verbatim, and without it the message read "...equal todataframe...".
    raise ArgumentError, "Specified index length #{new_index.size} not equal to " \
                         "dataframe size #{ncols}"
  end

  @vectors = new_index

  # Keep each stored vector's own name in sync with its new column label.
  @data.zip(new_index.to_a).each do |vect, name|
    vect.name = name
  end
  self
end
|
|
1655
|
+
|
|
1656
|
+
# Renames the vectors
|
|
1657
|
+
#
|
|
1658
|
+
# == Arguments
|
|
1659
|
+
#
|
|
1660
|
+
# * name_map - A hash where the keys are the existing vector names and
|
|
1661
|
+
# the values are the new names. If a vector is renamed
|
|
1662
|
+
# to a vector name that is already in use, the existing
|
|
1663
|
+
# one is overwritten.
|
|
1664
|
+
#
|
|
1665
|
+
# == Usage
|
|
1666
|
+
#
|
|
1667
|
+
# df = DaruLite::DataFrame.new({ a: [1,2,3,4], b: [:a,:b,:c,:d], c: [11,22,33,44] })
|
|
1668
|
+
# df.rename_vectors :a => :alpha, :c => :gamma
|
|
1669
|
+
# df.vectors.to_a #=> [:alpha, :b, :gamma]
|
|
1670
|
+
def rename_vectors(name_map)
  # Identity mappings (a => a) must not cause the vector to be deleted.
  effective_renames = name_map.reject { |old_name, new_name| old_name == new_name }

  # A rename onto a name that is already in use overwrites that vector, so
  # the colliding vectors are removed first.
  clobbered = effective_renames.values & vectors.to_a
  delete_vectors(*clobbered)

  renamed = vectors.to_a.map { |current| name_map[current] || current }
  self.vectors = DaruLite::Index.new(renamed)
end
|
|
1677
|
+
|
|
1678
|
+
# Renames the vectors and returns itself
|
|
1679
|
+
#
|
|
1680
|
+
# == Arguments
|
|
1681
|
+
#
|
|
1682
|
+
# * name_map - A hash where the keys are the existing vector names and
|
|
1683
|
+
# the values are the new names. If a vector is renamed
|
|
1684
|
+
# to a vector name that is already in use, the existing
|
|
1685
|
+
# one is overwritten.
|
|
1686
|
+
#
|
|
1687
|
+
# == Usage
|
|
1688
|
+
#
|
|
1689
|
+
# df = DaruLite::DataFrame.new({ a: [1,2,3,4], b: [:a,:b,:c,:d], c: [11,22,33,44] })
|
|
1690
|
+
# df.rename_vectors! :a => :alpha, :c => :gamma # df
|
|
1691
|
+
def rename_vectors!(name_map)
  # Same renaming as #rename_vectors; only the return value (self) differs.
  tap { rename_vectors(name_map) }
end
|
|
1695
|
+
|
|
1696
|
+
# Converts the vectors to a DaruLite::MultiIndex.
|
|
1697
|
+
# The argument passed is used as the MultiIndex's top level
|
|
1698
|
+
def add_level_to_vectors(top_level_label)
  # Prefix every existing label with the new top level; the splat flattens
  # labels that are already tuples.
  nested_labels = vectors.map do |existing|
    [top_level_label, *existing]
  end

  self.vectors = DaruLite::MultiIndex.from_tuples(nested_labels)
end
|
|
1702
|
+
|
|
1703
|
+
# Return the indexes of all the numeric vectors. Will include vectors with nils
|
|
1704
|
+
# along with numbers.
|
|
1705
|
+
def numeric_vectors
  # FIXME: Why _with_index ?..
  # Each yielded pair is (vector, label); the label of every numeric vector
  # is collected.
  each_vector_with_index.each_with_object([]) do |pair, labels|
    vec = pair.first
    labels << pair.last if vec.numeric?
  end
end
|
|
1711
|
+
|
|
1712
|
+
# Returns the names of the vectors for which +numeric?+ is true.
def numeric_vector_names
  @vectors.select do |vector_name|
    self[vector_name].numeric?
  end
end
|
|
1715
|
+
|
|
1716
|
+
# Return a DataFrame of only the numerical Vectors. If clone: false
|
|
1717
|
+
# is specified as option, only a *view* of the Vectors will be
|
|
1718
|
+
# returned. Defaults to clone: true.
|
|
1719
|
+
def only_numerics(opts = {})
  # Anything other than an explicit `clone: false` means "clone".
  cln = opts[:clone] != false

  # Compute the numeric vector names once; #numeric_vectors inspects every
  # vector, so calling it twice doubled that work for no benefit.
  names = numeric_vectors
  arry = names.map { |v| self[v] }

  order = Index.new(names)
  DaruLite::DataFrame.new(arry, clone: cln, order: order, index: @index)
end
|
|
1726
|
+
|
|
1727
|
+
# Generate a summary of this DataFrame based on individual vectors in the DataFrame
|
|
1728
|
+
# @return [String] String containing the summary of the DataFrame
|
|
1729
|
+
def summary
  # Header first, then one "Element" section per vector, each followed by
  # that vector's own summary (called with argument 1).
  @vectors.each_with_object("= #{name}\n Number of rows: #{nrows}") do |v, text|
    text << "\n Element:[#{v}]\n"
    text << self[v].summary(1)
  end
end
|
|
1738
|
+
|
|
1739
|
+
# Sorts a dataframe (ascending/descending) in the given priority sequence of
|
|
1740
|
+
# vectors, with or without a block.
|
|
1741
|
+
#
|
|
1742
|
+
# @param vector_order [Array] The order of vector names in which the DataFrame
|
|
1743
|
+
# should be sorted.
|
|
1744
|
+
# @param opts [Hash] opts The options to sort with.
|
|
1745
|
+
# @option opts [TrueClass,FalseClass,Array] :ascending (true) Sort in ascending
|
|
1746
|
+
# or descending order. Specify Array corresponding to *order* for multiple
|
|
1747
|
+
# sort orders.
|
|
1748
|
+
# @option opts [Hash] :by (lambda{|a| a }) Specify attributes of objects to
|
|
1749
|
+
# be used for sorting, for each vector name in *order* as a hash of
|
|
1750
|
+
# vector name and lambda expressions. In case a lambda for a vector is not
|
|
1751
|
+
# specified, the default will be used.
|
|
1752
|
+
# @option opts [TrueClass,FalseClass,Array] :handle_nils (false) Handle nils
|
|
1753
|
+
# automatically or not when a block is provided.
|
|
1754
|
+
# If set to True, nils will appear at top after sorting.
|
|
1755
|
+
#
|
|
1756
|
+
# @example Sort a dataframe with a vector sequence.
|
|
1757
|
+
#
|
|
1758
|
+
#
|
|
1759
|
+
# df = DaruLite::DataFrame.new({a: [1,2,1,2,3], b: [5,4,3,2,1]})
|
|
1760
|
+
#
|
|
1761
|
+
# df.sort [:a, :b]
|
|
1762
|
+
# # =>
|
|
1763
|
+
# # <DaruLite::DataFrame:30604000 @name = d6a9294e-2c09-418f-b646-aa9244653444 @size = 5>
|
|
1764
|
+
# # a b
|
|
1765
|
+
# # 2 1 3
|
|
1766
|
+
# # 0 1 5
|
|
1767
|
+
# # 3 2 2
|
|
1768
|
+
# # 1 2 4
|
|
1769
|
+
# # 4 3 1
|
|
1770
|
+
#
|
|
1771
|
+
# @example Sort a dataframe without a block. Here nils will be handled automatically.
|
|
1772
|
+
#
|
|
1773
|
+
# df = DaruLite::DataFrame.new({a: [-3,nil,-1,nil,5], b: [4,3,2,1,4]})
|
|
1774
|
+
#
|
|
1775
|
+
# df.sort([:a])
|
|
1776
|
+
# # =>
|
|
1777
|
+
# # <DaruLite::DataFrame:14810920 @name = c07fb5c7-2201-458d-b679-6a1f7ebfe49f @size = 5>
|
|
1778
|
+
# # a b
|
|
1779
|
+
# # 1 nil 3
|
|
1780
|
+
# # 3 nil 1
|
|
1781
|
+
# # 0 -3 4
|
|
1782
|
+
# # 2 -1 2
|
|
1783
|
+
# # 4 5 4
|
|
1784
|
+
#
|
|
1785
|
+
# @example Sort a dataframe with a block with nils handled automatically.
|
|
1786
|
+
#
|
|
1787
|
+
# df = DaruLite::DataFrame.new({a: [nil,-1,1,nil,-1,1], b: ['aaa','aa',nil,'baaa','x',nil] })
|
|
1788
|
+
#
|
|
1789
|
+
# df.sort [:b], by: {b: lambda { |a| a.length } }
|
|
1790
|
+
# # NoMethodError: undefined method `length' for nil:NilClass
|
|
1791
|
+
# # from (pry):8:in `block in __pry__'
|
|
1792
|
+
#
|
|
1793
|
+
# df.sort [:b], by: {b: lambda { |a| a.length } }, handle_nils: true
|
|
1794
|
+
#
|
|
1795
|
+
# # =>
|
|
1796
|
+
# # <DaruLite::DataFrame:28469540 @name = 5f986508-556f-468b-be0c-88cc3534445c @size = 6>
|
|
1797
|
+
# # a b
|
|
1798
|
+
# # 2 1 nil
|
|
1799
|
+
# # 5 1 nil
|
|
1800
|
+
# # 4 -1 x
|
|
1801
|
+
# # 1 -1 aa
|
|
1802
|
+
# # 0 nil aaa
|
|
1803
|
+
# # 3 nil baaa
|
|
1804
|
+
#
|
|
1805
|
+
# @example Sort a dataframe with a block with nils handled manually.
|
|
1806
|
+
#
|
|
1807
|
+
# df = DaruLite::DataFrame.new({a: [nil,-1,1,nil,-1,1], b: ['aaa','aa',nil,'baaa','x',nil] })
|
|
1808
|
+
#
|
|
1809
|
+
# # To print nils at the bottom one can use lambda { |a| (a.nil?)?[1]:[0,a.length] }
|
|
1810
|
+
# df.sort [:b], by: {b: lambda { |a| (a.nil?)?[1]:[0,a.length] } }, handle_nils: true
|
|
1811
|
+
#
|
|
1812
|
+
# # =>
|
|
1813
|
+
# #<DaruLite::DataFrame:22214180 @name = cd7703c7-1dca-4560-840b-5ea51a852ef9 @size = 6>
|
|
1814
|
+
# # a b
|
|
1815
|
+
# # 4 -1 x
|
|
1816
|
+
# # 1 -1 aa
|
|
1817
|
+
# # 0 nil aaa
|
|
1818
|
+
# # 3 nil baaa
|
|
1819
|
+
# # 2 1 nil
|
|
1820
|
+
# # 5 1 nil
|
|
1821
|
+
|
|
1822
|
+
def sort!(vector_order, opts = {})
  raise ArgumentError, 'Required atleast one vector name' if vector_order.empty?

  # Categorical vectors are temporarily mapped to integers (order
  # preserving) so that they can be compared while sorting.
  originals = convert_categorical_vectors(vector_order)
  comparator = sort_prepare_block(vector_order, opts)

  # Sort row positions rather than rows, then derive the reordered index.
  permutation = @index.size.times.sort(&comparator)
  sorted_index = @index.reorder(permutation)

  # Undo the integer mapping before the data itself is reordered.
  restore_categorical_vectors(originals)

  @data.each { |vector| vector.reorder!(permutation) }
  self.index = sorted_index

  self
end
|
|
1844
|
+
|
|
1845
|
+
# Non-destructive version of #sort!
|
|
1846
|
+
def sort(vector_order, opts = {})
  # Sort a copy so the receiver is left untouched.
  copy = dup
  copy.sort!(vector_order, opts)
end
|
|
1849
|
+
|
|
1850
|
+
# Pivots a data frame on specified vectors and applies an aggregate function
|
|
1851
|
+
# to quickly generate a summary.
|
|
1852
|
+
#
|
|
1853
|
+
# == Options
|
|
1854
|
+
#
|
|
1855
|
+
# +:index+ - Keys to group by on the pivot table row index. Pass vector names
|
|
1856
|
+
# contained in an Array.
|
|
1857
|
+
#
|
|
1858
|
+
# +:vectors+ - Keys to group by on the pivot table column index. Pass vector
|
|
1859
|
+
# names contained in an Array.
|
|
1860
|
+
#
|
|
1861
|
+
# +:agg+ - Function to aggregate the grouped values. Default to *:mean*. Can
|
|
1862
|
+
# use any of the statistics functions applicable on Vectors that can be found in
|
|
1863
|
+
# the DaruLite::Statistics::Vector module.
|
|
1864
|
+
#
|
|
1865
|
+
# +:values+ - Columns to aggregate. Will consider all numeric columns not
|
|
1866
|
+
# specified in *:index* or *:vectors*. Optional.
|
|
1867
|
+
#
|
|
1868
|
+
# == Usage
|
|
1869
|
+
#
|
|
1870
|
+
# df = DaruLite::DataFrame.new({
|
|
1871
|
+
# a: ['foo' , 'foo', 'foo', 'foo', 'foo', 'bar', 'bar', 'bar', 'bar'],
|
|
1872
|
+
# b: ['one' , 'one', 'one', 'two', 'two', 'one', 'one', 'two', 'two'],
|
|
1873
|
+
# c: ['small','large','large','small','small','large','small','large','small'],
|
|
1874
|
+
# d: [1,2,2,3,3,4,5,6,7],
|
|
1875
|
+
# e: [2,4,4,6,6,8,10,12,14]
|
|
1876
|
+
# })
|
|
1877
|
+
# df.pivot_table(index: [:a], vectors: [:b], agg: :sum, values: :e)
|
|
1878
|
+
#
|
|
1879
|
+
# #=>
|
|
1880
|
+
# # #<DaruLite::DataFrame:88342020 @name = 08cdaf4e-b154-4186-9084-e76dd191b2c9 @size = 2>
|
|
1881
|
+
# # [:e, :one] [:e, :two]
|
|
1882
|
+
# # [:bar] 18 26
|
|
1883
|
+
# # [:foo] 10 12
|
|
1884
|
+
def pivot_table(opts = {})
  group_keys = opts[:index]
  raise ArgumentError, 'Specify grouping index' if Array(group_keys).empty?

  column_keys = opts[:vectors] || []
  aggregator = opts[:agg] || :mean
  values = prepare_pivot_values(group_keys, column_keys, opts)
  raise IndexError, 'No numeric vectors to aggregate' if values.empty?

  grouped = group_by(group_keys)
  if column_keys.empty?
    # No column grouping requested: the grouped aggregate is the result.
    grouped.send(aggregator)
  else
    pivot_dataframe(make_pivot_hash(grouped, column_keys, values, aggregator))
  end
end
|
|
1900
|
+
|
|
1901
|
+
# Merge vectors from two DataFrames. In case of name collision,
|
|
1902
|
+
# the vectors names are changed to x_1, x_2 ....
|
|
1903
|
+
#
|
|
1904
|
+
# @return [DaruLite::DataFrame]
|
|
1905
|
+
def merge(other_df)
  if nrows != other_df.nrows
    raise ArgumentError,
          "Number of rows must be equal in this: #{nrows} and other: #{other_df.nrows}"
  end

  # Colliding names are disambiguated by ArrayHelper.recode_repeated.
  combined_names = ArrayHelper.recode_repeated(@vectors.to_a + other_df.vectors.to_a)
  merged = DataFrame.new({}, order: combined_names)

  # Rows are paired by position, not by index label.
  nrows.times do |i|
    merged.add_row(row[i].to_a + other_df.row[i].to_a)
  end

  # The index is only carried over when both frames agree on it.
  merged.index = @index if @index == other_df.index
  merged.update
  merged
end
|
|
1921
|
+
|
|
1922
|
+
# Join 2 DataFrames with SQL style joins. Currently supports inner, left
|
|
1923
|
+
# outer, right outer and full outer joins.
|
|
1924
|
+
#
|
|
1925
|
+
# @param [DaruLite::DataFrame] other_df Another DataFrame on which the join is
|
|
1926
|
+
# to be performed.
|
|
1927
|
+
# @param [Hash] opts Options Hash
|
|
1928
|
+
# @option :how [Symbol] Can be one of :inner, :left, :right or :outer.
|
|
1929
|
+
# @option :on [Array] The columns on which the join is to be performed.
|
|
1930
|
+
# Column names specified here must be common to both DataFrames.
|
|
1931
|
+
# @option :indicator [Symbol] The name of a vector to add to the resultant
|
|
1932
|
+
# dataframe that indicates whether the record was in the left (:left_only),
|
|
1933
|
+
# right (:right_only), or both (:both) joining dataframes.
|
|
1934
|
+
# @return [DaruLite::DataFrame]
|
|
1935
|
+
# @example Inner Join
|
|
1936
|
+
# left = DaruLite::DataFrame.new({
|
|
1937
|
+
# :id => [1,2,3,4],
|
|
1938
|
+
# :name => ['Pirate', 'Monkey', 'Ninja', 'Spaghetti']
|
|
1939
|
+
# })
|
|
1940
|
+
# right = DaruLite::DataFrame.new({
|
|
1941
|
+
# :id => [1,2,3,4],
|
|
1942
|
+
# :name => ['Rutabaga', 'Pirate', 'Darth Vader', 'Ninja']
|
|
1943
|
+
# })
|
|
1944
|
+
# left.join(right, how: :inner, on: [:name])
|
|
1945
|
+
# #=>
|
|
1946
|
+
# ##<DaruLite::DataFrame:82416700 @name = 74c0811b-76c6-4c42-ac93-e6458e82afb0 @size = 2>
|
|
1947
|
+
# # id_1 name id_2
|
|
1948
|
+
# # 0 1 Pirate 2
|
|
1949
|
+
# # 1 3 Ninja 4
|
|
1950
|
+
def join(other_df, opts = {})
  # All join logic lives in DaruLite::Core::Merge; this frame is the left side.
  merger = DaruLite::Core::Merge
  merger.join(self, other_df, opts)
end
|
|
1953
|
+
|
|
1954
|
+
# Creates a new dataset for one to many relations
|
|
1955
|
+
# on a dataset, based on pattern of field names.
|
|
1956
|
+
#
|
|
1957
|
+
# for example, you have a survey for number of children
|
|
1958
|
+
# with this structure:
|
|
1959
|
+
# id, name, child_name_1, child_age_1, child_name_2, child_age_2
|
|
1960
|
+
# with
|
|
1961
|
+
# ds.one_to_many([:id], "child_%v_%n")
|
|
1962
|
+
# the field of first parameters will be copied verbatim
|
|
1963
|
+
# to new dataset, and fields which responds to second
|
|
1964
|
+
# pattern will be added one case for each different %n.
|
|
1965
|
+
#
|
|
1966
|
+
# @example
|
|
1967
|
+
# cases=[
|
|
1968
|
+
# ['1','george','red',10,'blue',20,nil,nil],
|
|
1969
|
+
# ['2','fred','green',15,'orange',30,'white',20],
|
|
1970
|
+
# ['3','alfred',nil,nil,nil,nil,nil,nil]
|
|
1971
|
+
# ]
|
|
1972
|
+
# ds=DaruLite::DataFrame.rows(cases, order:
|
|
1973
|
+
# [:id, :name,
|
|
1974
|
+
# :car_color1, :car_value1,
|
|
1975
|
+
# :car_color2, :car_value2,
|
|
1976
|
+
# :car_color3, :car_value3])
|
|
1977
|
+
# ds.one_to_many([:id],'car_%v%n').to_matrix
|
|
1978
|
+
# #=> Matrix[
|
|
1979
|
+
# # ["red", "1", 10],
|
|
1980
|
+
# # ["blue", "1", 20],
|
|
1981
|
+
# # ["green", "2", 15],
|
|
1982
|
+
# # ["orange", "2", 30],
|
|
1983
|
+
# # ["white", "2", 20]
|
|
1984
|
+
# # ]
|
|
1985
|
+
def one_to_many(parent_fields, pattern)
  vars, numbers = one_to_many_components(pattern)
  ds = DataFrame.new([], order: [*parent_fields, '_col_id', *vars])

  each_row do |row|
    # Parent fields are copied unchanged into every generated row.
    verbatim = parent_fields.each_with_object({}) do |field, copied|
      copied[field] = row[field]
    end

    numbers.each do |n|
      generated = one_to_many_row(row, n, vars, pattern)
      # A repetition whose values are all nil produces no row.
      next if generated.values.all?(&:nil?)

      ds.add_row(verbatim.merge(generated).merge('_col_id' => n))
    end
  end

  ds.update
  ds
end
|
|
2001
|
+
|
|
2002
|
+
# Splits vector +nm+ with +split_by_separator(sep)+ and adds each resulting
# vector to this DataFrame. The n-th added vector (1-based) is stored under
# the symbol "<nm><join><n>" and is itself renamed to "<nm>:<key>".
def add_vectors_by_split_recode(nm, join = '-', sep = DaruLite::SPLIT_TOKEN)
  pieces = self[nm].split_by_separator(sep)

  pieces.each_with_index do |(key, vec), position|
    vec.rename "#{nm}:#{key}"
    self["#{nm}#{join}#{position + 1}".to_sym] = vec
  end
end
|
|
2010
|
+
|
|
2011
|
+
# Create a SQL CREATE TABLE statement based on a given Dataset
|
|
2012
|
+
#
|
|
2013
|
+
# == Arguments
|
|
2014
|
+
#
|
|
2015
|
+
# * table - String specifying name of the table that will be created in SQL.
|
|
2016
|
+
# * charset - Character set. Default is "UTF8".
|
|
2017
|
+
#
|
|
2018
|
+
# @example
|
|
2019
|
+
#
|
|
2020
|
+
# ds = DaruLite::DataFrame.new({
|
|
2021
|
+
# :id => DaruLite::Vector.new([1,2,3,4,5]),
|
|
2022
|
+
# :name => DaruLite::Vector.new(%w{Alex Peter Susan Mary John})
|
|
2023
|
+
# })
|
|
2024
|
+
# ds.create_sql('names')
|
|
2025
|
+
# #=>"CREATE TABLE names (id INTEGER,\n name VARCHAR (255)) CHARACTER SET=UTF8;"
|
|
2026
|
+
#
|
|
2027
|
+
def create_sql(table, charset = 'UTF8')
  # One "<name> <db_type>" definition per vector, in vector order.
  column_defs = vectors.to_a.map do |field|
    "#{field} #{self[field].db_type}"
  end

  # NOTE(review): table, field names and charset are interpolated verbatim,
  # without quoting or escaping.
  "CREATE TABLE #{table} (#{column_defs.join(",\n ")}) CHARACTER SET=#{charset};"
end
|
|
2036
|
+
|
|
2037
|
+
# Returns the dataframe. This can be convenient when the user does not
|
|
2038
|
+
# know whether the object is a vector or a dataframe.
|
|
2039
|
+
# @return [self] the dataframe
|
|
2040
|
+
def to_df
  # Already a DataFrame: nothing to convert.
  itself
end
|
|
2043
|
+
|
|
2044
|
+
# Convert all vectors of type *:numeric* into a Matrix.
|
|
2045
|
+
def to_matrix
  # Non-numeric vectors are left out; each remaining vector is one column.
  numeric_columns = each_vector.select(&:numeric?)
  Matrix.columns(numeric_columns.map(&:to_a))
end
|
|
2048
|
+
|
|
2049
|
+
# Converts the DataFrame into an array of hashes where key is vector name
|
|
2050
|
+
# and value is the corresponding element. The 0th index of the array contains
|
|
2051
|
+
# the array of hashes while the 1st index contains the indexes of each row
|
|
2052
|
+
# of the dataframe. Each element in the index array corresponds to its row
|
|
2053
|
+
# in the array of hashes, which has the same index.
|
|
2054
|
+
def to_a
  row_hashes = each_row.map(&:to_h)
  index_labels = @index.to_a
  [row_hashes, index_labels]
end
|
|
2057
|
+
|
|
2058
|
+
# Convert to json. If no_index is true (the default) then the index will NOT be included
|
|
2059
|
+
# in the JSON thus created.
|
|
2060
|
+
def to_json(no_index = true)
  # #to_a returns [row_hashes, index_labels]; with no_index only the row
  # hashes are serialised.
  payload = to_a
  payload = payload[0] if no_index
  payload.to_json
end
|
|
2067
|
+
|
|
2068
|
+
# Converts DataFrame to a hash (explicit) with keys as vector names and values as
|
|
2069
|
+
# the corresponding vectors.
|
|
2070
|
+
def to_h
  # @data is positional: the n-th vector name maps to the n-th stored vector.
  @vectors.each_with_index.each_with_object({}) do |(vec_name, idx), hash|
    hash[vec_name] = @data[idx]
  end
end
|
|
2075
|
+
|
|
2076
|
+
# Convert to html for IRuby.
|
|
2077
|
+
def to_html(threshold = DaruLite.max_rows)
  # The locals below are deliberately kept under these names: the template
  # is evaluated with this method's binding, so it can see them.
  table_thead = to_html_thead
  table_tbody = to_html_tbody(threshold)
  path =
    case index
    when MultiIndex
      File.expand_path('iruby/templates/dataframe_mi.html.erb', __dir__)
    else
      File.expand_path('iruby/templates/dataframe.html.erb', __dir__)
    end
  ERB.new(File.read(path).strip).result(binding)
end
|
|
2087
|
+
|
|
2088
|
+
# Renders the table header ERB template (a MultiIndex-specific one when the
# index is a MultiIndex) with this method's binding and returns the result.
def to_html_thead
  table_thead_path =
    case index
    when MultiIndex
      File.expand_path('iruby/templates/dataframe_mi_thead.html.erb', __dir__)
    else
      File.expand_path('iruby/templates/dataframe_thead.html.erb', __dir__)
    end
  ERB.new(File.read(table_thead_path).strip).result(binding)
end
|
|
2097
|
+
|
|
2098
|
+
# Renders the table body ERB template (a MultiIndex-specific one when the
# index is a MultiIndex) with this method's binding and returns the result.
# A nil/false +threshold+ falls back to the number of rows (@size).
def to_html_tbody(threshold = DaruLite.max_rows)
  # `threshold` keeps its name because the template is evaluated with this
  # binding and can read it.
  threshold ||= @size
  table_tbody_path =
    case index
    when MultiIndex
      File.expand_path('iruby/templates/dataframe_mi_tbody.html.erb', __dir__)
    else
      File.expand_path('iruby/templates/dataframe_tbody.html.erb', __dir__)
    end
  ERB.new(File.read(table_tbody_path).strip).result(binding)
end
|
|
2108
|
+
|
|
2109
|
+
# Short one-line description: class, optional name, and rows x columns.
def to_s
  label = @name ? ": #{@name}" : ''
  "#<#{self.class}#{label}(#{nrows}x#{ncols})>"
end
|
|
2112
|
+
|
|
2113
|
+
# Method for updating the metadata (i.e. missing value positions) of the
|
|
2114
|
+
# vectors after assignment/deletion etc. are complete. This is provided so that
|
|
2115
|
+
# time is not wasted in creating the metadata for the vector each time
|
|
2116
|
+
# assignment/deletion of elements is done. Updating data this way is called
|
|
2117
|
+
# lazy loading. To set or unset lazy loading, see the .lazy_update= method.
|
|
2118
|
+
def update
  # Nothing to refresh unless lazy updating is switched on.
  return unless DaruLite.lazy_update

  @data.each(&:update)
end
|
|
2121
|
+
|
|
2122
|
+
# Rename the DataFrame.
|
|
2123
|
+
def rename(new_name)
  # Returns self so calls can be chained.
  tap { @name = new_name }
end
|
|
2127
|
+
|
|
2128
|
+
alias name= rename
|
|
2129
|
+
|
|
2130
|
+
# Write this DataFrame to a CSV file.
|
|
2131
|
+
#
|
|
2132
|
+
# == Arguments
|
|
2133
|
+
#
|
|
2134
|
+
# * filename - Path of CSV file where the DataFrame is to be saved.
|
|
2135
|
+
#
|
|
2136
|
+
# == Options
|
|
2137
|
+
#
|
|
2138
|
+
# * convert_comma - If set to *true*, will convert any commas in any
|
|
2139
|
+
# of the data to full stops ('.').
|
|
2140
|
+
# All the options accepted by CSV.read() can also be passed into this
|
|
2141
|
+
# function.
|
|
2142
|
+
def write_csv(filename, opts = {})
  # The actual writing is implemented in DaruLite::IO.
  DaruLite::IO.dataframe_write_csv(self, filename, opts)
end
|
|
2145
|
+
|
|
2146
|
+
# Write this dataframe to an Excel Spreadsheet
|
|
2147
|
+
#
|
|
2148
|
+
# == Arguments
|
|
2149
|
+
#
|
|
2150
|
+
# * filename - The path of the file where the DataFrame should be written.
|
|
2151
|
+
def write_excel(filename, opts = {})
  # The actual writing is implemented in DaruLite::IO.
  DaruLite::IO.dataframe_write_excel(self, filename, opts)
end
|
|
2154
|
+
|
|
2155
|
+
# Insert each case of the Dataset on the selected table
|
|
2156
|
+
#
|
|
2157
|
+
# == Arguments
|
|
2158
|
+
#
|
|
2159
|
+
# * dbh - DBI database connection object.
|
|
2160
|
+
# * query - Query string.
|
|
2161
|
+
#
|
|
2162
|
+
# == Usage
|
|
2163
|
+
#
|
|
2164
|
+
# ds = DaruLite::DataFrame.new({:id=>DaruLite::Vector.new([1,2,3]), :name=>DaruLite::Vector.new(["a","b","c"])})
|
|
2165
|
+
# dbh = DBI.connect("DBI:Mysql:database:localhost", "user", "password")
|
|
2166
|
+
# ds.write_sql(dbh,"test")
|
|
2167
|
+
def write_sql(dbh, table)
  # The actual inserting is implemented in DaruLite::IO.
  DaruLite::IO.dataframe_write_sql(self, dbh, table)
end
|
|
2170
|
+
|
|
2171
|
+
# Use marshalling to save dataframe to a file.
|
|
2172
|
+
def save(filename)
  # The actual serialisation is implemented in DaruLite::IO.
  DaruLite::IO.save(self, filename)
end
|
|
2175
|
+
|
|
2176
|
+
# Marshal hook: serialises the stored vectors together with the index
# labels, vector order and name (index and order as plain arrays).
def _dump(_depth)
  snapshot = {
    data: @data,
    index: @index.to_a,
    order: @vectors.to_a,
    name: @name
  }
  Marshal.dump(snapshot)
end
|
|
2184
|
+
|
|
2185
|
+
# Marshal hook: rebuilds a DataFrame from the hash written by #_dump.
# NOTE(review): Marshal.load can instantiate arbitrary objects; only feed
# it data from a trusted source.
def self._load(data)
  restored = Marshal.load(data)
  DaruLite::DataFrame.new(
    restored[:data],
    index: restored[:index],
    order: restored[:order],
    name: restored[:name]
  )
end
|
|
2192
|
+
|
|
2193
|
+
# Transpose a DataFrame, transposing elements and row, column indexing.
|
|
2194
|
+
def transpose
  # Columns become rows; vector names and index labels swap roles.
  flipped = each_vector.map(&:to_a).transpose
  DaruLite::DataFrame.new(
    flipped,
    index: @vectors,
    order: @index,
    dtype: @dtype,
    name: @name
  )
end
|
|
2203
|
+
|
|
2204
|
+
# Pretty print in a nice table format for the command line (irb/pry/iruby)
|
|
2205
|
+
def inspect(spacing = DaruLite.spacing, threshold = DaruLite.max_rows)
  name_part = @name ? ": #{@name} " : ''

  # Columns are at least as wide as the longest header.
  widest_header = headers.to_a.map(&:length).max
  spacing = [widest_header, spacing].max

  table = Formatters::Table.format(
    each_row.lazy,
    row_headers: row_headers,
    headers: headers,
    threshold: threshold,
    spacing: spacing
  )

  "#<#{self.class}#{name_part}(#{nrows}x#{ncols})>#{$INPUT_RECORD_SEPARATOR}" + table
end
|
|
2218
|
+
|
|
2219
|
+
# Query a DataFrame by passing a DaruLite::Core::Query::BoolArray object.
|
|
2220
|
+
def where(bool_array)
  # Row filtering is implemented in DaruLite::Core::Query.
  DaruLite::Core::Query.df_where(self, bool_array)
end
|
|
2223
|
+
|
|
2224
|
+
# Two DataFrames are equal when they have the same class, size, index and
# vectors, and every vector compares equal to its counterpart in +other+.
def ==(other)
  return false unless self.class == other.class

  same_shape = @size == other.size &&
               @index == other.index &&
               @vectors == other.vectors

  same_shape && @vectors.to_a.all? { |v| self[v] == other[v] }
end
|
|
2231
|
+
|
|
2232
|
+
# Converts the specified non category type vectors to category type vectors
|
|
2233
|
+
# @param [Array] names of non category type vectors to be converted
|
|
2234
|
+
# @return [DaruLite::DataFrame] data frame in which specified vectors have been
|
|
2235
|
+
# converted to category type
|
|
2236
|
+
# @example
|
|
2237
|
+
# df = DaruLite::DataFrame.new({
|
|
2238
|
+
# a: [1, 2, 3],
|
|
2239
|
+
# b: ['a', 'a', 'b']
|
|
2240
|
+
# })
|
|
2241
|
+
# df.to_category :b
|
|
2242
|
+
# df[:b].type
|
|
2243
|
+
# # => :category
|
|
2244
|
+
def to_category(*names)
  # Each named vector is replaced in place by its category-typed version.
  names.each do |vector_name|
    self[vector_name] = self[vector_name].to_category
  end
  self
end
|
|
2248
|
+
|
|
2249
|
+
# Lets vectors be read and written as if they were methods:
# +df.foo+ returns vector :foo (or 'foo'), and +df.foo = x+ inserts or
# modifies it. Any other name is passed on to +super+.
def method_missing(name, *args, &block)
  setter = /(.+)=/
  if setter.match?(name)
    target = name[setter].delete('=')
    # Prefer an existing string-named vector; otherwise use a symbol.
    target = target.to_sym unless has_vector?(target)
    return insert_or_modify_vector([target], args[0])
  end

  return self[name] if has_vector?(name)

  as_string = name.to_s
  return self[as_string] if has_vector?(as_string)

  super
end
|
|
2262
|
+
|
|
2263
|
+
def respond_to_missing?(name, include_private = false)
|
|
2264
|
+
name.to_s.end_with?('=') || has_vector?(name) || super
|
|
2265
|
+
end
|
|
2266
|
+
|
|
2267
|
+
# Builds a DataFrame from the contrast-coded vectors of +vector_names+
# (each coded with its matching +full+ flag) combined by +recursive_product+.
def interact_code(vector_names, full)
  coded = vector_names.zip(full).map do |vec_name, full_flag|
    self[vec_name].contrast_code(full: full_flag).each.to_a
  end

  products = recursive_product(coded)
  DaruLite::DataFrame.new(products, order: products.map(&:name))
end
|
|
2276
|
+
|
|
2277
|
+
# Split the dataframe into many dataframes based on category vector
|
|
2278
|
+
# @param [object] cat_name name of category vector to split the dataframe
|
|
2279
|
+
# @return [Array] array of dataframes split by category with category vector
|
|
2280
|
+
# used to split not included
|
|
2281
|
+
# @example
|
|
2282
|
+
# df = DaruLite::DataFrame.new({
|
|
2283
|
+
# a: [1, 2, 3],
|
|
2284
|
+
# b: ['a', 'a', 'b']
|
|
2285
|
+
# })
|
|
2286
|
+
# df.to_category :b
|
|
2287
|
+
# df.split_by_category :b
|
|
2288
|
+
# # => [#<DaruLite::DataFrame: a (2x1)>
|
|
2289
|
+
# # a
|
|
2290
|
+
# # 0 1
|
|
2291
|
+
# # 1 2,
|
|
2292
|
+
# # #<DaruLite::DataFrame: b (1x1)>
|
|
2293
|
+
# # a
|
|
2294
|
+
# # 2 3]
|
|
2295
|
+
# Splits the dataframe into one dataframe per category of `cat_name`.
# Each result is renamed to its category and has the category vector removed.
#
# @param cat_name [Object] name of the category vector to split on
# @return [Array] one dataframe per category
# @raise [ArgumentError] when the named vector is not a category vector
def split_by_category(cat_name)
  category_vector = self[cat_name]
  unless category_vector.category?
    raise ArgumentError, "#{cat_name} is not a category vector"
  end

  category_vector.categories.map do |category|
    matching_rows = where(category_vector.eq(category))
    matching_rows.rename(category).delete_vector(cat_name)
  end
end
|
|
2306
|
+
|
|
2307
|
+
# @param indexes [Array] index(s) at which row tuples are retrieved
|
|
2308
|
+
# @return [Array] returns array of row tuples at given index(s)
|
|
2309
|
+
# @example Using DaruLite::Index
|
|
2310
|
+
# df = DaruLite::DataFrame.new({
|
|
2311
|
+
# a: [1, 2, 3],
|
|
2312
|
+
# b: ['a', 'a', 'b']
|
|
2313
|
+
# })
|
|
2314
|
+
#
|
|
2315
|
+
# df.access_row_tuples_by_indexs(1,2)
|
|
2316
|
+
# # => [[2, "a"], [3, "b"]]
|
|
2317
|
+
#
|
|
2318
|
+
# df.index = DaruLite::Index.new([:one,:two,:three])
|
|
2319
|
+
# df.access_row_tuples_by_indexs(:one,:three)
|
|
2320
|
+
# # => [[1, "a"], [3, "b"]]
|
|
2321
|
+
#
|
|
2322
|
+
# @example Using DaruLite::MultiIndex
|
|
2323
|
+
# mi_idx = DaruLite::MultiIndex.from_tuples [
|
|
2324
|
+
# [:a,:one,:bar],
|
|
2325
|
+
# [:a,:one,:baz],
|
|
2326
|
+
# [:b,:two,:bar],
|
|
2327
|
+
# [:a,:two,:baz],
|
|
2328
|
+
# ]
|
|
2329
|
+
# df_mi = DaruLite::DataFrame.new({
|
|
2330
|
+
# a: 1..4,
|
|
2331
|
+
# b: 'a'..'d'
|
|
2332
|
+
# }, index: mi_idx )
|
|
2333
|
+
#
|
|
2334
|
+
# df_mi.access_row_tuples_by_indexs(:b, :two, :bar)
|
|
2335
|
+
# # => [[3, "c"]]
|
|
2336
|
+
# df_mi.access_row_tuples_by_indexs(:a)
|
|
2337
|
+
# # => [[1, "a"], [2, "b"], [4, "d"]]
|
|
2338
|
+
# Returns the rows found at the given index label(s) as arrays of values.
#
# @param indexes [Array] index label(s) of the rows to fetch
# @return [Array] array of row tuples
def access_row_tuples_by_indexs(*indexes)
  if @index.is_a?(DaruLite::MultiIndex)
    return get_sub_dataframe(indexes, by_position: false).map_rows(&:to_a)
  end

  positions = @index.pos(*indexes)
  unless positions.is_a?(Numeric)
    values_by_vector = get_rows_for(indexes, by_position: false)
    return indexes.map { |idx| values_by_vector.map { |values| values[idx] } }
  end

  # A single position: always wrap so the caller gets an array of tuples.
  single_row = get_rows_for([positions])
  single_row.first.is_a?(Array) ? single_row : [single_row]
end
|
|
2351
|
+
|
|
2352
|
+
# Function to use for aggregating the data.
|
|
2353
|
+
#
|
|
2354
|
+
# @param options [Hash] maps each column wanted in the resulting dataframe to the aggregation that computes it
|
|
2355
|
+
#
|
|
2356
|
+
# @return [DaruLite::DataFrame]
|
|
2357
|
+
#
|
|
2358
|
+
# @example
|
|
2359
|
+
# df = DaruLite::DataFrame.new(
|
|
2360
|
+
# {col: [:a, :b, :c, :d, :e], num: [52,12,07,17,01]})
|
|
2361
|
+
# => #<DaruLite::DataFrame(5x2)>
|
|
2362
|
+
# col num
|
|
2363
|
+
# 0 a 52
|
|
2364
|
+
# 1 b 12
|
|
2365
|
+
# 2 c 7
|
|
2366
|
+
# 3 d 17
|
|
2367
|
+
# 4 e 1
|
|
2368
|
+
#
|
|
2369
|
+
# df.aggregate(num_100_times: ->(df) { (df.num*100).first })
|
|
2370
|
+
# => #<DaruLite::DataFrame(5x1)>
|
|
2371
|
+
# num_100_ti
|
|
2372
|
+
# 0 5200
|
|
2373
|
+
# 1 1200
|
|
2374
|
+
# 2 700
|
|
2375
|
+
# 3 1700
|
|
2376
|
+
# 4 100
|
|
2377
|
+
#
|
|
2378
|
+
# When we have duplicate index :
|
|
2379
|
+
#
|
|
2380
|
+
# idx = DaruLite::CategoricalIndex.new [:a, :b, :a, :a, :c]
|
|
2381
|
+
# df = DaruLite::DataFrame.new({num: [52,12,07,17,01]}, index: idx)
|
|
2382
|
+
# => #<DaruLite::DataFrame(5x1)>
|
|
2383
|
+
# num
|
|
2384
|
+
# a 52
|
|
2385
|
+
# b 12
|
|
2386
|
+
# a 7
|
|
2387
|
+
# a 17
|
|
2388
|
+
# c 1
|
|
2389
|
+
#
|
|
2390
|
+
# df.aggregate(num: :mean)
|
|
2391
|
+
# => #<DaruLite::DataFrame(3x1)>
|
|
2392
|
+
# num
|
|
2393
|
+
# a 25.3333333
|
|
2394
|
+
# b 12
|
|
2395
|
+
# c 1
|
|
2396
|
+
#
|
|
2397
|
+
# Note: `GroupBy` class `aggregate` method uses this `aggregate` method
|
|
2398
|
+
# internally.
|
|
2399
|
+
# Aggregates rows that share an index value.
#
# @param options [Hash] result column name => aggregation
# @param multi_index_level [Integer] passed to group_index_for_aggregation
#   when no block is given
# @return [DaruLite::DataFrame] dataframe indexed by the grouped index, with
#   the keys of `options` as vector order
def aggregate(options = {}, multi_index_level = -1)
  positions_tuples, new_index =
    if block_given?
      yield(@index) # NOTE: use of yield is private for now
    else
      group_index_for_aggregation(@index, multi_index_level)
    end

  column_values = aggregate_by_positions_tuples(options, positions_tuples)

  DaruLite::DataFrame.new(column_values, index: new_index, order: options.keys)
end
|
|
2410
|
+
|
|
2411
|
+
# Shortcut for `group_by(*keys).aggregate(map)`.
#
# @param group_by_keys [Array] forwarded to group_by
# @param aggregation_map [Hash] forwarded to the group's aggregate
def group_by_and_aggregate(*group_by_keys, **aggregation_map)
  grouped = group_by(*group_by_keys)
  grouped.aggregate(aggregation_map)
end
|
|
2414
|
+
|
|
2415
|
+
private
|
|
2416
|
+
|
|
2417
|
+
# Index made of the index name (if any) followed by the vector names.
def headers
  labels = Array(index.name)
  labels += @vectors.to_a
  DaruLite::Index.new(labels)
end
|
|
2420
|
+
|
|
2421
|
+
# Row labels: sparse tuples for a MultiIndex, the plain index values otherwise.
def row_headers
  return index.sparse_tuples if index.is_a?(MultiIndex)

  index.to_a
end
|
|
2424
|
+
|
|
2425
|
+
# Replaces every category vector among `names` with a plain vector built
# from its `to_ints` values.
#
# @param names [Array] vector names to inspect
# @return [Array] `[name, original_vector]` pairs for the vectors that were
#   replaced, suitable for restore_categorical_vectors
def convert_categorical_vectors(names)
  names.each_with_object([]) do |vector_name, originals|
    next unless self[vector_name].category?

    originals << [vector_name, self[vector_name]]
    self[vector_name] = DaruLite::Vector.new(self[vector_name].to_ints)
  end
end
|
|
2434
|
+
|
|
2435
|
+
# Re-assigns each saved vector under its name.
#
# @param old [Array] `[name, vector]` pairs
def restore_categorical_vectors(old)
  old.each do |vector_name, original_vector|
    self[vector_name] = original_vector
  end
end
|
|
2438
|
+
|
|
2439
|
+
# Multiplies every combination of one vector from each group, naming each
# product "left_name:right_name".
#
# The argument is no longer mutated (the previous version shifted elements
# off the caller's array), and an empty argument returns [] instead of
# recursing without end.
#
# @param dfs [Array<Array>] groups of vectors; elements must support `*`,
#   `name` and `rename`
# @return [Array] the product vectors (the single group itself when only
#   one group is given)
def recursive_product(dfs)
  return [] if dfs.empty?
  return dfs.first if dfs.size == 1

  left, *remaining = dfs
  right = recursive_product(remaining)

  left.product(right).map do |dv1, dv2|
    (dv1 * dv2).rename "#{dv1.name}:#{dv2.name}"
  end
end
|
|
2449
|
+
|
|
2450
|
+
# Passes `val` through when it is a DaruLite::Vector.
#
# @raise [TypeError] for any other class
def should_be_vector!(val)
  unless val.is_a?(DaruLite::Vector)
    raise TypeError, "Every iteration must return DaruLite::Vector not #{val.class}"
  end

  val
end
|
|
2455
|
+
|
|
2456
|
+
# Calls `<method>_vector` or `<method>_row` depending on `axis`.
#
# @param axis [Symbol] :vector, :column or :row
# @param method [Symbol, String] method name prefix
# @raise [ArgumentError] for any other axis
def dispatch_to_axis(axis, method, *args, &block)
  case axis
  when :vector, :column
    send(:"#{method}_vector", *args, &block)
  when :row
    send(:"#{method}_row", *args, &block)
  else
    raise ArgumentError, "Unknown axis #{axis}"
  end
end
|
|
2465
|
+
|
|
2466
|
+
# Plural counterpart of dispatch_to_axis: calls `<method>_vectors` or
# `<method>_rows` depending on `axis`.
#
# @param axis [Symbol] :vector, :column or :row
# @param method [Symbol, String] method name prefix
# @raise [ArgumentError] for any other axis
def dispatch_to_axis_pl(axis, method, *args, &block)
  case axis
  when :vector, :column
    send(:"#{method}_vectors", *args, &block)
  when :row
    send(:"#{method}_rows", *args, &block)
  else
    raise ArgumentError, "Unknown axis #{axis}"
  end
end
|
|
2475
|
+
|
|
2476
|
+
AXES = %i[row vector].freeze

# Pops a trailing axis marker (:row or :vector) off `names` and returns it;
# when the last element is not an axis, `names` is left alone and `default`
# is returned.
#
# @param names [Array] argument list, modified in place when an axis is found
# @param default [Symbol] axis to use when none is given
def extract_axis(names, default = :vector)
  return default unless AXES.include?(names.last)

  names.pop
end
|
|
2485
|
+
|
|
2486
|
+
# Vector lookup entry point: a leading Range selects a subset of the vector
# index, otherwise the lookup is delegated according to the vector index type.
def access_vector(*names)
  first_name = names.first
  return dup(@vectors.subset(first_name)) if first_name.is_a?(Range)
  return access_vector_multi_index(*names) if @vectors.is_a?(MultiIndex)

  access_vector_single_index(*names)
end
|
|
2495
|
+
|
|
2496
|
+
# Vector lookup when the vector index is a MultiIndex.
# Returns the stored vector when the lookup resolves to an Integer,
# otherwise a dataframe built from every matching vector.
def access_vector_multi_index(*names)
  located = @vectors[names]
  return @data[located] if located.is_a?(Integer)

  matching_vectors = located.map { |tuple| @data[@vectors[tuple]] }

  # A partial tuple was given: drop the levels it already fixed.
  located = located.drop_left_level(names.size) if names.size < @vectors.width

  DaruLite::DataFrame.new(matching_vectors, index: @index, order: located)
end
|
|
2507
|
+
|
|
2508
|
+
# Vector lookup for a non-MultiIndex vector index.
# A single name resolving to a numeric position returns the stored vector;
# otherwise a dataframe of the selected vectors is built.
#
# @raise [IndexError] when a single requested vector does not exist
def access_vector_single_index(*names)
  if names.count < 2
    requested = names.first
    begin
      pos = @vectors.is_a?(DaruLite::DateTimeIndex) ? @vectors[requested] : @vectors.pos(requested)
    rescue IndexError
      raise IndexError, "Specified vector #{requested} does not exist"
    end
    return @data[pos] if pos.is_a?(Numeric)

    # The lookup returned several entries; continue with those as the names.
    names = pos
  end

  selected = names.map { |name| [name, @data[@vectors.pos(name)]] }.to_h
  order = names.is_a?(Array) ? DaruLite::Index.new(names) : names

  DaruLite::DataFrame.new(selected, order: order, index: @index, name: @name)
end
|
|
2525
|
+
|
|
2526
|
+
# Row lookup by index label(s): a Vector for a single position, a DataFrame
# otherwise.
def access_row(*indexes)
  positions = @index.pos(*indexes)

  unless positions.is_a?(Numeric)
    selected_rows = get_rows_for(indexes, by_position: false)
    return DaruLite::DataFrame.new(selected_rows, index: @index.subset(*indexes), order: @vectors)
  end

  single_row = get_rows_for([positions])
  DaruLite::Vector.new(single_row, index: @vectors, name: indexes.first)
end
|
|
2537
|
+
|
|
2538
|
+
# @param keys [Array] can be an array of positions (if by_position is true) or indexes (if by_position is false)
|
|
2539
|
+
# because of coercion by DaruLite::Vector#at and DaruLite::Vector#[], can return either an Array of
|
|
2540
|
+
# values (representing a row) or an array of Vectors (that can be seen as rows)
|
|
2541
|
+
# Collects, for every stored vector, the value(s) at the given keys.
#
# @param keys [Array] positions when by_position is true, index labels
#   otherwise
# @param by_position [Boolean] selects `vector.at` (true) or `vector[]`
# @return [Array] one entry per vector
# @raise [RuntimeError] when keys is not an Array (now with a message
#   instead of a bare "unhandled exception")
def get_rows_for(keys, by_position: true)
  raise "keys must be an Array, got #{keys.class}" unless keys.is_a?(Array)

  return @data.map { |vector| vector.at(*keys) } if by_position

  # TODO: for now (2018-07-27), it is different than using
  #    get_rows_for(@index.pos(*keys))
  #    because DaruLite::Vector#at and DaruLite::Vector#[] don't handle DaruLite::MultiIndex the same way
  @data.map { |vector| vector[*keys] }
end
|
|
2555
|
+
|
|
2556
|
+
# Inserts or replaces a vector.
#
# @param name [Array] vector name wrapped in an array; unwrapped unless the
#   vector index is a MultiIndex
# @param vector [Object] the new content
def insert_or_modify_vector(name, vector)
  name = name[0] unless @vectors.is_a?(MultiIndex)

  return insert_vector_in_empty(name, vector) if @index.empty?

  prepared = prepare_for_insert(name, vector)
  assign_or_add_vector(name, prepared)
end
|
|
2567
|
+
|
|
2568
|
+
# Stores `v` under `name`, replacing an existing vector or appending a new one.
def assign_or_add_vector(name, v)
  # FIXME: fix this jugaad. need to make changes in Indexing itself.
  pos =
    begin
      @vectors[name]
    rescue IndexError
      name
    end

  return assign_multiple_vectors(pos, v) if pos.is_a?(DaruLite::Index)

  if pos == name &&
     (@vectors.include?(name) || (pos.is_a?(Integer) && pos < @data.size))
    @data[pos] = v
  else
    assign_or_add_vector_rough(name, v)
  end
end
|
|
2586
|
+
|
|
2587
|
+
# Stores the same vector `v` at the position of every entry of `pos`.
def assign_multiple_vectors(pos, v)
  pos.each do |vector_name|
    target = @vectors[vector_name]
    @data[target] = v
  end
end
|
|
2592
|
+
|
|
2593
|
+
# Adds `name` to the vector index when it is missing, then stores `v` at
# the position the index reports for it.
def assign_or_add_vector_rough(name, v)
  @vectors = @vectors | [name] unless @vectors.include?(name)

  target = @vectors[name]
  @data[target] = v
end
|
|
2597
|
+
|
|
2598
|
+
# Inserts the first vector into a dataframe whose index is empty: the new
# vector's index becomes the dataframe index and empty vectors are reindexed
# onto it.
def insert_vector_in_empty(name, vector)
  coerced = Vector.coerce(vector.to_a, name: coerce_name(name))

  @index = coerced.index
  assign_or_add_vector(name, coerced)
  set_size

  @data.map! do |existing|
    if existing.empty?
      existing.reindex(@index)
    else
      existing
    end
  end
end
|
|
2607
|
+
|
|
2608
|
+
# Picks the preparation routine for the value being inserted: vectors,
# anything responding to `to_a`, or a single value.
def prepare_for_insert(name, arg)
  return prepare_vector_for_insert(name, arg) if arg.is_a?(DaruLite::Vector)
  return prepare_enum_for_insert(name, arg) if arg.respond_to?(:to_a)

  prepare_value_for_insert(name, arg)
end
|
|
2617
|
+
|
|
2618
|
+
# Aligns `vector` with the dataframe index; index entries missing from the
# vector are assigned nil.
def prepare_vector_for_insert(name, vector)
  # so that index-by-index assignment is avoided when possible.
  return vector.dup if vector.index == @index

  aligned = DaruLite::Vector.new([], name: coerce_name(name), index: @index)
  @index.each do |idx|
    aligned[idx] = vector.index.include?(idx) ? vector[idx] : nil
  end
  aligned
end
|
|
2628
|
+
|
|
2629
|
+
# Wraps an enumerable in a vector on the dataframe index.
#
# @raise [RuntimeError] when its size differs from the dataframe size
def prepare_enum_for_insert(name, enum)
  unless @size == enum.size
    raise "Specified vector of length #{enum.size} cannot be inserted in DataFrame of size #{@size}"
  end

  DaruLite::Vector.new(enum, name: coerce_name(name), index: @index)
end
|
|
2636
|
+
|
|
2637
|
+
# Builds a vector on the dataframe index by repeating `Array(value)`
# `@size` times.
def prepare_value_for_insert(name, value)
  repeated = Array(value) * @size
  DaruLite::Vector.new(repeated, name: coerce_name(name), index: @index)
end
|
|
2640
|
+
|
|
2641
|
+
# Writes one row: element `pos` of the coerced row goes into stored vector
# `pos` at `indexes`. Afterwards the dataframe index is taken from the first
# vector and the size refreshed.
#
# @raise [SizeError] when the row length differs from the number of vectors
def insert_or_modify_row(indexes, vector)
  row_values = coerce_vector(vector)

  if row_values.size != @vectors.size
    raise SizeError, 'Vector length should match row length'
  end

  @data.each_with_index do |column, pos|
    column.send(:set, indexes, row_values.at(pos))
  end
  @index = @data[0].index

  set_size
end
|
|
2654
|
+
|
|
2655
|
+
# Sets the vector index and row index, and fills the data with one empty
# vector per vector name.
def create_empty_vectors(vectors, index)
  @vectors = Index.coerce(vectors)
  @index = Index.coerce(index)

  @data = @vectors.map do |vector_name|
    DaruLite::Vector.new([], name: coerce_name(vector_name), index: @index)
  end
end
|
|
2663
|
+
|
|
2664
|
+
# Checks that the number of vector names matches the number of stored
# vectors and that the index size matches the first vector's size.
#
# @raise [IndexError] on either mismatch
def validate_labels
  if @vectors && @vectors.size != @data.size
    raise IndexError,
          "Expected equal number of vector names (#{@vectors.size}) for number of vectors (#{@data.size})."
  end

  rows_mismatch = @index && @data[0] && @index.size != @data[0].size
  raise IndexError, 'Expected number of indexes same as number of rows' if rows_mismatch
end
|
|
2674
|
+
|
|
2675
|
+
# Checks that every stored vector has `@size` elements.
#
# @raise [IndexError] when any vector has a different size
def validate_vector_sizes
  size_mismatch = @data.any? { |vector| vector.size != @size }
  raise IndexError, 'Expected vectors with equal length' if size_mismatch

  @data
end
|
|
2680
|
+
|
|
2681
|
+
# Runs the label checks, then the vector size checks.
def validate
  validate_labels
  validate_vector_sizes
end
|
|
2685
|
+
|
|
2686
|
+
# Caches the size of the current index in @size.
def set_size
  index_size = @index.size
  @size = index_size
end
|
|
2689
|
+
|
|
2690
|
+
# Resolves `index` to an index label: returned as-is when the index includes
# it, otherwise whatever `@index.key(index)` yields.
#
# @raise [IndexError] when neither lookup succeeds
def named_index_for(index)
  return index if @index.include?(index)

  resolved = @index.key(index)
  raise IndexError, "Specified index #{index} does not exist." unless resolved

  resolved
end
|
|
2699
|
+
|
|
2700
|
+
# Sets @vectors from the requested order and the source keys.
# A ready-made Index/MultiIndex is used as given; otherwise the requested
# names come first, followed by any remaining source keys.
def create_vectors_index_with(vectors, source)
  vectors = source.keys if vectors.nil?

  if vectors.is_a?(Index) || vectors.is_a?(MultiIndex)
    @vectors = vectors
  else
    remaining_keys = source.keys - vectors
    @vectors = DaruLite::Index.new((vectors + remaining_keys).uniq)
  end
end
|
|
2710
|
+
|
|
2711
|
+
# True when every value of `source` has an index equal to the first one's.
def all_vectors_have_equal_indexes?(source)
  vectors = source.values
  reference_index = vectors[0].index

  vectors.all? { |vector| reference_index == vector.index }
end
|
|
2716
|
+
|
|
2717
|
+
# Joins an Array name into a single string; any other name is returned
# unchanged.
def coerce_name(potential_name)
  return potential_name unless potential_name.is_a?(Array)

  potential_name.join
end
|
|
2720
|
+
|
|
2721
|
+
# Initializes from an Array source, dispatching on the class of its first
# element (Array, Vector or Hash).
#
# @raise [ArgumentError] when elements are not all of one class, or the
#   element class is unsupported
def initialize_from_array(source, vectors, index, opts)
  if source.map(&:class).uniq.size != 1
    raise ArgumentError, 'All objects in data source should be same class'
  end

  case source.first
  when Array
    vectors ||= (0...source.size).to_a
    initialize_from_array_of_arrays(source, vectors, index, opts)
  when Vector
    vectors ||= (0...source.size).to_a
    initialize_from_array_of_vectors(source, vectors, index, opts)
  when Hash
    initialize_from_array_of_hashes(source, vectors, index, opts)
  else
    raise ArgumentError, "Can't create DataFrame from #{source}"
  end
end
|
|
2738
|
+
|
|
2739
|
+
# Initializes from an array of arrays, one inner array per vector.
#
# @raise [ArgumentError] when the number of names differs from the number
#   of inner arrays
def initialize_from_array_of_arrays(source, vectors, index, _opts)
  unless source.size == vectors.size
    raise ArgumentError,
          "Number of vectors (#{vectors.size}) should equal order size (#{source.size})"
  end

  @index = Index.coerce(index || source[0].size)
  @vectors = Index.coerce(vectors)

  update_data(source, vectors)
end
|
|
2750
|
+
|
|
2751
|
+
# Initializes from an array of vectors by pairing each name with the vector
# at the same position and re-running initialize with that hash.
def initialize_from_array_of_vectors(source, vectors, index, opts)
  clone = opts[:clone] != false
  named_vectors = vectors.zip(source).to_h

  initialize(named_vectors, index: index, order: vectors, name: @name, clone: clone)
end
|
|
2758
|
+
|
|
2759
|
+
# Initializes from an array of hashes (one hash per row). Vector names are
# the requested names followed by the keys of the first hash; each value is
# looked up by name, falling back to the name's String form.
def initialize_from_array_of_hashes(source, vectors, index, _opts)
  first_row_keys = source[0].keys
  names = vectors.nil? ? first_row_keys : (vectors + first_row_keys).uniq

  @vectors = DaruLite::Index.new(names)
  @index = DaruLite::Index.new(index || source.size)

  @data = @vectors.map do |name|
    column = source.map { |row| row.fetch(name) { row[name.to_s] } }
    DaruLite::Vector.new(column, name: coerce_name(name), index: @index)
  end
end
|
|
2774
|
+
|
|
2775
|
+
# Initializes from a Hash source: sets up the vector index, then loads the
# values as vectors or as arrays depending on what the hash holds.
def initialize_from_hash(source, vectors, index, opts)
  create_vectors_index_with(vectors, source)

  if ArrayHelper.array_of?(source.values, Vector)
    return initialize_from_hash_with_vectors(source, index, opts)
  end

  initialize_from_hash_with_arrays(source, index, opts)
end
|
|
2784
|
+
|
|
2785
|
+
# Loads a hash of vectors into the dataframe.
#
# Cloning is forced when no index is given and the vectors' indexes differ,
# because the vectors then have to be re-aligned onto a merged index.
#
# Without cloning, the vectors are now stored in @vectors order. Previously
# they were appended in hash order, so an explicit `order:` that differed
# from the hash key order left names pointing at the wrong vectors.
#
# @param source [Hash] name => vector
# @param index [Object, nil] explicit index, if any
# @param opts [Hash] only `:clone` is read here
def initialize_from_hash_with_vectors(source, index, opts)
  vectors_have_same_index = all_vectors_have_equal_indexes?(source)

  clone = opts[:clone] != false
  clone = true unless index || vectors_have_same_index

  @index = deduce_index index, source, vectors_have_same_index

  if clone
    @data = clone_vectors source, vectors_have_same_index
  else
    # Names absent from source are skipped, so a names/vectors count
    # mismatch is still left for validation to report.
    @data.concat(@vectors.filter_map { |name| source[name] })
  end
end
|
|
2799
|
+
|
|
2800
|
+
# Chooses the dataframe index: the explicit one when given, a copy of the
# shared index when all vectors agree, otherwise the sorted union of all
# vector indexes.
def deduce_index(index, source, vectors_have_same_index)
  return Index.coerce(index) unless index.nil?
  return source.values[0].index.dup if vectors_have_same_index

  per_vector_labels = source.values.map { |v| v.index.to_a }
  all_indexes = per_vector_labels.flatten.uniq.sort # sort only if missing indexes detected

  DaruLite::Index.new(all_indexes)
end
|
|
2813
|
+
|
|
2814
|
+
# Copies the source vectors in @vectors order. When indexes differ, each
# vector is rebuilt on @index with nil for missing entries.
def clone_vectors(source, vectors_have_same_index)
  # avoids matching indexes of vectors if all the supplied vectors
  # have the same index.
  return @vectors.map { |name| source[name].dup } if vectors_have_same_index

  @vectors.map do |name|
    original = source[name]
    aligned = DaruLite::Vector.new([], name: name, index: @index)
    @index.each do |idx|
      aligned[idx] = original.index.include?(idx) ? original[idx] : nil
    end
    aligned
  end
end
|
|
2829
|
+
|
|
2830
|
+
# Loads a hash of plain arrays: each value is copied into a new vector on
# the shared index, in @vectors order.
def initialize_from_hash_with_arrays(source, index, _opts)
  @index = Index.coerce(index || source.values[0].size)

  @vectors.each do |name|
    values = source[name].dup
    @data << DaruLite::Vector.new(values, name: coerce_name(name), index: @index)
  end
end
|
|
2837
|
+
|
|
2838
|
+
# Builds the comparison key for one row during sorting: one entry per sort
# vector. For descending vectors the value is read from `r2` instead of
# `r1`, which reverses the comparison.
def sort_build_row(vector_locs, by_blocks, ascending, handle_nils, r1, r2) # rubocop:disable Metrics/ParameterLists
  sort_specs = vector_locs.zip(by_blocks, ascending, handle_nils)

  sort_specs.map do |vector_loc, by, asc, handle_nil|
    row_position = asc ? r1 : r2
    value = @data[vector_loc].data[row_position]

    if by
      # A failing `by` block is treated as producing nil.
      value =
        begin
          by.call(value)
        rescue StandardError
          nil
        end
    end

    sort_handle_nils(value, asc, handle_nil || !by)
  end
end
|
|
2856
|
+
|
|
2857
|
+
# Wraps a sort value so nils can be compared: returns `[rank, value]` when
# nil handling is on, the bare value otherwise. nil ranks 0 when ascending
# and 1 when descending.
def sort_handle_nils(value, asc, handle_nil)
  return value unless handle_nil

  nil_rank, present_rank = asc ? [0, 1] : [1, 0]
  [value.nil? ? nil_rank : present_rank, value]
end
|
|
2866
|
+
|
|
2867
|
+
# Normalizes a boolean sort option into an array with one flag per vector.
#
# @param opts [Hash] sort options
# @param symbol [Symbol] option key to read
# @param default [Boolean] flag used when the option is absent
# @param size [Integer] number of sort vectors
# @raise [ArgumentError] for an array of the wrong length or an unsupported
#   option type
def sort_coerce_boolean(opts, symbol, default, size)
  val = opts[symbol]

  return Array.new(size, val) if [true, false].include?(val)
  return Array.new(size, default) if val.nil?

  unless val.is_a?(Array)
    raise ArgumentError, "Can't coerce #{symbol} from #{val.class} to boolean option"
  end
  raise ArgumentError, "Specify same number of vector names and #{symbol}" if size != val.size

  val
end
|
|
2883
|
+
|
|
2884
|
+
# Builds the comparator lambda used for sorting rows by several vectors.
#
# @param vector_order [Array] names of the vectors to sort by
# @param opts [Hash] reads :ascending, :handle_nils and :by
# @return [Proc] lambda taking two row positions and returning -1, 0 or 1
def sort_prepare_block(vector_order, opts)
  vector_count = vector_order.size
  ascending = sort_coerce_boolean(opts, :ascending, true, vector_count)
  handle_nils = sort_coerce_boolean(opts, :handle_nils, false, vector_count)

  by_map = opts[:by] || {}
  by_blocks = vector_order.map { |v| by_map[v] }
  vector_locs = vector_order.map { |v| @vectors[v] }

  lambda do |index1, index2|
    # Build left and right array to compare two rows
    left = sort_build_row(vector_locs, by_blocks, ascending, handle_nils, index1, index2)
    right = sort_build_row(vector_locs, by_blocks, ascending, handle_nils, index2, index1)

    # Resolve conflict by Index if all attributes are same
    (left << index1) <=> (right << index2)
  end
end
|
|
2902
|
+
|
|
2903
|
+
# Formats one verification failure as
# "<row number> [<row id>]: <description> (<field>=<value>, ...)".
#
# @param row [#[]] the failing row
# @param test [Array] `[description, fields, ...]`
# @param id [Object] key of the row's identifier
# @param i [Integer] zero-based row number
def verify_error_message(row, test, id, i)
  description, fields = test

  values =
    if fields.empty?
      ''
    else
      " (#{fields.map { |k| "#{k}=#{row[k]}" }.join(', ')})"
    end

  "#{i + 1} [#{row[id]}]: #{description}#{values}"
end
|
|
2908
|
+
|
|
2909
|
+
# Resolves the vectors to aggregate in a pivot: an explicit `:values` option
# (always returned as an array), or every numeric vector not used as index
# or vectors.
def prepare_pivot_values(index, vectors, opts)
  requested = opts[:values]

  # values not specified at all.
  return (@vectors.to_a - (index | vectors)) & numeric_vector_names if requested.nil?

  # multiple values specified, or a single one to wrap.
  requested.is_a?(Array) ? requested : [requested]
end
|
|
2919
|
+
|
|
2920
|
+
# Builds the nested pivot hash: group name => { [value, *vector values] =>
# aggregate }. Raw values are collected per key first, then aggregated.
def make_pivot_hash(grouped, vectors, values, aggregate_function)
  super_hash = grouped.groups.transform_values { |_| {} }

  values.each do |value|
    grouped.groups.each do |group_name, row_numbers|
      sub_hash = super_hash[group_name]

      row_numbers.each do |num|
        key = [value, *vectors.map { |v| self[v][num] }]
        sub_hash[key] ||= []
        sub_hash[key] << self[value][num]
      end
    end
  end

  setup_pivot_aggregates(super_hash, aggregate_function)
  super_hash
end
|
|
2937
|
+
|
|
2938
|
+
# Replaces every collected array of values in the nested pivot hash with
# the result of calling `aggregate_function` on a vector of those values.
# The hash is modified in place.
def setup_pivot_aggregates(super_hash, aggregate_function)
  super_hash.each_value do |sub_hash|
    sub_hash.transform_values! do |aggregates|
      DaruLite::Vector.new(aggregates).send(aggregate_function)
    end
  end
end
|
|
2945
|
+
|
|
2946
|
+
# Turns the nested pivot hash into a dataframe: outer keys become the row
# MultiIndex, inner keys the vector MultiIndex.
def pivot_dataframe(super_hash)
  df_index = DaruLite::MultiIndex.from_tuples(super_hash.keys)
  df_vectors = DaruLite::MultiIndex.from_tuples(super_hash.values.flat_map(&:keys).uniq)

  pivoted = DaruLite::DataFrame.new({}, index: df_index, order: df_vectors)
  super_hash.each do |row_index, sub_h|
    sub_h.each do |vector_index, val|
      pivoted[vector_index][row_index] = val
    end
  end
  pivoted
end
|
|
2958
|
+
|
|
2959
|
+
# Extracts the variable names and numbers encoded in the vector names.
#
# `%v` in the pattern stands for the variable name and `%n` for the number.
# The pattern is matched against the whole vector name. Unanchored, the lazy
# `\d+?` stopped after one digit, so "a_10" was read as number 1.
# When no vector matches, two empty arrays are returned instead of failing
# on nil.
#
# @param pattern [String] e.g. "%v_%n"
# @return [Array(Array<String>, Array<Integer>)] unique variable names and
#   sorted unique numbers
def one_to_many_components(pattern)
  body = pattern.gsub('%v', '(.+?)').gsub('%n', '(\\d+?)')
  re = Regexp.new("\\A(?:#{body})\\z")

  vars, numbers =
    @vectors
    .map { |v| v.scan(re) }
    .reject(&:empty?).flatten(1).transpose

  [Array(vars).uniq, Array(numbers).map(&:to_i).sort.uniq]
end
|
|
2969
|
+
|
|
2970
|
+
# Builds a hash of variable => value for one number, reading each value
# from the row under the name obtained by filling `%v` and `%n` into the
# pattern.
def one_to_many_row(row, number, vars, pattern)
  vars.each_with_object({}) do |var, fields|
    column_name = pattern.sub('%v', var).sub('%n', number.to_s)
    fields[var] = row[column_name]
  end
end
|
|
2977
|
+
|
|
2978
|
+
# Raises IndexError when one of the positions is not a valid position
|
|
2979
|
+
# Raises IndexError when one of the positions is not a valid position.
#
# Negative positions count from the end, so they are valid down to `-size`;
# anything below that is now rejected as well (previously only the upper
# bound was checked).
#
# @param positions [Array<Integer>] positions to check
# @param size [Integer] number of addressable elements
# @raise [IndexError] for the first out-of-range position
def validate_positions(*positions, size)
  positions.each do |pos|
    raise IndexError, "#{pos} is not a valid position." if pos >= size || pos < -size
  end
end
|
|
2984
|
+
|
|
2985
|
+
# Accepts hash, enumerable and vector and align it properly so it can be added
|
|
2986
|
+
# Accepts a vector, hash or enumerable and turns it into a vector; vectors
# and hashes are additionally reindexed onto the dataframe's vector names.
def coerce_vector(vector)
  return vector.reindex(@vectors) if vector.is_a?(DaruLite::Vector)

  built = DaruLite::Vector.new(vector)
  vector.is_a?(Hash) ? built.reindex(@vectors) : built
end
|
|
2996
|
+
|
|
2997
|
+
# Rebuilds @data: one vector per entry of @vectors, taking values from
# `source` and the name from `vectors` at the same position.
def update_data(source, vectors)
  @data = Array.new(@vectors.size) do |position|
    DaruLite::Vector.new(source[position], index: @index, name: vectors[position])
  end
end
|
|
3002
|
+
|
|
3003
|
+
# Computes the aggregated column values for each group of positions.
# Returns one array per requested column, each holding one value per group.
def aggregate_by_positions_tuples(options, positions_tuples)
  agg_over_vectors_only, options = cast_aggregation_options(options)

  unless agg_over_vectors_only
    aggregations = options.values

    # NOTE: because we aggregate over rows, we don't have to re-get sub-dfs for each method (which is expensive)
    rows = positions_tuples.map do |positions|
      apply_method_on_sub_df(aggregations, keys: positions)
    end
    return rows.transpose
  end

  options.map do |vect_name, method|
    vect = self[vect_name]
    positions_tuples.map { |positions| vect.apply_method_on_sub_vector(method, keys: positions) }
  end
end
|
|
3025
|
+
|
|
3026
|
+
# convert operations over sub-vectors to operations over sub-dfs when it improves perf
|
|
3027
|
+
# note: we don't always "cast" because aggregation over a single vector / a few vector is faster
|
|
3028
|
+
# than aggregation over (sub-)dfs
|
|
3029
|
+
# Decides whether aggregation can run per vector.
#
# When every option key names an existing vector, the options are returned
# untouched with `true`. Otherwise a copy is returned with `false`, in which
# each vector aggregation is wrapped in a lambda that takes a sub-dataframe
# and applies the aggregation to that vector of it.
#
# @return [Array(Boolean, Hash)]
def cast_aggregation_options(options)
  vects, non_vects = options.keys.partition { |k| @vectors.include?(k) }
  return [true, options] unless non_vects.any?

  casted = options.clone
  vects.each do |name|
    proc_on_vect = casted[name].to_proc
    casted[name] = ->(sub_df) { proc_on_vect.call(sub_df[name]) }
  end

  [false, casted]
end
|
|
3047
|
+
|
|
3048
|
+
# Groups row positions by index value for aggregation.
#
# @param index [DaruLite::MultiIndex, DaruLite::Index, DaruLite::CategoricalIndex]
# @param multi_index_level [Integer] passed to the GroupBy helper for a
#   MultiIndex
# @return [Array(Array<Array<Integer>>, Object)] the positions of each group
#   and the new index (one entry per group)
# @raise [RuntimeError] for any other index type (now with a message
#   instead of a bare "unhandled exception")
def group_index_for_aggregation(index, multi_index_level = -1)
  case index
  when DaruLite::MultiIndex
    groups_by_pos = DaruLite::Core::GroupBy.get_positions_group_for_aggregation(index, multi_index_level)

    new_index = DaruLite::MultiIndex.from_tuples(groups_by_pos.keys).coerce_index
    pos_tuples = groups_by_pos.values
  when DaruLite::Index, DaruLite::CategoricalIndex
    new_index = Array(index).uniq
    pos_tuples = new_index.map { |idx| [*index.pos(idx)] }
  else
    raise "Cannot group an index of type #{index.class} for aggregation"
  end

  [pos_tuples, new_index]
end
|
|
3063
|
+
|
|
3064
|
+
# coerce ranges, integers and array in appropriate ways
|
|
3065
|
+
# Coerces ranges, integers and arrays in appropriate ways: several positions
# are returned as given, a single Integer is returned bare, and a single
# Range is expanded against `0...size`.
#
# @raise [ArgumentError] for a single position of any other type
def coerce_positions(*positions, size)
  return positions unless positions.size == 1

  position = positions.first
  case position
  when Integer then position
  when Range then size.times.to_a[position]
  else raise ArgumentError, 'Unknown position type.'
  end
end
|
|
3079
|
+
end
|
|
3080
|
+
end
|