daru_lite 0.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.github/ISSUE_TEMPLATE.md +18 -0
- data/.github/workflows/ci.yml +33 -0
- data/.gitignore +10 -0
- data/.rspec +2 -0
- data/.rubocop.yml +27 -0
- data/.rubocop_todo.yml +137 -0
- data/CONTRIBUTING.md +47 -0
- data/Gemfile +2 -0
- data/History.md +4 -0
- data/LICENSE +24 -0
- data/README.md +218 -0
- data/Rakefile +69 -0
- data/ReleasePolicy.md +20 -0
- data/benchmarks/TradeoffData.csv +65 -0
- data/benchmarks/csv_reading.rb +22 -0
- data/benchmarks/dataframe_creation.rb +39 -0
- data/benchmarks/db_loading.rb +34 -0
- data/benchmarks/duplicating.rb +45 -0
- data/benchmarks/group_by.rb +32 -0
- data/benchmarks/joining.rb +52 -0
- data/benchmarks/row_access.rb +41 -0
- data/benchmarks/row_assign.rb +36 -0
- data/benchmarks/sorting.rb +51 -0
- data/benchmarks/statistics.rb +28 -0
- data/benchmarks/vector_access.rb +31 -0
- data/benchmarks/vector_assign.rb +42 -0
- data/benchmarks/where_clause.rb +48 -0
- data/benchmarks/where_vs_filter.rb +28 -0
- data/daru_lite.gemspec +55 -0
- data/images/README.md +5 -0
- data/images/con0.png +0 -0
- data/images/con1.png +0 -0
- data/images/init0.png +0 -0
- data/images/init1.png +0 -0
- data/images/man0.png +0 -0
- data/images/man1.png +0 -0
- data/images/man2.png +0 -0
- data/images/man3.png +0 -0
- data/images/man4.png +0 -0
- data/images/man5.png +0 -0
- data/images/man6.png +0 -0
- data/lib/daru_lite/accessors/array_wrapper.rb +109 -0
- data/lib/daru_lite/accessors/dataframe_by_row.rb +25 -0
- data/lib/daru_lite/accessors/mdarray_wrapper.rb +7 -0
- data/lib/daru_lite/category.rb +929 -0
- data/lib/daru_lite/configuration.rb +34 -0
- data/lib/daru_lite/core/group_by.rb +403 -0
- data/lib/daru_lite/core/merge.rb +270 -0
- data/lib/daru_lite/core/query.rb +109 -0
- data/lib/daru_lite/dataframe.rb +3080 -0
- data/lib/daru_lite/date_time/index.rb +569 -0
- data/lib/daru_lite/date_time/offsets.rb +397 -0
- data/lib/daru_lite/exceptions.rb +2 -0
- data/lib/daru_lite/extensions/which_dsl.rb +53 -0
- data/lib/daru_lite/formatters/table.rb +52 -0
- data/lib/daru_lite/helpers/array.rb +53 -0
- data/lib/daru_lite/index/categorical_index.rb +201 -0
- data/lib/daru_lite/index/index.rb +374 -0
- data/lib/daru_lite/index/multi_index.rb +374 -0
- data/lib/daru_lite/io/csv/converters.rb +21 -0
- data/lib/daru_lite/io/io.rb +294 -0
- data/lib/daru_lite/io/sql_data_source.rb +97 -0
- data/lib/daru_lite/iruby/helpers.rb +38 -0
- data/lib/daru_lite/iruby/templates/dataframe.html.erb +5 -0
- data/lib/daru_lite/iruby/templates/dataframe_mi.html.erb +5 -0
- data/lib/daru_lite/iruby/templates/dataframe_mi_tbody.html.erb +35 -0
- data/lib/daru_lite/iruby/templates/dataframe_mi_thead.html.erb +21 -0
- data/lib/daru_lite/iruby/templates/dataframe_tbody.html.erb +28 -0
- data/lib/daru_lite/iruby/templates/dataframe_thead.html.erb +21 -0
- data/lib/daru_lite/iruby/templates/multi_index.html.erb +12 -0
- data/lib/daru_lite/iruby/templates/vector.html.erb +5 -0
- data/lib/daru_lite/iruby/templates/vector_mi.html.erb +5 -0
- data/lib/daru_lite/iruby/templates/vector_mi_tbody.html.erb +26 -0
- data/lib/daru_lite/iruby/templates/vector_mi_thead.html.erb +8 -0
- data/lib/daru_lite/iruby/templates/vector_tbody.html.erb +17 -0
- data/lib/daru_lite/iruby/templates/vector_thead.html.erb +8 -0
- data/lib/daru_lite/maths/arithmetic/dataframe.rb +91 -0
- data/lib/daru_lite/maths/arithmetic/vector.rb +117 -0
- data/lib/daru_lite/maths/statistics/dataframe.rb +202 -0
- data/lib/daru_lite/maths/statistics/vector.rb +1019 -0
- data/lib/daru_lite/monkeys.rb +56 -0
- data/lib/daru_lite/vector.rb +1678 -0
- data/lib/daru_lite/version.rb +3 -0
- data/lib/daru_lite.rb +99 -0
- data/profile/_base.rb +23 -0
- data/profile/df_to_a.rb +10 -0
- data/profile/filter.rb +13 -0
- data/profile/joining.rb +13 -0
- data/profile/sorting.rb +12 -0
- data/profile/vector_each_with_index.rb +9 -0
- data/profile/vector_new.rb +9 -0
- data/spec/accessors/array_wrapper_spec.rb +3 -0
- data/spec/category_spec.rb +1741 -0
- data/spec/core/group_by_spec.rb +655 -0
- data/spec/core/merge_spec.rb +179 -0
- data/spec/core/query_spec.rb +347 -0
- data/spec/daru_lite_spec.rb +22 -0
- data/spec/dataframe_spec.rb +4330 -0
- data/spec/date_time/data_spec.rb +197 -0
- data/spec/date_time/date_time_index_helper_spec.rb +72 -0
- data/spec/date_time/index_spec.rb +588 -0
- data/spec/date_time/offsets_spec.rb +465 -0
- data/spec/extensions/which_dsl_spec.rb +38 -0
- data/spec/fixtures/bank2.dat +200 -0
- data/spec/fixtures/boolean_converter_test.csv +5 -0
- data/spec/fixtures/countries.json +7794 -0
- data/spec/fixtures/duplicates.csv +32 -0
- data/spec/fixtures/eciresults.html +394 -0
- data/spec/fixtures/empties.dat +2 -0
- data/spec/fixtures/empty_rows_test.csv +17 -0
- data/spec/fixtures/macau.html +3691 -0
- data/spec/fixtures/macd_data.csv +150 -0
- data/spec/fixtures/matrix_test.csv +100 -0
- data/spec/fixtures/moneycontrol.html +6812 -0
- data/spec/fixtures/music_data.tsv +2501 -0
- data/spec/fixtures/repeated_fields.csv +7 -0
- data/spec/fixtures/sales-funnel.csv +18 -0
- data/spec/fixtures/scientific_notation.csv +4 -0
- data/spec/fixtures/string_converter_test.csv +5 -0
- data/spec/fixtures/strings.dat +2 -0
- data/spec/fixtures/test_xls.xls +0 -0
- data/spec/fixtures/test_xls_2.xls +0 -0
- data/spec/fixtures/url_test.txt~ +0 -0
- data/spec/fixtures/valid_markup.html +62 -0
- data/spec/fixtures/wiki_climate.html +1243 -0
- data/spec/fixtures/wiki_table_info.html +631 -0
- data/spec/formatters/table_formatter_spec.rb +137 -0
- data/spec/helpers_spec.rb +8 -0
- data/spec/index/categorical_index_spec.rb +170 -0
- data/spec/index/index_spec.rb +417 -0
- data/spec/index/multi_index_spec.rb +680 -0
- data/spec/io/io_spec.rb +373 -0
- data/spec/io/sql_data_source_spec.rb +56 -0
- data/spec/iruby/dataframe_spec.rb +170 -0
- data/spec/iruby/helpers_spec.rb +49 -0
- data/spec/iruby/multi_index_spec.rb +37 -0
- data/spec/iruby/vector_spec.rb +105 -0
- data/spec/maths/arithmetic/dataframe_spec.rb +148 -0
- data/spec/maths/arithmetic/vector_spec.rb +165 -0
- data/spec/maths/statistics/dataframe_spec.rb +178 -0
- data/spec/maths/statistics/vector_spec.rb +756 -0
- data/spec/monkeys_spec.rb +42 -0
- data/spec/shared/vector_display_spec.rb +213 -0
- data/spec/spec_helper.rb +87 -0
- data/spec/support/database_helper.rb +30 -0
- data/spec/support/matchers.rb +5 -0
- data/spec/vector_spec.rb +2293 -0
- metadata +571 -0
@@ -0,0 +1,1678 @@
|
|
1
|
+
require 'daru_lite/maths/arithmetic/vector'
|
2
|
+
require 'daru_lite/maths/statistics/vector'
|
3
|
+
require 'daru_lite/accessors/array_wrapper'
|
4
|
+
require 'daru_lite/category'
|
5
|
+
|
6
|
+
module DaruLite
|
7
|
+
class Vector # rubocop:disable Metrics/ClassLength
|
8
|
+
include Enumerable
|
9
|
+
include DaruLite::Maths::Arithmetic::Vector
|
10
|
+
include DaruLite::Maths::Statistics::Vector
|
11
|
+
extend Gem::Deprecate
|
12
|
+
|
13
|
+
class << self
|
14
|
+
# Create a new vector by specifying the size and an optional value
|
15
|
+
# and block to generate values.
|
16
|
+
#
|
17
|
+
# == Description
|
18
|
+
#
|
19
|
+
# The *new_with_size* class method lets you create a DaruLite::Vector
|
20
|
+
# by specifying the size as the argument. The optional block, if
|
21
|
+
# supplied, is run once for populating each element in the Vector.
|
22
|
+
#
|
23
|
+
# The result of each run of the block is the value that is ultimately
|
24
|
+
# assigned to that position in the Vector.
|
25
|
+
#
|
26
|
+
# == Options
|
27
|
+
# :value
|
28
|
+
# All the rest like .new
|
29
|
+
def new_with_size(n, opts = {}, &block)
  # Work on a copy so that removing :value does not mutate the caller's hash.
  opts = opts.dup
  value = opts.delete(:value)
  # Without a block every slot receives the :value option (nil when absent).
  block ||= ->(_) { value }
  DaruLite::Vector.new(Array.new(n, &block), opts)
end
|
34
|
+
|
35
|
+
# Create a vector using (almost) any object
|
36
|
+
# * Array: flattened
|
37
|
+
# * Range: transformed using to_a
|
38
|
+
# * DaruLite::Vector
|
39
|
+
# * Numeric and string values
|
40
|
+
#
|
41
|
+
# == Description
|
42
|
+
#
|
43
|
+
# The `Vector.[]` class method creates a vector from almost any
|
44
|
+
# object that has a `#to_a` method defined on it. It is similar
|
45
|
+
# to R's `c` method.
|
46
|
+
#
|
47
|
+
# == Usage
|
48
|
+
#
|
49
|
+
# a = DaruLite::Vector[1,2,3,4,6..10]
|
50
|
+
# #=>
|
51
|
+
# # <DaruLite::Vector:99448510 @name = nil @size = 9 >
|
52
|
+
# # nil
|
53
|
+
# # 0 1
|
54
|
+
# # 1 2
|
55
|
+
# # 2 3
|
56
|
+
# # 3 4
|
57
|
+
# # 4 6
|
58
|
+
# # 5 7
|
59
|
+
# # 6 8
|
60
|
+
# # 7 9
|
61
|
+
# # 8 10
|
62
|
+
def [](*indexes)
  # Expand every argument that responds to #to_a, then splice all pieces
  # into one flat list of values.
  expanded = indexes.map { |item| item.respond_to?(:to_a) ? item.to_a : item }
  DaruLite::Vector.new(expanded.flatten)
end
|
68
|
+
|
69
|
+
def _load(data) # :nodoc:
  # NOTE(review): Marshal.load can instantiate arbitrary objects; only feed it
  # payloads from a trusted source.
  dumped = Marshal.load(data)
  DaruLite::Vector.new(
    dumped[:data],
    index: dumped[:index],
    name: dumped[:name],
    dtype: dumped[:dtype],
    missing_values: dumped[:missing_values]
  )
end
|
76
|
+
|
77
|
+
# Turn +data+ into a vector: vectors pass through untouched, Arrays and
# Hashes are handed to .new together with +options+, anything else is rejected.
def coerce(data, options = {})
  return data if data.is_a?(DaruLite::Vector)
  return new(data, options) if data.is_a?(Array) || data.is_a?(Hash)

  raise ArgumentError, "Can't coerce #{data.class} to #{self}"
end
|
87
|
+
end
|
88
|
+
|
89
|
+
# Number of elements, as reported by the underlying data store.
def size
  @data.size
end
|
92
|
+
|
93
|
+
# Yield every value of the vector in order; returns self.
# Without a block an Enumerator is returned instead.
def each(&block)
  if block
    @data.each(&block)
    self
  else
    to_enum(:each)
  end
end
|
99
|
+
|
100
|
+
# Yield every index entry of the vector in order; returns self.
# Without a block an Enumerator is returned instead.
def each_index(&block)
  if block
    @index.each(&block)
    self
  else
    to_enum(:each_index)
  end
end
|
106
|
+
|
107
|
+
# Yield [value, index_entry] pairs in order; returns self.
# Without a block an Enumerator is returned instead.
def each_with_index(&block)
  return to_enum(:each_with_index) if block.nil?

  pairs = @data.to_a.zip(@index.to_a)
  pairs.each(&block)

  self
end
|
114
|
+
|
115
|
+
# Replace every value in place with the block's result; returns self.
# Without a block an Enumerator is returned instead.
def map!(&block)
  if block
    @data.map!(&block)
    self
  else
    to_enum(:map!)
  end
end
|
121
|
+
|
122
|
+
# Apply +method+ to this vector, or to the sub-vector selected by +keys+.
#
# @param method [Symbol, Proc] a Symbol is sent to the vector; a Proc is
#   called with the vector as its only argument
# @param keys [Object, nil] when given, passed to get_sub_vector to narrow
#   the vector first
# @param by_position [Boolean] forwarded to get_sub_vector
# @return [Object] whatever the method/proc returns
# @raise [RuntimeError] when +method+ is neither a Symbol nor a Proc
def apply_method(method, keys: nil, by_position: true)
  vect = keys ? get_sub_vector(keys, by_position: by_position) : self

  case method
  when Symbol then vect.send(method)
  when Proc then method.call(vect)
  else
    # Raise explicitly: a bare `raise` carries no message and would re-raise
    # an unrelated exception when called from inside a `rescue` clause.
    raise "Unsupported method #{method.inspect}: expected a Symbol or a Proc"
  end
end
|
131
|
+
alias apply_method_on_sub_vector apply_method
|
132
|
+
|
133
|
+
# The name of the DaruLite::Vector. String.
|
134
|
+
attr_reader :name
|
135
|
+
# The row index. Can be either DaruLite::Index or DaruLite::MultiIndex.
|
136
|
+
attr_reader :index
|
137
|
+
# The underlying dtype of the Vector. Can be :array.
|
138
|
+
attr_reader :dtype
|
139
|
+
attr_reader :nm_dtype
|
140
|
+
# An Array of the positions in the vector that are being treated as 'missing'.
|
141
|
+
attr_reader :missing_positions
|
142
|
+
|
143
|
+
deprecate :missing_positions, :indexes, 2016, 10
|
144
|
+
# Store a hash of labels for values. Supplementary only. Recommend using index
|
145
|
+
# for proper usage.
|
146
|
+
attr_accessor :labels
|
147
|
+
# Store vector data in an array
|
148
|
+
attr_reader :data
|
149
|
+
|
150
|
+
# Create a Vector object.
|
151
|
+
#
|
152
|
+
# == Arguments
|
153
|
+
#
|
154
|
+
# @param source[Array,Hash] - Supply elements in the form of an Array or a
|
155
|
+
# Hash. If Array, a numeric index will be created if not supplied in the
|
156
|
+
# options. Specifying more index elements than actual values in *source*
|
157
|
+
# will insert *nil* into the surplus index elements. When a Hash is specified,
|
158
|
+
# the keys of the Hash are taken as the index elements and the corresponding
|
159
|
+
# values as the values that populate the vector.
|
160
|
+
#
|
161
|
+
# == Options
|
162
|
+
#
|
163
|
+
# * +:name+ - Name of the vector
|
164
|
+
#
|
165
|
+
# * +:index+ - Index of the vector
|
166
|
+
#
|
167
|
+
# * +:dtype+ - The underlying data type. Can be :array.
|
168
|
+
# Default :array.
|
169
|
+
#
|
170
|
+
# * +:missing_values+ - An Array of the values that are to be treated as 'missing'.
|
171
|
+
# nil is the default missing value.
|
172
|
+
#
|
173
|
+
# == Usage
|
174
|
+
#
|
175
|
+
# vecarr = DaruLite::Vector.new [1,2,3,4], index: [:a, :e, :i, :o]
|
176
|
+
# vechsh = DaruLite::Vector.new({a: 1, e: 2, i: 3, o: 4})
|
177
|
+
def initialize(source, opts = {})
  unless opts[:type] == :category
    # Plain (non-category) vector.
    initialize_vector source, opts
    return
  end

  # Category vectors get the Category behaviour mixed into this instance.
  extend DaruLite::Category
  initialize_category source, opts
end
|
187
|
+
|
188
|
+
# Get one or more elements with specified index or a range.
|
189
|
+
#
|
190
|
+
# == Usage
|
191
|
+
# # For vectors employing single layer Index
|
192
|
+
#
|
193
|
+
# v[:one, :two] # => DaruLite::Vector with indexes :one and :two
|
194
|
+
# v[:one] # => Single element
|
195
|
+
# v[:one..:three] # => DaruLite::Vector with indexes :one, :two and :three
|
196
|
+
#
|
197
|
+
# # For vectors employing hierarchical multi index
|
198
|
+
#
|
199
|
+
def [](*input_indexes)
  # Translate the requested labels into positions.
  located = @index.pos(*input_indexes)

  # A single numeric position means exactly one element was asked for.
  return @data[located] if located.is_a?(Numeric)

  # Otherwise build a new vector from the selected positions.
  picked = located.map { |loc| @data[loc] }
  sub_index = @index.subset(*input_indexes)
  DaruLite::Vector.new(picked, name: @name, index: sub_index, dtype: @dtype)
end
|
213
|
+
|
214
|
+
# Returns vector of values given positional values
|
215
|
+
# @param positions [Array<object>] positional values
|
216
|
+
# @return [object] vector
|
217
|
+
# @example
|
218
|
+
# dv = DaruLite::Vector.new 'a'..'e'
|
219
|
+
# dv.at 0, 1, 2
|
220
|
+
# # => #<DaruLite::Vector(3)>
|
221
|
+
# # 0 a
|
222
|
+
# # 1 b
|
223
|
+
# # 2 c
|
224
|
+
def at(*positions)
  # Keep the arguments as given; they are needed later to build the index.
  requested = positions
  positions = coerce_positions(*positions)
  validate_positions(*positions)

  return @data[positions] if positions.is_a?(Integer)

  DaruLite::Vector.new(
    positions.map { |pos| @data[pos] },
    index: @index.at(*requested),
    dtype: dtype
  )
end
|
237
|
+
|
238
|
+
# Change value at given positions
|
239
|
+
# @param positions [Array<object>] positional values
|
240
|
+
# @param [object] val value to assign
|
241
|
+
# @example
|
242
|
+
# dv = DaruLite::Vector.new 'a'..'e'
|
243
|
+
# dv.set_at [0, 1], 'x'
|
244
|
+
# dv
|
245
|
+
# # => #<DaruLite::Vector(5)>
|
246
|
+
# # 0 x
|
247
|
+
# # 1 x
|
248
|
+
# # 2 c
|
249
|
+
# # 3 d
|
250
|
+
# # 4 e
|
251
|
+
def set_at(positions, val)
  validate_positions(*positions)
  # Write the same value into every requested slot.
  positions.each { |pos| @data[pos] = val }
  update_position_cache
end
|
256
|
+
|
257
|
+
# Just like in Hashes, you can specify the index label of the DaruLite::Vector
|
258
|
+
# and assign an element at that place in the DaruLite::Vector.
|
259
|
+
#
|
260
|
+
# == Usage
|
261
|
+
#
|
262
|
+
# v = DaruLite::Vector.new([1,2,3], index: [:a, :b, :c])
|
263
|
+
# v[:a] = 999
|
264
|
+
# #=>
|
265
|
+
# ##<DaruLite::Vector:90257920 @name = nil @size = 3 >
|
266
|
+
# # nil
|
267
|
+
# # a 999
|
268
|
+
# # b 2
|
269
|
+
# # c 3
|
270
|
+
def []=(*indexes, val)
  # Storing nil in a non-:array vector first casts it to :array.
  storing_nil_in_typed = val.nil? && dtype != :array
  cast(dtype: :array) if storing_nil_in_typed

  guard_type_check(val)
  modify_vector(indexes, val)
  update_position_cache
end
|
279
|
+
|
280
|
+
# Two vectors are equal if they have the exact same index values corresponding
|
281
|
+
# with the exact same elements. Name is ignored.
|
282
|
+
def ==(other)
  case other
  when DaruLite::Vector
    return false unless @index == other.index && size == other.size

    # Materialise the other vector's index once; the original rebuilt it
    # via `other.index.to_a` for every element compared.
    other_keys = other.index.to_a
    each_with_index.with_index.all? do |(element, key), position|
      element == other.at(position) && key == other_keys[position]
    end
  else
    # Not a vector: defer to the inherited comparison.
    super
  end
end
|
293
|
+
|
294
|
+
# !@method eq
|
295
|
+
# Uses `==` and returns `true` for each **equal** entry
|
296
|
+
# @param [#==, DaruLite::Vector] If scalar object, compares it with each
|
297
|
+
# element in self. If DaruLite::Vector, compares elements with same indexes.
|
298
|
+
# @example (see #where)
|
299
|
+
# !@method not_eq
|
300
|
+
# Uses `!=` and returns `true` for each **unequal** entry
|
301
|
+
# @param [#!=, DaruLite::Vector] If scalar object, compares it with each
|
302
|
+
# element in self. If DaruLite::Vector, compares elements with same indexes.
|
303
|
+
# @example (see #where)
|
304
|
+
# !@method lt
|
305
|
+
# Uses `<` and returns `true` for each entry **less than** the supplied object
|
306
|
+
# @param [#<, DaruLite::Vector] If scalar object, compares it with each
|
307
|
+
# element in self. If DaruLite::Vector, compares elements with same indexes.
|
308
|
+
# @example (see #where)
|
309
|
+
# !@method lteq
|
310
|
+
# Uses `<=` and returns `true` for each entry **less than or equal to** the supplied object
|
311
|
+
# @param [#<=, DaruLite::Vector] If scalar object, compares it with each
|
312
|
+
# element in self. If DaruLite::Vector, compares elements with same indexes.
|
313
|
+
# @example (see #where)
|
314
|
+
# !@method mt
|
315
|
+
# Uses `>` and returns `true` for each entry **more than** the supplied object
|
316
|
+
# @param [#>, DaruLite::Vector] If scalar object, compares it with each
|
317
|
+
# element in self. If DaruLite::Vector, compares elements with same indexes.
|
318
|
+
# @example (see #where)
|
319
|
+
# !@method mteq
|
320
|
+
# Uses `>=` and returns `true` for each entry **more than or equal to** the supplied object
|
321
|
+
# @param [#>=, DaruLite::Vector] If scalar object, compares it with each
|
322
|
+
# element in self. If DaruLite::Vector, compares elements with same indexes.
|
323
|
+
# @example (see #where)
|
324
|
+
|
325
|
+
# Define the comparator methods with metaprogramming. See documentation
|
326
|
+
# written above for functionality of each method. Use these methods with the
|
327
|
+
# `where` method to obtain the corresponding Vector/DataFrame.
|
328
|
+
{
  eq: :==,
  not_eq: :!=,
  lt: :<,
  lteq: :<=,
  mt: :>,
  mteq: :>=
}.each do |method, operator|
  define_method(method) do |other|
    query = DaruLite::Core::Query
    # Vector operands go through the vector operator; anything else is
    # treated as a scalar and compared against the raw data.
    next query.apply_vector_operator(operator, self, other) if other.is_a?(DaruLite::Vector)

    query.apply_scalar_operator(operator, @data, other)
  end
  # `==` and `!=` keep their own definitions; the other operators alias the named method.
  alias_method operator, method unless operator == :== || operator == :!=
end
|
346
|
+
alias gt mt
|
347
|
+
alias gteq mteq
|
348
|
+
|
349
|
+
# Comparator for checking if any of the elements in *other* exist in self.
|
350
|
+
#
|
351
|
+
# @param [Array, DaruLite::Vector] other A collection which has elements that
|
352
|
+
# need to be checked for in self.
|
353
|
+
# @example Usage of `in`.
|
354
|
+
# vector = DaruLite::Vector.new([1,2,3,4,5])
|
355
|
+
# vector.where(vector.in([3,5]))
|
356
|
+
# #=>
|
357
|
+
# ##<DaruLite::Vector:82215960 @name = nil @size = 2 >
|
358
|
+
# # nil
|
359
|
+
# # 2 3
|
360
|
+
# # 4 5
|
361
|
+
def in(other)
  # Hash keyed by the candidate values, used purely for membership checks.
  lookup = other.zip(Array.new(other.size, 0)).to_h
  membership = @data.map { |d| lookup.key?(d) }
  DaruLite::Core::Query::BoolArray.new(membership)
end
|
369
|
+
|
370
|
+
# Return a new vector based on the contents of a boolean array. Use with the
|
371
|
+
# comparator methods to obtain meaningful results. See this notebook for
|
372
|
+
# a good overview of using #where.
|
373
|
+
#
|
374
|
+
# @param bool_array [DaruLite::Core::Query::BoolArray, Array<TrueClass, FalseClass>] The
|
375
|
+
# collection containing the true or false values. Each element in the Vector
|
376
|
+
# corresponding to a `true` in the bool_array will be returned along with its
|
377
|
+
# index.
|
378
|
+
# @example Usage of #where.
|
379
|
+
# vector = DaruLite::Vector.new([2,4,5,51,5,16,2,5,3,2,1,5,2,5,2,1,56,234,6,21])
|
380
|
+
#
|
381
|
+
# # Simple logic statement passed to #where.
|
382
|
+
# vector.where(vector.eq(5).or(vector.eq(1)))
|
383
|
+
# # =>
|
384
|
+
# ##<DaruLite::Vector:77626210 @name = nil @size = 7 >
|
385
|
+
# # nil
|
386
|
+
# # 2 5
|
387
|
+
# # 4 5
|
388
|
+
# # 7 5
|
389
|
+
# # 10 1
|
390
|
+
# # 11 5
|
391
|
+
# # 13 5
|
392
|
+
# # 15 1
|
393
|
+
#
|
394
|
+
# # A somewhat more complex logic statement
|
395
|
+
# vector.where((vector.eq(5) | vector.lteq(1)) & vector.in([4,5,1]))
|
396
|
+
# #=>
|
397
|
+
# ##<DaruLite::Vector:81072310 @name = nil @size = 7 >
|
398
|
+
# # nil
|
399
|
+
# # 2 5
|
400
|
+
# # 4 5
|
401
|
+
# # 7 5
|
402
|
+
# # 10 1
|
403
|
+
# # 11 5
|
404
|
+
# # 13 5
|
405
|
+
# # 15 1
|
406
|
+
def where(bool_array)
  # Delegates the filtering to the Query module.
  DaruLite::Core::Query.vector_where(self, bool_array)
end
|
409
|
+
|
410
|
+
# Return a new vector based on the contents of a boolean array and &block.
|
411
|
+
#
|
412
|
+
# @param bool_array [DaruLite::Core::Query::BoolArray, Array<TrueClass, FalseClass>, &block] The
|
413
|
+
# collection containing the true or false values. Each element in the Vector
|
414
|
+
# corresponding to a `true` in the bool_array will be returned along with its
|
415
|
+
# index. The &block may contain manipulative functions for the Vector elements.
|
416
|
+
#
|
417
|
+
# @return [DaruLite::Vector]
|
418
|
+
#
|
419
|
+
# @example Usage of #apply_where.
|
420
|
+
# dv = DaruLite::Vector.new ['3 days', '5 weeks', '2 weeks']
|
421
|
+
# dv = dv.apply_where(dv.match /weeks/) { |x| "#{x.split.first.to_i * 7} days" }
|
422
|
+
# # =>
|
423
|
+
# ##<DaruLite::Vector(3)>
|
424
|
+
# # 0 3 days
|
425
|
+
# # 1 35 days
|
426
|
+
# # 2 14 days
|
427
|
+
def apply_where(bool_array, &block)
  # Delegates to the Query module, forwarding the block unchanged.
  DaruLite::Core::Query.vector_apply_where(self, bool_array, &block)
end
|
430
|
+
|
431
|
+
# Slice of the vector from key 0 through q - 1 (q defaults to 10), via #[].
def head(q = 10)
  upper = q - 1
  self[0..upper]
end
|
434
|
+
|
435
|
+
# Slice covering the last q entries (q defaults to 10), via #[]; the start is clamped at 0.
def tail(q = 10)
  from = size > q ? size - q : 0
  upto = size - 1
  self[from..upto]
end
|
439
|
+
|
440
|
+
def last(q = 1)
  # Enumerable has no #last of its own, so this delegates to #tail.
  tail(q)
end
|
444
|
+
|
445
|
+
# True when the index holds no entries.
def empty?
  @index.empty?
end
|
448
|
+
|
449
|
+
# True when #type reports :numeric.
def numeric?
  type == :numeric
end
|
452
|
+
|
453
|
+
# True when #type reports :object.
def object?
  type == :object
end
|
456
|
+
|
457
|
+
# Reports whether missing data is present in the Vector.
|
458
|
+
def has_missing_data?
  # Index entries whose value is one of DaruLite::MISSING_VALUES.
  missing = indexes(*DaruLite::MISSING_VALUES)
  !missing.empty?
end
|
461
|
+
alias flawed? has_missing_data?
|
462
|
+
deprecate :has_missing_data?, :include_values?, 2016, 10
|
463
|
+
deprecate :flawed?, :include_values?, 2016, 10
|
464
|
+
|
465
|
+
# Check if any one of mentioned values occur in the vector
|
466
|
+
# @param values [Array] values to check for
|
467
|
+
# @return [true, false] returns true if any one of specified values
|
468
|
+
# occur in the vector
|
469
|
+
# @example
|
470
|
+
# dv = DaruLite::Vector.new [1, 2, 3, 4, nil]
|
471
|
+
# dv.include_values? nil, Float::NAN
|
472
|
+
# # => true
|
473
|
+
def include_values?(*values)
  # Stops at the first candidate that include_with_nan? finds in the data.
  values.any? { |candidate| include_with_nan?(@data, candidate) }
end
|
476
|
+
|
477
|
+
# @note Do not use it to check for Float::NAN as
|
478
|
+
# Float::NAN == Float::NAN is false
|
479
|
+
# Return vector of booleans with value at ith position is either
|
480
|
+
# true or false depending upon whether value at position i is equal to
|
481
|
+
# any of the values passed in the argument or not
|
482
|
+
# @param values [Array] values to equate with
|
483
|
+
# @return [DaruLite::Vector] vector of boolean values
|
484
|
+
# @example
|
485
|
+
# dv = DaruLite::Vector.new [1, 2, 3, 2, 1]
|
486
|
+
# dv.is_values 1, 2
|
487
|
+
# # => #<DaruLite::Vector(5)>
|
488
|
+
# # 0 true
|
489
|
+
# # 1 true
|
490
|
+
# # 2 false
|
491
|
+
# # 3 true
|
492
|
+
# # 4 true
|
493
|
+
def is_values(*values)
  # One equality mask per candidate value, OR-ed together.
  masks = values.map { |v| eq(v) }
  DaruLite::Vector.new(masks.inject(:|))
end
|
496
|
+
|
497
|
+
# Append an element to the vector by specifying the element and index
|
498
|
+
def concat(element, index)
  raise IndexError, 'Expected new unique index' if @index.include?(index)

  # Extend the index first, then store the element at the slot the index
  # reports for the new label.
  @index |= [index]
  slot = @index[index]
  @data[slot] = element

  update_position_cache
end
|
506
|
+
alias push concat
|
507
|
+
alias << concat
|
508
|
+
|
509
|
+
# Cast a vector to a new data type.
|
510
|
+
#
|
511
|
+
# == Options
|
512
|
+
#
|
513
|
+
# * +:dtype+ - :array for Ruby Array.
|
514
|
+
def cast(opts = {})
  target = opts[:dtype]
  raise ArgumentError, "Unsupported dtype #{opts[:dtype]}" if target != :array

  # Nothing to do when the vector already has the requested dtype.
  return if @dtype == target

  @data = cast_vector_to(target)
end
|
520
|
+
|
521
|
+
# Delete an element by value
|
522
|
+
def delete(element)
  # Resolve the value to its index label, then delete by label.
  label = index_of(element)
  delete_at(label)
end
|
525
|
+
|
526
|
+
# Delete element by index
|
527
|
+
def delete_at(index)
  # Drop the value at the position the index reports for this label...
  position = @index[index]
  @data.delete_at(position)

  # ...then rebuild the index without that label.
  remaining = @index.to_a - [index]
  @index = DaruLite::Index.new(remaining)

  update_position_cache
end
|
533
|
+
|
534
|
+
# The type of data contained in the vector. Can be :object.
|
535
|
+
#
|
536
|
+
# Running through the data to figure out the kind of data is delayed to the
|
537
|
+
# last possible moment.
|
538
|
+
def type
  # Reuse the cached answer unless it is missing or flagged as stale.
  return @type unless @type.nil? || @possibly_changed_type

  @type = :numeric
  each do |e|
    # nil and Numeric values are compatible with :numeric; the first value
    # of any other kind makes the vector :object.
    unless e.nil? || e.is_a?(Numeric)
      @type = :object
      break
    end
  end
  @possibly_changed_type = false

  @type
end
|
552
|
+
|
553
|
+
# Tells if vector is categorical or not.
|
554
|
+
# @return [true, false] true if vector is of type category, false otherwise
|
555
|
+
# @example
|
556
|
+
# dv = DaruLite::Vector.new [1, 2, 3], type: :category
|
557
|
+
# dv.category?
|
558
|
+
# # => true
|
559
|
+
def category?
  # True only when #type reports :category.
  type == :category
end
|
562
|
+
|
563
|
+
# Get index of element
|
564
|
+
def index_of(element)
  position =
    if dtype == :array
      # Strict match via eql? (so 1 and 1.0 are different elements).
      @data.index { |x| x.eql?(element) }
    else
      @data.index(element)
    end
  @index.key(position)
end
|
570
|
+
|
571
|
+
# Keep only unique elements of the vector along with their indexes.
|
572
|
+
def uniq
  distinct = @data.uniq
  # Each surviving value is labelled with whatever index_of reports for it.
  labels = distinct.map { |element| index_of(element) }

  DaruLite::Vector.new(distinct, name: @name, index: labels, dtype: @dtype)
end
|
578
|
+
|
579
|
+
# Runs any? on the collection returned by @data.data, forwarding the block.
def any?(&block)
  @data.data.any?(&block)
end
|
582
|
+
|
583
|
+
# Runs all? on the collection returned by @data.data, forwarding the block.
def all?(&block)
  @data.data.all?(&block)
end
|
586
|
+
|
587
|
+
# Sorts a vector according to its values. If a block is specified, the contents
|
588
|
+
# will be evaluated and data will be swapped whenever the block evaluates
|
589
|
+
# to *true*. Defaults to ascending order sorting. Any missing values will be
|
590
|
+
# put at the end of the vector. Preserves indexing. Default sort algorithm is
|
591
|
+
# quick sort.
|
592
|
+
#
|
593
|
+
# == Options
|
594
|
+
#
|
595
|
+
# * +:ascending+ - if false, will sort in descending order. Defaults to true.
|
596
|
+
#
|
597
|
+
# * +:type+ - Specify the sorting algorithm. Only supports quick_sort for now.
|
598
|
+
# == Usage
|
599
|
+
#
|
600
|
+
# v = DaruLite::Vector.new ["My first guitar", "jazz", "guitar"]
|
601
|
+
# # Say you want to sort these strings by length.
|
602
|
+
# v.sort(ascending: false) { |a,b| a.length <=> b.length }
|
603
|
+
def sort(opts = {}, &block)
  opts = { ascending: true }.merge(opts)

  # resort_index yields [value, position] pairs in sorted order; split them
  # into the sorted values and the permutation applied to the index.
  sorted_pairs = resort_index(@data.each_with_index, opts, &block)
  sorted_values, order = sorted_pairs.transpose
  reordered_index = @index.reorder(order)

  DaruLite::Vector.new(sorted_values, index: reordered_index, name: @name, dtype: @dtype)
end
|
613
|
+
|
614
|
+
# Sorts the vector according to its `Index` values. Defaults to ascending
|
615
|
+
# order sorting.
|
616
|
+
#
|
617
|
+
# @param [Hash] opts the options for sort_by_index method.
|
618
|
+
# @option opts [Boolean] :ascending false, will sort `index` in
|
619
|
+
# descending order.
|
620
|
+
#
|
621
|
+
# @return [Vector] new sorted `Vector` according to the index values.
|
622
|
+
#
|
623
|
+
# @example
|
624
|
+
#
|
625
|
+
# dv = DaruLite::Vector.new [11, 13, 12], index: [23, 21, 22]
|
626
|
+
# # Say you want to sort index in ascending order
|
627
|
+
# dv.sort_by_index(ascending: true)
|
628
|
+
# #=> DaruLite::Vector.new [13, 12, 11], index: [21, 22, 23]
|
629
|
+
# # Say you want to sort index in descending order
|
630
|
+
# dv.sort_by_index(ascending: false)
|
631
|
+
# #=> DaruLite::Vector.new [11, 12, 13], index: [23, 22, 21]
|
632
|
+
def sort_by_index(opts = {})
  merged = { ascending: true }.merge(opts)
  # Only the permutation (second row of the transposed pairs) is needed.
  new_order = resort_index(@index.each_with_index, merged).transpose[1]

  reorder(new_order)
end
|
638
|
+
|
639
|
+
# Comparator over [value, position] pairs: nil values order before non-nil
# ones, two nils are ordered by position, everything else by value.
DEFAULT_SORTER = lambda { |(lv, li), (rv, ri)|
  next li <=> ri if lv.nil? && rv.nil?
  next -1 if lv.nil?
  next 1 if rv.nil?

  lv <=> rv
}
|
650
|
+
|
651
|
+
# Just sort the data and get an Array in return using Enumerable#sort.
|
652
|
+
# Non-destructive.
|
653
|
+
# :nocov:
|
654
|
+
def sorted_data(&block)
  # Sort an Array copy of the data; the optional block is the comparator.
  values = @data.to_a
  values.sort(&block)
end
|
657
|
+
# :nocov:
|
658
|
+
|
659
|
+
# Like map, but returns a DaruLite::Vector with the returned values.
|
660
|
+
def recode(dt = nil, &block)
  if block
    # Recode a copy so the receiver itself is left as it was.
    dup.recode!(dt, &block)
  else
    to_enum(:recode, dt)
  end
end
|
665
|
+
|
666
|
+
# Destructive version of #recode.
|
667
|
+
def recode!(dt = nil, &block)
  return to_enum(:recode!, dt) if block.nil?

  @data.map!(&block).data
  # Re-wrap the mapped data, using the requested dtype or else the current one.
  target_dtype = dt || @dtype
  @data = cast_vector_to(target_dtype)
  self
end
|
674
|
+
|
675
|
+
# Delete an element if block returns true. Destructive.
|
676
|
+
def delete_if
  return to_enum(:delete_if) unless block_given?

  # [element, index] pairs for which the block returned a falsy value.
  kept = each_with_index.reject { |n, _i| yield(n) }

  # Split the pairs by hand instead of with #transpose: `[].transpose` is
  # `[]`, so when every element is deleted a destructuring assignment would
  # leave both halves nil rather than empty arrays.
  keep_e = kept.map(&:first)
  keep_i = kept.map(&:last)

  @data = cast_vector_to @dtype, keep_e
  @index = DaruLite::Index.new(keep_i)

  update_position_cache

  self
end
|
688
|
+
|
689
|
+
# Keep an element if block returns true. Destructive.
|
690
|
+
def keep_if
  if block_given?
    # Keeping what matches is deleting what does not.
    delete_if { |val| !yield(val) }
  else
    to_enum(:keep_if)
  end
end
|
695
|
+
|
696
|
+
# Reports all values that don't comply with a condition.
|
697
|
+
# Returns a hash with the index of data and the invalid data.
|
698
|
+
def verify
  # Match delete_if/keep_if: hand back an Enumerator when no block is given
  # instead of failing with LocalJumpError on the first yield.
  return to_enum(:verify) unless block_given?

  # Collect position => value for every value the block rejects.
  (0...size).each_with_object({}) do |i, invalid|
    val = @data[i]
    invalid[i] = val unless yield(val)
  end
end
|
704
|
+
|
705
|
+
# Return an Array with the data split by a separator.
|
706
|
+
# a=DaruLite::Vector.new(["a,b","c,d","a,b","d"])
|
707
|
+
# a.splitted
|
708
|
+
# =>
|
709
|
+
# [["a","b"],["c","d"],["a","b"],["d"]]
|
710
|
+
def splitted(sep = ',')
  @data.map do |s|
    # nil stays nil; values without #split are wrapped in a one-element array.
    next nil if s.nil?

    s.respond_to?(:split) ? s.split(sep) : [s]
  end
end
|
721
|
+
|
722
|
+
# Returns a Hash of Vectors keyed by each distinct token obtained after
# splitting every element on +sep+. Each Vector holds, per element, the
# result of #split_value: 1 when the element contains the token, 0 when it
# does not, nil when the element is nil.
# Example:
#
#   a = DaruLite::Vector.new(["a,b","c,d","a,b"])
#   a.split_by_separator
#   # => {"a" => Vector with data [1, 0, 1],
#   #     "b" => Vector with data [1, 0, 1],
#   #     "c" => Vector with data [0, 1, 0],
#   #     "d" => Vector with data [0, 1, 0]}
#
def split_by_separator(sep = ',')
  pieces = splitted sep
  tokens = pieces.flatten.uniq.compact
  tokens.to_h do |token|
    flags = pieces.map { |parts| split_value(token, parts) }
    [token, DaruLite::Vector.new(flags)]
  end
end
|
745
|
+
|
746
|
+
def split_by_separator_freq(sep = ',')
|
747
|
+
split_by_separator(sep).transform_values do |v|
|
748
|
+
v.sum(&:to_i)
|
749
|
+
end
|
750
|
+
end
|
751
|
+
|
752
|
+
def reset_index!
|
753
|
+
@index = DaruLite::Index.new(Array.new(size) { |i| i })
|
754
|
+
self
|
755
|
+
end
|
756
|
+
|
757
|
+
# Replace all nils in the vector with the value passed as an argument. Destructive.
|
758
|
+
# See #replace_nils for non-destructive version
|
759
|
+
#
|
760
|
+
# == Arguments
|
761
|
+
#
|
762
|
+
# * +replacement+ - The value which should replace all nils
|
763
|
+
def replace_nils!(replacement)
|
764
|
+
indexes(*DaruLite::MISSING_VALUES).each do |idx|
|
765
|
+
self[idx] = replacement
|
766
|
+
end
|
767
|
+
|
768
|
+
self
|
769
|
+
end
|
770
|
+
|
771
|
+
# Rolling fillna
# Replaces every Float::NAN and nil value with the preceding or following
# valid value. Destructive. Invalid values met before any valid value has
# been seen (in the chosen direction) are filled with 0.
#
# @param direction [Symbol] (:forward, :backward) whether the replacement
#   value is the preceding or the following one
#
# @example
#   dv = DaruLite::Vector.new([1, 2, 1, 4, nil, Float::NAN, 3, nil, Float::NAN])
#
#   dv.rolling_fillna(:forward)
#   # => #<DaruLite::Vector(9)>
#   #   0   1
#   #   1   2
#   #   2   1
#   #   3   4
#   #   4   4
#   #   5   4
#   #   6   3
#   #   7   3
#   #   8   3
#
def rolling_fillna!(direction = :forward)
  labels = direction == :forward ? index : index.reverse_each
  filler = 0
  labels.each do |label|
    current = self[label]
    if valid_value?(current)
      filler = current
    else
      self[label] = filler
    end
  end
  self
end
|
803
|
+
|
804
|
+
# Non-destructive version of rolling_fillna!
|
805
|
+
def rolling_fillna(direction = :forward)
|
806
|
+
dup.rolling_fillna!(direction)
|
807
|
+
end
|
808
|
+
|
809
|
+
# Lags the series by `k` periods.
#
# The data is "shifted" by `k` positions and the vacated slots are filled
# with `nil`, so the result always has the size of the original vector.
#
# `k` can be a positive or negative integer. With a positive `k` the `nil`s
# are inserted at the beginning of the vector, with a negative `k` at the
# end. A `k` whose magnitude exceeds what the vector can shift yields a
# vector made only of `nil`s; `k == 0` returns a duplicate.
#
# @param [Integer] k number of periods to shift by (default = 1)
#
# @return [DaruLite::Vector] a new vector with the shifted values and the
#   inserted `nil`s, the same length as the original vector.
#
# @example Lag a vector with different periods `k`
#
#   ts = DaruLite::Vector.new(1..5)
#   # => [1, 2, 3, 4, 5]
#
#   ts.lag     # => [nil, 1, 2, 3, 4]
#   ts.lag(1)  # => [nil, 1, 2, 3, 4]
#   ts.lag(2)  # => [nil, nil, 1, 2, 3]
#   ts.lag(-1) # => [2, 3, 4, 5, nil]
#
def lag(k = 1)
  return dup if k == 0 # rubocop:disable Style/NumericPredicate

  shifted =
    if (1...size).cover?(k)
      ([nil] * k) + data.to_a
    elsif (-size..-1).cover?(k)
      data.to_a[k.abs...size]
    else
      []
    end

  # #copy pads with nils and truncates to the vector's size.
  copy(shifted)
end
|
847
|
+
|
848
|
+
def detach_index
|
849
|
+
DaruLite::DataFrame.new(
|
850
|
+
index: @index.to_a,
|
851
|
+
values: @data.to_a
|
852
|
+
)
|
853
|
+
end
|
854
|
+
|
855
|
+
# Non-destructive version of #replace_nils!
|
856
|
+
def replace_nils(replacement)
|
857
|
+
dup.replace_nils!(replacement)
|
858
|
+
end
|
859
|
+
|
860
|
+
# number of non-missing elements
|
861
|
+
def n_valid
|
862
|
+
size - indexes(*DaruLite::MISSING_VALUES).size
|
863
|
+
end
|
864
|
+
deprecate :n_valid, :count_values, 2016, 10
|
865
|
+
|
866
|
+
# Count the number of values specified
|
867
|
+
# @param values [Array] values to count for
|
868
|
+
# @return [Integer] the number of times the values mentioned occurs
|
869
|
+
# @example
|
870
|
+
# dv = DaruLite::Vector.new [1, 2, 1, 2, 3, 4, nil, nil]
|
871
|
+
# dv.count_values nil
|
872
|
+
# # => 2
|
873
|
+
def count_values(*values)
|
874
|
+
positions(*values).size
|
875
|
+
end
|
876
|
+
|
877
|
+
# Returns *true* if an index exists
|
878
|
+
def has_index?(index)
|
879
|
+
@index.include? index
|
880
|
+
end
|
881
|
+
|
882
|
+
# @param keys [Array] can be positions (if by_position is true) or indexes (if by_position is false)
|
883
|
+
# @return [DaruLite::Vector]
|
884
|
+
def get_sub_vector(keys, by_position: true)
|
885
|
+
return DaruLite::Vector.new([]) if keys == []
|
886
|
+
|
887
|
+
keys = @index.pos(*keys) unless by_position
|
888
|
+
|
889
|
+
sub_vect = at(*keys)
|
890
|
+
sub_vect = DaruLite::Vector.new([sub_vect]) unless sub_vect.is_a?(DaruLite::Vector)
|
891
|
+
|
892
|
+
sub_vect
|
893
|
+
end
|
894
|
+
|
895
|
+
# @return [DaruLite::DataFrame] the vector as a single-vector dataframe
|
896
|
+
def to_df
|
897
|
+
DaruLite::DataFrame.new({ @name => @data }, name: @name, index: @index)
|
898
|
+
end
|
899
|
+
|
900
|
+
# Convert Vector to a horizontal or vertical Ruby Matrix.
#
# == Arguments
#
# * +axis+ - *:horizontal* builds a one-row matrix, *:vertical* a
#   one-column matrix. Any other value raises ArgumentError.
def to_matrix(axis = :horizontal)
  unless %i[horizontal vertical].include?(axis)
    raise ArgumentError, "axis should be either :horizontal or :vertical, not #{axis}"
  end

  axis == :horizontal ? Matrix[to_a] : Matrix.columns([to_a])
end
|
915
|
+
|
916
|
+
# Convert to hash (explicit). Hash keys are indexes and values are the corresponding elements
|
917
|
+
def to_h
|
918
|
+
@index.to_h { |index| [index, self[index]] }
|
919
|
+
end
|
920
|
+
|
921
|
+
# Return an array
|
922
|
+
def to_a
|
923
|
+
@data.to_a
|
924
|
+
end
|
925
|
+
|
926
|
+
# Convert the hash from #to_h to JSON.
#
# Any generator arguments (for example the state object the JSON library
# passes down when this vector is nested inside another structure, or an
# options Hash) are forwarded to Hash#to_json so formatting options are
# honored instead of being silently dropped.
#
# @return [String] JSON text of the index => value mapping
def to_json(*args)
  to_h.to_json(*args)
end
|
930
|
+
|
931
|
+
# Convert to html for iruby
|
932
|
+
def to_html(threshold = 30)
|
933
|
+
table_thead = to_html_thead
|
934
|
+
table_tbody = to_html_tbody(threshold)
|
935
|
+
path = if index.is_a?(MultiIndex)
|
936
|
+
File.expand_path('iruby/templates/vector_mi.html.erb', __dir__)
|
937
|
+
else
|
938
|
+
File.expand_path('iruby/templates/vector.html.erb', __dir__)
|
939
|
+
end
|
940
|
+
ERB.new(File.read(path).strip).result(binding)
|
941
|
+
end
|
942
|
+
|
943
|
+
def to_html_thead
|
944
|
+
table_thead_path =
|
945
|
+
if index.is_a?(MultiIndex)
|
946
|
+
File.expand_path('iruby/templates/vector_mi_thead.html.erb', __dir__)
|
947
|
+
else
|
948
|
+
File.expand_path('iruby/templates/vector_thead.html.erb', __dir__)
|
949
|
+
end
|
950
|
+
ERB.new(File.read(table_thead_path).strip).result(binding)
|
951
|
+
end
|
952
|
+
|
953
|
+
def to_html_tbody(threshold = 30)
|
954
|
+
table_tbody_path =
|
955
|
+
if index.is_a?(MultiIndex)
|
956
|
+
File.expand_path('iruby/templates/vector_mi_tbody.html.erb', __dir__)
|
957
|
+
else
|
958
|
+
File.expand_path('iruby/templates/vector_tbody.html.erb', __dir__)
|
959
|
+
end
|
960
|
+
ERB.new(File.read(table_tbody_path).strip).result(binding)
|
961
|
+
end
|
962
|
+
|
963
|
+
# One-line description: class, optional name, size and a ':category' marker
# when the vector is a category vector.
def to_s
  name_part = @name ? ": #{@name}" : ''
  size_part = "(#{size})"
  category_part = category? ? ':category' : ''
  "#<#{self.class}#{name_part}#{size_part}#{category_part}>"
end
|
966
|
+
|
967
|
+
# Create a summary of the Vector
|
968
|
+
# @param indent_level [Fixnum] indent level
|
969
|
+
# @return [String] String containing the summary of the Vector
|
970
|
+
# @example
|
971
|
+
# dv = DaruLite::Vector.new [1, 2, 3]
|
972
|
+
# puts dv.summary
|
973
|
+
#
|
974
|
+
# # =
|
975
|
+
# # n :3
|
976
|
+
# # non-missing:3
|
977
|
+
# # median: 2
|
978
|
+
# # mean: 2.0000
|
979
|
+
# # std.dev.: 1.0000
|
980
|
+
# # std.err.: 0.5774
|
981
|
+
# # skew: 0.0000
|
982
|
+
# # kurtosis: -2.3333
|
983
|
+
def summary(indent_level = 0)
|
984
|
+
non_missing = size - count_values(*DaruLite::MISSING_VALUES)
|
985
|
+
summary = (' =' * indent_level) + "= #{name}" \
|
986
|
+
"\n n :#{size}" \
|
987
|
+
"\n non-missing:#{non_missing}"
|
988
|
+
case type
|
989
|
+
when :object
|
990
|
+
summary << object_summary
|
991
|
+
when :numeric
|
992
|
+
summary << numeric_summary
|
993
|
+
end
|
994
|
+
summary.split("\n").join("\n#{' ' * indent_level}")
|
995
|
+
end
|
996
|
+
|
997
|
+
# Displays summary for an object type Vector: its factors, its mode and a
# distribution table built from #frequencies, with a percentage column.
# @return [String] String containing object vector summary
def object_summary
  # Percentages are taken against the number of NON-missing values (size
  # minus the count of DaruLite::MISSING_VALUES), matching the non-missing
  # figure reported by #summary. Dividing by the missing count itself made
  # every row read 100% whenever nothing was missing.
  nval = size - count_values(*DaruLite::MISSING_VALUES)
  summary = "\n factors: #{factors.to_a.join(',')}" \
            "\n mode: #{mode.to_a.join(',')}" \
            "\n Distribution\n"

  # Each pair is (count, label); a row is [label, count, percentage].
  data = frequencies.sort.each_with_index.map do |v, k|
    [k, v, format('%0.2f%%', ((nval.zero? ? 1 : v.quo(nval)) * 100))]
  end

  summary + Formatters::Table.format(data)
end
|
1011
|
+
|
1012
|
+
# Displays summary for an numeric type Vector
|
1013
|
+
# @return [String] String containing numeric vector summary
|
1014
|
+
def numeric_summary
|
1015
|
+
summary = "\n median: #{median}" +
|
1016
|
+
format("\n mean: %0.4f", mean)
|
1017
|
+
if sd
|
1018
|
+
summary << (format("\n std.dev.: %0.4f", sd) +
|
1019
|
+
format("\n std.err.: %0.4f", se))
|
1020
|
+
end
|
1021
|
+
|
1022
|
+
if count_values(*DaruLite::MISSING_VALUES).zero?
|
1023
|
+
summary << (format("\n skew: %0.4f", skew) +
|
1024
|
+
format("\n kurtosis: %0.4f", kurtosis))
|
1025
|
+
end
|
1026
|
+
summary
|
1027
|
+
end
|
1028
|
+
|
1029
|
+
# Over rides original inspect for pretty printing in irb
|
1030
|
+
def inspect(spacing = 20, threshold = 15)
|
1031
|
+
row_headers = index.is_a?(MultiIndex) ? index.sparse_tuples : index.to_a
|
1032
|
+
|
1033
|
+
"#<#{self.class}(#{size})#{':category' if category?}>\n" +
|
1034
|
+
Formatters::Table.format(
|
1035
|
+
to_a.lazy.map { |v| [v] },
|
1036
|
+
headers: @name && [@name],
|
1037
|
+
row_headers: row_headers,
|
1038
|
+
threshold: threshold,
|
1039
|
+
spacing: spacing
|
1040
|
+
)
|
1041
|
+
end
|
1042
|
+
|
1043
|
+
# Sets new index for vector. Preserves index->value correspondence.
|
1044
|
+
# Sets nil for new index keys absent from original index.
|
1045
|
+
# @note Unlike #reorder! which takes positions as input it takes
|
1046
|
+
# index as an input to reorder the vector
|
1047
|
+
# @param [DaruLite::Index, DaruLite::MultiIndex] new_index new index to order with
|
1048
|
+
# @return [DaruLite::Vector] vector reindexed with new index
|
1049
|
+
def reindex!(new_index)
|
1050
|
+
values = []
|
1051
|
+
each_with_index do |val, i|
|
1052
|
+
values[new_index[i]] = val if new_index.include?(i)
|
1053
|
+
end
|
1054
|
+
values.fill(nil, values.size, new_index.size - values.size)
|
1055
|
+
|
1056
|
+
@data = cast_vector_to @dtype, values
|
1057
|
+
@index = new_index
|
1058
|
+
|
1059
|
+
update_position_cache
|
1060
|
+
|
1061
|
+
self
|
1062
|
+
end
|
1063
|
+
|
1064
|
+
# Reorder the vector with given positions
|
1065
|
+
# @note Unlike #reindex! which takes index as input, it takes
|
1066
|
+
# positions as an input to reorder the vector
|
1067
|
+
# @param [Array] order the order to reorder the vector with
|
1068
|
+
# @return reordered vector
|
1069
|
+
# @example
|
1070
|
+
# dv = DaruLite::Vector.new [3, 2, 1], index: ['c', 'b', 'a']
|
1071
|
+
# dv.reorder! [2, 1, 0]
|
1072
|
+
# # => #<DaruLite::Vector(3)>
|
1073
|
+
# # a 1
|
1074
|
+
# # b 2
|
1075
|
+
# # c 3
|
1076
|
+
def reorder!(order)
|
1077
|
+
@index = @index.reorder order
|
1078
|
+
data_array = order.map { |i| @data[i] }
|
1079
|
+
@data = cast_vector_to @dtype, data_array, @nm_dtype
|
1080
|
+
update_position_cache
|
1081
|
+
self
|
1082
|
+
end
|
1083
|
+
|
1084
|
+
# Non-destructive version of #reorder!
|
1085
|
+
def reorder(order)
|
1086
|
+
dup.reorder! order
|
1087
|
+
end
|
1088
|
+
|
1089
|
+
# Create a new vector with a different index, and preserve the indexing of
|
1090
|
+
# current elements.
|
1091
|
+
def reindex(new_index)
|
1092
|
+
dup.reindex!(new_index)
|
1093
|
+
end
|
1094
|
+
|
1095
|
+
def index=(idx)
|
1096
|
+
idx = Index.coerce(idx)
|
1097
|
+
|
1098
|
+
raise ArgumentError, "Size of supplied index #{idx.size} does not match size of Vector" if idx.size != size
|
1099
|
+
raise ArgumentError, 'Can only assign type Index and its subclasses.' unless idx.is_a?(DaruLite::Index)
|
1100
|
+
|
1101
|
+
@index = idx
|
1102
|
+
self
|
1103
|
+
end
|
1104
|
+
|
1105
|
+
# Give the vector a new name
|
1106
|
+
#
|
1107
|
+
# @param new_name [Symbol] The new name.
|
1108
|
+
def rename(new_name)
|
1109
|
+
@name = new_name
|
1110
|
+
self
|
1111
|
+
end
|
1112
|
+
|
1113
|
+
alias name= rename
|
1114
|
+
|
1115
|
+
# Duplicated a vector
|
1116
|
+
# @return [DaruLite::Vector] duplicated vector
|
1117
|
+
def dup
|
1118
|
+
DaruLite::Vector.new @data.dup, name: @name, index: @index.dup
|
1119
|
+
end
|
1120
|
+
|
1121
|
+
# == Bootstrap
|
1122
|
+
# Generate +nr+ resamples (with replacement) of size +s+
|
1123
|
+
# from vector, computing each estimate from +estimators+
|
1124
|
+
# over each resample.
|
1125
|
+
# +estimators+ could be
|
1126
|
+
# a) Hash with variable names as keys and lambdas as values
|
1127
|
+
# a.bootstrap(:log_s2=>lambda {|v| Math.log(v.variance)},1000)
|
1128
|
+
# b) Array with names of method to bootstrap
|
1129
|
+
# a.bootstrap([:mean, :sd],1000)
|
1130
|
+
# c) A single method to bootstrap
|
1131
|
+
# a.bootstrap(:mean, 1000)
|
1132
|
+
# If s is nil, is set to vector size by default.
|
1133
|
+
#
|
1134
|
+
# Returns a DataFrame where each vector is a vector
|
1135
|
+
# of length +nr+ containing the computed resample estimates.
|
1136
|
+
def bootstrap(estimators, nr, s = nil)
|
1137
|
+
s ||= size
|
1138
|
+
h_est, es, bss = prepare_bootstrap(estimators)
|
1139
|
+
|
1140
|
+
nr.times do
|
1141
|
+
bs = sample_with_replacement(s)
|
1142
|
+
es.each do |estimator|
|
1143
|
+
bss[estimator].push(h_est[estimator].call(bs))
|
1144
|
+
end
|
1145
|
+
end
|
1146
|
+
|
1147
|
+
es.each do |est|
|
1148
|
+
bss[est] = DaruLite::Vector.new bss[est]
|
1149
|
+
end
|
1150
|
+
|
1151
|
+
DaruLite::DataFrame.new bss
|
1152
|
+
end
|
1153
|
+
|
1154
|
+
# == Jacknife
|
1155
|
+
# Returns a dataset with jacknife delete-+k+ +estimators+
|
1156
|
+
# +estimators+ could be:
|
1157
|
+
# a) Hash with variable names as keys and lambdas as values
|
1158
|
+
# a.jacknife(:log_s2=>lambda {|v| Math.log(v.variance)})
|
1159
|
+
# b) Array with method names to jacknife
|
1160
|
+
# a.jacknife([:mean, :sd])
|
1161
|
+
# c) A single method to jacknife
|
1162
|
+
# a.jacknife(:mean)
|
1163
|
+
# +k+ represent the block size for block jacknife. By default
|
1164
|
+
# is set to 1, for classic delete-one jacknife.
|
1165
|
+
#
|
1166
|
+
# Returns a dataset where each vector is an vector
|
1167
|
+
# of length +cases+/+k+ containing the computed jacknife estimates.
|
1168
|
+
#
|
1169
|
+
# == Reference:
|
1170
|
+
# * Sawyer, S. (2005). Resampling Data: Using a Statistical Jacknife.
|
1171
|
+
def jackknife(estimators, k = 1) # rubocop:disable Metrics/MethodLength
|
1172
|
+
raise "n should be divisible by k:#{k}" unless (size % k).zero?
|
1173
|
+
|
1174
|
+
nb = (size / k).to_i
|
1175
|
+
h_est, es, ps = prepare_bootstrap(estimators)
|
1176
|
+
|
1177
|
+
est_n = es.to_h { |v| [v, h_est[v].call(self)] }
|
1178
|
+
|
1179
|
+
nb.times do |i|
|
1180
|
+
other = @data.dup
|
1181
|
+
other.slice!(i * k, k)
|
1182
|
+
other = DaruLite::Vector.new other
|
1183
|
+
|
1184
|
+
es.each do |estimator|
|
1185
|
+
# Add pseudovalue
|
1186
|
+
ps[estimator].push(
|
1187
|
+
(nb * est_n[estimator]) - ((nb - 1) * h_est[estimator].call(other))
|
1188
|
+
)
|
1189
|
+
end
|
1190
|
+
end
|
1191
|
+
|
1192
|
+
es.each do |est|
|
1193
|
+
ps[est] = DaruLite::Vector.new ps[est]
|
1194
|
+
end
|
1195
|
+
DaruLite::DataFrame.new ps
|
1196
|
+
end
|
1197
|
+
|
1198
|
+
# Returns an Array of booleans telling, element by element, whether the
# value matches the given +regexp+.
#
# Values that cannot be pattern-matched (they do not respond to +=~+, e.g.
# numbers on Ruby >= 3.2, where Object#=~ was removed) count as
# non-matching instead of raising NoMethodError. nil never matches.
#
# @param regexp [Regexp] A regular matching expression. For example, +/weeks/+.
#
# @return [Array] +true+ or +false+ for each element, according to the
#   match with the given +regexp+
#
# @example
#   dv = DaruLite::Vector.new(['3 days', '5 weeks', '2 weeks'])
#   dv.match(/weeks/)
#
#   # => [false, true, true]
def match(regexp)
  @data.map { |value| value.respond_to?(:=~) && !!(value =~ regexp) }
end
|
1213
|
+
|
1214
|
+
# Creates a new vector consisting only of non-nil data
|
1215
|
+
#
|
1216
|
+
# == Arguments
|
1217
|
+
#
|
1218
|
+
# @param as_a [Symbol] Passing :array will return only the elements
|
1219
|
+
# as an Array. Otherwise will return a DaruLite::Vector.
|
1220
|
+
#
|
1221
|
+
# @param _duplicate [Symbol] In case no missing data is found in the
|
1222
|
+
# vector, setting this to false will return the same vector.
|
1223
|
+
# Otherwise, a duplicate will be returned irrespective of
|
1224
|
+
# presence of missing data.
|
1225
|
+
|
1226
|
+
def only_valid(as_a = :vector, _duplicate = true)
|
1227
|
+
# FIXME: Now duplicate is just ignored.
|
1228
|
+
# There are no spec that fail on this case, so I'll leave it
|
1229
|
+
# this way for now - zverok, 2016-05-07
|
1230
|
+
|
1231
|
+
new_index = @index.to_a - indexes(*DaruLite::MISSING_VALUES)
|
1232
|
+
new_vector = new_index.map { |idx| self[idx] }
|
1233
|
+
|
1234
|
+
if as_a == :vector
|
1235
|
+
DaruLite::Vector.new new_vector, index: new_index, name: @name, dtype: dtype
|
1236
|
+
else
|
1237
|
+
new_vector
|
1238
|
+
end
|
1239
|
+
end
|
1240
|
+
deprecate :only_valid, :reject_values, 2016, 10
|
1241
|
+
|
1242
|
+
# Return a vector with specified values removed
|
1243
|
+
# @param values [Array] values to reject from resultant vector
|
1244
|
+
# @return [DaruLite::Vector] vector with specified values removed
|
1245
|
+
# @example
|
1246
|
+
# dv = DaruLite::Vector.new [1, 2, nil, Float::NAN]
|
1247
|
+
# dv.reject_values nil, Float::NAN
|
1248
|
+
# # => #<DaruLite::Vector(2)>
|
1249
|
+
# # 0 1
|
1250
|
+
# # 1 2
|
1251
|
+
def reject_values(*values)
|
1252
|
+
resultant_pos = size.times.to_a - positions(*values)
|
1253
|
+
dv = at(*resultant_pos)
|
1254
|
+
# Handle the case when number of positions is 1
|
1255
|
+
# and hence #at doesn't return a vector
|
1256
|
+
if dv.is_a?(DaruLite::Vector)
|
1257
|
+
dv
|
1258
|
+
else
|
1259
|
+
pos = resultant_pos.first
|
1260
|
+
at(pos..pos)
|
1261
|
+
end
|
1262
|
+
end
|
1263
|
+
|
1264
|
+
# Return indexes of values specified
|
1265
|
+
# @param values [Array] values to find indexes for
|
1266
|
+
# @return [Array] array of indexes of values specified
|
1267
|
+
# @example
|
1268
|
+
# dv = DaruLite::Vector.new [1, 2, nil, Float::NAN], index: 11..14
|
1269
|
+
# dv.indexes nil, Float::NAN
|
1270
|
+
# # => [13, 14]
|
1271
|
+
def indexes(*values)
|
1272
|
+
index.to_a.values_at(*positions(*values))
|
1273
|
+
end
|
1274
|
+
|
1275
|
+
# Replaces specified values with a new value
|
1276
|
+
# @param [Array] old_values array of values to replace
|
1277
|
+
# @param [object] new_value new value to replace with
|
1278
|
+
# @note It performs the replace in place.
|
1279
|
+
# @return [DaruLite::Vector] Same vector itself with values
|
1280
|
+
# replaced with new value
|
1281
|
+
# @example
|
1282
|
+
# dv = DaruLite::Vector.new [1, 2, :a, :b]
|
1283
|
+
# dv.replace_values [:a, :b], nil
|
1284
|
+
# dv
|
1285
|
+
# # =>
|
1286
|
+
# # #<DaruLite::Vector:19903200 @name = nil @metadata = {} @size = 4 >
|
1287
|
+
# # nil
|
1288
|
+
# # 0 1
|
1289
|
+
# # 1 2
|
1290
|
+
# # 2 nil
|
1291
|
+
# # 3 nil
|
1292
|
+
def replace_values(old_values, new_value)
|
1293
|
+
old_values = [old_values] unless old_values.is_a? Array
|
1294
|
+
size.times do |pos|
|
1295
|
+
set_at([pos], new_value) if include_with_nan? old_values, at(pos)
|
1296
|
+
end
|
1297
|
+
self
|
1298
|
+
end
|
1299
|
+
|
1300
|
+
# Returns a Vector containing only missing data (preserves indexes).
|
1301
|
+
def only_missing(as_a = :vector)
|
1302
|
+
case as_a
|
1303
|
+
when :vector
|
1304
|
+
self[*indexes(*DaruLite::MISSING_VALUES)]
|
1305
|
+
when :array
|
1306
|
+
self[*indexes(*DaruLite::MISSING_VALUES)].to_a
|
1307
|
+
end
|
1308
|
+
end
|
1309
|
+
deprecate :only_missing, nil, 2016, 10
|
1310
|
+
|
1311
|
+
# Returns a Vector with only numerical data. Missing data is included
|
1312
|
+
# but non-Numeric objects are excluded. Preserves index.
|
1313
|
+
def only_numerics
|
1314
|
+
numeric_indexes =
|
1315
|
+
each_with_index
|
1316
|
+
.select { |v, _i| v.is_a?(Numeric) || v.nil? }
|
1317
|
+
.map(&:last)
|
1318
|
+
|
1319
|
+
self[*numeric_indexes]
|
1320
|
+
end
|
1321
|
+
|
1322
|
+
# Matches a string that is, in its entirety, a dd-dd-dddd or dddd-dd-dd
# date. Anchored with \A and \z (not ^ and $, which match at every line
# boundary) so a multi-line value that merely contains a date line is not
# mistaken for a date.
DATE_REGEXP = /\A(\d{2}-\d{2}-\d{4}|\d{4}-\d{2}-\d{2})\z/.freeze

# Returns the database type for the vector, according to its content.
# The checks run in this order on the string form of each value:
# * any value that looks like a date           => 'DATE'
# * any value with a character other than a
#   digit, 'e', '.' or '-'                     => 'VARCHAR (255)'
# * any value containing a '.'                 => 'DOUBLE'
# * otherwise                                  => 'INTEGER'
def db_type
  if @data.any? { |v| DATE_REGEXP.match?(v.to_s) }
    'DATE'
  elsif @data.any? { |v| /[^0-9e.-]/.match?(v.to_s) }
    'VARCHAR (255)'
  elsif @data.any? { |v| v.to_s.include?('.') }
    'DOUBLE'
  else
    'INTEGER'
  end
end
|
1337
|
+
|
1338
|
+
# Copies the structure of the vector (i.e the index, size, etc.) and fills all
|
1339
|
+
# values with nils.
|
1340
|
+
def clone_structure
|
1341
|
+
DaruLite::Vector.new(([nil] * size), name: @name, index: @index.dup)
|
1342
|
+
end
|
1343
|
+
|
1344
|
+
# Save the vector to a file
|
1345
|
+
#
|
1346
|
+
# == Arguments
|
1347
|
+
#
|
1348
|
+
# * filename - Path of file where the vector is to be saved
|
1349
|
+
def save(filename)
|
1350
|
+
DaruLite::IO.save self, filename
|
1351
|
+
end
|
1352
|
+
|
1353
|
+
def _dump(*) # :nodoc:
|
1354
|
+
Marshal.dump(
|
1355
|
+
data: @data.to_a,
|
1356
|
+
dtype: @dtype,
|
1357
|
+
name: @name,
|
1358
|
+
index: @index
|
1359
|
+
)
|
1360
|
+
end
|
1361
|
+
|
1362
|
+
# :nocov:
|
1363
|
+
def daru_lite_vector(*)
|
1364
|
+
self
|
1365
|
+
end
|
1366
|
+
# :nocov:
|
1367
|
+
|
1368
|
+
alias dv daru_lite_vector
|
1369
|
+
|
1370
|
+
# Converts a non category type vector to category type vector.
|
1371
|
+
# @param [Hash] opts options to convert to category
|
1372
|
+
# @option opts [true, false] :ordered Specify if vector is ordered or not.
|
1373
|
+
# If it is ordered, it can be sorted and min, max like functions would work
|
1374
|
+
# @option opts [Array] :categories set categories in the specified order
|
1375
|
+
# @return [DaruLite::Vector] vector with type category
|
1376
|
+
def to_category(opts = {})
|
1377
|
+
dv = DaruLite::Vector.new to_a, type: :category, name: @name, index: @index
|
1378
|
+
dv.ordered = opts[:ordered] || false
|
1379
|
+
dv.categories = opts[:categories] if opts[:categories]
|
1380
|
+
dv
|
1381
|
+
end
|
1382
|
+
|
1383
|
+
def method_missing(name, *args, &block)
|
1384
|
+
# FIXME: it is shamefully fragile. Should be either made stronger
|
1385
|
+
# (string/symbol dychotomy, informative errors) or removed totally. - zverok
|
1386
|
+
if name =~ /(.+)=/
|
1387
|
+
self[Regexp.last_match(1).to_sym] = args[0]
|
1388
|
+
elsif has_index?(name)
|
1389
|
+
self[name]
|
1390
|
+
else
|
1391
|
+
super
|
1392
|
+
end
|
1393
|
+
end
|
1394
|
+
|
1395
|
+
def respond_to_missing?(name, include_private = false)
|
1396
|
+
name.to_s.end_with?('=') || has_index?(name) || super
|
1397
|
+
end
|
1398
|
+
|
1399
|
+
# Partition a numeric variable into categories.
|
1400
|
+
# @param [Array<Numeric>] partitions an array whose consecutive elements
|
1401
|
+
# provide intervals for categories
|
1402
|
+
# @param [Hash] opts options to cut the partition
|
1403
|
+
# @option opts [:left, :right] :close_at specifies whether the interval closes at
|
1404
|
+
# the right side or left side
|
1405
|
+
# @option opts [Array] :labels names of the categories
|
1406
|
+
# @return [DaruLite::Vector] numeric variable converted to categorical variable
|
1407
|
+
# @example
|
1408
|
+
# heights = DaruLite::Vector.new [30, 35, 32, 50, 42, 51]
|
1409
|
+
# height_cat = heights.cut [30, 40, 50, 60], labels: ['low', 'medium', 'high']
|
1410
|
+
# # => #<DaruLite::Vector(6)>
|
1411
|
+
# # 0 low
|
1412
|
+
# # 1 low
|
1413
|
+
# # 2 low
|
1414
|
+
# # 3 high
|
1415
|
+
# # 4 medium
|
1416
|
+
# # 5 high
|
1417
|
+
def cut(partitions, opts = {})
|
1418
|
+
close_at = opts[:close_at] || :right
|
1419
|
+
labels = opts[:labels]
|
1420
|
+
partitions = partitions.to_a
|
1421
|
+
values = to_a.map { |val| cut_find_category partitions, val, close_at }
|
1422
|
+
cats = cut_categories(partitions, close_at)
|
1423
|
+
|
1424
|
+
dv = DaruLite::Vector.new values,
|
1425
|
+
index: @index,
|
1426
|
+
type: :category,
|
1427
|
+
categories: cats
|
1428
|
+
|
1429
|
+
# Rename categories if new labels provided
|
1430
|
+
if labels
|
1431
|
+
dv.rename_categories cats.zip(labels).to_h
|
1432
|
+
else
|
1433
|
+
dv
|
1434
|
+
end
|
1435
|
+
end
|
1436
|
+
|
1437
|
+
def positions(*values)
|
1438
|
+
case values
|
1439
|
+
when [nil]
|
1440
|
+
nil_positions
|
1441
|
+
when [Float::NAN]
|
1442
|
+
nan_positions
|
1443
|
+
when [nil, Float::NAN], [Float::NAN, nil]
|
1444
|
+
nil_positions + nan_positions
|
1445
|
+
else
|
1446
|
+
size.times.select { |i| include_with_nan? values, @data[i] }
|
1447
|
+
end
|
1448
|
+
end
|
1449
|
+
|
1450
|
+
def group_by(*args)
|
1451
|
+
to_df.group_by(*args)
|
1452
|
+
end
|
1453
|
+
|
1454
|
+
private
|
1455
|
+
|
1456
|
+
# Builds a new vector that keeps this vector's index and name but holds
# +values+, padded with trailing nils or truncated so that it has exactly
# this vector's size. The +values+ array passed in is not modified.
#
# @param values [Array] replacement values
# @return [DaruLite::Vector]
def copy(values)
  # Reading past the end of +values+ yields nil, which provides the padding;
  # stopping at +size+ provides the truncation.
  fitted = Array.new(size) { |pos| values[pos] }
  DaruLite::Vector.new(fitted, index: @index, name: @name)
end
|
1461
|
+
|
1462
|
+
# Positions of the nil elements. The result is kept in @nil_positions and
# reused on later calls once it has been computed.
def nil_positions
  @nil_positions ||= (0...size).select { |pos| @data[pos].nil? }
end
|
1466
|
+
|
1467
|
+
# Positions of the elements that respond to #nan? and report true. The
# result is kept in @nan_positions and reused on later calls once computed.
def nan_positions
  @nan_positions ||= (0...size).select do |pos|
    element = @data[pos]
    element.respond_to?(:nan?) && element.nan?
  end
end
|
1473
|
+
|
1474
|
+
# Helper telling whether an arbitrary value counts as valid, i.e. it is
# neither nil nor a value whose #nan? is true.
def valid_value?(v)
  return false if v.nil?

  !(v.respond_to?(:nan?) && v.nan?)
end
|
1478
|
+
|
1479
|
+
def initialize_vector(source, opts)
|
1480
|
+
index, source = parse_source(source, opts)
|
1481
|
+
set_name opts[:name]
|
1482
|
+
|
1483
|
+
@data = cast_vector_to(opts[:dtype] || :array, source, opts[:nm_dtype])
|
1484
|
+
@index = Index.coerce(index || @data.size)
|
1485
|
+
|
1486
|
+
guard_sizes!
|
1487
|
+
|
1488
|
+
@possibly_changed_type = true
|
1489
|
+
end
|
1490
|
+
|
1491
|
+
# Splits the constructor input into an [index, values] pair.
# A Hash source supplies both (its keys and its values); otherwise the index
# comes from opts[:index] and a nil source becomes an empty Array.
def parse_source(source, opts)
  return [source.keys, source.values] if source.is_a?(Hash)

  [opts[:index], source || []]
end
|
1498
|
+
|
1499
|
+
def guard_sizes!
|
1500
|
+
if @index.size > @data.size
|
1501
|
+
cast(dtype: :array) # NM with nils seg faults
|
1502
|
+
@data.fill(nil, @data.size...@index.size)
|
1503
|
+
elsif @index.size < @data.size
|
1504
|
+
raise IndexError, "Expected index size >= vector size. Index size : #{@index.size}, vector size : #{@data.size}"
|
1505
|
+
end
|
1506
|
+
end
|
1507
|
+
|
1508
|
+
def guard_type_check(value)
|
1509
|
+
@possibly_changed_type = true \
|
1510
|
+
if (object? && (value.nil? || value.is_a?(Numeric))) ||
|
1511
|
+
(numeric? && !value.is_a?(Numeric) && !value.nil?)
|
1512
|
+
end
|
1513
|
+
|
1514
|
+
# Flags whether +v+ contains +key+: nil for a nil +v+, 1 when +v+ includes
# +key+, 0 otherwise.
def split_value(key, v)
  return nil if v.nil?

  v.include?(key) ? 1 : 0
end
|
1523
|
+
|
1524
|
+
# Normalizes the +estimators+ argument (a Hash, an Array of method names or
# a single method name) and returns an array with three elements:
#   1.- A Hash with estimator names as keys and lambdas as values. A Hash
#       argument is used as is; for names, each lambda wraps its argument in
#       a DaruLite::Vector and sends it the named method.
#   2.- An Array with the estimator names
#   3.- A Hash with estimator names as keys and empty Arrays as values
def prepare_bootstrap(estimators)
  lambdas =
    if estimators.is_a?(Hash)
      estimators
    else
      method_names = estimators.is_a?(Array) ? estimators : [estimators]
      method_names.each_with_object({}) do |method_name, table|
        table[method_name] = ->(v) { DaruLite::Vector.new(v).send(method_name) }
      end
    end

  names = lambdas.keys
  buckets = names.each_with_object({}) { |name, table| table[name] = [] }

  [lambdas, names, buckets]
end
|
1542
|
+
|
1543
|
+
# NOTE: To maintain sanity, this _MUST_ be the _ONLY_ place in daru where the
|
1544
|
+
# @param dtype [db_type] variable is set and the underlying data type of vector changed.
|
1545
|
+
def cast_vector_to(dtype, source = nil, _nm_dtype = nil)
|
1546
|
+
source = @data.to_a if source.nil?
|
1547
|
+
|
1548
|
+
new_vector =
|
1549
|
+
case dtype
|
1550
|
+
when :array then DaruLite::Accessors::ArrayWrapper.new(source, self)
|
1551
|
+
when :mdarray then raise NotImplementedError, 'MDArray not yet supported.'
|
1552
|
+
else raise ArgumentError, "Unknown dtype #{dtype}"
|
1553
|
+
end
|
1554
|
+
|
1555
|
+
@dtype = dtype
|
1556
|
+
new_vector
|
1557
|
+
end
|
1558
|
+
|
1559
|
+
# Stores the vector's name. An Array name (a MultiIndex tuple) is joined
# into a single String; anything else is stored unchanged.
def set_name(name) # rubocop:disable Naming/AccessorMethodName
  @name =
    if name.is_a?(Array)
      name.join
    else
      name
    end
end
|
1562
|
+
|
1563
|
+
# Raises IndexError for the first position that is greater than or equal to
# the vector's size; otherwise returns the positions.
def validate_positions(*positions)
  offender = positions.find { |pos| pos >= size }
  raise IndexError, "#{offender} is not a valid position." if offender

  positions
end
|
1569
|
+
|
1570
|
+
# Coerces the positions argument list:
# * a single Integer is returned as is,
# * a single Range is expanded into the Array of positions it selects out
#   of 0...size,
# * any other single argument raises ArgumentError,
# * zero or several arguments are returned unchanged as an Array.
def coerce_positions(*positions)
  return positions unless positions.size == 1

  single = positions.first
  return single if single.is_a?(Integer)
  return (0...size).to_a[single] if single.is_a?(Range)

  raise ArgumentError, 'Unkown position type.'
end
|
1585
|
+
|
1586
|
+
# Helper method for []=.
|
1587
|
+
# Assigs existing index to another value
|
1588
|
+
def modify_vector(indexes, val)
|
1589
|
+
positions = @index.pos(*indexes)
|
1590
|
+
|
1591
|
+
if positions.is_a? Numeric
|
1592
|
+
@data[positions] = val
|
1593
|
+
else
|
1594
|
+
positions.each { |pos| @data[pos] = val }
|
1595
|
+
end
|
1596
|
+
end
|
1597
|
+
|
1598
|
+
# Helper method for []=.
|
1599
|
+
# Add a new index and assign it value
|
1600
|
+
# Extends the vector with new index entries, all initialised to val.
#
# @index.add returns the enlarged index; val is appended to @data once per
# entry the index grew by, and only then is @index replaced.
#
# @param indexes [Array] index labels to add, splatted into @index.add
# @param val [Object] value stored for every added entry
# @return [Object] the new index
def insert_vector(indexes, val)
  grown_index = @index.add(*indexes)
  added_count = grown_index.size - @index.size

  # Pad the data so it stays aligned with the grown index.
  added_count.times { @data << val }

  @index = grown_index
end
|
1606
|
+
|
1607
|
+
# Works similarly to #[]= but also inserts a new entry when the index is not valid
|
1608
|
+
# It exists only for use by DaruLite::DataFrame and is not meant to be called by users.
|
1609
|
+
# Writes val at the given index, adding the index first when it is missing.
#
# Steps, in order:
# 1. cast to the :array dtype when a nil is being stored in a non-:array vector
# 2. run guard_type_check on the value
# 3. overwrite (modify_vector) if @index.valid? accepts the labels, otherwise
#    append (insert_vector)
# 4. call update_position_cache
#
# @param indexes [Array] index labels, splatted into @index.valid?
# @param val [Object] value to store
# @return [Object] whatever update_position_cache returns
def set(indexes, val)
  needs_cast = val.nil? && dtype != :array
  cast(dtype: :array) if needs_cast
  guard_type_check(val)

  @index.valid?(*indexes) ? modify_vector(indexes, val) : insert_vector(indexes, val)

  update_position_cache
end
|
1621
|
+
|
1622
|
+
# Builds the category label ("lower-upper") of the partition interval that
# contains val.
#
# * :right - picks the first partition strictly greater than val; the label is
#   "#{previous partition}-#{that partition - 1}".
# * :left  - picks the first partition greater than or equal to val; the label
#   is "#{previous partition + 1}-#{that partition}".
#
# A value with no partition before it (the match is at position 0) lies
# outside every interval. This used to wrap around to the last partition via
# index -1 and return a bogus label; it now raises like any other
# out-of-range value.
#
# @param partitions [Array<Numeric>] partition boundaries, presumably in
#   ascending order - TODO confirm against the caller
# @param val [Numeric] value to classify
# @param close_at [Symbol] :right or :left
# @return [String] the label of the interval holding val
# @raise [ArgumentError] if val lies outside the partitions or close_at is
#   neither :right nor :left
def cut_find_category(partitions, val, close_at)
  case close_at
  when :right
    right_index = partitions.index { |bound| bound > val }
    raise ArgumentError, 'Invalid partition' if right_index.nil? || right_index.zero?

    "#{partitions[right_index - 1]}-#{partitions[right_index] - 1}"
  when :left
    right_index = partitions.index { |bound| bound >= val }
    raise ArgumentError, 'Invalid partition' if right_index.nil? || right_index.zero?

    "#{partitions[right_index - 1] + 1}-#{partitions[right_index]}"
  else
    raise ArgumentError, "Invalid parameter #{close_at} to close_at."
  end
end
|
1640
|
+
|
1641
|
+
# Lists the category labels for every pair of adjacent partitions.
#
# * :right - "#{lower}-#{upper - 1}" for each adjacent pair
# * :left  - "#{lower + 1}-#{upper}" for each adjacent pair
#
# Fewer than two partitions yield an empty list. An unknown close_at raises
# with the same message as cut_find_category instead of silently returning nil.
#
# @param partitions [Array<Numeric>] partition boundaries
# @param close_at [Symbol] :right or :left
# @return [Array<String>] one label per adjacent pair of partitions
# @raise [ArgumentError] if close_at is neither :right nor :left
def cut_categories(partitions, close_at)
  case close_at
  when :right
    partitions.each_cons(2).map { |lower, upper| "#{lower}-#{upper - 1}" }
  when :left
    partitions.each_cons(2).map { |lower, upper| "#{lower + 1}-#{upper}" }
  else
    raise ArgumentError, "Invalid parameter #{close_at} to close_at."
  end
end
|
1653
|
+
|
1654
|
+
# Membership test that also works for NaN.
#
# NaN is never == to itself, so a plain include? cannot find it. When value
# reports nan?, the array is searched for any element that reports nan? as
# well; every other value goes through include?.
#
# @param array [Array] collection to search
# @param value [Object] value to look for
# @return [Boolean] true if value (or any NaN, when value is NaN) is in array
def include_with_nan?(array, value)
  looking_for_nan = value.respond_to?(:nan?) && value.nan?
  return array.include?(value) unless looking_for_nan

  array.any? { |item| item.respond_to?(:nan?) && item.nan? }
end
|
1663
|
+
|
1664
|
+
# Clears the @nil_positions and @nan_positions instance variables.
#
# NOTE(review): these look like memoised lookups that are rebuilt on demand
# after the data changes - confirm against their readers.
#
# @return [nil]
def update_position_cache
  @nil_positions = @nan_positions = nil
end
|
1668
|
+
|
1669
|
+
# Sorts a list of [value, index] pairs.
#
# With a block, pairs are ordered by yielding only their values to it;
# without one, DEFAULT_SORTER compares the pairs. The sorted list is reversed
# in place unless opts[:ascending] is truthy.
#
# @param vector_index [Array<Array>] [value, index] pairs
# @param opts [Hash] options; only :ascending is read here
# @yield [left_value, right_value] optional comparator returning -1, 0 or 1
# @return [Array<Array>] the sorted (and possibly reversed) pairs
def resort_index(vector_index, opts)
  sorted =
    if block_given?
      vector_index.sort { |(left_value, _left_idx), (right_value, _right_idx)| yield(left_value, right_value) }
    else
      vector_index.sort(&DEFAULT_SORTER)
    end

  sorted.reverse! unless opts[:ascending]
  sorted
end
|
1677
|
+
end
|
1678
|
+
end
|