daru_lite 0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.github/ISSUE_TEMPLATE.md +18 -0
- data/.github/workflows/ci.yml +33 -0
- data/.gitignore +10 -0
- data/.rspec +2 -0
- data/.rubocop.yml +27 -0
- data/.rubocop_todo.yml +137 -0
- data/CONTRIBUTING.md +47 -0
- data/Gemfile +2 -0
- data/History.md +4 -0
- data/LICENSE +24 -0
- data/README.md +218 -0
- data/Rakefile +69 -0
- data/ReleasePolicy.md +20 -0
- data/benchmarks/TradeoffData.csv +65 -0
- data/benchmarks/csv_reading.rb +22 -0
- data/benchmarks/dataframe_creation.rb +39 -0
- data/benchmarks/db_loading.rb +34 -0
- data/benchmarks/duplicating.rb +45 -0
- data/benchmarks/group_by.rb +32 -0
- data/benchmarks/joining.rb +52 -0
- data/benchmarks/row_access.rb +41 -0
- data/benchmarks/row_assign.rb +36 -0
- data/benchmarks/sorting.rb +51 -0
- data/benchmarks/statistics.rb +28 -0
- data/benchmarks/vector_access.rb +31 -0
- data/benchmarks/vector_assign.rb +42 -0
- data/benchmarks/where_clause.rb +48 -0
- data/benchmarks/where_vs_filter.rb +28 -0
- data/daru_lite.gemspec +55 -0
- data/images/README.md +5 -0
- data/images/con0.png +0 -0
- data/images/con1.png +0 -0
- data/images/init0.png +0 -0
- data/images/init1.png +0 -0
- data/images/man0.png +0 -0
- data/images/man1.png +0 -0
- data/images/man2.png +0 -0
- data/images/man3.png +0 -0
- data/images/man4.png +0 -0
- data/images/man5.png +0 -0
- data/images/man6.png +0 -0
- data/lib/daru_lite/accessors/array_wrapper.rb +109 -0
- data/lib/daru_lite/accessors/dataframe_by_row.rb +25 -0
- data/lib/daru_lite/accessors/mdarray_wrapper.rb +7 -0
- data/lib/daru_lite/category.rb +929 -0
- data/lib/daru_lite/configuration.rb +34 -0
- data/lib/daru_lite/core/group_by.rb +403 -0
- data/lib/daru_lite/core/merge.rb +270 -0
- data/lib/daru_lite/core/query.rb +109 -0
- data/lib/daru_lite/dataframe.rb +3080 -0
- data/lib/daru_lite/date_time/index.rb +569 -0
- data/lib/daru_lite/date_time/offsets.rb +397 -0
- data/lib/daru_lite/exceptions.rb +2 -0
- data/lib/daru_lite/extensions/which_dsl.rb +53 -0
- data/lib/daru_lite/formatters/table.rb +52 -0
- data/lib/daru_lite/helpers/array.rb +53 -0
- data/lib/daru_lite/index/categorical_index.rb +201 -0
- data/lib/daru_lite/index/index.rb +374 -0
- data/lib/daru_lite/index/multi_index.rb +374 -0
- data/lib/daru_lite/io/csv/converters.rb +21 -0
- data/lib/daru_lite/io/io.rb +294 -0
- data/lib/daru_lite/io/sql_data_source.rb +97 -0
- data/lib/daru_lite/iruby/helpers.rb +38 -0
- data/lib/daru_lite/iruby/templates/dataframe.html.erb +5 -0
- data/lib/daru_lite/iruby/templates/dataframe_mi.html.erb +5 -0
- data/lib/daru_lite/iruby/templates/dataframe_mi_tbody.html.erb +35 -0
- data/lib/daru_lite/iruby/templates/dataframe_mi_thead.html.erb +21 -0
- data/lib/daru_lite/iruby/templates/dataframe_tbody.html.erb +28 -0
- data/lib/daru_lite/iruby/templates/dataframe_thead.html.erb +21 -0
- data/lib/daru_lite/iruby/templates/multi_index.html.erb +12 -0
- data/lib/daru_lite/iruby/templates/vector.html.erb +5 -0
- data/lib/daru_lite/iruby/templates/vector_mi.html.erb +5 -0
- data/lib/daru_lite/iruby/templates/vector_mi_tbody.html.erb +26 -0
- data/lib/daru_lite/iruby/templates/vector_mi_thead.html.erb +8 -0
- data/lib/daru_lite/iruby/templates/vector_tbody.html.erb +17 -0
- data/lib/daru_lite/iruby/templates/vector_thead.html.erb +8 -0
- data/lib/daru_lite/maths/arithmetic/dataframe.rb +91 -0
- data/lib/daru_lite/maths/arithmetic/vector.rb +117 -0
- data/lib/daru_lite/maths/statistics/dataframe.rb +202 -0
- data/lib/daru_lite/maths/statistics/vector.rb +1019 -0
- data/lib/daru_lite/monkeys.rb +56 -0
- data/lib/daru_lite/vector.rb +1678 -0
- data/lib/daru_lite/version.rb +3 -0
- data/lib/daru_lite.rb +99 -0
- data/profile/_base.rb +23 -0
- data/profile/df_to_a.rb +10 -0
- data/profile/filter.rb +13 -0
- data/profile/joining.rb +13 -0
- data/profile/sorting.rb +12 -0
- data/profile/vector_each_with_index.rb +9 -0
- data/profile/vector_new.rb +9 -0
- data/spec/accessors/array_wrapper_spec.rb +3 -0
- data/spec/category_spec.rb +1741 -0
- data/spec/core/group_by_spec.rb +655 -0
- data/spec/core/merge_spec.rb +179 -0
- data/spec/core/query_spec.rb +347 -0
- data/spec/daru_lite_spec.rb +22 -0
- data/spec/dataframe_spec.rb +4330 -0
- data/spec/date_time/data_spec.rb +197 -0
- data/spec/date_time/date_time_index_helper_spec.rb +72 -0
- data/spec/date_time/index_spec.rb +588 -0
- data/spec/date_time/offsets_spec.rb +465 -0
- data/spec/extensions/which_dsl_spec.rb +38 -0
- data/spec/fixtures/bank2.dat +200 -0
- data/spec/fixtures/boolean_converter_test.csv +5 -0
- data/spec/fixtures/countries.json +7794 -0
- data/spec/fixtures/duplicates.csv +32 -0
- data/spec/fixtures/eciresults.html +394 -0
- data/spec/fixtures/empties.dat +2 -0
- data/spec/fixtures/empty_rows_test.csv +17 -0
- data/spec/fixtures/macau.html +3691 -0
- data/spec/fixtures/macd_data.csv +150 -0
- data/spec/fixtures/matrix_test.csv +100 -0
- data/spec/fixtures/moneycontrol.html +6812 -0
- data/spec/fixtures/music_data.tsv +2501 -0
- data/spec/fixtures/repeated_fields.csv +7 -0
- data/spec/fixtures/sales-funnel.csv +18 -0
- data/spec/fixtures/scientific_notation.csv +4 -0
- data/spec/fixtures/string_converter_test.csv +5 -0
- data/spec/fixtures/strings.dat +2 -0
- data/spec/fixtures/test_xls.xls +0 -0
- data/spec/fixtures/test_xls_2.xls +0 -0
- data/spec/fixtures/url_test.txt~ +0 -0
- data/spec/fixtures/valid_markup.html +62 -0
- data/spec/fixtures/wiki_climate.html +1243 -0
- data/spec/fixtures/wiki_table_info.html +631 -0
- data/spec/formatters/table_formatter_spec.rb +137 -0
- data/spec/helpers_spec.rb +8 -0
- data/spec/index/categorical_index_spec.rb +170 -0
- data/spec/index/index_spec.rb +417 -0
- data/spec/index/multi_index_spec.rb +680 -0
- data/spec/io/io_spec.rb +373 -0
- data/spec/io/sql_data_source_spec.rb +56 -0
- data/spec/iruby/dataframe_spec.rb +170 -0
- data/spec/iruby/helpers_spec.rb +49 -0
- data/spec/iruby/multi_index_spec.rb +37 -0
- data/spec/iruby/vector_spec.rb +105 -0
- data/spec/maths/arithmetic/dataframe_spec.rb +148 -0
- data/spec/maths/arithmetic/vector_spec.rb +165 -0
- data/spec/maths/statistics/dataframe_spec.rb +178 -0
- data/spec/maths/statistics/vector_spec.rb +756 -0
- data/spec/monkeys_spec.rb +42 -0
- data/spec/shared/vector_display_spec.rb +213 -0
- data/spec/spec_helper.rb +87 -0
- data/spec/support/database_helper.rb +30 -0
- data/spec/support/matchers.rb +5 -0
- data/spec/vector_spec.rb +2293 -0
- metadata +571 -0
|
@@ -0,0 +1,1678 @@
|
|
|
1
|
+
require 'daru_lite/maths/arithmetic/vector'
|
|
2
|
+
require 'daru_lite/maths/statistics/vector'
|
|
3
|
+
require 'daru_lite/accessors/array_wrapper'
|
|
4
|
+
require 'daru_lite/category'
|
|
5
|
+
|
|
6
|
+
module DaruLite
|
|
7
|
+
class Vector # rubocop:disable Metrics/ClassLength
|
|
8
|
+
include Enumerable
|
|
9
|
+
include DaruLite::Maths::Arithmetic::Vector
|
|
10
|
+
include DaruLite::Maths::Statistics::Vector
|
|
11
|
+
extend Gem::Deprecate
|
|
12
|
+
|
|
13
|
+
class << self
|
|
14
|
+
# Create a new vector by specifying the size and an optional value
|
|
15
|
+
# and block to generate values.
|
|
16
|
+
#
|
|
17
|
+
# == Description
|
|
18
|
+
#
|
|
19
|
+
# The *new_with_size* class method lets you create a DaruLite::Vector
|
|
20
|
+
# by specifying the size as the argument. The optional block, if
|
|
21
|
+
# supplied, is run once for populating each element in the Vector.
|
|
22
|
+
#
|
|
23
|
+
# The result of each run of the block is the value that is ultimately
|
|
24
|
+
# assigned to that position in the Vector.
|
|
25
|
+
#
|
|
26
|
+
# == Options
|
|
27
|
+
# :value
|
|
28
|
+
# All the rest like .new
|
|
29
|
+
# @param n [Integer] number of elements to create
# @param opts [Hash] options forwarded to DaruLite::Vector.new; the extra
#   +:value+ key supplies the fill value used when no block is given
# @yield [position] optional block producing the element for each position
# @return [DaruLite::Vector]
def new_with_size(n, opts = {}, &block)
  # Work on a copy so the caller's hash keeps its :value entry (and so a
  # frozen options hash can be passed in).
  opts = opts.dup
  value = opts.delete :value
  block ||= ->(_) { value }
  DaruLite::Vector.new Array.new(n, &block), opts
end
|
|
34
|
+
|
|
35
|
+
# Create a vector using (almost) any object
|
|
36
|
+
# * Array: flattened
|
|
37
|
+
# * Range: transformed using to_a
|
|
38
|
+
# * DaruLite::Vector
|
|
39
|
+
# * Numeric and string values
|
|
40
|
+
#
|
|
41
|
+
# == Description
|
|
42
|
+
#
|
|
43
|
+
# The `Vector.[]` class method creates a vector from almost any
|
|
44
|
+
# object that has a `#to_a` method defined on it. It is similar
|
|
45
|
+
# to R's `c` method.
|
|
46
|
+
#
|
|
47
|
+
# == Usage
|
|
48
|
+
#
|
|
49
|
+
# a = DaruLite::Vector[1,2,3,4,6..10]
|
|
50
|
+
# #=>
|
|
51
|
+
# # <DaruLite::Vector:99448510 @name = nil @size = 9 >
|
|
52
|
+
# # nil
|
|
53
|
+
# # 0 1
|
|
54
|
+
# # 1 2
|
|
55
|
+
# # 2 3
|
|
56
|
+
# # 3 4
|
|
57
|
+
# # 4 6
|
|
58
|
+
# # 5 7
|
|
59
|
+
# # 6 8
|
|
60
|
+
# # 7 9
|
|
61
|
+
# # 8 10
|
|
62
|
+
# @param indexes [Array<Object>] objects to combine; anything responding to
#   +to_a+ is expanded, and nested arrays are flattened
# @return [DaruLite::Vector]
def [](*indexes)
  expanded = indexes.flat_map do |item|
    item.respond_to?(:to_a) ? item.to_a : [item]
  end
  DaruLite::Vector.new(expanded.flatten)
end
|
|
68
|
+
|
|
69
|
+
# Rebuilds a vector from a marshalled Hash with the keys :data, :index,
# :name, :dtype and :missing_values. Marshal.load must only be given
# trusted input.
def _load(data) # :nodoc:
  attrs = Marshal.load(data)
  vector_opts = {
    index: attrs[:index],
    name: attrs[:name],
    dtype: attrs[:dtype],
    missing_values: attrs[:missing_values]
  }
  DaruLite::Vector.new(attrs[:data], vector_opts)
end
|
|
76
|
+
|
|
77
|
+
# Returns +data+ unchanged when it is already a vector, builds a new vector
# from an Array or Hash, and rejects everything else.
#
# @param data [DaruLite::Vector, Array, Hash]
# @param options [Hash] passed to +new+ when a vector has to be built
# @raise [ArgumentError] for any other kind of +data+
def coerce(data, options = {})
  return data if data.is_a?(DaruLite::Vector)
  return new(data, options) if data.is_a?(Array) || data.is_a?(Hash)

  raise ArgumentError, "Can't coerce #{data.class} to #{self}"
end
|
|
87
|
+
end
|
|
88
|
+
|
|
89
|
+
# @return [Integer] number of elements, as reported by the underlying data store
def size
  @data.size
end
|
|
92
|
+
|
|
93
|
+
# Iterates over the stored values.
#
# @yield [value] each element of the vector
# @return [DaruLite::Vector, Enumerator] self, or an Enumerator without a block
def each(&block)
  if block
    @data.each(&block)
    self
  else
    to_enum(:each)
  end
end
|
|
99
|
+
|
|
100
|
+
# Iterates over the index entries.
#
# @yield [label] each entry of the index
# @return [DaruLite::Vector, Enumerator] self, or an Enumerator without a block
def each_index(&block)
  if block
    @index.each(&block)
    self
  else
    to_enum(:each_index)
  end
end
|
|
106
|
+
|
|
107
|
+
# Iterates over [value, index entry] pairs.
#
# @yield [value, label] each element together with its index entry
# @return [DaruLite::Vector, Enumerator] self, or an Enumerator without a block
def each_with_index(&block)
  return to_enum(:each_with_index) if block.nil?

  pairs = @data.to_a.zip(@index.to_a)
  pairs.each(&block)
  self
end
|
|
114
|
+
|
|
115
|
+
# Replaces every element, in place, with the block's result.
#
# @yield [value] each element of the vector
# @return [DaruLite::Vector, Enumerator] self, or an Enumerator without a block
def map!(&block)
  if block
    @data.map!(&block)
    self
  else
    to_enum(:map!)
  end
end
|
|
121
|
+
|
|
122
|
+
# Applies +method+ to the whole vector, or to a sub-vector when +keys+ is given.
#
# @param method [Symbol, Proc] a method name sent to the vector, or a callable
#   invoked with the vector as its argument
# @param keys [Object, nil] when truthy, passed to +get_sub_vector+ to narrow
#   the target
# @param by_position [Boolean] forwarded to +get_sub_vector+
# @return [Object] whatever the method or callable returns
# @raise [RuntimeError] when +method+ is neither a Symbol nor a Proc
def apply_method(method, keys: nil, by_position: true)
  vect = keys ? get_sub_vector(keys, by_position: by_position) : self

  case method
  when Symbol then vect.send(method)
  when Proc then method.call(vect)
  else
    # Raise explicitly: a bare `raise` re-raises `$!` when an exception is
    # pending and otherwise reports only "unhandled exception".
    raise "Unsupported method #{method.inspect}: expected a Symbol or a Proc"
  end
end
|
|
131
|
+
alias apply_method_on_sub_vector apply_method
|
|
132
|
+
|
|
133
|
+
# The name of the DaruLite::Vector. String.
|
|
134
|
+
attr_reader :name
|
|
135
|
+
# The row index. Can be either DaruLite::Index or DaruLite::MultiIndex.
|
|
136
|
+
attr_reader :index
|
|
137
|
+
# The underlying dtype of the Vector. Can be :array.
|
|
138
|
+
attr_reader :dtype
|
|
139
|
+
attr_reader :nm_dtype
|
|
140
|
+
# An Array of the positions in the vector that are being treated as 'missing'.
|
|
141
|
+
attr_reader :missing_positions
|
|
142
|
+
|
|
143
|
+
deprecate :missing_positions, :indexes, 2016, 10
|
|
144
|
+
# Store a hash of labels for values. Supplementary only. Recommend using index
|
|
145
|
+
# for proper usage.
|
|
146
|
+
attr_accessor :labels
|
|
147
|
+
# Store vector data in an array
|
|
148
|
+
attr_reader :data
|
|
149
|
+
|
|
150
|
+
# Create a Vector object.
|
|
151
|
+
#
|
|
152
|
+
# == Arguments
|
|
153
|
+
#
|
|
154
|
+
# @param source[Array,Hash] - Supply elements in the form of an Array or a
|
|
155
|
+
# Hash. If Array, a numeric index will be created if not supplied in the
|
|
156
|
+
# options. Specifying more index elements than actual values in *source*
|
|
157
|
+
# will insert *nil* into the surplus index elements. When a Hash is specified,
|
|
158
|
+
# the keys of the Hash are taken as the index elements and the corresponding
|
|
159
|
+
# values as the values that populate the vector.
|
|
160
|
+
#
|
|
161
|
+
# == Options
|
|
162
|
+
#
|
|
163
|
+
# * +:name+ - Name of the vector
|
|
164
|
+
#
|
|
165
|
+
# * +:index+ - Index of the vector
|
|
166
|
+
#
|
|
167
|
+
# * +:dtype+ - The underlying data type. Can be :array.
|
|
168
|
+
# Default :array.
|
|
169
|
+
#
|
|
170
|
+
# * +:missing_values+ - An Array of the values that are to be treated as 'missing'.
|
|
171
|
+
# nil is the default missing value.
|
|
172
|
+
#
|
|
173
|
+
# == Usage
|
|
174
|
+
#
|
|
175
|
+
# vecarr = DaruLite::Vector.new [1,2,3,4], index: [:a, :e, :i, :o]
|
|
176
|
+
# vechsh = DaruLite::Vector.new({a: 1, e: 2, i: 3, o: 4})
|
|
177
|
+
# Dispatches to the category initializer when +opts[:type]+ is :category,
# and to the plain vector initializer otherwise.
def initialize(source, opts = {})
  unless opts[:type] == :category
    initialize_vector source, opts
    return
  end

  # Category behaviour is mixed into this instance only.
  extend DaruLite::Category
  initialize_category source, opts
end
|
|
187
|
+
|
|
188
|
+
# Get one or more elements with specified index or a range.
|
|
189
|
+
#
|
|
190
|
+
# == Usage
|
|
191
|
+
# # For vectors employing single layer Index
|
|
192
|
+
#
|
|
193
|
+
# v[:one, :two] # => DaruLite::Vector with indexes :one and :two
|
|
194
|
+
# v[:one] # => Single element
|
|
195
|
+
# v[:one..:three] # => DaruLite::Vector with indexes :one, :two and :three
|
|
196
|
+
#
|
|
197
|
+
# # For vectors employing hierarchical multi index
|
|
198
|
+
#
|
|
199
|
+
# @param input_indexes [Array<Object>] index labels or a range, resolved
#   through the index's +pos+
# @return [Object, DaruLite::Vector] the single element when +pos+ yields a
#   Numeric position, otherwise a new vector over the selected positions
def [](*input_indexes)
  positions = @index.pos(*input_indexes)
  return @data[positions] if positions.is_a? Numeric

  selected = positions.map { |loc| @data[loc] }
  DaruLite::Vector.new(
    selected,
    name: @name, index: @index.subset(*input_indexes), dtype: @dtype
  )
end
|
|
213
|
+
|
|
214
|
+
# Returns vector of values given positional values
|
|
215
|
+
# @param positions [Array<object>] positional values
|
|
216
|
+
# @return [object] vector
|
|
217
|
+
# @example
|
|
218
|
+
# dv = DaruLite::Vector.new 'a'..'e'
|
|
219
|
+
# dv.at 0, 1, 2
|
|
220
|
+
# # => #<DaruLite::Vector(3)>
|
|
221
|
+
# # 0 a
|
|
222
|
+
# # 1 b
|
|
223
|
+
# # 2 c
|
|
224
|
+
# Positions are coerced and validated first; an Integer result returns the
# element itself, anything else a new vector.
def at(*positions)
  # The raw arguments are kept because the new index is built from them.
  requested = positions
  positions = coerce_positions(*positions)
  validate_positions(*positions)
  return @data[positions] if positions.is_a? Integer

  DaruLite::Vector.new(
    positions.map { |pos| @data[pos] },
    index: @index.at(*requested), dtype: dtype
  )
end
|
|
237
|
+
|
|
238
|
+
# Change value at given positions
|
|
239
|
+
# @param positions [Array<object>] positional values
|
|
240
|
+
# @param [object] val value to assign
|
|
241
|
+
# @example
|
|
242
|
+
# dv = DaruLite::Vector.new 'a'..'e'
|
|
243
|
+
# dv.set_at [0, 1], 'x'
|
|
244
|
+
# dv
|
|
245
|
+
# # => #<DaruLite::Vector(5)>
|
|
246
|
+
# # 0 x
|
|
247
|
+
# # 1 x
|
|
248
|
+
# # 2 c
|
|
249
|
+
# # 3 d
|
|
250
|
+
# # 4 e
|
|
251
|
+
# Validates the positions, writes +val+ at each of them, then refreshes the
# position cache.
def set_at(positions, val)
  validate_positions(*positions)
  positions.each { |pos| @data[pos] = val }
  update_position_cache
end
|
|
256
|
+
|
|
257
|
+
# Just like in Hashes, you can specify the index label of the DaruLite::Vector
|
|
258
|
+
# and assign an element at that place in the DaruLite::Vector.
|
|
259
|
+
#
|
|
260
|
+
# == Usage
|
|
261
|
+
#
|
|
262
|
+
# v = DaruLite::Vector.new([1,2,3], index: [:a, :b, :c])
|
|
263
|
+
# v[:a] = 999
|
|
264
|
+
# #=>
|
|
265
|
+
# ##<DaruLite::Vector:90257920 @name = nil @size = 3 >
|
|
266
|
+
# # nil
|
|
267
|
+
# # a 999
|
|
268
|
+
# # b 2
|
|
269
|
+
# # c 3
|
|
270
|
+
# Assigns +val+ at the given index labels.
def []=(*indexes, val)
  # Assigning nil to a vector whose dtype is not :array casts it to :array first.
  needs_array_cast = val.nil? && dtype != :array
  cast(dtype: :array) if needs_array_cast

  guard_type_check(val)
  modify_vector(indexes, val)
  update_position_cache
end
|
|
279
|
+
|
|
280
|
+
# Two vectors are equal if they have the exact same index values corresponding
|
|
281
|
+
# with the exact same elements. Name is ignored.
|
|
282
|
+
# Compares two vectors by index and contents; the name is ignored.
#
# @param other [Object] object to compare with
# @return [Boolean] for a DaruLite::Vector, whether the indexes, the sizes and
#   every element/label pair match; any other object is delegated to +super+
def ==(other)
  case other
  when DaruLite::Vector
    return false unless @index == other.index && size == other.size

    # Materialise the other index once instead of on every iteration.
    other_labels = other.index.to_a
    each_with_index.with_index.all? do |(element, label), position|
      element == other.at(position) && label == other_labels[position]
    end
  else
    super
  end
end
|
|
293
|
+
|
|
294
|
+
# !@method eq
|
|
295
|
+
# Uses `==` and returns `true` for each **equal** entry
|
|
296
|
+
# @param [#==, DaruLite::Vector] If scalar object, compares it with each
|
|
297
|
+
# element in self. If DaruLite::Vector, compares elements with same indexes.
|
|
298
|
+
# @example (see #where)
|
|
299
|
+
# !@method not_eq
|
|
300
|
+
# Uses `!=` and returns `true` for each **unequal** entry
|
|
301
|
+
# @param [#!=, DaruLite::Vector] If scalar object, compares it with each
|
|
302
|
+
# element in self. If DaruLite::Vector, compares elements with same indexes.
|
|
303
|
+
# @example (see #where)
|
|
304
|
+
# !@method lt
|
|
305
|
+
# Uses `<` and returns `true` for each entry **less than** the supplied object
|
|
306
|
+
# @param [#<, DaruLite::Vector] If scalar object, compares it with each
|
|
307
|
+
# element in self. If DaruLite::Vector, compares elements with same indexes.
|
|
308
|
+
# @example (see #where)
|
|
309
|
+
# !@method lteq
|
|
310
|
+
# Uses `<=` and returns `true` for each entry **less than or equal to** the supplied object
|
|
311
|
+
# @param [#<=, DaruLite::Vector] If scalar object, compares it with each
|
|
312
|
+
# element in self. If DaruLite::Vector, compares elements with same indexes.
|
|
313
|
+
# @example (see #where)
|
|
314
|
+
# !@method mt
|
|
315
|
+
# Uses `>` and returns `true` for each entry **more than** the supplied object
|
|
316
|
+
# @param [#>, DaruLite::Vector] If scalar object, compares it with each
|
|
317
|
+
# element in self. If DaruLite::Vector, compares elements with same indexes.
|
|
318
|
+
# @example (see #where)
|
|
319
|
+
# !@method mteq
|
|
320
|
+
# Uses `>=` and returns `true` for each entry **more than or equal to** the supplied object
|
|
321
|
+
# @param [#>=, DaruLite::Vector] If scalar object, compares it with each
|
|
322
|
+
# element in self. If DaruLite::Vector, compares elements with same indexes.
|
|
323
|
+
# @example (see #where)
|
|
324
|
+
|
|
325
|
+
# Define the comparator methods with metaprogramming. See documentation
|
|
326
|
+
# written above for functionality of each method. Use these methods with the
|
|
327
|
+
# `where` method to obtain the corresponding Vector/DataFrame.
|
|
328
|
+
# Maps each comparator method name to the operator it applies. Every entry
# becomes an instance method; all but == and != are also aliased to the
# operator itself.
{ eq: :==, not_eq: :!=, lt: :<, lteq: :<=, mt: :>, mteq: :>= }.each do |name, op|
  define_method(name) do |other|
    query = DaruLite::Core::Query
    next query.apply_vector_operator(op, self, other) if other.is_a?(DaruLite::Vector)

    query.apply_scalar_operator(op, @data, other)
  end
  alias_method op, name unless %i[== !=].include?(op)
end
|
|
346
|
+
alias gt mt
|
|
347
|
+
alias gteq mteq
|
|
348
|
+
|
|
349
|
+
# Comparator for checking if any of the elements in *other* exist in self.
|
|
350
|
+
#
|
|
351
|
+
# @param [Array, DaruLite::Vector] other A collection which has elements that
|
|
352
|
+
# need to be checked for in self.
|
|
353
|
+
# @example Usage of `in`.
|
|
354
|
+
# vector = DaruLite::Vector.new([1,2,3,4,5])
|
|
355
|
+
# vector.where(vector.in([3,5]))
|
|
356
|
+
# #=>
|
|
357
|
+
# ##<DaruLite::Vector:82215960 @name = nil @size = 2 >
|
|
358
|
+
# # nil
|
|
359
|
+
# # 2 3
|
|
360
|
+
# # 4 5
|
|
361
|
+
# @param other [Array, DaruLite::Vector] values to look for
# @return [DaruLite::Core::Query::BoolArray] one flag per element, true when
#   the element is a member of +other+
def in(other)
  # Hash keys give constant-time membership checks; the values are unused.
  lookup = {}
  other.each { |candidate| lookup[candidate] = 0 }

  flags = @data.each_with_object([]) { |d, memo| memo << lookup.key?(d) }
  DaruLite::Core::Query::BoolArray.new(flags)
end
|
|
369
|
+
|
|
370
|
+
# Return a new vector based on the contents of a boolean array. Use with the
|
|
371
|
+
# comparator methods to obtain meaningful results. See this notebook for
|
|
372
|
+
# a good overview of using #where.
|
|
373
|
+
#
|
|
374
|
+
# @param bool_array [DaruLite::Core::Query::BoolArray, Array<TrueClass, FalseClass>] The
|
|
375
|
+
# collection containing the true or false values. Each element in the Vector
|
|
376
|
+
# corresponding to a `true` in the bool_array will be returned along with its
|
|
377
|
+
# index.
|
|
378
|
+
# @example Usage of #where.
|
|
379
|
+
# vector = DaruLite::Vector.new([2,4,5,51,5,16,2,5,3,2,1,5,2,5,2,1,56,234,6,21])
|
|
380
|
+
#
|
|
381
|
+
# # Simple logic statement passed to #where.
|
|
382
|
+
# vector.where(vector.eq(5).or(vector.eq(1)))
|
|
383
|
+
# # =>
|
|
384
|
+
# ##<DaruLite::Vector:77626210 @name = nil @size = 7 >
|
|
385
|
+
# # nil
|
|
386
|
+
# # 2 5
|
|
387
|
+
# # 4 5
|
|
388
|
+
# # 7 5
|
|
389
|
+
# # 10 1
|
|
390
|
+
# # 11 5
|
|
391
|
+
# # 13 5
|
|
392
|
+
# # 15 1
|
|
393
|
+
#
|
|
394
|
+
# # A somewhat more complex logic statement
|
|
395
|
+
# vector.where((vector.eq(5) | vector.lteq(1)) & vector.in([4,5,1]))
|
|
396
|
+
# #=>
|
|
397
|
+
# ##<DaruLite::Vector:81072310 @name = nil @size = 7 >
|
|
398
|
+
# # nil
|
|
399
|
+
# # 2 5
|
|
400
|
+
# # 4 5
|
|
401
|
+
# # 7 5
|
|
402
|
+
# # 10 1
|
|
403
|
+
# # 11 5
|
|
404
|
+
# # 13 5
|
|
405
|
+
# # 15 1
|
|
406
|
+
# Delegates to DaruLite::Core::Query.vector_where with this vector.
def where(bool_array)
  DaruLite::Core::Query.vector_where(self, bool_array)
end
|
|
409
|
+
|
|
410
|
+
# Return a new vector based on the contents of a boolean array and &block.
|
|
411
|
+
#
|
|
412
|
+
# @param bool_array [DaruLite::Core::Query::BoolArray, Array<TrueClass, FalseClass>, &block] The
|
|
413
|
+
# collection containing the true or false values. Each element in the Vector
|
|
414
|
+
# corresponding to a `true` in the bool_array will be returned along with its
|
|
415
|
+
# index. The &block may contain manipulative functions for the Vector elements.
|
|
416
|
+
#
|
|
417
|
+
# @return [DaruLite::Vector]
|
|
418
|
+
#
|
|
419
|
+
# @example Usage of #apply_where.
|
|
420
|
+
# dv = DaruLite::Vector.new ['3 days', '5 weeks', '2 weeks']
|
|
421
|
+
# dv = dv.apply_where(dv.match /weeks/) { |x| "#{x.split.first.to_i * 7} days" }
|
|
422
|
+
# # =>
|
|
423
|
+
# ##<DaruLite::Vector(3)>
|
|
424
|
+
# # 0 3 days
|
|
425
|
+
# # 1 35 days
|
|
426
|
+
# # 2 14 days
|
|
427
|
+
# Delegates to DaruLite::Core::Query.vector_apply_where, forwarding the block.
def apply_where(bool_array, &block)
  DaruLite::Core::Query.vector_apply_where(self, bool_array, &block)
end
|
|
430
|
+
|
|
431
|
+
# @param q [Integer] upper bound of the selection (defaults to 10)
# @return [Object] result of indexing the vector with the range 0..(q - 1)
def head(q = 10)
  final_position = q - 1
  self[0..final_position]
end
|
|
434
|
+
|
|
435
|
+
# @param q [Integer] number of trailing positions to select (defaults to 10)
# @return [Object] result of indexing the vector from the start position
#   (clamped at 0) through size - 1
def tail(q = 10)
  first_position = size - q
  first_position = 0 if first_position.negative?
  self[first_position..(size - 1)]
end
|
|
439
|
+
|
|
440
|
+
# @param q [Integer] number of trailing elements (defaults to 1)
# @return [Object] whatever +tail+ returns for +q+
def last(q = 1)
  # The Enumerable mixin does not provide #last, so delegate to #tail.
  tail(q)
end
|
|
444
|
+
|
|
445
|
+
# @return [Boolean] whether the index reports itself as empty
def empty?
  @index.empty?
end
|
|
448
|
+
|
|
449
|
+
# @return [Boolean] true when #type reports :numeric
def numeric?
  detected = type
  detected == :numeric
end
|
|
452
|
+
|
|
453
|
+
# @return [Boolean] true when #type reports :object
def object?
  detected = type
  detected == :object
end
|
|
456
|
+
|
|
457
|
+
# Reports whether missing data is present in the Vector.
|
|
458
|
+
# @return [Boolean] true when #indexes finds at least one entry for the
#   values listed in DaruLite::MISSING_VALUES
def has_missing_data?
  missing_labels = indexes(*DaruLite::MISSING_VALUES)
  !missing_labels.empty?
end
|
|
461
|
+
alias flawed? has_missing_data?
|
|
462
|
+
deprecate :has_missing_data?, :include_values?, 2016, 10
|
|
463
|
+
deprecate :flawed?, :include_values?, 2016, 10
|
|
464
|
+
|
|
465
|
+
# Check if any one of mentioned values occur in the vector
|
|
466
|
+
# @param values [Array] values to check for
|
|
467
|
+
# @return [true, false] returns true if any one of specified values
|
|
468
|
+
# occur in the vector
|
|
469
|
+
# @example
|
|
470
|
+
# dv = DaruLite::Vector.new [1, 2, 3, 4, nil]
|
|
471
|
+
# dv.include_values? nil, Float::NAN
|
|
472
|
+
# # => true
|
|
473
|
+
# Stops at the first value that +include_with_nan?+ finds in the data.
def include_values?(*values)
  values.each do |candidate|
    return true if include_with_nan?(@data, candidate)
  end
  false
end
|
|
476
|
+
|
|
477
|
+
# @note Do not use it to check for Float::NAN as
|
|
478
|
+
# Float::NAN == Float::NAN is false
|
|
479
|
+
# Return vector of booleans with value at ith position is either
|
|
480
|
+
# true or false depending upon whether value at position i is equal to
|
|
481
|
+
# any of the values passed in the argument or not
|
|
482
|
+
# @param values [Array] values to equate with
|
|
483
|
+
# @return [DaruLite::Vector] vector of boolean values
|
|
484
|
+
# @example
|
|
485
|
+
# dv = DaruLite::Vector.new [1, 2, 3, 2, 1]
|
|
486
|
+
# dv.is_values 1, 2
|
|
487
|
+
# # => #<DaruLite::Vector(5)>
|
|
488
|
+
# # 0 true
|
|
489
|
+
# # 1 true
|
|
490
|
+
# # 2 false
|
|
491
|
+
# # 3 true
|
|
492
|
+
# # 4 true
|
|
493
|
+
# Builds one +eq+ mask per value, combines them with +|+, and wraps the
# result in a new vector.
def is_values(*values)
  masks = values.map { |v| eq(v) }
  DaruLite::Vector.new(masks.reduce(:|))
end
|
|
496
|
+
|
|
497
|
+
# Append an element to the vector by specifying the element and index
|
|
498
|
+
# @param element [Object] value to append
# @param index [Object] index label for the new element
# @raise [IndexError] when the index already includes +index+
def concat(element, index)
  raise IndexError, 'Expected new unique index' if @index.include? index

  @index |= [index]
  # Look the new label up in the extended index to find its storage position.
  position = @index[index]
  @data[position] = element

  update_position_cache
end
|
|
506
|
+
alias push concat
|
|
507
|
+
alias << concat
|
|
508
|
+
|
|
509
|
+
# Cast a vector to a new data type.
|
|
510
|
+
#
|
|
511
|
+
# == Options
|
|
512
|
+
#
|
|
513
|
+
# * +:dtype+ - :array for Ruby Array..
|
|
514
|
+
# @param opts [Hash] only +:dtype+ is read, and only :array is accepted
# @raise [ArgumentError] for any other dtype
def cast(opts = {})
  target = opts[:dtype]
  raise ArgumentError, "Unsupported dtype #{opts[:dtype]}" if target != :array
  return if @dtype == target

  @data = cast_vector_to(target)
end
|
|
520
|
+
|
|
521
|
+
# Delete an element by value
|
|
522
|
+
# Looks up the index label of +element+ and deletes the entry at that label.
def delete(element)
  label = index_of(element)
  delete_at(label)
end
|
|
525
|
+
|
|
526
|
+
# Delete element by index
|
|
527
|
+
# Removes the element stored under the label +index+ and rebuilds the index
# without that label.
def delete_at(index)
  position = @index[index]
  @data.delete_at position

  remaining_labels = @index.to_a - [index]
  @index = DaruLite::Index.new(remaining_labels)

  update_position_cache
end
|
|
533
|
+
|
|
534
|
+
# The type of data contained in the vector. Can be :object.
|
|
535
|
+
#
|
|
536
|
+
# Running through the data to figure out the kind of data is delayed to the
|
|
537
|
+
# last possible moment.
|
|
538
|
+
# Reports :numeric when every element is nil or a Numeric, :object otherwise.
# The answer is cached and recomputed only when no type is known yet or the
# contents may have changed.
def type
  return @type unless @type.nil? || @possibly_changed_type

  detected = :numeric
  each do |element|
    unless element.nil? || element.is_a?(Numeric)
      detected = :object
      break
    end
  end

  @type = detected
  @possibly_changed_type = false
  @type
end
|
|
552
|
+
|
|
553
|
+
# Tells if vector is categorical or not.
|
|
554
|
+
# @return [true, false] true if vector is of type category, false otherwise
|
|
555
|
+
# @example
|
|
556
|
+
# dv = DaruLite::Vector.new [1, 2, 3], type: :category
|
|
557
|
+
# dv.category?
|
|
558
|
+
# # => true
|
|
559
|
+
# @return [Boolean] true when #type reports :category
def category?
  detected = type
  detected == :category
end
|
|
562
|
+
|
|
563
|
+
# Get index of element
|
|
564
|
+
# @param element [Object] value to look for
# @return [Object] result of the index's +key+ lookup for the first matching position
def index_of(element)
  position =
    if dtype == :array
      # For :array data the match uses eql?, so 1 and 1.0 are distinct.
      @data.index { |x| x.eql? element }
    else
      @data.index(element)
    end
  @index.key(position)
end
|
|
570
|
+
|
|
571
|
+
# Keep only unique elements of the vector along with their indexes.
|
|
572
|
+
# @return [DaruLite::Vector] new vector of the distinct values, each indexed
#   by the label that +index_of+ reports for it
def uniq
  distinct = @data.uniq
  labels = distinct.map { |element| index_of(element) }

  DaruLite::Vector.new(distinct, name: @name, index: labels, dtype: @dtype)
end
|
|
578
|
+
|
|
579
|
+
# Forwards to +any?+ on the collection held inside the data wrapper.
def any?(&block)
  underlying = @data.data
  underlying.any?(&block)
end
|
|
582
|
+
|
|
583
|
+
# Forwards to +all?+ on the collection held inside the data wrapper.
def all?(&block)
  underlying = @data.data
  underlying.all?(&block)
end
|
|
586
|
+
|
|
587
|
+
# Sorts a vector according to its values. If a block is specified, the contents
|
|
588
|
+
# will be evaluated and data will be swapped whenever the block evaluates
|
|
589
|
+
# to *true*. Defaults to ascending order sorting. Any missing values will be
|
|
590
|
+
# put at the end of the vector. Preserves indexing. Default sort algorithm is
|
|
591
|
+
# quick sort.
|
|
592
|
+
#
|
|
593
|
+
# == Options
|
|
594
|
+
#
|
|
595
|
+
# * +:ascending+ - if false, will sort in descending order. Defaults to true.
|
|
596
|
+
#
|
|
597
|
+
# * +:type+ - Specify the sorting algorithm. Only supports quick_sort for now.
|
|
598
|
+
# == Usage
|
|
599
|
+
#
|
|
600
|
+
# v = DaruLite::Vector.new ["My first guitar", "jazz", "guitar"]
|
|
601
|
+
# # Say you want to sort these strings by length.
|
|
602
|
+
# v.sort(ascending: false) { |a,b| a.length <=> b.length }
|
|
603
|
+
# @param opts [Hash] sorting options; +:ascending+ defaults to true
# @return [DaruLite::Vector] new vector with reordered values and index
def sort(opts = {}, &block)
  settings = { ascending: true }.merge(opts)

  # resort_index returns [value, position] pairs; transpose splits them
  # into parallel arrays.
  sorted_values, sorted_positions =
    resort_index(@data.each_with_index, settings, &block).transpose

  DaruLite::Vector.new(
    sorted_values,
    index: @index.reorder(sorted_positions), name: @name, dtype: @dtype
  )
end
|
|
613
|
+
|
|
614
|
+
# Sorts the vector according to its `Index` values. Defaults to ascending
|
|
615
|
+
# order sorting.
|
|
616
|
+
#
|
|
617
|
+
# @param [Hash] opts the options for sort_by_index method.
|
|
618
|
+
# @option opts [Boolean] :ascending false, will sort `index` in
|
|
619
|
+
# descending order.
|
|
620
|
+
#
|
|
621
|
+
# @return [Vector] new sorted `Vector` according to the index values.
|
|
622
|
+
#
|
|
623
|
+
# @example
|
|
624
|
+
#
|
|
625
|
+
# dv = DaruLite::Vector.new [11, 13, 12], index: [23, 21, 22]
|
|
626
|
+
# # Say you want to sort index in ascending order
|
|
627
|
+
# dv.sort_by_index(ascending: true)
|
|
628
|
+
# #=> DaruLite::Vector.new [13, 12, 11], index: [21, 22, 23]
|
|
629
|
+
# # Say you want to sort index in descending order
|
|
630
|
+
# dv.sort_by_index(ascending: false)
|
|
631
|
+
# #=> DaruLite::Vector.new [11, 12, 13], index: [23, 22, 21]
|
|
632
|
+
# Sorts the index entries through +resort_index+ and reorders the vector by
# the resulting positions.
def sort_by_index(opts = {})
  settings = { ascending: true }.merge(opts)
  sorted_pairs = resort_index(@index.each_with_index, settings)
  _labels, new_order = sorted_pairs.transpose

  reorder(new_order)
end
|
|
638
|
+
|
|
639
|
+
# Compares two [value, position] pairs. A nil value orders before any
# non-nil value, and two nil values are ordered by position.
DEFAULT_SORTER = lambda { |(lv, li), (rv, ri)|
  case [lv.nil?, rv.nil?]
  when [true, true] then li <=> ri
  when [true, false] then -1
  when [false, true] then 1
  else lv <=> rv
  end
}
|
|
650
|
+
|
|
651
|
+
# Just sort the data and get an Array in return using Enumerable#sort.
|
|
652
|
+
# Non-destructive.
|
|
653
|
+
# :nocov:
|
|
654
|
+
# @yield [a, b] optional comparison block handed to Array#sort
# @return [Array] sorted copy of the data; the vector itself is untouched
def sorted_data(&block)
  values = @data.to_a
  values.sort(&block)
end
|
|
657
|
+
# :nocov:
|
|
658
|
+
|
|
659
|
+
# Like map, but returns a DaruLite::Vector with the returned values.
|
|
660
|
+
# Non-destructive counterpart of #recode!: works on a duplicate of the vector.
def recode(dt = nil, &block)
  if block
    dup.recode!(dt, &block)
  else
    to_enum(:recode, dt)
  end
end
|
|
665
|
+
|
|
666
|
+
# Destructive version of recode.
|
|
667
|
+
# Maps every element in place, then re-casts the data to +dt+, or to the
# current dtype when +dt+ is nil.
def recode!(dt = nil, &block)
  return to_enum(:recode!, dt) if block.nil?

  # NOTE(review): the value returned by the trailing `.data` is discarded — confirm it is not needed.
  @data.map!(&block).data
  target_dtype = dt || @dtype
  @data = cast_vector_to(target_dtype)
  self
end
|
|
674
|
+
|
|
675
|
+
# Delete an element if block returns true. Destructive.
#
# @yield [element] each stored value
# @return [DaruLite::Vector, Enumerator] self, or an Enumerator when no
#   block is given
def delete_if
  return to_enum(:delete_if) unless block_given?

  kept = each_with_index.reject { |element, _label| yield(element) }

  # [].transpose is [], which would leave both halves nil. A nil source makes
  # cast_vector_to fall back to the current data, so deleting every element
  # would silently delete nothing. Use explicit empty arrays instead.
  keep_e, keep_i = kept.empty? ? [[], []] : kept.transpose

  @data = cast_vector_to @dtype, keep_e
  @index = DaruLite::Index.new(keep_i)

  update_position_cache

  self
end
|
|
688
|
+
|
|
689
|
+
# Keep an element if block returns true. Destructive.
# Implemented as #delete_if with the block's answer inverted.
def keep_if
  if block_given?
    delete_if { |element| !yield(element) }
  else
    to_enum(:keep_if)
  end
end
|
|
695
|
+
|
|
696
|
+
# Reports all values that don't comply with a condition.
# Returns a hash mapping the position of each non-complying value to the
# value itself.
def verify
  size.times.each_with_object({}) do |position, invalid|
    value = @data[position]
    invalid[position] = value unless yield(value)
  end
end
|
|
704
|
+
|
|
705
|
+
# Return an Array with the data split by a separator.
|
|
706
|
+
# a=DaruLite::Vector.new(["a,b","c,d","a,b","d"])
|
|
707
|
+
# a.splitted
|
|
708
|
+
# =>
|
|
709
|
+
# [["a","b"],["c","d"],["a","b"],["d"]]
|
|
710
|
+
def splitted(sep = ',')
  @data.map do |element|
    # nil stays nil so missing values remain recognisable downstream.
    next nil if element.nil?

    # Values that cannot be split are wrapped in a one-element array.
    element.respond_to?(:split) ? element.split(sep) : [element]
  end
end
|
|
721
|
+
|
|
722
|
+
# Returns a hash of Vectors, defined by the different values
|
|
723
|
+
# defined on the fields
|
|
724
|
+
# Example:
|
|
725
|
+
#
|
|
726
|
+
# a=DaruLite::Vector.new(["a,b","c,d","a,b"])
|
|
727
|
+
# a.split_by_separator
|
|
728
|
+
# => {"a"=>#<DaruLite::Vector:0x7f2dbcc09d88
|
|
729
|
+
# @data=[1, 0, 1]>,
|
|
730
|
+
# "b"=>#<DaruLite::Vector:0x7f2dbcc09c48
|
|
731
|
+
# @data=[1, 0, 1]>,
|
|
732
|
+
# "c"=>#<DaruLite::Vector:0x7f2dbcc09b08
|
|
733
|
+
# @data=[0, 1, 0]>,
# "d"=>#<DaruLite::Vector
# @data=[0, 1, 0]>}
|
|
734
|
+
#
|
|
735
|
+
def split_by_separator(sep = ',')
  pieces = splitted(sep)

  # Every distinct non-nil token becomes a key of the result.
  tokens = pieces.flatten.uniq.compact

  tokens.to_h do |token|
    # One indicator per original element: 1, 0, or nil (see #split_value).
    indicators = pieces.map { |parts| split_value(token, parts) }
    [token, DaruLite::Vector.new(indicators)]
  end
end
|
|
745
|
+
|
|
746
|
+
# Counts how often each token produced by #split_by_separator occurs.
#
# @param sep [String] separator forwarded to #split_by_separator
# @return [Hash] token => sum of its indicator values (nil counts as 0
#   through #to_i)
def split_by_separator_freq(sep = ',')
  split_by_separator(sep).transform_values { |indicators| indicators.sum(&:to_i) }
end
|
|
751
|
+
|
|
752
|
+
# Replaces the index with the default positional one (0, 1, ..., size - 1).
#
# @return [DaruLite::Vector] self
def reset_index!
  positional_labels = Array.new(size) { |position| position }
  @index = DaruLite::Index.new(positional_labels)
  self
end
|
|
756
|
+
|
|
757
|
+
# Replace all nils in the vector with the value passed as an argument. Destructive.
|
|
758
|
+
# See #replace_nils for non-destructive version
|
|
759
|
+
#
|
|
760
|
+
# == Arguments
|
|
761
|
+
#
|
|
762
|
+
# * +replacement+ - The value which should replace all nils
|
|
763
|
+
def replace_nils!(replacement)
  # Index labels of every entry that matches one of DaruLite::MISSING_VALUES.
  missing_labels = indexes(*DaruLite::MISSING_VALUES)
  missing_labels.each { |label| self[label] = replacement }

  self
end
|
|
770
|
+
|
|
771
|
+
# Rolling fillna
|
|
772
|
+
# replace all Float::NAN and NIL values with the preceding or following value
|
|
773
|
+
#
|
|
774
|
+
# @param direction [Symbol] (:forward, :backward) whether replacement value is preceding or following
|
|
775
|
+
#
|
|
776
|
+
# @example
|
|
777
|
+
# dv = DaruLite::Vector.new([1, 2, 1, 4, nil, Float::NAN, 3, nil, Float::NAN])
|
|
778
|
+
#
|
|
779
|
+
# 2.3.3 :068 > dv.rolling_fillna(:forward)
|
|
780
|
+
# => #<DaruLite::Vector(9)>
|
|
781
|
+
# 0 1
|
|
782
|
+
# 1 2
|
|
783
|
+
# 2 1
|
|
784
|
+
# 3 4
|
|
785
|
+
# 4 4
|
|
786
|
+
# 5 4
|
|
787
|
+
# 6 3
|
|
788
|
+
# 7 3
|
|
789
|
+
# 8 3
|
|
790
|
+
#
|
|
791
|
+
def rolling_fillna!(direction = :forward)
  labels = direction == :forward ? index : index.reverse_each

  # Missing values met before the first valid one are filled with 0.
  carried = 0
  labels.each do |label|
    current = self[label]
    if valid_value?(current)
      carried = current
    else
      self[label] = carried
    end
  end
  self
end
|
|
803
|
+
|
|
804
|
+
# Non-destructive version of rolling_fillna!
# Fills a duplicate and returns it; the receiver is not modified.
def rolling_fillna(direction = :forward)
  duplicate = dup
  duplicate.rolling_fillna!(direction)
end
|
|
808
|
+
|
|
809
|
+
# Lags the series by `k` periods.
|
|
810
|
+
#
|
|
811
|
+
# Lags the series by `k` periods, "shifting" data and inserting `nil`s
|
|
812
|
+
# from beginning or end of a vector, while preserving original vector's
|
|
813
|
+
# size.
|
|
814
|
+
#
|
|
815
|
+
# `k` can be positive or negative integer. If `k` is positive, `nil`s
|
|
816
|
+
# are inserted at the beginning of the vector, otherwise they are
|
|
817
|
+
# inserted at the end.
|
|
818
|
+
#
|
|
819
|
+
# @param [Integer] k "shift" the series by `k` periods. `k` can be
|
|
820
|
+
# positive or negative. (default = 1)
|
|
821
|
+
#
|
|
822
|
+
# @return [DaruLite::Vector] a new vector with "shifted" initial values
|
|
823
|
+
# and `nil` values inserted. The return vector is the same length
|
|
824
|
+
# as the original vector.
|
|
825
|
+
#
|
|
826
|
+
# @example Lag a vector with different periods `k`
|
|
827
|
+
#
|
|
828
|
+
# ts = DaruLite::Vector.new(1..5)
|
|
829
|
+
# # => [1, 2, 3, 4, 5]
|
|
830
|
+
#
|
|
831
|
+
# ts.lag # => [nil, 1, 2, 3, 4]
|
|
832
|
+
# ts.lag(1) # => [nil, 1, 2, 3, 4]
|
|
833
|
+
# ts.lag(2) # => [nil, nil, 1, 2, 3]
|
|
834
|
+
# ts.lag(-1) # => [2, 3, 4, 5, nil]
|
|
835
|
+
#
|
|
836
|
+
def lag(k = 1)
  case k
  when 0
    dup
  when 1...size
    # Positive lag: pad the front; #copy trims the result back to size.
    padding = [nil] * k
    copy(padding + data.to_a)
  when -size..-1
    # Negative lag: drop the first |k| values; #copy pads the tail with nils.
    remaining = data.to_a[k.abs...size]
    copy(remaining)
  else
    # |k| is at least the vector size (or k is not comparable): all nils.
    copy([])
  end
end
|
|
847
|
+
|
|
848
|
+
# Builds a DataFrame with two vectors: :index holds the index labels and
# :values holds the data.
#
# @return [DaruLite::DataFrame]
def detach_index
  labels = @index.to_a
  elements = @data.to_a
  DaruLite::DataFrame.new(index: labels, values: elements)
end
|
|
854
|
+
|
|
855
|
+
# Non-destructive version of #replace_nils!
# Works on a duplicate and returns it.
def replace_nils(replacement)
  duplicate = dup
  duplicate.replace_nils!(replacement)
end
|
|
859
|
+
|
|
860
|
+
# Number of non-missing elements, i.e. the size minus the number of entries
# matching DaruLite::MISSING_VALUES.
def n_valid
  missing_labels = indexes(*DaruLite::MISSING_VALUES)
  size - missing_labels.size
end
|
|
864
|
+
deprecate :n_valid, :count_values, 2016, 10
|
|
865
|
+
|
|
866
|
+
# Count the number of values specified
|
|
867
|
+
# @param values [Array] values to count for
|
|
868
|
+
# @return [Integer] the number of times the values mentioned occurs
|
|
869
|
+
# @example
|
|
870
|
+
# dv = DaruLite::Vector.new [1, 2, 1, 2, 3, 4, nil, nil]
|
|
871
|
+
# dv.count_values nil
|
|
872
|
+
# # => 2
|
|
873
|
+
def count_values(*values)
  # One matching position per occurrence, so the count is the list length.
  matching_positions = positions(*values)
  matching_positions.size
end
|
|
876
|
+
|
|
877
|
+
# Returns *true* if the given label exists in the index.
#
# @param index [Object] index label to look up
def has_index?(index)
  @index.include?(index)
end
|
|
881
|
+
|
|
882
|
+
# @param keys [Array] can be positions (if by_position is true) or indexes (if by_position is false)
|
|
883
|
+
# @return [DaruLite::Vector]
|
|
884
|
+
def get_sub_vector(keys, by_position: true)
  return DaruLite::Vector.new([]) if keys == []

  # Index labels are translated to positions first.
  lookup = by_position ? keys : @index.pos(*keys)
  selected = at(*lookup)

  # #at may hand back a bare element rather than a Vector; wrap it so the
  # caller always receives a Vector.
  selected.is_a?(DaruLite::Vector) ? selected : DaruLite::Vector.new([selected])
end
|
|
894
|
+
|
|
895
|
+
# @return [DaruLite::DataFrame] the vector as a single-vector dataframe,
#   keyed by the vector's name and sharing its index
def to_df
  columns = { @name => @data }
  DaruLite::DataFrame.new(columns, name: @name, index: @index)
end
|
|
899
|
+
|
|
900
|
+
# Convert Vector to a horizontal or vertical Ruby Matrix.
|
|
901
|
+
#
|
|
902
|
+
# == Arguments
|
|
903
|
+
#
|
|
904
|
+
# * +axis+ - Specify whether you want a *:horizontal* or a *:vertical* matrix.
|
|
905
|
+
def to_matrix(axis = :horizontal)
  unless %i[horizontal vertical].include?(axis)
    raise ArgumentError, "axis should be either :horizontal or :vertical, not #{axis}"
  end

  # Horizontal: a single row. Vertical: a single column.
  axis == :horizontal ? Matrix[to_a] : Matrix.columns([to_a])
end
|
|
915
|
+
|
|
916
|
+
# Convert to hash (explicit). Hash keys are indexes and values are the
# corresponding elements.
def to_h
  @index.to_h do |label|
    [label, self[label]]
  end
end
|
|
920
|
+
|
|
921
|
+
# Return the stored values as an Array (delegates to the data wrapper).
def to_a
  elements = @data.to_a
  elements
end
|
|
925
|
+
|
|
926
|
+
# Convert the hash from to_h to json.
#
# Generator arguments (such as the state object the JSON library passes when
# this vector is nested in another structure or pretty-printed) are forwarded
# instead of being dropped, so formatting options are honoured.
#
# @param args [Array] optional arguments for Hash#to_json
# @return [String] JSON document
def to_json(*args)
  to_h.to_json(*args)
end
|
|
930
|
+
|
|
931
|
+
# Convert to html for iruby.
#
# The template is rendered against this method's binding, so the local names
# below (table_thead, table_tbody, threshold, path) are kept as they are in
# case the template refers to them.
def to_html(threshold = 30)
  table_thead = to_html_thead
  table_tbody = to_html_tbody(threshold)
  template_name =
    index.is_a?(MultiIndex) ? 'iruby/templates/vector_mi.html.erb' : 'iruby/templates/vector.html.erb'
  path = File.expand_path(template_name, __dir__)
  ERB.new(File.read(path).strip).result(binding)
end
|
|
942
|
+
|
|
943
|
+
# Renders the table header template: the MultiIndex variant when the index
# is a MultiIndex, the plain one otherwise.
#
# @return [String] rendered template
def to_html_thead
  template_name =
    index.is_a?(MultiIndex) ? 'iruby/templates/vector_mi_thead.html.erb' : 'iruby/templates/vector_thead.html.erb'
  table_thead_path = File.expand_path(template_name, __dir__)
  ERB.new(File.read(table_thead_path).strip).result(binding)
end
|
|
952
|
+
|
|
953
|
+
# Renders the table body template: the MultiIndex variant when the index is
# a MultiIndex, the plain one otherwise.
#
# @param threshold [Integer] exposed to the template through the binding
# @return [String] rendered template
def to_html_tbody(threshold = 30)
  template_name =
    index.is_a?(MultiIndex) ? 'iruby/templates/vector_mi_tbody.html.erb' : 'iruby/templates/vector_tbody.html.erb'
  table_tbody_path = File.expand_path(template_name, __dir__)
  ERB.new(File.read(table_tbody_path).strip).result(binding)
end
|
|
962
|
+
|
|
963
|
+
# Short one-line description: class, optional name, size, and a :category
# marker for category vectors.
def to_s
  name_part = @name ? ": #{@name}" : ''
  category_part = category? ? ':category' : ''
  "#<#{self.class}#{name_part}(#{size})#{category_part}>"
end
|
|
966
|
+
|
|
967
|
+
# Create a summary of the Vector
|
|
968
|
+
# @param indent_level [Integer] indent level
|
|
969
|
+
# @return [String] String containing the summary of the Vector
|
|
970
|
+
# @example
|
|
971
|
+
# dv = DaruLite::Vector.new [1, 2, 3]
|
|
972
|
+
# puts dv.summary
|
|
973
|
+
#
|
|
974
|
+
# # =
|
|
975
|
+
# # n :3
|
|
976
|
+
# # non-missing:3
|
|
977
|
+
# # median: 2
|
|
978
|
+
# # mean: 2.0000
|
|
979
|
+
# # std.dev.: 1.0000
|
|
980
|
+
# # std.err.: 0.5774
|
|
981
|
+
# # skew: 0.0000
|
|
982
|
+
# # kurtosis: -2.3333
|
|
983
|
+
def summary(indent_level = 0)
  missing_count = count_values(*DaruLite::MISSING_VALUES)
  heading = "#{' =' * indent_level}= #{name}"
  report = heading + "\n n :#{size}" + "\n non-missing:#{size - missing_count}"

  # Only object and numeric vectors have a type-specific section.
  kind = type
  if kind == :object
    report << object_summary
  elsif kind == :numeric
    report << numeric_summary
  end

  # Indent every line after the first by indent_level spaces.
  report.split("\n").join("\n#{' ' * indent_level}")
end
|
|
996
|
+
|
|
997
|
+
# Displays summary for an object type Vector
|
|
998
|
+
# @return [String] String containing object vector summary
|
|
999
|
+
def object_summary
  missing_count = count_values(*DaruLite::MISSING_VALUES)
  header = "\n factors: #{factors.to_a.join(',')}" \
           "\n mode: #{mode.to_a.join(',')}" \
           "\n Distribution\n"

  # NOTE(review): the percentage divides each sorted frequency entry by the
  # number of *missing* values (or uses 1 when there are none) - confirm this
  # is the intended denominator.
  rows = frequencies.sort.each_with_index.map do |entry, position|
    share = missing_count.zero? ? 1 : entry.quo(missing_count)
    [position, entry, format('%0.2f%%', (share * 100))]
  end

  header + Formatters::Table.format(rows)
end
|
|
1011
|
+
|
|
1012
|
+
# Displays summary for an numeric type Vector
|
|
1013
|
+
# @return [String] String containing numeric vector summary
|
|
1014
|
+
def numeric_summary
  report = "\n median: #{median}" + format("\n mean: %0.4f", mean)

  # Dispersion lines are only printed when a standard deviation is available.
  report << format("\n std.dev.: %0.4f", sd) << format("\n std.err.: %0.4f", se) if sd

  # Shape statistics are only printed when no value is missing.
  if count_values(*DaruLite::MISSING_VALUES).zero?
    report << format("\n skew: %0.4f", skew) << format("\n kurtosis: %0.4f", kurtosis)
  end

  report
end
|
|
1028
|
+
|
|
1029
|
+
# Overrides the default inspect for pretty printing in irb.
#
# @param spacing [Integer] forwarded to Formatters::Table.format
# @param threshold [Integer] forwarded to Formatters::Table.format
# @return [String] header line followed by the formatted table
def inspect(spacing = 20, threshold = 15)
  labels = index.is_a?(MultiIndex) ? index.sparse_tuples : index.to_a
  header = "#<#{self.class}(#{size})#{':category' if category?}>\n"

  # One single-cell row per value, produced lazily.
  rows = to_a.lazy.map { |value| [value] }
  column_headers = @name ? [@name] : @name

  header + Formatters::Table.format(
    rows,
    headers: column_headers,
    row_headers: labels,
    threshold: threshold,
    spacing: spacing
  )
end
|
|
1042
|
+
|
|
1043
|
+
# Sets new index for vector. Preserves index->value correspondence.
|
|
1044
|
+
# Sets nil for new index keys absent from original index.
|
|
1045
|
+
# @note Unlike #reorder! which takes positions as input it takes
|
|
1046
|
+
# index as an input to reorder the vector
|
|
1047
|
+
# @param [DaruLite::Index, DaruLite::MultiIndex] new_index new index to order with
|
|
1048
|
+
# @return [DaruLite::Vector] vector reindexed with new index
|
|
1049
|
+
def reindex!(new_index)
  remapped = []
  each_with_index do |element, label|
    next unless new_index.include?(label)

    # Place the value at the position its label has in the new index.
    remapped[new_index[label]] = element
  end

  # Pad the tail so the data is as long as the new index.
  padding = new_index.size - remapped.size
  remapped.fill(nil, remapped.size, padding)

  @data = cast_vector_to @dtype, remapped
  @index = new_index

  update_position_cache

  self
end
|
|
1063
|
+
|
|
1064
|
+
# Reorder the vector with given positions
|
|
1065
|
+
# @note Unlike #reindex! which takes index as input, it takes
|
|
1066
|
+
# positions as an input to reorder the vector
|
|
1067
|
+
# @param [Array] order the order to reorder the vector with
|
|
1068
|
+
# @return reordered vector
|
|
1069
|
+
# @example
|
|
1070
|
+
# dv = DaruLite::Vector.new [3, 2, 1], index: ['c', 'b', 'a']
|
|
1071
|
+
# dv.reorder! [2, 1, 0]
|
|
1072
|
+
# # => #<DaruLite::Vector(3)>
|
|
1073
|
+
# # a 1
|
|
1074
|
+
# # b 2
|
|
1075
|
+
# # c 3
|
|
1076
|
+
def reorder!(order)
  @index = @index.reorder(order)

  # Pick the data in the requested positional order.
  reordered = order.map { |position| @data[position] }
  @data = cast_vector_to(@dtype, reordered, @nm_dtype)

  update_position_cache
  self
end
|
|
1083
|
+
|
|
1084
|
+
# Non-destructive version of #reorder!
# Reorders a duplicate and returns it.
def reorder(order)
  duplicate = dup
  duplicate.reorder!(order)
end
|
|
1088
|
+
|
|
1089
|
+
# Create a new vector with a different index, and preserve the indexing of
# current elements. Non-destructive counterpart of #reindex!.
def reindex(new_index)
  duplicate = dup
  duplicate.reindex!(new_index)
end
|
|
1094
|
+
|
|
1095
|
+
# Assigns a new index after coercing the argument with Index.coerce.
#
# @param idx [Object] anything Index.coerce accepts
# @raise [ArgumentError] if the coerced index differs in size from the
#   vector, or is not a DaruLite::Index
def index=(idx)
  coerced = Index.coerce(idx)

  unless coerced.size == size
    raise ArgumentError, "Size of supplied index #{coerced.size} does not match size of Vector"
  end
  unless coerced.is_a?(DaruLite::Index)
    raise ArgumentError, 'Can only assign type Index and its subclasses.'
  end

  @index = coerced
  self
end
|
|
1104
|
+
|
|
1105
|
+
# Give the vector a new name
|
|
1106
|
+
#
|
|
1107
|
+
# @param new_name [Symbol] The new name.
|
|
1108
|
+
def rename(new_name)
  # Returning self lets calls be chained.
  @name = new_name
  self
end
|
|
1112
|
+
|
|
1113
|
+
alias name= rename
|
|
1114
|
+
|
|
1115
|
+
# Duplicates a vector
|
|
1116
|
+
# @return [DaruLite::Vector] duplicated vector
|
|
1117
|
+
def dup
  # Data and index are duplicated so the copy does not share them with the
  # receiver; the name is passed through as-is.
  data_copy = @data.dup
  index_copy = @index.dup
  DaruLite::Vector.new data_copy, name: @name, index: index_copy
end
|
|
1120
|
+
|
|
1121
|
+
# == Bootstrap
|
|
1122
|
+
# Generate +nr+ resamples (with replacement) of size +s+
|
|
1123
|
+
# from vector, computing each estimate from +estimators+
|
|
1124
|
+
# over each resample.
|
|
1125
|
+
# +estimators+ could be
|
|
1126
|
+
# a) Hash with variable names as keys and lambdas as values
|
|
1127
|
+
# a.bootstrap(:log_s2=>lambda {|v| Math.log(v.variance)},1000)
|
|
1128
|
+
# b) Array with names of method to bootstrap
|
|
1129
|
+
# a.bootstrap([:mean, :sd],1000)
|
|
1130
|
+
# c) A single method to bootstrap
|
|
1131
|
+
# a.bootstrap(:mean, 1000)
|
|
1132
|
+
# If s is nil, is set to vector size by default.
|
|
1133
|
+
#
|
|
1134
|
+
# Returns a DataFrame where each vector is a vector
|
|
1135
|
+
# of length +nr+ containing the computed resample estimates.
|
|
1136
|
+
def bootstrap(estimators, nr, s = nil)
  s ||= size
  estimator_fns, names, collected = prepare_bootstrap(estimators)

  # Draw nr resamples and record every estimator's value for each of them.
  nr.times do
    resample = sample_with_replacement(s)
    names.each do |estimator|
      collected[estimator].push(estimator_fns[estimator].call(resample))
    end
  end

  # Turn each list of estimates into a Vector before building the frame.
  DaruLite::DataFrame.new(collected.transform_values { |estimates| DaruLite::Vector.new(estimates) })
end
|
|
1153
|
+
|
|
1154
|
+
# == Jacknife
|
|
1155
|
+
# Returns a dataset with jacknife delete-+k+ +estimators+
|
|
1156
|
+
# +estimators+ could be:
|
|
1157
|
+
# a) Hash with variable names as keys and lambdas as values
|
|
1158
|
+
# a.jacknife(:log_s2=>lambda {|v| Math.log(v.variance)})
|
|
1159
|
+
# b) Array with method names to jacknife
|
|
1160
|
+
# a.jacknife([:mean, :sd])
|
|
1161
|
+
# c) A single method to jacknife
|
|
1162
|
+
# a.jacknife(:mean)
|
|
1163
|
+
# +k+ represent the block size for block jacknife. By default
|
|
1164
|
+
# is set to 1, for classic delete-one jacknife.
|
|
1165
|
+
#
|
|
1166
|
+
# Returns a dataset where each vector is an vector
|
|
1167
|
+
# of length +cases+/+k+ containing the computed jacknife estimates.
|
|
1168
|
+
#
|
|
1169
|
+
# == Reference:
|
|
1170
|
+
# * Sawyer, S. (2005). Resampling Data: Using a Statistical Jacknife.
|
|
1171
|
+
def jackknife(estimators, k = 1) # rubocop:disable Metrics/MethodLength
  raise "n should be divisible by k:#{k}" unless (size % k).zero?

  block_count = (size / k).to_i
  estimator_fns, names, pseudo_values = prepare_bootstrap(estimators)

  # Estimates computed on the complete vector.
  full_estimates = names.to_h { |name| [name, estimator_fns[name].call(self)] }

  block_count.times do |block|
    # Leave out the block-th group of k consecutive values.
    remaining = @data.dup
    remaining.slice!(block * k, k)
    subsample = DaruLite::Vector.new remaining

    names.each do |name|
      leave_out_estimate = estimator_fns[name].call(subsample)
      # Pseudovalue: nb * full estimate - (nb - 1) * leave-out estimate.
      pseudo_values[name].push(
        (block_count * full_estimates[name]) - ((block_count - 1) * leave_out_estimate)
      )
    end
  end

  names.each { |name| pseudo_values[name] = DaruLite::Vector.new pseudo_values[name] }
  DaruLite::DataFrame.new pseudo_values
end
|
|
1197
|
+
|
|
1198
|
+
# Returns an array of booleans (one per element), indicating the
|
|
1199
|
+
# +regexp+ matching with the given array.
|
|
1200
|
+
#
|
|
1201
|
+
# @param regexp [Regexp] A regular matching expression. For example, +/weeks/+.
|
|
1202
|
+
#
|
|
1203
|
+
# @return [Array] Containing +true+ or +false+ for each element, according to the match with the given +regexp+
|
|
1204
|
+
#
|
|
1205
|
+
# @example
|
|
1206
|
+
# dv = DaruLite::Vector.new(['3 days', '5 weeks', '2 weeks'])
|
|
1207
|
+
# dv.match(/weeks/)
|
|
1208
|
+
#
|
|
1209
|
+
# # => [false, true, true]
|
|
1210
|
+
def match(regexp)
  # =~ yields a match offset or nil; collapse that to a strict boolean.
  @data.map do |element|
    (element =~ regexp) ? true : false
  end
end
|
|
1213
|
+
|
|
1214
|
+
# Creates a new vector consisting only of non-nil data
|
|
1215
|
+
#
|
|
1216
|
+
# == Arguments
|
|
1217
|
+
#
|
|
1218
|
+
# @param as_a [Symbol] Passing :array will return only the elements
|
|
1219
|
+
# as an Array. Otherwise will return a DaruLite::Vector.
|
|
1220
|
+
#
|
|
1221
|
+
# @param _duplicate [Boolean] In case no missing data is found in the
|
|
1222
|
+
# vector, setting this to false will return the same vector.
|
|
1223
|
+
# Otherwise, a duplicate will be returned irrespective of
|
|
1224
|
+
# presence of missing data.
|
|
1225
|
+
|
|
1226
|
+
def only_valid(as_a = :vector, _duplicate = true)
  # FIXME: Now duplicate is just ignored.
  #   There are no spec that fail on this case, so I'll leave it
  #   this way for now - zverok, 2016-05-07

  # Index labels whose value is not one of DaruLite::MISSING_VALUES.
  valid_labels = @index.to_a - indexes(*DaruLite::MISSING_VALUES)
  valid_values = valid_labels.map { |label| self[label] }

  # Anything other than :vector yields the plain Array of values.
  return valid_values unless as_a == :vector

  DaruLite::Vector.new valid_values, index: valid_labels, name: @name, dtype: dtype
end
|
|
1240
|
+
deprecate :only_valid, :reject_values, 2016, 10
|
|
1241
|
+
|
|
1242
|
+
# Return a vector with specified values removed
|
|
1243
|
+
# @param values [Array] values to reject from resultant vector
|
|
1244
|
+
# @return [DaruLite::Vector] vector with specified values removed
|
|
1245
|
+
# @example
|
|
1246
|
+
# dv = DaruLite::Vector.new [1, 2, nil, Float::NAN]
|
|
1247
|
+
# dv.reject_values nil, Float::NAN
|
|
1248
|
+
# # => #<DaruLite::Vector(2)>
|
|
1249
|
+
# # 0 1
|
|
1250
|
+
# # 1 2
|
|
1251
|
+
def reject_values(*values)
  kept_positions = size.times.to_a - positions(*values)
  picked = at(*kept_positions)
  return picked if picked.is_a?(DaruLite::Vector)

  # Handle the case when number of positions is 1 and hence #at doesn't
  # return a vector: ask again with a one-element range.
  only = kept_positions.first
  at(only..only)
end
|
|
1263
|
+
|
|
1264
|
+
# Return indexes of values specified
|
|
1265
|
+
# @param values [Array] values to find indexes for
|
|
1266
|
+
# @return [Array] array of indexes of values specified
|
|
1267
|
+
# @example
|
|
1268
|
+
# dv = DaruLite::Vector.new [1, 2, nil, Float::NAN], index: 11..14
|
|
1269
|
+
# dv.indexes nil, Float::NAN
|
|
1270
|
+
# # => [13, 14]
|
|
1271
|
+
def indexes(*values)
  # Translate the matching positions into their index labels.
  matching_positions = positions(*values)
  index.to_a.values_at(*matching_positions)
end
|
|
1274
|
+
|
|
1275
|
+
# Replaces specified values with a new value
|
|
1276
|
+
# @param [Array] old_values array of values to replace
|
|
1277
|
+
# @param [object] new_value new value to replace with
|
|
1278
|
+
# @note It performs the replace in place.
|
|
1279
|
+
# @return [DaruLite::Vector] Same vector itself with values
|
|
1280
|
+
# replaced with new value
|
|
1281
|
+
# @example
|
|
1282
|
+
# dv = DaruLite::Vector.new [1, 2, :a, :b]
|
|
1283
|
+
# dv.replace_values [:a, :b], nil
|
|
1284
|
+
# dv
|
|
1285
|
+
# # =>
|
|
1286
|
+
# # #<DaruLite::Vector:19903200 @name = nil @metadata = {} @size = 4 >
|
|
1287
|
+
# # nil
|
|
1288
|
+
# # 0 1
|
|
1289
|
+
# # 1 2
|
|
1290
|
+
# # 2 nil
|
|
1291
|
+
# # 3 nil
|
|
1292
|
+
def replace_values(old_values, new_value)
  # A single value is treated as a one-element list.
  targets = old_values.is_a?(Array) ? old_values : [old_values]

  size.times do |position|
    next unless include_with_nan?(targets, at(position))

    set_at([position], new_value)
  end
  self
end
|
|
1299
|
+
|
|
1300
|
+
# Returns a Vector containing only missing data (preserves indexes).
|
|
1301
|
+
def only_missing(as_a = :vector)
  # Any other format yields nil without touching the data.
  return unless %i[vector array].include?(as_a)

  missing = self[*indexes(*DaruLite::MISSING_VALUES)]
  as_a == :array ? missing.to_a : missing
end
|
|
1309
|
+
deprecate :only_missing, nil, 2016, 10
|
|
1310
|
+
|
|
1311
|
+
# Returns a Vector with only numerical data. Missing data is included
|
|
1312
|
+
# but non-Numeric objects are excluded. Preserves index.
|
|
1313
|
+
def only_numerics
  # Collect the labels of every value that is Numeric or nil.
  numeric_labels = each_with_index.each_with_object([]) do |(value, label), labels|
    labels << label if value.is_a?(Numeric) || value.nil?
  end

  self[*numeric_labels]
end
|
|
1321
|
+
|
|
1322
|
+
# Matches a string that is, in its entirety, a dd-dd-dddd or dddd-dd-dd date.
# \A and \z anchor the whole string; ^ and $ would anchor each line, so a
# multi-line value with a date on one line would wrongly count as a date.
DATE_REGEXP = /\A(\d{2}-\d{2}-\d{4}|\d{4}-\d{2}-\d{2})\z/.freeze

# Returns the database type for the vector, according to its content.
#
# Checks run in this order, on the string form of each value:
# a date-shaped value gives 'DATE'; any character outside digits, 'e', '.'
# and '-' gives 'VARCHAR (255)'; a '.' gives 'DOUBLE'; otherwise 'INTEGER'.
#
# @return [String] SQL column type name
def db_type
  if @data.any? { |v| v.to_s.match?(DATE_REGEXP) }
    'DATE'
  elsif @data.any? { |v| v.to_s.match?(/[^0-9e.-]/) }
    'VARCHAR (255)'
  elsif @data.any? { |v| v.to_s.include?('.') }
    'DOUBLE'
  else
    'INTEGER'
  end
end
|
|
1337
|
+
|
|
1338
|
+
# Copies the structure of the vector (i.e the index, size, etc.) and fills all
|
|
1339
|
+
# values with nils.
|
|
1340
|
+
def clone_structure
  # Same name and a duplicated index, but every value is nil.
  blanks = Array.new(size)
  DaruLite::Vector.new(blanks, name: @name, index: @index.dup)
end
|
|
1343
|
+
|
|
1344
|
+
# Save the vector to a file
|
|
1345
|
+
#
|
|
1346
|
+
# == Arguments
|
|
1347
|
+
#
|
|
1348
|
+
# * filename - Path of file where the vector is to be saved
|
|
1349
|
+
def save(filename)
  # Persistence is delegated entirely to DaruLite::IO.
  DaruLite::IO.save(self, filename)
end
|
|
1352
|
+
|
|
1353
|
+
# Marshals the vector as a hash holding its data (as an Array), dtype, name
# and index.
def _dump(*) # :nodoc:
  payload = {
    data: @data.to_a,
    dtype: @dtype,
    name: @name,
    index: @index
  }
  Marshal.dump(payload)
end
|
|
1361
|
+
|
|
1362
|
+
# :nocov:
|
|
1363
|
+
# Conversion hook: a vector is already a vector, so any arguments are
# ignored and the receiver is returned.
def daru_lite_vector(*)
  self
end
|
|
1366
|
+
# :nocov:
|
|
1367
|
+
|
|
1368
|
+
alias dv daru_lite_vector
|
|
1369
|
+
|
|
1370
|
+
# Converts a non category type vector to category type vector.
|
|
1371
|
+
# @param [Hash] opts options to convert to category
|
|
1372
|
+
# @option opts [true, false] :ordered Specify if vector is ordered or not.
|
|
1373
|
+
# If it is ordered, it can be sorted and min, max like functions would work
|
|
1374
|
+
# @option opts [Array] :categories set categories in the specified order
|
|
1375
|
+
# @return [DaruLite::Vector] vector with type category
|
|
1376
|
+
def to_category(opts = {})
  DaruLite::Vector.new(to_a, type: :category, name: @name, index: @index).tap do |category_vector|
    # Unordered unless the caller asks otherwise.
    category_vector.ordered = opts[:ordered] || false
    category_vector.categories = opts[:categories] if opts[:categories]
  end
end
|
|
1382
|
+
|
|
1383
|
+
# Lets index labels be used as accessor methods: a name containing '=' after
# at least one character assigns to that label, and a name that is an
# existing label reads it.
def method_missing(name, *args, &block)
  # FIXME: it is shamefully fragile. Should be either made stronger
  #   (string/symbol dychotomy, informative errors) or removed totally. - zverok
  assignment = /(.+)=/.match(name)
  if assignment
    self[assignment[1].to_sym] = args[0]
  elsif has_index?(name)
    self[name]
  else
    super
  end
end
|
|
1394
|
+
|
|
1395
|
+
# Companion to #method_missing: setter-like names and existing index labels
# are reported as handled.
def respond_to_missing?(name, include_private = false)
  return true if name.to_s.end_with?('=')

  has_index?(name) || super
end
|
|
1398
|
+
|
|
1399
|
+
# Partition a numeric variable into categories.
|
|
1400
|
+
# @param [Array<Numeric>] partitions an array whose consecutive elements
|
|
1401
|
+
# provide intervals for categories
|
|
1402
|
+
# @param [Hash] opts options to cut the partition
|
|
1403
|
+
# @option opts [:left, :right] :close_at specifies whether the interval closes at
|
|
1404
|
+
# the right side or left side
|
|
1405
|
+
# @option opts [Array] :labels names of the categories
|
|
1406
|
+
# @return [DaruLite::Vector] numeric variable converted to categorical variable
|
|
1407
|
+
# @example
|
|
1408
|
+
# heights = DaruLite::Vector.new [30, 35, 32, 50, 42, 51]
|
|
1409
|
+
# height_cat = heights.cut [30, 40, 50, 60], labels: ['low', 'medium', 'high']
|
|
1410
|
+
# # => #<DaruLite::Vector(6)>
|
|
1411
|
+
# # 0 low
|
|
1412
|
+
# # 1 low
|
|
1413
|
+
# # 2 low
|
|
1414
|
+
# # 3 high
|
|
1415
|
+
# # 4 medium
|
|
1416
|
+
# # 5 high
|
|
1417
|
+
def cut(partitions, opts = {})
  close_at = opts[:close_at] || :right
  labels = opts[:labels]
  bounds = partitions.to_a

  # Category of each value, then the full list of categories.
  assigned = to_a.map { |value| cut_find_category(bounds, value, close_at) }
  categories = cut_categories(bounds, close_at)

  category_vector = DaruLite::Vector.new(
    assigned,
    index: @index,
    type: :category,
    categories: categories
  )

  # Rename categories if new labels provided
  return category_vector unless labels

  category_vector.rename_categories(categories.zip(labels).to_h)
end
|
|
1436
|
+
|
|
1437
|
+
# Positions of the elements equal to any of the given values.
#
# Lookups for exactly nil, Float::NAN, or both go through the nil/NaN
# position helpers; everything else scans the data.
#
# @param values [Array] values to look for
# @return [Array<Integer>] matching positions
def positions(*values)
  if [nil] == values
    nil_positions
  elsif [Float::NAN] == values
    nan_positions
  elsif [nil, Float::NAN] == values || [Float::NAN, nil] == values
    nil_positions + nan_positions
  else
    size.times.select { |position| include_with_nan?(values, @data[position]) }
  end
end
|
|
1449
|
+
|
|
1450
|
+
# Groups through the single-vector DataFrame built by #to_df; all arguments
# are forwarded to its group_by.
def group_by(*args)
  frame = to_df
  frame.group_by(*args)
end
|
|
1453
|
+
|
|
1454
|
+
private
|
|
1455
|
+
|
|
1456
|
+
# Builds a vector with this vector's index and name from +values+, padded
# with nils or truncated to the current size. Note that a short +values+
# array is extended in place.
def copy(values)
  # Make sure values is right-justified to the size of the vector
  shortfall = size - values.size
  values.concat([nil] * shortfall) if shortfall.positive?

  DaruLite::Vector.new(values[0...size], index: @index, name: @name)
end
|
|
1461
|
+
|
|
1462
|
+
# Positions holding nil, memoised in @nil_positions.
def nil_positions
  @nil_positions ||= size.times.select { |position| @data[position].nil? }
end
|
|
1466
|
+
|
|
1467
|
+
# Positions holding a NaN value, memoised in @nan_positions.
def nan_positions
  @nan_positions ||= size.times.select do |position|
    element = @data[position]
    element.respond_to?(:nan?) && element.nan?
  end
end
|
|
1473
|
+
|
|
1474
|
+
# Helper method returning validity of arbitrary value: false for nil and
# for anything that reports itself as NaN, true otherwise.
def valid_value?(v)
  !v.nil? && !(v.respond_to?(:nan?) && v.nan?)
end
|
|
1478
|
+
|
|
1479
|
+
# Sets up name, data and index from the constructor arguments.
#
# @param source [Object] raw data, or a Hash (see #parse_source)
# @param opts [Hash] reads :name, :dtype, :nm_dtype and :index
def initialize_vector(source, opts)
  labels, elements = parse_source(source, opts)
  set_name opts[:name]

  @data = cast_vector_to(opts[:dtype] || :array, elements, opts[:nm_dtype])

  # Without explicit labels the index is coerced from the data size.
  @index = Index.coerce(labels || @data.size)

  guard_sizes!

  @possibly_changed_type = true
end
|
|
1490
|
+
|
|
1491
|
+
# Splits the constructor input into [index, data].
#
# A Hash supplies both (keys become the index, values the data); anything
# else takes the index from opts[:index], and a nil source becomes [].
def parse_source(source, opts)
  return [source.keys, source.values] if source.is_a?(Hash)

  [opts[:index], source || []]
end
|
|
1498
|
+
|
|
1499
|
+
# Reconciles index and data sizes: a longer index pads the data with nils,
# a shorter index is an error.
#
# @raise [IndexError] when the index is shorter than the data
def guard_sizes!
  case @index.size <=> @data.size
  when 1
    cast(dtype: :array) # NM with nils seg faults
    # Sizes are read after the cast on purpose.
    @data.fill(nil, @data.size...@index.size)
  when -1
    raise IndexError, "Expected index size >= vector size. Index size : #{@index.size}, vector size : #{@data.size}"
  end
end
|
|
1507
|
+
|
|
1508
|
+
# Flags the cached type as possibly stale when +value+ could change it: a
# nil/Numeric value arriving in an object vector, or any other value
# arriving in a numeric vector.
def guard_type_check(value)
  numeric_like = value.nil? || value.is_a?(Numeric)
  type_may_change = (object? && numeric_like) || (numeric? && !numeric_like)

  @possibly_changed_type = true if type_may_change
end
|
|
1513
|
+
|
|
1514
|
+
# Indicator for one split element: nil when the element is nil, otherwise
# 1 if it includes +key+ and 0 if it does not.
def split_value(key, v)
  return if v.nil?

  v.include?(key) ? 1 : 0
end
|
|
1523
|
+
|
|
1524
|
+
# For an array or hash of estimators methods, returns
|
|
1525
|
+
# an array with three elements
|
|
1526
|
+
# 1.- A hash with estimators names as keys and lambdas as values
|
|
1527
|
+
# 2.- An array with estimators names
|
|
1528
|
+
# 3.- A Hash with estimators names as keys and empty arrays as values
|
|
1529
|
+
# @param estimators [Hash, Array, Object] a Hash is used as given; an Array (or
#   a single value, treated as a one-element Array) is expanded so that each
#   entry maps to a lambda sending that entry as a message to a new
#   DaruLite::Vector built from the lambda's argument
# @return [Array] [estimators hash, its keys, hash of key => fresh empty array]
def prepare_bootstrap(estimators)
  lambdas =
    case estimators
    when Hash
      estimators
    else
      names = estimators.is_a?(Array) ? estimators : [estimators]
      names.to_h { |est| [est, ->(v) { DaruLite::Vector.new(v).send(est) }] }
    end

  keys = lambdas.keys
  [lambdas, keys, keys.to_h { |key| [key, []] }]
end
|
|
1542
|
+
|
|
1543
|
+
# NOTE: To maintain sanity, this _MUST_ be the _ONLY_ place in daru where the
|
|
1544
|
+
# @dtype variable is set and the underlying data type of the vector is changed.
|
|
1545
|
+
# Builds the storage wrapper for +dtype+ and records +dtype+ in @dtype.
#
# @param dtype [Symbol] :array is the only supported value
# @param source [Object, nil] data to wrap; when nil, @data.to_a is used
# @param _nm_dtype [Object] unused
# @return [DaruLite::Accessors::ArrayWrapper] the new wrapper
# @raise [NotImplementedError] for :mdarray
# @raise [ArgumentError] for any other dtype
def cast_vector_to(dtype, source = nil, _nm_dtype = nil)
  source = @data.to_a if source.nil?

  wrapper_class =
    case dtype
    when :array then DaruLite::Accessors::ArrayWrapper
    when :mdarray then raise NotImplementedError, 'MDArray not yet supported.'
    else raise ArgumentError, "Unknown dtype #{dtype}"
    end
  wrapped = wrapper_class.new(source, self)

  # @dtype is only updated once the wrapper has been built successfully.
  @dtype = dtype
  wrapped
end
|
|
1558
|
+
|
|
1559
|
+
# Stores +name+ in @name; an Array (e.g. a MultiIndex tuple) is joined first.
#
# @param name [Object] the vector name
# @return [Object] the stored name
def set_name(name) # rubocop:disable Naming/AccessorMethodName
  @name =
    case name
    when Array then name.join
    else name
    end
end
|
|
1562
|
+
|
|
1563
|
+
# Raises IndexError when one of the positions is an invalid position
|
|
1564
|
+
# @param positions [Array<Integer>] positions to check against +size+
# @return [Array<Integer>] the positions, when none is >= size
# @raise [IndexError] naming the first position that is >= size
def validate_positions(*positions)
  offending = positions.find { |pos| pos >= size }
  raise IndexError, "#{offending} is not a valid position." unless offending.nil?

  positions
end
|
|
1569
|
+
|
|
1570
|
+
# coerce ranges, integers and array in appropriate ways
|
|
1571
|
+
# @param positions [Array] one Integer, one Range, or several positions
# @return [Integer, Array<Integer>] a lone Integer is returned as is, a lone
#   Range is expanded into the positions of 0...size it selects, and two or
#   more (or zero) arguments are returned unchanged as an Array
# @raise [ArgumentError] when the single argument is neither Integer nor Range
def coerce_positions(*positions)
  return positions unless positions.size == 1

  position = positions.first
  case position
  when Integer then position
  when Range then (0...size).to_a[position]
  else raise ArgumentError, 'Unkown position type.'
  end
end
|
|
1585
|
+
|
|
1586
|
+
# Helper method for []=.
|
|
1587
|
+
# Assigns a new value to an existing index
|
|
1588
|
+
# @param indexes [Array] index labels, splatted into @index.pos
# @param val [Object] value written at every resolved position
# @return [Object] +val+ for a single numeric position, otherwise the
#   collection of positions that was iterated
def modify_vector(indexes, val)
  located = @index.pos(*indexes)

  case located
  when Numeric
    @data[located] = val
  else
    located.each { |position| @data[position] = val }
  end
end
|
|
1597
|
+
|
|
1598
|
+
# Helper method for []=.
|
|
1599
|
+
# Add a new index and assign it value
|
|
1600
|
+
# @param indexes [Array] labels to add, splatted into @index.add
# @param val [Object] value appended once per entry the index grew by
# @return [Object] the enlarged index, which also replaces @index
def insert_vector(indexes, val)
  grown = @index.add(*indexes)
  added = grown.size - @index.size

  # Append element by element; a bulk append (+=) might be worth adding.
  added.times { @data << val }
  @index = grown
end
|
|
1606
|
+
|
|
1607
|
+
# Works similarly to #[]= but also inserts the value when the index is not valid.
|
|
1608
|
+
# It exists only to be called by DaruLite::DataFrame and is not meant for end users.
|
|
1609
|
+
# @param indexes [Array] index labels, splatted into @index.valid?
# @param val [Object] value to store; nil triggers a cast to :array storage
#   when the current dtype is something else
def set(indexes, val)
  needs_array_storage = val.nil? && dtype != :array
  cast(dtype: :array) if needs_array_storage
  guard_type_check(val)

  # Known labels are overwritten in place; unknown ones extend the vector.
  @index.valid?(*indexes) ? modify_vector(indexes, val) : insert_vector(indexes, val)

  update_position_cache
end
|
|
1621
|
+
|
|
1622
|
+
# Returns the label of the partition interval that contains +val+.
#
# With :right the intervals are [p[i], p[i+1]) and are labelled
# "p[i]-(p[i+1] - 1)"; with :left they are (p[i], p[i+1]] and are labelled
# "(p[i] + 1)-p[i+1]". The lookup takes the first boundary beyond +val+, so
# +partitions+ is expected to be in ascending order.
#
# @param partitions [Array<Numeric>] interval boundaries
# @param val [Numeric] value to classify
# @param close_at [Symbol] :right or :left
# @return [String] the interval label
# @raise [ArgumentError] when +val+ lies outside every interval, or when
#   +close_at+ is neither :right nor :left
def cut_find_category(partitions, val, close_at)
  case close_at
  when :right
    right_index = partitions.index { |i| i > val }
    # An index of 0 means val precedes the first boundary; without this check
    # `right_index - 1` would wrap around to the last partition.
    raise ArgumentError, 'Invalid partition' if right_index.nil? || right_index.zero?

    "#{partitions[right_index - 1]}-#{partitions[right_index] - 1}"
  when :left
    right_index = partitions.index { |i| i >= val }
    raise ArgumentError, 'Invalid partition' if right_index.nil? || right_index.zero?

    "#{partitions[right_index - 1] + 1}-#{partitions[right_index]}"
  else
    raise ArgumentError, "Invalid parameter #{close_at} to close_at."
  end
end
|
|
1640
|
+
|
|
1641
|
+
# Builds the label of every interval between consecutive partitions.
#
# With :right each label is "p[i]-(p[i+1] - 1)"; with :left it is
# "(p[i] + 1)-p[i+1]" (the same formats cut_find_category produces).
#
# @param partitions [Array<Numeric>] interval boundaries
# @param close_at [Symbol] :right or :left
# @return [Array<String>] one label per consecutive pair of boundaries; empty
#   when fewer than two boundaries are given
# @raise [ArgumentError] when +close_at+ is neither :right nor :left
def cut_categories(partitions, close_at)
  case close_at
  when :right
    partitions.each_cons(2).map { |left, right| "#{left}-#{right - 1}" }
  when :left
    partitions.each_cons(2).map { |left, right| "#{left + 1}-#{right}" }
  else
    # Fail loudly, as cut_find_category does, instead of returning nil.
    raise ArgumentError, "Invalid parameter #{close_at} to close_at."
  end
end
|
|
1653
|
+
|
|
1654
|
+
# Like Array#include?, but a NaN +value+ matches any NaN element.
#
# @param array [Array] collection to search
# @param value [Object] value to look for
# @return [Boolean]
def include_with_nan?(array, value)
  nan = ->(candidate) { candidate.respond_to?(:nan?) && candidate.nan? }
  return array.include?(value) unless nan.call(value)

  array.any? { |item| nan.call(item) }
end
|
|
1663
|
+
|
|
1664
|
+
# Clears the memoized @nil_positions and @nan_positions.
#
# @return [nil]
def update_position_cache
  @nil_positions = @nan_positions = nil
end
|
|
1668
|
+
|
|
1669
|
+
# Sorts pairs by their first element and reverses the result unless
# opts[:ascending] is truthy.
#
# @param vector_index [#sort] collection of two-element pairs
# @param opts [Hash] reads :ascending
# @yield [left, right] optional comparator given the first element of each
#   pair; without a block DEFAULT_SORTER is used
# @return [Array] the sorted pairs
def resort_index(vector_index, opts)
  sorted =
    if block_given?
      vector_index.sort { |(left, _), (right, _)| yield(left, right) }
    else
      vector_index.sort(&DEFAULT_SORTER)
    end

  sorted.reverse! unless opts[:ascending]
  sorted
end
|
|
1677
|
+
end
|
|
1678
|
+
end
|