daru_lite 0.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (149) hide show
  1. checksums.yaml +7 -0
  2. data/.github/ISSUE_TEMPLATE.md +18 -0
  3. data/.github/workflows/ci.yml +33 -0
  4. data/.gitignore +10 -0
  5. data/.rspec +2 -0
  6. data/.rubocop.yml +27 -0
  7. data/.rubocop_todo.yml +137 -0
  8. data/CONTRIBUTING.md +47 -0
  9. data/Gemfile +2 -0
  10. data/History.md +4 -0
  11. data/LICENSE +24 -0
  12. data/README.md +218 -0
  13. data/Rakefile +69 -0
  14. data/ReleasePolicy.md +20 -0
  15. data/benchmarks/TradeoffData.csv +65 -0
  16. data/benchmarks/csv_reading.rb +22 -0
  17. data/benchmarks/dataframe_creation.rb +39 -0
  18. data/benchmarks/db_loading.rb +34 -0
  19. data/benchmarks/duplicating.rb +45 -0
  20. data/benchmarks/group_by.rb +32 -0
  21. data/benchmarks/joining.rb +52 -0
  22. data/benchmarks/row_access.rb +41 -0
  23. data/benchmarks/row_assign.rb +36 -0
  24. data/benchmarks/sorting.rb +51 -0
  25. data/benchmarks/statistics.rb +28 -0
  26. data/benchmarks/vector_access.rb +31 -0
  27. data/benchmarks/vector_assign.rb +42 -0
  28. data/benchmarks/where_clause.rb +48 -0
  29. data/benchmarks/where_vs_filter.rb +28 -0
  30. data/daru_lite.gemspec +55 -0
  31. data/images/README.md +5 -0
  32. data/images/con0.png +0 -0
  33. data/images/con1.png +0 -0
  34. data/images/init0.png +0 -0
  35. data/images/init1.png +0 -0
  36. data/images/man0.png +0 -0
  37. data/images/man1.png +0 -0
  38. data/images/man2.png +0 -0
  39. data/images/man3.png +0 -0
  40. data/images/man4.png +0 -0
  41. data/images/man5.png +0 -0
  42. data/images/man6.png +0 -0
  43. data/lib/daru_lite/accessors/array_wrapper.rb +109 -0
  44. data/lib/daru_lite/accessors/dataframe_by_row.rb +25 -0
  45. data/lib/daru_lite/accessors/mdarray_wrapper.rb +7 -0
  46. data/lib/daru_lite/category.rb +929 -0
  47. data/lib/daru_lite/configuration.rb +34 -0
  48. data/lib/daru_lite/core/group_by.rb +403 -0
  49. data/lib/daru_lite/core/merge.rb +270 -0
  50. data/lib/daru_lite/core/query.rb +109 -0
  51. data/lib/daru_lite/dataframe.rb +3080 -0
  52. data/lib/daru_lite/date_time/index.rb +569 -0
  53. data/lib/daru_lite/date_time/offsets.rb +397 -0
  54. data/lib/daru_lite/exceptions.rb +2 -0
  55. data/lib/daru_lite/extensions/which_dsl.rb +53 -0
  56. data/lib/daru_lite/formatters/table.rb +52 -0
  57. data/lib/daru_lite/helpers/array.rb +53 -0
  58. data/lib/daru_lite/index/categorical_index.rb +201 -0
  59. data/lib/daru_lite/index/index.rb +374 -0
  60. data/lib/daru_lite/index/multi_index.rb +374 -0
  61. data/lib/daru_lite/io/csv/converters.rb +21 -0
  62. data/lib/daru_lite/io/io.rb +294 -0
  63. data/lib/daru_lite/io/sql_data_source.rb +97 -0
  64. data/lib/daru_lite/iruby/helpers.rb +38 -0
  65. data/lib/daru_lite/iruby/templates/dataframe.html.erb +5 -0
  66. data/lib/daru_lite/iruby/templates/dataframe_mi.html.erb +5 -0
  67. data/lib/daru_lite/iruby/templates/dataframe_mi_tbody.html.erb +35 -0
  68. data/lib/daru_lite/iruby/templates/dataframe_mi_thead.html.erb +21 -0
  69. data/lib/daru_lite/iruby/templates/dataframe_tbody.html.erb +28 -0
  70. data/lib/daru_lite/iruby/templates/dataframe_thead.html.erb +21 -0
  71. data/lib/daru_lite/iruby/templates/multi_index.html.erb +12 -0
  72. data/lib/daru_lite/iruby/templates/vector.html.erb +5 -0
  73. data/lib/daru_lite/iruby/templates/vector_mi.html.erb +5 -0
  74. data/lib/daru_lite/iruby/templates/vector_mi_tbody.html.erb +26 -0
  75. data/lib/daru_lite/iruby/templates/vector_mi_thead.html.erb +8 -0
  76. data/lib/daru_lite/iruby/templates/vector_tbody.html.erb +17 -0
  77. data/lib/daru_lite/iruby/templates/vector_thead.html.erb +8 -0
  78. data/lib/daru_lite/maths/arithmetic/dataframe.rb +91 -0
  79. data/lib/daru_lite/maths/arithmetic/vector.rb +117 -0
  80. data/lib/daru_lite/maths/statistics/dataframe.rb +202 -0
  81. data/lib/daru_lite/maths/statistics/vector.rb +1019 -0
  82. data/lib/daru_lite/monkeys.rb +56 -0
  83. data/lib/daru_lite/vector.rb +1678 -0
  84. data/lib/daru_lite/version.rb +3 -0
  85. data/lib/daru_lite.rb +99 -0
  86. data/profile/_base.rb +23 -0
  87. data/profile/df_to_a.rb +10 -0
  88. data/profile/filter.rb +13 -0
  89. data/profile/joining.rb +13 -0
  90. data/profile/sorting.rb +12 -0
  91. data/profile/vector_each_with_index.rb +9 -0
  92. data/profile/vector_new.rb +9 -0
  93. data/spec/accessors/array_wrapper_spec.rb +3 -0
  94. data/spec/category_spec.rb +1741 -0
  95. data/spec/core/group_by_spec.rb +655 -0
  96. data/spec/core/merge_spec.rb +179 -0
  97. data/spec/core/query_spec.rb +347 -0
  98. data/spec/daru_lite_spec.rb +22 -0
  99. data/spec/dataframe_spec.rb +4330 -0
  100. data/spec/date_time/data_spec.rb +197 -0
  101. data/spec/date_time/date_time_index_helper_spec.rb +72 -0
  102. data/spec/date_time/index_spec.rb +588 -0
  103. data/spec/date_time/offsets_spec.rb +465 -0
  104. data/spec/extensions/which_dsl_spec.rb +38 -0
  105. data/spec/fixtures/bank2.dat +200 -0
  106. data/spec/fixtures/boolean_converter_test.csv +5 -0
  107. data/spec/fixtures/countries.json +7794 -0
  108. data/spec/fixtures/duplicates.csv +32 -0
  109. data/spec/fixtures/eciresults.html +394 -0
  110. data/spec/fixtures/empties.dat +2 -0
  111. data/spec/fixtures/empty_rows_test.csv +17 -0
  112. data/spec/fixtures/macau.html +3691 -0
  113. data/spec/fixtures/macd_data.csv +150 -0
  114. data/spec/fixtures/matrix_test.csv +100 -0
  115. data/spec/fixtures/moneycontrol.html +6812 -0
  116. data/spec/fixtures/music_data.tsv +2501 -0
  117. data/spec/fixtures/repeated_fields.csv +7 -0
  118. data/spec/fixtures/sales-funnel.csv +18 -0
  119. data/spec/fixtures/scientific_notation.csv +4 -0
  120. data/spec/fixtures/string_converter_test.csv +5 -0
  121. data/spec/fixtures/strings.dat +2 -0
  122. data/spec/fixtures/test_xls.xls +0 -0
  123. data/spec/fixtures/test_xls_2.xls +0 -0
  124. data/spec/fixtures/url_test.txt~ +0 -0
  125. data/spec/fixtures/valid_markup.html +62 -0
  126. data/spec/fixtures/wiki_climate.html +1243 -0
  127. data/spec/fixtures/wiki_table_info.html +631 -0
  128. data/spec/formatters/table_formatter_spec.rb +137 -0
  129. data/spec/helpers_spec.rb +8 -0
  130. data/spec/index/categorical_index_spec.rb +170 -0
  131. data/spec/index/index_spec.rb +417 -0
  132. data/spec/index/multi_index_spec.rb +680 -0
  133. data/spec/io/io_spec.rb +373 -0
  134. data/spec/io/sql_data_source_spec.rb +56 -0
  135. data/spec/iruby/dataframe_spec.rb +170 -0
  136. data/spec/iruby/helpers_spec.rb +49 -0
  137. data/spec/iruby/multi_index_spec.rb +37 -0
  138. data/spec/iruby/vector_spec.rb +105 -0
  139. data/spec/maths/arithmetic/dataframe_spec.rb +148 -0
  140. data/spec/maths/arithmetic/vector_spec.rb +165 -0
  141. data/spec/maths/statistics/dataframe_spec.rb +178 -0
  142. data/spec/maths/statistics/vector_spec.rb +756 -0
  143. data/spec/monkeys_spec.rb +42 -0
  144. data/spec/shared/vector_display_spec.rb +213 -0
  145. data/spec/spec_helper.rb +87 -0
  146. data/spec/support/database_helper.rb +30 -0
  147. data/spec/support/matchers.rb +5 -0
  148. data/spec/vector_spec.rb +2293 -0
  149. metadata +571 -0
@@ -0,0 +1,1678 @@
1
+ require 'daru_lite/maths/arithmetic/vector'
2
+ require 'daru_lite/maths/statistics/vector'
3
+ require 'daru_lite/accessors/array_wrapper'
4
+ require 'daru_lite/category'
5
+
6
+ module DaruLite
7
+ class Vector # rubocop:disable Metrics/ClassLength
8
+ include Enumerable
9
+ include DaruLite::Maths::Arithmetic::Vector
10
+ include DaruLite::Maths::Statistics::Vector
11
+ extend Gem::Deprecate
12
+
13
+ class << self
14
+ # Create a new vector by specifying the size and an optional value
15
+ # and block to generate values.
16
+ #
17
+ # == Description
18
+ #
19
+ # The *new_with_size* class method lets you create a DaruLite::Vector
20
+ # by specifying the size as the argument. The optional block, if
21
+ # supplied, is run once for populating each element in the Vector.
22
+ #
23
+ # The result of each run of the block is the value that is ultimately
24
+ # assigned to that position in the Vector.
25
+ #
26
+ # == Options
27
+ # :value
28
+ # All the rest like .new
29
# Builds a vector of +n+ elements. Each element is produced by the block
# (called with the position) or, when no block is given, is opts[:value].
# All remaining options are forwarded to DaruLite::Vector.new.
#
# The options hash is copied before :value is removed, so a hash the caller
# reuses (or has frozen) is never mutated.
def new_with_size(n, opts = {}, &block)
  opts = opts.dup
  value = opts.delete(:value)
  block ||= ->(_) { value }
  DaruLite::Vector.new(Array.new(n, &block), opts)
end
34
+
35
+ # Create a vector using (almost) any object
36
+ # * Array: flattened
37
+ # * Range: transformed using to_a
38
+ # * DaruLite::Vector
39
+ # * Numeric and string values
40
+ #
41
+ # == Description
42
+ #
43
+ # The `Vector.[]` class method creates a vector from almost any
44
+ # object that has a `#to_a` method defined on it. It is similar
45
+ # to R's `c` method.
46
+ #
47
+ # == Usage
48
+ #
49
+ # a = DaruLite::Vector[1,2,3,4,6..10]
50
+ # #=>
51
+ # # <DaruLite::Vector:99448510 @name = nil @size = 9 >
52
+ # # nil
53
+ # # 0 1
54
+ # # 1 2
55
+ # # 2 3
56
+ # # 3 4
57
+ # # 4 6
58
+ # # 5 7
59
+ # # 6 8
60
+ # # 7 9
61
+ # # 8 10
62
# Expands every argument that responds to #to_a, flattens the result into a
# single list of values and wraps that list in a new vector.
def [](*indexes)
  expanded = indexes.map do |item|
    if item.respond_to?(:to_a)
      item.to_a
    else
      item
    end
  end
  DaruLite::Vector.new(expanded.flatten)
end
68
+
69
# Rebuilds a vector from a marshalled Hash holding :data, :index, :name,
# :dtype and :missing_values.
# NOTE(review): Marshal.load must only ever be given trusted input.
def _load(data) # :nodoc:
  attrs = Marshal.load(data)
  options = {
    index: attrs[:index],
    name: attrs[:name],
    dtype: attrs[:dtype],
    missing_values: attrs[:missing_values]
  }
  DaruLite::Vector.new(attrs[:data], options)
end
76
+
77
# Returns +data+ untouched when it already is a vector, builds a new vector
# from an Array or Hash, and rejects everything else with ArgumentError.
def coerce(data, options = {})
  return data if data.is_a?(DaruLite::Vector)
  return new(data, options) if data.is_a?(Array) || data.is_a?(Hash)

  raise ArgumentError, "Can't coerce #{data.class} to #{self}"
end
87
+ end
88
+
89
# Number of elements stored in the vector.
def size
  @data.size
end

# Yields each value in order and returns self; without a block returns an
# Enumerator.
def each(&block)
  if block
    @data.each(&block)
    self
  else
    to_enum(:each)
  end
end

# Yields each index label and returns self; without a block returns an
# Enumerator.
def each_index(&block)
  if block
    @index.each(&block)
    self
  else
    to_enum(:each_index)
  end
end

# Yields [value, index label] pairs and returns self; without a block
# returns an Enumerator.
def each_with_index(&block)
  return to_enum(:each_with_index) if block.nil?

  value_label_pairs = @data.to_a.zip(@index.to_a)
  value_label_pairs.each(&block)
  self
end

# Replaces every value with the block's result, in place, and returns self;
# without a block returns an Enumerator.
def map!(&block)
  if block
    @data.map!(&block)
    self
  else
    to_enum(:map!)
  end
end
121
+
122
# Applies +method+ to this vector, or to a sub-vector when +keys+ is given.
#
# @param method [Symbol, Proc] a Symbol is sent to the target vector; a Proc
#   is called with the target vector as its only argument
# @param keys [Object, nil] when truthy, the target is
#   get_sub_vector(keys, by_position: by_position) instead of self
# @param by_position [Boolean] forwarded to get_sub_vector
# @return the result of the method or Proc
# @raise [RuntimeError] when +method+ is neither a Symbol nor a Proc
def apply_method(method, keys: nil, by_position: true)
  vect = keys ? get_sub_vector(keys, by_position: by_position) : self

  case method
  when Symbol then vect.send(method)
  when Proc then method.call(vect)
  else
    # A bare `raise` here produced an opaque "unhandled exception"; keep the
    # RuntimeError class but say what was wrong.
    raise "Unsupported method #{method.inspect}: expected a Symbol or a Proc"
  end
end
alias apply_method_on_sub_vector apply_method
132
+
133
+ # The name of the DaruLite::Vector. String.
134
+ attr_reader :name
135
+ # The row index. Can be either DaruLite::Index or DaruLite::MultiIndex.
136
+ attr_reader :index
137
+ # The underlying dtype of the Vector. Can be :array.
138
+ attr_reader :dtype
139
+ attr_reader :nm_dtype
140
+ # An Array of the positions in the vector that are being treated as 'missing'.
141
+ attr_reader :missing_positions
142
+
143
+ deprecate :missing_positions, :indexes, 2016, 10
144
+ # Store a hash of labels for values. Supplementary only. Recommend using index
145
+ # for proper usage.
146
+ attr_accessor :labels
147
+ # Store vector data in an array
148
+ attr_reader :data
149
+
150
+ # Create a Vector object.
151
+ #
152
+ # == Arguments
153
+ #
154
+ # @param source[Array,Hash] - Supply elements in the form of an Array or a
155
+ # Hash. If Array, a numeric index will be created if not supplied in the
156
+ # options. Specifying more index elements than actual values in *source*
157
+ # will insert *nil* into the surplus index elements. When a Hash is specified,
158
+ # the keys of the Hash are taken as the index elements and the corresponding
159
+ # values as the values that populate the vector.
160
+ #
161
+ # == Options
162
+ #
163
+ # * +:name+ - Name of the vector
164
+ #
165
+ # * +:index+ - Index of the vector
166
+ #
167
+ # * +:dtype+ - The underlying data type. Can be :array.
168
+ # Default :array.
169
+ #
170
+ # * +:missing_values+ - An Array of the values that are to be treated as 'missing'.
171
+ # nil is the default missing value.
172
+ #
173
+ # == Usage
174
+ #
175
+ # vecarr = DaruLite::Vector.new [1,2,3,4], index: [:a, :e, :i, :o]
176
+ # vechsh = DaruLite::Vector.new({a: 1, e: 2, i: 3, o: 4})
177
# Dispatches construction: plain vectors go through initialize_vector, while
# opts[:type] == :category mixes in DaruLite::Category and delegates to
# initialize_category.
def initialize(source, opts = {})
  return initialize_vector(source, opts) unless opts[:type] == :category

  extend DaruLite::Category
  initialize_category(source, opts)
end
187
+
188
+ # Get one or more elements with specified index or a range.
189
+ #
190
+ # == Usage
191
+ # # For vectors employing single layer Index
192
+ #
193
+ # v[:one, :two] # => DaruLite::Vector with indexes :one and :two
194
+ # v[:one] # => Single element
195
+ # v[:one..:three] # => DaruLite::Vector with indexes :one, :two and :three
196
+ #
197
+ # # For vectors employing hierarchical multi index
198
+ #
199
# Looks the given labels up in the index. A single numeric position yields
# the element itself; anything else yields a new vector built from the
# selected elements and the matching index subset.
def [](*input_indexes)
  positions = @index.pos(*input_indexes)
  return @data[positions] if positions.is_a?(Numeric)

  selected = positions.map { |position| @data[position] }
  sub_index = @index.subset(*input_indexes)
  DaruLite::Vector.new(selected, name: @name, index: sub_index, dtype: @dtype)
end
213
+
214
+ # Returns vector of values given positional values
215
+ # @param positions [Array<object>] positional values
216
+ # @return [object] vector
217
+ # @example
218
+ # dv = DaruLite::Vector.new 'a'..'e'
219
+ # dv.at 0, 1, 2
220
+ # # => #<DaruLite::Vector(3)>
221
+ # # 0 a
222
+ # # 1 b
223
+ # # 2 c
224
# Positional lookup. An Integer position (after coercion) returns the element
# itself; otherwise a new vector is built from the selected elements.
def at(*positions)
  requested = positions # untouched arguments, needed to build the new index
  positions = coerce_positions(*positions)
  validate_positions(*positions)
  return @data[positions] if positions.is_a?(Integer)

  picked = positions.map { |position| @data[position] }
  DaruLite::Vector.new(picked, index: @index.at(*requested), dtype: dtype)
end
237
+
238
+ # Change value at given positions
239
+ # @param positions [Array<object>] positional values
240
+ # @param [object] val value to assign
241
+ # @example
242
+ # dv = DaruLite::Vector.new 'a'..'e'
243
+ # dv.set_at [0, 1], 'x'
244
+ # dv
245
+ # # => #<DaruLite::Vector(5)>
246
+ # # 0 x
247
+ # # 1 x
248
+ # # 2 c
249
+ # # 3 d
250
+ # # 4 e
251
# Assigns +val+ to every position listed in +positions+.
#
# @param positions [Array<Integer>] positions to overwrite; they are passed
#   to validate_positions before anything is written
# @param val [Object] value stored at each position
# @return the result of update_position_cache
def set_at(positions, val)
  validate_positions(*positions)
  # each rather than map: the block runs purely for its side effect, so no
  # throwaway result array is needed.
  positions.each { |pos| @data[pos] = val }
  update_position_cache
end
256
+
257
+ # Just like in Hashes, you can specify the index label of the DaruLite::Vector
258
+ # and assign an element to that place in the DaruLite::Vector.
259
+ #
260
+ # == Usage
261
+ #
262
+ # v = DaruLite::Vector.new([1,2,3], index: [:a, :b, :c])
263
+ # v[:a] = 999
264
+ # #=>
265
+ # ##<DaruLite::Vector:90257920 @name = nil @size = 3 >
266
+ # # nil
267
+ # # a 999
268
+ # # b 2
269
+ # # c 3
270
# Label-based assignment. Storing nil in a non-:array vector first casts the
# storage to :array; the value is then type-checked, written through
# modify_vector, and the position cache refreshed.
def []=(*indexes, val)
  needs_array_storage = val.nil? && dtype != :array
  cast(dtype: :array) if needs_array_storage

  guard_type_check(val)
  modify_vector(indexes, val)
  update_position_cache
end
279
+
280
+ # Two vectors are equal if they have the exact same index values corresponding
281
+ # with the exact same elements. Name is ignored.
282
# Two vectors are equal when their indexes are equal, their sizes match and
# every position holds an equal element under an equal index label. The name
# is ignored. Any non-vector argument falls back to the inherited #==.
#
# @param other [Object] object to compare with
# @return [Boolean]
def ==(other)
  case other
  when DaruLite::Vector
    return false unless @index == other.index && size == other.size

    # Materialise the other index once; building it per element made the
    # comparison quadratic in the vector size.
    other_labels = other.index.to_a
    each_with_index.with_index.all? do |(value, label), position|
      value == other.at(position) && label == other_labels[position]
    end
  else
    super
  end
end
293
+
294
+ # !@method eq
295
+ # Uses `==` and returns `true` for each **equal** entry
296
+ # @param [#==, DaruLite::Vector] If scalar object, compares it with each
297
+ # element in self. If DaruLite::Vector, compares elements with same indexes.
298
+ # @example (see #where)
299
+ # !@method not_eq
300
+ # Uses `!=` and returns `true` for each **unequal** entry
301
+ # @param [#!=, DaruLite::Vector] If scalar object, compares it with each
302
+ # element in self. If DaruLite::Vector, compares elements with same indexes.
303
+ # @example (see #where)
304
+ # !@method lt
305
+ # Uses `<` and returns `true` for each entry **less than** the supplied object
306
+ # @param [#<, DaruLite::Vector] If scalar object, compares it with each
307
+ # element in self. If DaruLite::Vector, compares elements with same indexes.
308
+ # @example (see #where)
309
+ # !@method lteq
310
+ # Uses `<=` and returns `true` for each entry **less than or equal to** the supplied object
311
+ # @param [#<=, DaruLite::Vector] If scalar object, compares it with each
312
+ # element in self. If DaruLite::Vector, compares elements with same indexes.
313
+ # @example (see #where)
314
+ # !@method mt
315
+ # Uses `>` and returns `true` for each entry **more than** the supplied object
316
+ # @param [#>, DaruLite::Vector] If scalar object, compares it with each
317
+ # element in self. If DaruLite::Vector, compares elements with same indexes.
318
+ # @example (see #where)
319
+ # !@method mteq
320
+ # Uses `>=` and returns `true` for each entry **more than or equal to** the supplied object
321
+ # @param [#>=, DaruLite::Vector] If scalar object, compares it with each
322
+ # element in self. If DaruLite::Vector, compares elements with same indexes.
323
+ # @example (see #where)
324
+
325
+ # Define the comparator methods with metaprogramming. See documentation
326
+ # written above for functionality of each method. Use these methods with the
327
+ # `where` method to obtain the corresponding Vector/DataFrame.
328
# Generates eq / not_eq / lt / lteq / mt / mteq. Each one delegates to
# DaruLite::Core::Query: vector arguments go through apply_vector_operator,
# everything else through apply_scalar_operator on the raw data.
{
  eq: :==, not_eq: :!=, lt: :<, lteq: :<=, mt: :>, mteq: :>=
}.each do |method, operator|
  define_method(method) do |other|
    query = DaruLite::Core::Query
    next query.apply_vector_operator(operator, self, other) if other.is_a?(DaruLite::Vector)

    query.apply_scalar_operator(operator, @data, other)
  end
  # Only the ordering operators are aliased; == and != are left alone.
  alias_method operator, method unless %i[== !=].include?(operator)
end
alias gt mt
alias gteq mteq
348
+
349
+ # Comparator for checking if any of the elements in *other* exist in self.
350
+ #
351
+ # @param [Array, DaruLite::Vector] other A collection which has elements that
352
+ # need to be checked for in self.
353
+ # @example Usage of `in`.
354
+ # vector = DaruLite::Vector.new([1,2,3,4,5])
355
+ # vector.where(vector.in([3,5]))
356
+ # #=>
357
+ # ##<DaruLite::Vector:82215960 @name = nil @size = 2 >
358
+ # # nil
359
+ # # 2 3
360
+ # # 4 5
361
# Membership comparator: marks each element of the vector that also occurs
# in +other+.
#
# @param other [Array, DaruLite::Vector] collection of candidate values; it
#   must respond to #zip and #size
# @return [DaruLite::Core::Query::BoolArray] one boolean per element
def in(other)
  # A Hash gives constant-time lookups; the 0 values are placeholders.
  lookup = other.zip(Array.new(other.size, 0)).to_h
  DaruLite::Core::Query::BoolArray.new(@data.map { |value| lookup.key?(value) })
end
369
+
370
+ # Return a new vector based on the contents of a boolean array. Use with the
371
+ # comparator methods to obtain meaningful results. See this notebook for
372
+ # a good overview of using #where.
373
+ #
374
+ # @param bool_array [DaruLite::Core::Query::BoolArray, Array<TrueClass, FalseClass>] The
375
+ # collection containing the true or false values. Each element in the Vector
376
+ # corresponding to a `true` in the bool_array will be returned along with its
377
+ # index.
378
+ # @example Usage of #where.
379
+ # vector = DaruLite::Vector.new([2,4,5,51,5,16,2,5,3,2,1,5,2,5,2,1,56,234,6,21])
380
+ #
381
+ # # Simple logic statement passed to #where.
382
+ # vector.where(vector.eq(5).or(vector.eq(1)))
383
+ # # =>
384
+ # ##<DaruLite::Vector:77626210 @name = nil @size = 7 >
385
+ # # nil
386
+ # # 2 5
387
+ # # 4 5
388
+ # # 7 5
389
+ # # 10 1
390
+ # # 11 5
391
+ # # 13 5
392
+ # # 15 1
393
+ #
394
+ # # A somewhat more complex logic statement
395
+ # vector.where((vector.eq(5) | vector.lteq(1)) & vector.in([4,5,1]))
396
+ # #=>
397
+ # ##<DaruLite::Vector:81072310 @name = nil @size = 7 >
398
+ # # nil
399
+ # # 2 5
400
+ # # 4 5
401
+ # # 7 5
402
+ # # 10 1
403
+ # # 11 5
404
+ # # 13 5
405
+ # # 15 1
406
# Delegates to DaruLite::Core::Query.vector_where with this vector and the
# given boolean collection.
def where(bool_array)
  query = DaruLite::Core::Query
  query.vector_where(self, bool_array)
end
409
+
410
+ # Return a new vector based on the contents of a boolean array and &block.
411
+ #
412
+ # @param bool_array [DaruLite::Core::Query::BoolArray, Array<TrueClass, FalseClass>, &block] The
413
+ # collection containing the true of false values. Each element in the Vector
414
+ # corresponding to a `true` in the bool_array will be returned along with it's
415
+ # index. The &block may contain manipulative functions for the Vector elements.
416
+ #
417
+ # @return [DaruLite::Vector]
418
+ #
419
+ # @example Usage of #apply_where.
420
+ # dv = DaruLite::Vector.new ['3 days', '5 weeks', '2 weeks']
421
+ # dv = dv.apply_where(dv.match /weeks/) { |x| "#{x.split.first.to_i * 7} days" }
422
+ # # =>
423
+ # ##<DaruLite::Vector(3)>
424
+ # # 0 3 days
425
+ # # 1 35 days
426
+ # # 2 14 days
427
# Delegates to DaruLite::Core::Query.vector_apply_where, forwarding the block.
def apply_where(bool_array, &block)
  query = DaruLite::Core::Query
  query.vector_apply_where(self, bool_array, &block)
end
430
+
431
# Slice covering positions 0 through q - 1 (q defaults to 10).
def head(q = 10)
  final_position = q - 1
  self[0..final_position]
end

# Slice covering the last +q+ positions (q defaults to 10); the start is
# clamped to 0 when q exceeds the size.
def tail(q = 10)
  first_position = [size - q, 0].max
  final_position = size - 1
  self[first_position..final_position]
end

# The Enumerable mixin does not provide #last, so it is defined via #tail.
def last(q = 1)
  tail(q)
end

# True when the index holds no entries.
def empty?
  @index.empty?
end

# True when #type reports :numeric.
def numeric?
  :numeric == type
end

# True when #type reports :object.
def object?
  :object == type
end
456
+
457
+ # Reports whether missing data is present in the Vector.
458
# True when at least one position holds a value listed in
# DaruLite::MISSING_VALUES. Deprecated in favour of #include_values?.
def has_missing_data?
  missing_at = indexes(*DaruLite::MISSING_VALUES)
  !missing_at.empty?
end
alias flawed? has_missing_data?
deprecate :has_missing_data?, :include_values?, 2016, 10
deprecate :flawed?, :include_values?, 2016, 10
464
+
465
+ # Check if any one of mentioned values occur in the vector
466
+ # @param values [Array] values to check for
467
+ # @return [true, false] returns true if any one of specified values
468
+ # occur in the vector
469
+ # @example
470
+ # dv = DaruLite::Vector.new [1, 2, 3, 4, nil]
471
+ # dv.include_values? nil, Float::NAN
472
+ # # => true
473
# True as soon as include_with_nan? reports one of +values+ in the data.
def include_values?(*values)
  values.each do |candidate|
    return true if include_with_nan?(@data, candidate)
  end
  false
end
476
+
477
+ # @note Do not use it to check for Float::NAN as
478
+ # Float::NAN == Float::NAN is false
479
+ # Return vector of booleans with value at ith position is either
480
+ # true or false depending upon whether value at position i is equal to
481
+ # any of the values passed in the argument or not
482
+ # @param values [Array] values to equate with
483
+ # @return [DaruLite::Vector] vector of boolean values
484
+ # @example
485
+ # dv = DaruLite::Vector.new [1, 2, 3, 2, 1]
486
+ # dv.is_values 1, 2
487
+ # # => #<DaruLite::Vector(5)>
488
+ # # 0 true
489
+ # # 1 true
490
+ # # 2 false
491
+ # # 3 true
492
+ # # 4 true
493
# Builds a vector of booleans: position i is true when the element at i
# equals any of +values+ (each comparison goes through #eq, results are
# combined with |).
#
# @param values [Array] values to compare against
# @return [DaruLite::Vector] vector of booleans, one per element
def is_values(*values)
  # Nothing to compare against means nothing can match. Without this guard
  # inject(:|) returns nil for an empty argument list.
  return DaruLite::Vector.new(Array.new(size, false)) if values.empty?

  DaruLite::Vector.new(values.map { |v| eq(v) }.inject(:|))
end
496
+
497
+ # Append an element to the vector by specifying the element and index
498
# Appends +element+ under the new label +index+; raises IndexError when the
# label already exists.
def concat(element, index)
  raise IndexError, 'Expected new unique index' if @index.include?(index)

  @index |= [index]
  new_position = @index[index]
  @data[new_position] = element

  update_position_cache
end
alias push concat
alias << concat
508
+
509
+ # Cast a vector to a new data type.
510
+ #
511
+ # == Options
512
+ #
513
+ # * +:dtype+ - :array for Ruby Array..
514
# Converts the storage to opts[:dtype]. Only :array is accepted; anything
# else raises ArgumentError. Nothing happens when the vector already has
# that dtype.
def cast(opts = {})
  target = opts[:dtype]
  raise ArgumentError, "Unsupported dtype #{opts[:dtype]}" unless target == :array

  return if @dtype == target

  @data = cast_vector_to(target)
end
520
+
521
+ # Delete an element by value
522
# Delete an element by value: its label is resolved through #index_of and
# handed to #delete_at.
def delete(element)
  label = index_of(element)
  delete_at(label)
end

# Delete the element stored under the index label +index+ and rebuild the
# index without that label.
def delete_at(index)
  position = @index[index]
  @data.delete_at(position)
  remaining_labels = @index.to_a - [index]
  @index = DaruLite::Index.new(remaining_labels)

  update_position_cache
end
533
+
534
+ # The type of data contained in the vector. Can be :numeric or :object.
535
+ #
536
+ # Running through the data to figure out the kind of data is delayed to the
537
+ # last possible moment.
538
# Lazily classifies the contents: :numeric when every element is nil or a
# Numeric, :object otherwise. The answer is cached until
# @possibly_changed_type is set again.
def type
  return @type unless @type.nil? || @possibly_changed_type

  @type = :numeric
  each do |element|
    unless element.nil? || element.is_a?(Numeric)
      @type = :object
      break
    end
  end
  @possibly_changed_type = false

  @type
end
552
+
553
+ # Tells if vector is categorical or not.
554
+ # @return [true, false] true if vector is of type category, false otherwise
555
+ # @example
556
+ # dv = DaruLite::Vector.new [1, 2, 3], type: :category
557
+ # dv.category?
558
+ # # => true
559
# True when #type reports :category.
def category?
  vector_type = type
  vector_type == :category
end
562
+
563
+ # Get index of element
564
# Index label of the first occurrence of +element+. For :array storage the
# match uses eql? (so 1 and 1.0 differ); otherwise it uses the storage's own
# #index lookup.
def index_of(element)
  position =
    if dtype == :array
      @data.index { |candidate| candidate.eql?(element) }
    else
      @data.index(element)
    end
  @index.key(position)
end
570
+
571
+ # Keep only unique elements of the vector along with their indexes.
572
# New vector holding each distinct value once, labelled with the index of
# that value's first occurrence (as reported by #index_of).
def uniq
  distinct = @data.uniq
  labels = distinct.map { |value| index_of(value) }

  DaruLite::Vector.new(distinct, name: @name, index: labels, dtype: @dtype)
end
578
+
579
# Forwards to any? on the raw storage behind the data wrapper.
def any?(&block)
  storage = @data.data
  storage.any?(&block)
end

# Forwards to all? on the raw storage behind the data wrapper.
def all?(&block)
  storage = @data.data
  storage.all?(&block)
end
586
+
587
+ # Sorts a vector according to its values. If a block is specified, the contents
588
+ # will be evaluated and data will be swapped whenever the block evaluates
589
+ # to *true*. Defaults to ascending order sorting. Any missing values will be
590
+ # put at the end of the vector. Preserves indexing. Default sort algorithm is
591
+ # quick sort.
592
+ #
593
+ # == Options
594
+ #
595
+ # * +:ascending+ - if false, will sort in descending order. Defaults to true.
596
+ #
597
+ # * +:type+ - Specify the sorting algorithm. Only supports quick_sort for now.
598
+ # == Usage
599
+ #
600
+ # v = DaruLite::Vector.new ["My first guitar", "jazz", "guitar"]
601
+ # # Say you want to sort these strings by length.
602
+ # v.sort(ascending: false) { |a,b| a.length <=> b.length }
603
# Returns a new vector sorted by value. [value, position] pairs are ordered
# by resort_index (ascending unless opts says otherwise) and the index is
# reordered to follow the values.
def sort(opts = {}, &block)
  opts = { ascending: true }.merge(opts)

  sorted_pairs = resort_index(@data.each_with_index, opts, &block)
  values, positions = sorted_pairs.transpose
  reordered_index = @index.reorder(positions)

  DaruLite::Vector.new(values, index: reordered_index, name: @name, dtype: @dtype)
end
613
+
614
+ # Sorts the vector according to its `Index` values. Defaults to ascending
615
+ # order sorting.
616
+ #
617
+ # @param [Hash] opts the options for sort_by_index method.
618
+ # @option opts [Boolean] :ascending false, will sort `index` in
619
+ # descending order.
620
+ #
621
+ # @return [Vector] new sorted `Vector` according to the index values.
622
+ #
623
+ # @example
624
+ #
625
+ # dv = DaruLite::Vector.new [11, 13, 12], index: [23, 21, 22]
626
+ # # Say you want to sort index in ascending order
627
+ # dv.sort_by_index(ascending: true)
628
+ # #=> DaruLite::Vector.new [13, 12, 11], index: [21, 22, 23]
629
+ # # Say you want to sort index in descending order
630
+ # dv.sort_by_index(ascending: false)
631
+ # #=> DaruLite::Vector.new [11, 12, 13], index: [23, 22, 21]
632
# Orders [label, position] pairs with resort_index and hands the resulting
# position order to #reorder.
def sort_by_index(opts = {})
  opts = { ascending: true }.merge(opts)
  sorted_pairs = resort_index(@index.each_with_index, opts)
  new_order = sorted_pairs.transpose.last

  reorder(new_order)
end
638
+
639
# Compares two [value, position] pairs. Two nil values are ordered by
# position, a lone nil on the left yields -1, a lone nil on the right yields
# 1, and otherwise the values are compared with <=>.
DEFAULT_SORTER = lambda do |(lv, li), (rv, ri)|
  both_missing = lv.nil? && rv.nil?
  if both_missing then li <=> ri
  elsif lv.nil? then -1
  elsif rv.nil? then 1
  else lv <=> rv
  end
end
650
+
651
+ # Just sort the data and get an Array in return using Enumerable#sort.
652
+ # Non-destructive.
653
+ # :nocov:
654
# Sorted Array copy of the data; the vector itself is left untouched.
def sorted_data(&block)
  values = @data.to_a
  values.sort(&block)
end
657
+ # :nocov:
658
+
659
+ # Like map, but returns a DaruLite::Vector with the returned values.
660
# Runs #recode! on a duplicate so the receiver stays unchanged; without a
# block returns an Enumerator.
def recode(dt = nil, &block)
  if block
    dup.recode!(dt, &block)
  else
    to_enum(:recode, dt)
  end
end
665
+
666
+ # Destructive version of #recode.
667
# Replaces every element with the block's result, in place, then re-casts the
# storage to +dt+ (or to the current dtype when +dt+ is nil).
#
# @param dt [Symbol, nil] dtype to cast to after mapping
# @return [self], or an Enumerator when no block is given
def recode!(dt = nil, &block)
  return to_enum(:recode!, dt) unless block

  # The former trailing `.data` call only read a value that was thrown away.
  @data.map!(&block)
  @data = cast_vector_to(dt || @dtype)
  self
end
674
+
675
+ # Delete an element if block returns true. Destructive.
676
# Removes, in place, every element for which the block returns true; the
# matching index labels are removed as well.
#
# @yield [value] each element of the vector
# @return [self], or an Enumerator when no block is given
def delete_if
  return to_enum(:delete_if) unless block_given?

  kept = each_with_index.reject { |value, _label| yield(value) }
  keep_e, keep_i = kept.transpose

  # When every element is rejected, transposing the empty list leaves both
  # halves nil; substitute empty arrays so the vector ends up empty instead
  # of passing nil to the cast and to the index.
  @data = cast_vector_to(@dtype, keep_e || [])
  @index = DaruLite::Index.new(keep_i || [])

  update_position_cache

  self
end
688
+
689
+ # Keep an element if block returns true. Destructive.
690
# Inverse of #delete_if: keeps only the elements for which the block returns
# true. Without a block returns an Enumerator.
def keep_if
  if block_given?
    delete_if { |val| !yield(val) }
  else
    to_enum(:keep_if)
  end
end
695
+
696
+ # Reports all values that do not comply with a condition.
697
+ # Returns a hash with the index of data and the invalid data.
698
# Collects every value for which the block returns a falsy result.
#
# @yield [value] each element of the vector
# @return [Hash{Integer => Object}] position => offending value; or an
#   Enumerator when no block is given, matching the other block-taking
#   methods of this class instead of failing with LocalJumpError
def verify
  return to_enum(:verify) unless block_given?

  (0...size).each_with_object({}) do |position, failures|
    value = @data[position]
    failures[position] = value unless yield(value)
  end
end
704
+
705
+ # Return an Array with the data splitted by a separator.
706
+ # a=DaruLite::Vector.new(["a,b","c,d","a,b","d"])
707
+ # a.splitted
708
+ # =>
709
+ # [["a","b"],["c","d"],["a","b"],["d"]]
710
# Maps every element to an Array: nil stays nil, anything that responds to
# #split is split on +sep+, and any other value is wrapped in a one-element
# Array.
def splitted(sep = ',')
  @data.map do |item|
    next nil if item.nil?

    item.respond_to?(:split) ? item.split(sep) : [item]
  end
end
721
+
722
+ # Returns a hash of Vectors, defined by the different values
723
+ # defined on the fields
724
+ # Example:
725
+ #
726
+ # a=DaruLite::Vector.new(["a,b","c,d","a,b"])
727
+ # a.split_by_separator
728
+ # => {"a"=>#<DaruLite::Vector:0x7f2dbcc09d88
729
+ # @data=[1, 0, 1]>,
730
+ # "b"=>#<DaruLite::Vector:0x7f2dbcc09c48
731
+ # @data=[1, 0, 1]>,
732
+ # "c"=>#<DaruLite::Vector:0x7f2dbcc09b08
733
+ # @data=[0, 1, 0]>}
734
+ #
735
# Splits every element on +sep+ and returns a Hash that maps each distinct
# non-nil token to a vector built by calling split_value(token, row) for
# every row.
def split_by_separator(sep = ',')
  rows = splitted(sep)
  tokens = rows.flatten.uniq.compact
  tokens.to_h do |token|
    indicator = rows.map { |row| split_value(token, row) }
    [token, DaruLite::Vector.new(indicator)]
  end
end
745
+
746
# Same keys as #split_by_separator, with each vector reduced to the sum of
# its elements converted via to_i.
def split_by_separator_freq(sep = ',')
  indicators = split_by_separator(sep)
  indicators.to_h { |token, flags| [token, flags.sum(&:to_i)] }
end
751
+
752
# Replaces the index with the positions 0...size and returns self.
def reset_index!
  positions = (0...size).to_a
  @index = DaruLite::Index.new(positions)
  self
end
756
+
757
+ # Replace all nils in the vector with the value passed as an argument. Destructive.
758
+ # See #replace_nils for non-destructive version
759
+ #
760
+ # == Arguments
761
+ #
762
+ # * +replacement+ - The value which should replace all nils
763
# Overwrites, in place, every position reported by
# indexes(*DaruLite::MISSING_VALUES) with +replacement+; returns self.
def replace_nils!(replacement)
  missing_labels = indexes(*DaruLite::MISSING_VALUES)
  missing_labels.each { |label| self[label] = replacement }

  self
end
770
+
771
+ # Rolling fillna
772
+ # replace all Float::NAN and NIL values with the preceding or following value
773
+ #
774
+ # @param direction [Symbol] (:forward, :backward) whether replacement value is preceeding or following
775
+ #
776
+ # @example
777
+ # dv = DaruLite::Vector.new([1, 2, 1, 4, nil, Float::NAN, 3, nil, Float::NAN])
778
+ #
779
+ # 2.3.3 :068 > dv.rolling_fillna(:forward)
780
+ # => #<DaruLite::Vector(9)>
781
+ # 0 1
782
+ # 1 2
783
+ # 2 1
784
+ # 3 4
785
+ # 4 4
786
+ # 5 4
787
+ # 6 3
788
+ # 7 3
789
+ # 8 3
790
+ #
791
# Walks the index (forwards for :forward, backwards otherwise) and overwrites
# each invalid value with the most recent valid one. Until a valid value has
# been seen the filler is 0. Returns self.
def rolling_fillna!(direction = :forward)
  labels = direction == :forward ? index : index.reverse_each
  filler = 0
  labels.each do |label|
    current = self[label]
    if valid_value?(current)
      filler = current
    else
      self[label] = filler
    end
  end
  self
end
803
+
804
+ # Non-destructive version of rolling_fillna!
805
def rolling_fillna(direction = :forward)
  # Work on a copy so the receiver is left untouched.
  duplicate = dup
  duplicate.rolling_fillna!(direction)
end
808
+
809
+ # Lags the series by `k` periods.
810
+ #
811
+ # Lags the series by `k` periods, "shifting" data and inserting `nil`s
812
+ # from beginning or end of a vector, while preserving original vector's
813
+ # size.
814
+ #
815
+ # `k` can be positive or negative integer. If `k` is positive, `nil`s
816
+ # are inserted at the beginning of the vector, otherwise they are
817
+ # inserted at the end.
818
+ #
819
+ # @param [Integer] k "shift" the series by `k` periods. `k` can be
820
+ # positive or negative. (default = 1)
821
+ #
822
+ # @return [DaruLite::Vector] a new vector with "shifted" inital values
823
+ # and `nil` values inserted. The return vector is the same length
824
+ # as the orignal vector.
825
+ #
826
+ # @example Lag a vector with different periods `k`
827
+ #
828
+ # ts = DaruLite::Vector.new(1..5)
829
+ # # => [1, 2, 3, 4, 5]
830
+ #
831
+ # ts.lag # => [nil, 1, 2, 3, 4]
832
+ # ts.lag(1) # => [nil, 1, 2, 3, 4]
833
+ # ts.lag(2) # => [nil, nil, 1, 2, 3]
834
+ # ts.lag(-1) # => [2, 3, 4, 5, nil]
835
+ #
836
def lag(k = 1)
  return dup if k == 0 # rubocop:disable Style/NumericPredicate

  shifted =
    if (1...size).cover?(k)
      # Positive lag: pad the front; #copy trims the overflow at the end.
      ([nil] * k) + data.to_a
    elsif (-size..-1).cover?(k)
      # Negative lag: drop the leading values; #copy pads the end.
      data.to_a[k.abs...size]
    else
      # Shift of at least the whole length: nothing survives.
      []
    end
  copy(shifted)
end
847
+
848
# Builds a DataFrame with an :index entry (the index labels) and a :values
# entry (the data).
def detach_index
  labels = @index.to_a
  entries = @data.to_a
  DaruLite::DataFrame.new(index: labels, values: entries)
end
854
+
855
+ # Non-destructive version of #replace_nils!
856
def replace_nils(replacement)
  # Work on a copy so the receiver is left untouched.
  duplicate = dup
  duplicate.replace_nils!(replacement)
end
859
+
860
+ # number of non-missing elements
861
def n_valid
  total = size
  missing = indexes(*DaruLite::MISSING_VALUES).size
  total - missing
end
864
+ deprecate :n_valid, :count_values, 2016, 10
865
+
866
+ # Count the number of values specified
867
+ # @param values [Array] values to count for
868
+ # @return [Integer] the number of times the values mentioned occurs
869
+ # @example
870
+ # dv = DaruLite::Vector.new [1, 2, 1, 2, 3, 4, nil, nil]
871
+ # dv.count_values nil
872
+ # # => 2
873
def count_values(*values)
  matching_positions = positions(*values)
  matching_positions.size
end
876
+
877
+ # Returns *true* if an index exists
878
def has_index?(index)
  # Delegates the membership test to the index object.
  @index.include?(index)
end
881
+
882
+ # @param keys [Array] can be positions (if by_position is true) or indexes (if by_position if false)
883
+ # @return [DaruLite::Vector]
884
def get_sub_vector(keys, by_position: true)
  return DaruLite::Vector.new([]) if keys == []

  # Translate index labels to positions when labels were given.
  wanted = by_position ? keys : @index.pos(*keys)
  picked = at(*wanted)

  # #at may hand back a bare element; wrap it so a Vector is always returned.
  picked.is_a?(DaruLite::Vector) ? picked : DaruLite::Vector.new([picked])
end
894
+
895
+ # @return [DaruLite::DataFrame] the vector as a single-vector dataframe
896
def to_df
  # Single column keyed by the vector's name.
  source = { @name => @data }
  DaruLite::DataFrame.new(source, name: @name, index: @index)
end
899
+
900
+ # Convert Vector to a horizontal or vertical Ruby Matrix.
901
+ #
902
+ # == Arguments
903
+ #
904
+ # * +axis+ - Specify whether you want a *:horizontal* or a *:vertical* matrix.
905
def to_matrix(axis = :horizontal)
  # A single row for :horizontal, a single column for :vertical.
  return Matrix[to_a] if axis == :horizontal
  return Matrix.columns([to_a]) if axis == :vertical

  raise ArgumentError, "axis should be either :horizontal or :vertical, not #{axis}"
end
915
+
916
+ # Convert to hash (explicit). Hash keys are indexes and values are the corresponding elements
917
def to_h
  # Pair every index label with the element stored under it.
  @index.to_h do |label|
    [label, self[label]]
  end
end
920
+
921
+ # Return an array
922
def to_a
  # Delegates to the underlying data container.
  @data.to_a
end
925
+
926
+ # Convert the hash from to_h to json
927
# Serialises the hash returned by #to_h.
#
# Any arguments (the JSON library passes its generator state here when the
# vector is nested inside another structure) are forwarded to Hash#to_json
# so that options such as indentation or spacing are honoured.
def to_json(*args)
  to_h.to_json(*args)
end
930
+
931
+ # Convert to html for iruby
932
def to_html(threshold = 30)
  # The template is evaluated against this method's binding, so the locals
  # below keep their original names.
  # NOTE(review): presumably the template reads table_thead/table_tbody - confirm.
  table_thead = to_html_thead
  table_tbody = to_html_tbody(threshold)
  path =
    if index.is_a?(MultiIndex)
      File.expand_path('iruby/templates/vector_mi.html.erb', __dir__)
    else
      File.expand_path('iruby/templates/vector.html.erb', __dir__)
    end
  markup = File.read(path).strip
  ERB.new(markup).result(binding)
end
942
+
943
def to_html_thead
  # Pick the header template matching the index type.
  table_thead_path =
    if index.is_a?(MultiIndex)
      File.expand_path('iruby/templates/vector_mi_thead.html.erb', __dir__)
    else
      File.expand_path('iruby/templates/vector_thead.html.erb', __dir__)
    end
  markup = File.read(table_thead_path).strip
  ERB.new(markup).result(binding)
end
952
+
953
def to_html_tbody(threshold = 30)
  # +threshold+ is not used here directly; it stays visible to the template
  # through the binding.
  table_tbody_path =
    if index.is_a?(MultiIndex)
      File.expand_path('iruby/templates/vector_mi_tbody.html.erb', __dir__)
    else
      File.expand_path('iruby/templates/vector_tbody.html.erb', __dir__)
    end
  markup = File.read(table_tbody_path).strip
  ERB.new(markup).result(binding)
end
962
+
963
+ def to_s
964
+ "#<#{self.class}#{": #{@name}" if @name}(#{size})#{':category' if category?}>"
965
+ end
966
+
967
+ # Create a summary of the Vector
968
+ # @param indent_level [Fixnum] indent level
969
+ # @return [String] String containing the summary of the Vector
970
+ # @example
971
+ # dv = DaruLite::Vector.new [1, 2, 3]
972
+ # puts dv.summary
973
+ #
974
+ # # =
975
+ # # n :3
976
+ # # non-missing:3
977
+ # # median: 2
978
+ # # mean: 2.0000
979
+ # # std.dev.: 1.0000
980
+ # # std.err.: 0.5774
981
+ # # skew: 0.0000
982
+ # # kurtosis: -2.3333
983
def summary(indent_level = 0)
  non_missing = size - count_values(*DaruLite::MISSING_VALUES)
  report = "#{' =' * indent_level}= #{name}\n n :#{size}\n non-missing:#{non_missing}"

  # Only object and numeric vectors get a type-specific section.
  kind = type
  if kind == :object
    report += object_summary
  elsif kind == :numeric
    report += numeric_summary
  end

  # Indent every line after the first.
  report.split("\n").join("\n#{' ' * indent_level}")
end
996
+
997
+ # Displays summary for an object type Vector
998
+ # @return [String] String containing object vector summary
999
def object_summary
  # Shares are expressed relative to the number of non-missing values
  # (the same quantity #summary reports as "non-missing").
  nval = size - count_values(*DaruLite::MISSING_VALUES)
  summary = "\n factors: #{factors.to_a.join(',')}" \
            "\n mode: #{mode.to_a.join(',')}" \
            "\n Distribution\n"

  # each_with_index yields (count, category) for the sorted frequencies.
  data = frequencies.sort.each_with_index.map do |count, category|
    # Guard against division by zero when nothing is non-missing.
    share = nval.zero? ? 1 : count.quo(nval)
    [category, count, format('%0.2f%%', share * 100)]
  end

  summary + Formatters::Table.format(data)
end
1011
+
1012
+ # Displays summary for an numeric type Vector
1013
+ # @return [String] String containing numeric vector summary
1014
def numeric_summary
  sections = ["\n median: #{median}", format("\n mean: %0.4f", mean)]

  # Dispersion figures are only printed when a standard deviation exists.
  if sd
    sections << format("\n std.dev.: %0.4f", sd)
    sections << format("\n std.err.: %0.4f", se)
  end

  # Shape statistics are only printed when no value is missing.
  if count_values(*DaruLite::MISSING_VALUES).zero?
    sections << format("\n skew: %0.4f", skew)
    sections << format("\n kurtosis: %0.4f", kurtosis)
  end

  sections.join
end
1028
+
1029
+ # Over rides original inspect for pretty printing in irb
1030
+ def inspect(spacing = 20, threshold = 15)
1031
+ row_headers = index.is_a?(MultiIndex) ? index.sparse_tuples : index.to_a
1032
+
1033
+ "#<#{self.class}(#{size})#{':category' if category?}>\n" +
1034
+ Formatters::Table.format(
1035
+ to_a.lazy.map { |v| [v] },
1036
+ headers: @name && [@name],
1037
+ row_headers: row_headers,
1038
+ threshold: threshold,
1039
+ spacing: spacing
1040
+ )
1041
+ end
1042
+
1043
+ # Sets new index for vector. Preserves index->value correspondence.
1044
+ # Sets nil for new index keys absent from original index.
1045
+ # @note Unlike #reorder! which takes positions as input it takes
1046
+ # index as an input to reorder the vector
1047
+ # @param [DaruLite::Index, DaruLite::MultiIndex] new_index new index to order with
1048
+ # @return [DaruLite::Vector] vector reindexed with new index
1049
def reindex!(new_index)
  reordered = []
  each_with_index do |value, label|
    # Labels missing from the new index are dropped.
    next unless new_index.include?(label)

    reordered[new_index[label]] = value
  end

  # Pad with nils up to the size of the new index.
  padding = new_index.size - reordered.size
  reordered.fill(nil, reordered.size, padding)

  @data = cast_vector_to(@dtype, reordered)
  @index = new_index

  update_position_cache

  self
end
1063
+
1064
+ # Reorder the vector with given positions
1065
+ # @note Unlike #reindex! which takes index as input, it takes
1066
+ # positions as an input to reorder the vector
1067
+ # @param [Array] order the order to reorder the vector with
1068
+ # @return reordered vector
1069
+ # @example
1070
+ # dv = DaruLite::Vector.new [3, 2, 1], index: ['c', 'b', 'a']
1071
+ # dv.reorder! [2, 1, 0]
1072
+ # # => #<DaruLite::Vector(3)>
1073
+ # # a 1
1074
+ # # b 2
1075
+ # # c 3
1076
def reorder!(order)
  # Pick the data in the requested positional order and reorder the index
  # the same way.
  rearranged = order.map { |position| @data[position] }
  @index = @index.reorder(order)
  @data = cast_vector_to(@dtype, rearranged, @nm_dtype)
  update_position_cache
  self
end
1083
+
1084
+ # Non-destructive version of #reorder!
1085
def reorder(order)
  # Work on a copy so the receiver is left untouched.
  duplicate = dup
  duplicate.reorder!(order)
end
1088
+
1089
+ # Create a new vector with a different index, and preserve the indexing of
1090
+ # current elements.
1091
def reindex(new_index)
  # Work on a copy so the receiver is left untouched.
  duplicate = dup
  duplicate.reindex!(new_index)
end
1094
+
1095
# Assigns a new index after coercing the argument with Index.coerce.
# Raises ArgumentError when the coerced index has a different size than the
# vector or is not a DaruLite::Index.
def index=(idx)
  coerced = Index.coerce(idx)

  unless coerced.size == size
    raise ArgumentError, "Size of supplied index #{coerced.size} does not match size of Vector"
  end
  raise ArgumentError, 'Can only assign type Index and its subclasses.' unless coerced.is_a?(DaruLite::Index)

  @index = coerced
  self
end
1104
+
1105
+ # Give the vector a new name
1106
+ #
1107
+ # @param new_name [Symbol] The new name.
1108
def rename(new_name)
  # Stores the new name and returns the vector itself.
  tap { @name = new_name }
end
1112
+
1113
+ alias name= rename
1114
+
1115
+ # Duplicated a vector
1116
+ # @return [DaruLite::Vector] duplicated vector
1117
def dup
  # Both the data and the index are duplicated; the name is reused as is.
  data_copy = @data.dup
  name = @name
  index_copy = @index.dup
  DaruLite::Vector.new(data_copy, name: name, index: index_copy)
end
1120
+
1121
+ # == Bootstrap
1122
+ # Generate +nr+ resamples (with replacement) of size +s+
1123
+ # from vector, computing each estimate from +estimators+
1124
+ # over each resample.
1125
+ # +estimators+ could be
1126
+ # a) Hash with variable names as keys and lambdas as values
1127
+ # a.bootstrap(:log_s2=>lambda {|v| Math.log(v.variance)},1000)
1128
+ # b) Array with names of method to bootstrap
1129
+ # a.bootstrap([:mean, :sd],1000)
1130
+ # c) A single method to bootstrap
1131
+ # a.bootstrap(:mean, 1000)
1132
+ # If s is nil, is set to vector size by default.
1133
+ #
1134
+ # Returns a DataFrame where each vector is a vector
1135
+ # of length +nr+ containing the computed resample estimates.
1136
def bootstrap(estimators, nr, s = nil)
  sample_size = s || size
  estimator_fns, names, draws = prepare_bootstrap(estimators)

  # Draw nr resamples and record every estimator's value on each of them.
  nr.times do
    resample = sample_with_replacement(sample_size)
    names.each { |est| draws[est] << estimator_fns[est].call(resample) }
  end

  # Wrap the collected estimates in vectors before building the frame.
  names.each { |est| draws[est] = DaruLite::Vector.new(draws[est]) }

  DaruLite::DataFrame.new(draws)
end
1153
+
1154
+ # == Jacknife
1155
+ # Returns a dataset with jacknife delete-+k+ +estimators+
1156
+ # +estimators+ could be:
1157
+ # a) Hash with variable names as keys and lambdas as values
1158
+ # a.jackknife(:log_s2=>lambda {|v| Math.log(v.variance)})
1159
+ # b) Array with method names to jacknife
1160
+ # a.jackknife([:mean, :sd])
1161
+ # c) A single method to jacknife
1162
+ # a.jackknife(:mean)
1163
+ # +k+ represent the block size for block jacknife. By default
1164
+ # is set to 1, for classic delete-one jacknife.
1165
+ #
1166
+ # Returns a dataset where each vector is an vector
1167
+ # of length +cases+/+k+ containing the computed jacknife estimates.
1168
+ #
1169
+ # == Reference:
1170
+ # * Sawyer, S. (2005). Resampling Data: Using a Statistical Jacknife.
1171
def jackknife(estimators, k = 1) # rubocop:disable Metrics/MethodLength
  raise "n should be divisible by k:#{k}" unless (size % k).zero?

  block_count = (size / k).to_i
  estimator_fns, names, pseudo_values = prepare_bootstrap(estimators)

  # Estimates computed on the complete vector.
  full_estimates = names.to_h { |est| [est, estimator_fns[est].call(self)] }

  block_count.times do |block_no|
    # Leave out the block_no-th block of k consecutive values.
    remaining = @data.dup
    remaining.slice!(block_no * k, k)
    reduced = DaruLite::Vector.new(remaining)

    names.each do |est|
      partial = estimator_fns[est].call(reduced)
      # Pseudovalue: nb * full estimate - (nb - 1) * leave-block-out estimate
      pseudo_values[est].push((block_count * full_estimates[est]) - ((block_count - 1) * partial))
    end
  end

  names.each { |est| pseudo_values[est] = DaruLite::Vector.new(pseudo_values[est]) }
  DaruLite::DataFrame.new(pseudo_values)
end
1197
+
1198
# Tests every element of the vector against +regexp+.
#
# @param regexp [Regexp] A regular matching expression. For example, +/weeks/+.
#
# @return [Array<Boolean>] +true+ for each element that matches +regexp+,
#   +false+ for every other element (including elements that do not
#   support pattern matching, such as numbers)
#
# @example
#   dv = DaruLite::Vector.new(['3 days', '5 weeks', '2 weeks'])
#   dv.match(/weeks/)
#
#   # => [false, true, true]
def match(regexp)
  @data.map do |value|
    # Object#=~ was removed in Ruby 3.2, so elements that do not implement
    # =~ themselves count as "no match" instead of raising NoMethodError.
    value.respond_to?(:=~) && !!(value =~ regexp)
  end
end
1213
+
1214
+ # Creates a new vector consisting only of non-nil data
1215
+ #
1216
+ # == Arguments
1217
+ #
1218
+ # @param as_a [Symbol] Passing :array will return only the elements
1219
+ # as an Array. Otherwise will return a DaruLite::Vector.
1220
+ #
1221
+ # @param _duplicate [Boolean] In case no missing data is found in the
1222
+ # vector, setting this to false will return the same vector.
1223
+ # Otherwise, a duplicate will be returned irrespective of
1224
+ # presence of missing data.
1225
+
1226
def only_valid(as_a = :vector, _duplicate = true)
  # FIXME: Now duplicate is just ignored.
  #   There are no spec that fail on this case, so I'll leave it
  #   this way for now - zverok, 2016-05-07

  kept_labels = @index.to_a - indexes(*DaruLite::MISSING_VALUES)
  kept_values = kept_labels.map { |label| self[label] }

  # Anything other than :vector yields the plain array of values.
  return kept_values unless as_a == :vector

  DaruLite::Vector.new(kept_values, index: kept_labels, name: @name, dtype: dtype)
end
1240
+ deprecate :only_valid, :reject_values, 2016, 10
1241
+
1242
+ # Return a vector with specified values removed
1243
+ # @param values [Array] values to reject from resultant vector
1244
+ # @return [DaruLite::Vector] vector with specified values removed
1245
+ # @example
1246
+ # dv = DaruLite::Vector.new [1, 2, nil, Float::NAN]
1247
+ # dv.reject_values nil, Float::NAN
1248
+ # # => #<DaruLite::Vector(2)>
1249
+ # # 0 1
1250
+ # # 1 2
1251
def reject_values(*values)
  kept_positions = size.times.to_a - positions(*values)
  selection = at(*kept_positions)
  return selection if selection.is_a?(DaruLite::Vector)

  # A single remaining position makes #at return a bare element;
  # ask again with a one-element range to get a vector.
  only = kept_positions.first
  at(only..only)
end
1263
+
1264
+ # Return indexes of values specified
1265
+ # @param values [Array] values to find indexes for
1266
+ # @return [Array] array of indexes of values specified
1267
+ # @example
1268
+ # dv = DaruLite::Vector.new [1, 2, nil, Float::NAN], index: 11..14
1269
+ # dv.indexes nil, Float::NAN
1270
+ # # => [13, 14]
1271
def indexes(*values)
  # Translate the matching positions into their index labels.
  labels = index.to_a
  positions(*values).map { |position| labels[position] }
end
1274
+
1275
+ # Replaces specified values with a new value
1276
+ # @param [Array] old_values array of values to replace
1277
+ # @param [object] new_value new value to replace with
1278
+ # @note It performs the replace in place.
1279
+ # @return [DaruLite::Vector] Same vector itself with values
1280
+ # replaced with new value
1281
+ # @example
1282
+ # dv = DaruLite::Vector.new [1, 2, :a, :b]
1283
+ # dv.replace_values [:a, :b], nil
1284
+ # dv
1285
+ # # =>
1286
+ # # #<DaruLite::Vector:19903200 @name = nil @metadata = {} @size = 4 >
1287
+ # # nil
1288
+ # # 0 1
1289
+ # # 1 2
1290
+ # # 2 nil
1291
+ # # 3 nil
1292
def replace_values(old_values, new_value)
  # Accept a single value as well as an array of values.
  targets = old_values.is_a?(Array) ? old_values : [old_values]

  (0...size).each do |pos|
    next unless include_with_nan?(targets, at(pos))

    set_at([pos], new_value)
  end
  self
end
1299
+
1300
+ # Returns a Vector containing only missing data (preserves indexes).
1301
def only_missing(as_a = :vector)
  # Any other selector yields nil.
  return unless %i[vector array].include?(as_a)

  missing = self[*indexes(*DaruLite::MISSING_VALUES)]
  as_a == :array ? missing.to_a : missing
end
1309
+ deprecate :only_missing, nil, 2016, 10
1310
+
1311
+ # Returns a Vector with only numerical data. Missing data is included
1312
+ # but non-Numeric objects are excluded. Preserves index.
1313
def only_numerics
  # Collect the labels of numeric and nil elements, in index order.
  numeric_labels = []
  each_with_index do |value, label|
    numeric_labels << label if value.nil? || value.is_a?(Numeric)
  end

  self[*numeric_labels]
end
1321
+
1322
# Whole-string match for dd-mm-yyyy or yyyy-mm-dd. Anchored with \A/\z so a
# date on one line of a multi-line string does not qualify.
DATE_REGEXP = /\A(\d{2}-\d{2}-\d{4}|\d{4}-\d{2}-\d{2})\z/.freeze

# Returns the database type for the vector, according to its content.
#
# Checks run in this order, each against the string form of the elements:
# any date-like element => 'DATE'; any character outside digits, 'e', '.'
# and '-' => 'VARCHAR (255)'; any '.' => 'DOUBLE'; otherwise 'INTEGER'.
#
# @return [String] the database column type
def db_type
  if @data.any? { |v| DATE_REGEXP.match?(v.to_s) }
    'DATE'
  elsif @data.any? { |v| /[^0-9e.-]/.match?(v.to_s) }
    'VARCHAR (255)'
  elsif @data.any? { |v| v.to_s.include?('.') }
    'DOUBLE'
  else
    'INTEGER'
  end
end
1337
+
1338
+ # Copies the structure of the vector (i.e the index, size, etc.) and fills all
1339
+ # all values with nils.
1340
def clone_structure
  # Same name and a duplicated index, with every value set to nil.
  blanks = [nil] * size
  index_copy = @index.dup
  DaruLite::Vector.new(blanks, name: @name, index: index_copy)
end
1343
+
1344
+ # Save the vector to a file
1345
+ #
1346
+ # == Arguments
1347
+ #
1348
+ # * filename - Path of file where the vector is to be saved
1349
def save(filename)
  # Persistence is delegated to DaruLite::IO.
  DaruLite::IO.save(self, filename)
end
1352
+
1353
+ def _dump(*) # :nodoc:
1354
+ Marshal.dump(
1355
+ data: @data.to_a,
1356
+ dtype: @dtype,
1357
+ name: @name,
1358
+ index: @index
1359
+ )
1360
+ end
1361
+
1362
+ # :nocov:
1363
def daru_lite_vector(*)
  # Identity conversion: any arguments are ignored and the receiver is
  # returned unchanged.
  self
end
1366
+ # :nocov:
1367
+
1368
+ alias dv daru_lite_vector
1369
+
1370
+ # Converts a non category type vector to category type vector.
1371
+ # @param [Hash] opts options to convert to category
1372
+ # @option opts [true, false] :ordered Specify if vector is ordered or not.
1373
+ # If it is ordered, it can be sorted and min, max like functions would work
1374
+ # @option opts [Array] :categories set categories in the specified order
1375
+ # @return [DaruLite::Vector] vector with type category
1376
def to_category(opts = {})
  categorical = DaruLite::Vector.new(to_a, type: :category, name: @name, index: @index)
  categorical.tap do |vector|
    # Unordered unless :ordered is given; categories only when supplied.
    vector.ordered = opts[:ordered] || false
    vector.categories = opts[:categories] if opts[:categories]
  end
end
1382
+
1383
def method_missing(name, *args, &block)
  # FIXME: it is shamefully fragile. Should be either made stronger
  #   (string/symbol dychotomy, informative errors) or removed totally. - zverok
  #
  # A name is a setter only when it ends in '=' with something in front of
  # it and is not a comparison operator; otherwise a missing `<=` or `>=`
  # would be taken for an assignment to the index `:<` / `:>`.
  comparison = %i[<= >= == != ===].include?(name)
  setter = comparison ? nil : /\A(.+)=\z/.match(name.to_s)

  if setter
    self[setter[1].to_sym] = args[0]
  elsif has_index?(name)
    self[name]
  else
    super
  end
end
1394
+
1395
# Reports the dynamic methods served through method_missing: setters
# (a name ending in '=' with something in front of it, comparison operators
# excluded) and readers named after an existing index.
def respond_to_missing?(name, include_private = false)
  comparison = %w[<= >= == != ===].include?(name.to_s)
  setter = !comparison && name.to_s.match?(/\A.+=\z/)
  setter || has_index?(name) || super
end
1398
+
1399
+ # Partition a numeric variable into categories.
1400
+ # @param [Array<Numeric>] partitions an array whose consecutive elements
1401
+ # provide intervals for categories
1402
+ # @param [Hash] opts options to cut the partition
1403
+ # @option opts [:left, :right] :close_at specifies whether the interval closes at
1404
+ # the right side or left side
1405
+ # @option opts [Array] :labels names of the categories
1406
+ # @return [DaruLite::Vector] numeric variable converted to categorical variable
1407
+ # @example
1408
+ # heights = DaruLite::Vector.new [30, 35, 32, 50, 42, 51]
1409
+ # height_cat = heights.cut [30, 40, 50, 60], labels: ['low', 'medium', 'high']
1410
+ # # => #<DaruLite::Vector(6)>
1411
+ # # 0 low
1412
+ # # 1 low
1413
+ # # 2 low
1414
+ # # 3 high
1415
+ # # 4 medium
1416
+ # # 5 high
1417
def cut(partitions, opts = {})
  side = opts[:close_at] || :right
  bounds = partitions.to_a

  # Label each value with its interval, then list all possible intervals.
  assigned = to_a.map { |val| cut_find_category(bounds, val, side) }
  cats = cut_categories(bounds, side)

  categorical = DaruLite::Vector.new(
    assigned,
    index: @index,
    type: :category,
    categories: cats
  )

  # Rename categories if new labels provided
  new_names = opts[:labels]
  return categorical unless new_names

  categorical.rename_categories(cats.zip(new_names).to_h)
end
1436
+
1437
# Returns the positions (0-based, ascending) of the elements equal to any of
# +values+. NaN is matched by NaN.
#
# @param values [Array] values to look for
# @return [Array<Integer>] matching positions in ascending order
def positions(*values)
  case values
  when [nil]
    nil_positions
  when [Float::NAN]
    nan_positions
  when [nil, Float::NAN], [Float::NAN, nil]
    # Merge the two cached lists and restore ascending order so the result
    # is ordered like the one produced by the generic branch below.
    (nil_positions + nan_positions).sort
  else
    size.times.select { |i| include_with_nan? values, @data[i] }
  end
end
1449
+
1450
def group_by(*args)
  # Grouping is performed on the single-vector data frame.
  frame = to_df
  frame.group_by(*args)
end
1453
+
1454
+ private
1455
+
1456
def copy(values)
  # Pad +values+ (in place) with nils up to the vector's size, then build a
  # vector of exactly that size with the same index and name.
  shortfall = size - values.size
  values.concat([nil] * shortfall) if shortfall.positive?
  DaruLite::Vector.new(values[0...size], index: @index, name: @name)
end
1461
+
1462
def nil_positions
  # Cached in @nil_positions once computed.
  @nil_positions ||= size.times.select { |i| @data[i].nil? }
end
1466
+
1467
def nan_positions
  # Cached in @nan_positions once computed.
  @nan_positions ||= size.times.select do |i|
    element = @data[i]
    element.respond_to?(:nan?) && element.nan?
  end
end
1473
+
1474
+ # Helper method returning validity of arbitrary value
1475
def valid_value?(v)
  # nil and NaN are the two invalid cases.
  return false if v.nil?

  !(v.respond_to?(:nan?) && v.nan?)
end
1478
+
1479
def initialize_vector(source, opts)
  labels, values = parse_source(source, opts)
  set_name opts[:name]

  @data = cast_vector_to(opts[:dtype] || :array, values, opts[:nm_dtype])
  # Without explicit labels the index is coerced from the data size.
  @index = Index.coerce(labels || @data.size)

  guard_sizes!

  @possibly_changed_type = true
end
1490
+
1491
# Returns an [index, data] pair: a Hash source supplies both through its keys
# and values; otherwise the index comes from opts[:index] and a nil source
# becomes an empty array.
def parse_source(source, opts)
  return [source.keys, source.values] if source.is_a?(Hash)

  [opts[:index], source || []]
end
1498
+
1499
def guard_sizes!
  return if @index.size == @data.size

  if @index.size < @data.size
    raise IndexError, "Expected index size >= vector size. Index size : #{@index.size}, vector size : #{@data.size}"
  end

  # Index is longer than the data: pad the data with nils.
  cast(dtype: :array) # NM with nils seg faults
  @data.fill(nil, @data.size...@index.size)
end
1507
+
1508
# Sets @possibly_changed_type when an object vector receives nil or a number,
# or a numeric vector receives a non-nil, non-numeric value.
def guard_type_check(value)
  numeric_like = value.nil? || value.is_a?(Numeric)
  type_may_change = (object? && numeric_like) || (numeric? && !numeric_like)
  @possibly_changed_type = true if type_may_change
end
1513
+
1514
# nil stays nil; otherwise 1 when +v+ includes +key+ and 0 when it does not.
def split_value(key, v)
  return nil if v.nil?

  v.include?(key) ? 1 : 0
end
1523
+
1524
+ # For an array or hash of estimators methods, returns
1525
+ # an array with three elements
1526
+ # 1.- A hash with estimators names as keys and lambdas as values
1527
+ # 2.- An array with estimators names
1528
+ # 3.- A Hash with estimators names as keys and empty arrays as values
1529
def prepare_bootstrap(estimators)
  # A lone estimator is treated as a one-element list.
  given = estimators.is_a?(Array) || estimators.is_a?(Hash) ? estimators : [estimators]

  # Method names become lambdas that call the method on a fresh vector.
  callables =
    if given.is_a?(Array)
      given.to_h do |est|
        [est, ->(v) { DaruLite::Vector.new(v).send(est) }]
      end
    else
      given
    end

  names = callables.keys
  collectors = names.to_h { |est| [est, []] }

  [callables, names, collectors]
end
1542
+
1543
+ # NOTE: To maintain sanity, this _MUST_ be the _ONLY_ place in daru where the
1544
+ # @dtype variable is set and the underlying data type of vector changed.
1545
def cast_vector_to(dtype, source = nil, _nm_dtype = nil)
  # Default to the current data when no source is given.
  values = source.nil? ? @data.to_a : source

  raise NotImplementedError, 'MDArray not yet supported.' if dtype == :mdarray
  raise ArgumentError, "Unknown dtype #{dtype}" unless dtype == :array

  wrapped = DaruLite::Accessors::ArrayWrapper.new(values, self)
  @dtype = dtype
  wrapped
end
1558
+
1559
def set_name(name) # rubocop:disable Naming/AccessorMethodName
  @name =
    if name.is_a?(Array)
      name.join # join in case of MultiIndex tuple
    else
      name
    end
end
1562
+
1563
+ # Raises IndexError when one of the positions is an invalid position
1564
# Raises IndexError when one of the positions lies outside the vector.
# Valid positions run from -size (counting from the end) to size - 1.
#
# @param positions [Array<Integer>] positions to check
# @raise [IndexError] for the first out-of-range position
def validate_positions(*positions)
  positions.each do |pos|
    raise IndexError, "#{pos} is not a valid position." if pos >= size || pos < -size
  end
end
1569
+
1570
+ # coerce ranges, integers and array in appropriate ways
1571
def coerce_positions(*positions)
  # Several positions (or none) are passed through as given.
  return positions unless positions.size == 1

  only = positions.first
  if only.is_a?(Integer)
    only
  elsif only.is_a?(Range)
    # Expand the range against the list of all positions.
    size.times.to_a[only]
  else
    raise ArgumentError, 'Unkown position type.'
  end
end
1585
+
1586
+ # Helper method for []=.
1587
+ # Assigs existing index to another value
1588
def modify_vector(indexes, val)
  located = @index.pos(*indexes)

  # Index#pos gives either one position or a collection of them.
  return located.each { |position| @data[position] = val } unless located.is_a?(Numeric)

  @data[located] = val
end
1597
+
1598
+ # Helper method for []=.
1599
+ # Add a new index and assign it value
1600
def insert_vector(indexes, val)
  expanded = @index.add(*indexes)
  # Append one copy of val for every label the index gained.
  added = expanded.size - @index.size
  added.times { @data << val }
  @index = expanded
end
1606
+
1607
+ # Works similar to #[]= but also insert the vector in case index is not valid
1608
+ # It is there only to be accessed by DaruLite::DataFrame and not meant for user.
1609
def set(indexes, val)
  cast(dtype: :array) if val.nil? && dtype != :array
  guard_type_check(val)

  # Overwrite when every label is already known, otherwise extend the vector.
  writer = @index.valid?(*indexes) ? :modify_vector : :insert_vector
  send(writer, indexes, val)

  update_position_cache
end
1621
+
1622
# Returns the interval label of +val+ for the boundaries in +partitions+.
#
# With close_at :right the label is "<left>-<right - 1>"; with :left it is
# "<left + 1>-<right>".
#
# @raise [ArgumentError] 'Invalid partition' when +val+ lies outside every
#   interval, or when +close_at+ is neither :right nor :left
def cut_find_category(partitions, val, close_at)
  case close_at
  when :right
    right_index = partitions.index { |i| i > val }
    # nil: no boundary above val. 0: val lies below the first boundary, and
    # left_index would be -1 and wrap around to the last boundary.
    raise ArgumentError, 'Invalid partition' if right_index.nil? || right_index.zero?

    left_index = right_index - 1
    "#{partitions[left_index]}-#{partitions[right_index] - 1}"
  when :left
    right_index = partitions.index { |i| i >= val }
    raise ArgumentError, 'Invalid partition' if right_index.nil? || right_index.zero?

    left_index = right_index - 1
    "#{partitions[left_index] + 1}-#{partitions[right_index]}"
  else
    raise ArgumentError, "Invalid parameter #{close_at} to close_at."
  end
end
1640
+
1641
# Lists the interval labels for consecutive boundaries in +partitions+,
# using the same label format as #cut_find_category.
#
# @raise [ArgumentError] when +close_at+ is neither :right nor :left
def cut_categories(partitions, close_at)
  case close_at
  when :right
    Array.new(partitions.size - 1) do |left_index|
      "#{partitions[left_index]}-#{partitions[left_index + 1] - 1}"
    end
  when :left
    Array.new(partitions.size - 1) do |left_index|
      "#{partitions[left_index] + 1}-#{partitions[left_index + 1]}"
    end
  else
    # Same error as #cut_find_category, instead of silently returning nil.
    raise ArgumentError, "Invalid parameter #{close_at} to close_at."
  end
end
1653
+
1654
def include_with_nan?(array, value)
  # Like Array#include?, except that a NaN value matches any NaN in array
  # (NaN is never == to itself, so include? alone would miss it).
  nan = ->(candidate) { candidate.respond_to?(:nan?) && candidate.nan? }
  return array.any?(&nan) if nan.call(value)

  array.include?(value)
end
1663
+
1664
def update_position_cache
  # Clear both cached position lists.
  @nil_positions = @nan_positions = nil
end
1668
+
1669
def resort_index(vector_index, opts)
  # Sort the [value, index] pairs by value: with the caller's block when one
  # is given, with DEFAULT_SORTER otherwise.
  sorted =
    if block_given?
      vector_index.sort { |(left, _li), (right, _ri)| yield(left, right) }
    else
      vector_index.sort(&DEFAULT_SORTER)
    end

  sorted.reverse! unless opts[:ascending]
  sorted
end
1677
+ end
1678
+ end