daru_lite 0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (149) hide show
  1. checksums.yaml +7 -0
  2. data/.github/ISSUE_TEMPLATE.md +18 -0
  3. data/.github/workflows/ci.yml +33 -0
  4. data/.gitignore +10 -0
  5. data/.rspec +2 -0
  6. data/.rubocop.yml +27 -0
  7. data/.rubocop_todo.yml +137 -0
  8. data/CONTRIBUTING.md +47 -0
  9. data/Gemfile +2 -0
  10. data/History.md +4 -0
  11. data/LICENSE +24 -0
  12. data/README.md +218 -0
  13. data/Rakefile +69 -0
  14. data/ReleasePolicy.md +20 -0
  15. data/benchmarks/TradeoffData.csv +65 -0
  16. data/benchmarks/csv_reading.rb +22 -0
  17. data/benchmarks/dataframe_creation.rb +39 -0
  18. data/benchmarks/db_loading.rb +34 -0
  19. data/benchmarks/duplicating.rb +45 -0
  20. data/benchmarks/group_by.rb +32 -0
  21. data/benchmarks/joining.rb +52 -0
  22. data/benchmarks/row_access.rb +41 -0
  23. data/benchmarks/row_assign.rb +36 -0
  24. data/benchmarks/sorting.rb +51 -0
  25. data/benchmarks/statistics.rb +28 -0
  26. data/benchmarks/vector_access.rb +31 -0
  27. data/benchmarks/vector_assign.rb +42 -0
  28. data/benchmarks/where_clause.rb +48 -0
  29. data/benchmarks/where_vs_filter.rb +28 -0
  30. data/daru_lite.gemspec +55 -0
  31. data/images/README.md +5 -0
  32. data/images/con0.png +0 -0
  33. data/images/con1.png +0 -0
  34. data/images/init0.png +0 -0
  35. data/images/init1.png +0 -0
  36. data/images/man0.png +0 -0
  37. data/images/man1.png +0 -0
  38. data/images/man2.png +0 -0
  39. data/images/man3.png +0 -0
  40. data/images/man4.png +0 -0
  41. data/images/man5.png +0 -0
  42. data/images/man6.png +0 -0
  43. data/lib/daru_lite/accessors/array_wrapper.rb +109 -0
  44. data/lib/daru_lite/accessors/dataframe_by_row.rb +25 -0
  45. data/lib/daru_lite/accessors/mdarray_wrapper.rb +7 -0
  46. data/lib/daru_lite/category.rb +929 -0
  47. data/lib/daru_lite/configuration.rb +34 -0
  48. data/lib/daru_lite/core/group_by.rb +403 -0
  49. data/lib/daru_lite/core/merge.rb +270 -0
  50. data/lib/daru_lite/core/query.rb +109 -0
  51. data/lib/daru_lite/dataframe.rb +3080 -0
  52. data/lib/daru_lite/date_time/index.rb +569 -0
  53. data/lib/daru_lite/date_time/offsets.rb +397 -0
  54. data/lib/daru_lite/exceptions.rb +2 -0
  55. data/lib/daru_lite/extensions/which_dsl.rb +53 -0
  56. data/lib/daru_lite/formatters/table.rb +52 -0
  57. data/lib/daru_lite/helpers/array.rb +53 -0
  58. data/lib/daru_lite/index/categorical_index.rb +201 -0
  59. data/lib/daru_lite/index/index.rb +374 -0
  60. data/lib/daru_lite/index/multi_index.rb +374 -0
  61. data/lib/daru_lite/io/csv/converters.rb +21 -0
  62. data/lib/daru_lite/io/io.rb +294 -0
  63. data/lib/daru_lite/io/sql_data_source.rb +97 -0
  64. data/lib/daru_lite/iruby/helpers.rb +38 -0
  65. data/lib/daru_lite/iruby/templates/dataframe.html.erb +5 -0
  66. data/lib/daru_lite/iruby/templates/dataframe_mi.html.erb +5 -0
  67. data/lib/daru_lite/iruby/templates/dataframe_mi_tbody.html.erb +35 -0
  68. data/lib/daru_lite/iruby/templates/dataframe_mi_thead.html.erb +21 -0
  69. data/lib/daru_lite/iruby/templates/dataframe_tbody.html.erb +28 -0
  70. data/lib/daru_lite/iruby/templates/dataframe_thead.html.erb +21 -0
  71. data/lib/daru_lite/iruby/templates/multi_index.html.erb +12 -0
  72. data/lib/daru_lite/iruby/templates/vector.html.erb +5 -0
  73. data/lib/daru_lite/iruby/templates/vector_mi.html.erb +5 -0
  74. data/lib/daru_lite/iruby/templates/vector_mi_tbody.html.erb +26 -0
  75. data/lib/daru_lite/iruby/templates/vector_mi_thead.html.erb +8 -0
  76. data/lib/daru_lite/iruby/templates/vector_tbody.html.erb +17 -0
  77. data/lib/daru_lite/iruby/templates/vector_thead.html.erb +8 -0
  78. data/lib/daru_lite/maths/arithmetic/dataframe.rb +91 -0
  79. data/lib/daru_lite/maths/arithmetic/vector.rb +117 -0
  80. data/lib/daru_lite/maths/statistics/dataframe.rb +202 -0
  81. data/lib/daru_lite/maths/statistics/vector.rb +1019 -0
  82. data/lib/daru_lite/monkeys.rb +56 -0
  83. data/lib/daru_lite/vector.rb +1678 -0
  84. data/lib/daru_lite/version.rb +3 -0
  85. data/lib/daru_lite.rb +99 -0
  86. data/profile/_base.rb +23 -0
  87. data/profile/df_to_a.rb +10 -0
  88. data/profile/filter.rb +13 -0
  89. data/profile/joining.rb +13 -0
  90. data/profile/sorting.rb +12 -0
  91. data/profile/vector_each_with_index.rb +9 -0
  92. data/profile/vector_new.rb +9 -0
  93. data/spec/accessors/array_wrapper_spec.rb +3 -0
  94. data/spec/category_spec.rb +1741 -0
  95. data/spec/core/group_by_spec.rb +655 -0
  96. data/spec/core/merge_spec.rb +179 -0
  97. data/spec/core/query_spec.rb +347 -0
  98. data/spec/daru_lite_spec.rb +22 -0
  99. data/spec/dataframe_spec.rb +4330 -0
  100. data/spec/date_time/data_spec.rb +197 -0
  101. data/spec/date_time/date_time_index_helper_spec.rb +72 -0
  102. data/spec/date_time/index_spec.rb +588 -0
  103. data/spec/date_time/offsets_spec.rb +465 -0
  104. data/spec/extensions/which_dsl_spec.rb +38 -0
  105. data/spec/fixtures/bank2.dat +200 -0
  106. data/spec/fixtures/boolean_converter_test.csv +5 -0
  107. data/spec/fixtures/countries.json +7794 -0
  108. data/spec/fixtures/duplicates.csv +32 -0
  109. data/spec/fixtures/eciresults.html +394 -0
  110. data/spec/fixtures/empties.dat +2 -0
  111. data/spec/fixtures/empty_rows_test.csv +17 -0
  112. data/spec/fixtures/macau.html +3691 -0
  113. data/spec/fixtures/macd_data.csv +150 -0
  114. data/spec/fixtures/matrix_test.csv +100 -0
  115. data/spec/fixtures/moneycontrol.html +6812 -0
  116. data/spec/fixtures/music_data.tsv +2501 -0
  117. data/spec/fixtures/repeated_fields.csv +7 -0
  118. data/spec/fixtures/sales-funnel.csv +18 -0
  119. data/spec/fixtures/scientific_notation.csv +4 -0
  120. data/spec/fixtures/string_converter_test.csv +5 -0
  121. data/spec/fixtures/strings.dat +2 -0
  122. data/spec/fixtures/test_xls.xls +0 -0
  123. data/spec/fixtures/test_xls_2.xls +0 -0
  124. data/spec/fixtures/url_test.txt~ +0 -0
  125. data/spec/fixtures/valid_markup.html +62 -0
  126. data/spec/fixtures/wiki_climate.html +1243 -0
  127. data/spec/fixtures/wiki_table_info.html +631 -0
  128. data/spec/formatters/table_formatter_spec.rb +137 -0
  129. data/spec/helpers_spec.rb +8 -0
  130. data/spec/index/categorical_index_spec.rb +170 -0
  131. data/spec/index/index_spec.rb +417 -0
  132. data/spec/index/multi_index_spec.rb +680 -0
  133. data/spec/io/io_spec.rb +373 -0
  134. data/spec/io/sql_data_source_spec.rb +56 -0
  135. data/spec/iruby/dataframe_spec.rb +170 -0
  136. data/spec/iruby/helpers_spec.rb +49 -0
  137. data/spec/iruby/multi_index_spec.rb +37 -0
  138. data/spec/iruby/vector_spec.rb +105 -0
  139. data/spec/maths/arithmetic/dataframe_spec.rb +148 -0
  140. data/spec/maths/arithmetic/vector_spec.rb +165 -0
  141. data/spec/maths/statistics/dataframe_spec.rb +178 -0
  142. data/spec/maths/statistics/vector_spec.rb +756 -0
  143. data/spec/monkeys_spec.rb +42 -0
  144. data/spec/shared/vector_display_spec.rb +213 -0
  145. data/spec/spec_helper.rb +87 -0
  146. data/spec/support/database_helper.rb +30 -0
  147. data/spec/support/matchers.rb +5 -0
  148. data/spec/vector_spec.rb +2293 -0
  149. metadata +571 -0
@@ -0,0 +1,1678 @@
1
+ require 'daru_lite/maths/arithmetic/vector'
2
+ require 'daru_lite/maths/statistics/vector'
3
+ require 'daru_lite/accessors/array_wrapper'
4
+ require 'daru_lite/category'
5
+
6
+ module DaruLite
7
+ class Vector # rubocop:disable Metrics/ClassLength
8
+ include Enumerable
9
+ include DaruLite::Maths::Arithmetic::Vector
10
+ include DaruLite::Maths::Statistics::Vector
11
+ extend Gem::Deprecate
12
+
13
+ class << self
14
+ # Create a new vector by specifying the size and an optional value
15
+ # and block to generate values.
16
+ #
17
+ # == Description
18
+ #
19
+ # The *new_with_size* class method lets you create a DaruLite::Vector
20
+ # by specifying the size as the argument. The optional block, if
21
+ # supplied, is run once for populating each element in the Vector.
22
+ #
23
+ # The result of each run of the block is the value that is ultimately
24
+ # assigned to that position in the Vector.
25
+ #
26
+ # == Options
27
+ # :value
28
+ # All the rest like .new
29
def new_with_size(n, opts = {}, &block)
  # Work on a copy: deleting :value from the caller's own hash would silently
  # change it for any later use.
  opts = opts.dup
  value = opts.delete :value
  # Without a block every position receives the same :value (nil when absent).
  block ||= ->(_) { value }
  DaruLite::Vector.new Array.new(n, &block), opts
end
34
+
35
+ # Create a vector using (almost) any object
36
+ # * Array: flattened
37
+ # * Range: transformed using to_a
38
+ # * DaruLite::Vector
39
+ # * Numeric and string values
40
+ #
41
+ # == Description
42
+ #
43
+ # The `Vector.[]` class method creates a vector from almost any
44
+ # object that has a `#to_a` method defined on it. It is similar
45
+ # to R's `c` method.
46
+ #
47
+ # == Usage
48
+ #
49
+ # a = DaruLite::Vector[1,2,3,4,6..10]
50
+ # #=>
51
+ # # <DaruLite::Vector:99448510 @name = nil @size = 9 >
52
+ # # nil
53
+ # # 0 1
54
+ # # 1 2
55
+ # # 2 3
56
+ # # 3 4
57
+ # # 4 6
58
+ # # 5 7
59
+ # # 6 8
60
+ # # 7 9
61
+ # # 8 10
62
def [](*indexes)
  # Anything answering to_a (arrays, ranges, vectors) is expanded; other
  # objects are kept as they are.
  expanded = indexes.map do |item|
    next item unless item.respond_to?(:to_a)

    item.to_a
  end
  DaruLite::Vector.new(expanded.flatten)
end
68
+
69
def _load(data) # :nodoc:
  # NOTE(review): Marshal.load can instantiate arbitrary objects; only feed
  # this hook data that comes from a trusted source.
  attrs = Marshal.load(data)
  DaruLite::Vector.new(
    attrs[:data],
    index: attrs[:index],
    name: attrs[:name],
    dtype: attrs[:dtype],
    missing_values: attrs[:missing_values]
  )
end
76
+
77
# Turns +data+ into a vector: an existing vector is returned untouched, an
# Array or Hash is passed to +new+ together with +options+, anything else
# raises ArgumentError.
def coerce(data, options = {})
  return data if data.is_a?(DaruLite::Vector)
  return new(data, options) if data.is_a?(Array) || data.is_a?(Hash)

  raise ArgumentError, "Can't coerce #{data.class} to #{self}"
end
87
+ end
88
+
89
# Number of elements reported by the underlying data container.
def size
  @data.size
end
92
+
93
# Yields every element of the data container and returns self; without a
# block an Enumerator for #each is returned instead.
def each(&block)
  if block
    @data.each(&block)
    self
  else
    to_enum(:each)
  end
end
99
+
100
# Yields every entry of the index and returns self; without a block an
# Enumerator for #each_index is returned instead.
def each_index(&block)
  if block
    @index.each(&block)
    self
  else
    to_enum(:each_index)
  end
end
106
+
107
# Yields [element, index entry] pairs in order and returns self; without a
# block an Enumerator for #each_with_index is returned instead.
def each_with_index(&block)
  return to_enum(:each_with_index) if block.nil?

  pairs = @data.to_a.zip(@index.to_a)
  pairs.each(&block)
  self
end
114
+
115
# Replaces every element in place with the block's result and returns self;
# without a block an Enumerator for #map! is returned instead.
def map!(&block)
  if block
    @data.map!(&block)
    self
  else
    to_enum(:map!)
  end
end
121
+
122
# Applies +method+ to this vector, or to the sub-vector selected by +keys+.
#
# @param method [Symbol, Proc] a Symbol is sent to the vector; a Proc is
#   called with the vector as its only argument
# @param keys forwarded to get_sub_vector when given; nil means the whole vector
# @param by_position forwarded to get_sub_vector together with +keys+
# @return whatever the method call or the Proc returns
# @raise [RuntimeError] when +method+ is neither a Symbol nor a Proc
def apply_method(method, keys: nil, by_position: true)
  vect = keys ? get_sub_vector(keys, by_position: by_position) : self

  case method
  when Symbol then vect.send(method)
  when Proc then method.call(vect)
  else
    # Same exception class as the former bare `raise`, but with a message, and
    # it no longer re-raises an unrelated exception that is being handled.
    raise "Expected a Symbol or Proc as method, got #{method.class}"
  end
end
131
+ alias apply_method_on_sub_vector apply_method
132
+
133
+ # The name of the DaruLite::Vector. String.
134
+ attr_reader :name
135
+ # The row index. Can be either DaruLite::Index or DaruLite::MultiIndex.
136
+ attr_reader :index
137
+ # The underlying dtype of the Vector. Can be :array.
138
+ attr_reader :dtype
139
+ attr_reader :nm_dtype
140
+ # An Array of the positions in the vector that are being treated as 'missing'.
141
+ attr_reader :missing_positions
142
+
143
+ deprecate :missing_positions, :indexes, 2016, 10
144
+ # Store a hash of labels for values. Supplementary only. Recommend using index
145
+ # for proper usage.
146
+ attr_accessor :labels
147
+ # Store vector data in an array
148
+ attr_reader :data
149
+
150
+ # Create a Vector object.
151
+ #
152
+ # == Arguments
153
+ #
154
+ # @param source[Array,Hash] - Supply elements in the form of an Array or a
155
+ # Hash. If Array, a numeric index will be created if not supplied in the
156
+ # options. Specifying more index elements than actual values in *source*
157
+ # will insert *nil* into the surplus index elements. When a Hash is specified,
158
+ # the keys of the Hash are taken as the index elements and the corresponding
159
+ # values as the values that populate the vector.
160
+ #
161
+ # == Options
162
+ #
163
+ # * +:name+ - Name of the vector
164
+ #
165
+ # * +:index+ - Index of the vector
166
+ #
167
+ # * +:dtype+ - The underlying data type. Can be :array.
168
+ # Default :array.
169
+ #
170
+ # * +:missing_values+ - An Array of the values that are to be treated as 'missing'.
171
+ # nil is the default missing value.
172
+ #
173
+ # == Usage
174
+ #
175
+ # vecarr = DaruLite::Vector.new [1,2,3,4], index: [:a, :e, :i, :o]
176
+ # vechsh = DaruLite::Vector.new({a: 1, e: 2, i: 3, o: 4})
177
def initialize(source, opts = {})
  # Every type other than :category takes the plain vector path.
  return initialize_vector(source, opts) unless opts[:type] == :category

  # Category behaviour is mixed into this instance before it is set up.
  extend DaruLite::Category
  initialize_category source, opts
end
187
+
188
+ # Get one or more elements with specified index or a range.
189
+ #
190
+ # == Usage
191
+ # # For vectors employing single layer Index
192
+ #
193
+ # v[:one, :two] # => DaruLite::Vector with indexes :one and :two
194
+ # v[:one] # => Single element
195
+ # v[:one..:three] # => DaruLite::Vector with indexes :one, :two and :three
196
+ #
197
+ # For vectors employing hierarchical multi index
198
+ #
199
def [](*input_indexes)
  # Translate the requested index entries into positions first.
  positions = @index.pos(*input_indexes)

  # A single numeric position means a single element was asked for.
  return @data[positions] if positions.is_a? Numeric

  # Several positions: collect the values into a new vector that keeps the
  # name and dtype and carries the matching subset of the index.
  picked = positions.map { |loc| @data[loc] }
  sub_index = @index.subset(*input_indexes)
  DaruLite::Vector.new(picked, name: @name, index: sub_index, dtype: @dtype)
end
213
+
214
+ # Returns vector of values given positional values
215
+ # @param positions [Array<object>] positional values
216
+ # @return [object] vector
217
+ # @example
218
+ # dv = DaruLite::Vector.new 'a'..'e'
219
+ # dv.at 0, 1, 2
220
+ # # => #<DaruLite::Vector(3)>
221
+ # # 0 a
222
+ # # 1 b
223
+ # # 2 c
224
def at(*positions)
  # Keep the untouched arguments: they are what the index lookup receives.
  requested = positions
  positions = coerce_positions(*positions)
  validate_positions(*positions)

  # A single Integer position yields the bare element.
  return @data[positions] if positions.is_a? Integer

  picked = positions.map { |pos| @data[pos] }
  DaruLite::Vector.new picked, index: @index.at(*requested), dtype: dtype
end
237
+
238
+ # Change value at given positions
239
+ # @param positions [Array<object>] positional values
240
+ # @param [object] val value to assign
241
+ # @example
242
+ # dv = DaruLite::Vector.new 'a'..'e'
243
+ # dv.set_at [0, 1], 'x'
244
+ # dv
245
+ # # => #<DaruLite::Vector(5)>
246
+ # # 0 x
247
+ # # 1 x
248
+ # # 2 c
249
+ # # 3 d
250
+ # # 4 e
251
def set_at(positions, val)
  validate_positions(*positions)
  # Same value goes into every requested position.
  positions.each { |pos| @data[pos] = val }
  update_position_cache
end
256
+
257
+ # Just like in Hashes, you can specify the index label of the DaruLite::Vector
258
+ # and assign an element at that place in the DaruLite::Vector.
259
+ #
260
+ # == Usage
261
+ #
262
+ # v = DaruLite::Vector.new([1,2,3], index: [:a, :b, :c])
263
+ # v[:a] = 999
264
+ # #=>
265
+ # ##<DaruLite::Vector:90257920 @name = nil @size = 3 >
266
+ # # nil
267
+ # # a 999
268
+ # # b 2
269
+ # # c 3
270
def []=(*indexes, val)
  # Assigning nil to a vector whose dtype is not :array casts it to :array first.
  needs_array = val.nil? && dtype != :array
  cast(dtype: :array) if needs_array

  guard_type_check(val)
  modify_vector(indexes, val)
  update_position_cache
end
279
+
280
+ # Two vectors are equal if they have the exact same index values corresponding
281
+ # with the exact same elements. Name is ignored.
282
def ==(other)
  # Anything that is not a vector falls back to the inherited comparison.
  return super unless other.is_a?(DaruLite::Vector)

  same_shape = @index == other.index && size == other.size
  return same_shape unless same_shape

  # Walk both vectors position by position, comparing value and index entry.
  each_with_index.with_index.all? do |(e, index), position|
    e == other.at(position) && index == other.index.to_a[position]
  end
end
293
+
294
+ # !@method eq
295
+ # Uses `==` and returns `true` for each **equal** entry
296
+ # @param [#==, DaruLite::Vector] If scalar object, compares it with each
297
+ # element in self. If DaruLite::Vector, compares elements with same indexes.
298
+ # @example (see #where)
299
+ # !@method not_eq
300
+ # Uses `!=` and returns `true` for each **unequal** entry
301
+ # @param [#!=, DaruLite::Vector] If scalar object, compares it with each
302
+ # element in self. If DaruLite::Vector, compares elements with same indexes.
303
+ # @example (see #where)
304
+ # !@method lt
305
+ # Uses `<` and returns `true` for each entry **less than** the supplied object
306
+ # @param [#<, DaruLite::Vector] If scalar object, compares it with each
307
+ # element in self. If DaruLite::Vector, compares elements with same indexes.
308
+ # @example (see #where)
309
+ # !@method lteq
310
+ # Uses `<=` and returns `true` for each entry **less than or equal to** the supplied object
311
+ # @param [#<=, DaruLite::Vector] If scalar object, compares it with each
312
+ # element in self. If DaruLite::Vector, compares elements with same indexes.
313
+ # @example (see #where)
314
+ # !@method mt
315
+ # Uses `>` and returns `true` for each entry **more than** the supplied object
316
+ # @param [#>, DaruLite::Vector] If scalar object, compares it with each
317
+ # element in self. If DaruLite::Vector, compares elements with same indexes.
318
+ # @example (see #where)
319
+ # !@method mteq
320
+ # Uses `>=` and returns `true` for each entry **more than or equal to** the supplied object
321
+ # @param [#>=, DaruLite::Vector] If scalar object, compares it with each
322
+ # element in self. If DaruLite::Vector, compares elements with same indexes.
323
+ # @example (see #where)
324
+
325
+ # Define the comparator methods with metaprogramming. See documentation
326
+ # written above for functionality of each method. Use these methods with the
327
+ # `where` method to obtain the corresponding Vector/DataFrame.
328
{
  eq: :==, not_eq: :!=, lt: :<, lteq: :<=, mt: :>, mteq: :>=
}.each do |method, operator|
  define_method(method) do |other|
    query = DaruLite::Core::Query
    # Vector operands are compared through the vector operator helper,
    # everything else is treated as a scalar applied to the raw data.
    next query.apply_vector_operator(operator, self, other) if other.is_a?(DaruLite::Vector)

    query.apply_scalar_operator(operator, @data, other)
  end
  # == and != are not aliased; only the ordering operators are.
  alias_method operator, method unless %i[== !=].include?(operator)
end
alias gt mt
alias gteq mteq
348
+
349
+ # Comparator for checking if any of the elements in *other* exist in self.
350
+ #
351
+ # @param [Array, DaruLite::Vector] other A collection which has elements that
352
+ # need to be checked for in self.
353
+ # @example Usage of `in`.
354
+ # vector = DaruLite::Vector.new([1,2,3,4,5])
355
+ # vector.where(vector.in([3,5]))
356
+ # #=>
357
+ # ##<DaruLite::Vector:82215960 @name = nil @size = 2 >
358
+ # # nil
359
+ # # 2 3
360
+ # # 4 5
361
def in(other)
  # Hash keyed by the candidates, so each membership test is a key lookup.
  lookup = other.zip(Array.new(other.size, 0)).to_h
  flags = @data.map { |d| lookup.key?(d) }
  DaruLite::Core::Query::BoolArray.new(flags)
end
369
+
370
+ # Return a new vector based on the contents of a boolean array. Use with the
371
+ # comparator methods to obtain meaningful results. See this notebook for
372
+ # a good overview of using #where.
373
+ #
374
+ # @param bool_array [DaruLite::Core::Query::BoolArray, Array<TrueClass, FalseClass>] The
375
+ # collection containing the true or false values. Each element in the Vector
376
+ # corresponding to a `true` in the bool_array will be returned along with its
377
+ # index.
378
+ # @example Usage of #where.
379
+ # vector = DaruLite::Vector.new([2,4,5,51,5,16,2,5,3,2,1,5,2,5,2,1,56,234,6,21])
380
+ #
381
+ # # Simple logic statement passed to #where.
382
+ # vector.where(vector.eq(5).or(vector.eq(1)))
383
+ # # =>
384
+ # ##<DaruLite::Vector:77626210 @name = nil @size = 7 >
385
+ # # nil
386
+ # # 2 5
387
+ # # 4 5
388
+ # # 7 5
389
+ # # 10 1
390
+ # # 11 5
391
+ # # 13 5
392
+ # # 15 1
393
+ #
394
+ # # A somewhat more complex logic statement
395
+ # vector.where((vector.eq(5) | vector.lteq(1)) & vector.in([4,5,1]))
396
+ # #=>
397
+ # ##<DaruLite::Vector:81072310 @name = nil @size = 7 >
398
+ # # nil
399
+ # # 2 5
400
+ # # 4 5
401
+ # # 7 5
402
+ # # 10 1
403
+ # # 11 5
404
+ # # 13 5
405
+ # # 15 1
406
def where(bool_array)
  # The filtering itself lives in the Query module.
  DaruLite::Core::Query.vector_where(self, bool_array)
end
409
+
410
+ # Return a new vector based on the contents of a boolean array and &block.
411
+ #
412
+ # @param bool_array [DaruLite::Core::Query::BoolArray, Array<TrueClass, FalseClass>, &block] The
413
+ # collection containing the true or false values. Each element in the Vector
414
+ # corresponding to a `true` in the bool_array will be returned along with its
415
+ # index. The &block may contain manipulative functions for the Vector elements.
416
+ #
417
+ # @return [DaruLite::Vector]
418
+ #
419
+ # @example Usage of #apply_where.
420
+ # dv = DaruLite::Vector.new ['3 days', '5 weeks', '2 weeks']
421
+ # dv = dv.apply_where(dv.match /weeks/) { |x| "#{x.split.first.to_i * 7} days" }
422
+ # # =>
423
+ # ##<DaruLite::Vector(3)>
424
+ # # 0 3 days
425
+ # # 1 35 days
426
+ # # 2 14 days
427
def apply_where(bool_array, &block)
  # The Query module does the work; the block is handed over unchanged.
  DaruLite::Core::Query.vector_apply_where(self, bool_array, &block)
end
430
+
431
# Slices the vector with the range 0..(q - 1); q defaults to 10.
def head(q = 10)
  last_position = q - 1
  self[0..last_position]
end
434
+
435
# Slices the vector from position size - q (never below 0) up to size - 1;
# q defaults to 10.
def tail(q = 10)
  count = size
  first_position = count > q ? count - q : 0
  self[first_position..(count - 1)]
end
439
+
440
def last(q = 1)
  # Enumerable does not supply #last, so it is expressed through #tail.
  tail(q)
end
444
+
445
# True when the index reports itself as empty.
def empty?
  @index.empty?
end
448
+
449
# True when #type reports :numeric.
def numeric?
  type == :numeric
end
452
+
453
# True when #type reports :object.
def object?
  type == :object
end
456
+
457
+ # Reports whether missing data is present in the Vector.
458
def has_missing_data?
  # Index entries whose values are among DaruLite::MISSING_VALUES.
  missing = indexes(*DaruLite::MISSING_VALUES)
  !missing.empty?
end
461
+ alias flawed? has_missing_data?
462
+ deprecate :has_missing_data?, :include_values?, 2016, 10
463
+ deprecate :flawed?, :include_values?, 2016, 10
464
+
465
+ # Check if any one of mentioned values occur in the vector
466
+ # @param values [Array] values to check for
467
+ # @return [true, false] returns true if any one of specified values
468
+ # occur in the vector
469
+ # @example
470
+ # dv = DaruLite::Vector.new [1, 2, 3, 4, nil]
471
+ # dv.include_values? nil, Float::NAN
472
+ # # => true
473
def include_values?(*values)
  # Stops at the first candidate found in the data.
  values.any? { |candidate| include_with_nan?(@data, candidate) }
end
476
+
477
+ # @note Do not use it to check for Float::NAN as
478
+ # Float::NAN == Float::NAN is false
479
+ # Return vector of booleans with value at ith position is either
480
+ # true or false depending upon whether value at position i is equal to
481
+ # any of the values passed in the argument or not
482
+ # @param values [Array] values to equate with
483
+ # @return [DaruLite::Vector] vector of boolean values
484
+ # @example
485
+ # dv = DaruLite::Vector.new [1, 2, 3, 2, 1]
486
+ # dv.is_values 1, 2
487
+ # # => #<DaruLite::Vector(5)>
488
+ # # 0 true
489
+ # # 1 true
490
+ # # 2 false
491
+ # # 3 true
492
+ # # 4 true
493
def is_values(*values)
  # One #eq result per candidate value, folded together with |.
  matches = values.map { |v| eq(v) }
  DaruLite::Vector.new matches.inject(:|)
end
496
+
497
+ # Append an element to the vector by specifying the element and index
498
def concat(element, index)
  raise IndexError, 'Expected new unique index' if @index.include?(index)

  # Grow the index first, then store the element at the position the
  # extended index reports for the new entry.
  @index = @index | [index]
  position = @index[index]
  @data[position] = element

  update_position_cache
end
506
+ alias push concat
507
+ alias << concat
508
+
509
+ # Cast a vector to a new data type.
510
+ #
511
+ # == Options
512
+ #
513
+ # * +:dtype+ - :array for Ruby Array..
514
def cast(opts = {})
  target = opts[:dtype]
  raise ArgumentError, "Unsupported dtype #{target}" unless target == :array

  # Nothing to do when the vector already has the requested dtype.
  return if @dtype == target

  @data = cast_vector_to(target)
end
520
+
521
+ # Delete an element by value
522
def delete(element)
  # Resolve the element to its index entry, then delete by that entry.
  label = index_of(element)
  delete_at(label)
end
525
+
526
+ # Delete element by index
527
def delete_at(index)
  # The index maps the given entry to the position that has to go.
  position = @index[index]
  @data.delete_at(position)

  # Rebuild the index from the remaining entries.
  remaining = @index.to_a - [index]
  @index = DaruLite::Index.new(remaining)

  update_position_cache
end
533
+
534
+ # The type of data contained in the vector. Can be :object.
535
+ #
536
+ # Running through the data to figure out the kind of data is delayed to the
537
+ # last possible moment.
538
def type
  # Reuse the cached answer unless it is missing or flagged as possibly stale.
  return @type unless @type.nil? || @possibly_changed_type

  # Start from :numeric; the first element that is neither nil nor Numeric
  # turns the answer into :object and ends the scan.
  @type = :numeric
  each do |e|
    unless e.nil? || e.is_a?(Numeric)
      @type = :object
      break
    end
  end
  @possibly_changed_type = false
  @type
end
552
+
553
+ # Tells if vector is categorical or not.
554
+ # @return [true, false] true if vector is of type category, false otherwise
555
+ # @example
556
+ # dv = DaruLite::Vector.new [1, 2, 3], type: :category
557
+ # dv.category?
558
+ # # => true
559
def category?
  # Decided purely by what #type reports.
  type == :category
end
562
+
563
+ # Get index of element
564
def index_of(element)
  # :array data is searched with eql? (so 1 and 1.0 are distinct); any other
  # dtype uses the container's own index lookup.
  position =
    if dtype == :array
      @data.index { |x| x.eql? element }
    else
      @data.index(element)
    end
  @index.key(position)
end
570
+
571
+ # Keep only unique elements of the vector along with their indexes.
572
def uniq
  distinct = @data.uniq
  # Each distinct value is paired with the index entry #index_of reports for it.
  labels = distinct.map { |element| index_of(element) }

  DaruLite::Vector.new(distinct, name: @name, index: labels, dtype: @dtype)
end
578
+
579
# Delegates to the raw data's #any?.
#
# Besides the block, an optional pattern is forwarded as well, so the
# override accepts the same call forms as Enumerable#any?.
#
# @param pattern [Array] zero or one pattern object
# @return [true, false]
def any?(*pattern, &block)
  @data.data.any?(*pattern, &block)
end
582
+
583
# Delegates to the raw data's #all?.
#
# Besides the block, an optional pattern is forwarded as well, so the
# override accepts the same call forms as Enumerable#all?.
#
# @param pattern [Array] zero or one pattern object
# @return [true, false]
def all?(*pattern, &block)
  @data.data.all?(*pattern, &block)
end
586
+
587
+ # Sorts a vector according to its values. If a block is specified, the contents
588
+ # will be evaluated and data will be swapped whenever the block evaluates
589
+ # to *true*. Defaults to ascending order sorting. Any missing values will be
590
+ # put at the end of the vector. Preserves indexing. Default sort algorithm is
591
+ # quick sort.
592
+ #
593
+ # == Options
594
+ #
595
+ # * +:ascending+ - if false, will sort in descending order. Defaults to true.
596
+ #
597
+ # * +:type+ - Specify the sorting algorithm. Only supports quick_sort for now.
598
+ # == Usage
599
+ #
600
+ # v = DaruLite::Vector.new ["My first guitar", "jazz", "guitar"]
601
+ # # Say you want to sort these strings by length.
602
+ # v.sort(ascending: false) { |a,b| a.length <=> b.length }
603
def sort(opts = {}, &block)
  opts = { ascending: true }.merge(opts)

  # resort_index hands back [value, original position] pairs in sorted order.
  sorted_pairs = resort_index(@data.each_with_index, opts, &block)
  values, order = sorted_pairs.transpose

  # The index is rearranged the same way, so entries stay with their values.
  new_index = @index.reorder order

  DaruLite::Vector.new(values, index: new_index, name: @name, dtype: @dtype)
end
613
+
614
+ # Sorts the vector according to its `Index` values. Defaults to ascending
615
+ # order sorting.
616
+ #
617
+ # @param [Hash] opts the options for sort_by_index method.
618
+ # @option opts [Boolean] :ascending false, will sort `index` in
619
+ # descending order.
620
+ #
621
+ # @return [Vector] new sorted `Vector` according to the index values.
622
+ #
623
+ # @example
624
+ #
625
+ # dv = DaruLite::Vector.new [11, 13, 12], index: [23, 21, 22]
626
+ # # Say you want to sort index in ascending order
627
+ # dv.sort_by_index(ascending: true)
628
+ # #=> DaruLite::Vector.new [13, 12, 11], index: [21, 22, 23]
629
+ # # Say you want to sort index in descending order
630
+ # dv.sort_by_index(ascending: false)
631
+ # #=> DaruLite::Vector.new [11, 12, 13], index: [23, 22, 21]
632
def sort_by_index(opts = {})
  opts = { ascending: true }.merge(opts)
  # Only the second row of the transposed pairs (the new order) is needed.
  new_order = resort_index(@index.each_with_index, opts).transpose[1]

  reorder new_order
end
638
+
639
# Compares two [value, position] pairs. Two nil values are ordered by their
# positions, a single nil value orders before a non-nil one, and everything
# else is decided by <=> on the values.
DEFAULT_SORTER = lambda { |(lv, li), (rv, ri)|
  next li <=> ri if lv.nil? && rv.nil?
  next -1 if lv.nil?
  next 1 if rv.nil?

  lv <=> rv
}
650
+
651
+ # Just sort the data and get an Array in return using Enumerable#sort.
652
+ # Non-destructive.
653
+ # :nocov:
654
def sorted_data(&block)
  # to_a is sorted rather than @data itself, so the stored data stays untouched.
  elements = @data.to_a
  elements.sort(&block)
end
657
+ # :nocov:
658
+
659
+ # Like map, but returns a DaruLite::Vector with the returned values.
660
def recode(dt = nil, &block)
  if block
    # The destructive variant runs on a copy, leaving the receiver untouched.
    dup.recode!(dt, &block)
  else
    to_enum(:recode, dt)
  end
end
665
+
666
+ # Destructive version of #recode.
667
def recode!(dt = nil, &block)
  return to_enum(:recode!, dt) unless block

  # Transform in place; the former trailing `.data` read a value that was
  # never used, so it has been dropped.
  @data.map!(&block)
  # Rebuild the container for the requested dtype, or the current one when
  # no dtype is given.
  @data = cast_vector_to(dt || @dtype)
  self
end
674
+
675
+ # Delete an element if block returns true. Destructive.
676
def delete_if
  return to_enum(:delete_if) unless block_given?

  survivors = each_with_index.reject { |n, _i| yield(n) }
  # Unzip by hand instead of using transpose: [].transpose is [], which left
  # both halves nil when every element was rejected, so nil was handed to
  # cast_vector_to and Index.new instead of empty collections.
  keep_e = survivors.map(&:first)
  keep_i = survivors.map(&:last)

  @data = cast_vector_to @dtype, keep_e
  @index = DaruLite::Index.new(keep_i)

  update_position_cache

  self
end
688
+
689
+ # Keep an element if block returns true. Destructive.
690
def keep_if
  if block_given?
    # Keeping what the block accepts is deleting what it refuses.
    delete_if { |val| !yield(val) }
  else
    to_enum(:keep_if)
  end
end
695
+
696
+ # Reports all values that don't comply with a condition.
697
+ # Returns a hash with the index of data and the invalid data.
698
def verify
  # Collect position => value for every value the block does not accept.
  (0...size).each_with_object({}) do |i, failures|
    val = @data[i]
    failures[i] = val unless yield(val)
  end
end
704
+
705
+ # Return an Array with the data splitted by a separator.
706
+ # a=DaruLite::Vector.new(["a,b","c,d","a,b","d"])
707
+ # a.splitted
708
+ # =>
709
+ # [["a","b"],["c","d"],["a","b"],["d"]]
710
def splitted(sep = ',')
  @data.map do |s|
    # nil stays nil; values that cannot be split are wrapped in an Array.
    next nil if s.nil?

    s.respond_to?(:split) ? s.split(sep) : [s]
  end
end
721
+
722
+ # Returns a hash of Vectors, defined by the different values
723
+ # defined on the fields
724
+ # Example:
725
+ #
726
+ # a=DaruLite::Vector.new(["a,b","c,d","a,b"])
727
+ # a.split_by_separator
728
+ # => {"a"=>#<DaruLite::Vector:0x7f2dbcc09d88
729
+ # @data=[1, 0, 1]>,
730
+ # "b"=>#<DaruLite::Vector:0x7f2dbcc09c48
731
+ # @data=[1, 1, 0]>,
732
+ # "c"=>#<DaruLite::Vector:0x7f2dbcc09b08
733
+ # @data=[0, 1, 1]>}
734
+ #
735
def split_by_separator(sep = ',')
  rows = splitted sep
  # Every distinct non-nil token becomes a key of the resulting hash.
  tokens = rows.flatten.uniq.compact
  tokens.to_h do |key|
    [key, DaruLite::Vector.new(rows.map { |v| split_value(key, v) })]
  end
end
745
+
746
# Same keys as #split_by_separator, with each value replaced by the sum of
# its elements after to_i.
def split_by_separator_freq(sep = ',')
  split_by_separator(sep).to_h do |key, vector|
    [key, vector.sum(&:to_i)]
  end
end
751
+
752
# Replaces the index with a fresh one over the positions 0...size and
# returns self.
def reset_index!
  positions = (0...size).to_a
  @index = DaruLite::Index.new(positions)
  self
end
756
+
757
+ # Replace all nils in the vector with the value passed as an argument. Destructive.
758
+ # See #replace_nils for non-destructive version
759
+ #
760
+ # == Arguments
761
+ #
762
+ # * +replacement+ - The value which should replace all nils
763
def replace_nils!(replacement)
  # Every index entry holding one of DaruLite::MISSING_VALUES is overwritten.
  indexes(*DaruLite::MISSING_VALUES).each { |idx| self[idx] = replacement }
  self
end
770
+
771
+ # Rolling fillna
772
+ # replace all Float::NAN and nil values with the preceding or following value
773
+ #
774
+ # @param direction [Symbol] (:forward, :backward) whether replacement value is preceding or following
775
+ #
776
+ # @example
777
+ # dv = DaruLite::Vector.new([1, 2, 1, 4, nil, Float::NAN, 3, nil, Float::NAN])
778
+ #
779
+ # 2.3.3 :068 > dv.rolling_fillna(:forward)
780
+ # => #<DaruLite::Vector(9)>
781
+ # 0 1
782
+ # 1 2
783
+ # 2 1
784
+ # 3 4
785
+ # 4 4
786
+ # 5 4
787
+ # 6 3
788
+ # 7 3
789
+ # 8 3
790
+ #
791
def rolling_fillna!(direction = :forward)
  labels = direction == :forward ? index : index.reverse_each
  # Value written into invalid slots; it stays 0 until a valid value is seen.
  carry = 0
  labels.each do |idx|
    unless valid_value?(self[idx])
      self[idx] = carry
      next
    end

    carry = self[idx]
  end
  self
end
803
+
804
+ # Non-destructive version of rolling_fillna!
805
# Returns a filled copy; the receiver is left untouched.
def rolling_fillna(direction = :forward)
  duplicate = dup
  duplicate.rolling_fillna!(direction)
end
808
+
809
+ # Lags the series by `k` periods.
810
+ #
811
+ # Lags the series by `k` periods, "shifting" data and inserting `nil`s
812
+ # from beginning or end of a vector, while preserving original vector's
813
+ # size.
814
+ #
815
+ # `k` can be positive or negative integer. If `k` is positive, `nil`s
816
+ # are inserted at the beginning of the vector, otherwise they are
817
+ # inserted at the end.
818
+ #
819
+ # @param [Integer] k "shift" the series by `k` periods. `k` can be
820
+ # positive or negative. (default = 1)
821
+ #
822
+ # @return [DaruLite::Vector] a new vector with "shifted" initial values
823
+ # and `nil` values inserted. The return vector is the same length
824
+ # as the original vector.
825
+ #
826
+ # @example Lag a vector with different periods `k`
827
+ #
828
+ # ts = DaruLite::Vector.new(1..5)
829
+ # # => [1, 2, 3, 4, 5]
830
+ #
831
+ # ts.lag # => [nil, 1, 2, 3, 4]
832
+ # ts.lag(1) # => [nil, 1, 2, 3, 4]
833
+ # ts.lag(2) # => [nil, nil, 1, 2, 3]
834
+ # ts.lag(-1) # => [2, 3, 4, 5, nil]
835
+ #
836
# Shifts the data by +k+ positions, padding with nils, and returns a new
# vector of the same size. Shifts of +size+ or more yield an all-nil vector.
def lag(k = 1)
  shifted =
    case k
    when 0 then return dup
    when 1...size then ([nil] * k) + data.to_a
    when -size..-1 then data.to_a[k.abs...size]
    else []
    end

  # #copy pads/truncates the shifted values back to the vector's size.
  copy(shifted)
end
847
+
848
# Builds a DataFrame from two entries: :index (the index labels) and
# :values (the data).
def detach_index
  index_column = @index.to_a
  value_column = @data.to_a
  DaruLite::DataFrame.new(index: index_column, values: value_column)
end
854
+
855
+ # Non-destructive version of #replace_nils!
856
# Returns a copy with missing values replaced; the receiver is unchanged.
def replace_nils(replacement)
  duplicate = dup
  duplicate.replace_nils!(replacement)
end
859
+
860
+ # number of non-missing elements
861
# Size of the vector minus the number of missing entries.
def n_valid
  missing_count = indexes(*DaruLite::MISSING_VALUES).size
  size - missing_count
end
864
+ deprecate :n_valid, :count_values, 2016, 10
865
+
866
+ # Count the number of values specified
867
+ # @param values [Array] values to count for
868
+ # @return [Integer] the number of times the values mentioned occurs
869
+ # @example
870
+ # dv = DaruLite::Vector.new [1, 2, 1, 2, 3, 4, nil, nil]
871
+ # dv.count_values nil
872
+ # # => 2
873
# Number of positions whose value is one of +values+.
def count_values(*values)
  matching_positions = positions(*values)
  matching_positions.size
end
876
+
877
+ # Returns *true* if an index exists
878
# True when +index+ is one of the labels of this vector's index.
def has_index?(index)
  @index.include?(index)
end
881
+
882
+ # @param keys [Array] can be positions (if by_position is true) or indexes (if by_position is false)
883
+ # @return [DaruLite::Vector]
884
# Always returns a Vector: empty for no keys, and a one-element Vector
# when #at hands back a bare value.
def get_sub_vector(keys, by_position: true)
  return DaruLite::Vector.new([]) if keys == []

  wanted = by_position ? keys : @index.pos(*keys)
  picked = at(*wanted)

  picked.is_a?(DaruLite::Vector) ? picked : DaruLite::Vector.new([picked])
end
894
+
895
+ # @return [DaruLite::DataFrame] the vector as a single-vector dataframe
896
# Wraps the data in a DataFrame with one column keyed by the vector's
# name, reusing the vector's index.
def to_df
  single_column = { @name => @data }
  DaruLite::DataFrame.new(single_column, name: @name, index: @index)
end
899
+
900
+ # Convert Vector to a horizontal or vertical Ruby Matrix.
901
+ #
902
+ # == Arguments
903
+ #
904
+ # * +axis+ - Specify whether you want a *:horizontal* or a *:vertical* matrix.
905
# Returns a 1-row Matrix for :horizontal or a 1-column Matrix for
# :vertical; any other axis raises ArgumentError.
def to_matrix(axis = :horizontal)
  unless %i[horizontal vertical].include?(axis)
    raise ArgumentError, "axis should be either :horizontal or :vertical, not #{axis}"
  end

  axis == :horizontal ? Matrix[to_a] : Matrix.columns([to_a])
end
915
+
916
+ # Convert to hash (explicit). Hash keys are indexes and values are the corresponding elements
917
# Hash of index label => element stored under that label.
def to_h
  @index.to_h do |label|
    [label, self[label]]
  end
end
920
+
921
+ # Return an array
922
# The underlying data as a plain Array (index labels are not included).
def to_a
  @data.to_a
end
925
+
926
+ # Convert the hash from to_h to json
927
# Serializes the vector (via #to_h) to JSON.
#
# Arguments are forwarded to Hash#to_json so that a generator state or
# options passed by the caller (e.g. pretty-printing settings) are honored
# instead of being silently dropped.
#
# @param args [Array] optional generator state/options
# @return [String] JSON text
def to_json(*args)
  to_h.to_json(*args)
end
930
+
931
+ # Convert to html for iruby.
+ # Renders the ERB template 'iruby/templates/vector_mi.html.erb' when the index
+ # is a MultiIndex, otherwise 'iruby/templates/vector.html.erb' (both resolved
+ # relative to __dir__). The template is evaluated against this method's binding,
+ # so the locals below are visible to it.
+ # NOTE(review): presumably the templates reference table_thead/table_tbody by
+ # name - confirm before renaming any local here.
+ # threshold is forwarded to #to_html_tbody (default 30).
931
+ def to_html(threshold = 30)
932
+ table_thead = to_html_thead
933
+ table_tbody = to_html_tbody(threshold)
934
+ path = if index.is_a?(MultiIndex)
935
+ File.expand_path('iruby/templates/vector_mi.html.erb', __dir__)
936
+ else
937
+ File.expand_path('iruby/templates/vector.html.erb', __dir__)
938
+ end
939
+ ERB.new(File.read(path).strip).result(binding)
940
+ end
942
+
943
+ # Renders the table-head ERB template: 'vector_mi_thead.html.erb' for a
+ # MultiIndex, 'vector_thead.html.erb' otherwise (under iruby/templates,
+ # relative to __dir__). The template is evaluated against this method's binding.
+ def to_html_thead
943
+ table_thead_path =
944
+ if index.is_a?(MultiIndex)
945
+ File.expand_path('iruby/templates/vector_mi_thead.html.erb', __dir__)
946
+ else
947
+ File.expand_path('iruby/templates/vector_thead.html.erb', __dir__)
948
+ end
949
+ ERB.new(File.read(table_thead_path).strip).result(binding)
950
+ end
952
+
953
+ # Renders the table-body ERB template: 'vector_mi_tbody.html.erb' for a
+ # MultiIndex, 'vector_tbody.html.erb' otherwise (under iruby/templates,
+ # relative to __dir__). The template is evaluated against this method's binding,
+ # so +threshold+ is visible to it.
+ # NOTE(review): presumably the template uses threshold to limit rendered rows - confirm.
+ def to_html_tbody(threshold = 30)
953
+ table_tbody_path =
954
+ if index.is_a?(MultiIndex)
955
+ File.expand_path('iruby/templates/vector_mi_tbody.html.erb', __dir__)
956
+ else
957
+ File.expand_path('iruby/templates/vector_tbody.html.erb', __dir__)
958
+ end
959
+ ERB.new(File.read(table_tbody_path).strip).result(binding)
960
+ end
962
+
963
# Short one-line description: class, optional name, size and a
# ':category' suffix for category vectors.
def to_s
  name_part = @name ? ": #{@name}" : ''
  category_part = category? ? ':category' : ''
  "#<#{self.class}#{name_part}(#{size})#{category_part}>"
end
966
+
967
+ # Create a summary of the Vector
968
+ # @param indent_level [Fixnum] indent level
969
+ # @return [String] String containing the summary of the Vector
970
+ # @example
971
+ # dv = DaruLite::Vector.new [1, 2, 3]
972
+ # puts dv.summary
973
+ #
974
+ # # =
975
+ # # n :3
976
+ # # non-missing:3
977
+ # # median: 2
978
+ # # mean: 2.0000
979
+ # # std.dev.: 1.0000
980
+ # # std.err.: 0.5774
981
+ # # skew: 0.0000
982
+ # # kurtosis: -2.3333
983
# Builds the textual summary: a heading with name, size and non-missing
# count, followed by the type-specific section (object or numeric).
# Every line after the first is prefixed according to +indent_level+.
def summary(indent_level = 0)
  missing = count_values(*DaruLite::MISSING_VALUES)
  heading = (' =' * indent_level) + "= #{name}"
  counts = "\n n :#{size}" + "\n non-missing:#{size - missing}"

  details =
    case type
    when :object then object_summary
    when :numeric then numeric_summary
    else ''
    end

  (heading + counts + details).split("\n").join("\n#{' ' * indent_level}")
end
996
+
997
+ # Displays summary for an object type Vector
998
+ # @return [String] String containing object vector summary
999
# Displays summary for an object type Vector: factors, mode and a
# distribution table with one row per distinct value
# (value, count, percentage).
#
# The percentage is the count divided by the number of non-missing
# elements. (Previously the denominator was the number of *missing*
# elements, which produced meaningless percentages.)
#
# @return [String] String containing object vector summary
def object_summary
  valid_count = size - count_values(*DaruLite::MISSING_VALUES)
  header = "\n factors: #{factors.to_a.join(',')}" \
           "\n mode: #{mode.to_a.join(',')}" \
           "\n Distribution\n"

  rows = frequencies.sort.each_with_index.map do |count, value|
    # Guard against division by zero when there are no valid elements.
    share = valid_count.zero? ? 1 : count.quo(valid_count)
    [value, count, format('%0.2f%%', share * 100)]
  end

  header + Formatters::Table.format(rows)
end
1011
+
1012
+ # Displays summary for an numeric type Vector
1013
+ # @return [String] String containing numeric vector summary
1014
# Builds the numeric section of #summary: median and mean always, std.dev.
# and std.err. when sd is available, skew and kurtosis only when the
# vector has no missing values.
def numeric_summary
  parts = ["\n median: #{median}", format("\n mean: %0.4f", mean)]

  if sd
    parts << format("\n std.dev.: %0.4f", sd)
    parts << format("\n std.err.: %0.4f", se)
  end

  if count_values(*DaruLite::MISSING_VALUES).zero?
    parts << format("\n skew: %0.4f", skew)
    parts << format("\n kurtosis: %0.4f", kurtosis)
  end

  parts.join
end
1028
+
1029
+ # Overrides the original inspect for pretty printing in irb
1030
# Header line followed by a formatted table of the values, with index
# labels as row headers (sparse tuples for a MultiIndex).
def inspect(spacing = 20, threshold = 15)
  row_headers = index.is_a?(MultiIndex) ? index.sparse_tuples : index.to_a
  header = "#<#{self.class}(#{size})#{':category' if category?}>\n"

  table = Formatters::Table.format(
    to_a.lazy.map { |value| [value] },
    headers: @name && [@name],
    row_headers: row_headers,
    threshold: threshold,
    spacing: spacing
  )

  header + table
end
1042
+
1043
+ # Sets new index for vector. Preserves index->value correspondence.
1044
+ # Sets nil for new index keys absent from original index.
1045
+ # @note Unlike #reorder! which takes positions as input it takes
1046
+ # index as an input to reorder the vector
1047
+ # @param [DaruLite::Index, DaruLite::MultiIndex] new_index new index to order with
1048
+ # @return [DaruLite::Vector] vector reindexed with new index
1049
# Rebuilds the data for +new_index+: each current value whose label is in
# +new_index+ is placed at the slot new_index[label]; remaining slots up
# to new_index.size are nil.
def reindex!(new_index)
  reordered = []
  each_with_index do |value, label|
    next unless new_index.include?(label)

    reordered[new_index[label]] = value
  end
  # Pad the tail so the data is as long as the new index.
  padding = new_index.size - reordered.size
  reordered.fill(nil, reordered.size, padding)

  @data = cast_vector_to @dtype, reordered
  @index = new_index

  update_position_cache

  self
end
1063
+
1064
+ # Reorder the vector with given positions
1065
+ # @note Unlike #reindex! which takes index as input, it takes
1066
+ # positions as an input to reorder the vector
1067
+ # @param [Array] order the order to reorder the vector with
1068
+ # @return reordered vector
1069
+ # @example
1070
+ # dv = DaruLite::Vector.new [3, 2, 1], index: ['c', 'b', 'a']
1071
+ # dv.reorder! [2, 1, 0]
1072
+ # # => #<DaruLite::Vector(3)>
1073
+ # # a 1
1074
+ # # b 2
1075
+ # # c 3
1076
# Rearranges index and data in place so that position i holds what was
# previously at position order[i].
def reorder!(order)
  @index = @index.reorder order
  rearranged = order.map { |position| @data[position] }
  @data = cast_vector_to @dtype, rearranged, @nm_dtype
  update_position_cache
  self
end
1083
+
1084
+ # Non-destructive version of #reorder!
1085
# Returns a reordered copy; the receiver is unchanged.
def reorder(order)
  duplicate = dup
  duplicate.reorder! order
end
1088
+
1089
+ # Create a new vector with a different index, and preserve the indexing of
1090
+ # current elements.
1091
# Returns a reindexed copy; the receiver is unchanged.
def reindex(new_index)
  duplicate = dup
  duplicate.reindex!(new_index)
end
1094
+
1095
# Assigns a new index after coercing it with Index.coerce.
# Raises ArgumentError when its size differs from the vector's size or
# when the coerced object is not a DaruLite::Index.
def index=(idx)
  coerced = Index.coerce(idx)

  if coerced.size != size
    raise ArgumentError, "Size of supplied index #{coerced.size} does not match size of Vector"
  end
  unless coerced.is_a?(DaruLite::Index)
    raise ArgumentError, 'Can only assign type Index and its subclasses.'
  end

  @index = coerced
  self
end
1104
+
1105
+ # Give the vector a new name
1106
+ #
1107
+ # @param new_name [Symbol] The new name.
1108
# Stores +new_name+ as the vector's name and returns the vector itself.
def rename(new_name)
  @name = new_name
  self
end
1112
+
1113
+ alias name= rename
1114
+
1115
+ # Duplicates the vector
1116
+ # @return [DaruLite::Vector] duplicated vector
1117
# New Vector built from duplicates of the data and index, keeping the name.
def dup
  DaruLite::Vector.new(@data.dup, name: @name, index: @index.dup)
end
1120
+
1121
+ # == Bootstrap
1122
+ # Generate +nr+ resamples (with replacement) of size +s+
1123
+ # from vector, computing each estimate from +estimators+
1124
+ # over each resample.
1125
+ # +estimators+ could be
1126
+ # a) Hash with variable names as keys and lambdas as values
1127
+ # a.bootstrap(:log_s2=>lambda {|v| Math.log(v.variance)},1000)
1128
+ # b) Array with names of method to bootstrap
1129
+ # a.bootstrap([:mean, :sd],1000)
1130
+ # c) A single method to bootstrap
1131
+ # a.bootstrap(:mean, 1000)
1132
+ # If s is nil, is set to vector size by default.
1133
+ #
1134
+ # Returns a DataFrame where each vector is a vector
1135
+ # of length +nr+ containing the computed resample estimates.
1136
# Draws +nr+ resamples of size +s+ (default: vector size), evaluates
# every estimator on each resample and returns the collected estimates as
# a DataFrame with one vector per estimator.
def bootstrap(estimators, nr, s = nil)
  s ||= size
  lambdas, names, estimates = prepare_bootstrap(estimators)

  nr.times do
    resample = sample_with_replacement(s)
    names.each do |name|
      estimates[name].push(lambdas[name].call(resample))
    end
  end

  names.each { |name| estimates[name] = DaruLite::Vector.new estimates[name] }

  DaruLite::DataFrame.new estimates
end
1153
+
1154
+ # == Jacknife
1155
+ # Returns a dataset with jacknife delete-+k+ +estimators+
1156
+ # +estimators+ could be:
1157
+ # a) Hash with variable names as keys and lambdas as values
1158
+ # a.jackknife(:log_s2=>lambda {|v| Math.log(v.variance)})
1158
+ # b) Array with method names to jackknife
1159
+ # a.jackknife([:mean, :sd])
1160
+ # c) A single method to jackknife
1161
+ # a.jackknife(:mean)
1163
+ # +k+ represent the block size for block jacknife. By default
1164
+ # is set to 1, for classic delete-one jacknife.
1165
+ #
1166
+ # Returns a dataset where each vector is an vector
1167
+ # of length +cases+/+k+ containing the computed jacknife estimates.
1168
+ #
1169
+ # == Reference:
1170
+ # * Sawyer, S. (2005). Resampling Data: Using a Statistical Jacknife.
1171
# Delete-+k+ jackknife: for each of the size/k blocks, removes that block,
# re-evaluates every estimator and records the pseudovalue
# nb * full_estimate - (nb - 1) * reduced_estimate.
# Raises when size is not divisible by +k+.
def jackknife(estimators, k = 1) # rubocop:disable Metrics/MethodLength
  raise "n should be divisible by k:#{k}" unless (size % k).zero?

  block_count = (size / k).to_i
  lambdas, names, pseudovalues = prepare_bootstrap(estimators)

  full_estimates = names.to_h { |name| [name, lambdas[name].call(self)] }

  block_count.times do |block|
    remaining = @data.dup
    remaining.slice!(block * k, k)
    reduced = DaruLite::Vector.new remaining

    names.each do |name|
      # Add pseudovalue
      pseudovalues[name].push(
        (block_count * full_estimates[name]) - ((block_count - 1) * lambdas[name].call(reduced))
      )
    end
  end

  names.each { |name| pseudovalues[name] = DaruLite::Vector.new pseudovalues[name] }
  DaruLite::DataFrame.new pseudovalues
end
1197
+
1198
+ # Returns an array of booleans indicating, for each element, whether it
1199
+ # matches the given +regexp+.
1200
+ #
1201
+ # @param regexp [Regexp] A regular matching expression. For example, +/weeks/+.
1202
+ #
1203
+ # @return [Array<Boolean>] +true+ or +false+ for each element, according to the match with the given +regexp+
1204
+ #
1205
+ # @example
1206
+ # dv = DaruLite::Vector.new(['3 days', '5 weeks', '2 weeks'])
1207
+ # dv.match(/weeks/)
1208
+ #
1209
+ # # => [false, true, true]
1210
# Maps every element to true/false depending on whether it matches +regexp+.
def match(regexp)
  @data.map do |value|
    (value =~ regexp) ? true : false
  end
end
1213
+
1214
+ # Creates a new vector consisting only of non-nil data
1215
+ #
1216
+ # == Arguments
1217
+ #
1218
+ # @param as_a [Symbol] Passing :array will return only the elements
1219
+ # as an Array. Otherwise will return a DaruLite::Vector.
1220
+ #
1221
+ # @param _duplicate [Symbol] In case no missing data is found in the
1222
+ # vector, setting this to false will return the same vector.
1223
+ # Otherwise, a duplicate will be returned irrespective of
1224
+ # presence of missing data.
1225
+
1226
def only_valid(as_a = :vector, _duplicate = true)
  # FIXME: _duplicate is currently ignored (see note by zverok, 2016-05-07).

  kept_labels = @index.to_a - indexes(*DaruLite::MISSING_VALUES)
  kept_values = kept_labels.map { |label| self[label] }

  # Anything other than :vector yields the plain array of values.
  return kept_values unless as_a == :vector

  DaruLite::Vector.new kept_values, index: kept_labels, name: @name, dtype: dtype
end
1240
+ deprecate :only_valid, :reject_values, 2016, 10
1241
+
1242
+ # Return a vector with specified values removed
1243
+ # @param values [Array] values to reject from resultant vector
1244
+ # @return [DaruLite::Vector] vector with specified values removed
1245
+ # @example
1246
+ # dv = DaruLite::Vector.new [1, 2, nil, Float::NAN]
1247
+ # dv.reject_values nil, Float::NAN
1248
+ # # => #<DaruLite::Vector(2)>
1249
+ # # 0 1
1250
+ # # 1 2
1251
# Selects every position whose value is not in +values+ and returns the
# corresponding sub-vector.
def reject_values(*values)
  kept_positions = size.times.to_a - positions(*values)
  result = at(*kept_positions)
  return result if result.is_a?(DaruLite::Vector)

  # #at returned a bare element (single position); re-fetch it through a
  # one-element range so that a Vector comes back.
  only = kept_positions.first
  at(only..only)
end
1263
+
1264
+ # Return indexes of values specified
1265
+ # @param values [Array] values to find indexes for
1266
+ # @return [Array] array of indexes of values specified
1267
+ # @example
1268
+ # dv = DaruLite::Vector.new [1, 2, nil, Float::NAN], index: 11..14
1269
+ # dv.indexes nil, Float::NAN
1270
+ # # => [13, 14]
1271
# Index labels of all positions holding one of +values+.
def indexes(*values)
  labels = index.to_a
  positions(*values).map { |position| labels[position] }
end
1274
+
1275
+ # Replaces specified values with a new value
1276
+ # @param [Array] old_values array of values to replace
1277
+ # @param [object] new_value new value to replace with
1278
+ # @note It performs the replace in place.
1279
+ # @return [DaruLite::Vector] Same vector itself with values
1280
+ # replaced with new value
1281
+ # @example
1282
+ # dv = DaruLite::Vector.new [1, 2, :a, :b]
1283
+ # dv.replace_values [:a, :b], nil
1284
+ # dv
1285
+ # # =>
1286
+ # # #<DaruLite::Vector:19903200 @name = nil @metadata = {} @size = 4 >
1287
+ # # nil
1288
+ # # 0 1
1289
+ # # 1 2
1290
+ # # 2 nil
1291
+ # # 3 nil
1292
# In-place replacement: every position whose value is among +old_values+
# (NaN-aware comparison) is set to +new_value+. A non-Array +old_values+
# is treated as a single value.
def replace_values(old_values, new_value)
  targets = old_values.is_a?(Array) ? old_values : [old_values]
  size.times do |position|
    next unless include_with_nan?(targets, at(position))

    set_at([position], new_value)
  end
  self
end
1299
+
1300
+ # Returns a Vector containing only missing data (preserves indexes).
1301
# Returns the missing entries as a Vector (:vector) or Array (:array);
# any other +as_a+ yields nil.
def only_missing(as_a = :vector)
  return unless %i[vector array].include?(as_a)

  missing = self[*indexes(*DaruLite::MISSING_VALUES)]
  as_a == :array ? missing.to_a : missing
end
1309
+ deprecate :only_missing, nil, 2016, 10
1310
+
1311
+ # Returns a Vector with only numerical data. Missing data is included
1312
+ # but non-Numeric objects are excluded. Preserves index.
1313
# Keeps the entries that are Numeric or nil and returns them via #[].
def only_numerics
  kept_labels = each_with_index
                .select { |value, _label| value.is_a?(Numeric) || value.nil? }
                .map { |_value, label| label }

  self[*kept_labels]
end
1321
+
1322
# Matches a whole string of the form DD-DD-DDDD or DDDD-DD-DD.
# Anchored with \A/\z (whole string) rather than ^/$ (per line), so a
# multi-line value containing one date-looking line is not taken for a date.
DATE_REGEXP = /\A(\d{2}-\d{2}-\d{4}|\d{4}-\d{2}-\d{2})\z/.freeze
1323
+
1324
+ # Returns the database type for the vector, according to its content
1325
# Picks a column type from the string form of the values:
# any date-like value => DATE; otherwise any character outside
# 0-9, 'e', '.', '-' => VARCHAR; otherwise any '.' => DOUBLE; else INTEGER.
def db_type
  texts = @data.map(&:to_s)

  return 'DATE' if texts.any? { |text| DATE_REGEXP.match?(text) }
  return 'VARCHAR (255)' if texts.any? { |text| /[^0-9e.-]/.match?(text) }
  return 'DOUBLE' if texts.any? { |text| text.include?('.') }

  'INTEGER'
end
1337
+
1338
+ # Copies the structure of the vector (i.e the index, size, etc.) and fills all
1339
+ # values with nils.
1340
# Same name and a duplicate of the index, with every value set to nil.
def clone_structure
  blank_values = [nil] * size
  DaruLite::Vector.new(blank_values, name: @name, index: @index.dup)
end
1343
+
1344
+ # Save the vector to a file
1345
+ #
1346
+ # == Arguments
1347
+ #
1348
+ # * filename - Path of file where the vector is to be saved
1349
# Delegates to DaruLite::IO.save with this vector and +filename+.
def save(filename)
  DaruLite::IO.save(self, filename)
end
1352
+
1353
+ def _dump(*) # :nodoc:
1354
+ Marshal.dump(
1355
+ data: @data.to_a,
1356
+ dtype: @dtype,
1357
+ name: @name,
1358
+ index: @index
1359
+ )
1360
+ end
1361
+
1362
+ # :nocov:
1363
# Returns the receiver, ignoring any arguments.
def daru_lite_vector(*)
  self
end
1366
+ # :nocov:
1367
+
1368
+ alias dv daru_lite_vector
1369
+
1370
+ # Converts a non category type vector to category type vector.
1371
+ # @param [Hash] opts options to convert to category
1372
+ # @option opts [true, false] :ordered Specify if vector is ordered or not.
1373
+ # If it is ordered, it can be sorted and min, max like functions would work
1374
+ # @option opts [Array] :categories set categories in the specified order
1375
+ # @return [DaruLite::Vector] vector with type category
1376
# Builds a category-type copy (same name and index), then applies the
# :ordered flag (default false) and, when given, the :categories order.
def to_category(opts = {})
  DaruLite::Vector.new(to_a, type: :category, name: @name, index: @index).tap do |category_vector|
    category_vector.ordered = opts[:ordered] || false
    category_vector.categories = opts[:categories] if opts[:categories]
  end
end
1382
+
1383
# Dynamic accessors: `vec.foo = x` assigns self[:foo], `vec.foo` reads
# self[:foo] when :foo is an index label; everything else goes to super.
def method_missing(name, *args, &block)
  # FIXME: it is shamefully fragile. Should be either made stronger
  # (string/symbol dychotomy, informative errors) or removed totally. - zverok
  setter = /(.+)=/.match(name)
  if setter
    self[setter[1].to_sym] = args[0]
  elsif has_index?(name)
    self[name]
  else
    super
  end
end
1394
+
1395
# Mirrors #method_missing: setter-style names and index labels are handled.
def respond_to_missing?(name, include_private = false)
  return true if name.to_s.end_with?('=')

  has_index?(name) || super
end
1398
+
1399
+ # Partition a numeric variable into categories.
1400
+ # @param [Array<Numeric>] partitions an array whose consecutive elements
1401
+ # provide intervals for categories
1402
+ # @param [Hash] opts options to cut the partition
1403
+ # @option opts [:left, :right] :close_at specifies whether the interval closes at
1404
+ # the right side or left side
1405
+ # @option opts [Array] :labels names of the categories
1406
+ # @return [DaruLite::Vector] numeric variable converted to categorical variable
1407
+ # @example
1408
+ # heights = DaruLite::Vector.new [30, 35, 32, 50, 42, 51]
1409
+ # height_cat = heights.cut [30, 40, 50, 60], labels: ['low', 'medium', 'high']
1410
+ # # => #<DaruLite::Vector(6)>
1411
+ # # 0 low
1412
+ # # 1 low
1413
+ # # 2 low
1414
+ # # 3 high
1415
+ # # 4 medium
1416
+ # # 5 high
1417
# Maps each value to the label of the interval it falls in and returns a
# category vector; categories are renamed to opts[:labels] when given.
def cut(partitions, opts = {})
  close_at = opts[:close_at] || :right
  labels = opts[:labels]
  bounds = partitions.to_a
  assigned = to_a.map { |value| cut_find_category bounds, value, close_at }
  categories = cut_categories(bounds, close_at)

  category_vector = DaruLite::Vector.new assigned,
                                         index: @index,
                                         type: :category,
                                         categories: categories

  return category_vector unless labels

  # Rename categories if new labels provided
  category_vector.rename_categories categories.zip(labels).to_h
end
1436
+
1437
# Positions whose value is one of +values+. The nil / NaN lookups use the
# cached position lists; anything else is a NaN-aware linear scan.
def positions(*values)
  case values
  when [nil] then nil_positions
  when [Float::NAN] then nan_positions
  when [nil, Float::NAN], [Float::NAN, nil] then nil_positions + nan_positions
  else
    size.times.select { |position| include_with_nan? values, @data[position] }
  end
end
1449
+
1450
# Converts to a single-column DataFrame and delegates grouping to it.
def group_by(*args)
  frame = to_df
  frame.group_by(*args)
end
1453
+
1454
+ private
1455
+
1456
# Builds a vector with this vector's index and name from +values+,
# nil-padding (in place) or truncating so it has exactly +size+ elements.
def copy(values)
  shortfall = size - values.size
  values.concat([nil] * shortfall) if shortfall.positive?
  DaruLite::Vector.new(values[0...size], index: @index, name: @name)
end
1461
+
1462
# Memoized list of positions holding nil (reset by #update_position_cache).
def nil_positions
  @nil_positions ||= size.times.select { |position| @data[position].nil? }
end
1466
+
1467
# Memoized list of positions holding NaN (reset by #update_position_cache).
def nan_positions
  @nan_positions ||= size.times.select do |position|
    element = @data[position]
    element.respond_to?(:nan?) && element.nan?
  end
end
1473
+
1474
+ # Helper method returning validity of arbitrary value
1475
# A value is valid unless it is nil or NaN.
def valid_value?(v)
  return false if v.nil?

  !(v.respond_to?(:nan?) && v.nan?)
end
1478
+
1479
# Sets name, data and index from +source+/+opts+, reconciles their sizes
# and flags the type as possibly changed.
def initialize_vector(source, opts)
  index, source = parse_source(source, opts)
  set_name opts[:name]

  requested_dtype = opts[:dtype] || :array
  @data = cast_vector_to(requested_dtype, source, opts[:nm_dtype])
  # Without an explicit index, one is coerced from the data size.
  @index = Index.coerce(index || @data.size)

  guard_sizes!

  @possibly_changed_type = true
end
1490
+
1491
# Returns [index, values]: a Hash source supplies both (keys/values);
# otherwise the index comes from opts[:index] and a nil source becomes [].
def parse_source(source, opts)
  return [source.keys, source.values] if source.is_a?(Hash)

  [opts[:index], source || []]
end
1498
+
1499
# Pads the data with nils when the index is longer; raises IndexError when
# the index is shorter than the data.
def guard_sizes!
  if @index.size < @data.size
    raise IndexError, "Expected index size >= vector size. Index size : #{@index.size}, vector size : #{@data.size}"
  end
  return unless @index.size > @data.size

  cast(dtype: :array) # NM with nils seg faults
  @data.fill(nil, @data.size...@index.size)
end
1507
+
1508
# Flags the cached type as possibly stale when +value+ could change it:
# a nil/Numeric value on an object vector, or a non-nil, non-Numeric
# value on a numeric vector.
def guard_type_check(value)
  numeric_like = value.nil? || value.is_a?(Numeric)
  may_change = (object? && numeric_like) || (numeric? && !numeric_like)
  @possibly_changed_type = true if may_change
end
1513
+
1514
# Indicator for +key+ in +v+: nil when +v+ is nil, 1 when +v+ includes
# +key+, 0 otherwise.
def split_value(key, v)
  return nil if v.nil?

  v.include?(key) ? 1 : 0
end
1523
+
1524
+ # For an array or hash of estimators methods, returns
1525
+ # an array with three elements
1526
+ # 1.- A hash with estimators names as keys and lambdas as values
1527
+ # 2.- An array with estimators names
1528
+ # 3.- A Hash with estimators names as keys and empty arrays as values
1529
# Normalizes +estimators+ (single name, Array of names, or Hash of
# name => lambda) and returns [lambdas_by_name, names, empty_results].
def prepare_bootstrap(estimators)
  lambdas = estimators
  lambdas = [lambdas] unless lambdas.is_a?(Array) || lambdas.is_a?(Hash)

  if lambdas.is_a? Array
    # Method names become lambdas that call that method on a Vector.
    lambdas = lambdas.to_h do |method_name|
      [method_name, ->(v) { DaruLite::Vector.new(v).send(method_name) }]
    end
  end

  names = lambdas.keys
  results = names.to_h { |name| [name, []] }

  [lambdas, lambdas.keys, results]
end
1542
+
1543
+ # NOTE: To maintain sanity, this _MUST_ be the _ONLY_ place in daru where the
1544
+ # @param dtype [db_type] variable is set and the underlying data type of vector changed.
1545
# Wraps +source+ (default: current data) in the accessor for +dtype+ and
# records the dtype. Only :array is supported: :mdarray raises
# NotImplementedError, anything else ArgumentError.
def cast_vector_to(dtype, source = nil, _nm_dtype = nil)
  source = @data.to_a if source.nil?

  raise NotImplementedError, 'MDArray not yet supported.' if dtype == :mdarray
  raise ArgumentError, "Unknown dtype #{dtype}" unless dtype == :array

  wrapped = DaruLite::Accessors::ArrayWrapper.new(source, self)
  @dtype = dtype
  wrapped
end
1558
+
1559
# Stores the name; an Array (MultiIndex tuple) is joined into one string.
def set_name(name) # rubocop:disable Naming/AccessorMethodName
  @name =
    if name.is_a?(Array)
      name.join # join in case of MultiIndex tuple
    else
      name
    end
end
1562
+
1563
+ # Raises IndexError when one of the positions is an invalid position
1564
# Raises IndexError for the first position that is >= size; otherwise
# returns the positions.
def validate_positions(*positions)
  out_of_range = positions.find { |position| position >= size }
  raise IndexError, "#{out_of_range} is not a valid position." if out_of_range

  positions
end
1569
+
1570
+ # coerce ranges, integers and array in appropriate ways
1571
# Several positions are returned unchanged. A single Integer is returned
# as is, a single Range is expanded to the positions it covers, and any
# other single argument raises ArgumentError.
def coerce_positions(*positions)
  return positions unless positions.size == 1

  only = positions.first
  case only
  when Integer then only
  when Range then size.times.to_a[only]
  else raise ArgumentError, 'Unkown position type.'
  end
end
1585
+
1586
+ # Helper method for []=.
1587
+ # Assigns existing index to another value
1588
# Writes +val+ at the position(s) the index reports for +indexes+
# (a single Numeric position or a collection of positions).
def modify_vector(indexes, val)
  targets = @index.pos(*indexes)

  if targets.is_a? Numeric
    @data[targets] = val
  else
    targets.each { |position| @data[position] = val }
  end
end
1597
+
1598
+ # Helper method for []=.
1599
+ # Add a new index and assign it value
1600
# Extends the index with +indexes+ and appends +val+ once per newly
# added label.
def insert_vector(indexes, val)
  extended_index = @index.add(*indexes)
  added = extended_index.size - @index.size
  # May be create +=
  added.times { @data << val }
  @index = extended_index
end
1606
+
1607
+ # Works similar to #[]= but also insert the vector in case index is not valid
1608
+ # It is there only to be accessed by DaruLite::DataFrame and not meant for user.
1609
# Assigns +val+ under +indexes+, inserting new labels when they are not
# valid for the current index, then resets the position caches.
def set(indexes, val)
  cast(dtype: :array) if val.nil? && dtype != :array
  guard_type_check(val)

  @index.valid?(*indexes) ? modify_vector(indexes, val) : insert_vector(indexes, val)

  update_position_cache
end
1621
+
1622
# Returns the label ("low-high") of the partition interval containing +val+.
#
# With close_at :right the intervals are [p_i, p_i+1) and are labelled
# "p_i-(p_i+1 - 1)"; with :left they are (p_i, p_i+1] and are labelled
# "(p_i + 1)-p_i+1".
#
# @param partitions [Array<Numeric>] ascending interval boundaries
# @param val [Numeric] value to classify
# @param close_at [:right, :left] which side of the interval is closed
# @return [String] category label
# @raise [ArgumentError] 'Invalid partition' when +val+ lies outside the
#   partitions (above the last boundary or below the first interval), or
#   when +close_at+ is neither :right nor :left
def cut_find_category(partitions, val, close_at)
  case close_at
  when :right
    right_index = partitions.index { |boundary| boundary > val }
    # Index 0 means val is below the first interval; without this check
    # left_index would be -1 and wrap around to the last boundary.
    raise ArgumentError, 'Invalid partition' if right_index.nil? || right_index.zero?

    left_index = right_index - 1
    "#{partitions[left_index]}-#{partitions[right_index] - 1}"
  when :left
    right_index = partitions.index { |boundary| boundary >= val }
    # Same wrap-around guard as above (val <= first boundary).
    raise ArgumentError, 'Invalid partition' if right_index.nil? || right_index.zero?

    left_index = right_index - 1
    "#{partitions[left_index] + 1}-#{partitions[right_index]}"
  else
    raise ArgumentError, "Invalid parameter #{close_at} to close_at."
  end
end
1640
+
1641
# Labels for every consecutive pair of partition boundaries, in the same
# "low-high" format as #cut_find_category. Returns nil for an unknown
# +close_at+.
def cut_categories(partitions, close_at)
  return unless %i[right left].include?(close_at)

  Array.new(partitions.size - 1) do |left_index|
    lower = partitions[left_index]
    upper = partitions[left_index + 1]
    close_at == :right ? "#{lower}-#{upper - 1}" : "#{lower + 1}-#{upper}"
  end
end
1653
+
1654
# Like Array#include?, but a NaN +value+ matches any NaN in +array+
# (NaN never equals itself, so include? alone would miss it).
def include_with_nan?(array, value)
  nan = ->(candidate) { candidate.respond_to?(:nan?) && candidate.nan? }

  return array.any? { |element| nan.call(element) } if nan.call(value)

  array.include? value
end
1663
+
1664
# Clears the memoized nil/NaN position lists so they are recomputed on
# next use.
def update_position_cache
  @nil_positions = @nan_positions = nil
end
1668
+
1669
# Sorts [value, index] pairs, comparing values with the given block or,
# without one, with DEFAULT_SORTER; reversed unless opts[:ascending].
def resort_index(vector_index, opts)
  sorted =
    if block_given?
      vector_index.sort { |(left, _li), (right, _ri)| yield(left, right) }
    else
      vector_index.sort(&DEFAULT_SORTER)
    end

  sorted.reverse! unless opts[:ascending]
  sorted
end
1677
+ end
1678
+ end