daru_lite 0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (149) hide show
  1. checksums.yaml +7 -0
  2. data/.github/ISSUE_TEMPLATE.md +18 -0
  3. data/.github/workflows/ci.yml +33 -0
  4. data/.gitignore +10 -0
  5. data/.rspec +2 -0
  6. data/.rubocop.yml +27 -0
  7. data/.rubocop_todo.yml +137 -0
  8. data/CONTRIBUTING.md +47 -0
  9. data/Gemfile +2 -0
  10. data/History.md +4 -0
  11. data/LICENSE +24 -0
  12. data/README.md +218 -0
  13. data/Rakefile +69 -0
  14. data/ReleasePolicy.md +20 -0
  15. data/benchmarks/TradeoffData.csv +65 -0
  16. data/benchmarks/csv_reading.rb +22 -0
  17. data/benchmarks/dataframe_creation.rb +39 -0
  18. data/benchmarks/db_loading.rb +34 -0
  19. data/benchmarks/duplicating.rb +45 -0
  20. data/benchmarks/group_by.rb +32 -0
  21. data/benchmarks/joining.rb +52 -0
  22. data/benchmarks/row_access.rb +41 -0
  23. data/benchmarks/row_assign.rb +36 -0
  24. data/benchmarks/sorting.rb +51 -0
  25. data/benchmarks/statistics.rb +28 -0
  26. data/benchmarks/vector_access.rb +31 -0
  27. data/benchmarks/vector_assign.rb +42 -0
  28. data/benchmarks/where_clause.rb +48 -0
  29. data/benchmarks/where_vs_filter.rb +28 -0
  30. data/daru_lite.gemspec +55 -0
  31. data/images/README.md +5 -0
  32. data/images/con0.png +0 -0
  33. data/images/con1.png +0 -0
  34. data/images/init0.png +0 -0
  35. data/images/init1.png +0 -0
  36. data/images/man0.png +0 -0
  37. data/images/man1.png +0 -0
  38. data/images/man2.png +0 -0
  39. data/images/man3.png +0 -0
  40. data/images/man4.png +0 -0
  41. data/images/man5.png +0 -0
  42. data/images/man6.png +0 -0
  43. data/lib/daru_lite/accessors/array_wrapper.rb +109 -0
  44. data/lib/daru_lite/accessors/dataframe_by_row.rb +25 -0
  45. data/lib/daru_lite/accessors/mdarray_wrapper.rb +7 -0
  46. data/lib/daru_lite/category.rb +929 -0
  47. data/lib/daru_lite/configuration.rb +34 -0
  48. data/lib/daru_lite/core/group_by.rb +403 -0
  49. data/lib/daru_lite/core/merge.rb +270 -0
  50. data/lib/daru_lite/core/query.rb +109 -0
  51. data/lib/daru_lite/dataframe.rb +3080 -0
  52. data/lib/daru_lite/date_time/index.rb +569 -0
  53. data/lib/daru_lite/date_time/offsets.rb +397 -0
  54. data/lib/daru_lite/exceptions.rb +2 -0
  55. data/lib/daru_lite/extensions/which_dsl.rb +53 -0
  56. data/lib/daru_lite/formatters/table.rb +52 -0
  57. data/lib/daru_lite/helpers/array.rb +53 -0
  58. data/lib/daru_lite/index/categorical_index.rb +201 -0
  59. data/lib/daru_lite/index/index.rb +374 -0
  60. data/lib/daru_lite/index/multi_index.rb +374 -0
  61. data/lib/daru_lite/io/csv/converters.rb +21 -0
  62. data/lib/daru_lite/io/io.rb +294 -0
  63. data/lib/daru_lite/io/sql_data_source.rb +97 -0
  64. data/lib/daru_lite/iruby/helpers.rb +38 -0
  65. data/lib/daru_lite/iruby/templates/dataframe.html.erb +5 -0
  66. data/lib/daru_lite/iruby/templates/dataframe_mi.html.erb +5 -0
  67. data/lib/daru_lite/iruby/templates/dataframe_mi_tbody.html.erb +35 -0
  68. data/lib/daru_lite/iruby/templates/dataframe_mi_thead.html.erb +21 -0
  69. data/lib/daru_lite/iruby/templates/dataframe_tbody.html.erb +28 -0
  70. data/lib/daru_lite/iruby/templates/dataframe_thead.html.erb +21 -0
  71. data/lib/daru_lite/iruby/templates/multi_index.html.erb +12 -0
  72. data/lib/daru_lite/iruby/templates/vector.html.erb +5 -0
  73. data/lib/daru_lite/iruby/templates/vector_mi.html.erb +5 -0
  74. data/lib/daru_lite/iruby/templates/vector_mi_tbody.html.erb +26 -0
  75. data/lib/daru_lite/iruby/templates/vector_mi_thead.html.erb +8 -0
  76. data/lib/daru_lite/iruby/templates/vector_tbody.html.erb +17 -0
  77. data/lib/daru_lite/iruby/templates/vector_thead.html.erb +8 -0
  78. data/lib/daru_lite/maths/arithmetic/dataframe.rb +91 -0
  79. data/lib/daru_lite/maths/arithmetic/vector.rb +117 -0
  80. data/lib/daru_lite/maths/statistics/dataframe.rb +202 -0
  81. data/lib/daru_lite/maths/statistics/vector.rb +1019 -0
  82. data/lib/daru_lite/monkeys.rb +56 -0
  83. data/lib/daru_lite/vector.rb +1678 -0
  84. data/lib/daru_lite/version.rb +3 -0
  85. data/lib/daru_lite.rb +99 -0
  86. data/profile/_base.rb +23 -0
  87. data/profile/df_to_a.rb +10 -0
  88. data/profile/filter.rb +13 -0
  89. data/profile/joining.rb +13 -0
  90. data/profile/sorting.rb +12 -0
  91. data/profile/vector_each_with_index.rb +9 -0
  92. data/profile/vector_new.rb +9 -0
  93. data/spec/accessors/array_wrapper_spec.rb +3 -0
  94. data/spec/category_spec.rb +1741 -0
  95. data/spec/core/group_by_spec.rb +655 -0
  96. data/spec/core/merge_spec.rb +179 -0
  97. data/spec/core/query_spec.rb +347 -0
  98. data/spec/daru_lite_spec.rb +22 -0
  99. data/spec/dataframe_spec.rb +4330 -0
  100. data/spec/date_time/data_spec.rb +197 -0
  101. data/spec/date_time/date_time_index_helper_spec.rb +72 -0
  102. data/spec/date_time/index_spec.rb +588 -0
  103. data/spec/date_time/offsets_spec.rb +465 -0
  104. data/spec/extensions/which_dsl_spec.rb +38 -0
  105. data/spec/fixtures/bank2.dat +200 -0
  106. data/spec/fixtures/boolean_converter_test.csv +5 -0
  107. data/spec/fixtures/countries.json +7794 -0
  108. data/spec/fixtures/duplicates.csv +32 -0
  109. data/spec/fixtures/eciresults.html +394 -0
  110. data/spec/fixtures/empties.dat +2 -0
  111. data/spec/fixtures/empty_rows_test.csv +17 -0
  112. data/spec/fixtures/macau.html +3691 -0
  113. data/spec/fixtures/macd_data.csv +150 -0
  114. data/spec/fixtures/matrix_test.csv +100 -0
  115. data/spec/fixtures/moneycontrol.html +6812 -0
  116. data/spec/fixtures/music_data.tsv +2501 -0
  117. data/spec/fixtures/repeated_fields.csv +7 -0
  118. data/spec/fixtures/sales-funnel.csv +18 -0
  119. data/spec/fixtures/scientific_notation.csv +4 -0
  120. data/spec/fixtures/string_converter_test.csv +5 -0
  121. data/spec/fixtures/strings.dat +2 -0
  122. data/spec/fixtures/test_xls.xls +0 -0
  123. data/spec/fixtures/test_xls_2.xls +0 -0
  124. data/spec/fixtures/url_test.txt~ +0 -0
  125. data/spec/fixtures/valid_markup.html +62 -0
  126. data/spec/fixtures/wiki_climate.html +1243 -0
  127. data/spec/fixtures/wiki_table_info.html +631 -0
  128. data/spec/formatters/table_formatter_spec.rb +137 -0
  129. data/spec/helpers_spec.rb +8 -0
  130. data/spec/index/categorical_index_spec.rb +170 -0
  131. data/spec/index/index_spec.rb +417 -0
  132. data/spec/index/multi_index_spec.rb +680 -0
  133. data/spec/io/io_spec.rb +373 -0
  134. data/spec/io/sql_data_source_spec.rb +56 -0
  135. data/spec/iruby/dataframe_spec.rb +170 -0
  136. data/spec/iruby/helpers_spec.rb +49 -0
  137. data/spec/iruby/multi_index_spec.rb +37 -0
  138. data/spec/iruby/vector_spec.rb +105 -0
  139. data/spec/maths/arithmetic/dataframe_spec.rb +148 -0
  140. data/spec/maths/arithmetic/vector_spec.rb +165 -0
  141. data/spec/maths/statistics/dataframe_spec.rb +178 -0
  142. data/spec/maths/statistics/vector_spec.rb +756 -0
  143. data/spec/monkeys_spec.rb +42 -0
  144. data/spec/shared/vector_display_spec.rb +213 -0
  145. data/spec/spec_helper.rb +87 -0
  146. data/spec/support/database_helper.rb +30 -0
  147. data/spec/support/matchers.rb +5 -0
  148. data/spec/vector_spec.rb +2293 -0
  149. metadata +571 -0
@@ -0,0 +1,929 @@
1
+ module DaruLite
2
+ module Category # rubocop:disable Metrics/ModuleLength
3
+ UNDEFINED = Object.new.freeze
4
+
5
+ attr_accessor :base_category
6
+ attr_reader :index, :coding_scheme, :name
7
+
8
+ # Initializes a vector to store categorical data.
9
+ # @note Base category is set to the first category encountered in the vector.
10
+ # @param [Array] data the categorical data
11
+ # @param [Hash] opts the options
12
+ # @option opts [Boolean] :ordered true if data is ordered, false otherwise
13
+ # @option opts [Array] :categories categories to associate with the vector.
14
+ # It adds extra categories if specified and also provides the order of categories.
15
+ # @option opts [object] :index gives index to vector. By default it is from 0 to size-1
16
+ # @return the categorical data created
17
+ # @example
18
+ # dv = DaruLite::Vector.new [:a, 1, :a, 1, :c],
19
+ # type: :category,
20
+ # ordered: true,
21
+ # categories: [:a, :b, :c, 1]
22
+ # # => #<DaruLite::Vector(5)>
23
+ # # 0 a
24
+ # # 1 1
25
+ # # 2 a
26
+ # # 3 1
27
+ # # 4 c
28
+ def initialize_category(data, opts = {})
29
+ @type = :category
30
+ initialize_core_attributes data
31
+
32
+ if opts[:categories]
33
+ validate_categories(opts[:categories])
34
+ add_extra_categories(opts[:categories] - categories)
35
+ order_with opts[:categories]
36
+ end
37
+
38
+ # Specify if the categories are ordered or not.
39
+ # By default its unordered
40
+ @ordered = opts[:ordered] || false
41
+
42
+ # The coding scheme to code with. Default is dummy coding.
43
+ @coding_scheme = :dummy
44
+
45
+ # Base category which won't be present in the coding
46
+ @base_category = @cat_hash.keys.first
47
+
48
+ # Stores the name of the vector
49
+ @name = opts[:name]
50
+
51
+ # Index of the vector
52
+ @index = coerce_index opts[:index]
53
+
54
+ self
55
+ end
56
+
57
+ def name=(new_name)
58
+ @name = new_name
59
+ self
60
+ end
61
+
62
+ alias rename name=
63
+
64
+ # Returns an enumerator that enumerates on categorical data
65
+ # @return [Enumerator] an enumerator that enumerates over data stored in vector
66
+ def each
67
+ return enum_for(:each) unless block_given?
68
+
69
+ @array.each { |pos| yield cat_from_int pos }
70
+ self
71
+ end
72
+
73
+ # Returns all categorical data
74
+ # @return [Array] array of all categorical data which vector is storing
75
+ # @example
76
+ # dv = DaruLite::Vector.new [:a, 1, :a, 1, :c], type: :category
77
+ # dv.to_a
78
+ # # => [:a, 1, :a, 1, :c]
79
+ def to_a
80
+ each.to_a
81
+ end
82
+
83
+ # Duplicates a vector
84
+ # @return [DaruLite::Vector] duplicated vector
85
+ # @example
86
+ # dv = DaruLite::Vector.new [:a, 1, :a, 1, :c], type: :category
87
+ # dv.dup
88
+ # # => #<DaruLite::Vector(5)>
89
+ # # 0 a
90
+ # # 1 1
91
+ # # 2 a
92
+ # # 3 1
93
+ # # 4 c
94
+ def dup
95
+ DaruLite::Vector.new to_a.dup,
96
+ name: @name,
97
+ index: @index.dup,
98
+ type: :category,
99
+ categories: categories,
100
+ ordered: ordered?
101
+ end
102
+
103
+ # Associates a category to the vector.
104
+ # @param [Array] new_categories new categories to be associated
105
+ # @example
106
+ # dv = DaruLite::Vector.new [:a, 1, :a, 1, :c], type: :category
107
+ # dv.add_category :b
108
+ # dv.categories
109
+ # # => [:a, 1, :c, :b]
110
+ def add_category(*new_categories)
111
+ new_categories -= categories
112
+ add_extra_categories new_categories
113
+ end
114
+
115
# Returns the frequency of a given category, or the total number of values
# when called without an argument.
# @param [object] category the category whose occurrences are counted
# @return [Integer] occurrences of +category+, or the total number of values
#   when no category is given
# @raise [ArgumentError] if +category+ is not one of the vector's categories
# @example
#   dv = DaruLite::Vector.new [:a, 1, :a, 1, :c], type: :category
#   dv.count :a
#   # => 2
#   dv.count
#   # => 5
def count(category = UNDEFINED)
  # Identity check: the sentinel is never handed to a category's own #==.
  return @cat_hash.values.sum(&:size) if UNDEFINED.equal?(category)

  # Validate with the same key semantics the lookup below uses. An
  # Array#include? check (==) would accept 1.0 for category 1 and then crash
  # on the failed hash lookup instead of reporting an invalid category.
  raise ArgumentError, "Invalid category #{category}" unless
    @cat_hash.key?(category)

  @cat_hash[category].size
end
131
+
132
+ # Returns a vector storing count/frequency of each category
133
+ # @return [DaruLite::Vector] Return a vector whose indexes are categories
134
+ # and corresponding values are its count
135
+ # @example
136
+ # dv = DaruLite::Vector.new [:a, 1, :a, 1, :c], type: :category, categories: [:a, :b, :c, 1]
137
+ # dv.frequencies
138
+ # # => #<DaruLite::Vector(4)>
139
+ # # a 2
140
+ # # b 0
141
+ # # c 1
142
+ # # 1 2
143
+ def frequencies(type = :count)
144
+ counts = @cat_hash.values.map(&:size)
145
+ values =
146
+ case type
147
+ when :count
148
+ counts
149
+ when :fraction
150
+ counts.map { |c| c / size.to_f }
151
+ when :percentage
152
+ counts.map { |c| c / size.to_f * 100 }
153
+ else
154
+ raise ArgumentError, 'Type should be either :count, :fraction or ' \
155
+ ":percentage. #{type} not supported."
156
+ end
157
+ DaruLite::Vector.new values, index: categories, name: name
158
+ end
159
+
160
+ # Returns vector for indexes/positions specified
161
+ # @param [Array] indexes for which values have to be retrieved
162
+ # @note It accepts both indexes and positions. In case of collision,
163
+ # argument will be treated as index
164
+ # @return vector containing values specified at specified indexes/positions
165
+ # @example
166
+ # dv = DaruLite::Vector.new [:a, 1, :a, 1, :c],
167
+ # type: :category,
168
+ # index: 'a'..'e'
169
+ # dv[:a, 1]
170
+ # # => #<DaruLite::Vector(2)>
171
+ # # a a
172
+ # # b 1
173
+ # dv[0, 1]
174
+ # # => #<DaruLite::Vector(2)>
175
+ # # a a
176
+ # # b 1
177
+ def [](*indexes)
178
+ positions = @index.pos(*indexes)
179
+ return category_from_position(positions) if positions.is_a? Integer
180
+
181
+ DaruLite::Vector.new positions.map { |pos| category_from_position pos },
182
+ index: @index.subset(*indexes),
183
+ name: @name,
184
+ type: :category,
185
+ ordered: @ordered,
186
+ categories: categories
187
+ end
188
+
189
+ # Returns vector for positions specified.
190
+ # @param [Array] positions at which values are to be retrieved.
191
+ # @return vector containing values specified at specified positions
192
+ # @example
193
+ # dv = DaruLite::Vector.new [:a, 1, :a, 1, :c], type: :category
194
+ # dv.at 0..-2
195
+ # # => #<DaruLite::Vector(4)>
196
+ # # 0 a
197
+ # # 1 1
198
+ # # 2 a
199
+ # # 3 1
200
+ def at(*positions)
201
+ original_positions = positions
202
+ positions = coerce_positions(*positions)
203
+ validate_positions(*positions)
204
+
205
+ return category_from_position(positions) if positions.is_a? Integer
206
+
207
+ DaruLite::Vector.new positions.map { |pos| category_from_position(pos) },
208
+ index: @index.at(*original_positions),
209
+ name: @name,
210
+ type: :category,
211
+ ordered: @ordered,
212
+ categories: categories
213
+ end
214
+
215
+ # Modifies values at specified indexes/positions.
216
+ # @note In order to add a new category you need to associate it via #add_category
217
+ # @param [Array] indexes at which to modify value
218
+ # @param [object] val value to assign at specific indexes/positions
219
+ # @return modified vector
220
+ # @example
221
+ # dv = DaruLite::Vector.new [:a, 1, :a, 1, :c], type: :category
222
+ # dv.add_category :b
223
+ # dv[0] = :b
224
+ # dv
225
+ # # => #<DaruLite::Vector(5)>
226
+ # # 0 b
227
+ # # 1 1
228
+ # # 2 a
229
+ # # 3 1
230
+ # # 4 c
231
+ def []=(*indexes, val)
232
+ positions = @index.pos(*indexes)
233
+
234
+ if positions.is_a? Numeric
235
+ modify_category_at positions, val
236
+ else
237
+ positions.each { |pos| modify_category_at pos, val }
238
+ end
239
+ self
240
+ end
241
+
242
# Modifies values at specified positions.
# @param [Array, Range, Integer] positions positions at which to modify value;
#   a single Integer is treated as a one-element list
# @param [object] val value to assign at specific positions
# @return modified vector
# @example
#   dv = DaruLite::Vector.new [:a, 1, :a, 1, :c], type: :category
#   dv.add_category :b
#   dv.set_at [0, 1], :b
#   # => #<DaruLite::Vector(5)>
#   #   0   b
#   #   1   b
#   #   2   a
#   #   3   1
#   #   4   c
def set_at(positions, val)
  # A lone Integer used to pass validation (it splats fine) and then fail on
  # Integer#map; wrap it so both calls see a collection.
  positions = [positions] if positions.is_a? Integer

  # validate_positions and modify_category_at are defined outside this view.
  validate_positions(*positions)
  # each, not map: the block is run only for its side effect.
  positions.each { |pos| modify_category_at pos, val }
  self
end
261
+
262
+ # Size of categorical data.
263
+ # @return total number of values in the vector
264
+ # @example
265
+ # dv = DaruLite::Vector.new [:a, 1, :a, 1, :c], type: :category
266
+ # dv.size
267
+ # # => 5
268
+ def size
269
+ @array.size
270
+ end
271
+
272
+ # Tells whether vector is ordered or not.
273
+ # @return [Boolean] true if vector is ordered, false otherwise
274
+ # @example
275
+ # dv = DaruLite::Vector.new [:a, 1, :a, 1, :c], type: :category
276
+ # dv.ordered?
277
+ # # => false
278
+ def ordered?
279
+ @ordered
280
+ end
281
+
282
+ # Make categorical data ordered or unordered.
283
+ # @param [Boolean] bool true if categorical data is to be ordered, false otherwise
284
+ # @example
285
+ # dv = DaruLite::Vector.new [:a, 1, :a, 1, :c], type: :category
286
+ # dv.ordered = true
287
+ # dv.ordered?
288
+ # # => true
289
+ def ordered=(bool)
290
+ @ordered = bool
291
+ end
292
+
293
+ # Returns all the categories with the inherent order
294
+ # @return [Array] categories of the vector with the order
295
+ # @example
296
+ # dv = DaruLite::Vector.new [:a, 1, :a, 1, :c],
297
+ # type: :category,
298
+ # categories: [:a, :b, :c, 1]
299
+ # dv.categories
300
+ # # => [:a, :b, :c, 1]
301
+ def categories
302
+ @cat_hash.keys
303
+ end
304
+
305
+ alias order categories
306
+
307
+ # Sets order of the categories.
308
+ # @note If extra categories are specified, they get added too.
309
+ # @param [Array] cat_with_order categories specifying their order
310
+ # @example
311
+ # dv = DaruLite::Vector.new [:a, 1, :a, 1, :c], type: :category
312
+ # dv.categories = [:a, :b, :c, 1]
313
+ # dv.categories
314
+ # # => [:a, :b, :c, 1]
315
+ def categories=(cat_with_order)
316
+ validate_categories(cat_with_order)
317
+ add_extra_categories(cat_with_order - categories)
318
+ order_with cat_with_order
319
+ end
320
+
321
+ # Rename categories.
322
+ # @note The order of categories after renaming is preserved but new categories
323
+ # are added at the end in the order. Also the base-category is reassigned
324
+ # to new value if it is renamed
325
+ # @param [Hash] old_to_new a hash mapping categories whose name to be changed
326
+ # to their new names
327
+ # @example
328
+ # dv = DaruLite::Vector.new [:a, 1, :a, 1, :c], type: :category
329
+ # dv.rename_categories :a => :b
330
+ # dv
331
+ # # => #<DaruLite::Vector(5)>
332
+ # # 0 b
333
+ # # 1 1
334
+ # # 2 b
335
+ # # 3 1
336
+ # # 4 c
337
+ def rename_categories(old_to_new)
338
+ old_categories = categories
339
+ data = to_a.map do |cat|
340
+ old_to_new.include?(cat) ? old_to_new[cat] : cat
341
+ end
342
+
343
+ initialize_core_attributes data
344
+ self.categories = (old_categories - old_to_new.keys) | old_to_new.values
345
+ self.base_category = old_to_new[base_category] if
346
+ old_to_new.include? base_category
347
+ self
348
+ end
349
+
350
+ # Removes the unused categories
351
+ # @note If base category is removed, then the first occurring category in the
352
+ # data is taken as base category. Order of the undeleted categories
353
+ # remains preserved.
354
+ # @return [DaruLite::Vector] Makes changes in the vector itself i.e. deletes
355
+ # the unused categories and returns itself
356
+ # @example
357
+ # dv = DaruLite::Vector.new [:one, :two, :one], type: :category,
358
+ # categories: [:three, :two, :one]
359
+ # dv.remove_unused_categories
360
+ # dv.categories
361
+ # # => [:two, :one]
362
+ def remove_unused_categories
363
+ old_categories = categories
364
+
365
+ initialize_core_attributes to_a
366
+ self.categories = old_categories & categories
367
+ self.base_category = @cat_hash.keys.first unless
368
+ categories.include? base_category
369
+ self
370
+ end
371
+
372
+ # Returns the minimum category according to the order specified.
373
+ # @note This operation will only work if vector is ordered.
374
+ # To set the vector ordered do `vector.ordered = true`
375
+ # @return [object] the minimum category according to the order
376
+ # @example
377
+ # dv = DaruLite::Vector.new ['second', 'second', 'third', 'first'],
378
+ # categories: ['first', 'second', 'third'], type: :category, ordered: true
379
+ # dv.min
380
+ # # => 'first'
381
+ def min
382
+ assert_ordered :min
383
+ categories.first
384
+ end
385
+
386
+ # Returns the maximum category according to the order specified.
387
+ # @note This operation will only work if vector is ordered.
388
+ # To set the vector ordered do `vector.ordered = true`
389
+ # @return [object] the maximum category according to the order
390
+ # @example
391
+ # dv = DaruLite::Vector.new ['second', 'second', 'third', 'first'],
392
+ # categories: ['first', 'second', 'third'], type: :category, ordered: true
393
+ # dv.max
394
+ # # => 'third'
395
+ def max
396
+ assert_ordered :max
397
+ categories.last
398
+ end
399
+
400
# Sorts the vector in place according to the order of its categories.
# @note This operation will only work if vector is ordered.
#   To set the vector ordered, do `vector.ordered = true`
# @return [DaruLite::Vector] the vector itself, sorted
# @raise [ArgumentError] if the vector is unordered
# @example
#   dv = DaruLite::Vector.new ['second', 'second', 'third', 'first'],
#     categories: ['first', 'second', 'third'],
#     type: :category,
#     ordered: true
#   dv.sort!
#   # => #<DaruLite::Vector(4)>
#   #   3  first
#   #   0 second
#   #   1 second
#   #   2  third
def sort!
  # TODO: Simplify the code
  assert_ordered :sort

  # Build sorted index: walk the categories in order and collect the labels
  # of their positions. flat_map strips exactly one level of nesting, so a
  # label that is itself an array is kept intact (map + flatten would
  # dissolve it into its elements).
  labels = @index.to_a
  sorted_labels = @cat_hash.values.flat_map do |positions|
    labels.values_at(*positions)
  end
  @index = @index.class.new sorted_labels

  # Build sorted data: every category occupies one contiguous run of
  # positions, laid out in category order. The block reads the old @cat_hash;
  # the new hash is assigned only once it is complete.
  offset = 0
  @cat_hash = categories.each_with_object({}) do |cat, sorted|
    occurrences = @cat_hash[cat].size
    # int_from_cat is defined outside this view; presumably it maps a
    # category to its integer code - verify against its definition.
    occurrences.times { |i| @array[offset + i] = int_from_cat(cat) }
    sorted[cat] = (offset...(offset + occurrences)).to_a
    offset += occurrences
  end

  self
end
437
+
438
+ def sort
439
+ dup.sort!
440
+ end
441
+
442
+ # Set coding scheme
443
+ # @param [Symbol] scheme to set
444
+ # @example
445
+ # dv = DaruLite::Vector.new [:a, 1, :a, 1, :c], type: :category
446
+ # dv.coding_scheme = :deviation
447
+ # dv.coding_scheme
448
+ # # => :deviation
449
+ def coding_scheme=(scheme)
450
+ raise ArgumentError, "Unknown or unsupported coding scheme #{scheme}." unless
451
+ CODING_SCHEMES.include? scheme
452
+
453
+ @coding_scheme = scheme
454
+ end
455
+
456
+ CODING_SCHEMES = %i[dummy deviation helmert simple].freeze
457
+
458
+ # Contrast codes the vector according to the coding scheme set.
459
+ # @note To set the coding scheme use #coding_scheme=
460
+ # @param [Hash] opts The options to pass for coding.
461
+ # @option opts [TrueClass, FalseClass] :full (false) True if you want k variables
462
+ # for k categories, false if you want k-1 variables for k categories.
463
+ # @return [DaruLite::DataFrame] dataframe containing all coded variables
464
+ # @example
465
+ # dv = DaruLite::Vector.new [:a, 1, :a, 1, :c], type: :category
466
+ # dv.contrast_code full: false
467
+ # # => #<DaruLite::DataFrame(5x2)>
468
+ # # daru_1 daru_c
469
+ # # 0 0 0
470
+ # # 1 1 0
471
+ # # 2 0 0
472
+ # # 3 1 0
473
+ # # 4 0 1
474
+ def contrast_code(opts = {})
475
+ if opts[:user_defined]
476
+ user_defined_coding(opts[:user_defined])
477
+ else
478
+ # TODO: Make various coding schemes code DRY
479
+ send(:"#{coding_scheme}_coding", opts[:full] || false)
480
+ end
481
+ end
482
+
483
+ # Two categorical vectors are equal if their index and corresponding values are same
484
+ # @return [true, false] true if two vectors are equal
485
+ # @example
486
+ # dv = DaruLite::Vector.new [:a, 1, :a, 1, :c], type: :category
487
+ # other = DaruLite::Vector.new [:a, 1, :a, 1, :c],
488
+ # type: :category,
489
+ # index: 1..5
490
+ # dv == other
491
+ # # => false
492
+ def ==(other)
493
+ size == other.size &&
494
+ to_a == other.to_a &&
495
+ index == other.index
496
+ end
497
+
498
+ # Returns integer coding for categorical data in the order starting from 0.
499
+ # For example if order is [:a, :b, :c], then :a, will be coded as 0, :b as 1 and :c as 2
500
+ # @return [Array] integer coding of all values of vector
501
+ # @example
502
+ # dv = DaruLite::Vector.new [:a, 1, :a, 1, :c],
503
+ # type: :category,
504
+ # categories: [:a, :b, :c, 1]
505
+ # dv.to_ints
506
+ # # => [0, 1, 0, 1, 2]
507
+ def to_ints
508
+ @array
509
+ end
510
+
511
+ # Reorder the vector with given positions
512
+ # @note Unlike #reindex! which takes index as input, it takes
513
+ # positions as an input to reorder the vector
514
+ # @param [Array] order the order to reorder the vector with
515
+ # @return reordered vector
516
+ # @example
517
+ # dv = DaruLite::Vector.new [3, 2, 1], index: ['c', 'b', 'a'], type: :category
518
+ # dv.reorder! [2, 1, 0]
519
+ # # => #<DaruLite::Vector(3)>
520
+ # # a 1
521
+ # # b 2
522
+ # # c 3
523
+ def reorder!(order)
524
+ raise ArgumentError, 'Invalid order specified' unless
525
+ order.sort == size.times.to_a
526
+
527
+ # TODO: Room for optimization
528
+ old_data = to_a
529
+ new_data = order.map { |i| old_data[i] }
530
+ initialize_core_attributes new_data
531
+ self
532
+ end
533
+
534
+ # Sets new index for vector. Preserves index->value correspondence.
535
+ # @note Unlike #reorder! which takes positions as input it takes
536
+ # index as an input to reorder the vector
537
+ # @param [DaruLite::Index, DaruLite::MultiIndex, Array] idx new index to order with
538
+ # @return [DaruLite::Vector] vector reindexed with new index
539
+ # @example
540
+ # dv = DaruLite::Vector.new [3, 2, 1], index: ['c', 'b', 'a'], type: :category
541
+ # dv.reindex! ['a', 'b', 'c']
542
+ # # => #<DaruLite::Vector(3)>
543
+ # # a 1
544
+ # # b 2
545
+ # # c 3
546
+ def reindex!(idx)
547
+ idx = DaruLite::Index.new idx unless idx.is_a? DaruLite::Index
548
+ raise ArgumentError, 'Invalid index specified' unless
549
+ idx.to_a.sort == index.to_a.sort
550
+
551
+ old_categories = categories
552
+ data = idx.map { |i| self[i] }
553
+ initialize_core_attributes data
554
+ self.categories = old_categories
555
+ self.index = idx
556
+ self
557
+ end
558
+
559
+ {
560
+ eq: :==,
561
+ not_eq: :!=,
562
+ lt: :<,
563
+ lteq: :<=,
564
+ mt: :>,
565
+ mteq: :>=
566
+ }.each do |method, operator|
567
+ define_method(method) do |other|
568
+ mod = DaruLite::Core::Query
569
+ if other.is_a?(DaruLite::Vector)
570
+ mod.apply_vector_operator operator, to_ints, other.to_ints
571
+ else
572
+ mod.apply_scalar_operator operator, @array, int_from_cat(other)
573
+ end
574
+ end
575
+ end
576
+ alias gt mt
577
+ alias gteq mteq
578
+
579
+ # For querying the data
580
+ # @param bool_array [object] arel like query syntax
581
+ # @return [DaruLite::Vector] Vector which makes the conditions true
582
+ # @example
583
+ # dv = DaruLite::Vector.new ['I', 'II', 'I', 'III', 'I', 'II'],
584
+ # type: :category,
585
+ # ordered: true,
586
+ # categories: ['I', 'II', 'III']
587
+ # dv.where(dv.mt('I') & dv.lt('III'))
588
+ # # => #<DaruLite::Vector(2)>
589
+ # # 1 II
590
+ # # 5 II
591
+ def where(bool_array)
592
+ DaruLite::Core::Query.vector_where self, bool_array
593
+ end
594
+
595
+ # Gives the summary of data using following parameters
596
+ # - size: size of the data
597
+ # - categories: total number of categories
598
+ # - max_freq: Max no of times a category occurs
599
+ # - max_category: The category which occurs max no of times
600
+ # - min_freq: Min no of times a category occurs
601
+ # - min_category: The category which occurs min no of times
602
+ # @return [DaruLite::Vector] Vector with index as following parameters
603
+ # and values as values to these parameters
604
+ # @example
605
+ # dv = DaruLite::Vector.new [:a, 1, :a, 1, :c], type: :category
606
+ # dv.describe
607
+ # # => #<DaruLite::Vector(6)>
608
+ # # size 5
609
+ # # categories 3
610
+ # # max_freq 2
611
+ # # max_category a
612
+ # # min_freq 1
613
+ # # min_category c
614
+ def describe
615
+ DaruLite::Vector.new(
616
+ size: size,
617
+ categories: categories.size,
618
+ max_freq: @cat_hash.values.map(&:size).max,
619
+ max_category: @cat_hash.keys.max_by { |cat| @cat_hash[cat].size },
620
+ min_freq: @cat_hash.values.map(&:size).min,
621
+ min_category: @cat_hash.keys.min_by { |cat| @cat_hash[cat].size }
622
+ )
623
+ end
624
+
625
+ # Does nothing since it's already of type category.
626
+ # @return [DaruLite::Vector] categorical vector
627
+ def to_category
628
+ self
629
+ end
630
+
631
+ # Converts a category type vector to non category type vector
632
+ # @return [DaruLite::Vector] non category type vector
633
+ def to_non_category
634
+ DaruLite::Vector.new to_a, name: name, index: index
635
+ end
636
+
637
+ # Sets index of the vector
638
+ # @param [DaruLite::Index, DaruLite::MultiIndex, DaruLite::CategoricalIndex, Array, Range]
639
+ # idx new index to assign to vector
640
+ # @return [DaruLite::Index, DaruLite::CategoricalIndex, DaruLite::MultiIndex] the index assigned
641
+ # @example
642
+ # dv = DaruLite::Vector.new [1, 2, 3], type: :category
643
+ # dv.index = 'a'..'c'
644
+ # dv
645
+ # # => #<DaruLite::Vector(3)>
646
+ # # a 1
647
+ # # b 2
648
+ # # c 3
649
+ def index=(idx)
650
+ @index = coerce_index idx
651
+ end
652
+
653
+ # Check if any one of mentioned values occur in the vector
654
+ # @param [Array] values to check for
655
+ # @return [true, false] returns true if any one of specified values
656
+ # occur in the vector
657
+ # @example
658
+ # dv = DaruLite::Vector.new [1, 2, 3, 4, nil]
659
+ # dv.include_values? nil, Float::NAN
660
+ # # => true
661
+ def include_values?(*values)
662
+ values.any? { |v| @cat_hash.include?(v) && !@cat_hash[v].empty? }
663
+ end
664
+
665
+ # Return a vector with specified values removed
666
+ # @param [Array] values to reject from resultant vector
667
+ # @return [DaruLite::Vector] vector with specified values removed
668
+ # @example
669
+ # dv = DaruLite::Vector.new [1, 2, nil, Float::NAN], type: :category
670
+ # dv.reject_values nil, Float::NAN
671
+ # # => #<DaruLite::Vector(2)>
672
+ # # 0 1
673
+ # # 1 2
674
+ def reject_values(*values)
675
+ resultant_pos = size.times.to_a - values.flat_map { |v| @cat_hash[v] }
676
+ dv = at(*resultant_pos)
677
+ unless dv.is_a? DaruLite::Vector
678
+ pos = resultant_pos.first
679
+ dv = at(pos..pos)
680
+ end
681
+ dv.remove_unused_categories
682
+ end
683
+
684
+ # Count the number of values specified
685
+ # @param [Array] values to count for
686
+ # @return [Integer] the number of times the values mentioned occurs
687
+ # @example
688
+ # dv = DaruLite::Vector.new [1, 2, 1, 2, 3, 4, nil, nil]
689
+ # dv.count_values nil
690
+ # # => 2
691
+ def count_values(*values)
692
+ values.filter_map { |v| @cat_hash[v].size if @cat_hash.include? v }
693
+ .sum
694
+ end
695
+
696
+ # Return indexes of values specified
697
+ # @param [Array] values to find indexes for
698
+ # @return [Array] array of indexes of values specified
699
+ # @example
700
+ # dv = DaruLite::Vector.new [1, 2, nil, Float::NAN], index: 11..14
701
+ # dv.indexes nil, Float::NAN
702
+ # # => [13, 14]
703
+ def indexes(*values)
704
+ values &= categories
705
+ index.to_a.values_at(*values.flat_map { |v| @cat_hash[v] }.sort)
706
+ end
707
+
708
+ # Replaces specified values with a new value
709
+ # @param [Array] old_values array of values to replace
710
+ # @param [object] new_value new value to replace with
711
+ # @note It performs the replace in place.
712
+ # @return [DaruLite::Vector] Same vector itself with values
713
+ # replaced with new value
714
+ # @example
715
+ # dv = DaruLite::Vector.new [1, 2, :a, :b]
716
+ # dv.replace_values [:a, :b], nil
717
+ # dv
718
+ # # =>
719
+ # # #<DaruLite::Vector:19903200 @name = nil @metadata = {} @size = 4 >
720
+ # # nil
721
+ # # 0 1
722
+ # # 1 2
723
+ # # 2 nil
724
+ # # 3 nil
725
+ def replace_values(old_values, new_value)
726
+ old_values = [old_values] unless old_values.is_a? Array
727
+ rename_hash = old_values.to_h { |v| [v, new_value] }
728
+ rename_categories rename_hash
729
+ end
730
+
731
+ def positions(*values)
732
+ values &= categories
733
+ values.flat_map { |v| @cat_hash[v] }.sort
734
+ end
735
+
736
+ private
737
+
738
+ def validate_categories(input_categories)
739
+ raise ArgumentError, 'Input categories and speculated categories mismatch' unless
740
+ (categories - input_categories).empty?
741
+ end
742
+
743
# Registers each given category with an empty list of positions.
# Any category already present has its position list reset to empty.
# @param [Array] extra_categories categories to register
# @return [Array] the extra_categories argument
def add_extra_categories(extra_categories)
  extra_categories.each do |extra|
    @cat_hash.store(extra, [])
  end
end
746
+
747
# Builds the two core structures from the raw data:
# @cat_hash maps each category to the positions where it occurs, and
# @array stores, for every position, the integer code of its category.
# @param [Array] data the raw values of the vector
# @return [Array] the integer-coded array assigned to @array
def initialize_core_attributes(data)
  # Map each category to its positional indexes, in first-seen order
  @cat_hash = data.each_with_index
                  .group_by(&:first)
                  .transform_values { |pairs| pairs.map(&:last) }

  # Map each category to a unique integer for effective storage in @array
  map_cat_int = @cat_hash.keys.each_with_index.to_h

  # Store one integer per instance to link it to its category. Lookups are
  # done element by element rather than via values_at(*data), so very large
  # inputs are never splatted into a single argument list.
  @array = data.map { |value| map_cat_int[value] }
end
759
+
760
# Looks up the category stored at a position.
# @param [Integer] position position in the vector
# @return [object] the category whose integer code is stored at that position
def category_from_position(position)
  code = @array[position]
  cat_from_int(code)
end
763
+
764
# Guards operations that only make sense for ordered categorical data.
# @param [object] operation name of the operation, used in the error message
# @raise [ArgumentError] if the vector is not ordered
def assert_ordered(operation)
  # TODO: Change ArgumentError to something more expressive
  return if ordered?

  # The message ends here; no trailing line continuation after the last literal.
  raise ArgumentError, "Can not apply #{operation} when vector is unordered. " \
                       'To make the categorical data ordered, use #ordered = true'
end
771
+
772
# Builds a data frame with one dummy-coded column per category.
# @param [Boolean] full when falsy, the base category gets no column
# @return [DaruLite::DataFrame] columns named via create_names
def dummy_coding(full)
  selected = @cat_hash.keys
  unless full
    base = base_category
    selected = selected.reject { |category| category == base }
  end

  columns = selected.map { |category| dummy_code(@cat_hash[category]) }

  DaruLite::DataFrame.new(columns, index: @index, order: create_names(selected))
end
784
+
785
# Produces one dummy-coded column.
# @param [Array] positions positions that belong to the category
# @return [Array] array of 0s with 1 at each of the given positions
def dummy_code(positions)
  Array.new(size, 0).tap do |code|
    positions.each { |pos| code[pos] = 1 }
  end
end
790
+
791
# Builds a data frame with one simple-coded column per category.
# @param [Boolean] full when falsy, the base category gets no column
# @return [DaruLite::DataFrame] columns named via create_names
def simple_coding(full)
  selected = @cat_hash.keys
  unless full
    base = base_category
    selected = selected.reject { |category| category == base }
  end

  columns = selected.map { |category| simple_code(@cat_hash[category]) }

  DaruLite::DataFrame.new(columns, index: @index, order: create_names(selected))
end
803
+
804
# Produces one simple-coded column.
# With n categories, the given positions get (n - 1) / n and every other
# position gets -1 / n.
# @param [Array] positions positions that belong to the category
# @return [Array] array of Float codes
def simple_code(positions)
  count = @cat_hash.size.to_f
  member_value = (count - 1) / count
  Array.new(size, -1 / count).tap do |code|
    positions.each { |pos| code[pos] = member_value }
  end
end
810
+
811
# Builds a data frame of Helmert-coded columns, one for every category
# except the last. Arguments are accepted and ignored.
# @return [DaruLite::DataFrame] columns named via create_names
def helmert_coding(*)
  contrast_categories = @cat_hash.keys[0..-2]
  columns = Array.new(contrast_categories.size) { |level| helmert_code(level) }

  DaruLite::DataFrame.new(columns,
                          index: @index,
                          order: create_names(contrast_categories))
end
822
+
823
# Produces the Helmert-coded column for the category with the given code.
# With n = (number of categories - index): instances of that category get
# (n - 1) / n, instances of later categories get -1 / n, earlier ones get 0.
# @param [Integer] index integer code of the category
# @return [Array] one code per element of @array
def helmert_code(index)
  remaining = (categories.size - index).to_f
  own_value = (remaining - 1) / remaining
  later_value = -1 / remaining

  @array.map do |cat_index|
    next own_value if cat_index == index

    cat_index > index ? later_value : 0
  end
end
836
+
837
# Builds a data frame of deviation-coded columns, one for every category
# except the last. Arguments are accepted and ignored.
# @return [DaruLite::DataFrame] columns named via create_names
def deviation_coding(*)
  contrast_categories = @cat_hash.keys[0..-2]
  columns = Array.new(contrast_categories.size) { |level| deviation_code(level) }

  DaruLite::DataFrame.new(columns,
                          index: @index,
                          order: create_names(contrast_categories))
end
848
+
849
# Produces the deviation-coded column for the category with the given code:
# 1 for instances of that category, -1 for instances of the last category,
# and 0 for everything else.
# @param [Integer] index integer code of the category
# @return [Array] one code per element of @array
def deviation_code(index)
  reference = categories.size - 1

  @array.map do |cat_index|
    if cat_index == index
      1
    elsif cat_index == reference
      -1
    else
      0
    end
  end
end
859
+
860
# Builds a coded data frame from a user-supplied coding table: for each
# position, the row of df looked up by the value at that position is used.
# @param [DaruLite::DataFrame] df the coding table
# @return [DaruLite::DataFrame] one row per position, ordered by df.vectors
def user_defined_coding(df)
  coded_rows = (0...size).map { |pos| df.row[at(pos)].to_a }

  DaruLite::DataFrame.rows(coded_rows, index: @index, order: df.vectors.to_a)
end
865
+
866
# Builds column names of the form "<name>_<category>".
# @param [Array] categories categories to build names for
# @return [Array] Symbols when the vector name is a Symbol, Strings otherwise
def create_names(categories)
  symbolic = name.is_a?(Symbol)

  categories.map do |cat|
    label = "#{name}_#{cat}"
    symbolic ? label.to_sym : label
  end
end
871
+
872
# Converts the supplied index into an index object and validates it.
# Existing index objects are used as is; nil builds an index from the vector
# size; a Range or Array is wrapped in a DaruLite::Index.
# @param [DaruLite::Index, DaruLite::MultiIndex, DaruLite::CategoricalIndex,
#   Range, Array, nil] index the index to coerce
# @return [object] the coerced index, after it has passed validate_index
# @raise [ArgumentError] if the index is of an unsupported type
def coerce_index(index)
  index =
    case index
    when DaruLite::MultiIndex, DaruLite::CategoricalIndex, DaruLite::Index
      index
    when nil
      DaruLite::Index.new size
    when Range
      DaruLite::Index.new index.to_a
    when Array
      DaruLite::Index.new index
    else
      raise ArgumentError, "Unrecognized index type #{index.class}"
    end
  validate_index index
  index
end
889
+
890
# Checks that the index has exactly one entry per element of the vector.
# @param [#size] index the index to check
# @raise [ArgumentError] if the index size differs from the vector size
def validate_index(index)
  # TODO: Change to SizeError
  return if size == index.size

  # The first literal ends with a space so the two parts join into one
  # readable sentence.
  raise ArgumentError, "Size of index (#{index.size}) does not match " \
                       "size of vector (#{size})"
end
897
+
898
# Changes the category stored at a position, keeping @array and @cat_hash
# in step with each other.
# @param [Integer] pos position to modify
# @param [object] category the category to store; must already exist
# @return [Array] the position list of the new category
# @raise [ArgumentError] if category is not one of the current categories
def modify_category_at(pos, category)
  unless categories.include?(category)
    message = "Invalid category #{category}, " \
              'to add a new category use #add_category'
    raise ArgumentError, message
  end

  previous = category_from_position(pos)
  @array[pos] = int_from_cat(category)
  # Move the position from the old category's list to the new one's.
  @cat_hash[previous].delete(pos)
  @cat_hash[category].push(pos)
end
908
+
909
# Reorders the categories and recomputes the integer codes in @array to
# match the new order.
# @param [Array] new the categories in their new order
# @raise [ArgumentError] if new does not hold the same set of categories
def order_with(new)
  unless new.to_set == categories.to_set
    raise ArgumentError, 'The contents of new and old order must be the same.'
  end

  # Rebuild the hash so its key order follows the requested order.
  @cat_hash = new.to_h { |cat| [cat, @cat_hash[cat]] }

  codes = @cat_hash.keys.each_with_index.to_h
  @array = Array.new(size)
  @cat_hash.map do |cat, positions|
    code = codes[cat]
    positions.each { |pos| @array[pos] = code }
  end
end
920
+
921
# Translates an integer code into its category.
# @param [Integer] int integer code, i.e. the category's rank in @cat_hash
# @return [object] the category at that rank
def cat_from_int(int)
  ordered_categories = @cat_hash.keys
  ordered_categories[int]
end
924
+
925
# Translates a category into its integer code.
# @param [object] cat the category to look up
# @return [Integer, nil] the category's rank among the keys of @cat_hash,
#   or nil when it is not a key
def int_from_cat(cat)
  ordered_categories = @cat_hash.keys
  ordered_categories.find_index(cat)
end
928
+ end
929
+ end