daru_lite 0.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (149) hide show
  1. checksums.yaml +7 -0
  2. data/.github/ISSUE_TEMPLATE.md +18 -0
  3. data/.github/workflows/ci.yml +33 -0
  4. data/.gitignore +10 -0
  5. data/.rspec +2 -0
  6. data/.rubocop.yml +27 -0
  7. data/.rubocop_todo.yml +137 -0
  8. data/CONTRIBUTING.md +47 -0
  9. data/Gemfile +2 -0
  10. data/History.md +4 -0
  11. data/LICENSE +24 -0
  12. data/README.md +218 -0
  13. data/Rakefile +69 -0
  14. data/ReleasePolicy.md +20 -0
  15. data/benchmarks/TradeoffData.csv +65 -0
  16. data/benchmarks/csv_reading.rb +22 -0
  17. data/benchmarks/dataframe_creation.rb +39 -0
  18. data/benchmarks/db_loading.rb +34 -0
  19. data/benchmarks/duplicating.rb +45 -0
  20. data/benchmarks/group_by.rb +32 -0
  21. data/benchmarks/joining.rb +52 -0
  22. data/benchmarks/row_access.rb +41 -0
  23. data/benchmarks/row_assign.rb +36 -0
  24. data/benchmarks/sorting.rb +51 -0
  25. data/benchmarks/statistics.rb +28 -0
  26. data/benchmarks/vector_access.rb +31 -0
  27. data/benchmarks/vector_assign.rb +42 -0
  28. data/benchmarks/where_clause.rb +48 -0
  29. data/benchmarks/where_vs_filter.rb +28 -0
  30. data/daru_lite.gemspec +55 -0
  31. data/images/README.md +5 -0
  32. data/images/con0.png +0 -0
  33. data/images/con1.png +0 -0
  34. data/images/init0.png +0 -0
  35. data/images/init1.png +0 -0
  36. data/images/man0.png +0 -0
  37. data/images/man1.png +0 -0
  38. data/images/man2.png +0 -0
  39. data/images/man3.png +0 -0
  40. data/images/man4.png +0 -0
  41. data/images/man5.png +0 -0
  42. data/images/man6.png +0 -0
  43. data/lib/daru_lite/accessors/array_wrapper.rb +109 -0
  44. data/lib/daru_lite/accessors/dataframe_by_row.rb +25 -0
  45. data/lib/daru_lite/accessors/mdarray_wrapper.rb +7 -0
  46. data/lib/daru_lite/category.rb +929 -0
  47. data/lib/daru_lite/configuration.rb +34 -0
  48. data/lib/daru_lite/core/group_by.rb +403 -0
  49. data/lib/daru_lite/core/merge.rb +270 -0
  50. data/lib/daru_lite/core/query.rb +109 -0
  51. data/lib/daru_lite/dataframe.rb +3080 -0
  52. data/lib/daru_lite/date_time/index.rb +569 -0
  53. data/lib/daru_lite/date_time/offsets.rb +397 -0
  54. data/lib/daru_lite/exceptions.rb +2 -0
  55. data/lib/daru_lite/extensions/which_dsl.rb +53 -0
  56. data/lib/daru_lite/formatters/table.rb +52 -0
  57. data/lib/daru_lite/helpers/array.rb +53 -0
  58. data/lib/daru_lite/index/categorical_index.rb +201 -0
  59. data/lib/daru_lite/index/index.rb +374 -0
  60. data/lib/daru_lite/index/multi_index.rb +374 -0
  61. data/lib/daru_lite/io/csv/converters.rb +21 -0
  62. data/lib/daru_lite/io/io.rb +294 -0
  63. data/lib/daru_lite/io/sql_data_source.rb +97 -0
  64. data/lib/daru_lite/iruby/helpers.rb +38 -0
  65. data/lib/daru_lite/iruby/templates/dataframe.html.erb +5 -0
  66. data/lib/daru_lite/iruby/templates/dataframe_mi.html.erb +5 -0
  67. data/lib/daru_lite/iruby/templates/dataframe_mi_tbody.html.erb +35 -0
  68. data/lib/daru_lite/iruby/templates/dataframe_mi_thead.html.erb +21 -0
  69. data/lib/daru_lite/iruby/templates/dataframe_tbody.html.erb +28 -0
  70. data/lib/daru_lite/iruby/templates/dataframe_thead.html.erb +21 -0
  71. data/lib/daru_lite/iruby/templates/multi_index.html.erb +12 -0
  72. data/lib/daru_lite/iruby/templates/vector.html.erb +5 -0
  73. data/lib/daru_lite/iruby/templates/vector_mi.html.erb +5 -0
  74. data/lib/daru_lite/iruby/templates/vector_mi_tbody.html.erb +26 -0
  75. data/lib/daru_lite/iruby/templates/vector_mi_thead.html.erb +8 -0
  76. data/lib/daru_lite/iruby/templates/vector_tbody.html.erb +17 -0
  77. data/lib/daru_lite/iruby/templates/vector_thead.html.erb +8 -0
  78. data/lib/daru_lite/maths/arithmetic/dataframe.rb +91 -0
  79. data/lib/daru_lite/maths/arithmetic/vector.rb +117 -0
  80. data/lib/daru_lite/maths/statistics/dataframe.rb +202 -0
  81. data/lib/daru_lite/maths/statistics/vector.rb +1019 -0
  82. data/lib/daru_lite/monkeys.rb +56 -0
  83. data/lib/daru_lite/vector.rb +1678 -0
  84. data/lib/daru_lite/version.rb +3 -0
  85. data/lib/daru_lite.rb +99 -0
  86. data/profile/_base.rb +23 -0
  87. data/profile/df_to_a.rb +10 -0
  88. data/profile/filter.rb +13 -0
  89. data/profile/joining.rb +13 -0
  90. data/profile/sorting.rb +12 -0
  91. data/profile/vector_each_with_index.rb +9 -0
  92. data/profile/vector_new.rb +9 -0
  93. data/spec/accessors/array_wrapper_spec.rb +3 -0
  94. data/spec/category_spec.rb +1741 -0
  95. data/spec/core/group_by_spec.rb +655 -0
  96. data/spec/core/merge_spec.rb +179 -0
  97. data/spec/core/query_spec.rb +347 -0
  98. data/spec/daru_lite_spec.rb +22 -0
  99. data/spec/dataframe_spec.rb +4330 -0
  100. data/spec/date_time/data_spec.rb +197 -0
  101. data/spec/date_time/date_time_index_helper_spec.rb +72 -0
  102. data/spec/date_time/index_spec.rb +588 -0
  103. data/spec/date_time/offsets_spec.rb +465 -0
  104. data/spec/extensions/which_dsl_spec.rb +38 -0
  105. data/spec/fixtures/bank2.dat +200 -0
  106. data/spec/fixtures/boolean_converter_test.csv +5 -0
  107. data/spec/fixtures/countries.json +7794 -0
  108. data/spec/fixtures/duplicates.csv +32 -0
  109. data/spec/fixtures/eciresults.html +394 -0
  110. data/spec/fixtures/empties.dat +2 -0
  111. data/spec/fixtures/empty_rows_test.csv +17 -0
  112. data/spec/fixtures/macau.html +3691 -0
  113. data/spec/fixtures/macd_data.csv +150 -0
  114. data/spec/fixtures/matrix_test.csv +100 -0
  115. data/spec/fixtures/moneycontrol.html +6812 -0
  116. data/spec/fixtures/music_data.tsv +2501 -0
  117. data/spec/fixtures/repeated_fields.csv +7 -0
  118. data/spec/fixtures/sales-funnel.csv +18 -0
  119. data/spec/fixtures/scientific_notation.csv +4 -0
  120. data/spec/fixtures/string_converter_test.csv +5 -0
  121. data/spec/fixtures/strings.dat +2 -0
  122. data/spec/fixtures/test_xls.xls +0 -0
  123. data/spec/fixtures/test_xls_2.xls +0 -0
  124. data/spec/fixtures/url_test.txt~ +0 -0
  125. data/spec/fixtures/valid_markup.html +62 -0
  126. data/spec/fixtures/wiki_climate.html +1243 -0
  127. data/spec/fixtures/wiki_table_info.html +631 -0
  128. data/spec/formatters/table_formatter_spec.rb +137 -0
  129. data/spec/helpers_spec.rb +8 -0
  130. data/spec/index/categorical_index_spec.rb +170 -0
  131. data/spec/index/index_spec.rb +417 -0
  132. data/spec/index/multi_index_spec.rb +680 -0
  133. data/spec/io/io_spec.rb +373 -0
  134. data/spec/io/sql_data_source_spec.rb +56 -0
  135. data/spec/iruby/dataframe_spec.rb +170 -0
  136. data/spec/iruby/helpers_spec.rb +49 -0
  137. data/spec/iruby/multi_index_spec.rb +37 -0
  138. data/spec/iruby/vector_spec.rb +105 -0
  139. data/spec/maths/arithmetic/dataframe_spec.rb +148 -0
  140. data/spec/maths/arithmetic/vector_spec.rb +165 -0
  141. data/spec/maths/statistics/dataframe_spec.rb +178 -0
  142. data/spec/maths/statistics/vector_spec.rb +756 -0
  143. data/spec/monkeys_spec.rb +42 -0
  144. data/spec/shared/vector_display_spec.rb +213 -0
  145. data/spec/spec_helper.rb +87 -0
  146. data/spec/support/database_helper.rb +30 -0
  147. data/spec/support/matchers.rb +5 -0
  148. data/spec/vector_spec.rb +2293 -0
  149. metadata +571 -0
@@ -0,0 +1,929 @@
1
+ module DaruLite
2
+ module Category # rubocop:disable Metrics/ModuleLength
3
+ UNDEFINED = Object.new.freeze
4
+
5
+ attr_accessor :base_category
6
+ attr_reader :index, :coding_scheme, :name
7
+
8
+ # Initializes a vector to store categorical data.
9
+ # @note Base category is set to the first category encountered in the vector.
10
+ # @param [Array] data the categorical data
11
+ # @param [Hash] opts the options
12
+ # @option opts [Boolean] :ordered true if data is ordered, false otherwise
13
+ # @option opts [Array] :categories categories to associate with the vector.
14
+ # It adds extra categories if specified and also defines the order of the categories.
15
+ # @option opts [object] :index gives index to vector. By default it's from 0 to size-1
16
+ # @return the categorical data created
17
+ # @example
18
+ # dv = DaruLite::Vector.new [:a, 1, :a, 1, :c],
19
+ # type: :category,
20
+ # ordered: true,
21
+ # categories: [:a, :b, :c, 1]
22
+ # # => #<DaruLite::Vector(5)>
23
+ # # 0 a
24
+ # # 1 1
25
+ # # 2 a
26
+ # # 3 1
27
+ # # 4 c
28
+ def initialize_category(data, opts = {})
29
+ @type = :category
30
+ initialize_core_attributes data
31
+
32
+ if opts[:categories]
33
+ validate_categories(opts[:categories])
34
+ add_extra_categories(opts[:categories] - categories)
35
+ order_with opts[:categories]
36
+ end
37
+
38
+ # Specify if the categories are ordered or not.
39
+ # By default it's unordered
40
+ @ordered = opts[:ordered] || false
41
+
42
+ # The coding scheme to code with. Default is dummy coding.
43
+ @coding_scheme = :dummy
44
+
45
+ # Base category which won't be present in the coding
46
+ @base_category = @cat_hash.keys.first
47
+
48
+ # Stores the name of the vector
49
+ @name = opts[:name]
50
+
51
+ # Index of the vector
52
+ @index = coerce_index opts[:index]
53
+
54
+ self
55
+ end
56
+
57
+ def name=(new_name)
58
+ @name = new_name
59
+ self
60
+ end
61
+
62
+ alias rename name=
63
+
64
+ # Returns an enumerator that enumerates on categorical data
65
+ # @return [Enumerator] an enumerator that enumerates over data stored in vector
66
+ def each
67
+ return enum_for(:each) unless block_given?
68
+
69
+ @array.each { |pos| yield cat_from_int pos }
70
+ self
71
+ end
72
+
73
+ # Returns all categorical data
74
+ # @return [Array] array of all categorical data which vector is storing
75
+ # @example
76
+ # dv = DaruLite::Vector.new [:a, 1, :a, 1, :c], type: :category
77
+ # dv.to_a
78
+ # # => [:a, 1, :a, 1, :c]
79
+ def to_a
80
+ each.to_a
81
+ end
82
+
83
+ # Duplicates a vector
84
+ # @return [DaruLite::Vector] duplicated vector
85
+ # @example
86
+ # dv = DaruLite::Vector.new [:a, 1, :a, 1, :c], type: :category
87
+ # dv.dup
88
+ # # => #<DaruLite::Vector(5)>
89
+ # # 0 a
90
+ # # 1 1
91
+ # # 2 a
92
+ # # 3 1
93
+ # # 4 c
94
+ def dup
95
+ DaruLite::Vector.new to_a.dup,
96
+ name: @name,
97
+ index: @index.dup,
98
+ type: :category,
99
+ categories: categories,
100
+ ordered: ordered?
101
+ end
102
+
103
+ # Associates a category to the vector.
104
+ # @param [Array] new_categories new categories to be associated
105
+ # @example
106
+ # dv = DaruLite::Vector.new [:a, 1, :a, 1, :c], type: :category
107
+ # dv.add_category :b
108
+ # dv.categories
109
+ # # => [:a, :b, :c, 1]
110
+ def add_category(*new_categories)
111
+ new_categories -= categories
112
+ add_extra_categories new_categories
113
+ end
114
+
115
+ # Returns frequency of given category
116
+ # @param [object] category given category whose count has to be found
117
+ # @return count/frequency of given category
118
+ # @example
119
+ # dv = DaruLite::Vector.new [:a, 1, :a, 1, :c], type: :category
120
+ # dv.count :a
121
+ # # => 2
122
+ # dv.count
123
+ # # => 5
124
+ def count(category = UNDEFINED)
125
+ return @cat_hash.values.sum(&:size) if category == UNDEFINED # count all
126
+ raise ArgumentError, "Invalid category #{category}" unless
127
+ categories.include?(category)
128
+
129
+ @cat_hash[category].size
130
+ end
131
+
132
+ # Returns a vector storing count/frequency of each category
133
+ # @return [DaruLite::Vector] Return a vector whose indexes are categories
134
+ # and corresponding values are its count
135
+ # @example
136
+ # dv = DaruLite::Vector.new [:a, 1, :a, 1, :c], type: :category
137
+ # dv.frequencies
138
+ # # => #<DaruLite::Vector(4)>
139
+ # # a 2
140
+ # # b 0
141
+ # # c 1
142
+ # # 1 2
143
+ def frequencies(type = :count)
144
+ counts = @cat_hash.values.map(&:size)
145
+ values =
146
+ case type
147
+ when :count
148
+ counts
149
+ when :fraction
150
+ counts.map { |c| c / size.to_f }
151
+ when :percentage
152
+ counts.map { |c| c / size.to_f * 100 }
153
+ else
154
+ raise ArgumentError, 'Type should be either :count, :fraction or ' \
155
+ ":percentage. #{type} not supported."
156
+ end
157
+ DaruLite::Vector.new values, index: categories, name: name
158
+ end
159
+
160
+ # Returns vector for indexes/positions specified
161
+ # @param [Array] indexes for which values have to be retrieved
162
+ # @note It accepts both indexes and positions. In case of collision,
163
+ # argument will be treated as index
164
+ # @return vector containing values specified at specified indexes/positions
165
+ # @example
166
+ # dv = DaruLite::Vector.new [:a, 1, :a, 1, :c],
167
+ # type: :category,
168
+ # index: 'a'..'e'
169
+ # dv[:a, 1]
170
+ # # => #<DaruLite::Vector(2)>
171
+ # # a a
172
+ # # b 1
173
+ # dv[0, 1]
174
+ # # => #<DaruLite::Vector(2)>
175
+ # # a a
176
+ # # b 1
177
+ def [](*indexes)
178
+ positions = @index.pos(*indexes)
179
+ return category_from_position(positions) if positions.is_a? Integer
180
+
181
+ DaruLite::Vector.new positions.map { |pos| category_from_position pos },
182
+ index: @index.subset(*indexes),
183
+ name: @name,
184
+ type: :category,
185
+ ordered: @ordered,
186
+ categories: categories
187
+ end
188
+
189
+ # Returns vector for positions specified.
190
+ # @param [Array] positions at which values are to be retrieved.
191
+ # @return vector containing values specified at specified positions
192
+ # @example
193
+ # dv = DaruLite::Vector.new [:a, 1, :a, 1, :c], type: :category
194
+ # dv.at 0..-2
195
+ # # => #<DaruLite::Vector(4)>
196
+ # # 0 a
197
+ # # 1 1
198
+ # # 2 a
199
+ # # 3 1
200
+ def at(*positions)
201
+ original_positions = positions
202
+ positions = coerce_positions(*positions)
203
+ validate_positions(*positions)
204
+
205
+ return category_from_position(positions) if positions.is_a? Integer
206
+
207
+ DaruLite::Vector.new positions.map { |pos| category_from_position(pos) },
208
+ index: @index.at(*original_positions),
209
+ name: @name,
210
+ type: :category,
211
+ ordered: @ordered,
212
+ categories: categories
213
+ end
214
+
215
+ # Modifies values at specified indexes/positions.
216
+ # @note In order to add a new category you need to associate it via #add_category
217
+ # @param [Array] indexes at which to modify value
218
+ # @param [object] val value to assign at specific indexes/positions
219
+ # @return modified vector
220
+ # @example
221
+ # dv = DaruLite::Vector.new [:a, 1, :a, 1, :c], type: :category
222
+ # dv.add_category :b
223
+ # dv[0] = :b
224
+ # dv
225
+ # # => #<DaruLite::Vector(5)>
226
+ # # 0 b
227
+ # # 1 1
228
+ # # 2 a
229
+ # # 3 1
230
+ # # 4 c
231
+ def []=(*indexes, val)
232
+ positions = @index.pos(*indexes)
233
+
234
+ if positions.is_a? Numeric
235
+ modify_category_at positions, val
236
+ else
237
+ positions.each { |pos| modify_category_at pos, val }
238
+ end
239
+ self
240
+ end
241
+
242
+ # Modifies values at specified positions.
243
+ # @param [Array] positions positions at which to modify value
244
+ # @param [object] val value to assign at specific positions
245
+ # @return modified vector
246
+ # @example
247
+ # dv = DaruLite::Vector.new [:a, 1, :a, 1, :c], type: :category
248
+ # dv.add_category :b
249
+ # dv.set_at [0, 1], :b
250
+ # # => #<DaruLite::Vector(5)>
251
+ # # 0 b
252
+ # # 1 b
253
+ # # 2 a
254
+ # # 3 1
255
+ # # 4 c
256
+ def set_at(positions, val)
257
+ validate_positions(*positions)
258
+ positions.map { |pos| modify_category_at pos, val }
259
+ self
260
+ end
261
+
262
+ # Size of categorical data.
263
+ # @return total number of values in the vector
264
+ # @example
265
+ # dv = DaruLite::Vector.new [:a, 1, :a, 1, :c], type: :category
266
+ # dv.size
267
+ # # => 5
268
+ def size
269
+ @array.size
270
+ end
271
+
272
+ # Tells whether vector is ordered or not.
273
+ # @return [Boolean] true if vector is ordered, false otherwise
274
+ # @example
275
+ # dv = DaruLite::Vector.new [:a, 1, :a, 1, :c], type: :category
276
+ # dv.ordered?
277
+ # # => false
278
+ def ordered?
279
+ @ordered
280
+ end
281
+
282
+ # Make categorical data ordered or unordered.
283
+ # @param [Boolean] bool true if categorical data is to be ordered, false otherwise
284
+ # @example
285
+ # dv = DaruLite::Vector.new [:a, 1, :a, 1, :c], type: :category
286
+ # dv.ordered = true
287
+ # dv.ordered?
288
+ # # => true
289
+ def ordered=(bool)
290
+ @ordered = bool
291
+ end
292
+
293
+ # Returns all the categories with the inherent order
294
+ # @return [Array] categories of the vector with the order
295
+ # @example
296
+ # dv = DaruLite::Vector.new [:a, 1, :a, 1, :c],
297
+ # type: :category,
298
+ # categories: [:a, :b, :c, 1]
299
+ # dv.categories
300
+ # # => [:a, :b, :c, 1]
301
+ def categories
302
+ @cat_hash.keys
303
+ end
304
+
305
+ alias order categories
306
+
307
+ # Sets order of the categories.
308
+ # @note If extra categories are specified, they get added too.
309
+ # @param [Array] cat_with_order categories specifying their order
310
+ # @example
311
+ # dv = DaruLite::Vector.new [:a, 1, :a, 1, :c], type: :category
312
+ # dv.categories = [:a, :b, :c, 1]
313
+ # dv.categories
314
+ # # => [:a, :b, :c, 1]
315
+ def categories=(cat_with_order)
316
+ validate_categories(cat_with_order)
317
+ add_extra_categories(cat_with_order - categories)
318
+ order_with cat_with_order
319
+ end
320
+
321
+ # Rename categories.
322
+ # @note The order of categories after renaming is preserved but new categories
323
+ # are added at the end in the order. Also the base-category is reassigned
324
+ # to new value if it is renamed
325
+ # @param [Hash] old_to_new a hash mapping categories whose name to be changed
326
+ # to their new names
327
+ # @example
328
+ # dv = DaruLite::Vector.new [:a, 1, :a, 1, :c], type: :category
329
+ # dv.rename_categories :a => :b
330
+ # dv
331
+ # # => #<DaruLite::Vector(5)>
332
+ # # 0 b
333
+ # # 1 1
334
+ # # 2 b
335
+ # # 3 1
336
+ # # 4 c
337
+ def rename_categories(old_to_new)
338
+ old_categories = categories
339
+ data = to_a.map do |cat|
340
+ old_to_new.include?(cat) ? old_to_new[cat] : cat
341
+ end
342
+
343
+ initialize_core_attributes data
344
+ self.categories = (old_categories - old_to_new.keys) | old_to_new.values
345
+ self.base_category = old_to_new[base_category] if
346
+ old_to_new.include? base_category
347
+ self
348
+ end
349
+
350
+ # Removes the unused categories
351
+ # @note If base category is removed, then the first occurring category in the
352
+ # data is taken as base category. Order of the undeleted categories
353
+ # remains preserved.
354
+ # @return [DaruLite::Vector] Makes changes in the vector itself i.e. deletes
355
+ # the unused categories and returns itself
356
+ # @example
357
+ # dv = DaruLite::Vector.new [:one, :two, :one], type: :category,
358
+ # categories: [:three, :two, :one]
359
+ # dv.remove_unused_categories
360
+ # dv.categories
361
+ # # => [:two, :one]
362
+ def remove_unused_categories
363
+ old_categories = categories
364
+
365
+ initialize_core_attributes to_a
366
+ self.categories = old_categories & categories
367
+ self.base_category = @cat_hash.keys.first unless
368
+ categories.include? base_category
369
+ self
370
+ end
371
+
372
+ # Returns the minimum category according to the order specified.
373
+ # @note This operation will only work if vector is ordered.
374
+ # To set the vector ordered do `vector.ordered = true`
375
+ # @return [object] the minimum category according to the order
376
+ # @example
377
+ # dv = DaruLite::Vector.new ['second', 'second', 'third', 'first'],
378
+ # categories: ['first', 'second', 'third']
379
+ # dv.min
380
+ # # => 'first'
381
+ def min
382
+ assert_ordered :min
383
+ categories.first
384
+ end
385
+
386
+ # Returns the maximum category according to the order specified.
387
+ # @note This operation will only work if vector is ordered.
388
+ # To set the vector ordered do `vector.ordered = true`
389
+ # @return [object] the maximum category according to the order
390
+ # @example
391
+ # dv = DaruLite::Vector.new ['second', 'second', 'third', 'first'],
392
+ # categories: ['first', 'second', 'third']
393
+ # dv.max
394
+ # # => 'third'
395
+ def max
396
+ assert_ordered :max
397
+ categories.last
398
+ end
399
+
400
+ # Sorts the vector in the order specified.
401
+ # @note This operation will only work if vector is ordered.
402
+ # To set the vector ordered, do `vector.ordered = true`
403
+ # @return [DaruLite::Vector] sorted vector
404
+ # @example
405
+ # dv = DaruLite::Vector.new ['second', 'second', 'third', 'first'],
406
+ # categories: ['first', 'second', 'third'],
407
+ # type: :category,
408
+ # ordered: true
409
+ # dv.sort!
410
+ # # => #<DaruLite::Vector(4)>
411
+ # # 3 first
412
+ # # 0 second
413
+ # # 1 second
414
+ # # 2 third
415
+ def sort!
416
+ # TODO: Simply the code
417
+ assert_ordered :sort
418
+
419
+ # Build sorted index
420
+ old_index = @index.to_a
421
+ new_index = @cat_hash.values.map do |positions|
422
+ old_index.values_at(*positions)
423
+ end.flatten
424
+ @index = @index.class.new new_index
425
+
426
+ # Build sorted data
427
+ @cat_hash = categories.inject([{}, 0]) do |acc, cat|
428
+ hash, count = acc
429
+ cat_count = @cat_hash[cat].size
430
+ cat_count.times { |i| @array[count + i] = int_from_cat(cat) }
431
+ hash[cat] = (count...(cat_count + count)).to_a
432
+ [hash, count + cat_count]
433
+ end.first
434
+
435
+ self
436
+ end
437
+
438
+ def sort
439
+ dup.sort!
440
+ end
441
+
442
+ # Set coding scheme
443
+ # @param [Symbol] scheme to set
444
+ # @example
445
+ # dv = DaruLite::Vector.new [:a, 1, :a, 1, :c], type: :category
446
+ # dv.coding_scheme = :deviation
447
+ # dv.coding_scheme
448
+ # # => :deviation
449
+ def coding_scheme=(scheme)
450
+ raise ArgumentError, "Unknown or unsupported coding scheme #{scheme}." unless
451
+ CODING_SCHEMES.include? scheme
452
+
453
+ @coding_scheme = scheme
454
+ end
455
+
456
+ CODING_SCHEMES = %i[dummy deviation helmert simple].freeze
457
+
458
+ # Contrast code the vector according to the coding scheme set.
459
+ # @note To set the coding scheme use #coding_scheme=
460
+ # @param [Hash] opts The options to pass for coding.
461
+ # @option opts [TrueClass, FalseClass] :full (false) True if you want k variables
462
+ # for k categories, false if you want k-1 variables for k categories.
463
+ # @return [DaruLite::DataFrame] dataframe containing all coded variables
464
+ # @example
465
+ # dv = DaruLite::Vector.new [:a, 1, :a, 1, :c], type: :category
466
+ # dv.contrast_code full: false
467
+ # # => #<DaruLite::DataFrame(5x2)>
468
+ # # daru_1 daru_c
469
+ # # 0 0 0
470
+ # # 1 1 0
471
+ # # 2 0 0
472
+ # # 3 1 0
473
+ # # 4 0 1
474
+ def contrast_code(opts = {})
475
+ if opts[:user_defined]
476
+ user_defined_coding(opts[:user_defined])
477
+ else
478
+ # TODO: Make various coding schemes code DRY
479
+ send(:"#{coding_scheme}_coding", opts[:full] || false)
480
+ end
481
+ end
482
+
483
+ # Two categorical vectors are equal if their index and corresponding values are same
484
+ # @return [true, false] true if two vectors are equal
485
+ # @example
486
+ # dv = DaruLite::Vector.new [:a, 1, :a, 1, :c], type: :category
487
+ # other = DaruLite::Vector.new [:a, 1, :a, 1, :c],
488
+ # type: :category,
489
+ # index: 1..5
490
+ # dv == other
491
+ # # => false
492
+ def ==(other)
493
+ size == other.size &&
494
+ to_a == other.to_a &&
495
+ index == other.index
496
+ end
497
+
498
+ # Returns integer coding for categorical data in the order starting from 0.
499
+ # For example if order is [:a, :b, :c], then :a, will be coded as 0, :b as 1 and :c as 2
500
+ # @return [Array] integer coding of all values of vector
501
+ # @example
502
+ # dv = DaruLite::Vector.new [:a, 1, :a, 1, :c],
503
+ # type: :category,
504
+ # categories: [:a, :b, :c, 1]
505
+ # dv.to_ints
506
+ # # => [0, 1, 0, 1, 2]
507
+ def to_ints
508
+ @array
509
+ end
510
+
511
+ # Reorder the vector with given positions
512
+ # @note Unlike #reindex! which takes index as input, it takes
513
+ # positions as an input to reorder the vector
514
+ # @param [Array] order the order to reorder the vector with
515
+ # @return reordered vector
516
+ # @example
517
+ # dv = DaruLite::Vector.new [3, 2, 1], index: ['c', 'b', 'a'], type: :category
518
+ # dv.reorder! [2, 1, 0]
519
+ # # => #<DaruLite::Vector(3)>
520
+ # # a 1
521
+ # # b 2
522
+ # # c 3
523
+ def reorder!(order)
524
+ raise ArgumentError, 'Invalid order specified' unless
525
+ order.sort == size.times.to_a
526
+
527
+ # TODO: Room for optimization
528
+ old_data = to_a
529
+ new_data = order.map { |i| old_data[i] }
530
+ initialize_core_attributes new_data
531
+ self
532
+ end
533
+
534
+ # Sets new index for vector. Preserves index->value correspondence.
535
+ # @note Unlike #reorder! which takes positions as input it takes
536
+ # index as an input to reorder the vector
537
+ # @param [DaruLite::Index, DaruLite::MultiIndex, Array] idx new index to order with
538
+ # @return [DaruLite::Vector] vector reindexed with new index
539
+ # @example
540
+ # dv = DaruLite::Vector.new [3, 2, 1], index: ['c', 'b', 'a'], type: :category
541
+ # dv.reindex! ['a', 'b', 'c']
542
+ # # => #<DaruLite::Vector(3)>
543
+ # # a 1
544
+ # # b 2
545
+ # # c 3
546
+ def reindex!(idx)
547
+ idx = DaruLite::Index.new idx unless idx.is_a? DaruLite::Index
548
+ raise ArgumentError, 'Invalid index specified' unless
549
+ idx.to_a.sort == index.to_a.sort
550
+
551
+ old_categories = categories
552
+ data = idx.map { |i| self[i] }
553
+ initialize_core_attributes data
554
+ self.categories = old_categories
555
+ self.index = idx
556
+ self
557
+ end
558
+
559
+ {
560
+ eq: :==,
561
+ not_eq: :!=,
562
+ lt: :<,
563
+ lteq: :<=,
564
+ mt: :>,
565
+ mteq: :>=
566
+ }.each do |method, operator|
567
+ define_method(method) do |other|
568
+ mod = DaruLite::Core::Query
569
+ if other.is_a?(DaruLite::Vector)
570
+ mod.apply_vector_operator operator, to_ints, other.to_ints
571
+ else
572
+ mod.apply_scalar_operator operator, @array, int_from_cat(other)
573
+ end
574
+ end
575
+ end
576
+ alias gt mt
577
+ alias gteq mteq
578
+
579
+ # For querying the data
580
+ # @param bool_array [object] arel like query syntax
581
+ # @return [DaruLite::Vector] Vector which makes the conditions true
582
+ # @example
583
+ # dv = DaruLite::Vector.new ['I', 'II', 'I', 'III', 'I', 'II'],
584
+ # type: :category,
585
+ # ordered: true,
586
+ # categories: ['I', 'II', 'III']
587
+ # dv.where(dv.mt('I') & dv.lt('III'))
588
+ # # => #<DaruLite::Vector(2)>
589
+ # # 1 II
590
+ # # 5 II
591
+ def where(bool_array)
592
+ DaruLite::Core::Query.vector_where self, bool_array
593
+ end
594
+
595
+ # Gives the summary of data using following parameters
596
+ # - size: size of the data
597
+ # - categories: total number of categories
598
+ # - max_freq: Max no of times a category occurs
599
+ # - max_category: The category which occurs max no of times
600
+ # - min_freq: Min no of times a category occurs
601
+ # - min_category: The category which occurs min no of times
602
+ # @return [DaruLite::Vector] Vector with index as following parameters
603
+ # and values as values to these parameters
604
+ # @example
605
+ # dv = DaruLite::Vector.new [:a, 1, :a, 1, :c], type: :category
606
+ # dv.describe
607
+ # # => #<DaruLite::Vector(6)>
608
+ # # size 5
609
+ # # categories 3
610
+ # # max_freq 2
611
+ # # max_category a
612
+ # # min_freq 1
613
+ # # min_category c
614
+ def describe
615
+ DaruLite::Vector.new(
616
+ size: size,
617
+ categories: categories.size,
618
+ max_freq: @cat_hash.values.map(&:size).max,
619
+ max_category: @cat_hash.keys.max_by { |cat| @cat_hash[cat].size },
620
+ min_freq: @cat_hash.values.map(&:size).min,
621
+ min_category: @cat_hash.keys.min_by { |cat| @cat_hash[cat].size }
622
+ )
623
+ end
624
+
625
+ # Does nothing since it's already of type category.
626
+ # @return [DaruLite::Vector] categorical vector
627
+ def to_category
628
+ self
629
+ end
630
+
631
+ # Converts a category type vector to non category type vector
632
+ # @return [DaruLite::Vector] non category type vector
633
+ def to_non_category
634
+ DaruLite::Vector.new to_a, name: name, index: index
635
+ end
636
+
637
+ # Sets index of the vector
638
+ # @param [DaruLite::Index, DaruLite::MultiIndex, DaruLite::CategoricalIndex, Array, Range]
639
+ # idx new index to assign to vector
640
+ # @return [DaruLite::Index, DaruLite::CategoricalIndex, DaruLite::MultiIndex] the index assigned
641
+ # @example
642
+ # dv = DaruLite::Vector.new [1, 2, 3], type: :category
643
+ # dv.index = 'a'..'c'
644
+ # dv
645
+ # # => #<DaruLite::Vector(3)>
646
+ # # a 1
647
+ # # b 2
648
+ # # c 3
649
+ def index=(idx)
650
+ @index = coerce_index idx
651
+ end
652
+
653
+ # Check if any one of mentioned values occur in the vector
654
+ # @param [Array] values to check for
655
+ # @return [true, false] returns true if any one of specified values
656
+ # occur in the vector
657
+ # @example
658
+ # dv = DaruLite::Vector.new [1, 2, 3, 4, nil]
659
+ # dv.include_values? nil, Float::NAN
660
+ # # => true
661
+ def include_values?(*values)
662
+ values.any? { |v| @cat_hash.include?(v) && !@cat_hash[v].empty? }
663
+ end
664
+
665
+ # Return a vector with specified values removed
666
+ # @param [Array] values to reject from resultant vector
667
+ # @return [DaruLite::Vector] vector with specified values removed
668
+ # @example
669
+ # dv = DaruLite::Vector.new [1, 2, nil, Float::NAN], type: :category
670
+ # dv.reject_values nil, Float::NAN
671
+ # # => #<DaruLite::Vector(2)>
672
+ # # 0 1
673
+ # # 1 2
674
+ def reject_values(*values)
675
+ resultant_pos = size.times.to_a - values.flat_map { |v| @cat_hash[v] }
676
+ dv = at(*resultant_pos)
677
+ unless dv.is_a? DaruLite::Vector
678
+ pos = resultant_pos.first
679
+ dv = at(pos..pos)
680
+ end
681
+ dv.remove_unused_categories
682
+ end
683
+
684
+ # Count the number of values specified
685
+ # @param [Array] values to count for
686
+ # @return [Integer] the number of times the values mentioned occurs
687
+ # @example
688
+ # dv = DaruLite::Vector.new [1, 2, 1, 2, 3, 4, nil, nil]
689
+ # dv.count_values nil
690
+ # # => 2
691
+ def count_values(*values)
692
+ values.filter_map { |v| @cat_hash[v].size if @cat_hash.include? v }
693
+ .sum
694
+ end
695
+
696
+ # Return indexes of values specified
697
+ # @param [Array] values to find indexes for
698
+ # @return [Array] array of indexes of values specified
699
+ # @example
700
+ # dv = DaruLite::Vector.new [1, 2, nil, Float::NAN], index: 11..14
701
+ # dv.indexes nil, Float::NAN
702
+ # # => [13, 14]
703
+ def indexes(*values)
704
+ values &= categories
705
+ index.to_a.values_at(*values.flat_map { |v| @cat_hash[v] }.sort)
706
+ end
707
+
708
+ # Replaces specified values with a new value
709
+ # @param [Array] old_values array of values to replace
710
+ # @param [object] new_value new value to replace with
711
+ # @note It performs the replace in place.
712
+ # @return [DaruLite::Vector] Same vector itself with values
713
+ # replaced with new value
714
+ # @example
715
+ # dv = DaruLite::Vector.new [1, 2, :a, :b]
716
+ # dv.replace_values [:a, :b], nil
717
+ # dv
718
+ # # =>
719
+ # # #<DaruLite::Vector:19903200 @name = nil @metadata = {} @size = 4 >
720
+ # # nil
721
+ # # 0 1
722
+ # # 1 2
723
+ # # 2 nil
724
+ # # 3 nil
725
+ def replace_values(old_values, new_value)
726
+ old_values = [old_values] unless old_values.is_a? Array
727
+ rename_hash = old_values.to_h { |v| [v, new_value] }
728
+ rename_categories rename_hash
729
+ end
730
+
731
+ def positions(*values)
732
+ values &= categories
733
+ values.flat_map { |v| @cat_hash[v] }.sort
734
+ end
735
+
736
+ private
737
+
738
+ def validate_categories(input_categories)
739
+ raise ArgumentError, 'Input categories and speculated categories mismatch' unless
740
+ (categories - input_categories).empty?
741
+ end
742
+
743
+ def add_extra_categories(extra_categories)
744
+ extra_categories.each { |cat| @cat_hash[cat] = [] }
745
+ end
746
+
747
# Build the two core structures from the raw data:
# @cat_hash maps each category to the positions where it occurs (keys in
# order of first appearance), and @array holds one integer code per entry.
# @param data [Enumerable] raw values of the vector
# @return [Array<Integer>] the newly built @array
def initialize_core_attributes(data)
  # Create a hash to map each category to positional indexes
  @cat_hash = data.each_with_index.group_by(&:first)
                  .transform_values { |pairs| pairs.map(&:last) }

  # Map each category to a unique integer (its rank among the keys)
  # for compact storage in @array
  codes = @cat_hash.each_key.with_index.to_h

  # To link every instance to its category, store that category's integer.
  # Looked up element by element instead of values_at(*data), so the whole
  # vector is never splatted into a single argument list.
  @array = data.map { |value| codes[value] }
end
759
+
760
def category_from_position(position)
  # @array stores an integer code per entry; translate it back to a category.
  code = @array[position]
  cat_from_int(code)
end
763
+
764
# Guard for operations that are only valid on ordered categorical data.
# @param operation [#to_s] name of the operation, interpolated into the
#   error message
# @return [nil] when the vector is ordered
# @raise [ArgumentError] when the vector is not ordered
def assert_ordered(operation)
  # TODO: Change ArgumentError to something more expressive
  return if ordered?

  # The last fragment must not end in a line continuation, otherwise the
  # expression runs on into the method's closing `end`.
  raise ArgumentError, "Can not apply #{operation} when vector is unordered. " \
                       'To make the categorical data ordered, use #ordered = true'
end
771
+
772
# Build a data frame with one dummy-coded column per category.
# @param full [Boolean] when falsy, the base category gets no column
def dummy_coding(full)
  coded = @cat_hash.keys
  coded.delete(base_category) unless full

  columns = coded.map { |category| dummy_code(@cat_hash[category]) }

  DaruLite::DataFrame.new(columns, index: @index, order: create_names(coded))
end
784
+
785
def dummy_code(positions)
  # Indicator column: 1 at each listed position, 0 everywhere else.
  Array.new(size, 0).tap do |column|
    positions.each { |pos| column[pos] = 1 }
  end
end
790
+
791
# Build a data frame with one simple-coded column per category.
# @param full [Boolean] when falsy, the base category gets no column
def simple_coding(full)
  coded = @cat_hash.keys
  coded.delete(base_category) unless full

  columns = coded.map { |category| simple_code(@cat_hash[category]) }

  DaruLite::DataFrame.new(columns, index: @index, order: create_names(coded))
end
803
+
804
def simple_code(positions)
  # With k categories: (k - 1) / k at the listed positions, -1 / k elsewhere.
  total = @cat_hash.size.to_f
  positions.each_with_object(Array.new(size, -1 / total)) do |pos, column|
    column[pos] = (total - 1) / total
  end
end
810
+
811
# Build a data frame of Helmert-coded columns, one per category except the
# last. Any arguments are accepted and ignored.
def helmert_coding(*)
  coded = @cat_hash.keys[0..-2]
  columns = Array.new(coded.size) { |level| helmert_code(level) }

  DaruLite::DataFrame.new(columns, index: @index, order: create_names(coded))
end
822
+
823
def helmert_code(index)
  # n counts the categories from `index` onwards.
  remaining = (categories.size - index).to_f
  at_level = (remaining - 1) / remaining
  past_level = -1 / remaining

  # Entries of this category get (n - 1) / n, later categories get -1 / n,
  # earlier categories get 0.
  @array.map do |code|
    next at_level if code == index

    code > index ? past_level : 0
  end
end
836
+
837
# Build a data frame of deviation-coded columns, one per category except
# the last. Any arguments are accepted and ignored.
def deviation_coding(*)
  coded = @cat_hash.keys[0..-2]
  columns = Array.new(coded.size) { |level| deviation_code(level) }

  DaruLite::DataFrame.new(columns, index: @index, order: create_names(coded))
end
848
+
849
def deviation_code(index)
  # Code of the last category, which serves as the -1 reference level.
  reference = categories.size - 1

  # 1 for entries of category `index`, -1 for the last category, else 0.
  @array.map do |code|
    if index == code
      1
    elsif reference == code
      -1
    else
      0
    end
  end
end
859
+
860
# Build a coded data frame from a user-supplied coding table.
# For each position, the output row is the row of df selected by at(pos)
# (presumably the category stored at that position - confirm against #at).
# @param df coding table responding to #row and #vectors
def user_defined_coding(df)
  coded_rows = (0...size).map { |pos| df.row[at(pos)].to_a }

  DaruLite::DataFrame.rows(coded_rows, index: @index, order: df.vectors.to_a)
end
865
+
866
def create_names(categories)
  # Column names are "<vector name>_<category>"; they are Symbols when the
  # vector's own name is a Symbol, Strings otherwise.
  categories.map do |cat|
    label = "#{name}_#{cat}"
    name.is_a?(Symbol) ? label.to_sym : label
  end
end
871
+
872
# Turn the supplied index specification into an index object and check
# that it fits this vector.
# @param index [DaruLite::Index, DaruLite::MultiIndex,
#   DaruLite::CategoricalIndex, Range, Array, nil] index specification;
#   nil builds a DaruLite::Index from the vector's size
# @return the existing index object, or a new DaruLite::Index
# @raise [ArgumentError] for an unsupported index type, or (via
#   #validate_index) an index that fails validation
def coerce_index(index)
  coerced =
    case index
    when DaruLite::MultiIndex, DaruLite::CategoricalIndex, DaruLite::Index
      index
    when nil
      DaruLite::Index.new size
    when Range
      DaruLite::Index.new index.to_a
    when Array
      DaruLite::Index.new index
    else
      raise ArgumentError, "Unrecognized index type #{index.class}"
    end
  validate_index coerced
  coerced
end
889
+
890
# Check that an index has exactly as many entries as the vector.
# @param index [#size] index to check
# @return [nil] when the sizes agree
# @raise [ArgumentError] when the sizes differ
def validate_index(index)
  # Change to SizeError
  return if size == index.size

  # The first fragment ends with a space so the two parts do not run together.
  raise ArgumentError, "Size of index (#{index.size}) does not match " \
                       "size of vector (#{size})"
end
897
+
898
# Reassign the entry at one position to an already existing category,
# updating both the code array and the category => positions hash.
def modify_category_at(pos, category)
  known = categories.include?(category)
  raise ArgumentError, "Invalid category #{category}, to add a new category use #add_category" unless known

  # Read the previous category before @array[pos] is overwritten.
  previous = category_from_position(pos)
  @array[pos] = int_from_cat(category)
  @cat_hash[previous].delete(pos)
  @cat_hash[category].push(pos)
end
908
+
909
# Reorder the categories to follow the given order and renumber the codes.
def order_with(new)
  same_members = new.to_set == categories.to_set
  raise ArgumentError, 'The contents of new and old order must be the same.' unless same_members

  # Rebuild the category => positions hash so its key order follows `new`.
  @cat_hash = new.to_h { |cat| [cat, @cat_hash[cat]] }

  # A category's integer code is its rank among the hash keys, so every
  # stored code is rewritten from the reordered hash.
  @array = Array.new(size)
  @cat_hash.each_with_index.map do |(_cat, positions), code|
    positions.each { |pos| @array[pos] = code }
  end
end
920
+
921
def cat_from_int(int)
  # A category's integer code is its rank among the keys of @cat_hash.
  ordered_categories = @cat_hash.keys
  ordered_categories[int]
end
924
+
925
def int_from_cat(cat)
  # Inverse of cat_from_int: the rank of the category among the hash keys,
  # or nil when the category is not present.
  @cat_hash.keys.find_index(cat)
end
928
+ end
929
+ end