daru 0.1.3.1 → 0.1.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (92)
  1. checksums.yaml +4 -4
  2. data/.gitignore +2 -0
  3. data/.rspec +2 -1
  4. data/.rspec_formatter.rb +33 -0
  5. data/.rubocop.yml +26 -2
  6. data/History.md +38 -0
  7. data/README.md +22 -13
  8. data/Rakefile +50 -2
  9. data/benchmarks/csv_reading.rb +22 -0
  10. data/daru.gemspec +9 -2
  11. data/lib/daru.rb +36 -4
  12. data/lib/daru/accessors/array_wrapper.rb +6 -1
  13. data/lib/daru/accessors/dataframe_by_row.rb +10 -2
  14. data/lib/daru/accessors/gsl_wrapper.rb +1 -3
  15. data/lib/daru/accessors/nmatrix_wrapper.rb +9 -0
  16. data/lib/daru/category.rb +935 -0
  17. data/lib/daru/core/group_by.rb +29 -38
  18. data/lib/daru/core/merge.rb +186 -145
  19. data/lib/daru/core/query.rb +22 -11
  20. data/lib/daru/dataframe.rb +976 -885
  21. data/lib/daru/date_time/index.rb +166 -166
  22. data/lib/daru/date_time/offsets.rb +66 -77
  23. data/lib/daru/formatters/table.rb +54 -0
  24. data/lib/daru/helpers/array.rb +40 -0
  25. data/lib/daru/index.rb +476 -73
  26. data/lib/daru/io/io.rb +66 -45
  27. data/lib/daru/io/sql_data_source.rb +33 -62
  28. data/lib/daru/iruby/helpers.rb +38 -0
  29. data/lib/daru/iruby/templates/dataframe.html.erb +52 -0
  30. data/lib/daru/iruby/templates/dataframe_mi.html.erb +58 -0
  31. data/lib/daru/iruby/templates/multi_index.html.erb +12 -0
  32. data/lib/daru/iruby/templates/vector.html.erb +27 -0
  33. data/lib/daru/iruby/templates/vector_mi.html.erb +36 -0
  34. data/lib/daru/maths/arithmetic/dataframe.rb +16 -18
  35. data/lib/daru/maths/arithmetic/vector.rb +4 -6
  36. data/lib/daru/maths/statistics/dataframe.rb +8 -15
  37. data/lib/daru/maths/statistics/vector.rb +120 -98
  38. data/lib/daru/monkeys.rb +12 -40
  39. data/lib/daru/plotting/gruff.rb +3 -0
  40. data/lib/daru/plotting/gruff/category.rb +49 -0
  41. data/lib/daru/plotting/gruff/dataframe.rb +91 -0
  42. data/lib/daru/plotting/gruff/vector.rb +57 -0
  43. data/lib/daru/plotting/nyaplot.rb +3 -0
  44. data/lib/daru/plotting/nyaplot/category.rb +34 -0
  45. data/lib/daru/plotting/nyaplot/dataframe.rb +187 -0
  46. data/lib/daru/plotting/nyaplot/vector.rb +46 -0
  47. data/lib/daru/vector.rb +694 -421
  48. data/lib/daru/version.rb +1 -1
  49. data/profile/_base.rb +23 -0
  50. data/profile/df_to_a.rb +10 -0
  51. data/profile/filter.rb +13 -0
  52. data/profile/joining.rb +13 -0
  53. data/profile/sorting.rb +12 -0
  54. data/profile/vector_each_with_index.rb +9 -0
  55. data/spec/accessors/wrappers_spec.rb +2 -4
  56. data/spec/categorical_spec.rb +1734 -0
  57. data/spec/core/group_by_spec.rb +52 -2
  58. data/spec/core/merge_spec.rb +63 -2
  59. data/spec/core/query_spec.rb +236 -80
  60. data/spec/dataframe_spec.rb +1373 -79
  61. data/spec/date_time/data_spec.rb +3 -5
  62. data/spec/date_time/index_spec.rb +154 -17
  63. data/spec/date_time/offsets_spec.rb +3 -4
  64. data/spec/fixtures/empties.dat +2 -0
  65. data/spec/fixtures/strings.dat +2 -0
  66. data/spec/formatters/table_formatter_spec.rb +99 -0
  67. data/spec/helpers_spec.rb +8 -0
  68. data/spec/index/categorical_index_spec.rb +168 -0
  69. data/spec/index/index_spec.rb +283 -0
  70. data/spec/index/multi_index_spec.rb +570 -0
  71. data/spec/io/io_spec.rb +31 -4
  72. data/spec/io/sql_data_source_spec.rb +0 -1
  73. data/spec/iruby/dataframe_spec.rb +172 -0
  74. data/spec/iruby/helpers_spec.rb +49 -0
  75. data/spec/iruby/multi_index_spec.rb +37 -0
  76. data/spec/iruby/vector_spec.rb +107 -0
  77. data/spec/math/arithmetic/dataframe_spec.rb +71 -13
  78. data/spec/math/arithmetic/vector_spec.rb +8 -10
  79. data/spec/math/statistics/dataframe_spec.rb +3 -5
  80. data/spec/math/statistics/vector_spec.rb +45 -55
  81. data/spec/monkeys_spec.rb +32 -9
  82. data/spec/plotting/dataframe_spec.rb +386 -0
  83. data/spec/plotting/vector_spec.rb +230 -0
  84. data/spec/shared/vector_display_spec.rb +215 -0
  85. data/spec/spec_helper.rb +23 -0
  86. data/spec/vector_spec.rb +905 -138
  87. metadata +143 -11
  88. data/.rubocop_todo.yml +0 -44
  89. data/lib/daru/plotting/dataframe.rb +0 -104
  90. data/lib/daru/plotting/vector.rb +0 -38
  91. data/spec/daru_spec.rb +0 -58
  92. data/spec/index_spec.rb +0 -375
@@ -54,6 +54,11 @@ module Daru
54
54
  set_size
55
55
  end
56
56
 
57
+ def fill(*arg)
58
+ @data.fill(*arg)
59
+ set_size
60
+ end
61
+
57
62
  def uniq
58
63
  @data.uniq
59
64
  end
@@ -67,7 +72,7 @@ module Daru
67
72
  end
68
73
 
69
74
  def compact
70
- @data - @context.missing_values
75
+ @data - Daru::MISSING_VALUES
71
76
  end
72
77
 
73
78
  def mean
@@ -9,8 +9,16 @@ module Daru
9
9
  @data_frame[*names, :row]
10
10
  end
11
11
 
12
- def []=(name, vector)
13
- @data_frame[name, :row] = vector
12
+ def []=(*names, vector)
13
+ @data_frame[*names, :row] = vector
14
+ end
15
+
16
+ def at *positions
17
+ @data_frame.row_at(*positions)
18
+ end
19
+
20
+ def set_at positions, vector
21
+ @data_frame.set_row_at(positions, vector)
14
22
  end
15
23
  end
16
24
  end
@@ -61,9 +61,7 @@ module Daru
61
61
  attr_reader :data
62
62
 
63
63
  def compact
64
- # set missing to [] incase @context is not Daru::Vector
65
- missing = @context.missing_values rescue []
66
- ::GSL::Vector.alloc(@data.to_a - missing.map(&:to_f))
64
+ ::GSL::Vector.alloc(@data.to_a - [Float::NAN])
67
65
  end
68
66
 
69
67
  [:mean, :min, :max, :prod, :sum].each do |method|
@@ -14,9 +14,12 @@ module Daru
14
14
  self
15
15
  end
16
16
 
17
+ # :nocov:
18
+ # FIXME: not sure why this kind of wrapper has such poor coverage
17
19
  def inject(*args, &block)
18
20
  @data[0...@size].inject(*args, &block)
19
21
  end
22
+ # :nocov:
20
23
 
21
24
  attr_reader :size, :data, :nm_dtype
22
25
 
@@ -43,9 +46,11 @@ module Daru
43
46
  @data[index] = value
44
47
  end
45
48
 
49
+ # :nocov:
46
50
  def == other
47
51
  @data[0...@size] == other[0...@size] and @size == other.size
48
52
  end
53
+ # :nocov:
49
54
 
50
55
  def delete_at index
51
56
  arry = @data.to_a
@@ -58,10 +63,12 @@ module Daru
58
63
  @data.to_a.index key
59
64
  end
60
65
 
66
+ # :nocov:
61
67
  def << element
62
68
  resize if @size >= @data.size
63
69
  self[@size] = element
64
70
  end
71
+ # :nocov:
65
72
 
66
73
  def to_a
67
74
  @data[0...@size].to_a
@@ -77,6 +84,7 @@ module Daru
77
84
  @data = NMatrix.new [size], @data.to_a, dtype: @nm_dtype
78
85
  end
79
86
 
87
+ # :nocov:
80
88
  def mean
81
89
  @data[0...@size].mean.first
82
90
  end
@@ -96,6 +104,7 @@ module Daru
96
104
  def min
97
105
  @data[0...@size].min
98
106
  end
107
+ # :nocov:
99
108
  end
100
109
  end
101
110
  end if Daru.has_nmatrix?
@@ -0,0 +1,935 @@
1
+ module Daru
2
+ module Category # rubocop:disable Metrics/ModuleLength
3
+ attr_accessor :base_category
4
+ attr_reader :index, :coding_scheme, :name
5
+
6
+ # For debugging. To be removed
7
+ attr_reader :array, :cat_hash, :map_int_cat
8
+
9
+ # Initializes a vector to store categorical data.
10
+ # @note Base category is set to the first category encountered in the vector.
11
+ # @param [Array] data the categorical data
12
+ # @param [Hash] opts the options
13
+ # @option opts [Boolean] :ordered true if data is ordered, false otherwise
14
+ # @option opts [Array] :categories categories to associate with the vector.
15
+ # It adds extra categories if specified and also provides the order of categories.
16
+ # @option opts [object] :index gives index to vector. By default it's from 0 to size-1
17
+ # @return the categorical data created
18
+ # @example
19
+ # dv = Daru::Vector.new [:a, 1, :a, 1, :c],
20
+ # type: :category,
21
+ # ordered: true,
22
+ # categories: [:a, :b, :c, 1]
23
+ # # => #<Daru::Vector(5)>
24
+ # # 0 a
25
+ # # 1 1
26
+ # # 2 a
27
+ # # 3 1
28
+ # # 4 c
29
+ def initialize_category data, opts={}
30
+ @type = :category
31
+ initialize_core_attributes data
32
+
33
+ if opts[:categories]
34
+ validate_categories(opts[:categories])
35
+ add_extra_categories(opts[:categories] - categories)
36
+ order_with opts[:categories]
37
+ end
38
+
39
+ # Specify if the categories are ordered or not.
40
+ # By default it's unordered
41
+ @ordered = opts[:ordered] || false
42
+
43
+ # The coding scheme to code with. Default is dummy coding.
44
+ @coding_scheme = :dummy
45
+
46
+ # Base category which won't be present in the coding
47
+ @base_category = @cat_hash.keys.first
48
+
49
+ # Stores the name of the vector
50
+ @name = opts[:name]
51
+
52
+ # Index of the vector
53
+ @index = coerce_index opts[:index]
54
+
55
+ self
56
+ end
57
+
58
+ def name= new_name
59
+ @name = new_name
60
+ self
61
+ end
62
+
63
+ def plotting_library= lib
64
+ case lib
65
+ when :gruff, :nyaplot
66
+ @plotting_library = lib
67
+ extend Module.const_get(
68
+ "Daru::Plotting::Category::#{lib.to_s.capitalize}Library"
69
+ ) if Daru.send("has_#{lib}?".to_sym)
70
+ else
71
+ raise ArguementError, "Plotting library #{lib} not supported. "\
72
+ 'Supported libraries are :nyaplot and :gruff'
73
+ end
74
+ end
75
+
76
+ alias_method :rename, :name=
77
+
78
+ # Returns an enumerator that enumerates on categorical data
79
+ # @return [Enumerator] an enumerator that enumerates over data stored in vector
80
+ def each
81
+ return enum_for(:each) unless block_given?
82
+ @array.each { |pos| yield cat_from_int pos }
83
+ self
84
+ end
85
+
86
+ # Returns all categorical data
87
+ # @return [Array] array of all categorical data which vector is storing
88
+ # @example
89
+ # dv = Daru::Vector.new [:a, 1, :a, 1, :c], type: :category
90
+ # dv.to_a
91
+ # # => [:a, 1, :a, 1, :c]
92
+ def to_a
93
+ each.to_a
94
+ end
95
+
96
+ # Duplicates a vector
97
+ # @return [Daru::Vector] duplicated vector
98
+ # @example
99
+ # dv = Daru::Vector.new [:a, 1, :a, 1, :c], type: :category
100
+ # dv.dup
101
+ # # => #<Daru::Vector(5)>
102
+ # # 0 a
103
+ # # 1 1
104
+ # # 2 a
105
+ # # 3 1
106
+ # # 4 c
107
+ def dup
108
+ Daru::Vector.new to_a.dup,
109
+ name: @name,
110
+ index: @index.dup,
111
+ type: :category,
112
+ categories: categories,
113
+ ordered: ordered?
114
+ end
115
+
116
+ # Associates a category to the vector.
117
+ # @param [Array] *new_categories new categories to be associated
118
+ # @example
119
+ # dv = Daru::Vector.new [:a, 1, :a, 1, :c], type: :category
120
+ # dv.add_category :b
121
+ # dv.categories
122
+ # # => [:a, 1, :c, :b]
123
+ def add_category(*new_categories)
124
+ new_categories -= categories
125
+ add_extra_categories new_categories
126
+ end
127
+
128
+ # Returns frequency of given category
129
+ # @param [object] category given category whose count has to be found
130
+ # @return count/frequency of given category
131
+ # @example
132
+ # dv = Daru::Vector.new [:a, 1, :a, 1, :c], type: :category
133
+ # dv.count :a
134
+ # # => 2
135
+ def count category
136
+ raise ArgumentError, "Invalid category #{category}" unless
137
+ categories.include?(category)
138
+
139
+ @cat_hash[category].size
140
+ end
141
+
142
+ # Returns a vector storing count/frequency of each category
143
+ # @return [Daru::Vector] Return a vector whose indexes are categories
144
+ # and corresponding values are its count
145
+ # @example
146
+ # dv = Daru::Vector.new [:a, 1, :a, 1, :c], type: :category
147
+ # dv.frequencies
148
+ # # => #<Daru::Vector(4)>
149
+ # # a 2
150
+ # # b 0
151
+ # # c 1
152
+ # # 1 2
153
+ def frequencies type=:count
154
+ counts = @cat_hash.values.map(&:size)
155
+ values =
156
+ case type
157
+ when :count
158
+ counts
159
+ when :fraction
160
+ counts.map { |c| c / size.to_f }
161
+ when :percentage
162
+ counts.map { |c| c / size.to_f * 100 }
163
+ end
164
+ Daru::Vector.new values, index: categories, name: name
165
+ end
166
+
167
+ # Returns vector for indexes/positions specified
168
+ # @param [Array] *indexes indexes/positions for which values have to be retrieved
169
+ # @note It accepts both indexes and positions. In case of collision,
170
+ # the argument will be treated as an index
171
+ # @return vector containing values specified at specified indexes/positions
172
+ # @example
173
+ # dv = Daru::Vector.new [:a, 1, :a, 1, :c],
174
+ # type: :category,
175
+ # index: 'a'..'e'
176
+ # dv[:a, 1]
177
+ # # => #<Daru::Vector(2)>
178
+ # # a a
179
+ # # b 1
180
+ # dv[0, 1]
181
+ # # => #<Daru::Vector(2)>
182
+ # # a a
183
+ # # b 1
184
+ def [] *indexes
185
+ positions = @index.pos(*indexes)
186
+ return category_from_position(positions) if positions.is_a? Integer
187
+
188
+ Daru::Vector.new positions.map { |pos| category_from_position pos },
189
+ index: @index.subset(*indexes),
190
+ name: @name,
191
+ type: :category,
192
+ ordered: @ordered,
193
+ categories: categories
194
+ end
195
+
196
+ # Returns vector for positions specified.
197
+ # @param [Array] *positions positions at which values are to be retrieved.
198
+ # @return vector containing values specified at specified positions
199
+ # @example
200
+ # dv = Daru::Vector.new [:a, 1, :a, 1, :c], type: :category
201
+ # dv.at 0..-2
202
+ # # => #<Daru::Vector(4)>
203
+ # # 0 a
204
+ # # 1 1
205
+ # # 2 a
206
+ # # 3 1
207
+ def at *positions
208
+ original_positions = positions
209
+ positions = coerce_positions(*positions)
210
+ validate_positions(*positions)
211
+
212
+ return category_from_position(positions) if positions.is_a? Integer
213
+
214
+ Daru::Vector.new positions.map { |pos| category_from_position(pos) },
215
+ index: @index.at(*original_positions),
216
+ name: @name,
217
+ type: :category,
218
+ ordered: @ordered,
219
+ categories: categories
220
+ end
221
+
222
+ # Modifies values at specified indexes/positions.
223
+ # @note In order to add a new category you need to associate it via #add_category
224
+ # @param [Array] *indexes indexes/positions at which to modify value
225
+ # @param [object] val value to assign at specific indexes/positions
226
+ # @return modified vector
227
+ # @example
228
+ # dv = Daru::Vector.new [:a, 1, :a, 1, :c], type: :category
229
+ # dv.add_category :b
230
+ # dv[0] = :b
231
+ # dv
232
+ # # => #<Daru::Vector(5)>
233
+ # # 0 b
234
+ # # 1 1
235
+ # # 2 a
236
+ # # 3 1
237
+ # # 4 c
238
+ def []= *indexes, val
239
+ positions = @index.pos(*indexes)
240
+
241
+ if positions.is_a? Numeric
242
+ modify_category_at positions, val
243
+ else
244
+ positions.each { |pos| modify_category_at pos, val }
245
+ end
246
+ self
247
+ end
248
+
249
+ # Modifies values at specified positions.
250
+ # @param [Array] positions positions at which to modify value
251
+ # @param [object] val value to assign at specific positions
252
+ # @return modified vector
253
+ # @example
254
+ # dv = Daru::Vector.new [:a, 1, :a, 1, :c], type: :category
255
+ # dv.add_category :b
256
+ # dv.set_at [0, 1], :b
257
+ # # => #<Daru::Vector(5)>
258
+ # # 0 b
259
+ # # 1 b
260
+ # # 2 a
261
+ # # 3 1
262
+ # # 4 c
263
+ def set_at positions, val
264
+ validate_positions(*positions)
265
+ positions.map { |pos| modify_category_at pos, val }
266
+ self
267
+ end
268
+
269
+ # Size of categorical data.
270
+ # @return total number of values in the vector
271
+ # @example
272
+ # dv = Daru::Vector.new [:a, 1, :a, 1, :c], type: :category
273
+ # dv.size
274
+ # # => 5
275
+ def size
276
+ @array.size
277
+ end
278
+
279
+ # Tells whether vector is ordered or not.
280
+ # @return [Boolean] true if vector is ordered, false otherwise
281
+ # @example
282
+ # dv = Daru::Vector.new [:a, 1, :a, 1, :c], type: :category
283
+ # dv.ordered?
284
+ # # => false
285
+ def ordered?
286
+ @ordered
287
+ end
288
+
289
+ # Make categorical data ordered or unordered.
290
+ # @param [Boolean] bool true if categorical data is to be ordered, false otherwise
291
+ # @example
292
+ # dv = Daru::Vector.new [:a, 1, :a, 1, :c], type: :category
293
+ # dv.ordered = true
294
+ # dv.ordered?
295
+ # # => true
296
+ def ordered= bool
297
+ @ordered = bool
298
+ end
299
+
300
+ # Returns all the categories with the inherent order
301
+ # @return [Array] categories of the vector with the order
302
+ # @example
303
+ # dv = Daru::Vector.new [:a, 1, :a, 1, :c],
304
+ # type: :category,
305
+ # categories: [:a, :b, :c, 1]
306
+ # dv.categories
307
+ # # => [:a, :b, :c, 1]
308
+ def categories
309
+ @cat_hash.keys
310
+ end
311
+
312
+ alias_method :order, :categories
313
+
314
+ # Sets order of the categories.
315
+ # @note If extra categories are specified, they get added too.
316
+ # @param [Array] cat_with_order categories specifying their order
317
+ # @example
318
+ # dv = Daru::Vector.new [:a, 1, :a, 1, :c], type: :category
319
+ # dv.categories = [:a, :b, :c, 1]
320
+ # dv.categories
321
+ # # => [:a, :b, :c, 1]
322
+ def categories= cat_with_order
323
+ validate_categories(cat_with_order)
324
+ add_extra_categories(cat_with_order - categories)
325
+ order_with cat_with_order
326
+ end
327
+
328
+ # Rename categories.
329
+ # @note The order of categories after renaming is preserved but new categories
330
+ # are added at the end in the order. Also the base-category is reassigned
331
+ # to new value if it is renamed
332
+ # @param [Hash] old_to_new a hash mapping categories whose name to be changed
333
+ # to their new names
334
+ # @example
335
+ # dv = Daru::Vector.new [:a, 1, :a, 1, :c], type: :category
336
+ # dv.rename_categories :a => :b
337
+ # dv
338
+ # # => #<Daru::Vector(5)>
339
+ # # 0 b
340
+ # # 1 1
341
+ # # 2 b
342
+ # # 3 1
343
+ # # 4 c
344
+ def rename_categories old_to_new
345
+ old_categories = categories
346
+ data = to_a.map do |cat|
347
+ old_to_new.include?(cat) ? old_to_new[cat] : cat
348
+ end
349
+
350
+ initialize_core_attributes data
351
+ self.categories = (old_categories - old_to_new.keys) | old_to_new.values
352
+ self.base_category = old_to_new[base_category] if
353
+ old_to_new.include? base_category
354
+ self
355
+ end
356
+
357
+ # Removes the unused categories
358
+ # @note If base category is removed, then the first occurring category in the
359
+ # data is taken as base category. Order of the undeleted categories
360
+ # remains preserved.
361
+ # @return [Daru::Vector] Makes changes in the vector itself i.e. deletes
362
+ # the unused categories and returns itself
363
+ # @example
364
+ # dv = Daru::Vector.new [:one, :two, :one], type: :category,
365
+ # categories: [:three, :two, :one]
366
+ # dv.remove_unused_categories
367
+ # dv.categories
368
+ # # => [:two, :one]
369
+ def remove_unused_categories
370
+ old_categories = categories
371
+
372
+ initialize_core_attributes to_a
373
+ self.categories = old_categories & categories
374
+ self.base_category = @cat_hash.keys.first unless
375
+ categories.include? base_category
376
+ self
377
+ end
378
+
379
+ # Returns the minimum category according to the order specified.
380
+ # @note This operation will only work if vector is ordered.
381
+ # To set the vector ordered do `vector.ordered = true`
382
+ # @return [object] the minimum category according to the order
383
+ # @example
384
+ # dv = Daru::Vector.new ['second', 'second', 'third', 'first'],
385
+ # categories: ['first', 'second', 'third'], type: :category, ordered: true
386
+ # dv.min
387
+ # # => 'first'
388
+ def min
389
+ assert_ordered :min
390
+ categories.first
391
+ end
392
+
393
+ # Returns the maximum category according to the order specified.
394
+ # @note This operation will only work if vector is ordered.
395
+ # To set the vector ordered do `vector.ordered = true`
396
+ # @return [object] the maximum category according to the order
397
+ # @example
398
+ # dv = Daru::Vector.new ['second', 'second', 'third', 'first'],
399
+ # categories: ['first', 'second', 'third'], type: :category, ordered: true
400
+ # dv.max
401
+ # # => 'third'
402
+ def max
403
+ assert_ordered :max
404
+ categories.last
405
+ end
406
+
407
+ # Sorts the vector in the order specified.
408
+ # @note This operation will only work if vector is ordered.
409
+ # To set the vector ordered, do `vector.ordered = true`
410
+ # @return [Daru::Vector] sorted vector
411
+ # @example
412
+ # dv = Daru::Vector.new ['second', 'second', 'third', 'first'],
413
+ # categories: ['first', 'second', 'third'],
414
+ # type: :category,
415
+ # ordered: true
416
+ # dv.sort!
417
+ # # => #<Daru::Vector(4)>
418
+ # # 3 first
419
+ # # 0 second
420
+ # # 1 second
421
+ # # 2 third
422
+ def sort! # rubocop:disable Metrics/AbcSize
423
+ # TODO: Simplify the code
424
+ assert_ordered :sort
425
+
426
+ # Build sorted index
427
+ old_index = @index.to_a
428
+ new_index = @cat_hash.values.map do |positions|
429
+ old_index.values_at(*positions)
430
+ end.flatten
431
+ @index = @index.class.new new_index
432
+
433
+ # Build sorted data
434
+ @cat_hash = categories.inject([{}, 0]) do |acc, cat|
435
+ hash, count = acc
436
+ cat_count = @cat_hash[cat].size
437
+ cat_count.times { |i| @array[count+i] = int_from_cat(cat) }
438
+ hash[cat] = (count...(cat_count+count)).to_a
439
+ [hash, count + cat_count]
440
+ end.first
441
+
442
+ self
443
+ end
444
+
445
+ def sort
446
+ dup.sort!
447
+ end
448
+
449
+ # Set coding scheme
450
+ # @param [Symbol] scheme to set
451
+ # @example
452
+ # dv = Daru::Vector.new [:a, 1, :a, 1, :c], type: :category
453
+ # dv.coding_scheme = :deviation
454
+ # dv.coding_scheme
455
+ # # => :deviation
456
+ def coding_scheme= scheme
457
+ raise ArgumentError, "Unknown or unsupported coding scheme #{scheme}." unless
458
+ CODING_SCHEMES.include? scheme
459
+ @coding_scheme = scheme
460
+ end
461
+
462
+ CODING_SCHEMES = [:dummy, :deviation, :helmert, :simple].freeze
463
+
464
+ # Contrast code the vector according to the coding scheme set.
465
+ # @note To set the coding scheme use #coding_scheme=
466
+ # @param [true, false] full true if you want k variables for k categories,
467
+ # false if you want k-1 variables for k categories
468
+ # @return [Daru::DataFrame] dataframe containing all coded variables
469
+ # @example
470
+ # dv = Daru::Vector.new [:a, 1, :a, 1, :c], type: :category
471
+ # dv.contrast_code
472
+ # # => #<Daru::DataFrame(5x2)>
473
+ # # daru_1 daru_c
474
+ # # 0 0 0
475
+ # # 1 1 0
476
+ # # 2 0 0
477
+ # # 3 1 0
478
+ # # 4 0 1
479
+ def contrast_code opts={}
480
+ if opts[:user_defined]
481
+ user_defined_coding(opts[:user_defined])
482
+ else
483
+ # TODO: Make various coding schemes code DRY
484
+ send("#{coding_scheme}_coding".to_sym, opts[:full] || false)
485
+ end
486
+ end
487
+
488
+ # Two categorical vectors are equal if their index and corresponding values are same
489
+ # @return [true, false] true if two vectors are similar
490
+ # @example
491
+ # dv = Daru::Vector.new [:a, 1, :a, 1, :c], type: :category
492
+ # other = Daru::Vector.new [:a, 1, :a, 1, :c],
493
+ # type: :category,
494
+ # index: 1..5
495
+ # dv == other
496
+ # # => false
497
+ def == other
498
+ size == other.size &&
499
+ to_a == other.to_a &&
500
+ index == other.index
501
+ end
502
+
503
+ # Returns integer coding for categorical data in the order starting from 0.
504
+ # For example if order is [:a, :b, :c], then :a, will be coded as 0, :b as 1 and :c as 2
505
+ # @return [Array] integer coding of all values of vector
506
+ # @example
507
+ # dv = Daru::Vector.new [:a, 1, :a, 1, :c],
508
+ # type: :category,
509
+ # categories: [:a, :b, :c, 1]
510
+ # dv.to_ints
511
+ # # => [0, 1, 0, 1, 2]
512
+ def to_ints
513
+ @array
514
+ end
515
+
516
+ # Reorder the vector with given positions
517
+ # @note Unlike #reindex! which takes index as input, it takes
518
+ # positions as an input to reorder the vector
519
+ # @param [Array] order the order to reorder the vector with
520
+ # @return reordered vector
521
+ # @example
522
+ # dv = Daru::Vector.new [3, 2, 1], index: ['c', 'b', 'a'], type: :category
523
+ # dv.reorder! [2, 1, 0]
524
+ # # => #<Daru::Vector(3)>
525
+ # # a 1
526
+ # # b 2
527
+ # # c 3
528
+ def reorder! order
529
+ raise ArgumentError, 'Invalid order specified' unless
530
+ order.sort == size.times.to_a
531
+ # TODO: Room for optimization
532
+ old_data = to_a
533
+ new_data = order.map { |i| old_data[i] }
534
+ initialize_core_attributes new_data
535
+ self
536
+ end
537
+
538
+ # Sets new index for vector. Preserves index->value correspondence.
539
+ # @note Unlike #reorder! which takes positions as input it takes
540
+ # index as an input to reorder the vector
541
+ # @param [Daru::Index, Daru::MultiIndex, Array] idx new index to order with
542
+ # @return [Daru::Vector] vector reindexed with new index
543
+ # @example
544
+ # dv = Daru::Vector.new [3, 2, 1], index: ['c', 'b', 'a'], type: :category
545
+ # dv.reindex! ['a', 'b', 'c']
546
+ # # => #<Daru::Vector(3)>
547
+ # # a 1
548
+ # # b 2
549
+ # # c 3
550
+ def reindex! idx
551
+ idx = Daru::Index.new idx unless idx.is_a? Daru::Index
552
+ raise ArgumentError, 'Invalid index specified' unless
553
+ idx.to_a.sort == index.to_a.sort
554
+
555
+ old_categories = categories
556
+ data = idx.map { |i| self[i] }
557
+ initialize_core_attributes data
558
+ self.categories = old_categories
559
+ self.index = idx
560
+ self
561
+ end
562
+
563
+ {
564
+ eq: :==,
565
+ not_eq: :!=,
566
+ lt: :<,
567
+ lteq: :<=,
568
+ mt: :>,
569
+ mteq: :>=
570
+ }.each do |method, operator|
571
+ define_method(method) do |other|
572
+ mod = Daru::Core::Query
573
+ if other.is_a?(Daru::Vector)
574
+ mod.apply_vector_operator operator, to_ints, other.to_ints
575
+ else
576
+ mod.apply_scalar_operator operator, @array, int_from_cat(other)
577
+ end
578
+ end
579
+ end
580
+ alias :gt :mt
581
+ alias :gteq :mteq
582
+
583
+ # For querying the data
584
+ # @param [object] arel like query syntax
585
+ # @return [Daru::Vector] Vector which makes the conditions true
586
+ # @example
587
+ # dv = Daru::Vector.new ['I', 'II', 'I', 'III', 'I', 'II'],
588
+ # type: :category,
589
+ # ordered: true,
590
+ # categories: ['I', 'II', 'III']
591
+ # dv.where(dv.mt('I') & dv.lt('III'))
592
+ # # => #<Daru::Vector(2)>
593
+ # # 1 II
594
+ # # 5 II
595
+ def where bool_array
596
+ Daru::Core::Query.vector_where self, bool_array
597
+ end
598
+
599
+ # Gives the summary of data using following parameters
600
+ # - size: size of the data
601
+ # - categories: total number of categories
602
+ # - max_freq: Max no of times a category occurs
603
+ # - max_category: The category which occurs max no of times
604
+ # - min_freq: Min no of times a category occurs
605
+ # - min_category: The category which occurs min no of times
606
+ # @return [Daru::Vector] Vector with index as following parameters
607
+ # and values as values to these parameters
608
+ # @example
609
+ # dv = Daru::Vector.new [:a, 1, :a, 1, :c], type: :category
610
+ # dv.describe
611
+ # # => #<Daru::Vector(6)>
612
+ # # size 5
613
+ # # categories 3
614
+ # # max_freq 2
615
+ # # max_category a
616
+ # # min_freq 1
617
+ # # min_category c
618
+ def describe
619
+ Daru::Vector.new(
620
+ size: size,
621
+ categories: categories.size,
622
+ max_freq: @cat_hash.values.map(&:size).max,
623
+ max_category: @cat_hash.keys.max_by { |cat| @cat_hash[cat].size },
624
+ min_freq: @cat_hash.values.map(&:size).min,
625
+ min_category: @cat_hash.keys.min_by { |cat| @cat_hash[cat].size }
626
+ )
627
+ end
628
+
629
+ # Does nothing since it's already of type category.
630
+ # @return [Daru::Vector] categorical vector
631
+ def to_category
632
+ self
633
+ end
634
+
635
+ # Converts a category type vector to non category type vector
636
+ # @return [Daru::Vector] non category type vector
637
+ def to_non_category
638
+ Daru::Vector.new to_a, name: name, index: index
639
+ end
640
+
641
+ # Sets index of the vector
642
+ # @param [Daru::Index, Daru::MultiIndex, Daru::CategoricalIndex, Array, Range]
643
+ # idx new index to assign to vector
644
+ # @return [Daru::Index, Daru::CategoricalIndex, Daru::MultiIndex] the index assigned
645
+ # @example
646
+ # dv = Daru::Vector.new [1, 2, 3], type: :category
647
+ # dv.index = 'a'..'c'
648
+ # dv
649
+ # # => #<Daru::Vector(3)>
650
+ # # a 1
651
+ # # b 2
652
+ # # c 3
653
+ def index= idx
654
+ @index = coerce_index idx
655
+ end
656
+
657
+ # Check if any one of mentioned values occur in the vector
658
+ # @param [Array] *values values to check for
659
+ # @return [true, false] returns true if any one of specified values
660
+ # occur in the vector
661
+ # @example
662
+ # dv = Daru::Vector.new [1, 2, 3, 4, nil]
663
+ # dv.include_values? nil, Float::NAN
664
+ # # => true
665
+ def include_values?(*values)
666
+ values.any? { |v| @cat_hash.include?(v) && !@cat_hash[v].empty? }
667
+ end
668
+
669
+ # Return a vector with specified values removed
670
+ # @param [Array] *values values to reject from resultant vector
671
+ # @return [Daru::Vector] vector with specified values removed
672
+ # @example
673
+ # dv = Daru::Vector.new [1, 2, nil, Float::NAN], type: :category
674
+ # dv.reject_values nil, Float::NAN
675
+ # # => #<Daru::Vector(2)>
676
+ # # 0 1
677
+ # # 1 2
678
+ def reject_values(*values)
679
+ resultant_pos = size.times.to_a - values.flat_map { |v| @cat_hash[v] }
680
+ dv = at(*resultant_pos)
681
+ unless dv.is_a? Daru::Vector
682
+ pos = resultant_pos.first
683
+ dv = at(pos..pos)
684
+ end
685
+ dv.remove_unused_categories
686
+ end
687
+
688
+ # Count the number of values specified
689
+ # @param [Array] *values values to count for
690
+ # @return [Integer] the number of times the values mentioned occurs
691
+ # @example
692
+ # dv = Daru::Vector.new [1, 2, 1, 2, 3, 4, nil, nil]
693
+ # dv.count_values nil
694
+ # # => 2
695
+ def count_values(*values)
696
+ values.map { |v| @cat_hash[v].size if @cat_hash.include? v }
697
+ .compact
698
+ .inject(0, :+)
699
+ end
700
+
701
+ # Return indexes of values specified
702
+ # @param [Array] *values values to find indexes for
703
+ # @return [Array] array of indexes of values specified
704
+ # @example
705
+ # dv = Daru::Vector.new [1, 2, nil, Float::NAN], index: 11..14
706
+ # dv.indexes nil, Float::NAN
707
+ # # => [13, 14]
708
+ def indexes(*values)
709
+ values &= categories
710
+ index.to_a.values_at(*values.flat_map { |v| @cat_hash[v] }.sort)
711
+ end
712
+
713
+ # Replaces specified values with a new value
714
+ # @param [Array] old_values array of values to replace
715
+ # @param [object] new_value new value to replace with
716
+ # @note It performs the replace in place.
717
+ # @return [Daru::Vector] Same vector itself with values
718
+ # replaced with new value
719
+ # @example
720
+ # dv = Daru::Vector.new [1, 2, :a, :b]
721
+ # dv.replace_values [:a, :b], nil
722
+ # dv
723
+ # # =>
724
+ # # #<Daru::Vector:19903200 @name = nil @metadata = {} @size = 4 >
725
+ # # nil
726
+ # # 0 1
727
+ # # 1 2
728
+ # # 2 nil
729
+ # # 3 nil
730
+ def replace_values old_values, new_value
731
+ old_values = [old_values] unless old_values.is_a? Array
732
+ rename_hash = old_values.map { |v| [v, new_value] }.to_h
733
+ rename_categories rename_hash
734
+ end
735
+
736
+ def positions(*values)
737
+ values &= categories
738
+ values.flat_map { |v| @cat_hash[v] }.sort
739
+ end
740
+
741
+ private
742
+
743
+ def validate_categories input_categories
744
+ raise ArgumentError, 'Input categories and speculated categories mismatch' unless
745
+ (categories - input_categories).empty?
746
+ end
747
+
748
+ def add_extra_categories extra_categories
749
+ extra_categories.each { |cat| @cat_hash[cat] = [] }
750
+ end
751
+
752
# Builds the internal representation of the categorical data:
# @cat_hash maps each category to the positions where it occurs and
# @array stores one integer code per element.
# @param [Enumerable] data the raw values of the vector
def initialize_core_attributes data
  # Create a hash to map each category to positional indexes
  grouped = data.each_with_index.group_by(&:first)
  @cat_hash = grouped.map { |cat, pairs| [cat, pairs.map(&:last)] }.to_h

  # Map each category to a unique integer (its order of first appearance)
  # for effective storage in @array
  map_cat_int = @cat_hash.keys.each_with_index.to_h

  # To link every instance to its category, store the integer code of
  # each element. Codes are looked up one element at a time: splatting
  # the whole data set into a single values_at call passes every element
  # as a method argument, which can raise SystemStackError on large data.
  @array = data.map { |cat| map_cat_int[cat] }

  # Include plotting functionality
  self.plotting_library = Daru.plotting_library
end
767
+
768
# Looks up the category of the element stored at a position.
# @param position position within @array
# @return the category whose integer code is stored at that position
def category_from_position position
  code = @array[position]
  cat_from_int code
end
771
+
772
# Guards operations that are only meaningful for ordered categories.
# @param operation name of the operation, used in the error message
# @raise [ArgumentError] if the vector is not ordered
def assert_ordered operation
  return if ordered?

  # TODO: Change ArgumentError to something more expressive
  raise ArgumentError, "Can not apply #{operation} when vector is unordered. "\
    'To make the categorical data ordered, use #ordered = true'
end
778
+
779
# Builds the dummy-coded data frame: one column per category, each
# produced by #dummy_code from that category's positions.
# @param full when falsy, the column of the base category is left out
# @return [Daru::DataFrame] frame indexed by @index whose column names
#   come from #create_names
def dummy_coding full
  coded = @cat_hash.keys
  coded.delete(base_category) unless full

  columns = @cat_hash.values_at(*coded).map { |positions| dummy_code positions }

  Daru::DataFrame.new(columns, index: @index, order: create_names(coded))
end
791
+
792
# Builds one dummy-coded column.
# @param [Array] positions positions at which the category occurs
# @return [Array] array of length size holding 1 at the given positions
#   and 0 everywhere else
def dummy_code positions
  positions.each_with_object(Array.new(size, 0)) { |pos, column| column[pos] = 1 }
end
797
+
798
# Builds the simple-coded data frame: one column per category, each
# produced by #simple_code from that category's positions.
# @param full when falsy, the column of the base category is left out
# @return [Daru::DataFrame] frame indexed by @index whose column names
#   come from #create_names
def simple_coding full
  coded = @cat_hash.keys
  coded.delete(base_category) unless full

  columns = @cat_hash.values_at(*coded).map { |positions| simple_code positions }

  Daru::DataFrame.new(columns, index: @index, order: create_names(coded))
end
810
+
811
# Builds one simple-coded column.
# @param [Array] positions positions at which the category occurs
# @return [Array] array of length size holding (n-1)/n at the given
#   positions and -1/n everywhere else, where n is the number of
#   categories
def simple_code positions
  n = @cat_hash.size.to_f
  positions.each_with_object(Array.new(size, -1 / n)) do |pos, column|
    column[pos] = (n - 1) / n
  end
end
817
+
818
# Builds the Helmert-coded data frame: one column, produced by
# #helmert_code, for every category except the last. Any arguments are
# accepted and ignored.
# @return [Daru::DataFrame] frame indexed by @index whose column names
#   come from #create_names
def helmert_coding(*)
  coded = @cat_hash.keys[0..-2]
  columns = Array.new(coded.size) { |level| helmert_code level }

  Daru::DataFrame.new(columns, index: @index, order: create_names(coded))
end
829
+
830
# Builds the Helmert-coded column for one category code.
# @param [Integer] index integer code of the category being coded
# @return [Array] one value per element of @array: (n-1)/n where the
#   element's code equals index, -1/n where it is greater and 0 where it
#   is smaller, with n = number of categories - index
def helmert_code index
  n = (categories.size - index).to_f
  own_weight = (n - 1) / n
  later_weight = -1 / n

  @array.map do |code|
    next own_weight if code == index

    code > index ? later_weight : 0
  end
end
843
+
844
# Builds the deviation-coded data frame: one column, produced by
# #deviation_code, for every category except the last. Any arguments
# are accepted and ignored.
# @return [Daru::DataFrame] frame indexed by @index whose column names
#   come from #create_names
def deviation_coding(*)
  coded = @cat_hash.keys[0..-2]
  columns = Array.new(coded.size) { |level| deviation_code level }

  Daru::DataFrame.new(columns, index: @index, order: create_names(coded))
end
855
+
856
# Builds the deviation-coded column for one category code.
# @param [Integer] index integer code of the category being coded
# @return [Array] one value per element of @array: 1 where the element's
#   code equals index, -1 where it is the code of the last category and
#   0 otherwise
def deviation_code index
  last_code = categories.size - 1

  @array.map do |code|
    if code == index
      1
    elsif code == last_code
      -1
    else
      0
    end
  end
end
866
+
867
# Builds a coded data frame from a user supplied coding scheme.
# For every position the row of df selected by at(pos) is copied, so df
# is presumably keyed by category (assumes #at returns the category at
# a position; confirm against its definition).
# @param df coding scheme; must respond to #row and #vectors
# @return [Daru::DataFrame] frame indexed by @index with the same
#   vector order as df
def user_defined_coding df
  coded_rows = Array.new(size) { |pos| df.row[at(pos)].to_a }

  Daru::DataFrame.rows(coded_rows, index: @index, order: df.vectors.to_a)
end
872
+
873
# Derives one column name per category by joining the vector's name and
# the category with an underscore.
# @param [Array] categories categories to build names for
# @return [Array] Symbols when the vector's name is a Symbol, Strings
#   otherwise
def create_names categories
  categories.map do |cat|
    label = "#{name}_#{cat}"
    name.is_a?(Symbol) ? label.to_sym : label
  end
end
878
+
879
# Converts the supplied index specification into an index object and
# checks it with #validate_index.
# @param index a Daru::MultiIndex, Daru::CategoricalIndex or Daru::Index
#   (used as is), nil (an index is built from the vector's size), or a
#   Range / Array (wrapped in a Daru::Index)
# @return the coerced index
# @raise [ArgumentError] if index is of any other type
def coerce_index index
  coerced =
    case index
    when Daru::MultiIndex, Daru::CategoricalIndex, Daru::Index
      index
    when nil
      Daru::Index.new size
    when Range
      Daru::Index.new index.to_a
    when Array
      Daru::Index.new index
    else
      raise ArgumentError, "Unrecognized index type #{index.class}"
    end

  validate_index coerced
  coerced
end
896
+
897
# Checks that an index has exactly one entry per element of the vector.
# @param index object responding to #size
# @raise [ArgumentError] if the index size differs from the vector size
def validate_index index
  return if size == index.size

  # TODO: raise a dedicated SizeError instead of ArgumentError
  raise ArgumentError, "Size of index (#{index.size}) does not match "\
    "size of vector (#{size})"
end
902
+
903
# Changes the category of the element at one position, keeping @array
# and @cat_hash consistent with each other.
# @param pos position of the element to change
# @param category the new category; must already be a category of this
#   vector
# @return [Array] the updated list of positions of the new category
# @raise [ArgumentError] if category is not an existing category
def modify_category_at pos, category
  unless categories.include? category
    raise ArgumentError, "Invalid category #{category}, "\
      'to add a new category use #add_category'
  end

  previous = category_from_position pos
  @array[pos] = int_from_cat category
  # Move the position from the old category's list to the new one's
  @cat_hash[previous].delete pos
  @cat_hash[category].push(pos)
end
912
+
913
# Reorders the categories and renumbers the integer codes in @array to
# match the new order.
# @param [Array] new the categories in their new order; must contain
#   exactly the current categories
# @return [Array] the position lists of the categories in the new order
# @raise [ArgumentError] if new and the current categories differ as sets
def order_with new
  unless new.to_set == categories.to_set
    raise ArgumentError, 'The contents of new and old order must be the same.'
  end

  # Rebuild @cat_hash so that its key order is the requested order
  @cat_hash = new.each_with_object({}) do |cat, reordered|
    reordered[cat] = @cat_hash[cat]
  end

  # A category's code is its rank in the new key order
  @array = Array.new(size)
  @cat_hash.each_with_index.map do |(_cat, positions), code|
    positions.each { |pos| @array[pos] = code }
  end
end
926
+
927
# Translates an integer code into its category.
# @param int integer code, i.e. the rank of the category among the keys
#   of @cat_hash
# @return the category at that rank
def cat_from_int int
  ordered_categories = @cat_hash.keys
  ordered_categories[int]
end
930
+
931
# Translates a category into its integer code.
# @param cat the category to look up
# @return [Integer, nil] rank of the category among the keys of
#   @cat_hash, or nil if it is not a key
def int_from_cat cat
  ordered_categories = @cat_hash.keys
  ordered_categories.find_index(cat)
end
934
+ end
935
+ end