daru 0.1.3.1 → 0.1.4

Sign up to get free protection for your applications and to get access to all the features.
Files changed (92) hide show
  1. checksums.yaml +4 -4
  2. data/.gitignore +2 -0
  3. data/.rspec +2 -1
  4. data/.rspec_formatter.rb +33 -0
  5. data/.rubocop.yml +26 -2
  6. data/History.md +38 -0
  7. data/README.md +22 -13
  8. data/Rakefile +50 -2
  9. data/benchmarks/csv_reading.rb +22 -0
  10. data/daru.gemspec +9 -2
  11. data/lib/daru.rb +36 -4
  12. data/lib/daru/accessors/array_wrapper.rb +6 -1
  13. data/lib/daru/accessors/dataframe_by_row.rb +10 -2
  14. data/lib/daru/accessors/gsl_wrapper.rb +1 -3
  15. data/lib/daru/accessors/nmatrix_wrapper.rb +9 -0
  16. data/lib/daru/category.rb +935 -0
  17. data/lib/daru/core/group_by.rb +29 -38
  18. data/lib/daru/core/merge.rb +186 -145
  19. data/lib/daru/core/query.rb +22 -11
  20. data/lib/daru/dataframe.rb +976 -885
  21. data/lib/daru/date_time/index.rb +166 -166
  22. data/lib/daru/date_time/offsets.rb +66 -77
  23. data/lib/daru/formatters/table.rb +54 -0
  24. data/lib/daru/helpers/array.rb +40 -0
  25. data/lib/daru/index.rb +476 -73
  26. data/lib/daru/io/io.rb +66 -45
  27. data/lib/daru/io/sql_data_source.rb +33 -62
  28. data/lib/daru/iruby/helpers.rb +38 -0
  29. data/lib/daru/iruby/templates/dataframe.html.erb +52 -0
  30. data/lib/daru/iruby/templates/dataframe_mi.html.erb +58 -0
  31. data/lib/daru/iruby/templates/multi_index.html.erb +12 -0
  32. data/lib/daru/iruby/templates/vector.html.erb +27 -0
  33. data/lib/daru/iruby/templates/vector_mi.html.erb +36 -0
  34. data/lib/daru/maths/arithmetic/dataframe.rb +16 -18
  35. data/lib/daru/maths/arithmetic/vector.rb +4 -6
  36. data/lib/daru/maths/statistics/dataframe.rb +8 -15
  37. data/lib/daru/maths/statistics/vector.rb +120 -98
  38. data/lib/daru/monkeys.rb +12 -40
  39. data/lib/daru/plotting/gruff.rb +3 -0
  40. data/lib/daru/plotting/gruff/category.rb +49 -0
  41. data/lib/daru/plotting/gruff/dataframe.rb +91 -0
  42. data/lib/daru/plotting/gruff/vector.rb +57 -0
  43. data/lib/daru/plotting/nyaplot.rb +3 -0
  44. data/lib/daru/plotting/nyaplot/category.rb +34 -0
  45. data/lib/daru/plotting/nyaplot/dataframe.rb +187 -0
  46. data/lib/daru/plotting/nyaplot/vector.rb +46 -0
  47. data/lib/daru/vector.rb +694 -421
  48. data/lib/daru/version.rb +1 -1
  49. data/profile/_base.rb +23 -0
  50. data/profile/df_to_a.rb +10 -0
  51. data/profile/filter.rb +13 -0
  52. data/profile/joining.rb +13 -0
  53. data/profile/sorting.rb +12 -0
  54. data/profile/vector_each_with_index.rb +9 -0
  55. data/spec/accessors/wrappers_spec.rb +2 -4
  56. data/spec/categorical_spec.rb +1734 -0
  57. data/spec/core/group_by_spec.rb +52 -2
  58. data/spec/core/merge_spec.rb +63 -2
  59. data/spec/core/query_spec.rb +236 -80
  60. data/spec/dataframe_spec.rb +1373 -79
  61. data/spec/date_time/data_spec.rb +3 -5
  62. data/spec/date_time/index_spec.rb +154 -17
  63. data/spec/date_time/offsets_spec.rb +3 -4
  64. data/spec/fixtures/empties.dat +2 -0
  65. data/spec/fixtures/strings.dat +2 -0
  66. data/spec/formatters/table_formatter_spec.rb +99 -0
  67. data/spec/helpers_spec.rb +8 -0
  68. data/spec/index/categorical_index_spec.rb +168 -0
  69. data/spec/index/index_spec.rb +283 -0
  70. data/spec/index/multi_index_spec.rb +570 -0
  71. data/spec/io/io_spec.rb +31 -4
  72. data/spec/io/sql_data_source_spec.rb +0 -1
  73. data/spec/iruby/dataframe_spec.rb +172 -0
  74. data/spec/iruby/helpers_spec.rb +49 -0
  75. data/spec/iruby/multi_index_spec.rb +37 -0
  76. data/spec/iruby/vector_spec.rb +107 -0
  77. data/spec/math/arithmetic/dataframe_spec.rb +71 -13
  78. data/spec/math/arithmetic/vector_spec.rb +8 -10
  79. data/spec/math/statistics/dataframe_spec.rb +3 -5
  80. data/spec/math/statistics/vector_spec.rb +45 -55
  81. data/spec/monkeys_spec.rb +32 -9
  82. data/spec/plotting/dataframe_spec.rb +386 -0
  83. data/spec/plotting/vector_spec.rb +230 -0
  84. data/spec/shared/vector_display_spec.rb +215 -0
  85. data/spec/spec_helper.rb +23 -0
  86. data/spec/vector_spec.rb +905 -138
  87. metadata +143 -11
  88. data/.rubocop_todo.yml +0 -44
  89. data/lib/daru/plotting/dataframe.rb +0 -104
  90. data/lib/daru/plotting/vector.rb +0 -38
  91. data/spec/daru_spec.rb +0 -58
  92. data/spec/index_spec.rb +0 -375
@@ -54,6 +54,11 @@ module Daru
54
54
  set_size
55
55
  end
56
56
 
57
+ def fill(*arg)
58
+ @data.fill(*arg)
59
+ set_size
60
+ end
61
+
57
62
  def uniq
58
63
  @data.uniq
59
64
  end
@@ -67,7 +72,7 @@ module Daru
67
72
  end
68
73
 
69
74
  def compact
70
- @data - @context.missing_values
75
+ @data - Daru::MISSING_VALUES
71
76
  end
72
77
 
73
78
  def mean
@@ -9,8 +9,16 @@ module Daru
9
9
  @data_frame[*names, :row]
10
10
  end
11
11
 
12
- def []=(name, vector)
13
- @data_frame[name, :row] = vector
12
+ def []=(*names, vector)
13
+ @data_frame[*names, :row] = vector
14
+ end
15
+
16
+ def at *positions
17
+ @data_frame.row_at(*positions)
18
+ end
19
+
20
+ def set_at positions, vector
21
+ @data_frame.set_row_at(positions, vector)
14
22
  end
15
23
  end
16
24
  end
@@ -61,9 +61,7 @@ module Daru
61
61
  attr_reader :data
62
62
 
63
63
  def compact
64
- # set missing to [] incase @context is not Daru::Vector
65
- missing = @context.missing_values rescue []
66
- ::GSL::Vector.alloc(@data.to_a - missing.map(&:to_f))
64
+ ::GSL::Vector.alloc(@data.to_a - [Float::NAN])
67
65
  end
68
66
 
69
67
  [:mean, :min, :max, :prod, :sum].each do |method|
@@ -14,9 +14,12 @@ module Daru
14
14
  self
15
15
  end
16
16
 
17
+ # :nocov:
18
+ # FIXME: not sure why this kind of wrapper has such poor coverage
17
19
  def inject(*args, &block)
18
20
  @data[0...@size].inject(*args, &block)
19
21
  end
22
+ # :nocov:
20
23
 
21
24
  attr_reader :size, :data, :nm_dtype
22
25
 
@@ -43,9 +46,11 @@ module Daru
43
46
  @data[index] = value
44
47
  end
45
48
 
49
+ # :nocov:
46
50
  def == other
47
51
  @data[0...@size] == other[0...@size] and @size == other.size
48
52
  end
53
+ # :nocov:
49
54
 
50
55
  def delete_at index
51
56
  arry = @data.to_a
@@ -58,10 +63,12 @@ module Daru
58
63
  @data.to_a.index key
59
64
  end
60
65
 
66
+ # :nocov:
61
67
  def << element
62
68
  resize if @size >= @data.size
63
69
  self[@size] = element
64
70
  end
71
+ # :nocov:
65
72
 
66
73
  def to_a
67
74
  @data[0...@size].to_a
@@ -77,6 +84,7 @@ module Daru
77
84
  @data = NMatrix.new [size], @data.to_a, dtype: @nm_dtype
78
85
  end
79
86
 
87
+ # :nocov:
80
88
  def mean
81
89
  @data[0...@size].mean.first
82
90
  end
@@ -96,6 +104,7 @@ module Daru
96
104
  def min
97
105
  @data[0...@size].min
98
106
  end
107
+ # :nocov:
99
108
  end
100
109
  end
101
110
  end if Daru.has_nmatrix?
@@ -0,0 +1,935 @@
1
+ module Daru
2
+ module Category # rubocop:disable Metrics/ModuleLength
3
+ attr_accessor :base_category
4
+ attr_reader :index, :coding_scheme, :name
5
+
6
+ # For debugging. To be removed
7
+ attr_reader :array, :cat_hash, :map_int_cat
8
+
9
+ # Initializes a vector to store categorical data.
10
+ # @note Base category is set to the first category encountered in the vector.
11
+ # @param [Array] data the categorical data
12
+ # @param [Hash] opts the options
13
+ # @option opts [Boolean] :ordered true if data is ordered, false otherwise
14
+ # @option opts [Array] :categories categories to associate with the vector.
15
+ # It adds extra categories if specified and also provides the order of categories.
16
+ # @option opts [object] :index gives index to vector. By default it's from 0 to size-1
17
+ # @return the categorical data created
18
+ # @example
19
+ # dv = Daru::Vector.new [:a, 1, :a, 1, :c],
20
+ # type: :category,
21
+ # ordered: true,
22
+ # categories: [:a, :b, :c, 1]
23
+ # # => #<Daru::Vector(5)>
24
+ # # 0 a
25
+ # # 1 1
26
+ # # 2 a
27
+ # # 3 1
28
+ # # 4 c
29
+ def initialize_category data, opts={}
30
+ @type = :category
31
+ initialize_core_attributes data
32
+
33
+ if opts[:categories]
34
+ validate_categories(opts[:categories])
35
+ add_extra_categories(opts[:categories] - categories)
36
+ order_with opts[:categories]
37
+ end
38
+
39
+ # Specify if the categories are ordered or not.
40
+ # By default it's unordered
41
+ @ordered = opts[:ordered] || false
42
+
43
+ # The coding scheme to code with. Default is dummy coding.
44
+ @coding_scheme = :dummy
45
+
46
+ # Base category which won't be present in the coding
47
+ @base_category = @cat_hash.keys.first
48
+
49
+ # Stores the name of the vector
50
+ @name = opts[:name]
51
+
52
+ # Index of the vector
53
+ @index = coerce_index opts[:index]
54
+
55
+ self
56
+ end
57
+
58
+ def name= new_name
59
+ @name = new_name
60
+ self
61
+ end
62
+
63
+ def plotting_library= lib
64
+ case lib
65
+ when :gruff, :nyaplot
66
+ @plotting_library = lib
67
+ extend Module.const_get(
68
+ "Daru::Plotting::Category::#{lib.to_s.capitalize}Library"
69
+ ) if Daru.send("has_#{lib}?".to_sym)
70
+ else
71
+ raise ArguementError, "Plotting library #{lib} not supported. "\
72
+ 'Supported libraries are :nyaplot and :gruff'
73
+ end
74
+ end
75
+
76
+ alias_method :rename, :name=
77
+
78
+ # Returns an enumerator that enumerates on categorical data
79
+ # @return [Enumerator] an enumerator that enumerates over data stored in vector
80
+ def each
81
+ return enum_for(:each) unless block_given?
82
+ @array.each { |pos| yield cat_from_int pos }
83
+ self
84
+ end
85
+
86
+ # Returns all categorical data
87
+ # @return [Array] array of all categorical data which vector is storing
88
+ # @example
89
+ # dv = Daru::Vector.new [:a, 1, :a, 1, :c], type: :category
90
+ # dv.to_a
91
+ # # => [:a, 1, :a, 1, :c]
92
+ def to_a
93
+ each.to_a
94
+ end
95
+
96
+ # Duplicates a vector
97
+ # @return [Daru::Vector] duplicated vector
98
+ # @example
99
+ # dv = Daru::Vector.new [:a, 1, :a, 1, :c], type: :category
100
+ # dv.dup
101
+ # # => #<Daru::Vector(5)>
102
+ # # 0 a
103
+ # # 1 1
104
+ # # 2 a
105
+ # # 3 1
106
+ # # 4 c
107
+ def dup
108
+ Daru::Vector.new to_a.dup,
109
+ name: @name,
110
+ index: @index.dup,
111
+ type: :category,
112
+ categories: categories,
113
+ ordered: ordered?
114
+ end
115
+
116
+ # Associates a category to the vector.
117
+ # @param [Array] *new_categories new categories to be associated
118
+ # @example
119
+ # dv = Daru::Vector.new [:a, 1, :a, 1, :c], type: :category
120
+ # dv.add_category :b
121
+ # dv.categories
122
+ # # => [:a, :b, :c, 1]
123
+ def add_category(*new_categories)
124
+ new_categories -= categories
125
+ add_extra_categories new_categories
126
+ end
127
+
128
+ # Returns frequency of given category
129
+ # @param [object] category given category whose count has to be found
130
+ # @return count/frequency of given category
131
+ # @example
132
+ # dv = Daru::Vector.new [:a, 1, :a, 1, :c], type: :category
133
+ # dv.count :a
134
+ # # => 2
135
+ def count category
136
+ raise ArgumentError, "Invalid category #{category}" unless
137
+ categories.include?(category)
138
+
139
+ @cat_hash[category].size
140
+ end
141
+
142
+ # Returns a vector storing count/frequency of each category
143
+ # @return [Daru::Vector] Return a vector whose indexes are categories
144
+ # and corresponding values are its count
145
+ # @example
146
+ # dv = Daru::Vector.new [:a, 1, :a, 1, :c], type: :category
147
+ # dv.frequencies
148
+ # # => #<Daru::Vector(4)>
149
+ # # a 2
150
+ # # b 0
151
+ # # c 1
152
+ # # 1 2
153
+ def frequencies type=:count
154
+ counts = @cat_hash.values.map(&:size)
155
+ values =
156
+ case type
157
+ when :count
158
+ counts
159
+ when :fraction
160
+ counts.map { |c| c / size.to_f }
161
+ when :percentage
162
+ counts.map { |c| c / size.to_f * 100 }
163
+ end
164
+ Daru::Vector.new values, index: categories, name: name
165
+ end
166
+
167
+ # Returns vector for indexes/positions specified
168
+ # @param [Array] *indexes indexes/positions for which values have to be retrieved
169
+ # @note Since it accepts both indexes and positions. In case of collision,
170
+ # argument will be treated as index
171
+ # @return vector containing values specified at specified indexes/positions
172
+ # @example
173
+ # dv = Daru::Vector.new [:a, 1, :a, 1, :c],
174
+ # type: :category,
175
+ # index: 'a'..'e'
176
+ # dv[:a, 1]
177
+ # # => #<Daru::Vector(2)>
178
+ # # a a
179
+ # # b 1
180
+ # dv[0, 1]
181
+ # # => #<Daru::Vector(2)>
182
+ # # a a
183
+ # # b 1
184
+ def [] *indexes
185
+ positions = @index.pos(*indexes)
186
+ return category_from_position(positions) if positions.is_a? Integer
187
+
188
+ Daru::Vector.new positions.map { |pos| category_from_position pos },
189
+ index: @index.subset(*indexes),
190
+ name: @name,
191
+ type: :category,
192
+ ordered: @ordered,
193
+ categories: categories
194
+ end
195
+
196
+ # Returns vector for positions specified.
197
+ # @param [Array] *positions positions at which values are to be retrieved.
198
+ # @return vector containing values specified at specified positions
199
+ # @example
200
+ # dv = Daru::Vector.new [:a, 1, :a, 1, :c], type: :category
201
+ # dv.at 0..-2
202
+ # # => #<Daru::Vector(4)>
203
+ # # 0 a
204
+ # # 1 1
205
+ # # 2 a
206
+ # # 3 1
207
+ def at *positions
208
+ original_positions = positions
209
+ positions = coerce_positions(*positions)
210
+ validate_positions(*positions)
211
+
212
+ return category_from_position(positions) if positions.is_a? Integer
213
+
214
+ Daru::Vector.new positions.map { |pos| category_from_position(pos) },
215
+ index: @index.at(*original_positions),
216
+ name: @name,
217
+ type: :category,
218
+ ordered: @ordered,
219
+ categories: categories
220
+ end
221
+
222
+ # Modifies values at specified indexes/positions.
223
+ # @note In order to add a new category you need to associate it via #add_category
224
+ # @param [Array] *indexes indexes/positions at which to modify value
225
+ # @param [object] val value to assign at specific indexes/positions
226
+ # @return modified vector
227
+ # @example
228
+ # dv = Daru::Vector.new [:a, 1, :a, 1, :c], type: :category
229
+ # dv.add_category :b
230
+ # dv[0] = :b
231
+ # dv
232
+ # # => #<Daru::Vector(5)>
233
+ # # 0 b
234
+ # # 1 1
235
+ # # 2 a
236
+ # # 3 1
237
+ # # 4 c
238
+ def []= *indexes, val
239
+ positions = @index.pos(*indexes)
240
+
241
+ if positions.is_a? Numeric
242
+ modify_category_at positions, val
243
+ else
244
+ positions.each { |pos| modify_category_at pos, val }
245
+ end
246
+ self
247
+ end
248
+
249
+ # Modifies values at specified positions.
250
+ # @param [Array] positions positions at which to modify value
251
+ # @param [object] val value to assign at specific positions
252
+ # @return modified vector
253
+ # @example
254
+ # dv = Daru::Vector.new [:a, 1, :a, 1, :c], type: :category
255
+ # dv.add_category :b
256
+ # dv.set_at [0, 1], :b
257
+ # # => #<Daru::Vector(5)>
258
+ # # 0 b
259
+ # # 1 b
260
+ # # 2 a
261
+ # # 3 1
262
+ # # 4 c
263
+ def set_at positions, val
264
+ validate_positions(*positions)
265
+ positions.map { |pos| modify_category_at pos, val }
266
+ self
267
+ end
268
+
269
+ # Size of categorical data.
270
+ # @return total number of values in the vector
271
+ # @example
272
+ # dv = Daru::Vector.new [:a, 1, :a, 1, :c], type: :category
273
+ # dv.size
274
+ # # => 5
275
+ def size
276
+ @array.size
277
+ end
278
+
279
+ # Tells whether vector is ordered or not.
280
+ # @return [Boolean] true if vector is ordered, false otherwise
281
+ # @example
282
+ # dv = Daru::Vector.new [:a, 1, :a, 1, :c], type: :category
283
+ # dv.ordered?
284
+ # # => false
285
+ def ordered?
286
+ @ordered
287
+ end
288
+
289
+ # Make categorical data ordered or unordered.
290
+ # @param [Boolean] bool true if categorical data is to be ordered, false otherwise
291
+ # @example
292
+ # dv = Daru::Vector.new [:a, 1, :a, 1, :c], type: :category
293
+ # dv.ordered = true
294
+ # dv.ordered?
295
+ # # => true
296
+ def ordered= bool
297
+ @ordered = bool
298
+ end
299
+
300
+ # Returns all the categories with the inherent order
301
+ # @return [Array] categories of the vector with the order
302
+ # @example
303
+ # dv = Daru::Vector.new [:a, 1, :a, 1, :c],
304
+ # type: :category,
305
+ # categories: [:a, :b, :c, 1]
306
+ # dv.categories
307
+ # # => [:a, :b, :c, 1]
308
+ def categories
309
+ @cat_hash.keys
310
+ end
311
+
312
+ alias_method :order, :categories
313
+
314
+ # Sets order of the categories.
315
+ # @note If extra categories are specified, they get added too.
316
+ # @param [Array] cat_with_order categories specifying their order
317
+ # @example
318
+ # dv = Daru::Vector.new [:a, 1, :a, 1, :c], type: :category
319
+ # dv.categories = [:a, :b, :c, 1]
320
+ # dv.categories
321
+ # # => [:a, :b, :c, 1]
322
+ def categories= cat_with_order
323
+ validate_categories(cat_with_order)
324
+ add_extra_categories(cat_with_order - categories)
325
+ order_with cat_with_order
326
+ end
327
+
328
+ # Rename categories.
329
+ # @note The order of categories after renaming is preserved but new categories
330
+ # are added at the end in the order. Also the base-category is reassigned
331
+ # to new value if it is renamed
332
+ # @param [Hash] old_to_new a hash mapping categories whose name to be changed
333
+ # to their new names
334
+ # @example
335
+ # dv = Daru::Vector.new [:a, 1, :a, 1, :c], type: :category
336
+ # dv.rename_categories :a => :b
337
+ # dv
338
+ # # => #<Daru::Vector(5)>
339
+ # # 0 b
340
+ # # 1 1
341
+ # # 2 b
342
+ # # 3 1
343
+ # # 4 c
344
+ def rename_categories old_to_new
345
+ old_categories = categories
346
+ data = to_a.map do |cat|
347
+ old_to_new.include?(cat) ? old_to_new[cat] : cat
348
+ end
349
+
350
+ initialize_core_attributes data
351
+ self.categories = (old_categories - old_to_new.keys) | old_to_new.values
352
+ self.base_category = old_to_new[base_category] if
353
+ old_to_new.include? base_category
354
+ self
355
+ end
356
+
357
+ # Removes the unused categories
358
+ # @note If base category is removed, then the first occurring category in the
359
+ # data is taken as base category. Order of the undeleted categories
360
+ # remains preserved.
361
+ # @return [Daru::Vector] Makes changes in the vector itself i.e. deletes
362
+ # the unused categories and returns itself
363
+ # @example
364
+ # dv = Daru::Vector.new [:one, :two, :one], type: :category,
365
+ # categories: [:three, :two, :one]
366
+ # dv.remove_unused_categories
367
+ # dv.categories
368
+ # # => [:two, :one]
369
+ def remove_unused_categories
370
+ old_categories = categories
371
+
372
+ initialize_core_attributes to_a
373
+ self.categories = old_categories & categories
374
+ self.base_category = @cat_hash.keys.first unless
375
+ categories.include? base_category
376
+ self
377
+ end
378
+
379
+ # Returns the minimum category according to the order specified.
380
+ # @note This operation will only work if vector is ordered.
381
+ # To set the vector ordered do `vector.ordered = true`
382
+ # @return [object] the minimum category according to the order
383
+ # @example
384
+ # dv = Daru::Vector.new ['second', 'second', 'third', 'first'],
385
+ # categories: ['first', 'second', 'third']
386
+ # dv.min
387
+ # # => 'first'
388
+ def min
389
+ assert_ordered :min
390
+ categories.first
391
+ end
392
+
393
+ # Returns the maximum category according to the order specified.
394
+ # @note This operation will only work if vector is ordered.
395
+ # To set the vector ordered do `vector.ordered = true`
396
+ # @return [object] the maximum category according to the order
397
+ # @example
398
+ # dv = Daru::Vector.new ['second', 'second', 'third', 'first'],
399
+ # categories: ['first', 'second', 'third']
400
+ # dv.max
401
+ # # => 'third'
402
+ def max
403
+ assert_ordered :max
404
+ categories.last
405
+ end
406
+
407
+ # Sorts the vector in the order specified.
408
+ # @note This operation will only work if vector is ordered.
409
+ # To set the vector ordered, do `vector.ordered = true`
410
+ # @return [Daru::Vector] sorted vector
411
+ # @example
412
+ # dv = Daru::Vector.new ['second', 'second', 'third', 'first'],
413
+ # categories: ['first', 'second', 'third'],
414
+ # type: :category,
415
+ # ordered: true
416
+ # dv.sort!
417
+ # # => #<Daru::Vector(4)>
418
+ # # 3 first
419
+ # # 0 second
420
+ # # 1 second
421
+ # # 2 third
422
+ def sort! # rubocop:disable Metrics/AbcSize
423
+ # TODO: Simplify the code
424
+ assert_ordered :sort
425
+
426
+ # Build sorted index
427
+ old_index = @index.to_a
428
+ new_index = @cat_hash.values.map do |positions|
429
+ old_index.values_at(*positions)
430
+ end.flatten
431
+ @index = @index.class.new new_index
432
+
433
+ # Build sorted data
434
+ @cat_hash = categories.inject([{}, 0]) do |acc, cat|
435
+ hash, count = acc
436
+ cat_count = @cat_hash[cat].size
437
+ cat_count.times { |i| @array[count+i] = int_from_cat(cat) }
438
+ hash[cat] = (count...(cat_count+count)).to_a
439
+ [hash, count + cat_count]
440
+ end.first
441
+
442
+ self
443
+ end
444
+
445
+ def sort
446
+ dup.sort!
447
+ end
448
+
449
+ # Set coding scheme
450
+ # @param [Symbol] scheme to set
451
+ # @example
452
+ # dv = Daru::Vector.new [:a, 1, :a, 1, :c], type: :category
453
+ # dv.coding_scheme = :deviation
454
+ # dv.coding_scheme
455
+ # # => :deviation
456
+ def coding_scheme= scheme
457
+ raise ArgumentError, "Unknown or unsupported coding scheme #{scheme}." unless
458
+ CODING_SCHEMES.include? scheme
459
+ @coding_scheme = scheme
460
+ end
461
+
462
+ CODING_SCHEMES = [:dummy, :deviation, :helmert, :simple].freeze
463
+
464
+ # Contrast code the vector according to the coding scheme set.
465
+ # @note To set the coding scheme use #coding_scheme=
466
+ # @param [true, false] full true if you want k variables for k categories,
467
+ # false if you want k-1 variables for k categories
468
+ # @return [Daru::DataFrame] dataframe containing all coded variables
469
+ # @example
470
+ # dv = Daru::Vector.new [:a, 1, :a, 1, :c], type: :category
471
+ # dv.contrast_code
472
+ # # => #<Daru::DataFrame(5x2)>
473
+ # # daru_1 daru_c
474
+ # # 0 0 0
475
+ # # 1 1 0
476
+ # # 2 0 0
477
+ # # 3 1 0
478
+ # # 4 0 1
479
+ def contrast_code opts={}
480
+ if opts[:user_defined]
481
+ user_defined_coding(opts[:user_defined])
482
+ else
483
+ # TODO: Make various coding schemes code DRY
484
+ send("#{coding_scheme}_coding".to_sym, opts[:full] || false)
485
+ end
486
+ end
487
+
488
+ # Two categorical vectors are equal if their index and corresponding values are same
489
+ # @return [true, false] true if two vectors are similar
490
+ # @example
491
+ # dv = Daru::Vector.new [:a, 1, :a, 1, :c], type: :category
492
+ # other = Daru::Vector.new [:a, 1, :a, 1, :c],
493
+ # type: :category,
494
+ # index: 1..5
495
+ # dv == other
496
+ # # => false
497
+ def == other
498
+ size == other.size &&
499
+ to_a == other.to_a &&
500
+ index == other.index
501
+ end
502
+
503
+ # Returns integer coding for categorical data in the order starting from 0.
504
+ # For example if order is [:a, :b, :c], then :a, will be coded as 0, :b as 1 and :c as 2
505
+ # @return [Array] integer coding of all values of vector
506
+ # @example
507
+ # dv = Daru::Vector.new [:a, 1, :a, 1, :c],
508
+ # type: :category,
509
+ # categories: [:a, :b, :c, 1]
510
+ # dv.to_ints
511
+ # # => [0, 1, 0, 1, 2]
512
+ def to_ints
513
+ @array
514
+ end
515
+
516
+ # Reorder the vector with given positions
517
+ # @note Unlike #reindex! which takes index as input, it takes
518
+ # positions as an input to reorder the vector
519
+ # @param [Array] order the order to reorder the vector with
520
+ # @return reordered vector
521
+ # @example
522
+ # dv = Daru::Vector.new [3, 2, 1], index: ['c', 'b', 'a'], type: :category
523
+ # dv.reorder! [2, 1, 0]
524
+ # # => #<Daru::Vector(3)>
525
+ # # a 1
526
+ # # b 2
527
+ # # c 3
528
+ def reorder! order
529
+ raise ArgumentError, 'Invalid order specified' unless
530
+ order.sort == size.times.to_a
531
+ # TODO: Room for optimization
532
+ old_data = to_a
533
+ new_data = order.map { |i| old_data[i] }
534
+ initialize_core_attributes new_data
535
+ self
536
+ end
537
+
538
+ # Sets new index for vector. Preserves index->value correspondence.
539
+ # @note Unlike #reorder! which takes positions as input it takes
540
+ # index as an input to reorder the vector
541
+ # @param [Daru::Index, Daru::MultiIndex, Array] idx new index to order with
542
+ # @return [Daru::Vector] vector reindexed with new index
543
+ # @example
544
+ # dv = Daru::Vector.new [3, 2, 1], index: ['c', 'b', 'a'], type: :category
545
+ # dv.reindex! ['a', 'b', 'c']
546
+ # # => #<Daru::Vector(3)>
547
+ # # a 1
548
+ # # b 2
549
+ # # c 3
550
+ def reindex! idx
551
+ idx = Daru::Index.new idx unless idx.is_a? Daru::Index
552
+ raise ArgumentError, 'Invalid index specified' unless
553
+ idx.to_a.sort == index.to_a.sort
554
+
555
+ old_categories = categories
556
+ data = idx.map { |i| self[i] }
557
+ initialize_core_attributes data
558
+ self.categories = old_categories
559
+ self.index = idx
560
+ self
561
+ end
562
+
563
+ {
564
+ eq: :==,
565
+ not_eq: :!=,
566
+ lt: :<,
567
+ lteq: :<=,
568
+ mt: :>,
569
+ mteq: :>=
570
+ }.each do |method, operator|
571
+ define_method(method) do |other|
572
+ mod = Daru::Core::Query
573
+ if other.is_a?(Daru::Vector)
574
+ mod.apply_vector_operator operator, to_ints, other.to_ints
575
+ else
576
+ mod.apply_scalar_operator operator, @array, int_from_cat(other)
577
+ end
578
+ end
579
+ end
580
+ alias :gt :mt
581
+ alias :gteq :mteq
582
+
583
+ # For querying the data
584
+ # @param [object] arel like query syntax
585
+ # @return [Daru::Vector] Vector which makes the conditions true
586
+ # @example
587
+ # dv = Daru::Vector.new ['I', 'II', 'I', 'III', 'I', 'II'],
588
+ # type: :category,
589
+ # ordered: true,
590
+ # categories: ['I', 'II', 'III']
591
+ # dv.where(dv.mt('I') & dv.lt('III'))
592
+ # # => #<Daru::Vector(2)>
593
+ # # 1 II
594
+ # # 5 II
595
+ def where bool_array
596
+ Daru::Core::Query.vector_where self, bool_array
597
+ end
598
+
599
+ # Gives the summary of data using following parameters
600
+ # - size: size of the data
601
+ # - categories: total number of categories
602
+ # - max_freq: Max no of times a category occurs
603
+ # - max_category: The category which occurs max no of times
604
+ # - min_freq: Min no of times a category occurs
605
+ # - min_category: The category which occurs min no of times
606
+ # @return [Daru::Vector] Vector with index as following parameters
607
+ # and values as values to these parameters
608
+ # @example
609
+ # dv = Daru::Vector.new [:a, 1, :a, 1, :c], type: :category
610
+ # dv.describe
611
+ # # => #<Daru::Vector(6)>
612
+ # # size 5
613
+ # # categories 3
614
+ # # max_freq 2
615
+ # # max_category a
616
+ # # min_freq 1
617
+ # # min_category c
618
+ def describe
619
+ Daru::Vector.new(
620
+ size: size,
621
+ categories: categories.size,
622
+ max_freq: @cat_hash.values.map(&:size).max,
623
+ max_category: @cat_hash.keys.max_by { |cat| @cat_hash[cat].size },
624
+ min_freq: @cat_hash.values.map(&:size).min,
625
+ min_category: @cat_hash.keys.min_by { |cat| @cat_hash[cat].size }
626
+ )
627
+ end
628
+
629
+ # Does nothing since it's already of type category.
630
+ # @return [Daru::Vector] categorical vector
631
+ def to_category
632
+ self
633
+ end
634
+
635
+ # Converts a category type vector to non category type vector
636
+ # @return [Daru::Vector] non category type vector
637
+ def to_non_category
638
+ Daru::Vector.new to_a, name: name, index: index
639
+ end
640
+
641
+ # Sets index of the vector
642
+ # @param [Daru::Index, Daru::MultiIndex, Daru::CategoricalIndex, Array, Range]
643
+ # idx new index to assign to vector
644
+ # @return [Daru::Index, Daru::CategoricalIndex, Daru::MultiIndex] the index assigned
645
+ # @example
646
+ # dv = Daru::Vector.new [1, 2, 3], type: :category
647
+ # dv.index = 'a'..'c'
648
+ # dv
649
+ # # => #<Daru::Vector(3)>
650
+ # # a 1
651
+ # # b 2
652
+ # # c 3
653
+ def index= idx
654
+ @index = coerce_index idx
655
+ end
656
+
657
+ # Check if any one of mentioned values occur in the vector
658
+ # @param [Array] *values values to check for
659
+ # @return [true, false] returns true if any one of specified values
660
+ # occur in the vector
661
+ # @example
662
+ # dv = Daru::Vector.new [1, 2, 3, 4, nil]
663
+ # dv.include_values? nil, Float::NAN
664
+ # # => true
665
+ def include_values?(*values)
666
+ values.any? { |v| @cat_hash.include?(v) && !@cat_hash[v].empty? }
667
+ end
668
+
669
+ # Return a vector with specified values removed
670
+ # @param [Array] *values values to reject from resultant vector
671
+ # @return [Daru::Vector] vector with specified values removed
672
+ # @example
673
+ # dv = Daru::Vector.new [1, 2, nil, Float::NAN], type: :category
674
+ # dv.reject_values nil, Float::NAN
675
+ # # => #<Daru::Vector(2)>
676
+ # # 0 1
677
+ # # 1 2
678
+ def reject_values(*values)
679
+ resultant_pos = size.times.to_a - values.flat_map { |v| @cat_hash[v] }
680
+ dv = at(*resultant_pos)
681
+ unless dv.is_a? Daru::Vector
682
+ pos = resultant_pos.first
683
+ dv = at(pos..pos)
684
+ end
685
+ dv.remove_unused_categories
686
+ end
687
+
688
+ # Count the number of values specified
689
+ # @param [Array] *values values to count for
690
+ # @return [Integer] the number of times the values mentioned occurs
691
+ # @example
692
+ # dv = Daru::Vector.new [1, 2, 1, 2, 3, 4, nil, nil]
693
+ # dv.count_values nil
694
+ # # => 2
695
+ def count_values(*values)
696
+ values.map { |v| @cat_hash[v].size if @cat_hash.include? v }
697
+ .compact
698
+ .inject(0, :+)
699
+ end
700
+
701
+ # Return indexes of values specified
702
+ # @param [Array] *values values to find indexes for
703
+ # @return [Array] array of indexes of values specified
704
+ # @example
705
+ # dv = Daru::Vector.new [1, 2, nil, Float::NAN], index: 11..14
706
+ # dv.indexes nil, Float::NAN
707
+ # # => [13, 14]
708
+ def indexes(*values)
709
+ values &= categories
710
+ index.to_a.values_at(*values.flat_map { |v| @cat_hash[v] }.sort)
711
+ end
712
+
713
+ # Replaces specified values with a new value
714
+ # @param [Array] old_values array of values to replace
715
+ # @param [object] new_value new value to replace with
716
+ # @note It performs the replace in place.
717
+ # @return [Daru::Vector] Same vector itself with values
718
+ # replaced with new value
719
+ # @example
720
+ # dv = Daru::Vector.new [1, 2, :a, :b]
721
+ # dv.replace_values [:a, :b], nil
722
+ # dv
723
+ # # =>
724
+ # # #<Daru::Vector:19903200 @name = nil @metadata = {} @size = 4 >
725
+ # # nil
726
+ # # 0 1
727
+ # # 1 2
728
+ # # 2 nil
729
+ # # 3 nil
730
+ def replace_values old_values, new_value
731
+ old_values = [old_values] unless old_values.is_a? Array
732
+ rename_hash = old_values.map { |v| [v, new_value] }.to_h
733
+ rename_categories rename_hash
734
+ end
735
+
736
+ def positions(*values)
737
+ values &= categories
738
+ values.flat_map { |v| @cat_hash[v] }.sort
739
+ end
740
+
741
+ private
742
+
743
+ def validate_categories input_categories
744
+ raise ArgumentError, 'Input categories and speculated categories mismatch' unless
745
+ (categories - input_categories).empty?
746
+ end
747
+
748
+ def add_extra_categories extra_categories
749
+ extra_categories.each { |cat| @cat_hash[cat] = [] }
750
+ end
751
+
752
+ def initialize_core_attributes data
753
+ # Create a hash to map each category to positional indexes
754
+ categories = data.each_with_index.group_by(&:first)
755
+ @cat_hash = categories.map { |cat, group| [cat, group.map(&:last)] }.to_h
756
+
757
+ # Map each category to a unique integer for effective storage in @array
758
+ map_cat_int = categories.keys.each_with_index.to_h
759
+
760
+ # To link every instance to its category,
761
+ # it stores integer for every instance representing its category
762
+ @array = map_cat_int.values_at(*data)
763
+
764
+ # Include plotting functionality
765
+ self.plotting_library = Daru.plotting_library
766
+ end
767
+
768
# Returns the category of the element at +position+ by decoding the integer
# kept for it in @array.
def category_from_position(position)
  code = @array[position]
  cat_from_int(code)
end
771
+
772
# Guard for operations that require ordered categorical data: raises
# ArgumentError naming +operation+ when #ordered? is false.
def assert_ordered(operation)
  return if ordered?

  # TODO: Change ArgumentError to something more expressive
  message = "Can not apply #{operation} when vector is unordered. " \
    'To make the categorical data ordered, use #ordered = true'
  raise ArgumentError, message
end
778
+
779
# Builds a DataFrame with one column per category, each produced by
# dummy_code from that category's positions. Unless +full+ is truthy,
# base_category is left out.
def dummy_coding(full)
  selected = @cat_hash.keys
  unless full
    base = base_category
    selected = selected.reject { |cat| cat == base }
  end

  columns = selected.map { |cat| dummy_code(@cat_hash[cat]) }

  Daru::DataFrame.new(columns, index: @index, order: create_names(selected))
end
791
+
792
# Returns an array of length +size+ holding 1 at each of the given
# positions and 0 everywhere else.
def dummy_code(positions)
  positions.each_with_object(Array.new(size, 0)) do |pos, flags|
    flags[pos] = 1
  end
end
797
+
798
# Builds a DataFrame with one column per category, each produced by
# simple_code from that category's positions. Unless +full+ is truthy,
# base_category is left out.
def simple_coding(full)
  selected = @cat_hash.keys
  unless full
    base = base_category
    selected = selected.reject { |cat| cat == base }
  end

  columns = selected.map { |cat| simple_code(@cat_hash[cat]) }

  Daru::DataFrame.new(columns, index: @index, order: create_names(selected))
end
810
+
811
# Returns an array of length +size+ holding (n-1)/n at each of the given
# positions and -1/n everywhere else, where n is the number of categories.
def simple_code(positions)
  total = @cat_hash.size.to_f
  member = (total - 1) / total
  other = -1 / total

  positions.each_with_object(Array.new(size, other)) do |pos, codes|
    codes[pos] = member
  end
end
817
+
818
# Builds a DataFrame with one column, produced by helmert_code, for every
# category except the last. Any arguments are accepted and ignored.
def helmert_coding(*)
  coded = @cat_hash.keys[0..-2]
  columns = Array.new(coded.size) { |level| helmert_code(level) }

  Daru::DataFrame.new(columns, index: @index, order: create_names(coded))
end
829
+
830
# Returns one value per element of @array for the category numbered +index+.
# With n = (number of categories - index): elements of that category get
# (n-1)/n, elements of later categories get -1/n, earlier ones get 0.
def helmert_code(index)
  remaining = (categories.size - index).to_f
  same = (remaining - 1) / remaining
  later = -1 / remaining

  @array.map do |code|
    next same if code == index

    code > index ? later : 0
  end
end
843
+
844
# Builds a DataFrame with one column, produced by deviation_code, for every
# category except the last. Any arguments are accepted and ignored.
def deviation_coding(*)
  coded = @cat_hash.keys[0..-2]
  columns = Array.new(coded.size) { |level| deviation_code(level) }

  Daru::DataFrame.new(columns, index: @index, order: create_names(coded))
end
855
+
856
# Returns one value per element of @array for the category numbered +index+:
# 1 for elements of that category, -1 for elements of the last category,
# 0 for all others.
def deviation_code(index)
  final = categories.size - 1

  @array.map do |code|
    if index == code
      1
    elsif final == code
      -1
    else
      0
    end
  end
end
866
+
867
# Builds a DataFrame with +size+ rows, where row i is df.row[at(i)]
# converted to an array; columns are named after +df+'s vectors.
def user_defined_coding(df)
  coded_rows = Array.new(size) { |pos| df.row[at(pos)].to_a }

  Daru::DataFrame.rows(coded_rows, index: @index, order: df.vectors.to_a)
end
872
+
873
# Returns one "<name>_<category>" label per category, as a Symbol when the
# vector's name is a Symbol and as a String otherwise.
def create_names(categories)
  categories.map do |cat|
    label = "#{name}_#{cat}"
    name.is_a?(Symbol) ? label.to_sym : label
  end
end
878
+
879
# Normalises +index+ into a Daru index object and checks it against the
# vector's size.
#
# @param index [Daru::Index, Daru::MultiIndex, Daru::CategoricalIndex, Range, Array, nil]
#   an existing index (used as is), a Range or Array passed to
#   Daru::Index.new, or nil to build an index from the vector's size
# @return the resulting index object
# @raise [ArgumentError] if +index+ is of any other type, or if
#   validate_index rejects the result
def coerce_index(index)
  coerced =
    case index
    when Daru::MultiIndex, Daru::CategoricalIndex, Daru::Index
      index
    when nil
      Daru::Index.new(size)
    when Range
      Daru::Index.new(index.to_a)
    when Array
      Daru::Index.new(index)
    else
      raise ArgumentError, "Unrecognized index type #{index.class}"
    end

  validate_index(coerced)
  coerced
end
896
+
897
# Ensures +index+ has exactly as many entries as the vector.
#
# @param index [#size] index to check
# @return [nil] when the sizes agree
# @raise [ArgumentError] when the sizes differ
def validate_index(index)
  return if size == index.size

  # TODO: raise a dedicated SizeError instead of ArgumentError
  raise ArgumentError,
        "Size of index (#{index.size}) does not match " \
        "size of vector (#{size})"
end
902
+
903
# Reassigns the element at +pos+ to +category+, updating both @array and
# @cat_hash. Raises ArgumentError when +category+ is not an existing
# category.
def modify_category_at(pos, category)
  unless categories.include?(category)
    raise ArgumentError, "Invalid category #{category}, "\
      'to add a new category use #add_category'
  end

  # Read the previous category before @array is overwritten.
  previous = category_from_position(pos)
  @array[pos] = int_from_cat(category)
  @cat_hash[previous].delete(pos)
  @cat_hash[category] << pos
end
912
+
913
# Reorders @cat_hash to follow +new+ and rebuilds @array so that each
# element stores its category's index in the new order.
# Raises ArgumentError unless +new+ and the existing categories contain the
# same set of values. Returns the list of per-category position arrays.
def order_with(new)
  unless new.to_set == categories.to_set
    raise ArgumentError, 'The contents of new and old order must be the same.'
  end

  @cat_hash = new.each_with_object({}) { |cat, acc| acc[cat] = @cat_hash[cat] }

  @array = Array.new(size)
  # A category's integer is simply its index among the reordered keys.
  @cat_hash.each_with_index.map do |(_cat, positions), code|
    positions.each { |pos| @array[pos] = code }
  end
end
926
+
927
# Returns the category found at index +int+ among @cat_hash's keys.
def cat_from_int(int)
  ordered_categories = @cat_hash.keys
  ordered_categories[int]
end
930
+
931
# Returns the index of +cat+ among @cat_hash's keys, or nil when it is not
# a key.
def int_from_cat(cat)
  @cat_hash.keys.find_index(cat)
end
934
+ end
935
+ end