evoc 3.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (68) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +15 -0
  3. data/.rspec +2 -0
  4. data/.travis.yml +4 -0
  5. data/Gemfile +4 -0
  6. data/LICENSE.txt +21 -0
  7. data/Makefile +4 -0
  8. data/README.md +61 -0
  9. data/Rakefile +6 -0
  10. data/bin/console +14 -0
  11. data/bin/evoc +3 -0
  12. data/bin/setup +7 -0
  13. data/evoc.gemspec +30 -0
  14. data/lib/evoc/algorithm.rb +147 -0
  15. data/lib/evoc/algorithms/top_k.rb +86 -0
  16. data/lib/evoc/analyze.rb +395 -0
  17. data/lib/evoc/array.rb +43 -0
  18. data/lib/evoc/evaluate.rb +109 -0
  19. data/lib/evoc/exceptions/aggregation_error.rb +6 -0
  20. data/lib/evoc/exceptions/expectedoutcome_nil_or_empty.rb +6 -0
  21. data/lib/evoc/exceptions/measure_calculation_error.rb +6 -0
  22. data/lib/evoc/exceptions/no_changed_items_in_changes.rb +6 -0
  23. data/lib/evoc/exceptions/no_changes_in_json_object.rb +6 -0
  24. data/lib/evoc/exceptions/no_date_in_json_object.rb +6 -0
  25. data/lib/evoc/exceptions/no_result.rb +6 -0
  26. data/lib/evoc/exceptions/non_finite.rb +8 -0
  27. data/lib/evoc/exceptions/non_numeric.rb +8 -0
  28. data/lib/evoc/exceptions/not_a_query.rb +6 -0
  29. data/lib/evoc/exceptions/not_a_result.rb +6 -0
  30. data/lib/evoc/exceptions/not_a_transaction.rb +6 -0
  31. data/lib/evoc/exceptions/not_initialized.rb +6 -0
  32. data/lib/evoc/exceptions/only_nil_in_changes.rb +6 -0
  33. data/lib/evoc/exceptions/query_nil_or_empty.rb +6 -0
  34. data/lib/evoc/exceptions/unable_to_convert_json_to_tx.rb +6 -0
  35. data/lib/evoc/experiment.rb +239 -0
  36. data/lib/evoc/hash.rb +56 -0
  37. data/lib/evoc/history_store.rb +53 -0
  38. data/lib/evoc/hyper_rule.rb +53 -0
  39. data/lib/evoc/interestingness_measure.rb +77 -0
  40. data/lib/evoc/interestingness_measure_aggregator.rb +147 -0
  41. data/lib/evoc/interestingness_measures.rb +882 -0
  42. data/lib/evoc/logger.rb +34 -0
  43. data/lib/evoc/memory_profiler.rb +43 -0
  44. data/lib/evoc/recommendation_cache.rb +152 -0
  45. data/lib/evoc/rule.rb +32 -0
  46. data/lib/evoc/rule_store.rb +340 -0
  47. data/lib/evoc/scenario.rb +303 -0
  48. data/lib/evoc/svd.rb +124 -0
  49. data/lib/evoc/tx.rb +34 -0
  50. data/lib/evoc/tx_store.rb +379 -0
  51. data/lib/evoc/version.rb +3 -0
  52. data/lib/evoc.rb +4 -0
  53. data/lib/evoc_cli/analyze.rb +198 -0
  54. data/lib/evoc_cli/cli_helper.rb +1 -0
  55. data/lib/evoc_cli/experiment.rb +78 -0
  56. data/lib/evoc_cli/info.rb +22 -0
  57. data/lib/evoc_cli/main.rb +29 -0
  58. data/lib/evoc_cli/util.rb +36 -0
  59. data/lib/evoc_helper.rb +40 -0
  60. data/mem_profiler/Gemfile.lock +39 -0
  61. data/mem_profiler/README.md +126 -0
  62. data/mem_profiler/createdb.rb +4 -0
  63. data/mem_profiler/db.rb +82 -0
  64. data/mem_profiler/gemfile +6 -0
  65. data/mem_profiler/gencsv.rb +64 -0
  66. data/mem_profiler/genimport.sh +8 -0
  67. data/mem_profiler/graph.rb +91 -0
  68. metadata +251 -0
@@ -0,0 +1,882 @@
module Evoc
  ##
  # Mixin providing interestingness measures for association rules.
  #
  # The including class must provide +tx_store+, +lhs+, +rhs+ and +name+
  # (the defaults below raise NotImplementedError). +get_measure+ additionally
  # calls +logger+, which is not defined in this module.
  #
  # Naming conventions relied upon by the reflection helpers:
  # * +m_<name>+ methods return an Evoc::InterestingnessMeasure
  # * +M_<NAME>_MIN/_MID/_MAX+ constants describe the range of a measure
  # * +M_<NAME>_HYPER_MEASURE+ marks a measure as a hyper measure
  # * +p_<name>+ methods return the probabilities used as building blocks
  module InterestingnessMeasures
    VALUE_TYPE = Rational
    # USE Rational(Math.log(1/3)).rationalize(0.001)

    #
    # methods that must be implemented in the class that uses this module as a mixin
    #
    def tx_store
      raise NotImplementedError, "tx_store has not been implemented on the current class"
    end

    def lhs
      raise NotImplementedError, "lhs has not been implemented on the current class"
    end

    def rhs
      raise NotImplementedError, "rhs has not been implemented on the current class"
    end

    def name
      raise NotImplementedError, "name has not been implemented on the current class"
    end

    ##
    # class methods
    #
    # the following is an idiom/hack that also adds class methods to a class
    # when it includes this module (normally one would use 'extend')
    ##
    def self.included(base)
      base.extend(ClassMethods)
    end

    # @return the +M_<MEASURE>_MIN+ constant for the given measure name
    def self.get_min(measure)
      const_get(measure.to_s.upcase + "_MIN")
    end

    # @return the +M_<MEASURE>_MAX+ constant for the given measure name
    def self.get_max(measure)
      const_get(measure.to_s.upcase + "_MAX")
    end

    # @return the +M_<MEASURE>_MID+ constant for the given measure name
    def self.get_mid(measure)
      const_get(measure.to_s.upcase + "_MID")
    end

    # @return [Array<Symbol>] the names of all +m_*+ instance methods of this module
    def self.measures
      instance_methods.grep(/\Am_(.*)/)
    end

    # @return [Array<Symbol>] the measures flagged with a +_HYPER_MEASURE+ constant
    def self.hyper_measures
      measures.select { |m| is_hyper_measure?(m) }
    end

    ##
    # @return the value of the +M_<NAME>_HYPER_MEASURE+ constant,
    #   or false when no such constant is defined
    def self.is_hyper_measure?(m)
      const_get(m.to_s.upcase + "_HYPER_MEASURE")
    rescue NameError
      false
    end

    # Methods added to the class that includes InterestingnessMeasures.
    module ClassMethods
      ##
      # Builds an Evoc::InterestingnessMeasure for +measure+ using the
      # MIN/MID/MAX constants declared for it. When a block is given it is
      # handed on to the measure (wrapped, so it is always called without
      # arguments).
      def initialize_measure(measure, hyper_measure: false)
        attributes = {
          type: measure,
          min: get_min(measure),
          mid: get_mid(measure),
          max: get_max(measure),
          hyper_measure: hyper_measure
        }
        if block_given?
          Evoc::InterestingnessMeasure.new(**attributes) { yield }
        else
          Evoc::InterestingnessMeasure.new(**attributes)
        end
      end

      ##
      # returns the list of interestingness measures which are implemented
      def measures
        instance_methods.grep(/\Am_(.*)/)
      end

      # returns the list of implemented +p_*+ probability methods
      def p_measures
        instance_methods.grep(/\Ap_(.*)/)
      end

      def csv_header
        ['lhs', 'rhs'] + measures
      end

      # like csv_header, but with the leading 'm_' stripped from the measure names
      def pretty_csv_header
        # anchored so that only the prefix is removed, never an interior "m_"
        ['lhs', 'rhs'] + measures.map { |m| m.to_s.sub(/\Am_/, '') }
      end

      def get_min(measure)
        const_get(measure.to_s.upcase + "_MIN")
      end

      def get_max(measure)
        const_get(measure.to_s.upcase + "_MAX")
      end

      def get_mid(measure)
        const_get(measure.to_s.upcase + "_MID")
      end
    end

    ##
    # @return [Array] lhs and rhs (each comma-joined) followed by the values
    #   of every measure that has been instantiated on this object
    def to_a
      [lhs.join(',')] + [rhs.join(',')] + instantiated_measures.map { |m| get_measure(m).value }
    end

    ##
    # a common getter for all measures
    #
    # Calls the measure method and returns its result. If the calculation
    # raises MeasureCalculationError, a warning is logged, an empty measure
    # (no value block) is stored in the measure's instance variable and the
    # measure method is called once more.
    #
    # @raise [NotImplementedError] if the measure method returns nil/false
    def get_measure(measure)
      if (m = method(measure).call)
        return m
      else
        raise NotImplementedError, "#{measure} not implemented"
      end
    rescue Evoc::Exceptions::MeasureCalculationError => e
      logger.warn "#{measure} was undefined for #{self.name} on the current history, error: #{e}"
      placeholder = self.class.initialize_measure(measure)
      instance_variable_set('@' + measure.to_s, placeholder)
      method(measure).call
    end

    ##
    # manually set the value of a measure, overriding any calculated value
    def set_measure(measure, value, hyper_measure: false)
      m = self.class.initialize_measure(measure, hyper_measure: hyper_measure) { value }
      instance_variable_set('@' + measure.to_s, m)
    end

    ##
    # manually set the probability p of this rule
    def set_p(p, value)
      instance_variable_set('@' + p.to_s, value)
    end

    ##
    # Returns the requested probability after checking that p_A, p_B and p_AB
    # are mutually consistent.
    #
    # @param [String] the p probability to get
    # @raise [Evoc::Exceptions::MeasureCalculationError] if the base probabilities are inconsistent
    def get_p(p)
      value = method(p).call
      if p_A < p_AB
        raise Evoc::Exceptions::MeasureCalculationError, "p_A was smaller than p_AB"
      elsif p_B < p_AB
        raise Evoc::Exceptions::MeasureCalculationError, "p_B was smaller than p_AB"
      elsif p_A == 0
        raise Evoc::Exceptions::MeasureCalculationError, "p_A was 0"
      elsif p_B == 0
        raise Evoc::Exceptions::MeasureCalculationError, "p_B was 0"
      elsif p_B > 1 - p_A + p_AB
        raise Evoc::Exceptions::MeasureCalculationError, "p_B mismatch with p_A and p_AB, (a #{p_A}, b #{p_B}, ab #{p_AB})"
      elsif p_A > 1 - p_B + p_AB
        raise Evoc::Exceptions::MeasureCalculationError, "p_A mismatch with p_B and p_AB, (a #{p_A}, b #{p_B}, ab #{p_AB})"
      end
      value
    end

    ##
    # @return [Hash] a hash containing the p probabilities of this rule
    def get_p_values
      values = {}
      Evoc::Rule.p_measures.each do |p|
        values[p] = get_p(p)
      end
      values
    end

    ##
    # returns the measures which have been instantiated
    def instantiated_measures
      self.class.measures.select { |m| measure_instantiated?(m) }
    end

    ##
    # Returns true if the measure has been instantiated
    def measure_instantiated?(measure)
      !instance_variable_get('@' + measure.to_s).nil?
    end

    ##
    # NATIVE AGGREGATED MEASURES
    #
    # these measures are defined natively for aggregated rules
    #
    # e.g.,
    #   a -> c
    #   b -> c
    #   aggregated is: a,b -> c
    #
    # the listed measures are well defined also for the aggregated rule
    #
    # Hyper measures are recalculated on every request,
    # i.e., they use '@m = ..' rather than '@m ||= ..'.
    # This way they overwrite the aggregation that has been calculated on the
    # hyper rule (which is done for the conventional rules)

    ##
    # Hyper Coefficient
    # The number of rules used to form a hyper rule
    # (0 when the object does not respond to +hyper_coefficient+)
    M_HYPER_COEFFICIENT_MIN = 0
    M_HYPER_COEFFICIENT_MID = 0
    M_HYPER_COEFFICIENT_MAX = Float::INFINITY
    M_HYPER_COEFFICIENT_HYPER_MEASURE = true
    def m_hyper_coefficient
      @m_hyper_coefficient = self.class.initialize_measure(__method__) {
        respond_to?(:hyper_coefficient) ? hyper_coefficient : 0
      }
    end

    ##
    # Hyper confidence
    #
    # A confidence like measure that is well defined for all hyper rules
    #
    # "the number of times something in rhs changed with something in lhs, divided by the number of times something in lhs changed"
    ##
    #M_HYPER_CONFIDENCE_MIN = 0
    #M_HYPER_CONFIDENCE_MID = 0
    #M_HYPER_CONFIDENCE_MAX = 1
    #M_HYPER_CONFIDENCE_HYPER_MEASURE = true
    #def m_hyper_confidence
    #  @m_hyper_confidence ||= self.class.initialize_measure(__method__) {
    #    # hyper confidence is equal to the confidence for non hyper rules
    #    if !self.respond_to?(:hyper_confidence)
    #      m_confidence.value
    #    else
    #      raise ArgumentError, "Asked for the hyper confidence of a hyper rule, the value was not initialized, but should have been when creating the hyper rule"
    #    end
    #  }
    #end

    ##
    # INTERESTINGNESS MEASURES
    #
    # if not stated otherwise, all of the implementations are based on
    # Michael Hahslers overview at:
    # http://michael.hahsler.net/research/association_rules/measures.html
    #
    ##

    # P(A,B)
    M_SUPPORT_MIN = 0
    M_SUPPORT_MID = 0
    M_SUPPORT_MAX = 1
    def m_support
      @m_support ||= self.class.initialize_measure(__method__) { p_AB }
    end

    # P(B|A)
    M_CONFIDENCE_MIN = 0
    M_CONFIDENCE_MID = 0
    M_CONFIDENCE_MAX = 1
    def m_confidence
      @m_confidence ||= self.class.initialize_measure(__method__) { p_BgivenA }
    end

    # P(A)
    M_COVERAGE_MIN = 0
    M_COVERAGE_MID = 0
    M_COVERAGE_MAX = 1
    def m_coverage
      @m_coverage ||= self.class.initialize_measure(__method__) { p_A }
    end

    # P(B)
    M_PREVALENCE_MIN = 0
    M_PREVALENCE_MID = 0
    M_PREVALENCE_MAX = 1
    def m_prevalence
      @m_prevalence ||= self.class.initialize_measure(__method__) { p_B }
    end

    # P(A|B)
    M_RECALL_MIN = 0
    M_RECALL_MID = 0
    M_RECALL_MAX = 1
    def m_recall
      @m_recall ||= self.class.initialize_measure(__method__) { p_AgivenB }
    end

    # P(not B | not A)
    M_SPECIFICITY_MIN = 0
    M_SPECIFICITY_MID = 0
    M_SPECIFICITY_MAX = 1
    def m_specificity
      @m_specificity ||= self.class.initialize_measure(__method__) { p_notBgivennotA }
    end

    M_LIFT_MIN = 0
    M_LIFT_MID = 1
    M_LIFT_MAX = Float::INFINITY
    ##
    # aka interest
    # Lift measures how many times more often X and Y occur
    # together than expected if they were statistically independent
    ##
    def m_lift
      @m_lift ||= self.class.initialize_measure(__method__) {
        p_AB / (p_A * p_B)
      }
    end

    M_LEVERAGE_MIN = -1
    M_LEVERAGE_MID = 0
    M_LEVERAGE_MAX = 1
    ##
    # Leverage measures the difference of X and Y appearing together
    # in the data set and what would be expected if X and Y were statistically independent
    #
    # NOTE(review): computed from P(B|A), whereas the Hahsler overview defines
    # leverage from P(A,B) (which is what m_piatetsky_shapiro computes).
    # Left unchanged; confirm which definition is intended.
    ##
    def m_leverage
      @m_leverage ||= self.class.initialize_measure(__method__) {
        p_BgivenA - (p_A * p_B)
      }
    end

    # P(A,B) - P(A)P(B)
    M_PIATETSKY_SHAPIRO_MIN = -0.25
    M_PIATETSKY_SHAPIRO_MID = 0
    M_PIATETSKY_SHAPIRO_MAX = 0.25
    def m_piatetsky_shapiro
      @m_piatetsky_shapiro ||= self.class.initialize_measure(__method__) {
        p_AB - p_A * p_B
      }
    end

    # aka: pavillion index, centered confidence
    # P(B|A) - P(B)
    M_ADDED_VALUE_MIN = -0.5
    M_ADDED_VALUE_MID = 0
    M_ADDED_VALUE_MAX = 1
    def m_added_value
      @m_added_value ||= self.class.initialize_measure(__method__) {
        p_BgivenA - p_B
      }
    end

    # 1/2 * (P(B|A) + P(not A | not B))
    M_CAUSAL_CONFIDENCE_MIN = 0
    M_CAUSAL_CONFIDENCE_MID = 0
    M_CAUSAL_CONFIDENCE_MAX = 1
    def m_causal_confidence
      @m_causal_confidence ||= self.class.initialize_measure(__method__) {
        (1.to_r / 2) * (p_BgivenA + p_notAgivennotB)
      }
    end

    # P(A,B) + P(not A, not B)
    M_CAUSAL_SUPPORT_MIN = 0
    M_CAUSAL_SUPPORT_MID = 0
    M_CAUSAL_SUPPORT_MAX = 1
    def m_causal_support
      @m_causal_support ||= self.class.initialize_measure(__method__) {
        p_AB + p_notA_notB
      }
    end

    # P(B|A) - P(not B | A)
    M_DESCRIPTIVE_CONFIRMED_CONFIDENCE_MIN = -1
    M_DESCRIPTIVE_CONFIRMED_CONFIDENCE_MID = 0
    M_DESCRIPTIVE_CONFIRMED_CONFIDENCE_MAX = 1
    def m_descriptive_confirmed_confidence
      @m_descriptive_confirmed_confidence ||= self.class.initialize_measure(__method__) {
        p_BgivenA - p_notBgivenA
      }
    end

    # P(B|A) - P(B | not A)
    M_DIFFERENCE_OF_CONFIDENCE_MIN = -1
    M_DIFFERENCE_OF_CONFIDENCE_MID = 0
    M_DIFFERENCE_OF_CONFIDENCE_MAX = 1
    def m_difference_of_confidence
      @m_difference_of_confidence ||= self.class.initialize_measure(__method__) {
        p_BgivenA - p_BgivennotA
      }
    end

    # P(B|A) / P(B | not A), infinite when the denominator is 0
    M_RELATIVE_RISK_MIN = 0
    M_RELATIVE_RISK_MID = 0
    M_RELATIVE_RISK_MAX = Float::INFINITY
    def m_relative_risk
      @m_relative_risk ||= self.class.initialize_measure(__method__) {
        if p_BgivennotA == 0
          Float::INFINITY
        else
          p_BgivenA / p_BgivennotA
        end
      }
    end

    # P(A,B) / (P(A) + P(B) - P(A,B))
    M_JACCARD_MIN = 0
    M_JACCARD_MID = 0
    M_JACCARD_MAX = 1
    def m_jaccard
      @m_jaccard ||= self.class.initialize_measure(__method__) {
        p_AB / (p_A + p_B - p_AB)
      }
    end

    M_IMBALANCE_RATIO_MIN = 0
    M_IMBALANCE_RATIO_MID = 0
    M_IMBALANCE_RATIO_MAX = 1
    ##
    # IR gauges the degree of imbalance between two events that the lhs and the rhs are contained in a transaction.
    # The ratio is close to 0 if the conditional probabilities are similar (i.e., very balanced) and close to 1 if they are very different
    #
    # @raise [Evoc::Exceptions::MeasureCalculationError] if the denominator is 0 while the numerator is not
    ##
    def m_imbalance_ratio
      @m_imbalance_ratio ||= self.class.initialize_measure(__method__) {
        numerator = (p_AgivenB - p_BgivenA).abs
        denominator = (p_AgivenB + p_BgivenA - p_AgivenB * p_BgivenA)
        if denominator == 0
          if numerator == 0
            0
          else
            raise Evoc::Exceptions::MeasureCalculationError, "Numerator was not 0 when denominator was 0 when calculating imbalance ratio"
          end
        else
          numerator / denominator
        end
      }
    end

    M_ODDS_RATIO_MIN = 0
    M_ODDS_RATIO_MID = 1
    M_ODDS_RATIO_MAX = Float::INFINITY
    ##
    # The odds of finding X in transactions which contain Y divided by the
    # odds of finding X in transactions which do not contain Y
    # (infinite when the denominator is 0)
    ##
    def m_odds_ratio
      @m_odds_ratio ||= self.class.initialize_measure(__method__) {
        numerator = p_AB * p_notA_notB
        denominator = p_A_notB * p_notA_B
        if denominator == 0
          Float::INFINITY
        else
          numerator / denominator
        end
      }
    end

    # (odds_ratio - 1) / (odds_ratio + 1)
    #
    # @raise [Evoc::Exceptions::MeasureCalculationError] if the odds ratio has no value
    M_YULES_Q_MIN = -1
    M_YULES_Q_MID = 0
    M_YULES_Q_MAX = 1
    def m_yules_q
      @m_yules_q ||= self.class.initialize_measure(__method__) {
        odds_ratio = m_odds_ratio.value
        if odds_ratio.nil?
          raise Evoc::Exceptions::MeasureCalculationError, "Odds ratio was nil when calculating yules q"
        end

        if odds_ratio.to_f.finite?
          (odds_ratio - 1) / (odds_ratio + 1)
        else
          # Float#infinite? yields
          # -1 if odds ratio -inf
          #  1 if odds ratio +inf
          odds_ratio.to_f.infinite?
        end
      }
    end

    # (sqrt(odds_ratio) - 1) / (sqrt(odds_ratio) + 1)
    #
    # @raise [Evoc::Exceptions::MeasureCalculationError] if the odds ratio has no value
    M_YULES_Y_MIN = -1
    M_YULES_Y_MID = 0
    M_YULES_Y_MAX = 1
    def m_yules_y
      @m_yules_y ||= self.class.initialize_measure(__method__) {
        odds_ratio = m_odds_ratio.value
        if odds_ratio.nil?
          raise Evoc::Exceptions::MeasureCalculationError, "Odds ratio was nil when calculating yules y"
        end

        if odds_ratio.to_f.finite?
          root = Math.sqrt(odds_ratio).rationalize
          (root - 1) / (root + 1)
        else
          # -1 / 1 for -inf / +inf, see m_yules_q
          odds_ratio.to_f.infinite?
        end
      }
    end

    # from Tan2004
    # sqrt(P(A,B)) * max(P(B|A) - P(B), P(A|B) - P(A))
    M_KLOSGEN_MIN = -1
    M_KLOSGEN_MID = 0
    M_KLOSGEN_MAX = 1
    def m_klosgen
      @m_klosgen ||= self.class.initialize_measure(__method__) {
        (Math.sqrt(p_AB) * [(p_BgivenA - p_B), p_AgivenB - p_A].max).rationalize
      }
    end

    M_KULCZYNSKI_MIN = 0
    M_KULCZYNSKI_MID = 0
    M_KULCZYNSKI_MAX = 1
    ##
    # Calculate the null-invariant Kulczynski measure with a preference for skewed patterns.
    ##
    def m_kulczynski
      @m_kulczynski ||= self.class.initialize_measure(__method__) {
        (p_AB / 2) * ((1 / p_A) + (1 / p_B))
      }
    end

    # P(A)P(not B) / P(A, not B)
    # 0 when both numerator and denominator are 0, infinite when only the denominator is 0
    M_CONVICTION_MIN = 0
    M_CONVICTION_MID = 0
    M_CONVICTION_MAX = Float::INFINITY
    def m_conviction
      @m_conviction ||= self.class.initialize_measure(__method__) {
        numerator = p_A * p_notB
        denominator = p_A_notB
        if denominator == 0
          numerator == 0 ? 0 : Float::INFINITY
        else
          numerator / denominator
        end
      }
    end

    ##
    # uses 2 coefficients to weight the importance of the two factors
    #   k : dependency
    #   m : generality
    M_INTERESTINGNESS_WEIGHTING_DEPENDENCY_MIN = 0
    M_INTERESTINGNESS_WEIGHTING_DEPENDENCY_MID = 0
    M_INTERESTINGNESS_WEIGHTING_DEPENDENCY_MAX = 1
    def m_interestingness_weighting_dependency
      k = 2
      m = 2
      @m_interestingness_weighting_dependency ||= self.class.initialize_measure(__method__) {
        ((p_BgivenA / p_B)**(k - 1)) * (p_AB**m)
      }
    end

    M_COLLECTIVE_STRENGTH_MIN = -Float::INFINITY
    M_COLLECTIVE_STRENGTH_MID = 1
    M_COLLECTIVE_STRENGTH_MAX = Float::INFINITY
    # range from Aggarwal1998
    #
    # NOTE(review): uses P(not B | not A) where the Hahsler overview uses the
    # joint P(not A, not B). Left unchanged; confirm the intended source.
    def m_collective_strength
      @m_collective_strength ||= self.class.initialize_measure(__method__) {
        n1 = (p_AB + p_notBgivennotA)
        d1 = (p_A * p_B + p_notA * p_notB)
        # dividing by 0.0 instead of 0 yields Infinity/NaN rather than ZeroDivisionError
        first = ((d1 == 0) ? n1 / d1.to_f : n1 / d1)
        n2 = (1 - p_A * p_B - p_notA * p_notB)
        d2 = (1 - p_AB - p_notBgivennotA)
        second = ((d2 == 0) ? n2 / d2.to_f : n2 / d2)
        first * second
      }
    end

    M_GINI_INDEX_MIN = 0
    M_GINI_INDEX_MID = 0
    M_GINI_INDEX_MAX = 1
    ##
    # Measures quadratic entropy
    #
    ##
    def m_gini_index
      @m_gini_index ||= self.class.initialize_measure(__method__) {
        p_A * ((p_BgivenA**2) + (p_notBgivenA**2)) +
          p_notA * ((p_BgivennotA**2) + (p_notBgivennotA**2)) -
          p_B**2 - p_notB**2
      }
    end

    # (P(A,B) + P(not A, not B) - P(A)P(B) - P(not A)P(not B)) / (1 - P(A)P(B) - P(not A)P(not B))
    M_KAPPA_MIN = -1
    M_KAPPA_MID = 0
    M_KAPPA_MAX = 1
    def m_kappa
      @m_kappa ||= self.class.initialize_measure(__method__) {
        num = (p_AB + p_notA_notB - p_A * p_B - p_notA * p_notB)
        den = (1 - p_A * p_B - p_notA * p_notB)
        # float division avoids ZeroDivisionError when den is 0
        ((den == 0) ? num / den.to_f : num / den)
      }
    end

    M_J_MEASURE_MIN = 0
    M_J_MEASURE_MID = 0
    M_J_MEASURE_MAX = 1
    ##
    # Measures cross entropy
    #
    ##
    def m_j_measure
      @m_j_measure ||= self.class.initialize_measure(__method__) {
        first_log = Math.log(p_BgivenA / (p_B.to_f))
        second_log = Math.log(p_notBgivenA / (p_notB.to_f))
        # a term contributes 0 when its weight is 0, whatever its log evaluated to
        first = ((p_AB == 0) ? 0 : p_AB * first_log)
        second = ((p_A_notB == 0) ? 0 : p_A_notB * second_log)

        first + second
      }
    end

    # P(B|A) * log2(P(A,B) / (P(A)P(B))), 0 when P(B|A) is 0
    M_ONE_WAY_SUPPORT_MIN = -1
    M_ONE_WAY_SUPPORT_MID = 0
    M_ONE_WAY_SUPPORT_MAX = Float::INFINITY
    def m_one_way_support
      @m_one_way_support ||= self.class.initialize_measure(__method__) {
        if p_BgivenA == 0
          0
        else
          p_BgivenA * (Math.log2(p_AB / (p_A * p_B)).rationalize)
        end
      }
    end

    # P(A,B) * log2(P(A,B) / (P(A)P(B))), 0 when P(A,B) is 0
    M_TWO_WAY_SUPPORT_MIN = -1
    M_TWO_WAY_SUPPORT_MID = 0
    M_TWO_WAY_SUPPORT_MAX = 1
    def m_two_way_support
      @m_two_way_support ||= self.class.initialize_measure(__method__) {
        if p_AB == 0
          0
        else
          p_AB * (Math.log2(p_AB / (p_A * p_B)).rationalize)
        end
      }
    end

    # aka Ø-coefficient
    M_LINEAR_CORRELATION_COEFFICIENT_MIN = -1
    M_LINEAR_CORRELATION_COEFFICIENT_MID = 0
    M_LINEAR_CORRELATION_COEFFICIENT_MAX = 1
    def m_linear_correlation_coefficient
      @m_linear_correlation_coefficient ||= self.class.initialize_measure(__method__) {
        num = (p_AB - (p_A * p_B))
        den = Math.sqrt(p_A * p_B * p_notA * p_notB)
        # float division avoids ZeroDivisionError when den is 0
        ((den == 0) ? num / den.to_f : num / (den.rationalize))
      }
    end

    # P(A,B) / sqrt(P(A)P(B))
    #
    # @raise [Evoc::Exceptions::MeasureCalculationError] if the denominator is 0
    M_COSINE_MIN = 0
    M_COSINE_MID = 0
    M_COSINE_MAX = 1
    def m_cosine
      @m_cosine ||= self.class.initialize_measure(__method__) {
        num = p_AB
        den = Math.sqrt(p_A * p_B).rationalize
        if den == 0
          raise Evoc::Exceptions::MeasureCalculationError, "Denominator became 0 when calculating cosine (a #{p_A}, b #{p_B}, ab #{p_AB})"
        else
          num / den
        end
      }
    end

    M_LOEVINGER_MIN = -1
    M_LOEVINGER_MID = 0
    M_LOEVINGER_MAX = 1
    ##
    # aka Certainty Factor
    # The certainty factor is a measure of variation of the probability that
    # Y is in a transaction when only considering transactions with X.
    # An increasing CF means a decrease of the probability that Y is not in
    # a transaction that X is in. Negative CFs have a similar interpretation
    #
    ##
    def m_loevinger
      @m_loevinger ||= self.class.initialize_measure(__method__) {
        if p_A * p_notB == 0
          1
        else
          1 - ((p_A_notB) / (p_A * p_notB))
        end
      }
    end

    # P(A,B) / P(A, not B), infinite when the denominator is 0
    M_SEBAG_SCHOENAUER_MIN = 0
    M_SEBAG_SCHOENAUER_MID = 0
    M_SEBAG_SCHOENAUER_MAX = Float::INFINITY
    def m_sebag_schoenauer
      @m_sebag_schoenauer ||= self.class.initialize_measure(__method__) {
        if p_A_notB == 0
          Float::INFINITY
        else
          p_AB / p_A_notB
        end
      }
    end

    # P(A,B) / (P(A)P(B)) - 1
    M_VARYING_RATES_LIAISON_MIN = -1
    M_VARYING_RATES_LIAISON_MID = 0
    M_VARYING_RATES_LIAISON_MAX = Float::INFINITY
    def m_varying_rates_liaison
      @m_varying_rates_liaison ||= self.class.initialize_measure(__method__) {
        (p_AB / (p_A * p_B)) - 1
      }
    end

    # (P(A,B) - P(A, not B)) / P(B)
    M_LEAST_CONTRADICTION_MIN = -Float::INFINITY
    M_LEAST_CONTRADICTION_MID = 0
    M_LEAST_CONTRADICTION_MAX = 1
    def m_least_contradiction
      @m_least_contradiction ||= self.class.initialize_measure(__method__) {
        (p_AB - p_A_notB) / p_B
      }
    end

    # P(A,B)P(not B) / (P(B)P(A, not B))
    # 0 when both numerator and denominator are 0, infinite when only the denominator is 0
    M_ODD_MULTIPLIER_MIN = 0
    M_ODD_MULTIPLIER_MID = 0
    M_ODD_MULTIPLIER_MAX = Float::INFINITY
    def m_odd_multiplier
      @m_odd_multiplier ||= self.class.initialize_measure(__method__) {
        numerator = p_AB * p_notB
        denominator = p_B * p_A_notB
        if denominator == 0
          numerator == 0 ? 0 : Float::INFINITY
        else
          numerator / denominator
        end
      }
    end

    M_EXAMPLE_AND_COUNTEREXAMPLE_RATE_MIN = -Float::INFINITY
    M_EXAMPLE_AND_COUNTEREXAMPLE_RATE_MID = 0
    M_EXAMPLE_AND_COUNTEREXAMPLE_RATE_MAX = 1
    # 0 when equally many examples as counter examples
    # -infinity when there are no examples at all
    def m_example_and_counterexample_rate
      @m_example_and_counterexample_rate ||= self.class.initialize_measure(__method__) {
        numerator = (p_AB - p_A_notB)
        denominator = p_AB
        if denominator == 0
          -Float::INFINITY
        else
          numerator / denominator
        end
      }
    end

    # (P(A,B) - P(A)P(B)) / max(P(A,B)P(not B), P(B)P(A, not B)), 0 when the denominator is 0
    M_ZHANG_MIN = -1
    M_ZHANG_MID = 0
    M_ZHANG_MAX = 1
    def m_zhang
      @m_zhang ||= self.class.initialize_measure(__method__) {
        numerator = p_AB - p_A * p_B
        denominator = [p_AB * p_notB, p_B * p_A_notB].max
        if denominator == 0
          0
        else
          numerator / denominator
        end
      }
    end

    M_LAPLACE_CORRECTED_CONFIDENCE_MIN = 0
    M_LAPLACE_CORRECTED_CONFIDENCE_MID = 0
    M_LAPLACE_CORRECTED_CONFIDENCE_MAX = 1
    ##
    # Corrected confidence estimate decreases with lower support
    # to account for estimation uncertainty with low counts.
    #
    # NOTE(review): the Hahsler overview defines this on transaction counts of
    # the rule and its antecedent; this implementation uses the probabilities
    # p_AB and p_B. Left unchanged; confirm whether that is intended.
    ##
    def m_laplace_corrected_confidence
      @m_laplace_corrected_confidence ||= self.class.initialize_measure(__method__) {
        (p_AB + 1) / (p_B + 2)
      }
    end

    ##
    # building blocks for interestingness measures
    #
    # A refers to the antecedent of a rule
    # B refers to the consequent of a rule
    #
    # All values are memoized in instance variables, so a value assigned
    # through set_p takes precedence over the calculation.

    ##
    # the number of transactions
    # n is converted into the specified type to ensure
    # that the type is used throughout calculations
    def n
      # looks up the Kernel conversion function named after VALUE_TYPE, e.g. Rational(...)
      @n ||= VALUE_TYPE.method(VALUE_TYPE.to_s).call(tx_store.size)
    end

    ##
    # the ratio of tx with A as a subset
    def p_A
      @p_A ||= tx_store.transactions_of_list(lhs, strict: true).size / n
    end

    ##
    # the ratio of tx where A is not a subset
    def p_notA
      @p_notA ||= (1 - p_A)
    end

    ##
    # the ratio of tx with B as a subset
    def p_B
      @p_B ||= tx_store.transactions_of_list(rhs, strict: true).size / n
    end

    ##
    # the ratio of tx where B is not a subset
    def p_notB
      @p_notB ||= (1 - p_B)
    end

    ##
    # the ratio of tx with the union of A and B as a subset
    def p_AB
      @p_AB ||= tx_store.transactions_of_list((lhs | rhs), strict: true).size / n
    end

    ##
    # the ratio of tx where A or B is a subset
    def p_AorB
      @p_AorB ||= p_A + p_B - p_AB
    end

    ##
    # the ratio of tx where neither A or B is a subset
    def p_notA_notB
      @p_notA_notB ||= 1 - (p_A + p_B) + p_AB
    end

    ##
    # the ratio of tx where A is not a subset but B is
    def p_notA_B
      @p_notA_B ||= p_B - p_AB
    end

    ##
    # the ratio of tx where A is a subset but B is not
    def p_A_notB
      @p_A_notB ||= p_A - p_AB
    end

    ##
    # the ratio of the union being a subset to the number of txes where B is a subset
    # (0 when p_B is 0)
    def p_AgivenB
      @p_AgivenB ||= ((p_B == 0) ? 0 : p_AB / p_B)
    end

    ##
    # the ratio of the union being a subset to the number of txes where A is a subset
    # (0 when p_A is 0)
    def p_BgivenA
      @p_BgivenA ||= ((p_A == 0) ? 0 : p_AB / p_A)
    end

    # 0 when p_B is 0
    def p_notAgivenB
      @p_notAgivenB ||= ((p_B == 0) ? 0 : p_notA_B / p_B)
    end

    # if A is 0, so is A,notB
    def p_notBgivenA
      @p_notBgivenA ||= ((p_A == 0) ? 0 : p_A_notB / p_A)
    end

    # if notB is 0, so is A_notB
    def p_AgivennotB
      @p_AgivennotB ||= ((p_notB == 0) ? 0 : p_A_notB / p_notB)
    end

    # if notA is 0, so is notA_B
    def p_BgivennotA
      @p_BgivennotA ||= ((p_notA == 0) ? 0 : p_notA_B / p_notA)
    end

    # 0 when p_notA is 0
    def p_notBgivennotA
      @p_notBgivennotA ||= ((p_notA == 0) ? 0 : p_notA_notB / p_notA)
    end

    # if notB is 0, so is notA and notB
    def p_notAgivennotB
      @p_notAgivennotB ||= ((p_notB == 0) ? 0 : p_notA_notB / p_notB)
    end
  end
end