evoc 3.5.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (68) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +15 -0
  3. data/.rspec +2 -0
  4. data/.travis.yml +4 -0
  5. data/Gemfile +4 -0
  6. data/LICENSE.txt +21 -0
  7. data/Makefile +4 -0
  8. data/README.md +61 -0
  9. data/Rakefile +6 -0
  10. data/bin/console +14 -0
  11. data/bin/evoc +3 -0
  12. data/bin/setup +7 -0
  13. data/evoc.gemspec +30 -0
  14. data/lib/evoc/algorithm.rb +147 -0
  15. data/lib/evoc/algorithms/top_k.rb +86 -0
  16. data/lib/evoc/analyze.rb +395 -0
  17. data/lib/evoc/array.rb +43 -0
  18. data/lib/evoc/evaluate.rb +109 -0
  19. data/lib/evoc/exceptions/aggregation_error.rb +6 -0
  20. data/lib/evoc/exceptions/expectedoutcome_nil_or_empty.rb +6 -0
  21. data/lib/evoc/exceptions/measure_calculation_error.rb +6 -0
  22. data/lib/evoc/exceptions/no_changed_items_in_changes.rb +6 -0
  23. data/lib/evoc/exceptions/no_changes_in_json_object.rb +6 -0
  24. data/lib/evoc/exceptions/no_date_in_json_object.rb +6 -0
  25. data/lib/evoc/exceptions/no_result.rb +6 -0
  26. data/lib/evoc/exceptions/non_finite.rb +8 -0
  27. data/lib/evoc/exceptions/non_numeric.rb +8 -0
  28. data/lib/evoc/exceptions/not_a_query.rb +6 -0
  29. data/lib/evoc/exceptions/not_a_result.rb +6 -0
  30. data/lib/evoc/exceptions/not_a_transaction.rb +6 -0
  31. data/lib/evoc/exceptions/not_initialized.rb +6 -0
  32. data/lib/evoc/exceptions/only_nil_in_changes.rb +6 -0
  33. data/lib/evoc/exceptions/query_nil_or_empty.rb +6 -0
  34. data/lib/evoc/exceptions/unable_to_convert_json_to_tx.rb +6 -0
  35. data/lib/evoc/experiment.rb +239 -0
  36. data/lib/evoc/hash.rb +56 -0
  37. data/lib/evoc/history_store.rb +53 -0
  38. data/lib/evoc/hyper_rule.rb +53 -0
  39. data/lib/evoc/interestingness_measure.rb +77 -0
  40. data/lib/evoc/interestingness_measure_aggregator.rb +147 -0
  41. data/lib/evoc/interestingness_measures.rb +882 -0
  42. data/lib/evoc/logger.rb +34 -0
  43. data/lib/evoc/memory_profiler.rb +43 -0
  44. data/lib/evoc/recommendation_cache.rb +152 -0
  45. data/lib/evoc/rule.rb +32 -0
  46. data/lib/evoc/rule_store.rb +340 -0
  47. data/lib/evoc/scenario.rb +303 -0
  48. data/lib/evoc/svd.rb +124 -0
  49. data/lib/evoc/tx.rb +34 -0
  50. data/lib/evoc/tx_store.rb +379 -0
  51. data/lib/evoc/version.rb +3 -0
  52. data/lib/evoc.rb +4 -0
  53. data/lib/evoc_cli/analyze.rb +198 -0
  54. data/lib/evoc_cli/cli_helper.rb +1 -0
  55. data/lib/evoc_cli/experiment.rb +78 -0
  56. data/lib/evoc_cli/info.rb +22 -0
  57. data/lib/evoc_cli/main.rb +29 -0
  58. data/lib/evoc_cli/util.rb +36 -0
  59. data/lib/evoc_helper.rb +40 -0
  60. data/mem_profiler/Gemfile.lock +39 -0
  61. data/mem_profiler/README.md +126 -0
  62. data/mem_profiler/createdb.rb +4 -0
  63. data/mem_profiler/db.rb +82 -0
  64. data/mem_profiler/gemfile +6 -0
  65. data/mem_profiler/gencsv.rb +64 -0
  66. data/mem_profiler/genimport.sh +8 -0
  67. data/mem_profiler/graph.rb +91 -0
  68. metadata +251 -0
@@ -0,0 +1,882 @@
1
+ module Evoc
2
+ module InterestingnessMeasures
3
+
4
+ VALUE_TYPE = Rational
5
+ # USE Rational(Math.log(1/3)).rationalize(0.001)
6
+
7
+ #
8
+ # methods that must be implemented in the class that uses this module as a mixin
9
+ #
10
# Placeholder for the transaction store that the probability helpers query.
# The including class is expected to override it; this default only raises.
#
# @raise [NotImplementedError] always
def tx_store
  raise NotImplementedError, "tx_store has not been implemented on the current class"
end
13
+
14
# Placeholder for the rule's left-hand side (antecedent) item list.
# The including class is expected to override it; this default only raises.
#
# @raise [NotImplementedError] always
def lhs
  raise NotImplementedError, "lhs has not been implemented on the current class"
end
17
+
18
# Placeholder for the rule's right-hand side (consequent) item list.
# The including class is expected to override it; this default only raises.
#
# @raise [NotImplementedError] always
def rhs
  raise NotImplementedError, "rhs has not been implemented on the current class"
end
21
+
22
# Placeholder for the rule's display name (used in log messages).
# The including class is expected to override it; this default only raises.
#
# @raise [NotImplementedError] always
def name
  raise NotImplementedError, "name has not been implemented on the current class"
end
25
+
26
##
# class methods
#
# Hook that Ruby invokes whenever this module is included into +base+.
# It also extends +base+ with ClassMethods, so that a single `include`
# gives the class both the instance-level and the class-level API
# (normally one would call `extend` separately).
##
def self.included(base)
  base.extend ClassMethods
end
36
+
37
# Looks up the M_<NAME>_MIN constant that belongs to +measure+.
#
# @param measure [Symbol, String] measure method name, e.g. :m_lift
# @return the value of the matching *_MIN constant
def self.get_min(measure)
  const_get("#{measure.to_s.upcase}_MIN")
end
40
+
41
# Looks up the M_<NAME>_MAX constant that belongs to +measure+.
#
# @param measure [Symbol, String] measure method name, e.g. :m_lift
# @return the value of the matching *_MAX constant
def self.get_max(measure)
  const_get("#{measure.to_s.upcase}_MAX")
end
44
+
45
# Looks up the M_<NAME>_MID constant that belongs to +measure+.
#
# @param measure [Symbol, String] measure method name, e.g. :m_lift
# @return the value of the matching *_MID constant
def self.get_mid(measure)
  const_get("#{measure.to_s.upcase}_MID")
end
48
+
49
# Lists the measure methods defined by this module, i.e. every instance
# method whose name starts with "m_".
#
# @return [Array<Symbol>]
def self.measures
  instance_methods.select { |method_name| method_name.to_s.start_with?("m_") }
end
52
+
53
# Lists the measure methods ("m_" prefix) that are flagged as hyper
# measures according to is_hyper_measure?.
#
# @return [Array<Symbol>]
def self.hyper_measures
  instance_methods.select do |method_name|
    method_name.to_s.start_with?("m_") && is_hyper_measure?(method_name)
  end
end
56
+
57
# Tells whether measure +m+ has an M_<NAME>_HYPER_MEASURE constant.
#
# @param m [Symbol, String] measure method name
# @return the constant's value when it exists, false when the lookup
#   raises NameError
def self.is_hyper_measure?(m)
  const_get("#{m.to_s.upcase}_HYPER_MEASURE")
rescue NameError
  false
end
64
+
65
##
# Class-level helpers gained by every class that includes
# InterestingnessMeasures (attached through the +included+ hook).
module ClassMethods
  ##
  # Builds an Evoc::InterestingnessMeasure for +measure+, taking its
  # min/mid/max from the M_<NAME>_MIN / _MID / _MAX constants.
  #
  # @param measure [Symbol, String] measure method name, e.g. :m_lift
  # @param hyper_measure [Boolean] forwarded unchanged to the measure
  # @yield optional block, forwarded (wrapped) to the measure constructor
  # @return [Evoc::InterestingnessMeasure]
  def initialize_measure(measure, hyper_measure: false)
    options = {
      type: measure,
      min: get_min(measure),
      mid: get_mid(measure),
      max: get_max(measure),
      hyper_measure: hyper_measure
    }
    return Evoc::InterestingnessMeasure.new(**options) unless block_given?

    # The caller's block is deliberately wrapped instead of passed through,
    # so it always runs via `yield` from here.
    Evoc::InterestingnessMeasure.new(**options) { yield }
  end

  ##
  # returns the list of interestingness measures which are implemented
  # (instance methods whose name starts with "m_")
  def measures
    instance_methods.grep(/\Am_(.*)/)
  end

  ##
  # returns the list of probability building blocks
  # (instance methods whose name starts with "p_")
  def p_measures
    instance_methods.grep(/\Ap_(.*)/)
  end

  ##
  # CSV header: 'lhs', 'rhs' followed by the raw measure method names
  def csv_header
    ['lhs', 'rhs'] + measures
  end

  ##
  # CSV header with the leading "m_" removed from each measure name.
  # Only the prefix is stripped; an "m_" inside a name is left alone.
  def pretty_csv_header
    ['lhs', 'rhs'] + measures.map { |m| m.to_s.sub(/\Am_/, '') }
  end

  # @return the value of the M_<NAME>_MIN constant for +measure+
  def get_min(measure)
    const_get(measure.to_s.upcase + "_MIN")
  end

  # @return the value of the M_<NAME>_MAX constant for +measure+
  def get_max(measure)
    const_get(measure.to_s.upcase + "_MAX")
  end

  # @return the value of the M_<NAME>_MID constant for +measure+
  def get_mid(measure)
    const_get(measure.to_s.upcase + "_MID")
  end
end
108
+
109
# Flattens the rule into a row: the comma-joined lhs, the comma-joined rhs,
# then the value of every measure that has been instantiated so far.
#
# @return [Array]
def to_a
  measure_values = instantiated_measures.map { |measure| get_measure(measure).value }
  [lhs.join(','), rhs.join(',')].concat(measure_values)
end
112
+
113
+
114
+
115
##
# a common getter for all measures
#
# Calls the measure method named +measure+ and returns its result.
# When the calculation raises MeasureCalculationError, a warning is logged,
# an empty measure (initialized without a value block) is stored in the
# measure's instance variable and the measure method is called once more.
#
# @param measure [Symbol, String] measure method name, e.g. :m_lift
# @raise [NotImplementedError] when the measure method returns nil/false
def get_measure(measure)
  result = method(measure).call
  raise NotImplementedError, "#{measure} not implemented" unless result

  result
rescue Evoc::Exceptions::MeasureCalculationError => error
  logger.warn "#{measure} was undefined for #{name} on the current history, error: #{error}"
  instance_variable_set("@#{measure}", self.class.initialize_measure(measure))
  method(measure).call
end
131
+
132
# Manually assigns a value to a measure: builds a measure object whose
# value block yields +value+ and stores it in the measure's instance variable.
#
# @param measure [Symbol, String] measure method name, e.g. :m_lift
# @param value the value the measure should report
# @param hyper_measure [Boolean] forwarded to initialize_measure
# @return the newly created measure object
def set_measure(measure, value, hyper_measure: false)
  instance_variable_set(
    "@#{measure}",
    self.class.initialize_measure(measure, hyper_measure: hyper_measure) { value }
  )
end
136
+
137
##
# manually set the probability p of this rule
#
# @param p [Symbol, String] name of the probability, e.g. :p_AB
# @param value the value to store in the matching instance variable
# @return the stored value
def set_p(p, value)
  instance_variable_set("@#{p}", value)
end
142
+
143
##
# Reads one probability building block and sanity-checks the rule's
# p_A / p_B / p_AB combination before handing the value back.
# The checks run on every call, regardless of which probability was requested.
#
# @param p [String, Symbol] name of the probability method to call, e.g. :p_AB
# @return the value returned by the requested probability method
# @raise [Evoc::Exceptions::MeasureCalculationError] when p_A, p_B and p_AB
#   are inconsistent with each other, or p_A / p_B is 0
def get_p(p)
  value = method(p).call
  if p_A < p_AB
    raise Evoc::Exceptions::MeasureCalculationError, "p_A was smaller than p_AB"
  elsif p_B < p_AB
    raise Evoc::Exceptions::MeasureCalculationError, "p_B was smaller than p_AB"
  elsif p_A == 0
    raise Evoc::Exceptions::MeasureCalculationError, "p_A was 0"
  elsif p_B == 0
    raise Evoc::Exceptions::MeasureCalculationError, "p_B was 0"
  elsif p_B > 1 - p_A + p_AB
    # the messages used to interpolate an undefined `p_a`, which raised
    # NameError instead of the intended error
    raise Evoc::Exceptions::MeasureCalculationError, "p_B mismatch with p_A and p_AB, (a #{p_A}, b #{p_B}, ab #{p_AB})"
  elsif p_A > 1 - p_B + p_AB
    raise Evoc::Exceptions::MeasureCalculationError, "p_A mismatch with p_B and p_AB, (a #{p_A}, b #{p_B}, ab #{p_AB})"
  end
  value
end
162
+
163
##
# Collects every probability listed by Evoc::Rule.p_measures, each one
# read through get_p.
#
# @return [Hash] a hash containing the p probabilities of this rule,
#   keyed by probability name
def get_p_values
  Evoc::Rule.p_measures.each_with_object({}) do |probability, collected|
    collected[probability] = get_p(probability)
  end
end
172
+
173
##
# returns the measures which have been instantiated, i.e. the subset of
# the class's measures for which measure_instantiated? holds
def instantiated_measures
  self.class.measures.select(&method(:measure_instantiated?))
end
178
+
179
##
# Returns true if the measure has been instantiated, i.e. the instance
# variable named after the measure currently holds a non-nil value
def measure_instantiated?(measure)
  !instance_variable_get("@#{measure}").nil?
end
184
+
185
+ ##
186
+ # NATIVE AGGREGATED MEASURES
187
+ #
188
+ # these measures are defined natively for aggregated rules
189
+ #
190
+ # e.g.,
191
+ # a -> c
192
+ # b -> c
193
+ # aggregated is: a,b -> c
194
+ #
195
+ # the listed measures are well defined also for the aggregated rule
196
+ #
197
+ # With the exception of the hyper coefficient, all other hyper measures
198
+ # are always recalculated on request.
199
+ # I.e., they use '@m = ..' rather than '@m ||= ..'
200
+ # This way they will overwrite the aggregation that has been calculated on the hyper rule
201
+ # (which is done for the conventional rules)
202
+
203
+
204
##
# Hyper Coefficient
# The number of rules used to form a hyper rule.
# Reports the receiver's +hyper_coefficient+ when it responds to that
# method, and 0 otherwise. The measure is rebuilt on every call
# (plain assignment, no memoization).
M_HYPER_COEFFICIENT_MIN = 0
M_HYPER_COEFFICIENT_MID = 0
M_HYPER_COEFFICIENT_MAX = Float::INFINITY
M_HYPER_COEFFICIENT_HYPER_MEASURE = true
def m_hyper_coefficient
  @m_hyper_coefficient = self.class.initialize_measure(:m_hyper_coefficient) do
    respond_to?(:hyper_coefficient) ? hyper_coefficient : 0
  end
end
220
+
221
+ ##
222
+ # Hyper confidence
223
+ #
224
+ # A confidence like measure that is well defined for all hyper rules
225
+ #
226
+ # "the number of times something in rhs changed with something in lhs, divided by the number of times something in lhs changed"
227
+ ##
228
+ #M_HYPER_CONFIDENCE_MIN = 0
229
+ #M_HYPER_CONFIDENCE_MID = 0
230
+ #M_HYPER_CONFIDENCE_MAX = 1
231
+ #M_HYPER_CONFIDENCE_HYPER_MEASURE = true
232
+ #def m_hyper_confidence
233
+ # @m_hyper_confidence ||= self.class.initialize_measure(__method__) {
234
+ # # hyper confidence is equal to the confidence for non hyper rules
235
+ # if !self.respond_to?(:hyper_confidence)
236
+ # m_confidence.value
237
+ # else
238
+ # raise ArgumentError, "Asked for the hyper confidence of a hyper rule, the value was not initialized, but should have been when creating the hyper rule"
239
+ # end
240
+ # }
241
+ #end
242
+
243
+
244
+ ##
245
+ # INTERESTINGNESS MEASURES
246
+ #
247
+ # if not stated otherwise, all of the implementations are based on
248
+ # Michael Hahslers overview at:
249
+ # http://michael.hahsler.net/research/association_rules/measures.html
250
+ #
251
+ ##
252
+
253
M_SUPPORT_MIN = 0
M_SUPPORT_MID = 0
M_SUPPORT_MAX = 1
# Support: p_AB. Memoized in @m_support.
def m_support
  @m_support ||= self.class.initialize_measure(:m_support) { p_AB }
end
261
+
262
M_CONFIDENCE_MIN = 0
M_CONFIDENCE_MID = 0
M_CONFIDENCE_MAX = 1
# Confidence: p_BgivenA. Memoized in @m_confidence.
def m_confidence
  @m_confidence ||= self.class.initialize_measure(:m_confidence) { p_BgivenA }
end
270
+
271
M_COVERAGE_MIN = 0
M_COVERAGE_MID = 0
M_COVERAGE_MAX = 1
# Coverage: p_A. Memoized in @m_coverage.
def m_coverage
  @m_coverage ||= self.class.initialize_measure(:m_coverage) { p_A }
end
279
+
280
M_PREVALENCE_MIN = 0
M_PREVALENCE_MID = 0
M_PREVALENCE_MAX = 1
# Prevalence: p_B. Memoized in @m_prevalence.
def m_prevalence
  @m_prevalence ||= self.class.initialize_measure(:m_prevalence) { p_B }
end
288
+
289
M_RECALL_MIN = 0
M_RECALL_MID = 0
M_RECALL_MAX = 1
# Recall: p_AgivenB. Memoized in @m_recall.
def m_recall
  @m_recall ||= self.class.initialize_measure(:m_recall) { p_AgivenB }
end
297
+
298
M_SPECIFICITY_MIN = 0
M_SPECIFICITY_MID = 0
M_SPECIFICITY_MAX = 1
# Specificity: p_notBgivennotA. Memoized in @m_specificity.
def m_specificity
  @m_specificity ||= self.class.initialize_measure(:m_specificity) { p_notBgivennotA }
end
306
+
307
M_LIFT_MIN = 0
M_LIFT_MID = 1
M_LIFT_MAX = Float::INFINITY
##
# aka interest
# Lift measures how many times more often X and Y occur
# together than expected if they were statistically independent.
# Computed as p_AB / (p_A * p_B); memoized in @m_lift.
##
def m_lift
  @m_lift ||= self.class.initialize_measure(:m_lift) do
    expected_joint = p_A * p_B
    p_AB / expected_joint
  end
end
320
+
321
M_LEVERAGE_MIN = -1
M_LEVERAGE_MID = 0
M_LEVERAGE_MAX = 1
##
# Leverage compares how often X and Y appear together with what would be
# expected if X and Y were statistically independent.
# Computed here as p_BgivenA - (p_A * p_B); memoized in @m_leverage.
# NOTE(review): this uses P(B|A), not P(AB), as the first term - confirm
# this is the intended variant of leverage.
##
def m_leverage
  @m_leverage ||= self.class.initialize_measure(:m_leverage) do
    expected_joint = p_A * p_B
    p_BgivenA - expected_joint
  end
end
333
+
334
M_PIATETSKY_SHAPIRO_MIN = -0.25
M_PIATETSKY_SHAPIRO_MID = 0
M_PIATETSKY_SHAPIRO_MAX = 0.25
# Piatetsky-Shapiro: p_AB - p_A * p_B. Memoized in @m_piatetsky_shapiro.
def m_piatetsky_shapiro
  @m_piatetsky_shapiro ||= self.class.initialize_measure(:m_piatetsky_shapiro) do
    p_AB - p_A * p_B
  end
end
342
+
343
# aka: pavillion index, centered confidence
# Added value: p_BgivenA - p_B. Memoized in @m_added_value.
M_ADDED_VALUE_MIN = -0.5
M_ADDED_VALUE_MID = 0
M_ADDED_VALUE_MAX = 1
def m_added_value
  @m_added_value ||= self.class.initialize_measure(:m_added_value) { p_BgivenA - p_B }
end
352
+
353
M_CAUSAL_CONFIDENCE_MIN = 0
M_CAUSAL_CONFIDENCE_MID = 0
M_CAUSAL_CONFIDENCE_MAX = 1
# Causal confidence: the mean of p_BgivenA and p_notAgivennotB.
# Memoized in @m_causal_confidence.
def m_causal_confidence
  @m_causal_confidence ||= self.class.initialize_measure(:m_causal_confidence) do
    Rational(1, 2) * (p_BgivenA + p_notAgivennotB)
  end
end
361
+
362
M_CAUSAL_SUPPORT_MIN = 0
M_CAUSAL_SUPPORT_MID = 0
M_CAUSAL_SUPPORT_MAX = 1
# Causal support: p_AB + p_notA_notB. Memoized in @m_causal_support.
def m_causal_support
  @m_causal_support ||= self.class.initialize_measure(:m_causal_support) { p_AB + p_notA_notB }
end
370
+
371
M_DESCRIPTIVE_CONFIRMED_CONFIDENCE_MIN = -1
M_DESCRIPTIVE_CONFIRMED_CONFIDENCE_MID = 0
M_DESCRIPTIVE_CONFIRMED_CONFIDENCE_MAX = 1
# Descriptive confirmed confidence: p_BgivenA - p_notBgivenA.
# Memoized in @m_descriptive_confirmed_confidence.
def m_descriptive_confirmed_confidence
  @m_descriptive_confirmed_confidence ||=
    self.class.initialize_measure(:m_descriptive_confirmed_confidence) { p_BgivenA - p_notBgivenA }
end
379
+
380
M_DIFFERENCE_OF_CONFIDENCE_MIN = -1
M_DIFFERENCE_OF_CONFIDENCE_MID = 0
M_DIFFERENCE_OF_CONFIDENCE_MAX = 1
# Difference of confidence: p_BgivenA - p_BgivennotA.
# Memoized in @m_difference_of_confidence.
def m_difference_of_confidence
  @m_difference_of_confidence ||=
    self.class.initialize_measure(:m_difference_of_confidence) { p_BgivenA - p_BgivennotA }
end
388
+
389
M_RELATIVE_RISK_MIN = 0
M_RELATIVE_RISK_MID = 0
M_RELATIVE_RISK_MAX = Float::INFINITY
# Relative risk: p_BgivenA / p_BgivennotA, or Float::INFINITY when
# p_BgivennotA is 0. Memoized in @m_relative_risk.
def m_relative_risk
  @m_relative_risk ||= self.class.initialize_measure(:m_relative_risk) do
    baseline = p_BgivennotA
    baseline == 0 ? Float::INFINITY : p_BgivenA / baseline
  end
end
401
+
402
M_JACCARD_MIN = 0
M_JACCARD_MID = 0
M_JACCARD_MAX = 1
# Jaccard: p_AB / (p_A + p_B - p_AB). Memoized in @m_jaccard.
def m_jaccard
  @m_jaccard ||= self.class.initialize_measure(:m_jaccard) do
    either = p_A + p_B - p_AB
    p_AB / either
  end
end
410
+
411
+
412
M_IMBALANCE_RATIO_MIN = 0
M_IMBALANCE_RATIO_MID = 0
M_IMBALANCE_RATIO_MAX = 1
##
# IR gauges the degree of imbalance between two events that the lhs and the rhs are contained in a transaction.
# The ratio is close to 0 if the conditional probabilities are similar (i.e., very balanced) and close to 1 if they are very different
#
# Computed as |p_AgivenB - p_BgivenA| / (p_AgivenB + p_BgivenA - p_AgivenB*p_BgivenA);
# 0 when both numerator and denominator are 0. Memoized in @m_imbalance_ratio.
#
# @raise [Evoc::Exceptions::MeasureCalculationError] (from the value block)
#   when the denominator is 0 but the numerator is not
##
def m_imbalance_ratio
  @m_imbalance_ratio ||= self.class.initialize_measure(__method__) {
    numerator = (p_AgivenB - p_BgivenA).abs
    denominator = p_AgivenB + p_BgivenA - p_AgivenB * p_BgivenA
    if denominator != 0
      numerator / denominator
    elsif numerator == 0
      0
    else
      # Must be the Exceptions-namespaced class: that is the one get_measure rescues.
      raise Evoc::Exceptions::MeasureCalculationError, "Numerator was not 0 when denominator was 0 when calculating imbalance ratio"
    end
  }
end
434
+
435
M_ODDS_RATIO_MIN = 0
M_ODDS_RATIO_MID = 1
M_ODDS_RATIO_MAX = Float::INFINITY
##
# The odds of finding X in transactions which contain Y divided by the
# odds of finding X in transactions which do not contain Y.
# Computed as (p_AB * p_notA_notB) / (p_A_notB * p_notA_B);
# Float::INFINITY when that denominator is 0. Memoized in @m_odds_ratio.
##
def m_odds_ratio
  @m_odds_ratio ||= self.class.initialize_measure(:m_odds_ratio) do
    agreeing = p_AB * p_notA_notB
    disagreeing = p_A_notB * p_notA_B
    disagreeing == 0 ? Float::INFINITY : agreeing / disagreeing
  end
end
453
+
454
M_YULES_Q_MIN = -1
M_YULES_Q_MID = 0
M_YULES_Q_MAX = 1
# Yule's Q: (odds_ratio - 1) / (odds_ratio + 1), derived from m_odds_ratio.
# For an infinite odds ratio the sign of the infinity is used
# (-1 for -inf, 1 for +inf). Memoized in @m_yules_q.
#
# @raise [Evoc::Exceptions::MeasureCalculationError] (from the value block)
#   when the odds ratio value is nil
def m_yules_q
  @m_yules_q ||= self.class.initialize_measure(__method__) {
    odds_ratio = m_odds_ratio.value
    if odds_ratio.nil?
      # Must be the Exceptions-namespaced class: that is the one get_measure rescues.
      raise Evoc::Exceptions::MeasureCalculationError, "Odds ratio was nil when calculating yules q"
    elsif odds_ratio.to_f.finite?
      (odds_ratio - 1) / (odds_ratio + 1)
    else
      # Float#infinite? returns -1 or 1 depending on the sign of the infinity
      odds_ratio.to_f.infinite?
    end
  }
end
473
+
474
M_YULES_Y_MIN = -1
M_YULES_Y_MID = 0
M_YULES_Y_MAX = 1
# Yule's Y: (sqrt(odds_ratio) - 1) / (sqrt(odds_ratio) + 1), derived from
# m_odds_ratio; the square root is rationalized before use.
# For an infinite odds ratio the sign of the infinity is used.
# Memoized in @m_yules_y.
#
# @raise [Evoc::Exceptions::MeasureCalculationError] (from the value block)
#   when the odds ratio value is nil
def m_yules_y
  @m_yules_y ||= self.class.initialize_measure(:m_yules_y) do
    odds_ratio = m_odds_ratio.value
    if odds_ratio.nil?
      raise Evoc::Exceptions::MeasureCalculationError, "Odds ratio was nil when calculating yules y"
    elsif odds_ratio.to_f.finite?
      root = Math.sqrt(odds_ratio).rationalize
      (root - 1) / (root + 1)
    else
      odds_ratio.to_f.infinite?
    end
  end
end
491
+
492
# from Tan2004
# Klosgen: sqrt(p_AB) * max(p_BgivenA - p_B, p_AgivenB - p_A), rationalized.
# Memoized in @m_klosgen.
M_KLOSGEN_MIN = -1
M_KLOSGEN_MID = 0
M_KLOSGEN_MAX = 1
def m_klosgen
  @m_klosgen ||= self.class.initialize_measure(:m_klosgen) do
    root_support = Math.sqrt(p_AB)
    strongest_gain = [p_BgivenA - p_B, p_AgivenB - p_A].max
    (root_support * strongest_gain).rationalize
  end
end
501
+
502
M_KULCZYNSKI_MIN = 0
M_KULCZYNSKI_MID = 0
M_KULCZYNSKI_MAX = 1
##
# Calculate the null-invariant Kulczynski measure with a preference for skewed patterns.
# Computed as (p_AB / 2) * (1/p_A + 1/p_B); memoized in @m_kulczynski.
##
def m_kulczynski
  @m_kulczynski ||= self.class.initialize_measure(:m_kulczynski) do
    half_support = p_AB / 2
    inverse_sum = (1 / p_A) + (1 / p_B)
    half_support * inverse_sum
  end
end
513
+
514
M_CONVICTION_MIN = 0
M_CONVICTION_MID = 0
M_CONVICTION_MAX = Float::INFINITY
# Conviction: (p_A * p_notB) / p_A_notB.
# When p_A_notB is 0 the result is 0 if the numerator is 0 as well,
# and Float::INFINITY otherwise. Memoized in @m_conviction.
def m_conviction
  @m_conviction ||= self.class.initialize_measure(:m_conviction) do
    expected = p_A * p_notB
    observed = p_A_notB
    if observed != 0
      expected / observed
    elsif expected == 0
      0
    else
      Float::INFINITY
    end
  end
end
532
+
533
##
# uses 2 coefficients to weight the importance of the two factors
# k : dependency
# m : generality
# Computed as ((p_BgivenA / p_B) ** (k - 1)) * (p_AB ** m) with k = m = 2;
# memoized in @m_interestingness_weighting_dependency.
M_INTERESTINGNESS_WEIGHTING_DEPENDENCY_MIN = 0
M_INTERESTINGNESS_WEIGHTING_DEPENDENCY_MID = 0
M_INTERESTINGNESS_WEIGHTING_DEPENDENCY_MAX = 1
def m_interestingness_weighting_dependency
  dependency_weight = 2
  generality_weight = 2
  @m_interestingness_weighting_dependency ||=
    self.class.initialize_measure(:m_interestingness_weighting_dependency) do
      dependency = (p_BgivenA / p_B)**(dependency_weight - 1)
      generality = p_AB**generality_weight
      dependency * generality
    end
end
547
+
548
M_COLLECTIVE_STRENGTH_MIN = -Float::INFINITY
M_COLLECTIVE_STRENGTH_MID = 1
M_COLLECTIVE_STRENGTH_MAX = Float::INFINITY
# range from Aggarwal1998
# Collective strength: the product of two ratios,
#   (p_AB + p_notBgivennotA) / (p_A*p_B + p_notA*p_notB)
#   (1 - p_A*p_B - p_notA*p_notB) / (1 - p_AB - p_notBgivennotA)
# A zero denominator is converted to a Float before dividing, so the
# division yields a Float result instead of raising ZeroDivisionError.
# Memoized in @m_collective_strength.
def m_collective_strength
  @m_collective_strength ||= self.class.initialize_measure(:m_collective_strength) do
    first_num = p_AB + p_notBgivennotA
    first_den = p_A * p_B + p_notA * p_notB
    first_ratio = first_den == 0 ? first_num / first_den.to_f : first_num / first_den

    second_num = 1 - p_A * p_B - p_notA * p_notB
    second_den = 1 - p_AB - p_notBgivennotA
    second_ratio = second_den == 0 ? second_num / second_den.to_f : second_num / second_den

    first_ratio * second_ratio
  end
end
563
+
564
M_GINI_INDEX_MIN = 0
M_GINI_INDEX_MID = 0
M_GINI_INDEX_MAX = 1
##
# Measures quadratic entropy
#
# Computed as
#   p_A    * (p_BgivenA**2    + p_notBgivenA**2)
# + p_notA * (p_BgivennotA**2 + p_notBgivennotA**2)
# - p_B**2 - p_notB**2
# Memoized in @m_gini_index.
##
def m_gini_index
  @m_gini_index ||= self.class.initialize_measure(:m_gini_index) do
    with_a = p_A * ((p_BgivenA**2) + (p_notBgivenA**2))
    without_a = p_notA * ((p_BgivennotA**2) + (p_notBgivennotA**2))
    with_a + without_a - p_B**2 - p_notB**2
  end
end
576
+
577
M_KAPPA_MIN = -1
M_KAPPA_MID = 0
M_KAPPA_MAX = 1
# Kappa: (p_AB + p_notA_notB - p_A*p_B - p_notA*p_notB)
#      / (1 - p_A*p_B - p_notA*p_notB)
# A zero denominator is converted to a Float before dividing, so the
# division yields a Float result instead of raising ZeroDivisionError.
# Memoized in @m_kappa.
def m_kappa
  @m_kappa ||= self.class.initialize_measure(:m_kappa) do
    numerator = p_AB + p_notA_notB - p_A * p_B - p_notA * p_notB
    denominator = 1 - p_A * p_B - p_notA * p_notB
    if denominator == 0
      numerator / denominator.to_f
    else
      numerator / denominator
    end
  end
end
587
+
588
M_J_MEASURE_MIN = 0
M_J_MEASURE_MID = 0
M_J_MEASURE_MAX = 1
##
# Measures cross entropy
#
# Computed as p_AB * ln(p_BgivenA / p_B) + p_A_notB * ln(p_notBgivenA / p_notB),
# where a term counts as 0 when its weight (p_AB resp. p_A_notB) is 0.
# Both logarithms are evaluated up front, on Float ratios.
# Memoized in @m_j_measure.
##
def m_j_measure
  @m_j_measure ||= self.class.initialize_measure(:m_j_measure) do
    log_b = Math.log(p_BgivenA / p_B.to_f)
    log_not_b = Math.log(p_notBgivenA / p_notB.to_f)
    b_term = p_AB == 0 ? 0 : p_AB * log_b
    not_b_term = p_A_notB == 0 ? 0 : p_A_notB * log_not_b

    b_term + not_b_term
  end
end
605
+
606
M_ONE_WAY_SUPPORT_MIN = -1
M_ONE_WAY_SUPPORT_MID = 0
M_ONE_WAY_SUPPORT_MAX = Float::INFINITY
# One way support: p_BgivenA * log2(p_AB / (p_A * p_B)), with the logarithm
# rationalized; 0 when p_BgivenA is 0. Memoized in @m_one_way_support.
def m_one_way_support
  @m_one_way_support ||= self.class.initialize_measure(:m_one_way_support) do
    confidence = p_BgivenA
    if confidence == 0
      0
    else
      log_lift = Math.log2(p_AB / (p_A * p_B)).rationalize
      confidence * log_lift
    end
  end
end
618
+
619
M_TWO_WAY_SUPPORT_MIN = -1
M_TWO_WAY_SUPPORT_MID = 0
M_TWO_WAY_SUPPORT_MAX = 1
# Two way support: p_AB * log2(p_AB / (p_A * p_B)), with the logarithm
# rationalized; 0 when p_AB is 0. Memoized in @m_two_way_support.
def m_two_way_support
  @m_two_way_support ||= self.class.initialize_measure(:m_two_way_support) do
    support = p_AB
    if support == 0
      0
    else
      log_lift = Math.log2(support / (p_A * p_B)).rationalize
      support * log_lift
    end
  end
end
631
+
632
# aka Ø-coefficient
# Computed as (p_AB - p_A*p_B) / sqrt(p_A * p_B * p_notA * p_notB).
# The square root is rationalized before dividing, unless it is 0, in which
# case the division is done on Floats so no ZeroDivisionError is raised.
# Memoized in @m_linear_correlation_coefficient.
M_LINEAR_CORRELATION_COEFFICIENT_MIN = -1
M_LINEAR_CORRELATION_COEFFICIENT_MID = 0
M_LINEAR_CORRELATION_COEFFICIENT_MAX = 1
def m_linear_correlation_coefficient
  @m_linear_correlation_coefficient ||=
    self.class.initialize_measure(:m_linear_correlation_coefficient) do
      numerator = p_AB - (p_A * p_B)
      denominator = Math.sqrt(p_A * p_B * p_notA * p_notB)
      if denominator == 0
        numerator / denominator.to_f
      else
        numerator / denominator.rationalize
      end
    end
end
643
+
644
M_COSINE_MIN = 0
M_COSINE_MID = 0
M_COSINE_MAX = 1
# Cosine: p_AB / sqrt(p_A * p_B), with the square root rationalized.
# Memoized in @m_cosine.
#
# @raise [Evoc::Exceptions::MeasureCalculationError] (from the value block)
#   when the denominator is 0
def m_cosine
  @m_cosine ||= self.class.initialize_measure(__method__) {
    num = p_AB
    den = Math.sqrt(p_A * p_B).rationalize
    if den == 0
      # the message used to interpolate an undefined `p_a`, which raised
      # NameError instead of the intended error
      raise Evoc::Exceptions::MeasureCalculationError, "Denominator became 0 when calculating cosine (a #{p_A}, b #{p_B}, ab #{p_AB})"
    else
      num / den
    end
  }
end
658
+
659
M_LOEVINGER_MIN = -1
M_LOEVINGER_MID = 0
M_LOEVINGER_MAX = 1
##
# aka Certainty Factor
# The certainty factor is a measure of variation of the probability that
# Y is in a transaction when only considering transactions with X.
# An increasing CF means a decrease of the probability that Y is not in
# a transaction that X is in. Negative CFs have a similar interpretation
#
# Computed as 1 - p_A_notB / (p_A * p_notB); 1 when p_A * p_notB is 0.
# Memoized in @m_loevinger.
##
def m_loevinger
  @m_loevinger ||= self.class.initialize_measure(:m_loevinger) do
    expected_counterexamples = p_A * p_notB
    if expected_counterexamples == 0
      1
    else
      1 - (p_A_notB / expected_counterexamples)
    end
  end
end
679
+
680
M_SEBAG_SCHOENAUER_MIN = 0
M_SEBAG_SCHOENAUER_MID = 0
M_SEBAG_SCHOENAUER_MAX = Float::INFINITY
# Sebag-Schoenauer: p_AB / p_A_notB, or Float::INFINITY when p_A_notB is 0.
# Memoized in @m_sebag_schoenauer.
def m_sebag_schoenauer
  @m_sebag_schoenauer ||= self.class.initialize_measure(:m_sebag_schoenauer) do
    counterexamples = p_A_notB
    counterexamples == 0 ? Float::INFINITY : p_AB / counterexamples
  end
end
692
+
693
M_VARYING_RATES_LIAISON_MIN = -1
M_VARYING_RATES_LIAISON_MID = 0
M_VARYING_RATES_LIAISON_MAX = Float::INFINITY
# Varying rates liaison: p_AB / (p_A * p_B) - 1.
# Memoized in @m_varying_rates_liaison.
def m_varying_rates_liaison
  @m_varying_rates_liaison ||= self.class.initialize_measure(:m_varying_rates_liaison) do
    ratio = p_AB / (p_A * p_B)
    ratio - 1
  end
end
701
+
702
M_LEAST_CONTRADICTION_MIN = -Float::INFINITY
M_LEAST_CONTRADICTION_MID = 0
M_LEAST_CONTRADICTION_MAX = 1
# Least contradiction: (p_AB - p_A_notB) / p_B.
# Memoized in @m_least_contradiction.
def m_least_contradiction
  @m_least_contradiction ||= self.class.initialize_measure(:m_least_contradiction) do
    surplus = p_AB - p_A_notB
    surplus / p_B
  end
end
710
+
711
M_ODD_MULTIPLIER_MIN = 0
M_ODD_MULTIPLIER_MID = 0
M_ODD_MULTIPLIER_MAX = Float::INFINITY
# Odd multiplier: (p_AB * p_notB) / (p_B * p_A_notB).
# When the denominator is 0 the result is 0 if the numerator is 0 as well,
# and Float::INFINITY otherwise. Memoized in @m_odd_multiplier.
def m_odd_multiplier
  @m_odd_multiplier ||= self.class.initialize_measure(:m_odd_multiplier) do
    top = p_AB * p_notB
    bottom = p_B * p_A_notB
    if bottom != 0
      top / bottom
    elsif top == 0
      0
    else
      Float::INFINITY
    end
  end
end
729
+
730
M_EXAMPLE_AND_COUNTEREXAMPLE_RATE_MIN = -Float::INFINITY
M_EXAMPLE_AND_COUNTEREXAMPLE_RATE_MID = 0
M_EXAMPLE_AND_COUNTEREXAMPLE_RATE_MAX = 1
# 0 when equally many examples as counter examples
# Computed as (p_AB - p_A_notB) / p_AB, or -Float::INFINITY when p_AB is 0.
# Memoized in @m_example_and_counterexample_rate.
def m_example_and_counterexample_rate
  @m_example_and_counterexample_rate ||=
    self.class.initialize_measure(:m_example_and_counterexample_rate) do
      examples = p_AB
      examples == 0 ? -Float::INFINITY : (examples - p_A_notB) / examples
    end
end
745
+
746
M_ZHANG_MIN = -1
M_ZHANG_MID = 0
M_ZHANG_MAX = 1
# Zhang: (p_AB - p_A*p_B) / max(p_AB*p_notB, p_B*p_A_notB),
# or 0 when that maximum is 0. Memoized in @m_zhang.
def m_zhang
  @m_zhang ||= self.class.initialize_measure(:m_zhang) do
    deviation = p_AB - p_A * p_B
    scale = [p_AB * p_notB, p_B * p_A_notB].max
    scale == 0 ? 0 : deviation / scale
  end
end
760
+
761
M_LAPLACE_CORRECTED_CONFIDENCE_MIN = 0
M_LAPLACE_CORRECTED_CONFIDENCE_MID = 0
M_LAPLACE_CORRECTED_CONFIDENCE_MAX = 1
##
# Corrected confidence estimate decreases with lower support
# to account for estimation uncertainty with low counts.
#
# Computed on transaction counts as (n_AB + 1) / (n_A + 2), where
# n_AB = p_AB * n and n_A = p_A * n; the 2 is the number of classes
# (B present / B absent). The correction has to be applied to counts of
# the antecedent A: applied to the probabilities, and to p_B, the +1/+2
# terms dominate the result and it no longer approximates confidence.
# Memoized in @m_laplace_corrected_confidence.
##
def m_laplace_corrected_confidence
  @m_laplace_corrected_confidence ||= self.class.initialize_measure(__method__) {
    joint_count = p_AB * n
    antecedent_count = p_A * n
    (joint_count + 1) / (antecedent_count + 2)
  }
end
773
+
774
+ ##
775
+ # building blocks for interestingness measures
776
+ #
777
+ # A refers to the antecedent of a rule
778
+ # B refers to the consequent of a rule
779
+
780
##
# the number of transactions
# n is converted into the specified type (VALUE_TYPE) to ensure
# that the type is used throughout calculations.
# The conversion calls the method named after VALUE_TYPE (e.g. Rational(...))
# and the result is memoized in @n.
def n
  return @n if @n

  @n = VALUE_TYPE.send(VALUE_TYPE.to_s, tx_store.size)
end
787
+
788
##
# the ratio of tx with A as a subset:
# size of tx_store.transactions_of_list(lhs, strict: true) divided by n.
# Memoized in @p_A.
def p_A
  return @p_A if @p_A

  matching = tx_store.transactions_of_list(lhs, strict: true)
  @p_A = matching.size / n
end
793
+
794
##
# the ratio of tx where A is not a subset: 1 - p_A.
# Memoized in @p_notA.
def p_notA
  return @p_notA if @p_notA

  @p_notA = 1 - p_A
end
799
+
800
##
# the ratio of tx with B as a subset:
# size of tx_store.transactions_of_list(rhs, strict: true) divided by n.
# Memoized in @p_B.
def p_B
  return @p_B if @p_B

  matching = tx_store.transactions_of_list(rhs, strict: true)
  @p_B = matching.size / n
end
805
+
806
##
# the ratio of tx where B is not a subset: 1 - p_B.
# Memoized in @p_notB.
def p_notB
  return @p_notB if @p_notB

  @p_notB = 1 - p_B
end
811
+
812
##
# the ratio of tx with the union of A and B as a subset:
# size of tx_store.transactions_of_list(lhs | rhs, strict: true) divided by n.
# Memoized in @p_AB.
def p_AB
  return @p_AB if @p_AB

  all_items = lhs | rhs
  @p_AB = tx_store.transactions_of_list(all_items, strict: true).size / n
end
817
+
818
##
# the ratio of tx where A or B is a subset: p_A + p_B - p_AB.
# Memoized in @p_AorB.
def p_AorB
  return @p_AorB if @p_AorB

  @p_AorB = p_A + p_B - p_AB
end
823
+
824
##
# the ratio of tx where neither A nor B is a subset:
# 1 - (p_A + p_B) + p_AB. Memoized in @p_notA_notB.
def p_notA_notB
  return @p_notA_notB if @p_notA_notB

  @p_notA_notB = 1 - (p_A + p_B) + p_AB
end
829
+
830
##
# the ratio of tx where A is not a subset but B is: p_B - p_AB.
# Memoized in @p_notA_B.
def p_notA_B
  return @p_notA_B if @p_notA_B

  @p_notA_B = p_B - p_AB
end
835
+
836
##
# the ratio of tx where A is a subset but B is not: p_A - p_AB.
# Memoized in @p_A_notB.
def p_A_notB
  return @p_A_notB if @p_A_notB

  @p_A_notB = p_A - p_AB
end
841
+
842
##
# the ratio of the union being a subset to the number of txes where B is a subset:
# p_AB / p_B, or 0 when p_B is 0. Memoized in @p_AgivenB.
def p_AgivenB
  @p_AgivenB ||=
    if p_B == 0
      0
    else
      p_AB / p_B
    end
end
847
+
848
##
# the ratio of the union being a subset to the number of txes where A is a subset:
# p_AB / p_A, or 0 when p_A is 0. Memoized in @p_BgivenA.
def p_BgivenA
  @p_BgivenA ||=
    if p_A == 0
      0
    else
      p_AB / p_A
    end
end
853
+
854
# Ratio of tx without A among the tx with B:
# p_notA_B / p_B, or 0 when p_B is 0. Memoized in @p_notAgivenB.
def p_notAgivenB
  @p_notAgivenB ||=
    if p_B == 0
      0
    else
      p_notA_B / p_B
    end
end
857
+
858
# Ratio of tx without B among the tx with A:
# p_A_notB / p_A, or 0 when p_A is 0 (if A is 0, so is A,notB).
# Memoized in @p_notBgivenA.
def p_notBgivenA
  @p_notBgivenA ||=
    if p_A == 0
      0
    else
      p_A_notB / p_A
    end
end
862
+
863
# Ratio of tx with A among the tx without B:
# p_A_notB / p_notB, or 0 when p_notB is 0 (if notB is 0, so is A_notB).
# Memoized in @p_AgivennotB.
def p_AgivennotB
  @p_AgivennotB ||=
    if p_notB == 0
      0
    else
      p_A_notB / p_notB
    end
end
867
+
868
# Ratio of tx with B among the tx without A:
# p_notA_B / p_notA, or 0 when p_notA is 0 (if notA is 0, so is notA_B).
# Memoized in @p_BgivennotA.
def p_BgivennotA
  @p_BgivennotA ||=
    if p_notA == 0
      0
    else
      p_notA_B / p_notA
    end
end
872
+
873
# Ratio of tx without B among the tx without A:
# p_notA_notB / p_notA, or 0 when p_notA is 0.
# Memoized in @p_notBgivennotA.
def p_notBgivennotA
  @p_notBgivennotA ||=
    if p_notA == 0
      0
    else
      p_notA_notB / p_notA
    end
end
876
+
877
# Ratio of tx without A among the tx without B:
# p_notA_notB / p_notB, or 0 when p_notB is 0
# (if notB is 0, so is notA and notB). Memoized in @p_notAgivennotB.
def p_notAgivennotB
  @p_notAgivennotB ||=
    if p_notB == 0
      0
    else
      p_notA_notB / p_notB
    end
end
881
+ end
882
+ end