evoc 3.5.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +15 -0
- data/.rspec +2 -0
- data/.travis.yml +4 -0
- data/Gemfile +4 -0
- data/LICENSE.txt +21 -0
- data/Makefile +4 -0
- data/README.md +61 -0
- data/Rakefile +6 -0
- data/bin/console +14 -0
- data/bin/evoc +3 -0
- data/bin/setup +7 -0
- data/evoc.gemspec +30 -0
- data/lib/evoc/algorithm.rb +147 -0
- data/lib/evoc/algorithms/top_k.rb +86 -0
- data/lib/evoc/analyze.rb +395 -0
- data/lib/evoc/array.rb +43 -0
- data/lib/evoc/evaluate.rb +109 -0
- data/lib/evoc/exceptions/aggregation_error.rb +6 -0
- data/lib/evoc/exceptions/expectedoutcome_nil_or_empty.rb +6 -0
- data/lib/evoc/exceptions/measure_calculation_error.rb +6 -0
- data/lib/evoc/exceptions/no_changed_items_in_changes.rb +6 -0
- data/lib/evoc/exceptions/no_changes_in_json_object.rb +6 -0
- data/lib/evoc/exceptions/no_date_in_json_object.rb +6 -0
- data/lib/evoc/exceptions/no_result.rb +6 -0
- data/lib/evoc/exceptions/non_finite.rb +8 -0
- data/lib/evoc/exceptions/non_numeric.rb +8 -0
- data/lib/evoc/exceptions/not_a_query.rb +6 -0
- data/lib/evoc/exceptions/not_a_result.rb +6 -0
- data/lib/evoc/exceptions/not_a_transaction.rb +6 -0
- data/lib/evoc/exceptions/not_initialized.rb +6 -0
- data/lib/evoc/exceptions/only_nil_in_changes.rb +6 -0
- data/lib/evoc/exceptions/query_nil_or_empty.rb +6 -0
- data/lib/evoc/exceptions/unable_to_convert_json_to_tx.rb +6 -0
- data/lib/evoc/experiment.rb +239 -0
- data/lib/evoc/hash.rb +56 -0
- data/lib/evoc/history_store.rb +53 -0
- data/lib/evoc/hyper_rule.rb +53 -0
- data/lib/evoc/interestingness_measure.rb +77 -0
- data/lib/evoc/interestingness_measure_aggregator.rb +147 -0
- data/lib/evoc/interestingness_measures.rb +882 -0
- data/lib/evoc/logger.rb +34 -0
- data/lib/evoc/memory_profiler.rb +43 -0
- data/lib/evoc/recommendation_cache.rb +152 -0
- data/lib/evoc/rule.rb +32 -0
- data/lib/evoc/rule_store.rb +340 -0
- data/lib/evoc/scenario.rb +303 -0
- data/lib/evoc/svd.rb +124 -0
- data/lib/evoc/tx.rb +34 -0
- data/lib/evoc/tx_store.rb +379 -0
- data/lib/evoc/version.rb +3 -0
- data/lib/evoc.rb +4 -0
- data/lib/evoc_cli/analyze.rb +198 -0
- data/lib/evoc_cli/cli_helper.rb +1 -0
- data/lib/evoc_cli/experiment.rb +78 -0
- data/lib/evoc_cli/info.rb +22 -0
- data/lib/evoc_cli/main.rb +29 -0
- data/lib/evoc_cli/util.rb +36 -0
- data/lib/evoc_helper.rb +40 -0
- data/mem_profiler/Gemfile.lock +39 -0
- data/mem_profiler/README.md +126 -0
- data/mem_profiler/createdb.rb +4 -0
- data/mem_profiler/db.rb +82 -0
- data/mem_profiler/gemfile +6 -0
- data/mem_profiler/gencsv.rb +64 -0
- data/mem_profiler/genimport.sh +8 -0
- data/mem_profiler/graph.rb +91 -0
- metadata +251 -0
@@ -0,0 +1,882 @@
|
|
1
|
+
module Evoc
  ##
  # Mixin that computes association-rule interestingness measures for a rule
  # "A -> B" (A = lhs / antecedent, B = rhs / consequent).
  #
  # The including class must provide +tx_store+, +lhs+, +rhs+ and +name+.
  # Including the module also extends the class with ClassMethods.
  #
  # Naming conventions used throughout:
  # * +m_<name>+ methods return a (memoized) Evoc::InterestingnessMeasure
  # * +M_<NAME>_MIN+ / +_MID+ / +_MAX+ constants give the range of a measure
  # * +M_<NAME>_HYPER_MEASURE+ marks a measure as natively defined for hyper rules
  # * +p_<event>+ methods return (memoized) probabilities estimated from +tx_store+
  module InterestingnessMeasures
    # Numeric type used for the transaction count, so that the derived
    # probabilities are computed in that type as well.
    VALUE_TYPE = Rational
    # USE Rational(Math.log(1/3)).rationalize(0.001)

    #
    # methods that must be implemented in the class that uses this module as a mixin
    #

    # @raise [NotImplementedError] unless overridden by the including class
    def tx_store
      raise NotImplementedError, "tx_store has not been implemented on the current class"
    end

    # @raise [NotImplementedError] unless overridden by the including class
    def lhs
      raise NotImplementedError, "lhs has not been implemented on the current class"
    end

    # @raise [NotImplementedError] unless overridden by the including class
    def rhs
      raise NotImplementedError, "rhs has not been implemented on the current class"
    end

    # @raise [NotImplementedError] unless overridden by the including class
    def name
      raise NotImplementedError, "name has not been implemented on the current class"
    end

    ##
    # class methods
    #
    # the following is an idiom/hack that also adds class methods when a class includes this module
    # (normally one would use 'extend')
    ##
    def self.included(base)
      base.extend(ClassMethods)
    end

    # @param measure [Symbol, String] measure name, e.g. :m_lift
    # @return the value of the measure's +_MIN+ constant
    def self.get_min(measure)
      const_get("#{measure.to_s.upcase}_MIN")
    end

    # @param measure [Symbol, String] measure name, e.g. :m_lift
    # @return the value of the measure's +_MAX+ constant
    def self.get_max(measure)
      const_get("#{measure.to_s.upcase}_MAX")
    end

    # @param measure [Symbol, String] measure name, e.g. :m_lift
    # @return the value of the measure's +_MID+ constant
    def self.get_mid(measure)
      const_get("#{measure.to_s.upcase}_MID")
    end

    # @return [Array<Symbol>] names of all instance methods starting with "m_"
    def self.measures
      instance_methods.grep(/\Am_(.*)/)
    end

    # @return [Array<Symbol>] the measures for which is_hyper_measure? is truthy
    def self.hyper_measures
      measures.select { |m| is_hyper_measure?(m) }
    end

    # @param m [Symbol, String] measure name
    # @return the value of the +<M>_HYPER_MEASURE+ constant, or false when
    #   no such constant is defined
    def self.is_hyper_measure?(m)
      const_get("#{m.to_s.upcase}_HYPER_MEASURE")
    rescue NameError
      false
    end

    # Class-level helpers added to every class that includes InterestingnessMeasures.
    module ClassMethods
      ##
      # Builds an Evoc::InterestingnessMeasure for +measure+, using the
      # MIN/MID/MAX constants of that measure.
      #
      # When a block is given, a block that yields to it is handed on to the
      # constructor; without a block the constructor is called without one.
      #
      # @param measure [Symbol, String] measure name, e.g. :m_lift
      # @param hyper_measure [Boolean] forwarded to the constructor
      def initialize_measure(measure, hyper_measure: false)
        attributes = {
          type: measure,
          min: get_min(measure),
          mid: get_mid(measure),
          max: get_max(measure),
          hyper_measure: hyper_measure
        }
        if block_given?
          Evoc::InterestingnessMeasure.new(**attributes) { yield }
        else
          Evoc::InterestingnessMeasure.new(**attributes)
        end
      end

      ##
      # returns the list of interestingness measures which are implemented
      def measures
        instance_methods.grep(/\Am_(.*)/)
      end

      # @return [Array<Symbol>] names of all instance methods starting with "p_"
      def p_measures
        instance_methods.grep(/\Ap_(.*)/)
      end

      # @return [Array] 'lhs', 'rhs' followed by the measure method names
      def csv_header
        ['lhs', 'rhs'] + measures
      end

      # @return [Array<String>] 'lhs', 'rhs' followed by the measure names
      #   with the leading "m_" removed
      def pretty_csv_header
        # anchor the pattern so that only the prefix is stripped
        ['lhs', 'rhs'] + measures.map { |m| m.to_s.sub(/\Am_/, '') }
      end

      # @return the value of the measure's +_MIN+ constant
      def get_min(measure)
        const_get("#{measure.to_s.upcase}_MIN")
      end

      # @return the value of the measure's +_MAX+ constant
      def get_max(measure)
        const_get("#{measure.to_s.upcase}_MAX")
      end

      # @return the value of the measure's +_MID+ constant
      def get_mid(measure)
        const_get("#{measure.to_s.upcase}_MID")
      end
    end

    # @return [Array] lhs and rhs (each joined with ','), followed by the
    #   values of all measures that have been instantiated
    def to_a
      [lhs.join(','), rhs.join(',')] + instantiated_measures.map { |m| get_measure(m).value }
    end

    ##
    # a common getter for all measures
    #
    # Calls the measure method and returns its (truthy) result.
    # If the call raises a MeasureCalculationError, a warning is logged, the
    # measure is replaced by one initialized without a value block, and the
    # measure method is called again.
    #
    # @param measure [Symbol, String] name of the measure method
    # @raise [NotImplementedError] if the measure method returns nil/false
    def get_measure(measure)
      m = method(measure).call
      raise NotImplementedError, "#{measure} not implemented" unless m

      m
    rescue Evoc::Exceptions::MeasureCalculationError => e
      logger.warn "#{measure} was undefined for #{self.name} on the current history, error: #{e}"
      fallback = self.class.initialize_measure(measure)
      instance_variable_set("@#{measure}", fallback)
      method(measure).call
    end

    ##
    # manually set the value of a measure
    #
    # @param measure [Symbol, String] name of the measure method
    # @param value the value returned from the measure's value block
    # @param hyper_measure [Boolean] forwarded to initialize_measure
    def set_measure(measure, value, hyper_measure: false)
      m = self.class.initialize_measure(measure, hyper_measure: hyper_measure) { value }
      instance_variable_set("@#{measure}", m)
    end

    ##
    # manually set the probability p of this rule
    def set_p(p, value)
      instance_variable_set("@#{p}", value)
    end

    ##
    # Returns the requested probability after checking that p_A, p_B and
    # p_AB are consistent with each other.
    #
    # @param p [String, Symbol] the p probability to get
    # @raise [Evoc::Exceptions::MeasureCalculationError] if p_A, p_B and p_AB are inconsistent
    def get_p(p)
      value = method(p).call
      if p_A < p_AB
        raise Evoc::Exceptions::MeasureCalculationError, "p_A was smaller than p_AB"
      elsif p_B < p_AB
        raise Evoc::Exceptions::MeasureCalculationError, "p_B was smaller than p_AB"
      elsif p_A == 0
        raise Evoc::Exceptions::MeasureCalculationError, "p_A was 0"
      elsif p_B == 0
        raise Evoc::Exceptions::MeasureCalculationError, "p_B was 0"
      elsif p_B > 1 - p_A + p_AB
        raise Evoc::Exceptions::MeasureCalculationError, "p_B mismatch with p_A and p_AB, (a #{p_A}, b #{p_B}, ab #{p_AB})"
      elsif p_A > 1 - p_B + p_AB
        raise Evoc::Exceptions::MeasureCalculationError, "p_A mismatch with p_B and p_AB, (a #{p_A}, b #{p_B}, ab #{p_AB})"
      end
      value
    end

    ##
    # @return [Hash] a hash containing the p probabilities of this rule,
    #   keyed by the names listed by Evoc::Rule.p_measures
    def get_p_values
      Evoc::Rule.p_measures.each_with_object({}) do |p, values|
        values[p] = get_p(p)
      end
    end

    ##
    # returns the measures which have been instantiated
    def instantiated_measures
      self.class.measures.select { |m| measure_instantiated?(m) }
    end

    ##
    # Returns true if the measure has been instantiated,
    # i.e. the instance variable named after the measure is not nil
    def measure_instantiated?(measure)
      !instance_variable_get("@#{measure}").nil?
    end

    ##
    # NATIVE AGGREGATED MEASURES
    #
    # these measures are defined natively for aggregated rules
    #
    # e.g.,
    #   a -> c
    #   b -> c
    #   aggregated is: a,b -> c
    #
    # the listed measures are well defined also for the aggregated rule
    #
    # Hyper measures are recalculated on every request,
    # i.e., they use '@m = ..' rather than '@m ||= ..'
    # This way they overwrite the aggregation that has been calculated on the hyper rule
    # (which is done for the conventional rules)

    ##
    # Hyper Coefficient
    # The number of rules used to form a hyper rule
    M_HYPER_COEFFICIENT_MIN = 0
    M_HYPER_COEFFICIENT_MID = 0
    M_HYPER_COEFFICIENT_MAX = Float::INFINITY
    M_HYPER_COEFFICIENT_HYPER_MEASURE = true
    # Uses +hyper_coefficient+ when the receiver responds to it, otherwise 0.
    # Not memoized with '||=': the measure is rebuilt on each call.
    def m_hyper_coefficient
      @m_hyper_coefficient = self.class.initialize_measure(__method__) {
        respond_to?(:hyper_coefficient) ? hyper_coefficient : 0
      }
    end

    ##
    # Hyper confidence
    #
    # A confidence like measure that is well defined for all hyper rules
    #
    # "the number of times something in rhs changed with something in lhs, divided by the number of times something in lhs changed"
    ##
    #M_HYPER_CONFIDENCE_MIN = 0
    #M_HYPER_CONFIDENCE_MID = 0
    #M_HYPER_CONFIDENCE_MAX = 1
    #M_HYPER_CONFIDENCE_HYPER_MEASURE = true
    #def m_hyper_confidence
    #  @m_hyper_confidence ||= self.class.initialize_measure(__method__) {
    #    # hyper confidence is equal to the confidence for non hyper rules
    #    if !self.respond_to?(:hyper_confidence)
    #      m_confidence.value
    #    else
    #      raise ArgumentError, "Asked for the hyper confidence of a hyper rule, the value was not initialized, but should have been when creating the hyper rule"
    #    end
    #  }
    #end

    ##
    # INTERESTINGNESS MEASURES
    #
    # if not stated otherwise, all of the implementations are based on
    # Michael Hahsler's overview at:
    # http://michael.hahsler.net/research/association_rules/measures.html
    #
    ##

    M_SUPPORT_MIN = 0
    M_SUPPORT_MID = 0
    M_SUPPORT_MAX = 1
    # P(AB)
    def m_support
      @m_support ||= self.class.initialize_measure(__method__) {
        p_AB
      }
    end

    M_CONFIDENCE_MIN = 0
    M_CONFIDENCE_MID = 0
    M_CONFIDENCE_MAX = 1
    # P(B|A)
    def m_confidence
      @m_confidence ||= self.class.initialize_measure(__method__) {
        p_BgivenA
      }
    end

    M_COVERAGE_MIN = 0
    M_COVERAGE_MID = 0
    M_COVERAGE_MAX = 1
    # P(A)
    def m_coverage
      @m_coverage ||= self.class.initialize_measure(__method__) {
        p_A
      }
    end

    M_PREVALENCE_MIN = 0
    M_PREVALENCE_MID = 0
    M_PREVALENCE_MAX = 1
    # P(B)
    def m_prevalence
      @m_prevalence ||= self.class.initialize_measure(__method__) {
        p_B
      }
    end

    M_RECALL_MIN = 0
    M_RECALL_MID = 0
    M_RECALL_MAX = 1
    # P(A|B)
    def m_recall
      @m_recall ||= self.class.initialize_measure(__method__) {
        p_AgivenB
      }
    end

    M_SPECIFICITY_MIN = 0
    M_SPECIFICITY_MID = 0
    M_SPECIFICITY_MAX = 1
    # P(notB|notA)
    def m_specificity
      @m_specificity ||= self.class.initialize_measure(__method__) {
        p_notBgivennotA
      }
    end

    M_LIFT_MIN = 0
    M_LIFT_MID = 1
    M_LIFT_MAX = Float::INFINITY
    ##
    # aka interest
    # Lift measures how many times more often X and Y occur
    # together than expected if they were statistically independent
    #
    # P(AB) / (P(A) * P(B))
    ##
    def m_lift
      @m_lift ||= self.class.initialize_measure(__method__) {
        p_AB / (p_A * p_B)
      }
    end

    M_LEVERAGE_MIN = -1
    M_LEVERAGE_MID = 0
    M_LEVERAGE_MAX = 1
    ##
    # Leverage measures the difference between X and Y appearing together
    # in the data set and what would be expected if X and Y were statistically independent
    #
    # Computed here as P(B|A) - P(A) * P(B).
    # NOTE(review): Hahsler's overview defines leverage as P(AB) - P(A)P(B)
    # (which is what m_piatetsky_shapiro computes) - confirm that the
    # conditional form is intended.
    ##
    def m_leverage
      @m_leverage ||= self.class.initialize_measure(__method__) {
        p_BgivenA - (p_A * p_B)
      }
    end

    M_PIATETSKY_SHAPIRO_MIN = -0.25
    M_PIATETSKY_SHAPIRO_MID = 0
    M_PIATETSKY_SHAPIRO_MAX = 0.25
    # P(AB) - P(A) * P(B)
    def m_piatetsky_shapiro
      @m_piatetsky_shapiro ||= self.class.initialize_measure(__method__) {
        p_AB - p_A * p_B
      }
    end

    # aka: pavillion index, centered confidence
    M_ADDED_VALUE_MIN = -0.5
    M_ADDED_VALUE_MID = 0
    M_ADDED_VALUE_MAX = 1
    # P(B|A) - P(B)
    def m_added_value
      @m_added_value ||= self.class.initialize_measure(__method__) {
        p_BgivenA - p_B
      }
    end

    M_CAUSAL_CONFIDENCE_MIN = 0
    M_CAUSAL_CONFIDENCE_MID = 0
    M_CAUSAL_CONFIDENCE_MAX = 1
    # 1/2 * (P(B|A) + P(notA|notB))
    def m_causal_confidence
      @m_causal_confidence ||= self.class.initialize_measure(__method__) {
        Rational(1, 2) * (p_BgivenA + p_notAgivennotB)
      }
    end

    M_CAUSAL_SUPPORT_MIN = 0
    M_CAUSAL_SUPPORT_MID = 0
    M_CAUSAL_SUPPORT_MAX = 1
    # P(AB) + P(notA,notB)
    def m_causal_support
      @m_causal_support ||= self.class.initialize_measure(__method__) {
        p_AB + p_notA_notB
      }
    end

    M_DESCRIPTIVE_CONFIRMED_CONFIDENCE_MIN = -1
    M_DESCRIPTIVE_CONFIRMED_CONFIDENCE_MID = 0
    M_DESCRIPTIVE_CONFIRMED_CONFIDENCE_MAX = 1
    # P(B|A) - P(notB|A)
    def m_descriptive_confirmed_confidence
      @m_descriptive_confirmed_confidence ||= self.class.initialize_measure(__method__) {
        p_BgivenA - p_notBgivenA
      }
    end

    M_DIFFERENCE_OF_CONFIDENCE_MIN = -1
    M_DIFFERENCE_OF_CONFIDENCE_MID = 0
    M_DIFFERENCE_OF_CONFIDENCE_MAX = 1
    # P(B|A) - P(B|notA)
    def m_difference_of_confidence
      @m_difference_of_confidence ||= self.class.initialize_measure(__method__) {
        p_BgivenA - p_BgivennotA
      }
    end

    M_RELATIVE_RISK_MIN = 0
    M_RELATIVE_RISK_MID = 0
    M_RELATIVE_RISK_MAX = Float::INFINITY
    # P(B|A) / P(B|notA); infinity when P(B|notA) is 0
    def m_relative_risk
      @m_relative_risk ||= self.class.initialize_measure(__method__) {
        if p_BgivennotA == 0
          Float::INFINITY
        else
          p_BgivenA / p_BgivennotA
        end
      }
    end

    M_JACCARD_MIN = 0
    M_JACCARD_MID = 0
    M_JACCARD_MAX = 1
    # P(AB) / (P(A) + P(B) - P(AB))
    def m_jaccard
      @m_jaccard ||= self.class.initialize_measure(__method__) {
        p_AB / (p_A + p_B - p_AB)
      }
    end

    M_IMBALANCE_RATIO_MIN = 0
    M_IMBALANCE_RATIO_MID = 0
    M_IMBALANCE_RATIO_MAX = 1
    ##
    # IR gauges the degree of imbalance between two events that the lhs and the rhs are contained in a transaction.
    # The ratio is close to 0 if the conditional probabilities are similar (i.e., very balanced) and close to 1 if they are very different
    #
    # |P(A|B) - P(B|A)| / (P(A|B) + P(B|A) - P(A|B) * P(B|A))
    ##
    def m_imbalance_ratio
      @m_imbalance_ratio ||= self.class.initialize_measure(__method__) {
        numerator = (p_AgivenB - p_BgivenA).abs
        denominator = (p_AgivenB + p_BgivenA - p_AgivenB * p_BgivenA)
        if denominator == 0
          if numerator == 0
            0
          else
            raise Evoc::Exceptions::MeasureCalculationError, "Numerator was not 0 when denominator was 0 when calculating imbalance ratio"
          end
        else
          numerator / denominator
        end
      }
    end

    M_ODDS_RATIO_MIN = 0
    M_ODDS_RATIO_MID = 1
    M_ODDS_RATIO_MAX = Float::INFINITY
    ##
    # The odds of finding X in transactions which contain Y divided by the
    # odds of finding X in transactions which do not contain Y
    #
    # (P(AB) * P(notA,notB)) / (P(A,notB) * P(notA,B)); infinity when the denominator is 0
    ##
    def m_odds_ratio
      @m_odds_ratio ||= self.class.initialize_measure(__method__) {
        numerator = p_AB * p_notA_notB
        denominator = p_A_notB * p_notA_B
        if denominator == 0
          Float::INFINITY
        else
          numerator / denominator
        end
      }
    end

    M_YULES_Q_MIN = -1
    M_YULES_Q_MID = 0
    M_YULES_Q_MAX = 1
    # (OR - 1) / (OR + 1), where OR is the odds ratio
    def m_yules_q
      @m_yules_q ||= self.class.initialize_measure(__method__) {
        odds_ratio = m_odds_ratio.value
        if odds_ratio.nil?
          raise Evoc::Exceptions::MeasureCalculationError, "Odds ratio was nil when calculating yules q"
        end

        if odds_ratio.to_f.finite?
          (odds_ratio - 1) / (odds_ratio + 1)
        else
          # Float#infinite? gives
          # -1 if odds ratio -inf
          #  1 if odds ratio +inf
          odds_ratio.to_f.infinite?
        end
      }
    end

    M_YULES_Y_MIN = -1
    M_YULES_Y_MID = 0
    M_YULES_Y_MAX = 1
    # (sqrt(OR) - 1) / (sqrt(OR) + 1), where OR is the odds ratio
    def m_yules_y
      @m_yules_y ||= self.class.initialize_measure(__method__) {
        odds_ratio = m_odds_ratio.value
        if odds_ratio.nil?
          raise Evoc::Exceptions::MeasureCalculationError, "Odds ratio was nil when calculating yules y"
        end

        if odds_ratio.to_f.finite?
          ((Math.sqrt(odds_ratio).rationalize) - 1) / ((Math.sqrt(odds_ratio).rationalize) + 1)
        else
          # Float#infinite? gives -1 / 1 for -inf / +inf
          odds_ratio.to_f.infinite?
        end
      }
    end

    # from Tan2004
    M_KLOSGEN_MIN = -1
    M_KLOSGEN_MID = 0
    M_KLOSGEN_MAX = 1
    # sqrt(P(AB)) * max(P(B|A) - P(B), P(A|B) - P(A))
    def m_klosgen
      @m_klosgen ||= self.class.initialize_measure(__method__) {
        (Math.sqrt(p_AB) * [(p_BgivenA - p_B), p_AgivenB - p_A].max).rationalize
      }
    end

    M_KULCZYNSKI_MIN = 0
    M_KULCZYNSKI_MID = 0
    M_KULCZYNSKI_MAX = 1
    ##
    # Calculate the null-invariant Kulczynski measure with a preference for skewed patterns.
    #
    # (P(AB) / 2) * (1 / P(A) + 1 / P(B))
    ##
    def m_kulczynski
      @m_kulczynski ||= self.class.initialize_measure(__method__) {
        (p_AB / 2) * ((1 / p_A) + (1 / p_B))
      }
    end

    M_CONVICTION_MIN = 0
    M_CONVICTION_MID = 0
    M_CONVICTION_MAX = Float::INFINITY
    # (P(A) * P(notB)) / P(A,notB)
    # with a 0 denominator: 0 if the numerator is also 0, infinity otherwise
    def m_conviction
      @m_conviction ||= self.class.initialize_measure(__method__) {
        numerator = p_A * p_notB
        denominator = p_A_notB
        if denominator == 0
          numerator == 0 ? 0 : Float::INFINITY
        else
          numerator / denominator
        end
      }
    end

    ##
    # uses 2 coefficients to weight the importance of the two factors
    #   k : dependency
    #   m : generality
    #
    # ((P(B|A) / P(B)) ** (k - 1)) * (P(AB) ** m), with k = m = 2
    M_INTERESTINGNESS_WEIGHTING_DEPENDENCY_MIN = 0
    M_INTERESTINGNESS_WEIGHTING_DEPENDENCY_MID = 0
    M_INTERESTINGNESS_WEIGHTING_DEPENDENCY_MAX = 1
    def m_interestingness_weighting_dependency
      k = 2
      m = 2
      @m_interestingness_weighting_dependency ||= self.class.initialize_measure(__method__) {
        ((p_BgivenA / p_B)**(k - 1)) * (p_AB**m)
      }
    end

    M_COLLECTIVE_STRENGTH_MIN = -Float::INFINITY
    M_COLLECTIVE_STRENGTH_MID = 1
    M_COLLECTIVE_STRENGTH_MAX = Float::INFINITY
    # range from Aggarwal1998
    #
    # A zero denominator is converted to Float first, so the division follows
    # Float semantics instead of raising ZeroDivisionError.
    # NOTE(review): the formula uses P(notB|notA); the joint P(notA,notB)
    # may have been intended - confirm against the reference.
    def m_collective_strength
      @m_collective_strength ||= self.class.initialize_measure(__method__) {
        n1 = (p_AB + p_notBgivennotA)
        d1 = (p_A * p_B + p_notA * p_notB)
        first = ((d1 == 0) ? n1 / d1.to_f : n1 / d1)
        n2 = (1 - p_A * p_B - p_notA * p_notB)
        d2 = (1 - p_AB - p_notBgivennotA)
        second = ((d2 == 0) ? n2 / d2.to_f : n2 / d2)
        first * second
      }
    end

    M_GINI_INDEX_MIN = 0
    M_GINI_INDEX_MID = 0
    M_GINI_INDEX_MAX = 1
    ##
    # Measures quadratic entropy
    #
    # P(A) * (P(B|A)^2 + P(notB|A)^2) + P(notA) * (P(B|notA)^2 + P(notB|notA)^2) - P(B)^2 - P(notB)^2
    ##
    def m_gini_index
      @m_gini_index ||= self.class.initialize_measure(__method__) {
        given_a = p_A * ((p_BgivenA**2) + (p_notBgivenA**2))
        given_not_a = p_notA * ((p_BgivennotA**2) + (p_notBgivennotA**2))
        given_a + given_not_a - p_B**2 - p_notB**2
      }
    end

    M_KAPPA_MIN = -1
    M_KAPPA_MID = 0
    M_KAPPA_MAX = 1
    # (P(AB) + P(notA,notB) - P(A)P(B) - P(notA)P(notB)) / (1 - P(A)P(B) - P(notA)P(notB))
    # A zero denominator is converted to Float before dividing.
    def m_kappa
      @m_kappa ||= self.class.initialize_measure(__method__) {
        num = (p_AB + p_notA_notB - p_A * p_B - p_notA * p_notB)
        den = (1 - p_A * p_B - p_notA * p_notB)
        ((den == 0) ? num / den.to_f : num / den)
      }
    end

    M_J_MEASURE_MIN = 0
    M_J_MEASURE_MID = 0
    M_J_MEASURE_MAX = 1
    ##
    # Measures cross entropy
    #
    # P(AB) * log(P(B|A) / P(B)) + P(A,notB) * log(P(notB|A) / P(notB))
    # A term is taken as 0 when its probability factor is 0.
    ##
    def m_j_measure
      @m_j_measure ||= self.class.initialize_measure(__method__) {
        first_log = Math.log(p_BgivenA / (p_B.to_f))
        second_log = Math.log(p_notBgivenA / (p_notB.to_f))
        first = ((p_AB == 0) ? 0 : p_AB * first_log)
        second = ((p_A_notB == 0) ? 0 : p_A_notB * second_log)

        first + second
      }
    end

    M_ONE_WAY_SUPPORT_MIN = -1
    M_ONE_WAY_SUPPORT_MID = 0
    M_ONE_WAY_SUPPORT_MAX = Float::INFINITY
    # P(B|A) * log2(P(AB) / (P(A) * P(B))); 0 when P(B|A) is 0
    def m_one_way_support
      @m_one_way_support ||= self.class.initialize_measure(__method__) {
        if p_BgivenA == 0
          0
        else
          p_BgivenA * (Math.log2(p_AB / (p_A * p_B)).rationalize)
        end
      }
    end

    M_TWO_WAY_SUPPORT_MIN = -1
    M_TWO_WAY_SUPPORT_MID = 0
    M_TWO_WAY_SUPPORT_MAX = 1
    # P(AB) * log2(P(AB) / (P(A) * P(B))); 0 when P(AB) is 0
    def m_two_way_support
      @m_two_way_support ||= self.class.initialize_measure(__method__) {
        if p_AB == 0
          0
        else
          p_AB * (Math.log2(p_AB / (p_A * p_B)).rationalize)
        end
      }
    end

    # aka Ø-coefficient
    M_LINEAR_CORRELATION_COEFFICIENT_MIN = -1
    M_LINEAR_CORRELATION_COEFFICIENT_MID = 0
    M_LINEAR_CORRELATION_COEFFICIENT_MAX = 1
    # (P(AB) - P(A)P(B)) / sqrt(P(A) * P(B) * P(notA) * P(notB))
    # A zero denominator is kept as a Float for the division.
    def m_linear_correlation_coefficient
      @m_linear_correlation_coefficient ||= self.class.initialize_measure(__method__) {
        num = (p_AB - (p_A * p_B))
        den = (Math.sqrt(p_A * p_B * p_notA * p_notB))
        ((den == 0) ? num / den.to_f : num / (den.rationalize))
      }
    end

    M_COSINE_MIN = 0
    M_COSINE_MID = 0
    M_COSINE_MAX = 1
    # P(AB) / sqrt(P(A) * P(B))
    # Raises Evoc::Exceptions::MeasureCalculationError when the denominator is 0.
    def m_cosine
      @m_cosine ||= self.class.initialize_measure(__method__) {
        num = p_AB
        den = Math.sqrt(p_A * p_B).rationalize
        if den == 0
          raise Evoc::Exceptions::MeasureCalculationError, "Denominator became 0 when calculating cosine (a #{p_A}, b #{p_B}, ab #{p_AB})"
        else
          num / den
        end
      }
    end

    M_LOEVINGER_MIN = -1
    M_LOEVINGER_MID = 0
    M_LOEVINGER_MAX = 1
    ##
    # aka Certainty Factor
    # The certainty factor is a measure of variation of the probability that
    # Y is in a transaction when only considering transactions with X.
    # An increasing CF means a decrease of the probability that Y is not in
    # a transaction that X is in. Negative CFs have a similar interpretation
    #
    # 1 - P(A,notB) / (P(A) * P(notB)); 1 when P(A) * P(notB) is 0
    ##
    def m_loevinger
      @m_loevinger ||= self.class.initialize_measure(__method__) {
        if p_A * p_notB == 0
          1
        else
          1 - ((p_A_notB) / (p_A * p_notB))
        end
      }
    end

    M_SEBAG_SCHOENAUER_MIN = 0
    M_SEBAG_SCHOENAUER_MID = 0
    M_SEBAG_SCHOENAUER_MAX = Float::INFINITY
    # P(AB) / P(A,notB); infinity when P(A,notB) is 0
    def m_sebag_schoenauer
      @m_sebag_schoenauer ||= self.class.initialize_measure(__method__) {
        if p_A_notB == 0
          Float::INFINITY
        else
          p_AB / p_A_notB
        end
      }
    end

    M_VARYING_RATES_LIAISON_MIN = -1
    M_VARYING_RATES_LIAISON_MID = 0
    M_VARYING_RATES_LIAISON_MAX = Float::INFINITY
    # P(AB) / (P(A) * P(B)) - 1
    def m_varying_rates_liaison
      @m_varying_rates_liaison ||= self.class.initialize_measure(__method__) {
        (p_AB / (p_A * p_B)) - 1
      }
    end

    M_LEAST_CONTRADICTION_MIN = -Float::INFINITY
    M_LEAST_CONTRADICTION_MID = 0
    M_LEAST_CONTRADICTION_MAX = 1
    # (P(AB) - P(A,notB)) / P(B)
    def m_least_contradiction
      @m_least_contradiction ||= self.class.initialize_measure(__method__) {
        (p_AB - p_A_notB) / p_B
      }
    end

    M_ODD_MULTIPLIER_MIN = 0
    M_ODD_MULTIPLIER_MID = 0
    M_ODD_MULTIPLIER_MAX = Float::INFINITY
    # (P(AB) * P(notB)) / (P(B) * P(A,notB))
    # with a 0 denominator: 0 if the numerator is also 0, infinity otherwise
    def m_odd_multiplier
      @m_odd_multiplier ||= self.class.initialize_measure(__method__) {
        numerator = p_AB * p_notB
        denominator = p_B * p_A_notB
        if denominator == 0
          numerator == 0 ? 0 : Float::INFINITY
        else
          numerator / denominator
        end
      }
    end

    M_EXAMPLE_AND_COUNTEREXAMPLE_RATE_MIN = -Float::INFINITY
    M_EXAMPLE_AND_COUNTEREXAMPLE_RATE_MID = 0
    M_EXAMPLE_AND_COUNTEREXAMPLE_RATE_MAX = 1
    # 0 when equally many examples as counter examples
    #
    # (P(AB) - P(A,notB)) / P(AB); -infinity when P(AB) is 0
    def m_example_and_counterexample_rate
      @m_example_and_counterexample_rate ||= self.class.initialize_measure(__method__) {
        numerator = (p_AB - p_A_notB)
        denominator = p_AB
        if denominator == 0
          -Float::INFINITY
        else
          numerator / denominator
        end
      }
    end

    M_ZHANG_MIN = -1
    M_ZHANG_MID = 0
    M_ZHANG_MAX = 1
    # (P(AB) - P(A)P(B)) / max(P(AB) * P(notB), P(B) * P(A,notB)); 0 when the denominator is 0
    def m_zhang
      @m_zhang ||= self.class.initialize_measure(__method__) {
        numerator = p_AB - p_A * p_B
        denominator = [p_AB * p_notB, p_B * p_A_notB].max
        if denominator == 0
          0
        else
          numerator / denominator
        end
      }
    end

    M_LAPLACE_CORRECTED_CONFIDENCE_MIN = 0
    M_LAPLACE_CORRECTED_CONFIDENCE_MID = 0
    M_LAPLACE_CORRECTED_CONFIDENCE_MAX = 1
    ##
    # Corrected confidence estimate decreases with lower support
    # to account for estimation uncertainty with low counts.
    #
    # Computed here as (P(AB) + 1) / (P(B) + 2).
    # NOTE(review): the usual Laplace-corrected confidence is
    # (count(AB) + 1) / (count(A) + 2) - confirm that probabilities and
    # P(B) are intended here.
    ##
    def m_laplace_corrected_confidence
      @m_laplace_corrected_confidence ||= self.class.initialize_measure(__method__) {
        (p_AB + 1) / (p_B + 2)
      }
    end

    ##
    # building blocks for interestingness measures
    #
    # A refers to the antecedent of a rule
    # B refers to the consequent of a rule
    #
    # All values are memoized in instance variables of the same name,
    # which is also what set_p writes to.

    ##
    # the number of transactions
    # n is converted into the specified type to ensure
    # that the type is used throughout calculations
    def n
      # looks up the conversion function named after VALUE_TYPE (e.g. Rational())
      @n ||= VALUE_TYPE.method(VALUE_TYPE.to_s).call(tx_store.size)
    end

    ##
    # the ratio of tx with A as a subset
    def p_A
      @p_A ||= tx_store.transactions_of_list(lhs, strict: true).size / n
    end

    ##
    # the ratio of tx where A is not a subset
    def p_notA
      @p_notA ||= (1 - p_A)
    end

    ##
    # the ratio of tx with B as a subset
    def p_B
      @p_B ||= tx_store.transactions_of_list(rhs, strict: true).size / n
    end

    ##
    # the ratio of tx where B is not a subset
    def p_notB
      @p_notB ||= (1 - p_B)
    end

    ##
    # the ratio of tx with the union of A and B as a subset
    def p_AB
      @p_AB ||= tx_store.transactions_of_list((lhs | rhs), strict: true).size / n
    end

    ##
    # the ratio of tx where A or B is a subset
    def p_AorB
      @p_AorB ||= p_A + p_B - p_AB
    end

    ##
    # the ratio of tx where neither A nor B is a subset
    def p_notA_notB
      @p_notA_notB ||= 1 - (p_A + p_B) + p_AB
    end

    ##
    # the ratio of tx where A is not a subset but B is
    def p_notA_B
      @p_notA_B ||= p_B - p_AB
    end

    ##
    # the ratio of tx where A is a subset but B is not
    def p_A_notB
      @p_A_notB ||= p_A - p_AB
    end

    ##
    # the ratio of the union being a subset to the number of txes where B is a subset
    # (0 when p_B is 0)
    def p_AgivenB
      @p_AgivenB ||= ((p_B == 0) ? 0 : p_AB / p_B)
    end

    ##
    # the ratio of the union being a subset to the number of txes where A is a subset
    # (0 when p_A is 0)
    def p_BgivenA
      @p_BgivenA ||= ((p_A == 0) ? 0 : p_AB / p_A)
    end

    # P(notA|B), 0 when p_B is 0
    def p_notAgivenB
      @p_notAgivenB ||= ((p_B == 0) ? 0 : p_notA_B / p_B)
    end

    # if A is 0, so is A,notB
    def p_notBgivenA
      @p_notBgivenA ||= ((p_A == 0) ? 0 : p_A_notB / p_A)
    end

    # if notB is 0, so is A_notB
    def p_AgivennotB
      @p_AgivennotB ||= ((p_notB == 0) ? 0 : p_A_notB / p_notB)
    end

    # if notA is 0, so is notA_B
    def p_BgivennotA
      @p_BgivennotA ||= ((p_notA == 0) ? 0 : p_notA_B / p_notA)
    end

    # P(notB|notA), 0 when p_notA is 0
    def p_notBgivennotA
      @p_notBgivennotA ||= ((p_notA == 0) ? 0 : p_notA_notB / p_notA)
    end

    # if notB is 0, so is notA and notB
    def p_notAgivennotB
      @p_notAgivennotB ||= ((p_notB == 0) ? 0 : p_notA_notB / p_notB)
    end
  end
end
|