evoc 3.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +15 -0
- data/.rspec +2 -0
- data/.travis.yml +4 -0
- data/Gemfile +4 -0
- data/LICENSE.txt +21 -0
- data/Makefile +4 -0
- data/README.md +61 -0
- data/Rakefile +6 -0
- data/bin/console +14 -0
- data/bin/evoc +3 -0
- data/bin/setup +7 -0
- data/evoc.gemspec +30 -0
- data/lib/evoc/algorithm.rb +147 -0
- data/lib/evoc/algorithms/top_k.rb +86 -0
- data/lib/evoc/analyze.rb +395 -0
- data/lib/evoc/array.rb +43 -0
- data/lib/evoc/evaluate.rb +109 -0
- data/lib/evoc/exceptions/aggregation_error.rb +6 -0
- data/lib/evoc/exceptions/expectedoutcome_nil_or_empty.rb +6 -0
- data/lib/evoc/exceptions/measure_calculation_error.rb +6 -0
- data/lib/evoc/exceptions/no_changed_items_in_changes.rb +6 -0
- data/lib/evoc/exceptions/no_changes_in_json_object.rb +6 -0
- data/lib/evoc/exceptions/no_date_in_json_object.rb +6 -0
- data/lib/evoc/exceptions/no_result.rb +6 -0
- data/lib/evoc/exceptions/non_finite.rb +8 -0
- data/lib/evoc/exceptions/non_numeric.rb +8 -0
- data/lib/evoc/exceptions/not_a_query.rb +6 -0
- data/lib/evoc/exceptions/not_a_result.rb +6 -0
- data/lib/evoc/exceptions/not_a_transaction.rb +6 -0
- data/lib/evoc/exceptions/not_initialized.rb +6 -0
- data/lib/evoc/exceptions/only_nil_in_changes.rb +6 -0
- data/lib/evoc/exceptions/query_nil_or_empty.rb +6 -0
- data/lib/evoc/exceptions/unable_to_convert_json_to_tx.rb +6 -0
- data/lib/evoc/experiment.rb +239 -0
- data/lib/evoc/hash.rb +56 -0
- data/lib/evoc/history_store.rb +53 -0
- data/lib/evoc/hyper_rule.rb +53 -0
- data/lib/evoc/interestingness_measure.rb +77 -0
- data/lib/evoc/interestingness_measure_aggregator.rb +147 -0
- data/lib/evoc/interestingness_measures.rb +882 -0
- data/lib/evoc/logger.rb +34 -0
- data/lib/evoc/memory_profiler.rb +43 -0
- data/lib/evoc/recommendation_cache.rb +152 -0
- data/lib/evoc/rule.rb +32 -0
- data/lib/evoc/rule_store.rb +340 -0
- data/lib/evoc/scenario.rb +303 -0
- data/lib/evoc/svd.rb +124 -0
- data/lib/evoc/tx.rb +34 -0
- data/lib/evoc/tx_store.rb +379 -0
- data/lib/evoc/version.rb +3 -0
- data/lib/evoc.rb +4 -0
- data/lib/evoc_cli/analyze.rb +198 -0
- data/lib/evoc_cli/cli_helper.rb +1 -0
- data/lib/evoc_cli/experiment.rb +78 -0
- data/lib/evoc_cli/info.rb +22 -0
- data/lib/evoc_cli/main.rb +29 -0
- data/lib/evoc_cli/util.rb +36 -0
- data/lib/evoc_helper.rb +40 -0
- data/mem_profiler/Gemfile.lock +39 -0
- data/mem_profiler/README.md +126 -0
- data/mem_profiler/createdb.rb +4 -0
- data/mem_profiler/db.rb +82 -0
- data/mem_profiler/gemfile +6 -0
- data/mem_profiler/gencsv.rb +64 -0
- data/mem_profiler/genimport.sh +8 -0
- data/mem_profiler/graph.rb +91 -0
- metadata +251 -0
@@ -0,0 +1,882 @@
|
|
1
|
+
module Evoc
|
2
|
+
module InterestingnessMeasures
|
3
|
+
|
4
|
+
VALUE_TYPE = Rational
|
5
|
+
# USE Rational(Math.log(1/3)).rationalize(0.001)
|
6
|
+
|
7
|
+
#
|
8
|
+
# methods that must be implemented in the class that uses this module as a mixin
|
9
|
+
#
|
10
|
+
# Abstract hook: the including class must override this to supply the
# transaction store that the probability helpers query.
#
# @raise [NotImplementedError] always, unless overridden
def tx_store
  raise NotImplementedError, "tx_store has not been implemented on the current class"
end
|
13
|
+
|
14
|
+
# Abstract hook: the including class must override this to return the
# antecedent (A) items of the rule.
#
# @raise [NotImplementedError] always, unless overridden
def lhs
  raise NotImplementedError, "lhs has not been implemented on the current class"
end
|
17
|
+
|
18
|
+
# Abstract hook: the including class must override this to return the
# consequent (B) items of the rule.
#
# @raise [NotImplementedError] always, unless overridden
def rhs
  raise NotImplementedError, "rhs has not been implemented on the current class"
end
|
21
|
+
|
22
|
+
# Abstract hook: the including class must override this to return a
# printable name; it is interpolated into the get_measure warning.
#
# @raise [NotImplementedError] always, unless overridden
def name
  raise NotImplementedError, "name has not been implemented on the current class"
end
|
25
|
+
|
26
|
+
##
|
27
|
+
# class methods
|
28
|
+
#
|
29
|
+
# the following is an idiom/hack that also adds class methods to a class when it includes this module
|
30
|
+
# (normally one would use 'extend')
|
31
|
+
#
|
32
|
+
##
|
33
|
+
# Include hook: additionally extends the including class with
# ClassMethods so the class-level helpers become available on it.
def self.included(base)
  base.extend ClassMethods
end
|
36
|
+
|
37
|
+
# Looks up the <MEASURE>_MIN constant for the given measure name.
def self.get_min(measure)
  const_get("#{measure.to_s.upcase}_MIN")
end
|
40
|
+
|
41
|
+
# Looks up the <MEASURE>_MAX constant for the given measure name.
def self.get_max(measure)
  const_get("#{measure.to_s.upcase}_MAX")
end
|
44
|
+
|
45
|
+
# Looks up the <MEASURE>_MID constant for the given measure name.
def self.get_mid(measure)
  const_get("#{measure.to_s.upcase}_MID")
end
|
48
|
+
|
49
|
+
# Lists the instance methods of this module whose names start with "m_".
def self.measures
  instance_methods.select { |method_name| /\Am_(.*)/ === method_name }
end
|
52
|
+
|
53
|
+
# Lists the "m_" instance methods that are flagged as hyper measures
# (see is_hyper_measure?).
def self.hyper_measures
  candidates = instance_methods.grep(/\Am_(.*)/)
  candidates.select { |candidate| is_hyper_measure?(candidate) }
end
|
56
|
+
|
57
|
+
# Returns the value of the <MEASURE>_HYPER_MEASURE constant, or false
# when the lookup raises NameError (e.g. the constant is not defined).
def self.is_hyper_measure?(m)
  const_get("#{m.to_s.upcase}_HYPER_MEASURE")
rescue NameError
  false
end
|
64
|
+
|
65
|
+
# Class-level helpers; the `included` hook of InterestingnessMeasures
# extends every including class with this module.
module ClassMethods
  # Builds an Evoc::InterestingnessMeasure for +measure+, taking its bounds
  # from the <MEASURE>_MIN / _MID / _MAX constants.
  #
  # @param measure [Symbol, String] measure name, e.g. :m_support
  # @param hyper_measure [Boolean] passed through to the measure object
  # @yield optional; when given, the measure object receives a block that
  #   yields to the caller's block
  def initialize_measure(measure, hyper_measure: false)
    options = {
      type: measure,
      min: get_min(measure),
      mid: get_mid(measure),
      max: get_max(measure),
      hyper_measure: hyper_measure
    }
    return Evoc::InterestingnessMeasure.new(**options) unless block_given?

    # Wrap instead of forwarding the block, exactly as before, so the
    # caller's block is always reached through `yield`.
    Evoc::InterestingnessMeasure.new(**options) { yield }
  end

  ##
  # returns the list of interestingness measures which are implemented
  # (all instance methods whose names start with "m_")
  def measures
    instance_methods.grep(/\Am_(.*)/)
  end

  # Returns all instance methods whose names start with "p_".
  def p_measures
    instance_methods.grep(/\Ap_(.*)/)
  end

  # CSV header: 'lhs', 'rhs' followed by the measure method names.
  def csv_header
    ['lhs','rhs'] + measures
  end

  # Like csv_header, but with the leading "m_" removed from each measure
  # name. Only the prefix is stripped; an "m_" inside a name is kept.
  def pretty_csv_header
    ['lhs','rhs'] + measures.map { |m| m.to_s.sub(/\Am_/, '') }
  end

  # Looks up the <MEASURE>_MIN constant for the given measure name.
  def get_min(measure)
    const_get("#{measure.to_s.upcase}_MIN")
  end

  # Looks up the <MEASURE>_MAX constant for the given measure name.
  def get_max(measure)
    const_get("#{measure.to_s.upcase}_MAX")
  end

  # Looks up the <MEASURE>_MID constant for the given measure name.
  def get_mid(measure)
    const_get("#{measure.to_s.upcase}_MID")
  end
end
|
108
|
+
|
109
|
+
# Flattens the rule into an array: comma-joined lhs, comma-joined rhs,
# then the value of every measure that has been instantiated.
def to_a
  [
    lhs.join(','),
    rhs.join(','),
    *instantiated_measures.map { |measure_name| get_measure(measure_name).value }
  ]
end
|
112
|
+
|
113
|
+
|
114
|
+
|
115
|
+
##
# a common getter for all measures
#
# Calls the measure method named +measure+ and returns its result. When the
# calculation raises Evoc::Exceptions::MeasureCalculationError, a warning is
# logged, the measure's instance variable is replaced by a measure created
# without a value block, and the measure method is called once more.
#
# @param measure [Symbol, String] name of the measure method, e.g. :m_lift
# @return the object returned by the measure method
# @raise [NotImplementedError] if the measure method returns nil or false
def get_measure(measure)
  result = self.method(measure).call
  raise NotImplementedError, "#{measure} not implemented" unless result

  result
rescue Evoc::Exceptions::MeasureCalculationError => e
  logger.warn "#{measure} was undefined for #{self.name} on the current history, error: #{e}"
  fallback = self.class.initialize_measure(measure)
  self.instance_variable_set('@'+measure.to_s, fallback)
  self.method(measure).call
end
|
131
|
+
|
132
|
+
# Stores a fixed +value+ for +measure+: builds a measure object whose
# block returns +value+ and assigns it to the matching instance variable.
def set_measure(measure,value,hyper_measure: false)
  preset = self.class.initialize_measure(measure, hyper_measure: hyper_measure) { value }
  instance_variable_set("@#{measure}", preset)
end
|
136
|
+
|
137
|
+
##
# manually set the probability p of this rule
# (writes +value+ into the instance variable named after +p+)
def set_p(p,value)
  instance_variable_set("@#{p}", value)
end
|
142
|
+
|
143
|
+
##
# Returns the probability named +p+, after checking that p_A, p_B and p_AB
# are consistent with each other.
#
# @param p [Symbol, String] name of the probability method to call, e.g. :p_AB
# @return the value returned by that probability method
# @raise [Evoc::Exceptions::MeasureCalculationError] if p_A or p_B is 0 or
#   smaller than p_AB, or if p_A, p_B and p_AB cannot hold simultaneously
def get_p(p)
  value = self.method(p).call
  if p_A < p_AB
    raise Evoc::Exceptions::MeasureCalculationError, "p_A was smaller than p_AB"
  elsif p_B < p_AB
    raise Evoc::Exceptions::MeasureCalculationError, "p_B was smaller than p_AB"
  elsif p_A == 0
    raise Evoc::Exceptions::MeasureCalculationError, "p_A was 0"
  elsif p_B == 0
    raise Evoc::Exceptions::MeasureCalculationError, "p_B was 0"
  elsif p_B > 1-p_A+p_AB
    # The messages previously interpolated the undefined `p_a`, which
    # raised NameError instead of the intended error.
    raise Evoc::Exceptions::MeasureCalculationError, "p_B mismatch with p_A and p_AB, (a #{p_A}, b #{p_B}, ab #{p_AB})"
  elsif p_A > 1-p_B+p_AB
    raise Evoc::Exceptions::MeasureCalculationError, "p_A mismatch with p_B and p_AB, (a #{p_A}, b #{p_B}, ab #{p_AB})"
  end
  value
end
|
162
|
+
|
163
|
+
##
# @return [Hash] maps every name in Evoc::Rule.p_measures to the value
#   that get_p returns for it on this rule
def get_p_values
  Evoc::Rule.p_measures.each_with_object({}) do |p_name, collected|
    collected[p_name] = get_p(p_name)
  end
end
|
172
|
+
|
173
|
+
##
# returns the measures which have been instantiated, i.e. the class's
# measure names for which measure_instantiated? is true
def instantiated_measures
  self.class.measures.select(&method(:measure_instantiated?))
end
|
178
|
+
|
179
|
+
##
# Returns true if the instance variable named after +measure+ holds a
# non-nil value
def measure_instantiated?(measure)
  stored = instance_variable_get("@#{measure}")
  !stored.nil?
end
|
184
|
+
|
185
|
+
##
|
186
|
+
# NATIVE AGGREGATED MEASURES
|
187
|
+
#
|
188
|
+
# these measures are defined natively for aggregated rules
|
189
|
+
#
|
190
|
+
# e.g.,
|
191
|
+
# a -> c
|
192
|
+
# b -> c
|
193
|
+
# aggregated is: a,b -> c
|
194
|
+
#
|
195
|
+
# the listed measures are well defined also for the aggregated rule
|
196
|
+
#
|
197
|
+
# With the exception of the hyper coefficient, all other hyper measures
|
198
|
+
# are always recalculated on request.
|
199
|
+
# I.e., they use '@m = ..' rather than '@m ||= ..'
|
200
|
+
# This way they will overwrite the aggregation that has been calculated on the hyper rule
|
201
|
+
# (which is done for the conventional rules)
|
202
|
+
|
203
|
+
|
204
|
+
##
|
205
|
+
# Hyper Coefficient
|
206
|
+
# The number of rules used to form a hyper rule
|
207
|
+
M_HYPER_COEFFICIENT_MIN = 0
|
208
|
+
M_HYPER_COEFFICIENT_MID = 0
|
209
|
+
M_HYPER_COEFFICIENT_MAX = Float::INFINITY
|
210
|
+
M_HYPER_COEFFICIENT_HYPER_MEASURE = true
|
211
|
+
# Hyper coefficient measure. Rebuilt on every call (plain assignment, no
# memoization); the value is hyper_coefficient when the object responds
# to it, otherwise 0.
def m_hyper_coefficient
  @m_hyper_coefficient = self.class.initialize_measure(:m_hyper_coefficient) do
    self.respond_to?(:hyper_coefficient) ? self.hyper_coefficient : 0
  end
end
|
220
|
+
|
221
|
+
##
|
222
|
+
# Hyper confidence
|
223
|
+
#
|
224
|
+
# A confidence like measure that is well defined for all hyper rules
|
225
|
+
#
|
226
|
+
# "the number of times something in rhs changed with something in lhs, divided by the number of times something in lhs changed"
|
227
|
+
##
|
228
|
+
#M_HYPER_CONFIDENCE_MIN = 0
|
229
|
+
#M_HYPER_CONFIDENCE_MID = 0
|
230
|
+
#M_HYPER_CONFIDENCE_MAX = 1
|
231
|
+
#M_HYPER_CONFIDENCE_HYPER_MEASURE = true
|
232
|
+
#def m_hyper_confidence
|
233
|
+
# @m_hyper_confidence ||= self.class.initialize_measure(__method__) {
|
234
|
+
# # hyper confidence is equal to the confidence for non hyper rules
|
235
|
+
# if !self.respond_to?(:hyper_confidence)
|
236
|
+
# m_confidence.value
|
237
|
+
# else
|
238
|
+
# raise ArgumentError, "Asked for the hyper confidence of a hyper rule, the value was not initialized, but should have been when creating the hyper rule"
|
239
|
+
# end
|
240
|
+
# }
|
241
|
+
#end
|
242
|
+
|
243
|
+
|
244
|
+
##
|
245
|
+
# INTERESTINGNESS MEASURES
|
246
|
+
#
|
247
|
+
# if not stated otherwise, all of the implementations are based on
|
248
|
+
# Michael Hahslers overview at:
|
249
|
+
# http://michael.hahsler.net/research/association_rules/measures.html
|
250
|
+
#
|
251
|
+
##
|
252
|
+
|
253
|
+
M_SUPPORT_MIN = 0
|
254
|
+
M_SUPPORT_MID = 0
|
255
|
+
M_SUPPORT_MAX = 1
|
256
|
+
# Support: p_AB, wrapped via initialize_measure and memoized.
def m_support
  @m_support ||= self.class.initialize_measure(:m_support) { p_AB }
end
|
261
|
+
|
262
|
+
M_CONFIDENCE_MIN = 0
|
263
|
+
M_CONFIDENCE_MID = 0
|
264
|
+
M_CONFIDENCE_MAX = 1
|
265
|
+
# Confidence: p_BgivenA, wrapped via initialize_measure and memoized.
def m_confidence
  @m_confidence ||= self.class.initialize_measure(:m_confidence) { p_BgivenA }
end
|
270
|
+
|
271
|
+
M_COVERAGE_MIN = 0
|
272
|
+
M_COVERAGE_MID = 0
|
273
|
+
M_COVERAGE_MAX = 1
|
274
|
+
# Coverage: p_A, wrapped via initialize_measure and memoized.
def m_coverage
  @m_coverage ||= self.class.initialize_measure(:m_coverage) { p_A }
end
|
279
|
+
|
280
|
+
M_PREVALENCE_MIN = 0
|
281
|
+
M_PREVALENCE_MID = 0
|
282
|
+
M_PREVALENCE_MAX = 1
|
283
|
+
# Prevalence: p_B, wrapped via initialize_measure and memoized.
def m_prevalence
  @m_prevalence ||= self.class.initialize_measure(:m_prevalence) { p_B }
end
|
288
|
+
|
289
|
+
M_RECALL_MIN = 0
|
290
|
+
M_RECALL_MID = 0
|
291
|
+
M_RECALL_MAX = 1
|
292
|
+
# Recall: p_AgivenB, wrapped via initialize_measure and memoized.
def m_recall
  @m_recall ||= self.class.initialize_measure(:m_recall) { p_AgivenB }
end
|
297
|
+
|
298
|
+
M_SPECIFICITY_MIN = 0
|
299
|
+
M_SPECIFICITY_MID = 0
|
300
|
+
M_SPECIFICITY_MAX = 1
|
301
|
+
# Specificity: p_notBgivennotA, wrapped via initialize_measure and memoized.
def m_specificity
  @m_specificity ||= self.class.initialize_measure(:m_specificity) { p_notBgivennotA }
end
|
306
|
+
|
307
|
+
M_LIFT_MIN = 0
|
308
|
+
M_LIFT_MID = 1
|
309
|
+
M_LIFT_MAX = Float::INFINITY
|
310
|
+
##
# aka interest
# Lift measures how many times more often X and Y occur
# together than expected if they were statistically independent:
# p_AB / (p_A * p_B)
##
def m_lift
  @m_lift ||= self.class.initialize_measure(:m_lift) do
    joint = p_AB
    joint/(p_A*p_B)
  end
end
|
320
|
+
|
321
|
+
M_LEVERAGE_MIN = -1
|
322
|
+
M_LEVERAGE_MID = 0
|
323
|
+
M_LEVERAGE_MAX = 1
|
324
|
+
##
# Leverage measures the difference between X and Y appearing together
# in the data set and what would be expected if X and Y were statistically
# independent.
#
# NOTE(review): this subtracts from p_BgivenA, whereas m_piatetsky_shapiro
# uses p_AB in the otherwise identical expression - confirm which
# definition of leverage is intended.
##
def m_leverage
  @m_leverage ||= self.class.initialize_measure(:m_leverage) do
    conditional = p_BgivenA
    conditional - (p_A*p_B)
  end
end
|
333
|
+
|
334
|
+
M_PIATETSKY_SHAPIRO_MIN = -0.25
|
335
|
+
M_PIATETSKY_SHAPIRO_MID = 0
|
336
|
+
M_PIATETSKY_SHAPIRO_MAX = 0.25
|
337
|
+
# Piatetsky-Shapiro: p_AB minus the product p_A * p_B; memoized.
def m_piatetsky_shapiro
  @m_piatetsky_shapiro ||= self.class.initialize_measure(:m_piatetsky_shapiro) do
    observed = p_AB
    observed - p_A*p_B
  end
end
|
342
|
+
|
343
|
+
# aka: Pavillon index, centered confidence
|
344
|
+
M_ADDED_VALUE_MIN = -0.5
|
345
|
+
M_ADDED_VALUE_MID = 0
|
346
|
+
M_ADDED_VALUE_MAX = 1
|
347
|
+
# Added value: p_BgivenA minus p_B; memoized.
def m_added_value
  @m_added_value ||= self.class.initialize_measure(:m_added_value) do
    conditional = p_BgivenA
    conditional - p_B
  end
end
|
352
|
+
|
353
|
+
M_CAUSAL_CONFIDENCE_MIN = 0
|
354
|
+
M_CAUSAL_CONFIDENCE_MID = 0
|
355
|
+
M_CAUSAL_CONFIDENCE_MAX = 1
|
356
|
+
# Causal confidence: one half of (p_BgivenA + p_notAgivennotB); memoized.
def m_causal_confidence
  @m_causal_confidence ||= self.class.initialize_measure(:m_causal_confidence) do
    half = 1.to_r/2
    half*(p_BgivenA + p_notAgivennotB)
  end
end
|
361
|
+
|
362
|
+
M_CAUSAL_SUPPORT_MIN = 0
|
363
|
+
M_CAUSAL_SUPPORT_MID = 0
|
364
|
+
M_CAUSAL_SUPPORT_MAX = 1
|
365
|
+
# Causal support: p_AB plus p_notA_notB; memoized.
def m_causal_support
  @m_causal_support ||= self.class.initialize_measure(:m_causal_support) do
    both = p_AB
    both + p_notA_notB
  end
end
|
370
|
+
|
371
|
+
M_DESCRIPTIVE_CONFIRMED_CONFIDENCE_MIN = -1
|
372
|
+
M_DESCRIPTIVE_CONFIRMED_CONFIDENCE_MID = 0
|
373
|
+
M_DESCRIPTIVE_CONFIRMED_CONFIDENCE_MAX = 1
|
374
|
+
# Descriptive confirmed confidence: p_BgivenA minus p_notBgivenA; memoized.
def m_descriptive_confirmed_confidence
  @m_descriptive_confirmed_confidence ||= self.class.initialize_measure(:m_descriptive_confirmed_confidence) do
    confirming = p_BgivenA
    confirming - p_notBgivenA
  end
end
|
379
|
+
|
380
|
+
M_DIFFERENCE_OF_CONFIDENCE_MIN = -1
|
381
|
+
M_DIFFERENCE_OF_CONFIDENCE_MID = 0
|
382
|
+
M_DIFFERENCE_OF_CONFIDENCE_MAX = 1
|
383
|
+
# Difference of confidence: p_BgivenA minus p_BgivennotA; memoized.
def m_difference_of_confidence
  @m_difference_of_confidence ||= self.class.initialize_measure(:m_difference_of_confidence) do
    with_antecedent = p_BgivenA
    with_antecedent - p_BgivennotA
  end
end
|
388
|
+
|
389
|
+
M_RELATIVE_RISK_MIN = 0
|
390
|
+
M_RELATIVE_RISK_MID = 0
|
391
|
+
M_RELATIVE_RISK_MAX = Float::INFINITY
|
392
|
+
# Relative risk: p_BgivenA / p_BgivennotA, or Float::INFINITY when
# p_BgivennotA is 0; memoized.
def m_relative_risk
  @m_relative_risk ||= self.class.initialize_measure(:m_relative_risk) do
    p_BgivennotA == 0 ? Float::INFINITY : p_BgivenA/p_BgivennotA
  end
end
|
401
|
+
|
402
|
+
M_JACCARD_MIN = 0
|
403
|
+
M_JACCARD_MID = 0
|
404
|
+
M_JACCARD_MAX = 1
|
405
|
+
# Jaccard: p_AB divided by (p_A + p_B - p_AB); memoized.
def m_jaccard
  @m_jaccard ||= self.class.initialize_measure(:m_jaccard) do
    intersection = p_AB
    intersection/(p_A+p_B-p_AB)
  end
end
|
410
|
+
|
411
|
+
|
412
|
+
M_IMBALANCE_RATIO_MIN = 0
|
413
|
+
M_IMBALANCE_RATIO_MID = 0
|
414
|
+
M_IMBALANCE_RATIO_MAX = 1
|
415
|
+
##
# IR gauges the degree of imbalance between two events that the lhs and the rhs are contained in a transaction.
# The ratio is close to 0 if the conditional probabilities are similar (i.e., very balanced) and close to 1 if they are very different
#
# Computed as |p_AgivenB - p_BgivenA| / (p_AgivenB + p_BgivenA - p_AgivenB*p_BgivenA);
# 0 when both numerator and denominator are 0.
#
# @raise [Evoc::Exceptions::MeasureCalculationError] when the value block is
#   evaluated and the denominator is 0 while the numerator is not
##
def m_imbalance_ratio
  @m_imbalance_ratio ||= self.class.initialize_measure(__method__) {
    numerator = (p_AgivenB - p_BgivenA).abs
    denominator = (p_AgivenB + p_BgivenA - p_AgivenB*p_BgivenA)
    if denominator != 0
      numerator/denominator
    elsif numerator == 0
      0
    else
      # The exception class lives in Evoc::Exceptions; the former
      # Evoc::MeasureCalculationError reference raised NameError instead.
      raise Evoc::Exceptions::MeasureCalculationError, "Numerator was not 0 when denominator was 0 when calculating imbalance ratio"
    end
  }
end
|
434
|
+
|
435
|
+
M_ODDS_RATIO_MIN = 0
|
436
|
+
M_ODDS_RATIO_MID = 1
|
437
|
+
M_ODDS_RATIO_MAX = Float::INFINITY
|
438
|
+
##
# The odds of finding X in transactions which contain Y divided by the
# odds of finding X in transactions which do not contain Y:
# (p_AB * p_notA_notB) / (p_A_notB * p_notA_B), or Float::INFINITY when
# the denominator is 0.
##
def m_odds_ratio
  @m_odds_ratio ||= self.class.initialize_measure(:m_odds_ratio) do
    agreeing = p_AB*p_notA_notB
    disagreeing = p_A_notB*p_notA_B
    disagreeing == 0 ? Float::INFINITY : agreeing/disagreeing
  end
end
|
453
|
+
|
454
|
+
M_YULES_Q_MIN = -1
|
455
|
+
M_YULES_Q_MID = 0
|
456
|
+
M_YULES_Q_MAX = 1
|
457
|
+
# Yule's Q, derived from the odds ratio: (OR - 1) / (OR + 1) for a finite
# odds ratio; for a non-finite one, the result of Float#infinite?
# (1 for +infinity, -1 for -infinity).
#
# @raise [Evoc::Exceptions::MeasureCalculationError] when the value block is
#   evaluated and the odds ratio value is nil
def m_yules_q
  @m_yules_q ||= self.class.initialize_measure(__method__) {
    odds_ratio = self.m_odds_ratio.value
    if odds_ratio.nil?
      # Was Evoc::MeasureCalculationError, a constant that does not exist
      # under that path and therefore raised NameError.
      raise Evoc::Exceptions::MeasureCalculationError, "Odds ratio was nil when calculating yules q"
    end

    if odds_ratio.to_f.finite?
      (odds_ratio - 1)/(odds_ratio + 1)
    else
      # -1 if odds ratio -inf
      # 1 if odds ratio +inf
      odds_ratio.to_f.infinite?
    end
  }
end
|
473
|
+
|
474
|
+
M_YULES_Y_MIN = -1
|
475
|
+
M_YULES_Y_MID = 0
|
476
|
+
M_YULES_Y_MAX = 1
|
477
|
+
# Yule's Y, derived from the odds ratio: (sqrt(OR) - 1) / (sqrt(OR) + 1)
# for a finite odds ratio (the square root is rationalized); for a
# non-finite one, the result of Float#infinite?.
#
# @raise [Evoc::Exceptions::MeasureCalculationError] when the value block is
#   evaluated and the odds ratio value is nil
def m_yules_y
  @m_yules_y ||= self.class.initialize_measure(__method__) {
    odds_ratio = self.m_odds_ratio.value
    if odds_ratio.nil?
      raise Evoc::Exceptions::MeasureCalculationError, "Odds ratio was nil when calculating yules y"
    end

    if odds_ratio.to_f.finite?
      # Compute the rationalized root once instead of twice.
      root = Math.sqrt(odds_ratio).rationalize
      (root - 1)/(root + 1)
    else
      odds_ratio.to_f.infinite?
    end
  }
end
|
491
|
+
|
492
|
+
# from Tan2004
|
493
|
+
M_KLOSGEN_MIN = -1
|
494
|
+
M_KLOSGEN_MID = 0
|
495
|
+
M_KLOSGEN_MAX = 1
|
496
|
+
# Klosgen: sqrt(p_AB) times the larger of (p_BgivenA - p_B) and
# (p_AgivenB - p_A), rationalized; memoized.
def m_klosgen
  @m_klosgen ||= self.class.initialize_measure(:m_klosgen) do
    root = Math.sqrt(p_AB)
    gain = [(p_BgivenA-p_B), p_AgivenB-p_A].max
    (root*gain).rationalize
  end
end
|
501
|
+
|
502
|
+
M_KULCZYNSKI_MIN = 0
|
503
|
+
M_KULCZYNSKI_MID = 0
|
504
|
+
M_KULCZYNSKI_MAX = 1
|
505
|
+
##
# Calculate the null-invariant Kulczynski measure with a preference for skewed patterns:
# (p_AB / 2) * (1/p_A + 1/p_B)
##
def m_kulczynski
  @m_kulczynski ||= self.class.initialize_measure(:m_kulczynski) do
    half_joint = p_AB/2
    half_joint*((1/p_A)+(1/p_B))
  end
end
|
513
|
+
|
514
|
+
M_CONVICTION_MIN = 0
|
515
|
+
M_CONVICTION_MID = 0
|
516
|
+
M_CONVICTION_MAX = Float::INFINITY
|
517
|
+
# Conviction: (p_A * p_notB) / p_A_notB. When p_A_notB is 0 the result is
# 0 if the numerator is also 0, otherwise Float::INFINITY; memoized.
def m_conviction
  @m_conviction ||= self.class.initialize_measure(:m_conviction) do
    top = p_A*p_notB
    bottom = p_A_notB
    if bottom != 0
      top/bottom
    elsif top == 0
      0
    else
      Float::INFINITY
    end
  end
end
|
532
|
+
|
533
|
+
##
|
534
|
+
# uses 2 coefficients to weight the importance of the two factors
|
535
|
+
# k : dependency
|
536
|
+
# m : generality
|
537
|
+
M_INTERESTINGNESS_WEIGHTING_DEPENDENCY_MIN = 0
|
538
|
+
M_INTERESTINGNESS_WEIGHTING_DEPENDENCY_MID = 0
|
539
|
+
M_INTERESTINGNESS_WEIGHTING_DEPENDENCY_MAX = 1
|
540
|
+
# Interestingness weighting dependency with both weights fixed to 2:
# ((p_BgivenA / p_B) ** (k - 1)) * (p_AB ** m); memoized.
def m_interestingness_weighting_dependency
  dependency_weight = 2
  generality_weight = 2
  @m_interestingness_weighting_dependency ||= self.class.initialize_measure(:m_interestingness_weighting_dependency) do
    dependency = (p_BgivenA/p_B)**(dependency_weight-1)
    dependency*(p_AB**generality_weight)
  end
end
|
547
|
+
|
548
|
+
M_COLLECTIVE_STRENGTH_MIN = -Float::INFINITY
|
549
|
+
M_COLLECTIVE_STRENGTH_MID = 1
|
550
|
+
M_COLLECTIVE_STRENGTH_MAX = Float::INFINITY
|
551
|
+
# range from Aggarwal1998
#
# Collective strength: the product of two ratios. A zero denominator is
# converted to Float before dividing, so the division does not raise.
#
# NOTE(review): both ratios use p_notBgivennotA (a conditional) next to
# p_AB (a joint); confirm against the intended source whether
# p_notA_notB was meant.
def m_collective_strength
  @m_collective_strength ||= self.class.initialize_measure(:m_collective_strength) do
    observed = (p_AB+p_notBgivennotA)
    expected = (p_A*p_B+p_notA*p_notB)
    agreement = if expected == 0
                  observed/expected.to_f
                else
                  observed/expected
                end
    expected_miss = (1-p_A*p_B-p_notA*p_notB)
    observed_miss = (1-p_AB-p_notBgivennotA)
    violation = if observed_miss == 0
                  expected_miss/observed_miss.to_f
                else
                  expected_miss/observed_miss
                end
    agreement * violation
  end
end
|
563
|
+
|
564
|
+
M_GINI_INDEX_MIN = 0
|
565
|
+
M_GINI_INDEX_MID = 0
|
566
|
+
M_GINI_INDEX_MAX = 1
|
567
|
+
##
# Measures quadratic entropy:
# p_A*(p_BgivenA^2 + p_notBgivenA^2) + p_notA*(p_BgivennotA^2 + p_notBgivennotA^2)
#   - p_B^2 - p_notB^2
##
def m_gini_index
  @m_gini_index ||= self.class.initialize_measure(:m_gini_index) do
    within_a = p_A*((p_BgivenA**2)+(p_notBgivenA**2))
    within_not_a = p_notA*((p_BgivennotA**2)+(p_notBgivennotA**2))
    within_a+within_not_a-p_B**2-p_notB**2
  end
end
|
576
|
+
|
577
|
+
M_KAPPA_MIN = -1
|
578
|
+
M_KAPPA_MID = 0
|
579
|
+
M_KAPPA_MAX = 1
|
580
|
+
# Kappa: (p_AB + p_notA_notB - p_A*p_B - p_notA*p_notB) divided by
# (1 - p_A*p_B - p_notA*p_notB). A zero denominator is converted to Float
# before dividing; memoized.
def m_kappa
  @m_kappa ||= self.class.initialize_measure(:m_kappa) do
    top = (p_AB + p_notA_notB - p_A*p_B - p_notA*p_notB)
    bottom = (1 - p_A*p_B - p_notA*p_notB)
    if bottom == 0
      top/bottom.to_f
    else
      top/bottom
    end
  end
end
|
587
|
+
|
588
|
+
M_J_MEASURE_MIN = 0
|
589
|
+
M_J_MEASURE_MID = 0
|
590
|
+
M_J_MEASURE_MAX = 1
|
591
|
+
##
# Measures cross entropy:
# p_AB * ln(p_BgivenA / p_B) + p_A_notB * ln(p_notBgivenA / p_notB),
# where a term counts as 0 when its leading probability is 0.
##
def m_j_measure
  @m_j_measure ||= self.class.initialize_measure(:m_j_measure) do
    # Both logarithms are evaluated up front, before the zero checks.
    log_with_b = Math.log(p_BgivenA/(p_B.to_f))
    log_without_b = Math.log(p_notBgivenA/(p_notB.to_f))
    term_with_b = if p_AB == 0
                    0
                  else
                    p_AB*log_with_b
                  end
    term_without_b = if p_A_notB == 0
                       0
                     else
                       p_A_notB*log_without_b
                     end

    term_with_b+term_without_b
  end
end
|
605
|
+
|
606
|
+
M_ONE_WAY_SUPPORT_MIN = -1
|
607
|
+
M_ONE_WAY_SUPPORT_MID = 0
|
608
|
+
M_ONE_WAY_SUPPORT_MAX = Float::INFINITY
|
609
|
+
# One-way support: p_BgivenA * log2(p_AB / (p_A * p_B)) with the logarithm
# rationalized, or 0 when p_BgivenA is 0; memoized.
def m_one_way_support
  @m_one_way_support ||= self.class.initialize_measure(:m_one_way_support) do
    p_BgivenA == 0 ? 0 : p_BgivenA*(Math.log2(p_AB/(p_A*p_B)).rationalize)
  end
end
|
618
|
+
|
619
|
+
M_TWO_WAY_SUPPORT_MIN = -1
|
620
|
+
M_TWO_WAY_SUPPORT_MID = 0
|
621
|
+
M_TWO_WAY_SUPPORT_MAX = 1
|
622
|
+
# Two-way support: p_AB * log2(p_AB / (p_A * p_B)) with the logarithm
# rationalized, or 0 when p_AB is 0; memoized.
def m_two_way_support
  @m_two_way_support ||= self.class.initialize_measure(:m_two_way_support) do
    p_AB == 0 ? 0 : p_AB*(Math.log2(p_AB/(p_A*p_B)).rationalize)
  end
end
|
631
|
+
|
632
|
+
# aka Ø-coefficient
|
633
|
+
M_LINEAR_CORRELATION_COEFFICIENT_MIN = -1
|
634
|
+
M_LINEAR_CORRELATION_COEFFICIENT_MID = 0
|
635
|
+
M_LINEAR_CORRELATION_COEFFICIENT_MAX = 1
|
636
|
+
# Linear correlation coefficient: (p_AB - p_A*p_B) divided by
# sqrt(p_A * p_B * p_notA * p_notB). A zero root is divided as a Float;
# a non-zero root is rationalized first. Memoized.
def m_linear_correlation_coefficient
  @m_linear_correlation_coefficient ||= self.class.initialize_measure(:m_linear_correlation_coefficient) do
    covariance = (p_AB-(p_A*p_B))
    spread = (Math.sqrt(p_A*p_B*p_notA*p_notB))
    if spread == 0
      covariance/spread.to_f
    else
      covariance/(spread.rationalize)
    end
  end
end
|
643
|
+
|
644
|
+
M_COSINE_MIN = 0
|
645
|
+
M_COSINE_MID = 0
|
646
|
+
M_COSINE_MAX = 1
|
647
|
+
# Cosine: p_AB divided by the rationalized sqrt(p_A * p_B); memoized.
#
# @raise [Evoc::Exceptions::MeasureCalculationError] when the value block is
#   evaluated and the denominator is 0
def m_cosine
  @m_cosine ||= self.class.initialize_measure(__method__) {
    num = p_AB
    den = Math.sqrt(p_A*p_B).rationalize
    if den == 0
      # The message previously interpolated the undefined `p_a`, which
      # raised NameError instead of the intended error.
      raise Evoc::Exceptions::MeasureCalculationError, "Denominator became 0 when calculating cosine (a #{p_A}, b #{p_B}, ab #{p_AB})"
    end

    num/den
  }
end
|
658
|
+
|
659
|
+
M_LOEVINGER_MIN = -1
|
660
|
+
M_LOEVINGER_MID = 0
|
661
|
+
M_LOEVINGER_MAX = 1
|
662
|
+
##
# aka Certainty Factor
# The certainty factor is a measure of variation of the probability that
# Y is in a transaction when only considering transactions with X.
# An increasing CF means a decrease of the probability that Y is not in
# a transaction that X is in. Negative CFs have a similar interpretation.
#
# Computed as 1 - p_A_notB / (p_A * p_notB), or 1 when p_A * p_notB is 0.
##
def m_loevinger
  @m_loevinger ||= self.class.initialize_measure(:m_loevinger) do
    p_A*p_notB == 0 ? 1 : 1 - ((p_A_notB)/(p_A*p_notB))
  end
end
|
679
|
+
|
680
|
+
M_SEBAG_SCHOENAUER_MIN = 0
|
681
|
+
M_SEBAG_SCHOENAUER_MID = 0
|
682
|
+
M_SEBAG_SCHOENAUER_MAX = Float::INFINITY
|
683
|
+
# Sebag-Schoenauer: p_AB / p_A_notB, or Float::INFINITY when p_A_notB is 0;
# memoized.
def m_sebag_schoenauer
  @m_sebag_schoenauer ||= self.class.initialize_measure(:m_sebag_schoenauer) do
    p_A_notB == 0 ? Float::INFINITY : p_AB/p_A_notB
  end
end
|
692
|
+
|
693
|
+
M_VARYING_RATES_LIAISON_MIN = -1
|
694
|
+
M_VARYING_RATES_LIAISON_MID = 0
|
695
|
+
M_VARYING_RATES_LIAISON_MAX = Float::INFINITY
|
696
|
+
# Varying rates liaison: p_AB / (p_A * p_B), minus 1; memoized.
def m_varying_rates_liaison
  @m_varying_rates_liaison ||= self.class.initialize_measure(:m_varying_rates_liaison) do
    ratio = p_AB/(p_A*p_B)
    ratio - 1
  end
end
|
701
|
+
|
702
|
+
M_LEAST_CONTRADICTION_MIN = -Float::INFINITY
|
703
|
+
M_LEAST_CONTRADICTION_MID = 0
|
704
|
+
M_LEAST_CONTRADICTION_MAX = 1
|
705
|
+
# Least contradiction: (p_AB - p_A_notB) divided by p_B; memoized.
def m_least_contradiction
  @m_least_contradiction ||= self.class.initialize_measure(:m_least_contradiction) do
    surplus = p_AB-p_A_notB
    surplus/p_B
  end
end
|
710
|
+
|
711
|
+
M_ODD_MULTIPLIER_MIN = 0
|
712
|
+
M_ODD_MULTIPLIER_MID = 0
|
713
|
+
M_ODD_MULTIPLIER_MAX = Float::INFINITY
|
714
|
+
# Odd multiplier: (p_AB * p_notB) / (p_B * p_A_notB). When the denominator
# is 0 the result is 0 if the numerator is also 0, otherwise
# Float::INFINITY; memoized.
def m_odd_multiplier
  @m_odd_multiplier ||= self.class.initialize_measure(:m_odd_multiplier) do
    top = p_AB*p_notB
    bottom = p_B*p_A_notB
    if bottom != 0
      top/bottom
    elsif top == 0
      0
    else
      Float::INFINITY
    end
  end
end
|
729
|
+
|
730
|
+
M_EXAMPLE_AND_COUNTEREXAMPLE_RATE_MIN = -Float::INFINITY
|
731
|
+
M_EXAMPLE_AND_COUNTEREXAMPLE_RATE_MID = 0
|
732
|
+
M_EXAMPLE_AND_COUNTEREXAMPLE_RATE_MAX = 1
|
733
|
+
# 0 when equally many examples as counter examples
#
# Computed as (p_AB - p_A_notB) / p_AB, or -Float::INFINITY when p_AB is 0.
def m_example_and_counterexample_rate
  @m_example_and_counterexample_rate ||= self.class.initialize_measure(:m_example_and_counterexample_rate) do
    balance = (p_AB - p_A_notB)
    examples = p_AB
    examples == 0 ? -Float::INFINITY : balance/examples
  end
end
|
745
|
+
|
746
|
+
M_ZHANG_MIN = -1
|
747
|
+
M_ZHANG_MID = 0
|
748
|
+
M_ZHANG_MAX = 1
|
749
|
+
# Zhang: (p_AB - p_A*p_B) divided by the larger of (p_AB * p_notB) and
# (p_B * p_A_notB), or 0 when that denominator is 0; memoized.
def m_zhang
  @m_zhang ||= self.class.initialize_measure(:m_zhang) do
    deviation = p_AB-p_A*p_B
    scale = [p_AB*p_notB, p_B*p_A_notB].max
    scale == 0 ? 0 : deviation/scale
  end
end
|
760
|
+
|
761
|
+
M_LAPLACE_CORRECTED_CONFIDENCE_MIN = 0
|
762
|
+
M_LAPLACE_CORRECTED_CONFIDENCE_MID = 0
|
763
|
+
M_LAPLACE_CORRECTED_CONFIDENCE_MAX = 1
|
764
|
+
##
# Corrected confidence estimate decreases with lower support
# to account for estimation uncertainty with low counts.
#
# Computed on transaction counts as (count(A and B) + 1) / (count(A) + 2),
# where the counts are recovered as p_AB * n and p_A * n. The previous
# expression, (p_AB + 1) / (p_B + 2), added the constants to probabilities
# (so the result did not depend on the counts) and divided by the
# consequent instead of the antecedent.
##
def m_laplace_corrected_confidence
  @m_laplace_corrected_confidence ||= self.class.initialize_measure(__method__) {
    joint_count = p_AB*n
    antecedent_count = p_A*n
    (joint_count + 1)/(antecedent_count + 2)
  }
end
|
773
|
+
|
774
|
+
##
|
775
|
+
# building blocks for interestingness measures
|
776
|
+
#
|
777
|
+
# A refers to the antecedent of a rule
|
778
|
+
# B refers to the consequent of a rule
|
779
|
+
|
780
|
+
##
# the number of transactions (tx_store.size)
# n is converted into VALUE_TYPE to ensure that the type is used
# throughout calculations; the conversion function is looked up by the
# type's name
def n
  return @n if @n

  converter = VALUE_TYPE.method(VALUE_TYPE.to_s)
  @n = converter.call(tx_store.size)
end
|
787
|
+
|
788
|
+
##
# the ratio of tx with A as a subset: the number of transactions that
# tx_store returns for lhs (strict lookup), divided by n
def p_A
  @p_A ||= begin
    matching = tx_store.transactions_of_list(lhs, strict: true)
    matching.size/n
  end
end
|
793
|
+
|
794
|
+
##
# the ratio of tx where A is not a subset: 1 - p_A
def p_notA
  return @p_notA if @p_notA

  @p_notA = (1 - p_A)
end
|
799
|
+
|
800
|
+
##
# the ratio of tx with B as a subset: the number of transactions that
# tx_store returns for rhs (strict lookup), divided by n
def p_B
  @p_B ||= begin
    matching = tx_store.transactions_of_list(rhs, strict: true)
    matching.size/n
  end
end
|
805
|
+
|
806
|
+
##
# the ratio of tx where B is not a subset: 1 - p_B
def p_notB
  return @p_notB if @p_notB

  @p_notB = (1 - p_B)
end
|
811
|
+
|
812
|
+
##
# the ratio of tx with the union of A and B as a subset: the number of
# transactions that tx_store returns for (lhs | rhs) (strict lookup),
# divided by n
def p_AB
  @p_AB ||= begin
    matching = tx_store.transactions_of_list((lhs | rhs), strict: true)
    matching.size/n
  end
end
|
817
|
+
|
818
|
+
##
# the ratio of tx where A or B is a subset: p_A + p_B - p_AB
def p_AorB
  return @p_AorB if @p_AorB

  @p_AorB = p_A + p_B - p_AB
end
|
823
|
+
|
824
|
+
##
# the ratio of tx where neither A nor B is a subset: 1 - (p_A + p_B) + p_AB
def p_notA_notB
  return @p_notA_notB if @p_notA_notB

  @p_notA_notB = 1 - (p_A + p_B) + p_AB
end
|
829
|
+
|
830
|
+
##
# the ratio of tx where A is not a subset but B is: p_B - p_AB
def p_notA_B
  return @p_notA_B if @p_notA_B

  @p_notA_B = p_B - p_AB
end
|
835
|
+
|
836
|
+
##
# the ratio of tx where A is a subset but B is not: p_A - p_AB
def p_A_notB
  return @p_A_notB if @p_A_notB

  @p_A_notB = p_A - p_AB
end
|
841
|
+
|
842
|
+
##
# the ratio of the union being a subset to the number of txes where B is a subset:
# p_AB / p_B, taken as 0 when p_B is 0
def p_AgivenB
  @p_AgivenB ||= if p_B == 0
                   0
                 else
                   p_AB/p_B
                 end
end
|
847
|
+
|
848
|
+
##
# the ratio of the union being a subset to the number of txes where A is a subset:
# p_AB / p_A, taken as 0 when p_A is 0
def p_BgivenA
  @p_BgivenA ||= if p_A == 0
                   0
                 else
                   p_AB/p_A
                 end
end
|
853
|
+
|
854
|
+
# p_notA_B / p_B, taken as 0 when p_B is 0; memoized.
def p_notAgivenB
  @p_notAgivenB ||= if p_B == 0
                      0
                    else
                      p_notA_B/p_B
                    end
end
|
857
|
+
|
858
|
+
# p_A_notB / p_A; memoized.
# if A is 0, so is A,notB - the result is then taken as 0
def p_notBgivenA
  @p_notBgivenA ||= if p_A == 0
                      0
                    else
                      p_A_notB/p_A
                    end
end
|
862
|
+
|
863
|
+
# p_A_notB / p_notB; memoized.
# if notB is 0, so is A_notB - the result is then taken as 0
def p_AgivennotB
  @p_AgivennotB ||= if p_notB == 0
                      0
                    else
                      p_A_notB/p_notB
                    end
end
|
867
|
+
|
868
|
+
# p_notA_B / p_notA; memoized.
# if notA is 0, so is notA_B - the result is then taken as 0
def p_BgivennotA
  @p_BgivennotA ||= if p_notA == 0
                      0
                    else
                      p_notA_B/p_notA
                    end
end
|
872
|
+
|
873
|
+
# p_notA_notB / p_notA, taken as 0 when p_notA is 0; memoized.
def p_notBgivennotA
  @p_notBgivennotA ||= if p_notA == 0
                         0
                       else
                         p_notA_notB/p_notA
                       end
end
|
876
|
+
|
877
|
+
# p_notA_notB / p_notB; memoized.
# if notB is 0, so is notA and notB - the result is then taken as 0
def p_notAgivennotB
  @p_notAgivennotB ||= if p_notB == 0
                         0
                       else
                         p_notA_notB/p_notB
                       end
end
|
881
|
+
end
|
882
|
+
end
|