evoc 3.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (68)
  1. checksums.yaml +7 -0
  2. data/.gitignore +15 -0
  3. data/.rspec +2 -0
  4. data/.travis.yml +4 -0
  5. data/Gemfile +4 -0
  6. data/LICENSE.txt +21 -0
  7. data/Makefile +4 -0
  8. data/README.md +61 -0
  9. data/Rakefile +6 -0
  10. data/bin/console +14 -0
  11. data/bin/evoc +3 -0
  12. data/bin/setup +7 -0
  13. data/evoc.gemspec +30 -0
  14. data/lib/evoc/algorithm.rb +147 -0
  15. data/lib/evoc/algorithms/top_k.rb +86 -0
  16. data/lib/evoc/analyze.rb +395 -0
  17. data/lib/evoc/array.rb +43 -0
  18. data/lib/evoc/evaluate.rb +109 -0
  19. data/lib/evoc/exceptions/aggregation_error.rb +6 -0
  20. data/lib/evoc/exceptions/expectedoutcome_nil_or_empty.rb +6 -0
  21. data/lib/evoc/exceptions/measure_calculation_error.rb +6 -0
  22. data/lib/evoc/exceptions/no_changed_items_in_changes.rb +6 -0
  23. data/lib/evoc/exceptions/no_changes_in_json_object.rb +6 -0
  24. data/lib/evoc/exceptions/no_date_in_json_object.rb +6 -0
  25. data/lib/evoc/exceptions/no_result.rb +6 -0
  26. data/lib/evoc/exceptions/non_finite.rb +8 -0
  27. data/lib/evoc/exceptions/non_numeric.rb +8 -0
  28. data/lib/evoc/exceptions/not_a_query.rb +6 -0
  29. data/lib/evoc/exceptions/not_a_result.rb +6 -0
  30. data/lib/evoc/exceptions/not_a_transaction.rb +6 -0
  31. data/lib/evoc/exceptions/not_initialized.rb +6 -0
  32. data/lib/evoc/exceptions/only_nil_in_changes.rb +6 -0
  33. data/lib/evoc/exceptions/query_nil_or_empty.rb +6 -0
  34. data/lib/evoc/exceptions/unable_to_convert_json_to_tx.rb +6 -0
  35. data/lib/evoc/experiment.rb +239 -0
  36. data/lib/evoc/hash.rb +56 -0
  37. data/lib/evoc/history_store.rb +53 -0
  38. data/lib/evoc/hyper_rule.rb +53 -0
  39. data/lib/evoc/interestingness_measure.rb +77 -0
  40. data/lib/evoc/interestingness_measure_aggregator.rb +147 -0
  41. data/lib/evoc/interestingness_measures.rb +882 -0
  42. data/lib/evoc/logger.rb +34 -0
  43. data/lib/evoc/memory_profiler.rb +43 -0
  44. data/lib/evoc/recommendation_cache.rb +152 -0
  45. data/lib/evoc/rule.rb +32 -0
  46. data/lib/evoc/rule_store.rb +340 -0
  47. data/lib/evoc/scenario.rb +303 -0
  48. data/lib/evoc/svd.rb +124 -0
  49. data/lib/evoc/tx.rb +34 -0
  50. data/lib/evoc/tx_store.rb +379 -0
  51. data/lib/evoc/version.rb +3 -0
  52. data/lib/evoc.rb +4 -0
  53. data/lib/evoc_cli/analyze.rb +198 -0
  54. data/lib/evoc_cli/cli_helper.rb +1 -0
  55. data/lib/evoc_cli/experiment.rb +78 -0
  56. data/lib/evoc_cli/info.rb +22 -0
  57. data/lib/evoc_cli/main.rb +29 -0
  58. data/lib/evoc_cli/util.rb +36 -0
  59. data/lib/evoc_helper.rb +40 -0
  60. data/mem_profiler/Gemfile.lock +39 -0
  61. data/mem_profiler/README.md +126 -0
  62. data/mem_profiler/createdb.rb +4 -0
  63. data/mem_profiler/db.rb +82 -0
  64. data/mem_profiler/gemfile +6 -0
  65. data/mem_profiler/gencsv.rb +64 -0
  66. data/mem_profiler/genimport.sh +8 -0
  67. data/mem_profiler/graph.rb +91 -0
  68. metadata +251 -0
data/lib/evoc/experiment.rb ADDED
@@ -0,0 +1,239 @@
+ module Evoc
+ class Experiment
+ include Logging
+ attr_accessor :opts
+
+ @@logger_level = :error
+
+ def initialize(opts = Hash.new)
+ self.opts = opts
+ # Set logger level
+ Logging.set_level(self.opts[:logger_level])
+ logger.debug "Initialized experiment with options: #{opts}"
+ # setup history
+ if !opts[:transactions].nil?
+ Evoc::HistoryStore.initialize(path: self.opts[:transactions],case_id: self.opts[:case_id], granularity: self.opts[:granularity])
+ end
+ end
+
+ def sample_transactions
+ STDERR.puts "Sampling transactions.."
+ # by default we can sample from the whole history
+ sampling_history = Evoc::HistoryStore.base_history
+ sample = []
+ # filter out transactions larger than X
+ if !self.opts[:maximum_commit_size].nil?
+ STDERR.puts "Only sampling txes smaller than #{self.opts[:maximum_commit_size]}"
+ sampling_history = sampling_history.clone_with_subset(0,sampling_history.size-1,self.opts[:maximum_commit_size])
+ end
+ # only sample transactions that have at least 'minimum_history' previous history
+ if !self.opts[:minimum_history].nil?
+ STDERR.puts "Only sampling txes with at least #{self.opts[:minimum_history]} previous txes (history)"
+ if self.opts[:minimum_history] >= sampling_history.size-1
+ raise ArgumentError, "The history you provided (#{self.opts[:transactions]}), only contains #{sampling_history.size}, not enough to sample with a minimum history set to #{self.opts[:minimum_history]}. Perhaps also #{self.opts[:maximum_commit_size]} must be increased."
+ end
+ sampling_history = sampling_history.clone_with_subset(self.opts[:minimum_history],sampling_history.size-1)
+ end
+ # group the txes by size
+ groups = sampling_history.group_by {|tx| tx.size}
+ # sort the sample_groups option to reduce the need for maintaining control over which txes have been sampled
+ # i.e., random sampling is done first, then the sampled txes are removed from the sampling
+ tx_sizes_to_sample_from = self.opts[:sample_groups].sort_by(&:to_s)
+ tx_sizes_to_sample_from.each do |group_size|
+ if group_size == '*'
+ # TODO: > 2 should be generalized to > X
+ txes_larger_than_one = sampling_history.select {|tx| tx.size > 2}.map(&:id)
+ sampled_ids = txes_larger_than_one.sample(self.opts[:sample_size])
+ sample << sampled_ids
+ STDERR.puts "Sampled #{sampled_ids.size} txes from the whole history"
+ # remove sampled txes from sampling_history
+ filtered_hist = sampling_history.reject {|tx| sampled_ids.include? tx.id}
+ sampling_history.clear
+ filtered_hist.each {|tx| sampling_history << tx}
+ elsif group_size.to_i
+ # check if there were any txes of this size
+ if group = groups[group_size.to_i]
+ if group.size < self.opts[:sample_size]
+ logger.warn "Only #{group.size} transactions found of size #{group_size}, asked for #{self.opts[:sample_size]}"
+ end
+ sampled_ids = group.sample(self.opts[:sample_size]).map(&:id)
+ sample << sampled_ids
+ STDERR.puts "Sampled #{sampled_ids.size} txes of size #{group_size}"
+ else
+ logger.warn "No transactions found of size #{group_size}, asked for #{self.opts[:sample_size]} (minimum history: #{self.opts[:minimum_history]})"
+ end
+ else
+ raise ArgumentError.new, "Tx size for sampling must either be specified by an Integer or '*' (was #{group_size}:#{group_size.class})"
+ end
+ end
+ sample.flatten.uniq
+ end
+
+
+ ##
+ # Generates a CSV of queries according to the given options
+ #
+ # CSV HEADER:
+ #
+ # tx_id, query
+ #
+ def generate_queries
+ ##
+ # WRITE CSV HEADER
+ CSV {|row| row << %W(tx_id query)}
+
+ ###
+ # Iterate over sampled tx ids
+ CSV.foreach(self.opts[:transaction_ids_path], headers: true) do |row|
+ tx_id = row['tx_id']
+ ##
+ # GET THE TRANSACTION
+ if tx = Evoc::HistoryStore.base_history.get_tx(id: tx_id, id_type: :id)
+ items = tx.items
+ tx_size = items.size
+ ##
+ # SAMPLE QUERIES
+ #
+ # We have 3 different strategies, which may produce the same sizes,
+ # but the same size does not need to be executed several times,
+ # so duplicates are removed
+ specified_sizes = []
+ if !self.opts[:select].nil? then specified_sizes << self.opts[:select].map(&:to_i) end
+ if !self.opts[:reverse_select].nil? then specified_sizes << self.opts[:reverse_select].map {|i| tx_size-i.to_i} end
+ if !self.opts[:percentage].nil? then specified_sizes << self.opts[:percentage].map {|p| (p.to_f/100*tx_size).ceil} end
+ # filter out sizes <= 1
+ specified_sizes.flatten!.select! {|s| s > 0}
+ specified_sizes.uniq!
+
+ random_sizes = []
+ if self.opts[:random_select] then random_sizes << Random.new.rand(1..(tx_size-1)) end
+
+ sampled_queries = []
+ # only specified sizes
+ if random_sizes.empty? & !specified_sizes.empty?
+ sampled_queries = specified_sizes.map {|s| items.sample(s)}
+ # only random sizes
+ elsif !random_sizes.empty? & specified_sizes.empty?
+ sampled_queries = random_sizes.map {|s| items.sample(s)}
+ # random + specified = randomly sample in range defined by specified
+ # ex:
+ # specified = [1,3,10,20]
+ # tx size = 4
+ #
+ # 1. remove X in specified that are larger than or equal to 4
+ # 2. randomly select X in specified = Y
+ # 3. randomly select Y in tx
+ elsif !random_sizes.empty? & !specified_sizes.empty?
+ specified_sizes.select! {|s| (s < tx_size) & (s > 1)} #1.
+ if randomly_sampled_size = specified_sizes.sample #2.
+ sampled_queries = [items.sample(randomly_sampled_size)] #3.
+ end
+ end
+
+ if sampled_queries.empty?
+ logger.warn "Unable to generate query from tx: #{items}, with params #{self.opts}"
+ end
+
+ ##
+ # WRITE CSV
+ sampled_queries.each do |query|
+ if query.size == tx_size
+ logger.debug "The size of the sampled query was equal to the size of the transaction, skipping.. Tx ID: #{tx_id}. Query size: #{query.size}"
+ next
+ end
+ CSV {|row| row << [tx_id,query.join(',')]}
+ end
+ else
+ raise ArgumentError, "The tx with id '#{tx_id}' was not found in the history: #{self.opts[:transactions]}, wrong file?"
+ end
+ end
+
+ #TODO possibly move this to execution
+ #############################################
+ # Filter the generated queries if requested #
+ #############################################
+ #
+ #
+ # logger.debug "Number of queries before filtering: #{query_store.size}"
+ #
+ # if self.opts[:filter_expected_outcome]
+ # STDERR.puts "Filtering expected outcome.."
+ # init_size = query_store.size
+ # query_store.filter_expected_outcome!
+ # STDERR.puts "Had to remove #{init_size - query_store.size} queries as a result of the expected outcome now being empty"
+ # end
+ # if query_store.empty?
+ # STDERR.puts "WARNING: Returning 0 queries (maybe increase sample size?)"
+ # end
+ end
+
+ ##
+ # Execute a set of scenarios
+ #
+ # @return: json lines stream to stdout (http://jsonlines.org/)
+ def execute_scenarios
+
+ ######
+ # Setup factors
+ #####
+
+ # Factor: model size aka model/learning/history size
+ factor_model_size = self.opts[:model_size].nil? ? nil : self.opts[:model_size].map {|s| [ 'model_size',s ]}
+ # Factor: Max size aka filtering constraint on history
+ factor_max_size = self.opts[:max_size].nil? ? nil : self.opts[:max_size].map {|s| [ 'max_size',s ]}
+ # Factor: Model age aka number of commits between query and last tx in history
+ factor_model_age = self.opts[:model_age].nil? ? nil : self.opts[:model_age].map {|s| [ 'model_age',s ]}
+ # Factor: Algorithm
+ factor_algorithms = self.opts[:algorithms].nil? ? nil : self.opts[:algorithms].map {|a| ['algorithm',a]}
+ # Factor: Measures
+ factor_measures = self.opts[:measures].map {|c| ['measures',c]}
+ # Factor: Aggregator
+ # if aggregation is requested, we also assume that no aggregation wants to be tested
+ # the non-aggregation rows are specified with 'aggregator' == nil
+ factor_aggregators = self.opts[:aggregators].nil? ? nil : (self.opts[:aggregators]+[nil]).map {|a| ['aggregator',a]}
+ # deprecated factor, always set to 1 for backwards compatibility
+ # factor_permutation = self.opts[:permutation].nil? ? nil : (1..self.opts[:permutation]).to_a.map {|p| [ 'permutation',p ]}
+ factor_permutation = [[ 'permutation',1 ]]
+
+ ####
+ # Iterate over the queries provided and execute each query in each scenario
+ #
+ # queries CSV header:
+ # tx_id,tx_index,tx,query,query_size,expected_outcome,expected_outcome_size
+
+ # Count the number of lines so we can properly format the output json
+ num_lines = File.read(self.opts[:queries]).each_line.count-1
+ current_line = 1
+
+ factors = [factor_model_size,factor_max_size,factor_model_age,factor_algorithms,factor_measures,factor_permutation,factor_aggregators].compact
+ num_of_scenarios = factors.inject(1) {|product,f| product * f.size}
+ CSV.foreach(self.opts[:queries], headers: true) do |query|
+ # abort if the failsafe file is present
+ if !self.opts[:fail_safe].nil?
+ if File.exists?(self.opts[:fail_safe])
+ $stderr.puts "\nFail safe detected, exiting.."
+ break
+ end
+ end
+ current_scenario = 1
+ # - compact removes nil values (not used factors)
+ # - the splat operator '*' turns the array into parameters for #product
+ # - the block form of #product makes it lazy (i.e., the whole cartesian product isn't generated at once)
+ factors.first.product(*factors[1..-1]).each do |scenario|
+ # Print progress to stderr
+ STDERR.print "(#{self.opts[:case_id]}) Executing scenario #{current_scenario} of #{num_of_scenarios} on query #{current_line} of #{num_lines} \r"
+
+ params = query.to_h.merge(scenario.to_h)
+ params[:case_id] = self.opts[:case_id]
+ params[:granularity] = self.opts[:granularity]
+ # initialize scenario
+ q = Evoc::Scenario.new(params)
+ $stdout.puts q.call(evaluators: self.opts[:evaluators]).to_json
+ current_scenario += 1
+ end
+ current_line += 1
+ end
+ STDERR.puts "\n(#{self.opts[:case_id]}) DONE"
+ end
+ end
+ end
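
Note on the scenario expansion in execute_scenarios above: each run is one element of the cartesian product of the factor arrays, built with Array#product. A minimal standalone sketch of that technique (the factor names and levels below are placeholders for illustration, not evoc defaults):

    # Each factor is an array of ['name', level] pairs, mirroring the structure above.
    factor_algorithms = [['algorithm', 'algo_a'], ['algorithm', 'algo_b']]
    factor_model_size = [['model_size', 1_000], ['model_size', 10_000]]
    factor_measures   = [['measures', 'confidence']]

    factors = [factor_algorithms, factor_model_size, factor_measures].compact

    # Splatting the remaining factors into #product enumerates every combination;
    # passing a block (rather than chaining .each on the result) lets #product yield
    # combinations one by one instead of returning the full product array.
    factors.first.product(*factors[1..-1]) do |scenario|
      params = scenario.to_h  # e.g. {"algorithm"=>"algo_a", "model_size"=>1000, "measures"=>"confidence"}
      puts params.inspect
    end

This yields 2 x 2 x 1 = 4 parameter hashes, matching how num_of_scenarios is computed above.
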
data/lib/evoc/hash.rb ADDED
@@ -0,0 +1,56 @@
+ class Hash
+ include Logging
+
+ def convert_values(except: [],converter: Rational, convert_empty: false)
+ logger.debug "Converter: #{converter}, Hash: #{self}"
+ if except.is_a?(Array)
+ Hash[self.map {|k, v|
+ if except.include?(k)
+ [k,v]
+ else
+ # don't convert empty fields to avoid side-effects
+ # "".to_i is 0 for example
+ if convert_empty
+ [k, converter.method(converter.to_s).call(v)]
+ else
+ if v.to_s.empty?
+ [k,v]
+ else
+ [k, converter.method(converter.to_s).call(v)]
+ end
+ end
+ end}]
+ else
+ raise ArgumentError.new, "Specify an array of keys whose values you don't want to convert"
+ end
+ end
+
+ ##
+ # stolen from rails
+ # http://api.rubyonrails.org/classes/Hash.html
+ ##
+ def symbolize_keys
+ transform_keys{ |key| key.to_sym rescue key }
+ end
+
+ def symbolize_keys!
+ transform_keys!{ |key| key.to_sym rescue key }
+ end
+
+ def transform_keys
+ return enum_for(:transform_keys) unless block_given?
+ result = self.class.new
+ each_key do |key|
+ result[yield(key)] = self[key]
+ end
+ result
+ end
+
+ def transform_keys!
+ return enum_for(:transform_keys!) unless block_given?
+ keys.each do |key|
+ self[yield(key)] = delete(key)
+ end
+ self
+ end
+ end
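
A note on Hash#convert_values above: it resolves a conversion method from the converter's own name (e.g. Kernel#Integer for the Integer class) and applies it to every value, skipping the except keys and, unless convert_empty is set, empty values. A small usage sketch, assuming the gem (which mixes Logging into Hash) is loaded:

    row = { 'tx_id' => '42', 'query' => '', 'case_id' => 'rails' }

    # Convert values with Kernel#Integer, leaving 'case_id' untouched; the empty
    # 'query' value is also kept as-is because convert_empty defaults to false.
    row.convert_values(except: ['case_id'], converter: Integer)
    # => { 'tx_id' => 42, 'query' => '', 'case_id' => 'rails' }

    { 'a' => 1 }.symbolize_keys  # => { :a => 1 }
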
data/lib/evoc/history_store.rb ADDED
@@ -0,0 +1,53 @@
+ module Evoc
+ class HistoryStore
+ extend Logging
+
+ # create accessors for class level instance variables
+ class << self
+ attr_accessor :tag, :history, :svd
+ end
+
+ def self.initialize(path:,case_id: 'CASEID_NOT_PROVIDED', granularity: 'mixed')
+ self.base_history = Evoc::TxStore.new(path: path,case_id: case_id, granularity: granularity)
+ self.tag = gen_tag(0,self.base_history.size,"all")
+ return self
+ end
+
+ def self.get_history(start_index,end_index,max_size=nil)
+ tag = gen_tag(start_index,end_index,max_size)
+ if self.tag.nil?
+ raise Evoc::Exceptions::NotInitialized.new, "The history store must be initialized with a base history before fetching subhistories"
+ elsif self.tag != tag
+ # new history
+ self.tag = tag
+ # create new subset
+ self.history = self.base_history.clone_with_subset(start_index,end_index,max_size)
+ logger.info "Caching new history | start_index: #{start_index}, end_index: #{end_index}, max_size: #{max_size}, actual filtered size: #{self.history.size}"
+ # make the history immutable
+ self.history.freeze
+ end
+ self.history
+ end
+
+ def self.base_history=tx_store
+ @@base_history = tx_store.freeze
+ end
+
+ def self.base_history
+ @@base_history
+ end
+
+ def self.get_svd(start_index,end_index,max_size=nil)
+ tag = self.gen_tag(start_index,end_index,max_size)
+ if self.svd.nil? || (self.tag != tag)
+ self.svd = Evoc::SVD.new(get_history(start_index,end_index,max_size))
+ end
+ self.svd
+ end
+
+ private
+ def self.gen_tag(start_index,end_index,max_size)
+ start_index.to_s+end_index.to_s+max_size.to_s
+ end
+ end # HistoryStore
+ end # Evoc
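
HistoryStore above caches the most recently requested sub-history: get_history derives a tag from its arguments and only clones (and freezes) a new subset of the base history when the tag changes. A hypothetical call sequence (the path and indices are illustrative only):

    # One-time setup from a transaction history file.
    Evoc::HistoryStore.initialize(path: 'transactions.json', case_id: 'demo', granularity: 'mixed')

    # The first call clones and freezes a subset of the base history ...
    h1 = Evoc::HistoryStore.get_history(0, 99)
    # ... a repeated call with the same arguments returns the cached frozen object.
    h2 = Evoc::HistoryStore.get_history(0, 99)
    h1.equal?(h2)  # => true
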
data/lib/evoc/hyper_rule.rb ADDED
@@ -0,0 +1,53 @@
+ module Evoc
+ class HyperRule < Rule
+ attr_accessor :lhs, :rhs, :hyper_coefficient, :hyper_confidence
+
+ def initialize(rules,aggregator,measures)
+ if measures.nil? || measures.empty?
+ raise ArgumentError, "At least one measure must be provided when defining a new hyper rule"
+ end
+ self.lhs = rules.map(&:lhs).array_union
+ self.rhs = rules.map(&:rhs).array_union
+ self.tx_store = rules.first.tx_store
+ self.hyper_coefficient = rules.size
+ # aggregate measures
+ measures.each do |m|
+ logger.debug "Aggregating #{self.hyper_coefficient} rules on #{m}"
+ # check if it's the hyper confidence
+ if m == 'm_hyper_confidence'
+ antecedent = self.tx_store.transactions_of_list(self.lhs)
+ consequent = self.tx_store.transactions_of_list(self.rhs)
+ numerator = (antecedent & consequent).size
+ denominator = antecedent.size
+ hyper_confidence = numerator/denominator
+ self.set_measure(m,hyper_confidence,hyper_measure: true)
+ else
+ begin
+ # get the 'm' measure of each rule
+ m_measures = rules.map {|rule| rule.get_measure(m)}
+ # remove undefined measures
+ m_measures.reject! {|a| a.value.nil?}
+ # group the measures into negative and positive correlations
+ pos_correlation = m_measures.group_by {|a| a.value > Evoc::Rule.get_mid(m)}
+ positive_correlation = pos_correlation[true].nil? ? 0 : Evoc::InterestingnessMeasureAggregator.new(m,pos_correlation[true]).method(aggregator).call.value
+ # add the aggregated measure
+ self.set_measure(m,positive_correlation,hyper_measure: true)
+
+ # add the hyper coefficient as a measure for aggregators called with '.._hc'
+ if !(aggregator =~ /_hc\z/).nil?
+ self.set_measure('m_hyper_coefficient',hyper_coefficient,hyper_measure: true)
+ end
+
+ rescue Evoc::Exceptions::InterestingnessMeasure::NonFinite
+ logger.debug "Could not aggregate #{m} using #{aggregator}, aggregation produced a non-finite value, probably a range problem, setting measure value to nil"
+ self.set_measure(m,nil, hyper_measure: true)
+ end
+ end
+ end
+ end
+
+ def name
+ self.lhs.join(',') + " => " + self.rhs.join(',')
+ end
+ end
+ end
data/lib/evoc/interestingness_measure.rb ADDED
@@ -0,0 +1,77 @@
+ module Evoc
+ class InterestingnessMeasure
+ include Comparable, Logging
+ attr_accessor :type, :value, :min, :max, :mid, :hyper_measure
+
+ def initialize(type:,min:,mid:,max:,value: nil,hyper_measure: false)
+ @type = type
+ self.min = min
+ self.max = max
+ self.mid = mid
+ self.hyper_measure = hyper_measure
+ if block_given?
+ self.value = yield
+ else
+ self.value = value
+ end
+ end
+
+ #def value
+ # @value.nil? ? @value : @value
+ #end
+
+ def min=m
+ @min = m
+ end
+
+ def max=m
+ @max = m
+ end
+
+ def mid=m
+ @mid = m
+ end
+
+ def value=v
+ if v.nil?
+ @value = nil
+ elsif v.to_f.nan?
+ logger.warn "#{self.type}: #{v} was NAN"
+ @value = nil
+ else
+ @value = v
+ if !self.hyper_measure
+ if !v.between?(self.min,self.max)
+ logger.warn "#{self.type}: #{v} is not in the domain of [#{self.min},#{self.max}]"
+ end
+ end
+ end
+ end
+
+ def <=> other
+ return nil unless constr_equal_type other
+ self.value <=> other.value
+ end
+
+ def -@
+ -self.value
+ end
+
+ def finite?
+ self.value.finite?
+ end
+
+ def to_s
+ self.value.to_s
+ end
+
+ private
+ def constr_equal_type other
+ (self.class == other.class ? true : ( raise ArgumentError, "self: #{self.type}: #{self.class} was of different class than other: #{other.class}" ) ) &
+ (self.max == other.max ? true : ( raise ArgumentError, "self: #{self.type}: #{self.max} had a different max than other: #{other.max}" ) ) &
+ (self.mid == other.mid ? true : ( raise ArgumentError, "self: #{self.type}: #{self.mid} had a different mid than other: #{other.mid}" ) ) &
+ (self.min == other.min ? true : ( raise ArgumentError, "self: #{self.type}: #{self.min} had a different min than other: #{other.min}" ) ) &
+ (self.type == other.type ? true : ( raise ArgumentError, "self: #{self.type}: #{self.type} had a different type than other: #{other.type}" ) )
+ end
+ end
+ end
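
InterestingnessMeasure above couples a value with the measure's domain ([min, max]) and neutral midpoint; Comparable ordering is only defined between measures of the same type and domain, since constr_equal_type raises otherwise. A minimal sketch (the 'confidence' domain used here is an assumption for illustration):

    a = Evoc::InterestingnessMeasure.new(type: 'confidence', min: 0, mid: 0.5, max: 1, value: 0.8)
    b = Evoc::InterestingnessMeasure.new(type: 'confidence', min: 0, mid: 0.5, max: 1, value: 0.3)

    a > b          # => true; same type and domain, so the values are compared directly
    a.value = 1.5  # stored, but logs a warning: 1.5 is outside the [0, 1] domain
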
data/lib/evoc/interestingness_measure_aggregator.rb ADDED
@@ -0,0 +1,147 @@
+ module Evoc
+ class InterestingnessMeasureAggregator
+ attr_accessor :type,:min,:mid,:max,:values,:size
+
+ def initialize(type,values)
+ self.type = type
+ self.min = Evoc::InterestingnessMeasures.get_min(type)
+ self.mid = Evoc::InterestingnessMeasures.get_mid(type)
+ self.max = Evoc::InterestingnessMeasures.get_max(type)
+ self.values = values
+ self.size = values.size
+ end
+
+ ###
+ # Our own aggregation functions
+
+ def hcg
+ agr = normalize_measures.inject {|tot,i|
+ direction = i > 0 ? self.max-self.mid : self.min-self.mid
+ coefficient = direction.to_f.finite? ? (direction-tot)/direction : 1
+ tot + coefficient*i
+ } + self.mid
+ Evoc::InterestingnessMeasure.new(type: self.type, min: self.min, mid: self.mid, max: self.max, value: agr)
+ end
+
+ def hcg_hc
+ self.hcg
+ end
+
+ ##
+ # Aggregation functions borrowed from IR
+
+ ##
+ # cumulative gain aka sum
+ def cg
+ aggregated = normalize_measures.inject(:+) + self.mid
+ Evoc::InterestingnessMeasure.new(type: self.type, min: self.min, mid: self.mid, max: self.max, value: aggregated)
+ end
+
+ # discounted CG
+ def dcg
+ agr = normalize_measures.first + normalize_measures[1..-1].each_with_index.inject(0) {|sum,(element,index)| sum + element/Math.log2(index+2)}
+ agr = agr + self.mid
+ Evoc::InterestingnessMeasure.new(type: self.type, min: self.min, mid: self.mid, max: self.max, value: agr)
+ end
+
+ # discounted CG2
+ def dcg2
+ agr = normalize_measures.first + normalize_measures[1..-1].each_with_index.inject(0) {|sum,(element,index)|
+ sum + (element >= 0 ? (2**element-1)/Math.log2(index+2) : -(2**element.abs-1)/Math.log2(index+2))
+ }
+ agr = agr + self.mid
+ Evoc::InterestingnessMeasure.new(type: self.type, min: self.min, mid: self.mid, max: self.max, value: agr)
+ end
+
+ # def discounted_cg3
+ # agr = normalize_measures.each_with_index.inject(0) {|sum,(element,index)|
+ # sum + element >= 0 ? (2**element-1)/Math.log2(index+2) : -(2**element.abs-1)/Math.log2(index+2)
+ # }
+ # agr = agr + self.mid
+ # Evoc::InterestingnessMeasure.new(type: self.type, min: self.min, mid: self.mid, max: self.max, value: agr)
+ # end
+ #
+ # def discounted_cg4
+ # agr = normalize_measures.each_with_index.inject(0) {|sum,(element,index)|
+ # sum + (2**element)/Math.log2(index+2)
+ # }
+ # agr = agr + self.mid
+ # Evoc::InterestingnessMeasure.new(type: self.type, min: self.min, mid: self.mid, max: self.max, value: agr)
+ # end
+ #
+ # def discounted_cg5
+ # agr = normalize_measures.each_with_index.inject(0) {|sum,(element,index)|
+ # sum + element == 0 ? 0 : (2**element)/Math.log2(index+2)
+ # }
+ # agr = agr + self.mid
+ # Evoc::InterestingnessMeasure.new(type: self.type, min: self.min, mid: self.mid, max: self.max, value: agr)
+ # end
+ #
+ # def discounted_rank_cg
+ # agr = normalize_measures.each_with_index.inject(0) {|sum,(element,index)|
+ # sum + element/(index+1)
+ # }
+ # agr = agr + self.mid
+ # Evoc::InterestingnessMeasure.new(type: self.type, min: self.min, mid: self.mid, max: self.max, value: agr)
+ # end
+ #
+
+ def normalize_measures
+ self.values.map {|m| m - self.mid}
+ end
+
+ ##
+ # values=
+ #
+ # set and sort by abs value
+ def values=(measures)
+ if measures.all? {|m| m.is_a?(Evoc::InterestingnessMeasure)}
+ @values = measures.map(&:value)
+ else
+ @values = measures
+ end
+ @values.map!(&:to_f)
+ @values.sort! {|x,y| y.abs<=>x.abs}
+ end
+
+ private
+ def constr_equal_type other
+ (self.class == other.class ? true : ( raise ArgumentError, "self: #{self.type}: #{self.class} was of different class than other: #{other.class}" ) ) &
+ (self.max == other.max ? true : ( raise ArgumentError, "self: #{self.type}: #{self.max} had a different max than other: #{other.max}" ) ) &
+ (self.mid == other.mid ? true : ( raise ArgumentError, "self: #{self.type}: #{self.mid} had a different mid than other: #{other.mid}" ) ) &
+ (self.min == other.min ? true : ( raise ArgumentError, "self: #{self.type}: #{self.min} had a different min than other: #{other.min}" ) ) &
+ (self.type == other.type ? true : ( raise ArgumentError, "self: #{self.type}: #{self.type} had a different type than other: #{other.type}" ) )
+ # ((self.value.round(2) >= self.mid) == (other.value.round(2) >= self.mid) ? true : ( raise ArgumentError, "#{self.type}: self: #{self.value} was on another side of the midpoint than other: #{other.value}, midpoint was #{self.mid}" ))
+ end
+
+ def constr_value_in_range(agr_val)
+ if !agr_val.between?(self.min,self.max)
+ #raise Evoc::Exceptions::AggregationError, "#{self.type}: #{agr_val} was not in range: [#{self.min},#{self.max}]"
+ logger.warn "#{self.type}: #{agr_val} was not in range: [#{self.min},#{self.max}]"
+ end
+ end
+
+ def constr_self_value_equal_mid
+ satisfied = self.value == self.mid
+ if satisfied
+ logger.warn "self value (#{self.value}) was equal to mid value (#{self.mid}) when aggregating #{self.type} measure, just returning other"
+ end
+ satisfied
+ end
+
+ def constr_other_value_equal_mid(other)
+ satisfied = other.value == self.mid
+ if satisfied
+ logger.warn "other value (#{other.value}) was equal to mid value (#{self.mid}) when aggregating #{self.type} measure, just returning self"
+ end
+ satisfied
+ end
+
+ def constr_other_absvalue_smaller_than_self_absvalue(self_value,other_value)
+ if other_value.abs > self_value.abs
+ #raise ArgumentError.new, "The absolute value of the right hand argument must always be smaller or equal to the absolute value of the left hand side. lhs was #{self_value}, rhs was #{other_value} (possibly normalized around 0)"
+ logger.warn "The absolute value of the right hand argument must always be smaller or equal to the absolute value of the left hand side. lhs was #{self_value}, rhs was #{other_value} (possibly normalized around 0)"
+ end
+ end
+ end
+ end
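
All aggregators above operate on midpoint-normalized values: values= coerces the inputs to floats and sorts them by absolute value (largest first), normalize_measures shifts them by mid, and each aggregator adds mid back at the end. A standalone worked example mirroring dcg, for an assumed measure with mid = 0.5:

    mid    = 0.5
    values = [0.9, 0.2, 0.6].sort_by { |v| -v.abs }  # what values= does: [0.9, 0.6, 0.2]
    normalized = values.map { |v| v - mid }          # => [0.4, 0.1, -0.3]

    # First element at full weight, the rest discounted by log2 of their rank.
    dcg = normalized.first +
          normalized[1..-1].each_with_index.inject(0) { |sum, (e, i)| sum + e / Math.log2(i + 2) }
    dcg + mid  # => ~0.81  (0.4 + 0.1/log2(2) - 0.3/log2(3) + 0.5)
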