rank-aggregation 0.0.1

data/README.markdown ADDED
@@ -0,0 +1,22 @@
+ # Rank Aggregation
+
+ This is a simple library for rank aggregation: the process of taking a collection of individual preferences amongst items and turning them into a single global ranking.
+
+ It was developed for [Hammer Principle](http://hammerprinciple.com) and is based heavily on [Rank Aggregation Revisited](http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.113.2507&rep=rep1&type=pdf), with some minor innovations.
+
+ This isn't actually the version that's currently running on the site, but it's similar. I created this to experiment with variations on the algorithm and to clean up the code. Eventually the two will be one and the same.
+
+ ## Example usage
+
+ The use case this is optimised for is one where the number of items is relatively small (say a few hundred) but the number of votes is relatively large.
+
+ This can be used either as a command line tool or from within Ruby code.
+
+ The simplest usage in Ruby is as follows:
+
+     irb(main):002:0> RankAggregation.combine_rankings [ [1, 2, 3], [1, 2, 4], [3, 4] ]
+     => [1, 2, 3, 4]
+
+ This can be any enumerable of enumerables. If you pass it something which streams its items lazily it shouldn't need more than O(number of items to rank^2) memory.
+
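+ For example, here is a minimal sketch that streams votes from a tab separated file one line at a time using Ruby's Enumerator, so memory use is dominated by the pairwise counts rather than by the votes themselves (the file name `votes.tsv` is hypothetical):
+
+     votes = Enumerator.new do |yielder|
+       # votes.tsv is a hypothetical file with one tab separated ranking per line.
+       File.foreach("votes.tsv") do |line|
+         yielder << line.split("\t").map { |x| x.strip }
+       end
+     end
+
+     RankAggregation.combine_rankings(votes)
+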
+ The command line tool takes a list of tab separated rankings (one per line) and outputs a single aggregate ranking in the same format.
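+
+ For example, assuming the gem's `rank` executable is on your PATH, and using the same votes as the Ruby example above (it also accepts `--score` to append the average Kendall distance of the result, and `--rough` to sort by the rough Markov score alone):
+
+     $ printf '1\t2\t3\n1\t2\t4\n3\t4\n' | rank
+     1	2	3	4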
data/Rakefile ADDED
@@ -0,0 +1,46 @@
+ require 'rubygems'
+ require 'rake'
+ require 'spec/rake/spectask'
+
+ begin
+   require 'jeweler'
+   Jeweler::Tasks.new do |gem|
+     gem.name = "rank-aggregation"
+     gem.summary = %Q{Turn a collection of individual preferences on items into an aggregate rank for those preferences}
+     gem.email = "david@drmaciver.com"
+     gem.homepage = "http://github.com/DRMacIver/rank-aggregation"
+     gem.authors = ["David R. MacIver"]
+     gem.add_dependency "trollop"
+   end
+   Jeweler::GemcutterTasks.new
+ rescue LoadError
+   puts "Jeweler (or a dependency) not available. Install it with: gem install jeweler"
+ end
+
+
+ desc "Run all examples"
+ Spec::Rake::SpecTask.new('spec') do |t|
+   t.spec_files = FileList['spec/**/*_spec.rb']
+ end
+
+ task "scores" do
+   $: << "lib"
+   require "rank-aggregation"
+   require "rank-aggregation/scorer"
+
+   File.open("scores", "w"){ |o|
+     Dir["samples/*"].sort.each{|file|
+       name = file.gsub(/^samples\//, "")
+
+       items = IO.read(file).split("\n").map{|x| x.split("\t").map{|y| y.strip} }
+
+       r = RankAggregation::Ranker.new
+
+       items.each{|i| r.add_ranking i }
+
+       kendall_distance = RankAggregation::Scorer.average_kendall_distance(r.combined_rankings, items)
+
+       o.puts "#{name}: #{kendall_distance}"
+     }
+   }
+ end
data/VERSION ADDED
@@ -0,0 +1 @@
+ 0.0.1
data/bin/rank ADDED
@@ -0,0 +1,34 @@
+ #!/usr/bin/env ruby
+
+ $: << "#{File.dirname __FILE__}/../lib"
+
+ require "rank-aggregation/ranker"
+ require "rubygems"
+ require "trollop"
+
+ opts = Trollop.options do
+   opt :score, "Output the score as well as the aggregate ranking"
+   opt :rough, "Use only the rough score for sorting"
+ end
+
+ ranker = RankAggregation::Ranker.new
+
+ collect = [] if opts[:score]
+
+ STDIN.each_line{|l|
+   items = l.split("\t").map{|x| x.strip}.select{|x| x.length > 0}
+
+   collect << items if collect
+
+   ranker.add_ranking items
+ }
+
+ result = if opts[:rough] then ranker.rough_combined_rankings else ranker.combined_rankings end
+
+ puts result.join("\t")
+
+ if opts[:score]
+   require "rank-aggregation/scorer"
+
+   puts RankAggregation::Scorer.average_kendall_distance result, collect
+ end
data/lib/rank-aggregation.rb ADDED
@@ -0,0 +1,14 @@
+ module RankAggregation
+   def combine_rankings(ranks)
+     it = Ranker.new
+     ranks.each{|r| it.add_ranking(r) }
+
+     it.combined_rankings
+   end
+ end
+
+ class <<RankAggregation
+   include RankAggregation
+ end
+
+ require "rank-aggregation/ranker"
data/lib/rank-aggregation/markov.rb ADDED
@@ -0,0 +1,35 @@
+ module RankAggregation
+   class MarkovChain
+     def initialize(items, transitions)
+       @transitions = {}
+       @items = items
+
+       # Normalise the supplied transition weights so that each row sums to 1.
+       items.each{|x|
+         tot = items.map{|y| transitions[x][y]}.inject(0.0){|u, v| u + v}
+
+         tx = (@transitions[x] = {})
+
+         items.each{|y| tx[y] = transitions[x][y] / tot }
+       }
+     end
+
+     # Approximate the stationary distribution by running the chain forward
+     # from a uniform starting distribution for a fixed number of steps.
+     def stationary_distribution
+       dist = {}
+
+       @items.each{|x| dist[x] = 1.0 / @items.size }
+
+       10.times{
+         new_dist = Hash.new(0.0)
+
+         dist.each{|x, p|
+           @transitions[x].each{|y, q|
+             new_dist[y] += p * q
+           }
+         }
+         dist = new_dist
+       }
+
+       dist
+     end
+   end
+ end
data/lib/rank-aggregation/ordering.rb ADDED
@@ -0,0 +1,44 @@
+ require "set"
+
+ module RankAggregation
+   class Ordering
+     attr_accessor :order, :items, :determined
+
+     def initialize(items)
+       @items = Set[*items.to_a]
+       @determined = Set.new
+       @order = Hash.new{|h, k| h[k] = { k => 0 }}
+     end
+
+     # Record that x precedes y, then propagate the consequences so that the
+     # order stays transitively closed. Returns false if the pair is already
+     # determined, which also rejects edges that would create a cycle.
+     def determine(x, y)
+       return false if @order[x][y]
+
+       @order[x][y] = -1
+       @order[y][x] = 1
+
+       @determined << x if @order[x].size == @items.size
+       @determined << y if @order[y].size == @items.size
+
+       @order[x].each{|z, v|
+         determine(z, y) if v == 1
+       }
+
+       @order[y].each{|z, v|
+         determine(x, z) if v == -1
+       }
+       true
+     end
+
+     def determined?(item=nil)
+       if item
+         return self.determined.include?(item)
+       else
+         return self.determined.size == self.items.size
+       end
+     end
+
+     def [](x, y)
+       @order[x][y]
+     end
+   end
+ end
data/lib/rank-aggregation/ranker.rb ADDED
@@ -0,0 +1,115 @@
+ require "rank-aggregation/ordering"
+ require "rank-aggregation/markov"
+
+ module RankAggregation
+   class Ranker
+     attr_accessor :less_counts, :smoothing, :items
+
+     def initialize
+       @less_counts = {}
+       @items = Set.new
+       @smoothing = 5
+       @vote_count = 0
+     end
+
+     def add_ranking(xs)
+       return if xs.size <= 1
+
+       reset_cached
+       xs.each{|x| @less_counts[x] ||= Hash.new(0); items.add x }
+
+       (0...xs.length).each{|i|
+         ((i+1)...xs.length).each{|j|
+           @less_counts[xs[i]][xs[j]] += 1
+         }
+       }
+
+       @vote_count += 1
+     end
+
+     # Smoothed estimate of P(x < y) for every pair of items we have seen compared.
+     def less_chances
+       @_less_chances ||= begin
+         less_chances = Hash.new{|h, k| h[k] = Hash.new(0.5)}
+
+         less_counts.each{|x, vs|
+           vs.each{|y, c|
+             p = (c + 0.5 * self.smoothing) / (self.smoothing + c + less_counts[y][x])
+             less_chances[x][y] = p
+             less_chances[y][x] = 1 - p
+           }
+         }
+         less_chances
+       end
+     end
+
+     # Build the acyclic majority ordering: add majority edges in order of
+     # decreasing strength, skipping any edge that would introduce a cycle.
+     def base_ordering
+       @_base_ordering ||= begin
+         edges = []
+         less_chances.each{|x, ys|
+           ys.each{|y, v|
+             edges << [x, y, v] if v > 0.5
+           }
+         }
+         edges.sort!{|x, y| y[2] <=> x[2]}
+
+         ordering = Ordering.new(less_chances.keys)
+
+         edges.each{|x, y, v|
+           ordering.determine(x, y)
+           break if ordering.determined?
+         }
+         ordering
+       end
+     end
+
+     # The rough score for x is its weight in the stationary distribution of a
+     # Markov chain whose transitions favour items that are usually ranked later.
+     # We use this as a tie breaking heuristic.
+     def rough_scores
+       @_rough_scores ||= begin
+         # This Markov chain is based on MC4. The idea is as follows:
+         # Starting at an item we pick one of the other items at random.
+         # We then transition to that item with probability P(i < j).
+         # If we fail to transition we stay where we are.
+         # i.e. the probability of transitioning from i to j with i != j is 1/(n-1) P(i < j).
+
+         transitions = {}
+
+         @items.each{|i|
+           transitions[i] = {}
+           tot = 0.0
+           @items.each{|j|
+             next if i == j
+             p = less_chances[i][j] / (@items.size - 1)
+             tot += p
+             transitions[i][j] = p
+           }
+           if tot <= 1
+             transitions[i][i] = 1 - tot
+           else
+             transitions[i][i] = 0
+           end
+         }
+
+         MarkovChain.new(@items, transitions).stationary_distribution
+       end
+     end
+
+     def rough_combined_rankings
+       @_rough_combined_rankings ||= begin
+         @items.sort{|x, y| rough_scores[x] <=> rough_scores[y] }
+       end
+     end
+
+     def combined_rankings
+       @_combined_rankings ||= begin
+         @items.sort{|x, y| base_ordering[x, y] || (rough_scores[x] <=> rough_scores[y]) }
+       end
+     end
+
+     private
+
+     def reset_cached
+       self.instance_variables.grep(/^@_/).each{|v| instance_variable_set(v, nil)}
+     end
+   end
+ end
data/lib/rank-aggregation/scorer.rb ADDED
@@ -0,0 +1,41 @@
+ module RankAggregation
+   module Scorer
+     def average_kendall_distance(aggregate, ranks)
+       a_indices = {}
+
+       aggregate.each_with_index{|x, i|
+         a_indices[x] = i
+       }
+
+       parts = ranks.select{|x| x.size > 1 }.map{|x| kendall_distance(a_indices, x) }
+       parts.inject(0.0){|x, y| x + y} / ranks.size
+     end
+
+     # TODO: Decent implementation of this
+     private
+
+     def kendall_distance(a_indices, b)
+       tot = 0.0
+       (0...b.length).each{|i|
+         ((i + 1)...b.length).each{|j|
+           x = a_indices[b[i]]
+           y = a_indices[b[j]]
+           next unless x && y
+
+           tot += 1 if x > y
+         }
+       }
+
+       n = (a_indices.keys & b).size
+
+       p b if n <= 1
+
+       tot / (0.5 * n * (n - 1))
+     end
+   end
+
+   class <<Scorer
+     include Scorer
+   end
+ end
data/notes/algorithm.markdown ADDED
@@ -0,0 +1,52 @@
+ # Algorithm description
+
+ This is a description of the algorithm embodied in this library.
+
+ The input of this algorithm is a list of partial rankings of a set of items (the set of items is not known up front - it's computed from the lists. This is a minor and not terribly important detail).
+ The output of this algorithm is an aggregate ranking which is intended to reflect the input rankings as well as possible.
+
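+ For example, ["python", "ruby", "perl"] and ["ruby", "haskell"] would both be acceptable input rankings, even though they mention different (overlapping) sets of items.
+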
+ Our algorithm is heavily based on [Rank Aggregation Revisited](http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.113.2507&rep=rep1&type=pdf), though the details are fairly different. This is largely a result of our slightly different domain of interest: the case where the number of items is expected to be small compared to the number of votes.
+
+ First we convert the list of rankings into pairwise probabilities. For any two items i, j we work out an estimate of P(i < j). We work this out as follows:
+
+     P(i < j) = (0.5 * SMOOTHING + #{ vote : vote(i < j) }) / (SMOOTHING + #{ vote : vote ranks both i and j })
+
+ where SMOOTHING is a constant designed to compensate for small amounts of data. In our implementation it defaults to 5. The idea is that our default belief is that the probability is 0.5, and we require more data to pull our beliefs away from that.
+
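+ As a quick worked example with the default SMOOTHING of 5: if 9 votes rank i before j and 1 vote ranks j before i, then P(i < j) = (0.5 * 5 + 9) / (5 + 10) = 11.5 / 15 ≈ 0.77, while with no votes at all the estimate stays at 2.5 / 5 = 0.5.
+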
+ This allows us to build a majority graph, with an edge from i to j if the majority think i < j. Due to [Condorcet's paradox](http://en.wikipedia.org/wiki/Voting_paradox) this graph may contain cycles. Our first task is to remove cycles from this graph and turn it into a partial order on the items. We do this as follows:
+
+ We start with a graph with no edges. In order of decreasing P we add in the majority edges. So we start by adding an edge i -> j for the i, j which maximize P(i < j), then we add the second highest value, etc. Whenever adding an edge would introduce a cycle we skip it.
+
+ The idea is that we prefer larger majorities to smaller ones, and where a majority vote contradicts one we've already taken into account we ignore it. This is a reasonable, but of course not necessarily optimal, strategy.
+
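+ A minimal sketch of this cycle-breaking step, using the `Ordering` class from this library (`chances` here is an assumed nested hash of the P(i < j) estimates; the real code lives in `Ranker#base_ordering`):
+
+     require "rank-aggregation/ordering"
+
+     # `chances` is assumed to hold the P(i < j) estimates: {item => {item => probability}}.
+     edges = []
+     chances.each { |i, row| row.each { |j, p| edges << [i, j, p] if p > 0.5 } }
+
+     # Add the strongest majorities first. Ordering#determine refuses any edge
+     # that contradicts what it already knows, i.e. anything that would close a cycle.
+     ordering = RankAggregation::Ordering.new(chances.keys)
+     edges.sort_by { |_, _, p| -p }.each { |i, j, _| ordering.determine(i, j) }
+
+ Afterwards `ordering[i, j]` is -1 if i << j, 1 if j << i, and nil if the pair is still undetermined.
+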
+ Because there are no cycles, this gives us a partially defined order on the items: if there is a path from i to j then we declare i << j.
+
+ Note: We implement this by caching the values of <<, so we can efficiently determine whether there is a cycle. This results in an O(|items|^2) algorithm. This is actually not bad: this algorithm is optimised for the case where there are many more votes than items.
+
+ Claim: Anything sorted with respect to << is locally Kemeny optimal (i.e. no swap of two adjacent items in the output can reduce the number of pairwise disagreements with the majority).
+
+ Proof: Let i, j be adjacent in the final order with i << j. Consider the point in the algorithm at which we first decide that i << j, i.e. we have just added a path from i to j. By hypothesis, i and j are adjacent in the order, but any z on the path between them would have i << z << j. Thus we must have added an edge directly from i to j. But we only add edges in line with the majority vote. Therefore the majority must think i < j.
+
+ Most of the time this order is pretty close to totally defined:
+
+ Claim: If the majority have a preference between i and j (i.e. P(i < j) != 0.5) then either i << j or j << i.
+
+ Proof: Run the algorithm to the point where we try to add the majority edge between i and j. If we've not already formed a path between the two, we form one then.
+
+ Additionally there are cases where this decides i << j without a majority preference either way: e.g. if a majority think A < B and B < C, but we don't know anything about what they think about A versus C, we can still conclude A << C.
+
+ So the idea is that we want to sort with respect to <<. However, for sparse data we may end up with scenarios where there are gaps in the ordering.
+
+ The expected pattern is that we get things like A << B and C << D but don't have any opinion on whether B < C. The idea is that if we had a scenario where 90% of people think that B < D but only 60% of people think that C < D, then that's reasonable evidence that C is closer to D than B is.
+
+ So we define a score which we use as a tie breaker: things with a higher score will be placed after things with a lower score if << is not defined for the pair. This score only gets calculated if we ever need it.
+
+ We use a Markov chain method. The idea of these is as follows: you define a random walk on your items, where items are more likely to transition to "good" items. You then take the stationary distribution of the Markov chain (basically its long term behaviour) and use this as the score: because we're more likely to transition to good items, we expect to spend more time there. Thus the probability of being in a state is a good proxy for its goodness.
+
+ The source paper describes various Markov chain methods. The one we use is close to what they call MC4, but uses a slightly more nuanced approach which takes into account the actual probabilities rather than just whether they were a majority vote. The probability of transitioning from i to j is proportional to P(i < j). Thus higher probability items should appear towards the end of the list.
+
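+ To make that concrete, here is a small sketch of the MC4-style chain using the `MarkovChain` class from this library; the three items and their pairwise probabilities are made up purely for illustration:
+
+     require "rank-aggregation/markov"
+
+     # Hypothetical P(i < j) estimates for three items.
+     chances = {
+       "a" => { "b" => 0.9, "c" => 0.8 },
+       "b" => { "a" => 0.1, "c" => 0.6 },
+       "c" => { "a" => 0.2, "b" => 0.4 }
+     }
+     items = chances.keys
+
+     transitions = {}
+     items.each do |i|
+       transitions[i] = {}
+       staying = 1.0
+       items.each do |j|
+         next if i == j
+         q = chances[i][j] / (items.size - 1)  # move towards items usually ranked after i
+         transitions[i][j] = q
+         staying -= q
+       end
+       transitions[i][i] = staying             # otherwise stay where we are
+     end
+
+     scores = RankAggregation::MarkovChain.new(items, transitions).stationary_distribution
+     items.sort_by { |i| scores[i] }  # "a" should sort first: it is usually ranked before both others
+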
+ This then gives us our algorithm:
+
+ Work out <<. Work out the Markov scores if necessary. Sort the items by the relation: i comes before j if i << j, or if neither i << j nor j << i is defined and score(i) < score(j).
+
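+ In code this final sort is a one line comparator, and is essentially what `Ranker#combined_rankings` does (`ordering` and `scores` here are the objects from the sketches above):
+
+     items.sort { |i, j| ordering[i, j] || (scores[i] <=> scores[j]) }
+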
+ In principle this can result in ambiguity, because we could need the score to order i and j while score(i) = score(j). Really these scenarios should be considered ties. However, they're sufficiently unlikely in real data that we don't worry about that and break the ties arbitrarily.
data/rank-aggregation.gemspec ADDED
@@ -0,0 +1,62 @@
+ # Generated by jeweler
+ # DO NOT EDIT THIS FILE DIRECTLY
+ # Instead, edit Jeweler::Tasks in Rakefile, and run the gemspec command
+ # -*- encoding: utf-8 -*-
+
+ Gem::Specification.new do |s|
+   s.name = %q{rank-aggregation}
+   s.version = "0.0.1"
+
+   s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
+   s.authors = ["David R. MacIver"]
+   s.date = %q{2010-06-20}
+   s.default_executable = %q{rank}
+   s.email = %q{david@drmaciver.com}
+   s.executables = ["rank"]
+   s.extra_rdoc_files = [
+     "README.markdown"
+   ]
+   s.files = [
+     "README.markdown",
+     "Rakefile",
+     "VERSION",
+     "bin/rank",
+     "lib/rank-aggregation.rb",
+     "lib/rank-aggregation/markov.rb",
+     "lib/rank-aggregation/ordering.rb",
+     "lib/rank-aggregation/ranker.rb",
+     "lib/rank-aggregation/scorer.rb",
+     "notes/algorithm.markdown",
+     "rank-aggregation.gemspec",
+     "samples/clear-with-noise",
+     "samples/languages",
+     "samples/pairs",
+     "samples/random",
+     "scores",
+     "spec/helper.rb",
+     "spec/rank-aggregation_spec.rb"
+   ]
+   s.homepage = %q{http://github.com/DRMacIver/rank-aggregation}
+   s.rdoc_options = ["--charset=UTF-8"]
+   s.require_paths = ["lib"]
+   s.rubygems_version = %q{1.3.7}
+   s.summary = %q{Turn a collection of individual preferences on items into an aggregate rank for those preferences}
+   s.test_files = [
+     "spec/rank-aggregation_spec.rb",
+     "spec/helper.rb"
+   ]
+
+   if s.respond_to? :specification_version then
+     current_version = Gem::Specification::CURRENT_SPECIFICATION_VERSION
+     s.specification_version = 3
+
+     if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
+       s.add_runtime_dependency(%q<trollop>, [">= 0"])
+     else
+       s.add_dependency(%q<trollop>, [">= 0"])
+     end
+   else
+     s.add_dependency(%q<trollop>, [">= 0"])
+   end
+ end
+