rank-aggregation 0.0.1

data/README.markdown ADDED
@@ -0,0 +1,22 @@
+ # Rank Aggregation
+
+ This is a simple library for rank aggregation: the process of taking a collection of individual preferences amongst items and turning them into a single global ranking.
+
+ It was developed for [Hammer Principle](http://hammerprinciple.com) and is based heavily on [Rank Aggregation Revisited](http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.113.2507&rep=rep1&type=pdf), with some minor innovations.
+
+ This isn't actually the version that's currently running on the site, but it's similar. I created this to experiment with variations on the algorithm and to clean up the code. Eventually the two will be one and the same.
+
+ ## Example usage
+
+ The use case this is optimised for is one where the number of items is relatively small (say a few hundred) but the number of votes is relatively large.
+
+ This can be used either as a command line tool or from within Ruby code.
+
+ The simplest usage in Ruby is as follows:
+
+     irb(main):002:0> RankAggregation.combine_rankings [ [1, 2, 3], [1, 2, 4], [3, 4] ]
+     => [1, 2, 3, 4]
+
+ This can be any enumerable of enumerables. If you pass it something which streams its items lazily it shouldn't need more than O(number of items to rank^2) memory.
+
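+ For example, here is a minimal sketch that streams votes from a tab separated file one line at a time using Ruby's Enumerator, so memory use is dominated by the pairwise counts rather than by the votes themselves (the file name `votes.tsv` is hypothetical):
+
+     votes = Enumerator.new do |yielder|
+       # votes.tsv is a hypothetical file with one tab separated ranking per line.
+       File.foreach("votes.tsv") do |line|
+         yielder << line.split("\t").map { |x| x.strip }
+       end
+     end
+
+     RankAggregation.combine_rankings(votes)
+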
+ The command line tool takes a list of tab separated rankings (one per line) and outputs a single aggregate ranking in the same format.
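+
+ For example, assuming the gem's `rank` executable is on your PATH, and using the same votes as the Ruby example above (it also accepts `--score` to append the average Kendall distance of the result, and `--rough` to sort by the rough Markov score alone):
+
+     $ printf '1\t2\t3\n1\t2\t4\n3\t4\n' | rank
+     1	2	3	4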
data/Rakefile ADDED
@@ -0,0 +1,46 @@
+ require 'rubygems'
+ require 'rake'
+ require 'spec/rake/spectask'
+
+ begin
+   require 'jeweler'
+   Jeweler::Tasks.new do |gem|
+     gem.name = "rank-aggregation"
+     gem.summary = %Q{Turn a collection of individual preferences on items into an aggregate rank for those preferences}
+     gem.email = "david@drmaciver.com"
+     gem.homepage = "http://github.com/DRMacIver/rank-aggregation"
+     gem.authors = ["David R. MacIver"]
+     gem.add_dependency "trollop"
+   end
+   Jeweler::GemcutterTasks.new
+ rescue LoadError
+   puts "Jeweler (or a dependency) not available. Install it with: gem install jeweler"
+ end
+
+
+ desc "Run all examples"
+ Spec::Rake::SpecTask.new('spec') do |t|
+   t.spec_files = FileList['spec/**/*_spec.rb']
+ end
+
+ task "scores" do
+   $: << "lib"
+   require "rank-aggregation"
+   require "rank-aggregation/scorer"
+
+   File.open("scores", "w"){ |o|
+     Dir["samples/*"].sort.each{|file|
+       name = file.gsub(/^samples\//, "")
+
+       items = IO.read(file).split("\n").map{|x| x.split("\t").map{|y| y.strip} }
+
+       r = RankAggregation::Ranker.new
+
+       items.each{|i| r.add_ranking i }
+
+       kendall_distance = RankAggregation::Scorer.average_kendall_distance(r.combined_rankings, items)
+
+       o.puts "#{name}: #{kendall_distance}"
+     }
+   }
+ end
data/VERSION ADDED
@@ -0,0 +1 @@
+ 0.0.1
data/bin/rank ADDED
@@ -0,0 +1,34 @@
+ #!/usr/bin/env ruby
+
+ $: << "#{File.dirname __FILE__}/../lib"
+
+ require "rank-aggregation/ranker"
+ require "rubygems"
+ require "trollop"
+
+ opts = Trollop.options do
+   opt :score, "Output the score as well as the aggregate ranking"
+   opt :rough, "Use only the rough score for sorting"
+ end
+
+ ranker = RankAggregation::Ranker.new
+
+ collect = [] if opts[:score]
+
+ STDIN.each_line{|l|
+   items = l.split("\t").map{|x| x.strip}.select{|x| x.length > 0}
+
+   collect << items if collect
+
+   ranker.add_ranking items
+ }
+
+ result = if opts[:rough] then ranker.rough_combined_rankings else ranker.combined_rankings end
+
+ puts result.join("\t")
+
+ if opts[:score]
+   require "rank-aggregation/scorer"
+
+   puts RankAggregation::Scorer.average_kendall_distance result, collect
+ end
data/lib/rank-aggregation.rb ADDED
@@ -0,0 +1,14 @@
+ module RankAggregation
+   def combine_rankings(ranks)
+     it = Ranker.new
+     ranks.each{|r| it.add_ranking(r) }
+
+     it.combined_rankings
+   end
+ end
+
+ class <<RankAggregation
+   include RankAggregation
+ end
+
+ require "rank-aggregation/ranker"
data/lib/rank-aggregation/markov.rb ADDED
@@ -0,0 +1,35 @@
+ module RankAggregation
+   class MarkovChain
+     def initialize(items, transitions)
+       @transitions = {}
+       @items = items
+
+       # Normalise the supplied transition weights so that each row sums to 1.
+       items.each{|x|
+         tot = items.map{|y| transitions[x][y]}.inject(0.0){|u, v| u + v}
+
+         tx = (@transitions[x] = {})
+
+         items.each{|y| tx[y] = transitions[x][y] / tot }
+       }
+     end
+
+     # Approximate the stationary distribution by running the chain forward
+     # from a uniform starting distribution for a fixed number of steps.
+     def stationary_distribution
+       dist = {}
+
+       @items.each{|x| dist[x] = 1.0 / @items.size }
+
+       10.times{
+         new_dist = Hash.new(0.0)
+
+         dist.each{|x, p|
+           @transitions[x].each{|y, q|
+             new_dist[y] += p * q
+           }
+         }
+         dist = new_dist
+       }
+
+       dist
+     end
+   end
+ end
data/lib/rank-aggregation/ordering.rb ADDED
@@ -0,0 +1,44 @@
+ require "set"
+
+ module RankAggregation
+   class Ordering
+     attr_accessor :order, :items, :determined
+
+     def initialize(items)
+       @items = Set[*items.to_a]
+       @determined = Set.new
+       @order = Hash.new{|h, k| h[k] = { k => 0 }}
+     end
+
+     # Record that x precedes y, then propagate the consequences so that the
+     # order stays transitively closed. Returns false if the pair is already
+     # determined, which also rejects edges that would create a cycle.
+     def determine(x, y)
+       return false if @order[x][y]
+
+       @order[x][y] = -1
+       @order[y][x] = 1
+
+       @determined << x if @order[x].size == @items.size
+       @determined << y if @order[y].size == @items.size
+
+       @order[x].each{|z, v|
+         determine(z, y) if v == 1
+       }
+
+       @order[y].each{|z, v|
+         determine(x, z) if v == -1
+       }
+       true
+     end
+
+     def determined?(item=nil)
+       if item
+         return self.determined.include?(item)
+       else
+         return self.determined.size == self.items.size
+       end
+     end
+
+     def [](x, y)
+       @order[x][y]
+     end
+   end
+ end
data/lib/rank-aggregation/ranker.rb ADDED
@@ -0,0 +1,115 @@
+ require "rank-aggregation/ordering"
+ require "rank-aggregation/markov"
+
+ module RankAggregation
+   class Ranker
+     attr_accessor :less_counts, :smoothing, :items
+
+     def initialize
+       @less_counts = {}
+       @items = Set.new
+       @smoothing = 5
+       @vote_count = 0
+     end
+
+     def add_ranking(xs)
+       return if xs.size <= 1
+
+       reset_cached
+       xs.each{|x| @less_counts[x] ||= Hash.new(0); items.add x }
+
+       (0...xs.length).each{|i|
+         ((i+1)...xs.length).each{|j|
+           @less_counts[xs[i]][xs[j]] += 1
+         }
+       }
+
+       @vote_count += 1
+     end
+
+     # Smoothed estimate of P(x < y) for every pair of items we have seen compared.
+     def less_chances
+       @_less_chances ||= begin
+         less_chances = Hash.new{|h, k| h[k] = Hash.new(0.5)}
+
+         less_counts.each{|x, vs|
+           vs.each{|y, c|
+             p = (c + 0.5 * self.smoothing) / (self.smoothing + c + less_counts[y][x])
+             less_chances[x][y] = p
+             less_chances[y][x] = 1 - p
+           }
+         }
+         less_chances
+       end
+     end
+
+     # Build the acyclic majority ordering: add majority edges in order of
+     # decreasing strength, skipping any edge that would introduce a cycle.
+     def base_ordering
+       @_base_ordering ||= begin
+         edges = []
+         less_chances.each{|x, ys|
+           ys.each{|y, v|
+             edges << [x, y, v] if v > 0.5
+           }
+         }
+         edges.sort!{|x, y| y[2] <=> x[2]}
+
+         ordering = Ordering.new(less_chances.keys)
+
+         edges.each{|x, y, v|
+           ordering.determine(x, y)
+           break if ordering.determined?
+         }
+         ordering
+       end
+     end
+
+     # The rough score for x is its weight in the stationary distribution of a
+     # Markov chain whose transitions favour items that are usually ranked later.
+     # We use this as a tie breaking heuristic.
+     def rough_scores
+       @_rough_scores ||= begin
+         # This Markov chain is based on MC4. The idea is as follows:
+         # Starting at an item we pick one of the other items at random.
+         # We then transition to that item with probability P(i < j).
+         # If we fail to transition we stay where we are.
+         # i.e. the probability of transitioning from i to j with i != j is 1/(n-1) P(i < j).
+
+         transitions = {}
+
+         @items.each{|i|
+           transitions[i] = {}
+           tot = 0.0
+           @items.each{|j|
+             next if i == j
+             p = less_chances[i][j] / (@items.size - 1)
+             tot += p
+             transitions[i][j] = p
+           }
+           if tot <= 1
+             transitions[i][i] = 1 - tot
+           else
+             transitions[i][i] = 0
+           end
+         }
+
+         MarkovChain.new(@items, transitions).stationary_distribution
+       end
+     end
+
+     def rough_combined_rankings
+       @_rough_combined_rankings ||= begin
+         @items.sort{|x, y| rough_scores[x] <=> rough_scores[y] }
+       end
+     end
+
+     def combined_rankings
+       @_combined_rankings ||= begin
+         @items.sort{|x, y| base_ordering[x, y] || (rough_scores[x] <=> rough_scores[y]) }
+       end
+     end
+
+     private
+
+     def reset_cached
+       self.instance_variables.grep(/^@_/).each{|v| instance_variable_set(v, nil)}
+     end
+   end
+ end
data/lib/rank-aggregation/scorer.rb ADDED
@@ -0,0 +1,41 @@
+ module RankAggregation
+   module Scorer
+     def average_kendall_distance(aggregate, ranks)
+       a_indices = {}
+
+       aggregate.each_with_index{|x, i|
+         a_indices[x] = i
+       }
+
+       parts = ranks.select{|x| x.size > 1 }.map{|x| kendall_distance(a_indices, x) }
+       parts.inject(0.0){|x, y| x + y} / ranks.size
+     end
+
+     # TODO: Decent implementation of this
+     private
+
+     def kendall_distance(a_indices, b)
+       tot = 0.0
+       (0...b.length).each{|i|
+         ((i + 1)...b.length).each{|j|
+           x = a_indices[b[i]]
+           y = a_indices[b[j]]
+           next unless x && y
+
+           tot += 1 if x > y
+         }
+       }
+
+       n = (a_indices.keys & b).size
+
+       p b if n <= 1
+
+       tot / (0.5 * n * (n - 1))
+     end
+   end
+
+   class <<Scorer
+     include Scorer
+   end
+ end
data/notes/algorithm.markdown ADDED
@@ -0,0 +1,52 @@
+ # Algorithm description
+
+ This is a description of the algorithm embodied in this library.
+
+ The input of this algorithm is a list of partial rankings of a set of items (the set of items is not known up front - it's computed from the lists. This is a minor and not terribly important detail).
+ The output of this algorithm is an aggregate ranking which is intended to reflect the input rankings as well as possible.
+
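+ For example, ["python", "ruby", "perl"] and ["ruby", "haskell"] would both be acceptable input rankings, even though they mention different (overlapping) sets of items.
+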
+ Our algorithm is heavily based on [Rank Aggregation Revisited](http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.113.2507&rep=rep1&type=pdf), though the details are fairly different. This is largely a result of our slightly different domain of interest: the case where the number of items is expected to be small compared to the number of votes.
+
+ First we convert the list of rankings into pairwise probabilities. For any two items i, j we work out an estimate of P(i < j). We work this out as follows:
+
+     P(i < j) = (0.5 * SMOOTHING + #{ vote : vote(i < j) }) / (SMOOTHING + #{ vote : vote ranks both i and j })
+
+ where SMOOTHING is a constant designed to compensate for small amounts of data. In our implementation it defaults to 5. The idea is that our default belief is that the probability is 0.5, and we require more data to pull our beliefs away from that.
+
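+ As a quick worked example with the default SMOOTHING of 5: if 9 votes rank i before j and 1 vote ranks j before i, then P(i < j) = (0.5 * 5 + 9) / (5 + 10) = 11.5 / 15 ≈ 0.77, while with no votes at all the estimate stays at 2.5 / 5 = 0.5.
+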
+ This allows us to build a majority graph, with an edge from i to j if the majority think i < j. Due to [Condorcet's paradox](http://en.wikipedia.org/wiki/Voting_paradox) this graph may contain cycles. Our first task is to remove cycles from this graph and turn it into a partial order on the items. We do this as follows:
+
+ We start with a graph with no edges. In order of decreasing P we add in the majority edges. So we start by adding an edge i -> j for the i, j which maximize P(i < j), then we add the second highest value, etc. Whenever adding an edge would introduce a cycle we skip it.
+
+ The idea is that we prefer larger majorities to smaller ones, and where a majority vote contradicts one we've already taken into account we ignore it. This is a reasonable, but of course not necessarily optimal, strategy.
+
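+ A minimal sketch of this cycle-breaking step, using the `Ordering` class from this library (`chances` here is an assumed nested hash of the P(i < j) estimates; the real code lives in `Ranker#base_ordering`):
+
+     require "rank-aggregation/ordering"
+
+     # `chances` is assumed to hold the P(i < j) estimates: {item => {item => probability}}.
+     edges = []
+     chances.each { |i, row| row.each { |j, p| edges << [i, j, p] if p > 0.5 } }
+
+     # Add the strongest majorities first. Ordering#determine refuses any edge
+     # that contradicts what it already knows, i.e. anything that would close a cycle.
+     ordering = RankAggregation::Ordering.new(chances.keys)
+     edges.sort_by { |_, _, p| -p }.each { |i, j, _| ordering.determine(i, j) }
+
+ Afterwards `ordering[i, j]` is -1 if i << j, 1 if j << i, and nil if the pair is still undetermined.
+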
+ Because there are no cycles, this gives us a partially defined order on the items: if there is a path from i to j then we declare i << j.
+
+ Note: We implement this by caching the values of <<, so we can efficiently determine whether there is a cycle. This results in an O(|items|^2) algorithm. This is actually not bad: this algorithm is optimised for the case where there are many more votes than items.
+
+ Claim: Anything sorted with respect to << is locally Kemeny optimal (i.e. no swap of two adjacent items in the output can reduce the number of pairwise disagreements with the majority).
+
+ Proof: Let i, j be adjacent in the final order with i << j. Consider the point in the algorithm at which we first decide that i << j, i.e. we have just added a path from i to j. By hypothesis, i and j are adjacent in the order, but any z on the path between them would have i << z << j. Thus we must have added an edge directly from i to j. But we only add edges in line with the majority vote. Therefore the majority must think i < j.
+
+ Most of the time this order is pretty close to totally defined:
+
+ Claim: If the majority have a preference between i and j (i.e. P(i < j) != 0.5) then either i << j or j << i.
+
+ Proof: Run the algorithm to the point where we try to add the majority edge between i and j. If we've not already formed a path between the two, we form one then.
+
+ Additionally there are cases where this decides i << j without a majority preference either way: e.g. if a majority think A < B and B < C, but we don't know anything about what they think about A versus C, we can still conclude A << C.
+
+ So the idea is that we want to sort with respect to <<. However, for sparse data we may end up with scenarios where there are gaps in the ordering.
+
+ The expected pattern is that we get things like A << B and C << D but don't have any opinion on whether B < C. The idea is that if we had a scenario where 90% of people think that B < D but only 60% of people think that C < D, then that's reasonable evidence that C is closer to D than B is.
+
+ So we define a score which we use as a tie breaker: things with a higher score will be placed after things with a lower score if << is not defined for the pair. This score only gets calculated if we ever need it.
+
+ We use a Markov chain method. The idea of these is as follows: you define a random walk on your items, where items are more likely to transition to "good" items. You then take the stationary distribution of the Markov chain (basically its long term behaviour) and use this as the score: because we're more likely to transition to good items, we expect to spend more time there. Thus the probability of being in a state is a good proxy for its goodness.
+
+ The source paper describes various Markov chain methods. The one we use is close to what they call MC4, but uses a slightly more nuanced approach which takes into account the actual probabilities rather than just whether they were a majority vote. The probability of transitioning from i to j is proportional to P(i < j). Thus higher probability items should appear towards the end of the list.
+
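+ To make that concrete, here is a small sketch of the MC4-style chain using the `MarkovChain` class from this library; the three items and their pairwise probabilities are made up purely for illustration:
+
+     require "rank-aggregation/markov"
+
+     # Hypothetical P(i < j) estimates for three items.
+     chances = {
+       "a" => { "b" => 0.9, "c" => 0.8 },
+       "b" => { "a" => 0.1, "c" => 0.6 },
+       "c" => { "a" => 0.2, "b" => 0.4 }
+     }
+     items = chances.keys
+
+     transitions = {}
+     items.each do |i|
+       transitions[i] = {}
+       staying = 1.0
+       items.each do |j|
+         next if i == j
+         q = chances[i][j] / (items.size - 1)  # move towards items usually ranked after i
+         transitions[i][j] = q
+         staying -= q
+       end
+       transitions[i][i] = staying             # otherwise stay where we are
+     end
+
+     scores = RankAggregation::MarkovChain.new(items, transitions).stationary_distribution
+     items.sort_by { |i| scores[i] }  # "a" should sort first: it is usually ranked before both others
+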
+ This then gives us our algorithm:
+
+ Work out <<. Work out the Markov scores if necessary. Sort the items by the relation: i comes before j if i << j, or if neither i << j nor j << i is defined and score(i) < score(j).
+
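+ In code this final sort is a one line comparator, and is essentially what `Ranker#combined_rankings` does (`ordering` and `scores` here are the objects from the sketches above):
+
+     items.sort { |i, j| ordering[i, j] || (scores[i] <=> scores[j]) }
+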
+ In principle this can result in ambiguity, because we could need the score to order i and j while score(i) = score(j). Really these scenarios should be considered ties. However, they're sufficiently unlikely in real data that we don't worry about that and break the ties arbitrarily.
data/rank-aggregation.gemspec ADDED
@@ -0,0 +1,62 @@
+ # Generated by jeweler
+ # DO NOT EDIT THIS FILE DIRECTLY
+ # Instead, edit Jeweler::Tasks in Rakefile, and run the gemspec command
+ # -*- encoding: utf-8 -*-
+
+ Gem::Specification.new do |s|
+   s.name = %q{rank-aggregation}
+   s.version = "0.0.1"
+
+   s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
+   s.authors = ["David R. MacIver"]
+   s.date = %q{2010-06-20}
+   s.default_executable = %q{rank}
+   s.email = %q{david@drmaciver.com}
+   s.executables = ["rank"]
+   s.extra_rdoc_files = [
+     "README.markdown"
+   ]
+   s.files = [
+     "README.markdown",
+     "Rakefile",
+     "VERSION",
+     "bin/rank",
+     "lib/rank-aggregation.rb",
+     "lib/rank-aggregation/markov.rb",
+     "lib/rank-aggregation/ordering.rb",
+     "lib/rank-aggregation/ranker.rb",
+     "lib/rank-aggregation/scorer.rb",
+     "notes/algorithm.markdown",
+     "rank-aggregation.gemspec",
+     "samples/clear-with-noise",
+     "samples/languages",
+     "samples/pairs",
+     "samples/random",
+     "scores",
+     "spec/helper.rb",
+     "spec/rank-aggregation_spec.rb"
+   ]
+   s.homepage = %q{http://github.com/DRMacIver/rank-aggregation}
+   s.rdoc_options = ["--charset=UTF-8"]
+   s.require_paths = ["lib"]
+   s.rubygems_version = %q{1.3.7}
+   s.summary = %q{Turn a collection of individual preferences on items into an aggregate rank for those preferences}
+   s.test_files = [
+     "spec/rank-aggregation_spec.rb",
+     "spec/helper.rb"
+   ]
+
+   if s.respond_to? :specification_version then
+     current_version = Gem::Specification::CURRENT_SPECIFICATION_VERSION
+     s.specification_version = 3
+
+     if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
+       s.add_runtime_dependency(%q<trollop>, [">= 0"])
+     else
+       s.add_dependency(%q<trollop>, [">= 0"])
+     end
+   else
+     s.add_dependency(%q<trollop>, [">= 0"])
+   end
+ end
+