RubyGems - rank-aggregation - Versions diffs - 0.0.3 → 0.0.5 - Mend

rank-aggregation 0.0.3 → 0.0.5

Files changed (13) hide show

data/Rakefile +1 -4
data/VERSION +1 -1
data/bin/rank +8 -10
data/lib/rank-aggregation/markov.rb +5 -11
data/lib/rank-aggregation/ranker.rb +87 -31
data/notes/algorithm.markdown +2 -0
data/rank-aggregation.gemspec +3 -4
data/samples/images +249 -0
data/scores +4 -3
data/spec/rank-aggregation_spec.rb +25 -0
metadata +5 -6
data/lib/rank-aggregation/ordering.rb +0 -44
data/lib/rank-aggregation/scorer.rb +0 -41

data/Rakefile CHANGED Viewed

@@ -26,7 +26,6 @@ end
 task "scores" do
   $: << "lib"
   require "rank-aggregation"
-  require "rank-aggregation/scorer"
   File.open("scores", "w"){ |o|
     Dir["samples/*"].sort.each{|file|
@@ -38,9 +37,7 @@ task "scores" do
       items.each{|i| r.add_ranking i }
-      kendall_distance = RankAggregation::Scorer.average_kendall_distance(r.combined_rankings, items)
-      o.puts "#{name}: #{kendall_distance}"
+      o.puts "#{name}: #{r.kendall_distance}"
     }
   }
 end

data/VERSION CHANGED Viewed

	@@ -1 +1 @@
1	- 0.0.3
1	+ 0.0.5

data/bin/rank CHANGED Viewed

@@ -8,30 +8,28 @@ require "trollop"
 opts = Trollop.options do
   opt :score, "Output scoring metrics on STDERR"
-  opt :rough, "Use only the rough score for sorting"
+  opt :rough, "Use only the rough score for sorting, don't perform any optimisation on top of that"
   opt :smoothing, "Set the smoothing parameter: It should be approximately equal to the sample size you consider large enough to matter", :type => :int
+  opt :debug
 end
 ranker = RankAggregation::Ranker.new
 ranker.smoothing = opts[:smoothing] if opts[:smoothing]
-collect = [] if opts[:score]
+ranker.logger.level = Logger::DEBUG if opts[:debug]
 ARGF.each_line{|l|
   items = l.split("\t").map{|x| x.strip}.select{|x| x.length > 0}
-  collect << items if collect
   ranker.add_ranking items
 }
-result = if opts[:rough] then ranker.rough_combined_rankings else ranker.combined_rankings end
+result = (
+  if opts[:rough] then ranker.rough_combined_rankings
+  else ranker.combined_rankings end
+)
 puts result.join("\t")
 if opts[:score]
-  require "rank-aggregation/scorer"
-  STDERR.puts "Average kendal distance: #{RankAggregation::Scorer.average_kendall_distance result, collect}"
+  STDERR.puts "Average kendal distance: #{ranker.kendall_distance}"
 end

data/lib/rank-aggregation/markov.rb CHANGED Viewed

@@ -1,16 +1,9 @@
 module RankAggregation
   class MarkovChain
-    def initialize(items, transitions)
-      @transitions = {}
+    def initialize(items, transitions, logger)
+      @logger = logger
+      @transitions = transitions
       @items = items
-      items.each{|x|
-        tot = items.map{|y| transitions[x][y]}.inject(0.0){|u, v| u + v}
-        tx = (@transitions[x] = {})
-        items.each{|y| tx[y] = transitions[x][y] / tot }
-      }
     end
     def stationary_distribution
@@ -18,7 +11,8 @@ module RankAggregation
       @items.each{|x| dist[x] = 1.0 / @items.size }
-      10.times{
+      3.times{ |i|
+        @logger.debug "markov chain iteration #{i}"
         new_dist = Hash.new(0.0)
         dist.each{|x, p|

data/lib/rank-aggregation/ranker.rb CHANGED Viewed

@@ -1,26 +1,37 @@
-require "rank-aggregation/ordering"
+require "set"
+require "logger"
 require "rank-aggregation/markov"
 module RankAggregation
   class Ranker
-    attr_accessor :less_counts, :smoothing, :items
+    attr_accessor :less_scores, :smoothing, :items, :logger
     def initialize
-      @less_counts = {}
+      @less_scores = {}
       @items = Set.new
       @smoothing = 5
       @vote_count = 0
+      @rank_count = 0
+      self.logger = Logger.new(STDERR)
+      self.logger.level = Logger::WARN
     end
     def add_ranking(xs)
+      xs = xs.uniq
       return if xs.size <= 1
+      @rank_count += 1
       reset_cached
-      xs.each{|x| @less_counts[x] ||= Hash.new(0); items.add x }
+      xs.each{|x| @less_scores[x] ||= Hash.new(0.0); items.add x }
+      weight = 1.0 / (0.5 * xs.size * (xs.size - 1))
       (0...xs.length).each{|i|
         ((i+1)...xs.length).each{|j|
-          @less_counts[xs[i]][xs[j]] += 1
+          @less_scores[xs[i]][xs[j]] += weight
         }
       }
@@ -29,49 +40,31 @@ module RankAggregation
     def less_chances
       @_less_chances ||= begin
+        logger.debug "calculating less_chances"
         less_chances = Hash.new{|h, k| h[k] = Hash.new(0.5)}
-        less_counts.each{|x, vs|
+        less_scores.each{|x, vs|
           vs.each{|y, c|
-            p = (c + 0.5 * self.smoothing) / (self.smoothing + c + less_counts[y][x])
+            p = (c + 0.5 * self.smoothing) / (self.smoothing + c + less_scores[y][x])
             less_chances[x][y] = p
             less_chances[y][x] = 1 - p
           }
         }
+        logger.debug "calculating less_chances complete"
         less_chances
       end
     end
-    def base_ordering
-      @_base_ordering ||= begin
-        edges = []
-        less_chances.each{|x, ys|
-          ys.each{|y, v|
-            edges << [x, y, v] if v > 0.5
-          }
-        }
-        edges.sort!{|x, y| y[2] <=> x[2]}
-        ordering = Ordering.new(less_chances.keys)
-        edges.each{|x, y, v|
-          ordering.determine(x, y)
-          break if ordering.determined?
-        }
-        ordering
-      end
-    end
-    # The rough score for x is the average chance of it being > y for all y we've got a comparison with
-    # We use this as a tie breaking heuristic.
     def rough_scores
       @_rough_scores ||= begin
+        logger.debug "calculating rough_scores"
         # This markov chain is based off MC4. The idea is as follows:
         # Starting at an item we pick one of the other items at random.
         # We then transition to that item with probability P(i < j).
         # If we fail to transition we stay where we are.
         # i.e. the probability of transitioning from i to j with i != j is 1/(n-1) P(i < j).
+        logger.debug "calculating transition probabilities"
         transitions = {}
         @items.each{|i|
@@ -90,7 +83,10 @@ module RankAggregation
           end
         }
-        MarkovChain.new(@items, transitions).stationary_distribution
+        logger.debug "calculating transition probabilities complete"
+        result = MarkovChain.new(@items, transitions, logger).stationary_distribution
+        logger.debug "calculating rough_scores complete"
+        result
       end
     end
@@ -102,12 +98,72 @@ module RankAggregation
     def combined_rankings
       @_combined_rankings ||= begin
-        @items.sort{|x, y| base_ordering[x, y] || (rough_scores[x] <=> rough_scores[y]) }
+        triangle_shuffle(rough_combined_rankings)
       end
     end
+    def kendall_distance
+      @_kendall_distance ||= kendall_distance_for(combined_rankings)
+    end
     private
+    def kendall_distance_for(ranks)
+      tot = 0.0
+      (0...ranks.length).each{|i|
+        (i+1...ranks.length).each{|j|
+          tot += less_scores[ranks[j]][ranks[i]]
+        }
+      }
+      tot / @rank_count
+    end
+    def triangle_shuffle(ranks)
+      ranks = ranks.dup
+      i = 0
+      changed = true
+      iterations = 0
+      while changed
+        iterations += 1
+        shuffle_count = 0
+        changed = false
+        (0...ranks.length - 2).each do |i|
+          # we look at the positions i, i+1 and i+2 and form a kemeny optimal ordering of the items
+          # there
+          a, b, c = ranks[i..i+2]
+          shuffles = [
+            [a, b, c],
+            [a, c, b],
+            [b, a, c],
+            [b, c, a],
+            [c, a, b],
+            [c, b, a]
+          ].map{|x| [kendall_distance_for(x), x]}
+          original_score = shuffles[0][0]
+          best_score, best = shuffles.min
+          if best_score < original_score
+            shuffle_count += 1
+            changed = true
+            ranks[i..i+2] = best
+            logger.debug{
+              "#{a}, #{b}, #{c} shuffled to #{best.join(", ")}. Score went from #{original_score} to #{best_score}"
+            }
+          end
+        end
+        logger.debug "triangle shuffling iteration ##{iterations} performed #{shuffle_count} shuffles"
+      end
+      ranks
+    end
     def reset_cached
       self.instance_variables.grep(/^@_/).each{|v| instance_variable_set(v, nil)}
     end

data/notes/algorithm.markdown CHANGED Viewed

@@ -1,5 +1,7 @@
 # Algorithm description
+Warning: These notes are a bit out of date.
 This is a description of the algorithm embodied in this library.
 The input of this algorithm is a list of partial rankings of a set of items (the set of items is not known up front - it's computed from the lists. This is a minor and not terribly important detail).

data/rank-aggregation.gemspec CHANGED Viewed

@@ -5,11 +5,11 @@
 Gem::Specification.new do |s|
   s.name = %q{rank-aggregation}
-  s.version = "0.0.3"
+  s.version = "0.0.5"
   s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
   s.authors = ["David R. MacIver"]
-  s.date = %q{2010-06-20}
+  s.date = %q{2010-07-08}
   s.default_executable = %q{rank}
   s.email = %q{david@drmaciver.com}
   s.executables = ["rank"]
@@ -23,13 +23,12 @@ Gem::Specification.new do |s|
      "bin/rank",
      "lib/rank-aggregation.rb",
      "lib/rank-aggregation/markov.rb",
-     "lib/rank-aggregation/ordering.rb",
      "lib/rank-aggregation/ranker.rb",
-     "lib/rank-aggregation/scorer.rb",
      "notes/algorithm.markdown",
      "rank-aggregation.gemspec",
      "samples/clear-with-noise",
      "samples/different-sizes-interleaved",
+     "samples/images",
      "samples/languages",
      "samples/pairs",
      "samples/random",

data/samples/images ADDED Viewed

@@ -0,0 +1,249 @@
+4713907317	4714793516
+4714044861	4715364378
+4713401785	4715058159
+4713685521	4713662041
+4714695978	4714036236
+4714094858	4714570862
+4714120928	4714183405
+4714519726	4713952977
+4714809167	4714181334
+4713940506	4713660541
+4715424340	4714519726
+4713379937	0
+4714441346	4713614365
+4713940506	4714519726
+4714866582	4714214085
+4713907317	4714094858
+4714707313	4714173098
+4714441346	4713387133
+4713981249	4714633673
+4714994910	4714570862
+4714157060	4714209769
+4714967844	4714180352
+4715190691	4714982926
+4714629861	4714214085
+4714793516	4714633673
+4715213970	4714625095
+4714286467	4715003796
+4715856420	4714073463
+4714695978	4714214085
+4714866582	4713526865
+4714625095	4713660541
+4715424340	4714834229
+4714930683	4714930683
+4714274838	4714574641
+4714589388	4714286467
+4714241833	4714384952
+4714820813	4714157060
+4714793516	4714820813
+4714157060	4713981249
+4713660541	4714180352
+4714990888	4714100910
+4715453204	4714319362
+4714664778	4713651819
+4714866582	4714151904
+4713693925	4713981249
+4713906389	4714227818
+4714393079	4714367616
+4715073620	4714183405
+4715814487	4713589101
+4714509338	4714192076
+4714291607	4714871415
+4713788611	4716484338
+4713965261	4714982926
+4713685521	4714793516
+4714928611	4714068646
+4715085584	4713616757
+4713379937	4714183405
+4715356822	4715012282
+4715100820	4716703934
+4715714373	4714227078
+4715763896	4714087637
+4713844607	4714133515
+4714661034	4713997878
+4714871415	4715860020
+4714797576	4714534020
+4715558413	4714589388
+4714096389	4715073620
+4716091440	4714763663
+4714121666	4714137974
+4715343302	4714145722
+4714629861	4713776313
+4714866582	4713736745
+4714512738	4715963672
+4714185386	4715242266
+4714967844	4715888970
+4714173098	4716213882
+4715049826	4713612015
+4713752261	4713979531
+4713365017	4716213882
+4713744349	4713686267
+4714810870	4714658782
+4714533206	4715356822
+4713314831	4714905429
+4715388191	4715489479
+4714692816	4715453204
+4713898663	4714137996
+4714522757	4714107938
+4713781919	4715430345
+4716515978	4715003796
+4714533547	4714506478
+4716526328	4715040480
+4714182023	4714681315
+4715940337	4714822451
+4714435781	4715888970
+4714423390	4714930683
+4715215778	4714644144
+4714533547	4715324294
+4713495537	4713951401
+4715215778	4714931374
+4713575079	4713418207
+4715602724	4714692816
+4714274838	4715426962
+4714179261	4716714528
+4713853729	4715075632
+4714525315	4713936967
+4714264652	4714928611
+4714207610	4713940506
+4715215778	4713405627
+4714522289	4713853729
+4715992740	4713547921
+4715233482	4715075632
+4716115538	4714695978
+4713980054	4715683996
+4715073620	4713892543
+4714625095	4713854897
+4714967844	4714820813
+4714067235	4713587865
+4713844121	4715200759
+4714430684	4714684363
+4714442589	4713917102
+4714001813	4714574641
+4715331713	4714535413
+4715153745	4715617690
+4713575079	4715888970
+4713879311	4714344426
+4715217508	4716083048
+4715741624	4713749569
+4713788611	4715242266
+4714449713	4714329119
+4714808591	4714589388
+4714959812	4714133515
+4714037152	4714490458
+4714941210	4714970667
+4713574409	4714103021
+4713997878	4714241833
+4715535678	4715856420
+4715324294	4713322691
+4715477764	4714522289
+4714236828	4714534020
+4714545096	4713495101
+4715322098	4714341509
+4714137996	4714341509
+4714990888	4714179898
+4713981249	4713691107
+4713947773	4713401785
+4713879311	4713689031
+4715992740	4715103668
+4715190691	4715005229
+4713685665	4713937913
+4714091142	4713689031
+4714695978	4715005260
+4713917539	4713575079
+4715049826	4713631637
+4715669395	4715503820
+4714493522	4713685521
+4715364378	4713907317
+4713844607	4714808591
+4715260222	4714075745
+4713612015	4713913835
+4715501923	4713913835
+4715627740	4714367073
+4715669286	4715215778
+4715069575	4715859809
+4713931385	4714121666
+4713980054	4715678715
+4715217508	4715739086
+4713566967	4714592156
+4715669286	4715683996
+4715069575	4713458933
+4713443893	4713575079
+4716120110	4713951401
+4714232322	4713967155
+4714866582	4714341509
+4715217376	4713879311
+4715605259	4714219924
+4713322691	4715374569
+4714149874	4714034091
+4715374569	4714341509
+4714797576	4714209769
+4714326747	4713604321
+4714479137	4714905514
+4714535413	4713892543
+4714493522	4715088795
+4713666531	4715058159
+4713611777	4713666531
+4713969986	4715285982
+4713570275	4714132776
+4714590394	4715525984
+4714150388	4714522289
+4714345492	4714592156
+4715775475	4714393079
+4714423390	4714335269
+4713570275	4714589388
+4715602724	4715219978
+4714120928	4715266308
+4714695978	4713931385
+4714990888	4714509338
+4714353096	4714678696
+4714096389	4714522289
+4715056162	4714274838
+4714107938	4716703934
+4714292768	4714763663
+4715741624	4714036236
+4714545096	4713604321
+4715091935	4713952977
+4715049826	4715169543
+4713931385	4713502361
+4714967844	4714522757
+4714936074	4716091609
+4715100820	4713740667
+4714075745	4714522289
+4715175414	4714590394
+4716213882	4713574409
+4713339825	4715739086
+4715775475	4716193284
+4715388191	4714384952
+4714274838	4713781919
+4714546309	4715739086
+4714207610	4714423390
+4714586002	4715233482
+4714227078	4714180352
+4714546309	4714254254
+4715477764	4714548260
+4713635095	4714661034
+4713339825	4714120928
+4714094858	4714590394
+4715364099	4714209769
+4714187130	4714391365
+4714614348	4713972161
+4713441293	4714227818
+4714941210	4715012282
+4715963672	4714317837
+4713363381	4714440750
+4715665972	4713749569
+4714329119	4714909026
+4713350507	4714930683
+4713285765	4713931385
+4715284379	4714535413
+4714187130	4714525527
+4715286921	4714448373
+4713368857	4713631637
+4714695978	4715005260
+4714599345	4713506075
+4713651819	4714796667
+4714187130	4714367073
+4713614365	4714442589
+4714091142	4714137974
+4713980054	4714353096
+4715550743	4714467446

data/scores CHANGED Viewed

@@ -1,5 +1,6 @@
 clear-with-noise: 0.213968253968254
-different-sizes-interleaved: 0.294117647058824
-languages: 0.350065325612717
+different-sizes-interleaved: 0.043343653250774
+images: 0.0524193548387097
+languages: 0.372378907195571
 pairs: 0.333333333333333
-random: 0.479696100794631
+random: 0.478733444496679

data/spec/rank-aggregation_spec.rb CHANGED Viewed

@@ -1,6 +1,9 @@
 require "helper"
+A, B, C = [:a, :b, :c]
 describe RankAggregation do
   it "should aggregate a single rank into itself" do
     r = (1..10).to_a
     RankAggregation.combine_rankings([r]).should == r
@@ -27,4 +30,26 @@ describe RankAggregation do
     RankAggregation.combine_rankings(r2).should == b
     RankAggregation.combine_rankings(r3).should == c
   end
+  it "should not consider alternatives to be irrelevant" do
+    # This is my example from "Irrelevant alternatives aren't"
+    # which shows that you need to consider C to get the order of
+    # A and B right, as there's a 50/50 voting split between the
+    # two of them, but clear majorities showing that B < C < A
+    RankAggregation.combine_rankings([
+      [A, B, C],
+      [B, C, A],
+      [C, A, B],
+      [B, C, A]
+    ]).should == [B, C, A]
+  end
+  it "should follow the majority with greater weight behind it" do
+    RankAggregation.combine_rankings(
+      [[B, C]] * 6 +
+      [[A, B]] * 10 +
+      [[C, A]] * 5
+    ).should == [A, B, C]
+  end
 end

metadata CHANGED Viewed

@@ -1,13 +1,13 @@
 --- !ruby/object:Gem::Specification
 name: rank-aggregation
 version: !ruby/object:Gem::Version
-  hash: 25
+  hash: 21
   prerelease: false
   segments:
   - 0
   - 0
-  - 3
-  version: 0.0.3
+  - 5
+  version: 0.0.5
 platform: ruby
 authors:
 - David R. MacIver
@@ -15,7 +15,7 @@ autorequire:
 bindir: bin
 cert_chain: []
-date: 2010-06-20 00:00:00 +01:00
+date: 2010-07-08 00:00:00 +01:00
 default_executable: rank
 dependencies:
 - !ruby/object:Gem::Dependency
@@ -47,13 +47,12 @@ files:
 - bin/rank
 - lib/rank-aggregation.rb
 - lib/rank-aggregation/markov.rb
-- lib/rank-aggregation/ordering.rb
 - lib/rank-aggregation/ranker.rb
-- lib/rank-aggregation/scorer.rb
 - notes/algorithm.markdown
 - rank-aggregation.gemspec
 - samples/clear-with-noise
 - samples/different-sizes-interleaved
+- samples/images
 - samples/languages
 - samples/pairs
 - samples/random

data/lib/rank-aggregation/ordering.rb DELETED Viewed

@@ -1,44 +0,0 @@
-require "set"
-module RankAggregation
-  class Ordering
-    attr_accessor :order, :items, :determined
-    def initialize(items)
-      @items = Set[*items.to_a]
-      @determined = Set.new
-      @order = Hash.new{|h, k| h[k] = { k => 0 }}
-    end
-    def determine(x, y)
-      return false if @order[x][y]
-      @order[x][y] = -1
-      @order[y][x] = 1
-      @determined << x if @order[x].size == @items.size
-      @determined << y if @order[y].size == @items.size
-      @order[x].each{|z, v|
-        determine(z, y) if v == 1
-      }
-      @order[y].each{|z, v|
-        determine(x, z) if v == -1
-      }
-      true
-    end
-    def determined?(item=nil)
-      if item
-        return self.determined.include?(item)
-      else
-        return self.determined.size == self.items.size
-      end
-    end
-    def [](x, y)
-      @order[x][y]
-    end
-  end
-end

data/lib/rank-aggregation/scorer.rb DELETED Viewed

@@ -1,41 +0,0 @@
-module RankAggregation
-  module Scorer
-    def average_kendall_distance(aggregate, ranks)
-      a_indices = {}
-      aggregate.each_with_index{|x, i|
-        a_indices[x] = i
-      }
-      parts = ranks.select{|x| x.size > 1 }.map{|x| kendal_distance(a_indices, x) }
-      parts.inject(0.0){|x, y| x + y} / ranks.size
-    end
-    # TODO: Decent implementation of this
-    private
-    def kendal_distance(a_indices, b)
-      tot = 0.0
-      (0...b.length).each{|i|
-        ((i + 1)...b.length).each{|j|
-          x = a_indices[b[i]]
-          y = a_indices[b[j]]
-          next unless x && y
-          tot += 1 if x > y
-        }
-      }
-      n = (a_indices.keys & b).size
-      p b if n <= 1
-      score = tot / (0.5 * n * (n - 1))
-    end
-  end
-  class <<Scorer
-    include Scorer
-  end
-end