RubyGems - rank-aggregation - Versions diffs - 0.0.3 → 0.0.5 - Mend

rank-aggregation 0.0.3 → 0.0.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (13) hide show

data/Rakefile +1 -4
data/VERSION +1 -1
data/bin/rank +8 -10
data/lib/rank-aggregation/markov.rb +5 -11
data/lib/rank-aggregation/ranker.rb +87 -31
data/notes/algorithm.markdown +2 -0
data/rank-aggregation.gemspec +3 -4
data/samples/images +249 -0
data/scores +4 -3
data/spec/rank-aggregation_spec.rb +25 -0
metadata +5 -6
data/lib/rank-aggregation/ordering.rb +0 -44
data/lib/rank-aggregation/scorer.rb +0 -41

data/Rakefile CHANGED Viewed

@@ -26,7 +26,6 @@ end
 task "scores" do
   $: << "lib"
   require "rank-aggregation"
-  require "rank-aggregation/scorer"
   File.open("scores", "w"){ |o|
     Dir["samples/*"].sort.each{|file|
@@ -38,9 +37,7 @@ task "scores" do
       items.each{|i| r.add_ranking i }
-      kendall_distance = RankAggregation::Scorer.average_kendall_distance(r.combined_rankings, items)
-      o.puts "#{name}: #{kendall_distance}"
+      o.puts "#{name}: #{r.kendall_distance}"
     }
   }
 end

data/VERSION CHANGED Viewed

	@@ -1 +1 @@
1	- 0.0.3
1	+ 0.0.5

data/bin/rank CHANGED Viewed

@@ -8,30 +8,28 @@ require "trollop"
 opts = Trollop.options do
   opt :score, "Output scoring metrics on STDERR"
-  opt :rough, "Use only the rough score for sorting"
+  opt :rough, "Use only the rough score for sorting, don't perform any optimisation on top of that"
   opt :smoothing, "Set the smoothing parameter: It should be approximately equal to the sample size you consider large enough to matter", :type => :int
+  opt :debug
 end
 ranker = RankAggregation::Ranker.new
 ranker.smoothing = opts[:smoothing] if opts[:smoothing]
-collect = [] if opts[:score]
+ranker.logger.level = Logger::DEBUG if opts[:debug]
 ARGF.each_line{|l|
   items = l.split("\t").map{|x| x.strip}.select{|x| x.length > 0}
-  collect << items if collect
   ranker.add_ranking items
 }
-result = if opts[:rough] then ranker.rough_combined_rankings else ranker.combined_rankings end
+result = (
+  if opts[:rough] then ranker.rough_combined_rankings
+  else ranker.combined_rankings end
+)
 puts result.join("\t")
 if opts[:score]
-  require "rank-aggregation/scorer"
-  STDERR.puts "Average kendal distance: #{RankAggregation::Scorer.average_kendall_distance result, collect}"
+  STDERR.puts "Average kendal distance: #{ranker.kendall_distance}"
 end

data/lib/rank-aggregation/markov.rb CHANGED Viewed

@@ -1,16 +1,9 @@
 module RankAggregation
   class MarkovChain
-    def initialize(items, transitions)
-      @transitions = {}
+    def initialize(items, transitions, logger)
+      @logger = logger
+      @transitions = transitions
       @items = items
-      items.each{|x|
-        tot = items.map{|y| transitions[x][y]}.inject(0.0){|u, v| u + v}
-        tx = (@transitions[x] = {})
-        items.each{|y| tx[y] = transitions[x][y] / tot }
-      }
     end
     def stationary_distribution
@@ -18,7 +11,8 @@ module RankAggregation
       @items.each{|x| dist[x] = 1.0 / @items.size }
-      10.times{
+      3.times{ |i|
+        @logger.debug "markov chain iteration #{i}"
         new_dist = Hash.new(0.0)
         dist.each{|x, p|

data/lib/rank-aggregation/ranker.rb CHANGED Viewed

@@ -1,26 +1,37 @@
-require "rank-aggregation/ordering"
+require "set"
+require "logger"
 require "rank-aggregation/markov"
 module RankAggregation
   class Ranker
-    attr_accessor :less_counts, :smoothing, :items
+    attr_accessor :less_scores, :smoothing, :items, :logger
     def initialize
-      @less_counts = {}
+      @less_scores = {}
       @items = Set.new
       @smoothing = 5
       @vote_count = 0
+      @rank_count = 0
+      self.logger = Logger.new(STDERR)
+      self.logger.level = Logger::WARN
     end
     def add_ranking(xs)
+      xs = xs.uniq
       return if xs.size <= 1
+      @rank_count += 1
       reset_cached
-      xs.each{|x| @less_counts[x] ||= Hash.new(0); items.add x }
+      xs.each{|x| @less_scores[x] ||= Hash.new(0.0); items.add x }
+      weight = 1.0 / (0.5 * xs.size * (xs.size - 1))
       (0...xs.length).each{|i|
         ((i+1)...xs.length).each{|j|
-          @less_counts[xs[i]][xs[j]] += 1
+          @less_scores[xs[i]][xs[j]] += weight
         }
       }
@@ -29,49 +40,31 @@ module RankAggregation
     def less_chances
       @_less_chances ||= begin
+        logger.debug "calculating less_chances"
         less_chances = Hash.new{|h, k| h[k] = Hash.new(0.5)}
-        less_counts.each{|x, vs|
+        less_scores.each{|x, vs|
           vs.each{|y, c|
-            p = (c + 0.5 * self.smoothing) / (self.smoothing + c + less_counts[y][x])
+            p = (c + 0.5 * self.smoothing) / (self.smoothing + c + less_scores[y][x])
             less_chances[x][y] = p
             less_chances[y][x] = 1 - p
           }
         }
+        logger.debug "calculating less_chances complete"
         less_chances
       end
     end
-    def base_ordering
-      @_base_ordering ||= begin
-        edges = []
-        less_chances.each{|x, ys|
-          ys.each{|y, v|
-            edges << [x, y, v] if v > 0.5
-          }
-        }
-        edges.sort!{|x, y| y[2] <=> x[2]}
-        ordering = Ordering.new(less_chances.keys)
-        edges.each{|x, y, v|
-          ordering.determine(x, y)
-          break if ordering.determined?
-        }
-        ordering
-      end
-    end
-    # The rough score for x is the average chance of it being > y for all y we've got a comparison with
-    # We use this as a tie breaking heuristic.
     def rough_scores
       @_rough_scores ||= begin
+        logger.debug "calculating rough_scores"
         # This markov chain is based off MC4. The idea is as follows:
         # Starting at an item we pick one of the other items at random.
         # We then transition to that item with probability P(i < j).
         # If we fail to transition we stay where we are.
         # i.e. the probability of transitioning from i to j with i != j is 1/(n-1) * P(i < j).
+        logger.debug "calculating transition probabilities"
         transitions = {}
         @items.each{|i|
@@ -90,7 +83,10 @@ module RankAggregation
           end
         }
-        MarkovChain.new(@items, transitions).stationary_distribution
+        logger.debug "calculating transition probabilities complete"
+        result = MarkovChain.new(@items, transitions, logger).stationary_distribution
+        logger.debug "calculating rough_scores complete"
+        result
       end
     end
@@ -102,12 +98,72 @@ module RankAggregation
     def combined_rankings
       @_combined_rankings ||= begin
-        @items.sort{|x, y| base_ordering[x, y] || (rough_scores[x] <=> rough_scores[y]) }
+        triangle_shuffle(rough_combined_rankings)
       end
     end
+    def kendall_distance
+      @_kendall_distance ||= kendall_distance_for(combined_rankings)
+    end
     private
+    def kendall_distance_for(ranks)
+      tot = 0.0
+      (0...ranks.length).each{|i|
+        (i+1...ranks.length).each{|j|
+          tot += less_scores[ranks[j]][ranks[i]]
+        }
+      }
+      tot / @rank_count
+    end
+    def triangle_shuffle(ranks)
+      ranks = ranks.dup
+      i = 0
+      changed = true
+      iterations = 0
+      while changed
+        iterations += 1
+        shuffle_count = 0
+        changed = false
+        (0...ranks.length - 2).each do |i|
+          # We look at positions i, i+1 and i+2 and form a Kemeny-optimal ordering of the items
+          # found there.
+          a, b, c = ranks[i..i+2]
+          shuffles = [
+            [a, b, c],
+            [a, c, b],
+            [b, a, c],
+            [b, c, a],
+            [c, a, b],
+            [c, b, a]
+          ].map{|x| [kendall_distance_for(x), x]}
+          original_score = shuffles[0][0]
+          best_score, best = shuffles.min
+          if best_score < original_score
+            shuffle_count += 1
+            changed = true
+            ranks[i..i+2] = best
+            logger.debug{
+              "#{a}, #{b}, #{c} shuffled to #{best.join(", ")}. Score went from #{original_score} to #{best_score}"
+            }
+          end
+        end
+        logger.debug "triangle shuffling iteration ##{iterations} performed #{shuffle_count} shuffles"
+      end
+      ranks
+    end
     def reset_cached
       self.instance_variables.grep(/^@_/).each{|v| instance_variable_set(v, nil)}
     end

data/notes/algorithm.markdown CHANGED Viewed

@@ -1,5 +1,7 @@
 # Algorithm description
+Warning: These notes are a bit out of date.
 This is a description of the algorithm embodied in this library.
 The input of this algorithm is a list of partial rankings of a set of items. (The set of items is not known up front; it is computed from the lists. This is a minor and not terribly important detail.)

data/rank-aggregation.gemspec CHANGED Viewed

@@ -5,11 +5,11 @@
 Gem::Specification.new do |s|
   s.name = %q{rank-aggregation}
-  s.version = "0.0.3"
+  s.version = "0.0.5"
   s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
   s.authors = ["David R. MacIver"]
-  s.date = %q{2010-06-20}
+  s.date = %q{2010-07-08}
   s.default_executable = %q{rank}
   s.email = %q{david@drmaciver.com}
   s.executables = ["rank"]
@@ -23,13 +23,12 @@ Gem::Specification.new do |s|
      "bin/rank",
      "lib/rank-aggregation.rb",
      "lib/rank-aggregation/markov.rb",
-     "lib/rank-aggregation/ordering.rb",
      "lib/rank-aggregation/ranker.rb",
-     "lib/rank-aggregation/scorer.rb",
      "notes/algorithm.markdown",
      "rank-aggregation.gemspec",
      "samples/clear-with-noise",
      "samples/different-sizes-interleaved",
+     "samples/images",
      "samples/languages",
      "samples/pairs",
      "samples/random",

data/samples/images ADDED Viewed

@@ -0,0 +1,249 @@
+4713907317	4714793516
+4714044861	4715364378
+4713401785	4715058159
+4713685521	4713662041
+4714695978	4714036236
+4714094858	4714570862
+4714120928	4714183405
+4714519726	4713952977
+4714809167	4714181334
+4713940506	4713660541
+4715424340	4714519726
+4713379937	0
+4714441346	4713614365
+4713940506	4714519726
+4714866582	4714214085
+4713907317	4714094858
+4714707313	4714173098
+4714441346	4713387133
+4713981249	4714633673
+4714994910	4714570862
+4714157060	4714209769
+4714967844	4714180352
+4715190691	4714982926
+4714629861	4714214085
+4714793516	4714633673
+4715213970	4714625095
+4714286467	4715003796
+4715856420	4714073463
+4714695978	4714214085
+4714866582	4713526865
+4714625095	4713660541
+4715424340	4714834229
+4714930683	4714930683
+4714274838	4714574641
+4714589388	4714286467
+4714241833	4714384952
+4714820813	4714157060
+4714793516	4714820813
+4714157060	4713981249
+4713660541	4714180352
+4714990888	4714100910
+4715453204	4714319362
+4714664778	4713651819
+4714866582	4714151904
+4713693925	4713981249
+4713906389	4714227818
+4714393079	4714367616
+4715073620	4714183405
+4715814487	4713589101
+4714509338	4714192076
+4714291607	4714871415
+4713788611	4716484338
+4713965261	4714982926
+4713685521	4714793516
+4714928611	4714068646
+4715085584	4713616757
+4713379937	4714183405
+4715356822	4715012282
+4715100820	4716703934
+4715714373	4714227078
+4715763896	4714087637
+4713844607	4714133515
+4714661034	4713997878
+4714871415	4715860020
+4714797576	4714534020
+4715558413	4714589388
+4714096389	4715073620
+4716091440	4714763663
+4714121666	4714137974
+4715343302	4714145722
+4714629861	4713776313
+4714866582	4713736745
+4714512738	4715963672
+4714185386	4715242266
+4714967844	4715888970
+4714173098	4716213882
+4715049826	4713612015
+4713752261	4713979531
+4713365017	4716213882
+4713744349	4713686267
+4714810870	4714658782
+4714533206	4715356822
+4713314831	4714905429
+4715388191	4715489479
+4714692816	4715453204
+4713898663	4714137996
+4714522757	4714107938
+4713781919	4715430345
+4716515978	4715003796
+4714533547	4714506478
+4716526328	4715040480
+4714182023	4714681315
+4715940337	4714822451
+4714435781	4715888970
+4714423390	4714930683
+4715215778	4714644144
+4714533547	4715324294
+4713495537	4713951401
+4715215778	4714931374
+4713575079	4713418207
+4715602724	4714692816
+4714274838	4715426962
+4714179261	4716714528
+4713853729	4715075632
+4714525315	4713936967
+4714264652	4714928611
+4714207610	4713940506
+4715215778	4713405627
+4714522289	4713853729
+4715992740	4713547921
+4715233482	4715075632
+4716115538	4714695978
+4713980054	4715683996
+4715073620	4713892543
+4714625095	4713854897
+4714967844	4714820813
+4714067235	4713587865
+4713844121	4715200759
+4714430684	4714684363
+4714442589	4713917102
+4714001813	4714574641
+4715331713	4714535413
+4715153745	4715617690
+4713575079	4715888970
+4713879311	4714344426
+4715217508	4716083048
+4715741624	4713749569
+4713788611	4715242266
+4714449713	4714329119
+4714808591	4714589388
+4714959812	4714133515
+4714037152	4714490458
+4714941210	4714970667
+4713574409	4714103021
+4713997878	4714241833
+4715535678	4715856420
+4715324294	4713322691
+4715477764	4714522289
+4714236828	4714534020
+4714545096	4713495101
+4715322098	4714341509
+4714137996	4714341509
+4714990888	4714179898
+4713981249	4713691107
+4713947773	4713401785
+4713879311	4713689031
+4715992740	4715103668
+4715190691	4715005229
+4713685665	4713937913
+4714091142	4713689031
+4714695978	4715005260
+4713917539	4713575079
+4715049826	4713631637
+4715669395	4715503820
+4714493522	4713685521
+4715364378	4713907317
+4713844607	4714808591
+4715260222	4714075745
+4713612015	4713913835
+4715501923	4713913835
+4715627740	4714367073
+4715669286	4715215778
+4715069575	4715859809
+4713931385	4714121666
+4713980054	4715678715
+4715217508	4715739086
+4713566967	4714592156
+4715669286	4715683996
+4715069575	4713458933
+4713443893	4713575079
+4716120110	4713951401
+4714232322	4713967155
+4714866582	4714341509
+4715217376	4713879311
+4715605259	4714219924
+4713322691	4715374569
+4714149874	4714034091
+4715374569	4714341509
+4714797576	4714209769
+4714326747	4713604321
+4714479137	4714905514
+4714535413	4713892543
+4714493522	4715088795
+4713666531	4715058159
+4713611777	4713666531
+4713969986	4715285982
+4713570275	4714132776
+4714590394	4715525984
+4714150388	4714522289
+4714345492	4714592156
+4715775475	4714393079
+4714423390	4714335269
+4713570275	4714589388
+4715602724	4715219978
+4714120928	4715266308
+4714695978	4713931385
+4714990888	4714509338
+4714353096	4714678696
+4714096389	4714522289
+4715056162	4714274838
+4714107938	4716703934
+4714292768	4714763663
+4715741624	4714036236
+4714545096	4713604321
+4715091935	4713952977
+4715049826	4715169543
+4713931385	4713502361
+4714967844	4714522757
+4714936074	4716091609
+4715100820	4713740667
+4714075745	4714522289
+4715175414	4714590394
+4716213882	4713574409
+4713339825	4715739086
+4715775475	4716193284
+4715388191	4714384952
+4714274838	4713781919
+4714546309	4715739086
+4714207610	4714423390
+4714586002	4715233482
+4714227078	4714180352
+4714546309	4714254254
+4715477764	4714548260
+4713635095	4714661034
+4713339825	4714120928
+4714094858	4714590394
+4715364099	4714209769
+4714187130	4714391365
+4714614348	4713972161
+4713441293	4714227818
+4714941210	4715012282
+4715963672	4714317837
+4713363381	4714440750
+4715665972	4713749569
+4714329119	4714909026
+4713350507	4714930683
+4713285765	4713931385
+4715284379	4714535413
+4714187130	4714525527
+4715286921	4714448373
+4713368857	4713631637
+4714695978	4715005260
+4714599345	4713506075
+4713651819	4714796667
+4714187130	4714367073
+4713614365	4714442589
+4714091142	4714137974
+4713980054	4714353096
+4715550743	4714467446

data/scores CHANGED Viewed

@@ -1,5 +1,6 @@
 clear-with-noise: 0.213968253968254
-different-sizes-interleaved: 0.294117647058824
-languages: 0.350065325612717
+different-sizes-interleaved: 0.043343653250774
+images: 0.0524193548387097
+languages: 0.372378907195571
 pairs: 0.333333333333333
-random: 0.479696100794631
+random: 0.478733444496679

data/spec/rank-aggregation_spec.rb CHANGED Viewed

@@ -1,6 +1,9 @@
 require "helper"
+A, B, C = [:a, :b, :c]
 describe RankAggregation do
   it "should aggregate a single rank into itself" do
     r = (1..10).to_a
     RankAggregation.combine_rankings([r]).should == r
@@ -27,4 +30,26 @@ describe RankAggregation do
     RankAggregation.combine_rankings(r2).should == b
     RankAggregation.combine_rankings(r3).should == c
   end
+  it "should not consider alternatives to be irrelevant" do
+    # This is my example from "Irrelevant alternatives aren't"
+    # which shows that you need to consider C to get the order of
+    # A and B right, as there's a 50/50 voting split between the
+    # two of them, but clear majorities showing that B < C < A
+    RankAggregation.combine_rankings([
+      [A, B, C],
+      [B, C, A],
+      [C, A, B],
+      [B, C, A]
+    ]).should == [B, C, A]
+  end
+  it "should follow the majority with greater weight behind it" do
+    RankAggregation.combine_rankings(
+      [[B, C]] * 6 +
+      [[A, B]] * 10 +
+      [[C, A]] * 5
+    ).should == [A, B, C]
+  end
 end

metadata CHANGED Viewed

@@ -1,13 +1,13 @@
 --- !ruby/object:Gem::Specification
 name: rank-aggregation
 version: !ruby/object:Gem::Version
-  hash: 25
+  hash: 21
   prerelease: false
   segments:
   - 0
   - 0
-  - 3
-  version: 0.0.3
+  - 5
+  version: 0.0.5
 platform: ruby
 authors:
 - David R. MacIver
@@ -15,7 +15,7 @@ autorequire:
 bindir: bin
 cert_chain: []
-date: 2010-06-20 00:00:00 +01:00
+date: 2010-07-08 00:00:00 +01:00
 default_executable: rank
 dependencies:
 - !ruby/object:Gem::Dependency
@@ -47,13 +47,12 @@ files:
 - bin/rank
 - lib/rank-aggregation.rb
 - lib/rank-aggregation/markov.rb
-- lib/rank-aggregation/ordering.rb
 - lib/rank-aggregation/ranker.rb
-- lib/rank-aggregation/scorer.rb
 - notes/algorithm.markdown
 - rank-aggregation.gemspec
 - samples/clear-with-noise
 - samples/different-sizes-interleaved
+- samples/images
 - samples/languages
 - samples/pairs
 - samples/random

data/lib/rank-aggregation/ordering.rb DELETED Viewed

@@ -1,44 +0,0 @@
-require "set"
-module RankAggregation
-  class Ordering
-    attr_accessor :order, :items, :determined
-    def initialize(items)
-      @items = Set[*items.to_a]
-      @determined = Set.new
-      @order = Hash.new{|h, k| h[k] = { k => 0 }}
-    end
-    def determine(x, y)
-      return false if @order[x][y]
-      @order[x][y] = -1
-      @order[y][x] = 1
-      @determined << x if @order[x].size == @items.size
-      @determined << y if @order[y].size == @items.size
-      @order[x].each{|z, v|
-        determine(z, y) if v == 1
-      }
-      @order[y].each{|z, v|
-        determine(x, z) if v == -1
-      }
-      true
-    end
-    def determined?(item=nil)
-      if item
-        return self.determined.include?(item)
-      else
-        return self.determined.size == self.items.size
-      end
-    end
-    def [](x, y)
-      @order[x][y]
-    end
-  end
-end

data/lib/rank-aggregation/scorer.rb DELETED Viewed

@@ -1,41 +0,0 @@
-module RankAggregation
-  module Scorer
-    def average_kendall_distance(aggregate, ranks)
-      a_indices = {}
-      aggregate.each_with_index{|x, i|
-        a_indices[x] = i
-      }
-      parts = ranks.select{|x| x.size > 1 }.map{|x| kendal_distance(a_indices, x) }
-      parts.inject(0.0){|x, y| x + y} / ranks.size
-    end
-    # TODO: Decent implementation of this
-    private
-    def kendal_distance(a_indices, b)
-      tot = 0.0
-      (0...b.length).each{|i|
-        ((i + 1)...b.length).each{|j|
-          x = a_indices[b[i]]
-          y = a_indices[b[j]]
-          next unless x && y
-          tot += 1 if x > y
-        }
-      }
-      n = (a_indices.keys & b).size
-      p b if n <= 1
-      score = tot / (0.5 * n * (n - 1))
-    end
-  end
-  class <<Scorer
-    include Scorer
-  end
-end