RubyGems - same_same - Versions diffs - 0.0.1 - Mend

same_same 0.0.1

Files changed (43) hide show

data/.gitignore +18 -0
data/Gemfile +4 -0
data/Gemfile.lock +44 -0
data/LICENSE.txt +22 -0
data/README.md +39 -0
data/Rakefile +1 -0
data/examples/dbscan_digg.rb +25 -0
data/examples/dbscan_lines.rb +35 -0
data/examples/rock_digg.rb +20 -0
data/examples/rock_lines.rb +31 -0
data/lib/same_same.rb +15 -0
data/lib/same_same/cluster.rb +27 -0
data/lib/same_same/cluster_similarity.rb +10 -0
data/lib/same_same/cosine_distance.rb +27 -0
data/lib/same_same/cosine_similarity.rb +22 -0
data/lib/same_same/data_point.rb +12 -0
data/lib/same_same/dbscan_algorithm.rb +135 -0
data/lib/same_same/dbscan_clusters.rb +88 -0
data/lib/same_same/dbscan_neighborhood.rb +68 -0
data/lib/same_same/dbscan_numeric_vectors.rb +7 -0
data/lib/same_same/dbscan_term_frequency_vectors.rb +7 -0
data/lib/same_same/dendrogram.rb +28 -0
data/lib/same_same/dendrogram_printer.rb +74 -0
data/lib/same_same/jaquard_coefficient.rb +9 -0
data/lib/same_same/link_matrix.rb +62 -0
data/lib/same_same/merge_goodness_measure.rb +30 -0
data/lib/same_same/rock_algorithm.rb +51 -0
data/lib/same_same/rock_clusters.rb +68 -0
data/lib/same_same/similarity_matrix.rb +20 -0
data/lib/same_same/symmetrical_matrix.rb +39 -0
data/lib/same_same/term_frequency_builder.rb +20 -0
data/lib/same_same/version.rb +3 -0
data/same_same.gemspec +23 -0
data/spec/fixtures/digg_stories.csv +49 -0
data/spec/fixtures/lines.csv +899 -0
data/spec/same_same/dbscan_algorithm_spec.rb +72 -0
data/spec/same_same/jaquard_coefficient_spec.rb +24 -0
data/spec/same_same/link_matrix_spec.rb +29 -0
data/spec/same_same/merge_goodness_measure_spec.rb +34 -0
data/spec/same_same/rock_algorithm_spec.rb +71 -0
data/spec/same_same/similarity_matrix_spec.rb +20 -0
data/spec/same_same/symmetrical_matrix_spec.rb +69 -0
metadata +144 -0

data/lib/same_same/dbscan_clusters.rb ADDED

@@ -0,0 +1,88 @@
+require 'set'
+module SameSame
+  # Bookkeeping for a DBSCAN run: records, for every point, whether it sits in
+  # the noise group, the unclassified group, or a numbered cluster.
+  class DbscanClusters
+    # Key of the group that holds noise points.
+    NOISE_ID = -1
+    # Key of the group that holds points not classified yet.
+    UNCLASSIFIED_ID = 0
+    # clusters - Hash of group id => Set of points
+    # last_id  - most recent id handed out by get_next_cluster_id
+    attr_accessor :clusters, :last_id
+    # Registers every point of +unclassified+ in the unclassified group.
+    def initialize( unclassified )
+      # Groups spring into existence as empty Sets the first time they are read.
+      self.clusters = Hash.new { |groups, id| groups[id] = Set.new }
+      self.last_id = 0
+      assign_points( unclassified, UNCLASSIFIED_ID )
+    end
+    # Hands out the next unused cluster id (1, 2, 3, ...).
+    def get_next_cluster_id
+      self.last_id += 1
+    end
+    # True when +p+ is a member of the group stored under +cluster_id+.
+    def point_in_cluster?( p, cluster_id )
+      clusters[cluster_id].include?( p )
+    end
+    # True when +p+ is currently in the noise group.
+    def noise?(p)
+      point_in_cluster?(p, NOISE_ID)
+    end
+    # True when +p+ is currently in the unclassified group.
+    def unclassified?(p)
+      point_in_cluster?(p, UNCLASSIFIED_ID)
+    end
+    # Moves +p+ into the noise group.
+    def assign_to_noise( p )
+      assign_point( p, NOISE_ID )
+    end
+    # Assigns each of +points+ to +cluster_id+ (see assign_point).
+    def assign_points(points, cluster_id)
+      points.each { |point| assign_point( point, cluster_id ) }
+    end
+    # Puts +p+ into the group +cluster_id+.
+    #
+    # A point found in the noise or unclassified group is taken out of it
+    # first. A point found in neither may only be placed in the unclassified
+    # group; any other target raises ArgumentError.
+    def assign_point( p, cluster_id )
+      # Noise is checked before unclassified, as the two lookups create
+      # their groups on first access.
+      current_group =
+        if noise?(p)
+          NOISE_ID
+        elsif unclassified?(p)
+          UNCLASSIFIED_ID
+        end
+      if current_group
+        remove_point_from_cluster(p, current_group)
+      elsif cluster_id != UNCLASSIFIED_ID
+        raise ArgumentError, "Trying to move point that has already been assigned to some other cluster. Point: #{p}, cluster_id=#{cluster_id}"
+      end
+      clusters[cluster_id] << p
+    end
+    # Builds one Cluster per non-empty group, named via cluster_name.
+    def to_clusters
+      populated = clusters.reject { |_, members| members.empty? }
+      populated.map { |id, members| Cluster.new(members, cluster_name(id)) }
+    end
+    # Display name for a group id.
+    def cluster_name(id)
+      if NOISE_ID == id
+        "Noise"
+      elsif UNCLASSIFIED_ID == id
+        "Unclassified"
+      else
+        "Cluster #{id}"
+      end
+    end
+    # Deletes +p+ from the group +cluster_id+.
+    # Returns true when the point was a member, false otherwise.
+    def remove_point_from_cluster(p, cluster_id)
+      members = clusters[cluster_id]
+      return false if members.nil?
+      was_member = members.include?(p)
+      members.delete(p)
+      was_member
+    end
+  end
+end

data/lib/same_same/dbscan_neighborhood.rb ADDED

@@ -0,0 +1,68 @@
+require 'set'
+module SameSame
+  # Precomputes the pairwise distances between a list of points and answers
+  # "which points lie within eps of this one?" queries.
+  class DbscanNeighborhood
+    # Contains distances between points.
+    attr_accessor :adjacency_matrix
+    # Threshold value. Determines which points will be considered as
+    # neighbors. Two points are neighbors if the distance between them does
+    # not exceed threshold value.
+    attr_accessor :eps
+    # used to cache index of points in the matrix
+    attr_accessor :index_mapping
+    attr_accessor :points
+    # Initializes the neighborhood with all data that it needs.
+    #
+    #  * points            - points to cluster (required)
+    #  * eps               - distance threshold value (required)
+    #  * distance          - object responding to distance(x, y) (required)
+    #  * vector_calculator - object responding to vectors(p1, p2); defaults
+    #                        to DbscanNumericVectors.new
+    #
+    # Raises KeyError when a required option is missing. Any other key in
+    # attrs is ignored.
+    def initialize(attrs = {})
+      self.eps    = attrs.fetch(:eps)
+      self.points = attrs.fetch(:points)
+      build_index_mapping
+      vector_calculator = attrs[:vector_calculator] || DbscanNumericVectors.new
+      distance          = attrs.fetch( :distance )
+      self.adjacency_matrix =
+        calculate_adjacency_matrix(distance, points, vector_calculator)
+    end
+    # Returns a Set of every point whose distance to +p+ is <= eps. The
+    # diagonal of the matrix is 0.0, so +p+ itself is included whenever
+    # eps >= 0.
+    def neighbors_of( p )
+      row = index_mapping[p]
+      Set.new.tap do |neighbors|
+        # Walk the points themselves rather than the index cache: the cache
+        # has fewer entries than there are points when points contains equal
+        # elements, which would leave trailing matrix columns unexamined.
+        points.each_with_index do |candidate, column|
+          neighbors.add(candidate) if adjacency_matrix.lookup(row, column) <= eps
+        end
+      end
+    end
+    private
+    # Caches point => position in +points+. For equal points the last
+    # position wins.
+    def build_index_mapping
+      self.index_mapping = {}
+      points.each_with_index do |p,i|
+        index_mapping[p] = i
+      end
+      index_mapping
+    end
+    # Fills a SymmetricalMatrix with the distance between every pair of
+    # points: 0.0 on the diagonal, distance.distance(x, y) for each pair
+    # i < j, where x and y come from vector_calculator.vectors.
+    def calculate_adjacency_matrix(distance, points, vector_calculator)
+      SymmetricalMatrix.new( points.size ).tap do |m|
+        points.size.times do |i|
+          m.set(i,i, 0.0)
+          ((i + 1)...points.size).each do |j|
+            x, y = vector_calculator.vectors( points[i], points[j] )
+            m.set(i,j, distance.distance(x, y))
+          end
+        end
+      end
+    end
+  end
+end

data/lib/same_same/dbscan_numeric_vectors.rb ADDED

@@ -0,0 +1,7 @@
+module SameSame
+  # Vector calculator that hands each point's +data+ through unchanged.
+  class DbscanNumericVectors
+    # Returns a two-element array: the +data+ of p1 followed by the +data+
+    # of p2.
+    def vectors( p1, p2 )
+      [p1, p2].map { |point| point.data }
+    end
+  end
+end

data/lib/same_same/dbscan_term_frequency_vectors.rb ADDED

@@ -0,0 +1,7 @@
+module SameSame
+  # Vector calculator that delegates to TermFrequencyBuilder.
+  class DbscanTermFrequencyVectors
+    # Passes the +data+ of both points to TermFrequencyBuilder.build_vectors
+    # and returns whatever that call returns.
+    def vectors(p1, p2)
+      first_data  = p1.data
+      second_data = p2.data
+      TermFrequencyBuilder.build_vectors( first_data, second_data )
+    end
+  end
+end

data/lib/same_same/dendrogram.rb ADDED

@@ -0,0 +1,28 @@
+module SameSame
+  # Ordered list of named levels, each holding a snapshot of the clusters
+  # that existed when the level was recorded.
+  class Dendrogram
+    # One recorded level: its name and its clusters.
+    Level = Struct.new(:name, :clusters)
+    # levels      - Array of Level, in the order they were added
+    # level_label - label describing what the level names mean
+    attr_accessor :levels, :level_label
+    # Creates an empty dendrogram whose level_label is +name+.
+    def initialize(name)
+      self.levels = []
+      self.level_label = name
+    end
+    # Appends a level. Every cluster is dup'ed (shallow copy) so that later
+    # changes to the caller's cluster objects do not alter the recorded level.
+    def add_level( name, clusters )
+      self.levels << Level.new(name, clusters.map(&:dup))
+    end
+    # Returns the level at position +i+, or nil when there is none.
+    def [](i)
+      levels[i]
+    end
+    # True when the most recent level contains a cluster with more than one
+    # member. Returns false when no level has been added yet.
+    def non_singelton_leaves?
+      return false if levels.empty?
+      levels.last.clusters.any? {|cluster| cluster.size > 1}
+    end
+    # Correctly spelled name for the same check; the original spelling is
+    # kept for existing callers.
+    alias_method :non_singleton_leaves?, :non_singelton_leaves?
+  end
+end

data/lib/same_same/dendrogram_printer.rb ADDED

@@ -0,0 +1,74 @@
+require 'colored'
+module SameSame
+  # Writes dendrogram levels and their clusters to standard output. Relies
+  # on String#bold, String#red and String#cyan being available.
+  class DendrogramPrinter
+    # Prints the multi-point clusters of the last level of +dnd+.
+    def print_last(dnd)
+      print_clusters( dnd.levels.last.clusters )
+    end
+    # Prints every cluster that has more than one member, preceded by a
+    # "## name" heading when the cluster has a name.
+    def print_clusters(clusters)
+      clusters.each do |cluster|
+        next unless cluster.size > 1
+        puts "## #{cluster.name}" if cluster.name
+        print_points( cluster.datapoints )
+      end
+    end
+    # Prints every level of +dnd+: a header with the level name and the
+    # cluster / ungrouped counts, then each multi-point cluster. After the
+    # final level the points of the single-point clusters are listed too.
+    def print(dnd)
+      dnd.levels.each_with_index do |level, position|
+        singles   = level.clusters.select { |cluster| cluster.size == 1 }
+        ungrouped = singles.map(&:datapoints).flatten
+        rule      = "-" * 80
+        puts "", rule, "#{dnd.level_label}: #{level.name}", ""
+        puts "Clusters: #{level.clusters.size - singles.size}", "Ungrouped: #{ungrouped.size}", rule, ""
+        level.clusters.each do |cluster|
+          print_points( cluster.datapoints ) if cluster.size > 1
+        end
+        puts
+        next unless position == dnd.levels.size - 1
+        puts "FINAL UNGROUPED"
+        print_points(ungrouped)
+      end
+    end
+    # Rebuilds +content+ word by word: words whose downcased form is in
+    # +common_words+ are left plain, all other words are made bold red.
+    def highlight_common( content, common_words )
+      styled = content.strip.split(/\s+/).map do |word|
+        if common_words.include?(word.downcase)
+          word
+        else
+          word.bold.red
+        end
+      end
+      styled.join(" ")
+    end
+    # Formats one datapoint id. A leading "digits:" prefix is printed in
+    # cyan and the remainder goes through highlight_common; ids without such
+    # a prefix go through highlight_common as a whole.
+    def formatted_datapoint_name( content, common_words )
+      numbered = /^(\d+:)(.*)/.match(content)
+      return highlight_common( content, common_words ) unless numbered
+      "#{numbered[1].cyan} #{highlight_common( numbered[2], common_words )}"
+    end
+    # Prints the given datapoints, one per line, between two blank lines.
+    # Lines are ordered by id with any leading "digits:" prefix ignored.
+    def print_points(datapoints)
+      puts
+      term_lists = datapoints.map { |dp| dp.id.downcase.split(/\s+/) }
+      # Words present in the id of every datapoint.
+      shared = term_lists.inject(term_lists.flatten.uniq) { |kept, terms| kept & terms }
+      ordered = datapoints.sort_by { |dp| dp.id.gsub(/^\d+:/, '') }
+      ordered.each { |dp| puts formatted_datapoint_name( dp.id, shared ) }
+      puts
+    end
+  end
+end

data/lib/same_same/jaquard_coefficient.rb ADDED

@@ -0,0 +1,9 @@
+module SameSame
+  # Similarity of two collections measured as
+  # size of intersection / size of union.
+  class JaquardCoefficient
+    # Returns a Float between 0.0 and 1.0.
+    #
+    # x and y must respond to empty?, & and |. When exactly one of them is
+    # empty the result is 0.0; when both are empty ArgumentError is raised.
+    def similarity( x, y )
+      if x.empty? || y.empty?
+        raise ArgumentError, "both sets cannot be empty" if x.empty? && y.empty?
+        return 0.0
+      end
+      shared   = (x & y).size
+      combined = (x | y).size
+      shared.to_f / combined.to_f
+    end
+  end
+end

data/lib/same_same/link_matrix.rb ADDED

@@ -0,0 +1,62 @@
+module SameSame
+  # Link counts between datapoints: the number of links between two points
+  # is the number of points that are neighbours of both.
+  class LinkMatrix
+    # links        - SymmetricalMatrix of link counts, addressed by index
+    # index_lookup - Hash of datapoint => index into +links+
+    attr_reader :links, :index_lookup
+    # Required attrs:
+    #
+    #  * similarity_matrix - responds to lookup(x, y)
+    #  * datapoints        - the points, in matrix index order
+    #  * th                - two points are neighbours when their similarity
+    #                        is >= th
+    #
+    # Raises KeyError when one of them is missing.
+    def initialize( attrs = {} )
+      similarity_matrix = attrs.fetch(:similarity_matrix)
+      datapoints        = attrs.fetch(:datapoints)
+      th                = attrs.fetch(:th)
+      neighbour_matrix  = calculate_neighbours( datapoints, similarity_matrix, th )
+      @links        = calculate_links( neighbour_matrix, datapoints )
+      @index_lookup = calculate_index_lookup( datapoints )
+    end
+    # Sum of the link counts over every pair (p1, p2) with p1 taken from
+    # cluster1 and p2 taken from cluster2.
+    def count_links_between_clusters( cluster1, cluster2 )
+      cluster1.inject(0) do |total, p1|
+        row_links = cluster2.inject(0) do |row, p2|
+          row + number_of_links_between_points(p1, p2)
+        end
+        total + row_links
+      end
+    end
+    # Link count stored for the two given datapoints.
+    def number_of_links_between_points( datapoint1, datapoint2 )
+      first  = index_lookup[datapoint1]
+      second = index_lookup[datapoint2]
+      links.lookup( first, second )
+    end
+    private
+    # Maps each datapoint to its position in +datapoints+.
+    def calculate_index_lookup( datapoints )
+      datapoints.each_with_index.each_with_object({}) do |(point, position), index|
+        index[point] = position
+      end
+    end
+    # Matrix whose (x, y) entry is produced by number_of_links.
+    def calculate_links( neighbours, datapoints )
+      link_count = ->(x, y) { number_of_links(neighbours, datapoints, x, y) }
+      SymmetricalMatrix.new( neighbours.size, link_count )
+    end
+    # Matrix whose (x, y) entry is 1 when the similarity of x and y is at
+    # least th, and 0 otherwise.
+    def calculate_neighbours( datapoints, similarity_matrix, th )
+      neighbour_flag = ->(x, y) { similarity_matrix.lookup(x, y) >= th ? 1 : 0 }
+      SymmetricalMatrix.new( datapoints.size, neighbour_flag )
+    end
+    # Counts the indices i for which both (x, i) and (i, y) are neighbours;
+    # entries are 0 or 1, so each product is 1 exactly when both hold.
+    def number_of_links(neighbors, datapoints, x, y)
+      datapoints.size.times.inject(0) do |count, i|
+        count + neighbors.lookup(x, i) * neighbors.lookup(i, y)
+      end
+    end
+  end
+end

data/lib/same_same/merge_goodness_measure.rb ADDED

@@ -0,0 +1,30 @@
+module SameSame
+  # Scores a candidate merge of two clusters from the number of links between
+  # them and the sizes of both clusters.
+  class MergeGoodnessMeasure
+    # th - the threshold given to the constructor
+    # p  - exponent derived from th: 1 + 2 * f(th)
+    attr_reader :th, :p
+    # th should be >= 0 and <= 1
+    #   0 means all datapoints are neighbours
+    #   1 means no datapoints are neighbours
+    #   (proximity)
+    def initialize( th )
+      @th = th
+      @p = 1.0 + 2.0 * f( th )
+    end
+    # Goodness of merging two clusters:
+    #
+    #   number_of_links / ((size_x + size_y)^p - size_x^p - size_y^p)
+    #
+    #  * number_of_links - links counted between the two clusters
+    #  * size_x          - number of points in the first cluster
+    #  * size_y          - number of points in the second cluster
+    #
+    # The result is symmetric in size_x and size_y. When th is 1 the
+    # exponent is 1.0 and the denominator is 0.0, so the result is then
+    # infinite (or NaN for zero links).
+    def g(number_of_links, size_x, size_y)
+      combined = (size_x + size_y) ** p
+      own_x    = size_x ** p
+      # The second cluster contributes its own size; using size_x here again
+      # would skew the score whenever the two sizes differ.
+      own_y    = size_y ** p
+      number_of_links / (combined - own_x - own_y)
+    end
+    private
+    # (1 - th) / (1 + th)
+    def f( th )
+      (1.0 - th) / (1.0 + th)
+    end
+  end
+end

data/lib/same_same/rock_algorithm.rb ADDED

@@ -0,0 +1,51 @@
+require 'same_same/dendrogram'
+require 'same_same/link_matrix'
+require 'same_same/merge_goodness_measure'
+require 'same_same/rock_clusters'
+require 'same_same/similarity_matrix'
+module SameSame
+  # Runs the ROCK merge loop over a set of datapoints and records every
+  # intermediate clustering in a Dendrogram.
+  class RockAlgorithm
+    attr_accessor :datapoints, :similarity_measure, :k, :th, :link_matrix, :similarity_matrix
+    # Options:
+    #
+    #  * datapoints         - points to cluster; each responds to +data+ (required)
+    #  * k                  - merging stops once no more than k clusters remain (required)
+    #  * th                 - threshold handed to LinkMatrix and MergeGoodnessMeasure (required)
+    #  * similarity_measure - defaults to JaquardCoefficient.new
+    #
+    # Raises KeyError when a required option is missing.
+    def initialize( attrs = {} )
+      points  = attrs.fetch( :datapoints )
+      measure = attrs[ :similarity_measure ] || JaquardCoefficient.new
+      self.datapoints         = points
+      self.similarity_measure = measure
+      self.k                  = attrs.fetch( :k )
+      self.th                 = attrs.fetch( :th )
+      self.similarity_matrix  = SimilarityMatrix.new( measure, points.map(&:data) )
+      self.link_matrix        = LinkMatrix.new( datapoints: points, similarity_matrix: similarity_matrix, th: th )
+    end
+    # Returns a Dendrogram labelled "Goodness". Its first level, named
+    # "Infinity", holds one cluster per datapoint; every later level is named
+    # after the value returned by the merge that produced it.
+    def cluster
+      dendrogram = Dendrogram.new( "Goodness" )
+      singletons = one_point_per_cluster
+      dendrogram.add_level( Float::INFINITY.to_s, singletons )
+      merger = RockClusters.new(
+        link_matrix:      link_matrix,
+        clusters:         singletons,
+        goodness_measure: MergeGoodnessMeasure.new( th ))
+      remaining = merger.size
+      while remaining > k
+        goodness = merger.merge_best_candidates
+        after_merge = merger.size
+        # An unchanged cluster count means nothing was merged, so stop.
+        break if after_merge == remaining
+        remaining = after_merge
+        dendrogram.add_level( goodness.to_s, merger.clusters )
+      end
+      dendrogram
+    end
+    # One single-point Cluster per datapoint.
+    def one_point_per_cluster
+      datapoints.map { |datapoint| Cluster.new([datapoint]) }
+    end
+  end
+end

data/lib/same_same/rock_clusters.rb ADDED

@@ -0,0 +1,68 @@
+require 'same_same/cluster_similarity'
+module SameSame
+  # Working set of clusters during ROCK merging. Clusters are stored under
+  # integer keys and, for each cluster, the best-ranked linked partner is
+  # cached in closest_clusters.
+  class RockClusters
+    attr_accessor :link_matrix, :clusters, :goodness_measure, :cluster_map, :closest_clusters
+    # Options:
+    #
+    #  * link_matrix      - responds to count_links_between_clusters (required)
+    #  * goodness_measure - responds to g(links, size_x, size_y) (required)
+    #  * clusters         - initial clusters, each stored under a fresh key
+    def initialize( attrs = {} )
+      self.link_matrix      = attrs.fetch(:link_matrix)
+      self.goodness_measure = attrs.fetch(:goodness_measure)
+      self.cluster_map      = {}
+      @last_key = -1
+      attrs[:clusters].each { |initial| add_cluster(initial) }
+      calculate_closest_clusters
+    end
+    # Merges the pair chosen by find_most_similar_pair and returns the
+    # goodness of that pair. Returns nil, merging nothing, when no cluster
+    # has a linked partner.
+    def merge_best_candidates
+      key, partner = find_most_similar_pair
+      return unless key
+      merge_clusters key, partner.cluster_key
+      partner.goodness
+    end
+    # Returns [cluster_key, similarity] for the closest_clusters entry that
+    # sorts first by goodness, or [] when there are no entries.
+    #
+    # NOTE(review): this takes the entry with the smallest goodness value;
+    # confirm against ClusterSimilarity's ordering that this is intended.
+    def find_most_similar_pair
+      ranked = closest_clusters.sort_by { |entry| entry.last.goodness }
+      ranked.first || []
+    end
+    # Number of clusters currently held.
+    def size
+      cluster_map.size
+    end
+    # The clusters currently held, without their keys.
+    def clusters
+      cluster_map.values
+    end
+    # Stores +c+ under the next free key and returns +c+.
+    def add_cluster( c )
+      cluster_map[next_key] = c
+    end
+    # Advances and returns the key counter.
+    def next_key
+      @last_key += 1
+    end
+    # Rebuilds closest_clusters from scratch: for every cluster, each other
+    # cluster with at least one link to it becomes a ClusterSimilarity
+    # candidate, and the candidate that sorts first is kept. Clusters with no
+    # linked partner get no entry.
+    def calculate_closest_clusters
+      self.closest_clusters = {}
+      cluster_map.each do |key, members|
+        candidates = []
+        cluster_map.each do |other_key, other_members|
+          next if key == other_key
+          link_count = link_matrix.count_links_between_clusters( members, other_members )
+          next unless link_count > 0
+          score = goodness_measure.g( link_count, members.size, other_members.size )
+          candidates << ClusterSimilarity.new( other_key, score )
+        end
+        best = candidates.sort.first
+        closest_clusters[key] = best if best
+      end
+    end
+    # Removes the clusters stored under key1 and key2, stores their
+    # concatenation (+) under a new key, refreshes closest_clusters and
+    # returns the merged cluster.
+    def merge_clusters(key1, key2)
+      first  = cluster_map.delete(key1)
+      second = cluster_map.delete(key2)
+      merged = add_cluster( first + second )
+      calculate_closest_clusters
+      merged
+    end
+  end
+end