RubyGems - text_rank - Versions diffs - 1.1.6 → 1.1.7 - Mend

text_rank 1.1.6 → 1.1.7

Files changed (5) hide show

checksums.yaml +4 -4
data/lib/text_rank/fingerprint.rb +99 -0
data/lib/text_rank/version.rb +1 -1
data/lib/text_rank.rb +5 -0
metadata +4 -2

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: 0526b3b997480a7ff013bb2b6eebccb0cd0eb017
-  data.tar.gz: de2b5c80ba02acb4f2e987f9bd2ab39e6043a6d1
+  metadata.gz: 1625933f78441107094a4306a488cf0d68f45e53
+  data.tar.gz: 5e605f2ac2210c6e44e21a1c088620c5d491314c
 SHA512:
-  metadata.gz: 08eaffefcb143dec760ab093b9328eb2b2dc83b97532ca62d7f22975ae1998075fa336105d7cbdc67a33e53251c268d8353a9cae5be4e6f18ab75eb9af8712d3
-  data.tar.gz: 13481d213926e4577f061c27e273a230fb11cd7ea19150f5bf26341c355188403598155d8b9c6862b916432d1dda8908606f7e18e346ae59020abb5b9a6f8a02
+  metadata.gz: 1703773a7d6d1391ec81f26bac6aa409344d8f036352b24e0acf37511e141da88cc560c0bbc525563f5b2bd277ea528c9d30bb41f9768afb1987ae9acee42af6
+  data.tar.gz: fc23bc9f6d63a0361d88d804bc64448613de5aa6c22d1f5b223630bee18c485d3d35b7a12876bb5b4c25dda5efa537aad83ed2e686e12a04a4fed33d88e2147d

data/lib/text_rank/fingerprint.rb ADDED Viewed

@@ -0,0 +1,99 @@
+require 'set'
+module TextRank
+  ##
+  # Class used to compare documents according to TextRank. A "fingerprint"
+  # represents the first N keywords (in order from most significant to least) from
+  # applying the TextRank algorithm.  To compare two "fingerprints" we apply an
+  # algorithm that looks at each of the N prefixes and counts the overlap.  This
+  # rewards matches of significant keywords much higher than matches of less
+  # significant keywords.  But to prevent less significant keywords from being
+  # completely ignored we apply an inverse log linear transformation to each of the
+  # N prefixes.
+  #
+  # For example, consider the following comparison:
+  #
+  #   town man empty found
+  #   vs.
+  #   general empty found jar
+  #
+  # The first pass considers just the first keywords: town vs. general.  As these
+  # are different, they contribute 0.
+  #
+  # The second pass considers the first two keywords: town man vs general empty.
+  # Again, no overlap, so they contribute 0.
+  #
+  # The third pass considers the first three keywords: town man empty vs general
+  # empty found.  Here we have one overlap: empty. This contributes 1.
+  #
+  # The fourth pass considers all, and there are two overlaps: empty & found.  This
+  # contributes 2.
+  #
+  # We can represent the overlaps as the vector [0, 0, 1, 2].  Then we will apply
+  # the inverse log linear transformation (i is the 1-based position) defined by:
+  #
+  #   f(x_i) = x_i / ln(i + 1)
+  #          = [0, 0, 1 / ln(4), 2 / ln(5)]
+  #          = [0, 0, 0.7213475204444817, 1.2426698691192237]
+  #
+  # Finally we average the transformed vector and divide by the same average for a
+  # perfect match, [1, 2, 3, 4], so the final value lies between 0.0 and 1.0:
+  #
+  #   norm(avg(SUM f(x_i))) = norm( avg(1.9640173895637054) )
+  #                         = norm( 0.49100434739092635 )
+  #                         = 0.49100434739092635 / avg(SUM f(1, 2, 3, 4))
+  #                         = 0.49100434739092635 / avg(7.912555793714532)
+  #                         = 0.49100434739092635 / 1.978138948428633
+  #                         = 0.24821529740414025
+  ##
+  class Fingerprint
+    attr_reader :values, :size
+    # Creates a new fingerprint for comparison with another fingerprint
+    # @param values [Array] Fingerprint values of any hashable type (passed as a splat).
+    # @return [Fingerprint]
+    def initialize(*values)
+      @size = values.size
+      @values = values
+    end
+    # Calculates the "similarity" between this fingerprint and another
+    # @param trf2 [Fingerprint] A second fingerprint to compare
+    # @return [Float] A number between 0.0 (different) and 1.0 (same)
+    def similarity(trf2)
+      return 1.0 if values == trf2.values
+      sim = 0
+      s1 = Set.new
+      s2 = Set.new
+      [size, trf2.size].max.times.reduce(0) do |sum, i|
+        v1 = values[i]
+        v2 = trf2.values[i]
+        if v1 == v2
+          sim += 1
+        else
+          s1.delete?(v2) ? (sim += 1) : (s2 << v2)
+          s2.delete?(v1) ? (sim += 1) : (s1 << v1)
+        end
+        sum + sim * linear_transform[i]
+      end
+    end
+    private
+    def linear_transform
+      @linear_transform ||= size.times.map do |i|
+        1.0 / Math.log(i + 2) / size.to_f / norm_factor
+      end
+    end
+    def norm_factor
+      @norm_factor ||= size.times.reduce(0.0) do |s, i|
+        s + (i + 1) / Math.log(i + 2) / size.to_f
+      end
+    end
+  end
+end

data/lib/text_rank/version.rb CHANGED Viewed

@@ -1,4 +1,4 @@
 module TextRank
   # Current gem version
-  VERSION = '1.1.6'
+  VERSION = '1.1.7'
 end

data/lib/text_rank.rb CHANGED Viewed

@@ -8,6 +8,7 @@ require 'page_rank'
 module TextRank
   autoload :CharFilter,       'text_rank/char_filter'
+  autoload :Fingerprint,      'text_rank/fingerprint'
   autoload :GraphStrategy,    'text_rank/graph_strategy'
   autoload :KeywordExtractor, 'text_rank/keyword_extractor'
   autoload :RankFilter,       'text_rank/rank_filter'
@@ -31,4 +32,8 @@ module TextRank
     TextRank::KeywordExtractor.advanced(**options).extract(text, **options)
   end
+  def self.similarity(keywords1, keywords2)
+    TextRank::Fingerprint.new(*keywords1).similarity(TextRank::Fingerprint.new(*keywords2))
+  end
 end

metadata CHANGED Viewed

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: text_rank
 version: !ruby/object:Gem::Version
-  version: 1.1.6
+  version: 1.1.7
 platform: ruby
 authors:
 - David McCullars
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2016-05-24 00:00:00.000000000 Z
+date: 2016-07-05 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: bundler
@@ -141,6 +141,7 @@ files:
 - lib/text_rank/char_filter/strip_html.rb
 - lib/text_rank/char_filter/strip_possessive.rb
 - lib/text_rank/char_filter/undo_contractions.rb
+- lib/text_rank/fingerprint.rb
 - lib/text_rank/graph_strategy.rb
 - lib/text_rank/graph_strategy/coocurrence.rb
 - lib/text_rank/keyword_extractor.rb
@@ -187,3 +188,4 @@ signing_key:
 specification_version: 4
 summary: Implementation of TextRank solution to ranked keyword extraction
 test_files: []
+has_rdoc: