RubyGems - text_rank - Versions diffs - 1.1.5 → 1.2.3 - Mend

text_rank 1.1.5 → 1.2.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (15) hide show

checksums.yaml +5 -5
data/.codeclimate.yml +1 -6
data/.rubocop.yml +55 -1077
data/.ruby-version +1 -1
data/.travis.yml +13 -5
data/{LICENSE.txt → LICENSE} +0 -0
data/README.md +2 -1
data/lib/text_rank.rb +5 -0
data/lib/text_rank/fingerprint.rb +99 -0
data/lib/text_rank/keyword_extractor.rb +1 -1
data/lib/text_rank/rank_filter/collapse_adjacent.rb +7 -2
data/lib/text_rank/tokenizer/number.rb +8 -8
data/lib/text_rank/version.rb +1 -1
data/text_rank.gemspec +9 -10
metadata +35 -34

data/.ruby-version CHANGED

	@@ -1 +1 @@
1	- ruby-2.1.2
1	+ ruby-2.5.1

data/.travis.yml CHANGED

@@ -1,7 +1,15 @@
+env:
+  global:
+    - CC_TEST_REPORTER_ID=6ab030bf370ffc2abbf0ba4d70a1c8d9649f6fd1426f48f6d43d5c9eb15f187f
 language: ruby
 rvm:
-  - 2.1.2
-before_install: gem install bundler -v 1.11.2
-addons:
-    code_climate:
-        repo_token: 6ab030bf370ffc2abbf0ba4d70a1c8d9649f6fd1426f48f6d43d5c9eb15f187f
+  - 2.5.1
+before_install: gem install bundler -v 1.17.3
+before_script:
+  - curl -L https://codeclimate.com/downloads/test-reporter/test-reporter-latest-linux-amd64 > ./cc-test-reporter
+  - chmod +x ./cc-test-reporter
+  - ./cc-test-reporter before-build
+script:
+  - bundle exec rspec
+after_script:
+  - ./cc-test-reporter after-build --exit-code $TRAVIS_TEST_RESULT

data/{LICENSE.txt → LICENSE} RENAMED

File without changes

data/README.md CHANGED

@@ -7,6 +7,7 @@
 ## Status
+[![Gem Version](https://badge.fury.io/rb/text_rank.svg)](https://badge.fury.io/rb/text_rank)
 [![Travis Build Status](https://travis-ci.org/david-mccullars/text_rank.svg?branch=master)](https://travis-ci.org/david-mccullars/text_rank)
 [![Code Climate](https://codeclimate.com/github/david-mccullars/text_rank/badges/gpa.svg)](https://codeclimate.com/github/david-mccullars/text_rank)
 [![Test Coverage](https://codeclimate.com/github/david-mccullars/text_rank/badges/coverage.svg)](https://codeclimate.com/github/david-mccullars/text_rank/coverage)
@@ -127,7 +128,7 @@ multiplication.  Each iteration is O(N^3) where N is the number of graph nodes.
 ## License
-MIT. See the `LICENSE.txt` file.
+MIT. See the `LICENSE` file.
 ## References

data/lib/text_rank.rb CHANGED

@@ -8,6 +8,7 @@ require 'page_rank'
 module TextRank
   autoload :CharFilter,       'text_rank/char_filter'
+  autoload :Fingerprint,      'text_rank/fingerprint'
   autoload :GraphStrategy,    'text_rank/graph_strategy'
   autoload :KeywordExtractor, 'text_rank/keyword_extractor'
   autoload :RankFilter,       'text_rank/rank_filter'
@@ -31,4 +32,8 @@ module TextRank
     TextRank::KeywordExtractor.advanced(**options).extract(text, **options)
   end
+  def self.similarity(keywords1, keywords2)
+    TextRank::Fingerprint.new(*keywords1).similarity(TextRank::Fingerprint.new(*keywords2))
+  end
 end

data/lib/text_rank/fingerprint.rb ADDED

@@ -0,0 +1,99 @@
+require 'set'
+module TextRank
+  ##
+  # Class used to compare documents according to TextRank. A "fingerprint"
+  # represents the first N keywords (in order from most significant to least) from
+  # applying the TextRank algorithm.  To compare two "fingerprints" we apply an
+  # algorithm that looks at each of the N prefixes and counts the overlap.  This
+  # rewards matches of significant keywords much higher than matches of less
+  # significant keywords.  But to prevent less significant keywords from being
+  # completely ignored we apply an inverse log linear transformation to each of the
+  # N prefixes.
+  #
+  # For example, consider the following comparison:
+  #
+  #   town man empty found
+  #   vs.
+  #   general empty found jar
+  #
+  # The first pass considers just the first keywords: town vs. general.  As these
+  # are different, they contribute 0.
+  #
+  # The second pass considers the first two keywords: town man vs general empty.
+  # Again, no overlap, so they contribute 0.
+  #
+  # The third pass considers the first three keywords: town man empty vs general
+  # empty found.  Here we have one overlap: empty. This contributes 1.
+  #
+  # The fourth pass considers all four keywords, and there are two overlaps:
+  # empty & found.  This contributes 2.
+  #
+  # We can represent the overlaps as the vector [0, 0, 1, 2].  Then we will apply
+  # the inverse log linear transformation defined by:
+  #
+  #   f(x_i) = x_i / ln(i + 1)
+  #          = [0, 0, 1 / ln(4), 2 / ln(5)]
+  #          = [0, 0, 0.7213475204444817, 1.2426698691192237]
+  #
+  # Finally we take the average of the transformed vector and normalize it (to
+  # ensure a final value between 0.0 and 1.0):
+  #
+  #   norm(avg(SUM f(x_i))) = norm( avg(1.9640173895637054) )
+  #                         = norm( 0.49100434739092635 )
+  #                         = 0.49100434739092635 / avg(SUM f(1, 2, 3, 4))
+  #                         = 0.49100434739092635 / avg(7.912555793714532)
+  #                         = 0.49100434739092635 / 1.978138948428633
+  #                         = 0.24821529740414025
+  ##
+  class Fingerprint
+    attr_reader :values, :size
+    # Creates a new fingerprint for comparison with another fingerprint
+    # @param values [Array] The fingerprint values (passed as individual arguments), each of any hashable type.
+    # @return [Fingerprint]
+    def initialize(*values)
+      @size = values.size
+      @values = values
+    end
+    # Calculates the "similarity" between this fingerprint and another
+    # @param trf2 [Fingerprint] A second fingerprint to compare
+    # @return [Number] A number between 0.0 (different) and 1.0 (same)
+    def similarity(trf2)
+      return 1.0 if values == trf2.values
+      sim = 0
+      s1 = Set.new
+      s2 = Set.new
+      [size, trf2.size].max.times.reduce(0) do |sum, i|
+        v1 = values[i]
+        v2 = trf2.values[i]
+        if v1 == v2
+          sim += 1
+        else
+          s1.delete?(v2) ? (sim += 1) : (s2 << v2)
+          s2.delete?(v1) ? (sim += 1) : (s1 << v1)
+        end
+        sum + sim * linear_transform[i]
+      end
+    end
+    private
+    def linear_transform
+      @linear_transform ||= size.times.map do |i|
+        1.0 / Math.log(i + 2) / size.to_f / norm_factor
+      end
+    end
+    def norm_factor
+      @norm_factor ||= size.times.reduce(0.0) do |s, i|
+        s + (i + 1) / Math.log(i + 2) / size.to_f
+      end
+    end
+  end
+end

data/lib/text_rank/keyword_extractor.rb CHANGED

@@ -41,7 +41,7 @@ module TextRank
     # @option options [Array<Class, Symbol, #filter!>]  :rank_filters A list of filters to be applied to the keyword ranks after keyword extraction
     def initialize(**options)
       @page_rank_options = {
-        strategy: options[:strategy] || :dense,
+        strategy: options[:strategy] || :sparse,
         damping: options[:damping],
         tolerance: options[:tolerance],
       }

data/lib/text_rank/rank_filter/collapse_adjacent.rb CHANGED

@@ -105,7 +105,8 @@ module TextRank
           # until all of the top N final keywords (single or collapsed) have been
           # considered.
           loop do
-            single_tokens_to_consider = @tokens.keys.first(@ranks_to_collapse + @to_remove.size - @to_collapse.size) - @to_remove.to_a
+            regexp_safe_tokens = @tokens.keys.select { |s| Regexp.escape(s) == s }
+            single_tokens_to_consider = regexp_safe_tokens.first(@ranks_to_collapse + @to_remove.size - @to_collapse.size) - @to_remove.to_a
             scan_text_for_all_permutations_of(single_tokens_to_consider) or break
             decide_what_to_collapse_and_what_to_remove
           end
@@ -113,7 +114,11 @@ module TextRank
           # We now know what to collapse and what to remove, so we can start safely
           # modifying the tokens hash
           @to_collapse.each do |perm|
-            values = @tokens.values_at(*perm)
+            values = @tokens.values_at(*perm).compact
+            # This might be empty if somehow the scanned permutation doesn't
+            # exactly match one of the tokens (e.g. ASCII-folding gone awry).
+            # The goal is to do the best we can, so if we can't find it, ignore.
+            next if values.empty?
             @tokens[perm.join(@delimiter)] = values.reduce(:+) / values.size
           end
           @tokens.reject! do |k, _|

data/lib/text_rank/tokenizer/number.rb CHANGED

@@ -7,23 +7,23 @@ module TextRank
     ##
     Number = %r{
       (
-        [1-9]\d{0,2}        # 453
-        (?:,\d{3})*         # 453,231,162
-        (?:\.\d{0,2})?      # 453,231,162.17
+        [1-9]\d{3,}       # 453231162
+        (?:\.\d+)?        # 453231162.17
         |
-        [1-9]\d*            # 453231162
-        (?:\.\d{0,2})?      # 453231162.17
+        [1-9]\d{0,2}      # 453
+        (?:,\d{3})*       # 453,231,162
+        (?:\.\d+)?        # 453,231,162.17
         |
-        0                   # 0
-        (?:\.\d{0,2})?      # 0.17
+        0                 # 0
+        (?:\.\d+)?        # 0.17
         |
-        (?:\.\d{1,2})       # .17
+        (?:\.\d+)         # .17
       )
     }x

data/lib/text_rank/version.rb CHANGED

@@ -1,4 +1,4 @@
 module TextRank
   # Current gem version
-  VERSION = '1.1.5'
+  VERSION = '1.2.3'
 end

data/text_rank.gemspec CHANGED

@@ -1,4 +1,3 @@
-# coding: utf-8
 lib = File.expand_path('../lib', __FILE__)
 $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
 require 'text_rank/version'
@@ -9,8 +8,8 @@ Gem::Specification.new do |spec|
   spec.authors       = ['David McCullars']
   spec.email         = ['david.mccullars@gmail.com']
-  spec.summary       = %q{Implementation of TextRank solution to ranked keyword extraction}
-  spec.description   = %q{Implementation of TextRank solution to ranked keyword extraction.  See https://web.eecs.umich.edu/~mihalcea/papers/mihalcea.emnlp04.pdf}
+  spec.summary       = 'Implementation of TextRank solution to ranked keyword extraction'
+  spec.description   = 'Implementation of TextRank solution to ranked keyword extraction.  See https://web.eecs.umich.edu/~mihalcea/papers/mihalcea.emnlp04.pdf'
   spec.homepage      = 'https://github.com/david-mccullars/text_rank'
   spec.license       = 'MIT'
@@ -19,12 +18,12 @@ Gem::Specification.new do |spec|
   spec.executables   = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
   spec.require_paths = ['lib']
-  spec.add_development_dependency 'bundler',    '~> 1.11'
-  spec.add_development_dependency 'rake',       '~> 10.0'
-  spec.add_development_dependency 'rspec',      '~> 3.0'
-  spec.add_development_dependency 'simplecov',  '~> 0.11'
-  spec.add_development_dependency 'codeclimate-test-reporter'
+  spec.add_development_dependency 'bundler'
+  spec.add_development_dependency 'rake'
+  spec.add_development_dependency 'rspec'
+  spec.add_development_dependency 'rubocop'
+  spec.add_development_dependency 'simplecov', '~> 0.17.0' # 0.18 not supported by code climate
-  spec.add_development_dependency 'engtagger',  '~> 0.2.0' # Optional runtime dependency but needed for specs
-  spec.add_development_dependency 'nokogiri',   '~> 1.0'   # Optional runtime dependency but needed for specs
+  spec.add_development_dependency 'engtagger' # Optional runtime dependency but needed for specs
+  spec.add_development_dependency 'nokogiri'  # Optional runtime dependency but needed for specs
 end

metadata CHANGED

@@ -1,113 +1,113 @@
 --- !ruby/object:Gem::Specification
 name: text_rank
 version: !ruby/object:Gem::Version
-  version: 1.1.5
+  version: 1.2.3
 platform: ruby
 authors:
 - David McCullars
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2016-05-23 00:00:00.000000000 Z
+date: 2020-06-10 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: bundler
   requirement: !ruby/object:Gem::Requirement
     requirements:
-    - - "~>"
+    - - ">="
       - !ruby/object:Gem::Version
-        version: '1.11'
+        version: '0'
   type: :development
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
-    - - "~>"
+    - - ">="
       - !ruby/object:Gem::Version
-        version: '1.11'
+        version: '0'
 - !ruby/object:Gem::Dependency
   name: rake
   requirement: !ruby/object:Gem::Requirement
     requirements:
-    - - "~>"
+    - - ">="
       - !ruby/object:Gem::Version
-        version: '10.0'
+        version: '0'
   type: :development
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
-    - - "~>"
+    - - ">="
       - !ruby/object:Gem::Version
-        version: '10.0'
+        version: '0'
 - !ruby/object:Gem::Dependency
   name: rspec
   requirement: !ruby/object:Gem::Requirement
     requirements:
-    - - "~>"
+    - - ">="
       - !ruby/object:Gem::Version
-        version: '3.0'
+        version: '0'
   type: :development
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
-    - - "~>"
+    - - ">="
       - !ruby/object:Gem::Version
-        version: '3.0'
+        version: '0'
 - !ruby/object:Gem::Dependency
-  name: simplecov
+  name: rubocop
   requirement: !ruby/object:Gem::Requirement
     requirements:
-    - - "~>"
+    - - ">="
       - !ruby/object:Gem::Version
-        version: '0.11'
+        version: '0'
   type: :development
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
-    - - "~>"
+    - - ">="
       - !ruby/object:Gem::Version
-        version: '0.11'
+        version: '0'
 - !ruby/object:Gem::Dependency
-  name: codeclimate-test-reporter
+  name: simplecov
   requirement: !ruby/object:Gem::Requirement
     requirements:
-    - - ">="
+    - - "~>"
       - !ruby/object:Gem::Version
-        version: '0'
+        version: 0.17.0
   type: :development
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
-    - - ">="
+    - - "~>"
       - !ruby/object:Gem::Version
-        version: '0'
+        version: 0.17.0
 - !ruby/object:Gem::Dependency
   name: engtagger
   requirement: !ruby/object:Gem::Requirement
     requirements:
-    - - "~>"
+    - - ">="
       - !ruby/object:Gem::Version
-        version: 0.2.0
+        version: '0'
   type: :development
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
-    - - "~>"
+    - - ">="
       - !ruby/object:Gem::Version
-        version: 0.2.0
+        version: '0'
 - !ruby/object:Gem::Dependency
   name: nokogiri
   requirement: !ruby/object:Gem::Requirement
     requirements:
-    - - "~>"
+    - - ">="
       - !ruby/object:Gem::Version
-        version: '1.0'
+        version: '0'
   type: :development
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
-    - - "~>"
+    - - ">="
       - !ruby/object:Gem::Version
-        version: '1.0'
+        version: '0'
 description: Implementation of TextRank solution to ranked keyword extraction.  See
   https://web.eecs.umich.edu/~mihalcea/papers/mihalcea.emnlp04.pdf
 email:
@@ -124,7 +124,7 @@ files:
 - ".travis.yml"
 - CODE_OF_CONDUCT.md
 - Gemfile
-- LICENSE.txt
+- LICENSE
 - README.md
 - Rakefile
 - bin/console
@@ -141,6 +141,7 @@ files:
 - lib/text_rank/char_filter/strip_html.rb
 - lib/text_rank/char_filter/strip_possessive.rb
 - lib/text_rank/char_filter/undo_contractions.rb
+- lib/text_rank/fingerprint.rb
 - lib/text_rank/graph_strategy.rb
 - lib/text_rank/graph_strategy/coocurrence.rb
 - lib/text_rank/keyword_extractor.rb
@@ -182,7 +183,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
       version: '0'
 requirements: []
 rubyforge_project:
-rubygems_version: 2.5.1
+rubygems_version: 2.7.6
 signing_key:
 specification_version: 4
 summary: Implementation of TextRank solution to ranked keyword extraction