RubyGems - text_rank - Versions diffs - 1.1.7 → 1.2.5 - Mend

text_rank 1.1.7 → 1.2.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (38) hide show

checksums.yaml +5 -5
data/.codeclimate.yml +1 -6
data/.rubocop.yml +60 -1075
data/.ruby-version +1 -1
data/.travis.yml +13 -5
data/{LICENSE.txt → LICENSE} +0 -0
data/README.md +2 -1
data/bin/console +3 -3
data/lib/page_rank.rb +2 -0
data/lib/page_rank/base.rb +9 -8
data/lib/page_rank/dense.rb +2 -1
data/lib/page_rank/sparse.rb +6 -7
data/lib/text_rank.rb +12 -9
data/lib/text_rank/char_filter.rb +1 -1
data/lib/text_rank/char_filter/ascii_folding.rb +5 -1
data/lib/text_rank/char_filter/strip_possessive.rb +2 -2
data/lib/text_rank/char_filter/undo_contractions.rb +1 -137
data/lib/text_rank/char_filter/undo_contractions.yml +135 -0
data/lib/text_rank/fingerprint.rb +20 -28
data/lib/text_rank/fingerprint_overlap.rb +55 -0
data/lib/text_rank/graph_strategy/coocurrence.rb +15 -6
data/lib/text_rank/keyword_extractor.rb +32 -25
data/lib/text_rank/rank_filter/collapse_adjacent.rb +53 -25
data/lib/text_rank/rank_filter/normalize_probability.rb +2 -1
data/lib/text_rank/rank_filter/normalize_unit_vector.rb +2 -1
data/lib/text_rank/token_filter/part_of_speech.rb +0 -1
data/lib/text_rank/token_filter/stopwords.rb +1 -321
data/lib/text_rank/token_filter/stopwords.yml +317 -0
data/lib/text_rank/tokenizer.rb +1 -1
data/lib/text_rank/tokenizer/money.rb +11 -6
data/lib/text_rank/tokenizer/number.rb +4 -3
data/lib/text_rank/tokenizer/punctuation.rb +4 -1
data/lib/text_rank/tokenizer/url.rb +3 -0
data/lib/text_rank/tokenizer/whitespace.rb +4 -1
data/lib/text_rank/tokenizer/word.rb +5 -2
data/lib/text_rank/version.rb +3 -1
data/text_rank.gemspec +10 -10
metadata +48 -32

data/lib/text_rank/fingerprint.rb CHANGED

@@ -1,5 +1,3 @@
-require 'set'
 module TextRank
   ##
   # Class used to compare documents according to TextRank. A "fingerprint"
@@ -10,35 +8,35 @@ module TextRank
   # significant keywords.  But to prevent less significant keywords from being
   # completely ignored we apply an inverse log linear transformation to each of the
   # N prefixes.
-  #
+  #
   # For example, consider the following comparison:
-  #
+  #
   #   town man empty found
   #   vs.
   #   general empty found jar
-  #
+  #
   # The first pass considers just the first keywords: town vs. general.  As these
   # are different, they contribute 0.
-  #
+  #
   # The second pass considers the first two keywords: town man vs general empty.
   # Again, no overlap, so they contribute 0.
-  #
+  #
   # The third pass considers the first three keywords: town man empty vs general
   # empty found.  Here we have one overlap: empty. This contributes 1.
-  #
+  #
   # The fourth pass considers all, and there are two overlaps:  empty & found.  This
   # contributes 2.
-  #
+  #
   # We can represent the overlaps as the vector [0, 0, 1, 2].  Then we will apply
   # the inverse log linear transformation defined by:
-  #
+  #
   #   f(x_i) = x_i / ln(i + 1)
   #          = [0, 0, 1 / ln(4), 2 / ln(5)]
   #          = [0, 0, 0.7213475204444817, 1.2426698691192237]
-  #
+  #
   # Finally we take the average of the transformed vector and normalize it (to
   # ensure a final value between 0.0 and 1.0):
-  #
+  #
   #   norm(avg(SUM f(x_i))) = norm( avg(1.9640173895637054) )
   #                         = norm( 0.49100434739092635 )
   #                         = 0.49100434739092635 / avg(SUM f(1, 2, 3, 4))
@@ -61,28 +59,22 @@ module TextRank
     # Calculates the "similarity" between this fingerprint and another
     # @param other [Fingerprint] A second fingerprint to compare
     # @return [Number] A number between 0.0 (different) and 1.0 (same)
-    def similarity(trf2)
-      return 1.0 if values == trf2.values
-      sim = 0
-      s1 = Set.new
-      s2 = Set.new
+    def similarity(other)
+      return 1.0 if values == other.values # Short-circuit for efficiency
-      [size, trf2.size].max.times.reduce(0) do |sum, i|
-        v1 = values[i]
-        v2 = trf2.values[i]
-        if v1 == v2
-          sim += 1
-        else
-          s1.delete?(v2) ? (sim += 1) : (s2 << v2)
-          s2.delete?(v1) ? (sim += 1) : (s1 << v1)
-        end
-        sum + sim * linear_transform[i]
+      sum = 0
+      overlap(other).each_with_index do |overlap_value, i|
+        sum += overlap_value * linear_transform[i]
       end
+      sum
     end
     private
+    def overlap(other)
+      FingerprintOverlap.new(values, other.values).overlap
+    end
     def linear_transform
       @linear_transform ||= size.times.map do |i|
         1.0 / Math.log(i + 2) / size.to_f / norm_factor

data/lib/text_rank/fingerprint_overlap.rb ADDED

@@ -0,0 +1,55 @@
+module TextRank
+  ##
+  # Determines "overlap" between two fingerprints at each of the N prefixes
+  #
+  # For example,
+  #
+  #   FingerprintOverlap.new(
+  #     %w[a b c d],
+  #     %w[b e a c],
+  #   ).overlap
+  #
+  #   => [
+  #     0, # [a] & [b] have no overlap
+  #     1, # [a b] & [b e] have one overlap: b
+  #     2, # [a b c] & [b e a] have two overlaps: a & b
+  #     3, # [a b c d] & [b e a c] have three overlaps: a, b, & c
+  #   ]
+  ##
+  class FingerprintOverlap
+    attr_reader :overlap
+    def initialize(values1, values2)
+      raise ArgumentError, 'Value size mismatch' if values1.size != values2.size
+      @encountered1 = Set.new
+      @encountered2 = Set.new
+      @overlap_count = 0
+      @overlap = determine_overlap(values1, values2)
+    end
+    private
+    def determine_overlap(values1, values2)
+      values1.zip(values2).map do |v1, v2|
+        encounter(v1, v2)
+        @overlap_count
+      end
+    end
+    # This algorithm is a little more complex than the simplest Ruby
+    # implementation would be, but we want to keep it as performant as possible.
+    def encounter(value1, value2)
+      if value1 == value2
+        @overlap_count += 1
+      else
+        # Delete from the set in case an element appears more than once
+        @encountered1.delete?(value2) ? (@overlap_count += 1) : (@encountered2 << value2)
+        @encountered2.delete?(value1) ? (@overlap_count += 1) : (@encountered1 << value1)
+      end
+    end
+  end
+end

data/lib/text_rank/graph_strategy/coocurrence.rb CHANGED

@@ -61,18 +61,27 @@ module TextRank
       # @return [nil]
       def build_graph(tokens, graph)
         ngram_window = @ngram_size * 2 + 1
-        tokens.each_with_index do |token_i, i|
+        tokens.size.times do |i|
           ngram_window.times do |j|
-            next if j == @ngram_size || i + j < @ngram_size
-            token_j = tokens[i - @ngram_size + j]
-            if token_j
-              graph.add(token_i, token_j, weight: 1.0 / (j - @ngram_size).abs)
-            end
+            consider_ngram_window(tokens, graph, i, j)
           end
         end
         nil
       end
+      private
+      def consider_ngram_window(tokens, graph, i, j)
+        return if j == @ngram_size || i + j < @ngram_size
+        token_i = tokens[i]
+        token_j = tokens[i - @ngram_size + j]
+        if token_j
+          graph.add(token_i, token_j, weight: 1.0 / (j - @ngram_size).abs)
+        end
+      end
     end
   end
 end

data/lib/text_rank/keyword_extractor.rb CHANGED

@@ -13,9 +13,9 @@ module TextRank
     # @return [KeywordExtractor]
     def self.basic(**options)
       new(**{
-        char_filters:   [:AsciiFolding, :Lowercase],
-        tokenizers:     [:Word],
-        token_filters:  [:Stopwords, :MinLength],
+        char_filters:   %i[AsciiFolding Lowercase],
+        tokenizers:     %i[Word],
+        token_filters:  %i[Stopwords MinLength],
         graph_strategy: :Coocurrence,
       }.merge(options))
     end
@@ -25,11 +25,11 @@ module TextRank
     # @return [KeywordExtractor]
     def self.advanced(**options)
       new(**{
-        char_filters:   [:AsciiFolding, :Lowercase, :StripHtml, :StripEmail, :UndoContractions, :StripPossessive],
-        tokenizers:     [:Url, :Money, :Number, :Word, :Punctuation],
-        token_filters:  [:PartOfSpeech, :Stopwords, :MinLength],
+        char_filters:   %i[AsciiFolding Lowercase StripHtml StripEmail UndoContractions StripPossessive],
+        tokenizers:     %i[Url Money Number Word Punctuation],
+        token_filters:  %i[PartOfSpeech Stopwords MinLength],
         graph_strategy: :Coocurrence,
-        rank_filters:   [:CollapseAdjacent, :NormalizeUnitVector, :SortByValue],
+        rank_filters:   %i[CollapseAdjacent NormalizeUnitVector SortByValue],
       }.merge(options))
     end
@@ -41,14 +41,14 @@ module TextRank
     # @option options [Array<Class, Symbol, #filter!>]  :rank_filters A list of filters to be applied to the keyword ranks after keyword extraction
     def initialize(**options)
       @page_rank_options = {
-        strategy: options[:strategy] || :dense,
-        damping: options[:damping],
+        strategy:  options[:strategy] || :sparse,
+        damping:   options[:damping],
         tolerance: options[:tolerance],
       }
-      @char_filters   = options[:char_filters] || []
-      @tokenizers     = options[:tokenizers] || [Tokenizer::Word]
-      @token_filters  = options[:token_filters] || []
-      @rank_filters   = options[:rank_filters] || []
+      @char_filters = options[:char_filters] || []
+      @tokenizers = options[:tokenizers] || [Tokenizer::Word]
+      @token_filters = options[:token_filters] || []
+      @rank_filters = options[:rank_filters] || []
       @graph_strategy = options[:graph_strategy] || GraphStrategy::Coocurrence
     end
@@ -73,9 +73,7 @@ module TextRank
     # Sets the graph strategy for producing a graph from tokens
     # @param strategy [Class, Symbol, #build_graph] Strategy for producing a graph from tokens
     # @return [Class, Symbol, #build_graph]
-    def graph_strategy=(strategy)
-      @graph_strategy = strategy
-    end
+    attr_writer :graph_strategy
     # Add a new TokenFilter for processing tokens after tokenization
     # @param filter [Class, Symbol, #filter!] A filter to process tokens after tokenization
@@ -105,14 +103,23 @@ module TextRank
     end
     # Filter & tokenize text, and return PageRank
-    # @param text [String] unfiltered text to be processed
+    # @param text [String,Array<String>] unfiltered text to be processed
     # @return [Hash<String, Float>] tokens and page ranks (in descending order)
     def extract(text, **options)
-      tokens = tokenize(text)
+      text = Array(text)
+      tokens_per_text = text.map do |t|
+        tokenize(t)
+      end
       graph = PageRank.new(**@page_rank_options)
-      classify(@graph_strategy, context: GraphStrategy).build_graph(tokens, graph)
+      strategy = classify(@graph_strategy, context: GraphStrategy)
+      tokens_per_text.each do |tokens|
+        strategy.build_graph(tokens, graph)
+      end
       ranks = graph.calculate(**options)
-      apply_rank_filters(ranks, tokens: tokens, original_text: text)
+      tokens_per_text.each_with_index do |tokens, i|
+        ranks = apply_rank_filters(ranks, tokens: tokens, original_text: text[i])
+      end
+      ranks
     end
     private
@@ -153,14 +160,14 @@ module TextRank
       array.insert(idx, value)
     end
-    def classify(c, context: self)
-      case c
+    def classify(clazz, context: self)
+      case clazz
       when Class
-        c.new
+        clazz.new
       when Symbol
-        context.const_get(c).new
+        context.const_get(clazz).new
       else
-        c
+        clazz
       end
     end

data/lib/text_rank/rank_filter/collapse_adjacent.rb CHANGED

@@ -77,6 +77,7 @@ module TextRank
       class TokenCollapser
+        # rubocop:disable Metrics/ParameterLists
         def initialize(tokens:, text:, ranks_to_collapse: 10, max_tokens_to_combine: 2, ignore_case: true, delimiter: ' ', **_)
           @tokens = tokens
           @text = text
@@ -91,6 +92,7 @@ module TextRank
           @permutations_scanned = Hash.new(0.0) # Track how many occurrences of each permutation we found in the original text
           @combination_significance_threshold = 0.3 # The percent of occurrences of a combo of tokens to the occurrences of single tokens required to force collapsing
         end
+        # rubocop:enable Metrics/ParameterLists
         # :nodoc:
         def delimiter_re
@@ -104,18 +106,36 @@ module TextRank
           # single tokens from below the cut to above it.  So we'll continue searching
           # until all of the top N final keywords (single or collapsed) have been
           # considered.
-          loop do
-            single_tokens_to_consider = @tokens.keys.first(@ranks_to_collapse + @to_remove.size - @to_collapse.size) - @to_remove.to_a
-            scan_text_for_all_permutations_of(single_tokens_to_consider) or break
-            decide_what_to_collapse_and_what_to_remove
+          while collapse_attempt
+            # keep trying
           end
           # We now know what to collapse and what to remove, so we can start safely
           # modifying the tokens hash
+          apply_collapse
+        end
+        # :nodoc:
+        def collapse_attempt
+          regexp_safe_tokens = @tokens.keys.select { |s| Regexp.escape(s) == s }
+          single_tokens_to_consider = regexp_safe_tokens.first(@ranks_to_collapse + @to_remove.size - @to_collapse.size) - @to_remove.to_a
+          scan_text_for_all_permutations_of(single_tokens_to_consider) or return false
+          decide_what_to_collapse_and_what_to_remove
+          true
+        end
+        # :nodoc:
+        def apply_collapse
           @to_collapse.each do |perm|
-            values = @tokens.values_at(*perm)
+            values = @tokens.values_at(*perm).compact
+            # This might be empty if somehow the scanned permutation doesn't
+            # exactly match one of the tokens (e.g. ASCII-folding gone awry).
+            # The goal is to do the best we can, so if we can't find it, ignore.
+            next if values.empty?
             @tokens[perm.join(@delimiter)] = values.reduce(:+) / values.size
           end
           @tokens.reject! do |k, _|
             @to_remove.include?(k)
           end || @tokens
@@ -131,16 +151,10 @@ module TextRank
         # tokenization (e.g. ASCII folding).  That's okay.  We're just making the best effort we can
         # to find what we can.
         def scan_text_for_all_permutations_of(single_tokens)
-          perms = []
           # NOTE that by reversing the order we craft the regex to prefer larger combinations over
           # smaller combinations (or singletons).
-          (1..@max_tokens_to_combine).to_a.reverse.map do |nn|
-            single_tokens.permutation(nn).each do |perm|
-              unless @permutations_scanned.key?(perm)
-                @permutations_scanned[perm] = 0
-                perms << perm
-              end
-            end
+          perms = (1..@max_tokens_to_combine).to_a.reverse.flat_map do |n|
+            scan_text_for_n_permutations_of(single_tokens, n)
           end
           scan_text_for(perms) do |s|
             s = s.downcase if @ignore_case
@@ -148,6 +162,15 @@ module TextRank
           end unless perms.empty?
         end
+        def scan_text_for_n_permutations_of(single_tokens, n)
+          single_tokens.permutation(n).map do |perm|
+            unless @permutations_scanned.key?(perm)
+              @permutations_scanned[perm] = 0
+              perm
+            end
+          end.compact
+        end
         # Because we're scanning the original text, we've lost all of the character filtering we did
         # prior to tokenization, but that's important because we need the original context to be more
         # choosy.  Still, we need to know what delimiter goes between collapsed tokens (since it may
@@ -174,25 +197,30 @@ module TextRank
         # modifications to the original token list yet but just keep track of what we plan
         # to collapse/remove.
         def decide_what_to_collapse_and_what_to_remove
-          non_empty_ordered = @permutations_scanned.select do |k, v|
-            v > 0
-          end.sort_by do |k, v|
-            [-v, -k.size] # reverse order
-          end
           tokens_encountered = []
-          non_empty_ordered.each do |perm, perm_count|
+          permutations_to_consider_collapsing.each do |perm, perm_count|
             if perm.size > 1
-              singles_to_remove = perm - tokens_encountered
-              if !singles_to_remove.empty? || combination_significant?(perm, perm_count)
-                @to_collapse << perm if perm.size > 1
-                @to_remove |= singles_to_remove
-              end
+              decide_to_collapse_or_remove(perm, perm_count, singles_to_remove: perm - tokens_encountered)
             end
             tokens_encountered += perm
           end
         end
+        def permutations_to_consider_collapsing
+          @permutations_scanned.select do |_k, v|
+            v.positive?
+          end.sort_by do |k, v|
+            [-v, -k.size] # reverse order
+          end
+        end
+        def decide_to_collapse_or_remove(perm, perm_count, singles_to_remove:)
+          if !singles_to_remove.empty? || combination_significant?(perm, perm_count)
+            @to_collapse << perm if perm.size > 1
+            @to_remove |= singles_to_remove
+          end
+        end
         # Even if we encounter a potential collapsed key which occurs less often than the single tokens that make it up,
         # we still want to add the collapsed key if it shows up "enough" times.
         def combination_significant?(perm, perm_count)

data/lib/text_rank/rank_filter/normalize_probability.rb CHANGED

@@ -44,8 +44,9 @@ module TextRank
       # @return [Hash<String, Float>]
       def filter!(ranks, **_)
         return if ranks.empty?
         total = ranks.values.reduce(:+)
-        Hash[ranks.map { |k, v| [k, v / total] }]
+        ranks.transform_values { |v| v / total }
       end
     end

data/lib/text_rank/rank_filter/normalize_unit_vector.rb CHANGED

@@ -45,8 +45,9 @@ module TextRank
       # @return [Hash<String, Float>]
       def filter!(ranks, **_)
         return if ranks.empty?
         total = Math.sqrt(ranks.values.map { |v| v * v }.reduce(:+))
-        Hash[ranks.map { |k, v| [k, v / total] }]
+        ranks.transform_values { |v| v / total }
       end
     end

data/lib/text_rank/token_filter/part_of_speech.rb CHANGED

@@ -1,5 +1,4 @@
 require 'engtagger'
-require 'set'
 module TextRank
   module TokenFilter