text_rank 1.2.3 → 1.2.4

@@ -1,5 +1,3 @@
- require 'set'
-
  module TextRank
    ##
    # Class used to compare documents according to TextRank. A "fingerprint"
@@ -61,28 +59,22 @@ module TextRank
      # Calculates the "similarity" between this fingerprint and another
      # @param {Fingerprint} A second fingerprint to compare
      # @return [Number] A number between 0.0 (different) and 1.0 (same)
-     def similarity(trf2)
-       return 1.0 if values == trf2.values
-
-       sim = 0
-       s1 = Set.new
-       s2 = Set.new
+     def similarity(other)
+       return 1.0 if values == other.values # Short-circuit for efficiency

-       [size, trf2.size].max.times.reduce(0) do |sum, i|
-         v1 = values[i]
-         v2 = trf2.values[i]
-         if v1 == v2
-           sim += 1
-         else
-           s1.delete?(v2) ? (sim += 1) : (s2 << v2)
-           s2.delete?(v1) ? (sim += 1) : (s1 << v1)
-         end
-         sum + sim * linear_transform[i]
+       sum = 0
+       overlap(other).each_with_index do |overlap_value, i|
+         sum += overlap_value * linear_transform[i]
        end
+       sum
      end

      private

+     def overlap(other)
+       FingerprintOverlap.new(values, other.values).overlap
+     end
+
      def linear_transform
        @linear_transform ||= size.times.map do |i|
          1.0 / Math.log(i + 2) / size.to_f / norm_factor
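The rewritten `similarity` delegates the pairwise bookkeeping to the new `FingerprintOverlap` class (added in the next hunk) and keeps only the weighted sum. A hedged usage sketch; `Fingerprint.new(*values)` is an assumption about the constructor, since this hunk only shows `#values` and `#similarity`:

```ruby
require 'text_rank'

# Hypothetical fingerprints built from ranked keyword lists; the constructor
# call is assumed, not confirmed by this diff.
fp1 = TextRank::Fingerprint.new(*%w[ruby rails gem bundler])
fp2 = TextRank::Fingerprint.new(*%w[ruby gem rails rake])

fp1.similarity(fp2) # => Float between 0.0 (different) and 1.0 (same)
```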
@@ -0,0 +1,55 @@
+ module TextRank
+   ##
+   # Determines "overlap" between two fingerprints at each N prefixes
+   #
+   # For example,
+   #
+   #   FingerprintOverlap.new(
+   #     %w[a b c d],
+   #     %w[b e a c],
+   #   ).overlap
+   #
+   #   => [
+   #     0, # [a] & [b] have no overlap
+   #     1, # [a b] & [b e] have one overlap: b
+   #     2, # [a b c] & [b e a] have two overlaps: a & b
+   #     3, # [a b c d] & [b e a c] have three overlaps: a, b, & c
+   #   ]
+   ##
+   class FingerprintOverlap
+
+     attr_reader :overlap
+
+     def initialize(values1, values2)
+       raise ArgumentError, 'Value size mismatch' if values1.size != values2.size
+
+       @encountered1 = Set.new
+       @encountered2 = Set.new
+       @overlap_count = 0
+
+       @overlap = determine_overlap(values1, values2)
+     end
+
+     private
+
+     def determine_overlap(values1, values2)
+       values1.zip(values2).map do |v1, v2|
+         encounter(v1, v2)
+         @overlap_count
+       end
+     end
+
+     # This algorithm is a little more complex than idiomatic Ruby might
+     # suggest, but we want to keep it as performant as possible.
+     def encounter(value1, value2)
+       if value1 == value2
+         @overlap_count += 1
+       else
+         # Delete from the set in case an element appears more than once
+         @encountered1.delete?(value2) ? (@overlap_count += 1) : (@encountered2 << value2)
+         @encountered2.delete?(value1) ? (@overlap_count += 1) : (@encountered1 << value1)
+       end
+     end
+
+   end
+ end
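For intuition, the prefix-overlap values documented above can be reproduced with a naive quadratic version using `Array#&`. This reference sketch is illustrative only; the gem's set-based version avoids re-scanning every prefix:

```ruby
# Overlap of the first n elements of each list, for every prefix length n.
def naive_overlap(values1, values2)
  (1..values1.size).map do |n|
    (values1.first(n) & values2.first(n)).size
  end
end

naive_overlap(%w[a b c d], %w[b e a c]) # => [0, 1, 2, 3]
```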
@@ -61,18 +61,27 @@ module TextRank
        # @return [nil]
        def build_graph(tokens, graph)
          ngram_window = @ngram_size * 2 + 1
-         tokens.each_with_index do |token_i, i|
+         tokens.size.times do |i|
            ngram_window.times do |j|
-             next if j == @ngram_size || i + j < @ngram_size
-             token_j = tokens[i - @ngram_size + j]
-             if token_j
-               graph.add(token_i, token_j, weight: 1.0 / (j - @ngram_size).abs)
-             end
+             consider_ngram_window(tokens, graph, i, j)
            end
          end
          nil
        end

+       private
+
+       def consider_ngram_window(tokens, graph, i, j)
+         return if j == @ngram_size || i + j < @ngram_size
+
+         token_i = tokens[i]
+         token_j = tokens[i - @ngram_size + j]
+
+         if token_j
+           graph.add(token_i, token_j, weight: 1.0 / (j - @ngram_size).abs)
+         end
+       end
+
      end
    end
  end
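The extracted `consider_ngram_window` preserves the original semantics: each token links to up to `@ngram_size` neighbors on each side, weighted by inverse distance. A standalone sketch of the window arithmetic with toy data and `puts` in place of a real graph:

```ruby
tokens = %w[the quick brown fox]
ngram_size = 2
window = ngram_size * 2 + 1

tokens.size.times do |i|
  window.times do |j|
    next if j == ngram_size || i + j < ngram_size # skip self and the out-of-range left edge

    neighbor = tokens[i - ngram_size + j]
    next unless neighbor # nil past the right edge

    puts "#{tokens[i]} -> #{neighbor} (weight #{1.0 / (j - ngram_size).abs})"
  end
end
# the -> quick (weight 1.0)
# the -> brown (weight 0.5)
# ...
```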
@@ -13,9 +13,9 @@ module TextRank
      # @return [KeywordExtractor]
      def self.basic(**options)
        new(**{
-         char_filters: [:AsciiFolding, :Lowercase],
-         tokenizers: [:Word],
-         token_filters: [:Stopwords, :MinLength],
+         char_filters: %i[AsciiFolding Lowercase],
+         tokenizers: %i[Word],
+         token_filters: %i[Stopwords MinLength],
          graph_strategy: :Coocurrence,
        }.merge(options))
      end
@@ -25,11 +25,11 @@ module TextRank
      # @return [KeywordExtractor]
      def self.advanced(**options)
        new(**{
-         char_filters: [:AsciiFolding, :Lowercase, :StripHtml, :StripEmail, :UndoContractions, :StripPossessive],
-         tokenizers: [:Url, :Money, :Number, :Word, :Punctuation],
-         token_filters: [:PartOfSpeech, :Stopwords, :MinLength],
+         char_filters: %i[AsciiFolding Lowercase StripHtml StripEmail UndoContractions StripPossessive],
+         tokenizers: %i[Url Money Number Word Punctuation],
+         token_filters: %i[PartOfSpeech Stopwords MinLength],
          graph_strategy: :Coocurrence,
-         rank_filters: [:CollapseAdjacent, :NormalizeUnitVector, :SortByValue],
+         rank_filters: %i[CollapseAdjacent NormalizeUnitVector SortByValue],
        }.merge(options))
      end

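Both factory hunks are pure style changes to `%i[...]` symbol-array literals; behavior is identical. Usage, per the gem's documented API (the return value here is illustrative, not real output):

```ruby
require 'text_rank'

extractor = TextRank::KeywordExtractor.advanced(damping: 0.85) # defaults remain overridable
extractor.extract('TextRank ranks keywords by co-occurrence.')
# => { "keyword" => rank, ... } (a Hash of keyword to PageRank score)
```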
@@ -41,14 +41,14 @@ module TextRank
      # @option options [Array<Class, Symbol, #filter!>] :rank_filters A list of filters to be applied to the keyword ranks after keyword extraction
      def initialize(**options)
        @page_rank_options = {
-         strategy: options[:strategy] || :sparse,
-         damping: options[:damping],
+         strategy:  options[:strategy] || :sparse,
+         damping:   options[:damping],
          tolerance: options[:tolerance],
        }
-       @char_filters = options[:char_filters] || []
-       @tokenizers = options[:tokenizers] || [Tokenizer::Word]
-       @token_filters = options[:token_filters] || []
-       @rank_filters = options[:rank_filters] || []
+       @char_filters   = options[:char_filters] || []
+       @tokenizers     = options[:tokenizers] || [Tokenizer::Word]
+       @token_filters  = options[:token_filters] || []
+       @rank_filters   = options[:rank_filters] || []
        @graph_strategy = options[:graph_strategy] || GraphStrategy::Coocurrence
      end

@@ -73,9 +73,7 @@ module TextRank
      # Sets the graph strategy for producing a graph from tokens
      # @param strategy [Class, Symbol, #build_graph] Strategy for producing a graph from tokens
      # @return [Class, Symbol, #build_graph]
-     def graph_strategy=(strategy)
-       @graph_strategy = strategy
-     end
+     attr_writer :graph_strategy

      # Add a new TokenFilter for processing tokens after tokenization
      # @param filter [Class, Symbol, #filter!] A filter to process tokens after tokenization
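`attr_writer` generates exactly the writer method being deleted, so this is behavior-preserving. A quick standalone check:

```ruby
class Example
  attr_writer :graph_strategy
end

Example.new.respond_to?(:graph_strategy=) # => true
```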
@@ -153,14 +151,14 @@ module TextRank
        array.insert(idx, value)
      end

-     def classify(c, context: self)
-       case c
+     def classify(clazz, context: self)
+       case clazz
        when Class
-         c.new
+         clazz.new
        when Symbol
-         context.const_get(c).new
+         context.const_get(clazz).new
        else
-         c
+         clazz
        end
      end

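The `c` to `clazz` rename is cosmetic. What `classify` does, as a self-contained sketch with a hypothetical `Widget` class:

```ruby
module Toy
  class Widget; end

  def self.classify(clazz, context: self)
    case clazz
    when Class  then clazz.new                    # a class: instantiate it
    when Symbol then context.const_get(clazz).new # a name: look it up, then instantiate
    else clazz                                    # anything else: assume it's already usable
    end
  end
end

Toy.classify(Toy::Widget).class # => Toy::Widget
Toy.classify(:Widget).class     # => Toy::Widget
Toy.classify('custom filter')   # => "custom filter" (passed through)
```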
@@ -77,6 +77,7 @@ module TextRank

      class TokenCollapser

+       # rubocop:disable Metrics/ParameterLists
        def initialize(tokens:, text:, ranks_to_collapse: 10, max_tokens_to_combine: 2, ignore_case: true, delimiter: ' ', **_)
          @tokens = tokens
          @text = text
@@ -91,6 +92,7 @@ module TextRank
          @permutations_scanned = Hash.new(0.0) # Track how many occurrences of each permutation we found in the original text
          @combination_significance_threshold = 0.3 # The percent of occurrences of a combo of tokens to the occurrences of single tokens required to force collapsing
        end
+       # rubocop:enable Metrics/ParameterLists

        # :nodoc:
        def delimiter_re
@@ -104,23 +106,36 @@ module TextRank
          # single tokens from below the cut to above it. So we'll continue searching
          # until all of the top N final keywords (single or collapsed) have been
          # considered.
-         loop do
-           regexp_safe_tokens = @tokens.keys.select { |s| Regexp.escape(s) == s }
-           single_tokens_to_consider = regexp_safe_tokens.first(@ranks_to_collapse + @to_remove.size - @to_collapse.size) - @to_remove.to_a
-           scan_text_for_all_permutations_of(single_tokens_to_consider) or break
-           decide_what_to_collapse_and_what_to_remove
+         while collapse_attempt
+           # keep trying
          end

          # We now know what to collapse and what to remove, so we can start safely
          # modifying the tokens hash
+         apply_collapse
+       end
+
+       # :nodoc:
+       def collapse_attempt
+         regexp_safe_tokens = @tokens.keys.select { |s| Regexp.escape(s) == s }
+         single_tokens_to_consider = regexp_safe_tokens.first(@ranks_to_collapse + @to_remove.size - @to_collapse.size) - @to_remove.to_a
+         scan_text_for_all_permutations_of(single_tokens_to_consider) or return false
+         decide_what_to_collapse_and_what_to_remove
+         true
+       end
+
+       # :nodoc:
+       def apply_collapse
          @to_collapse.each do |perm|
            values = @tokens.values_at(*perm).compact
            # This might be empty if somehow the scanned permutation doesn't
            # exactly match one of the tokens (e.g. ASCII-folding gone awry).
            # The goal is to do the best we can, so if we can't find it, ignore.
            next if values.empty?
+
            @tokens[perm.join(@delimiter)] = values.reduce(:+) / values.size
          end
+
          @tokens.reject! do |k, _|
            @to_remove.include?(k)
          end || @tokens
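The `loop do ... or break` construct becomes a `while` driven by a predicate method that returns false once there is nothing left to scan. The shape of the pattern, reduced to a toy example (not the gem's logic):

```ruby
queue = [3, 2, 1]

# Returns false to end the loop, mirroring `or return false` above.
def attempt(queue)
  queue.pop ? true : false
end

while attempt(queue)
  # keep trying
end

queue # => []
```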
@@ -136,16 +151,10 @@ module TextRank
        # tokenization (e.g. ASCII folding). That's okay. We're just making the best effort we can
        # to find what we can.
        def scan_text_for_all_permutations_of(single_tokens)
-         perms = []
          # NOTE that by reversing the order we craft the regex to prefer larger combinations over
          # smaller combinations (or singletons).
-         (1..@max_tokens_to_combine).to_a.reverse.map do |nn|
-           single_tokens.permutation(nn).each do |perm|
-             unless @permutations_scanned.key?(perm)
-               @permutations_scanned[perm] = 0
-               perms << perm
-             end
-           end
+         perms = (1..@max_tokens_to_combine).to_a.reverse.flat_map do |n|
+           scan_text_for_n_permutations_of(single_tokens, n)
          end
          scan_text_for(perms) do |s|
            s = s.downcase if @ignore_case
@@ -153,6 +162,15 @@ module TextRank
          end unless perms.empty?
        end

+       def scan_text_for_n_permutations_of(single_tokens, n)
+         single_tokens.permutation(n).map do |perm|
+           unless @permutations_scanned.key?(perm)
+             @permutations_scanned[perm] = 0
+             perm
+           end
+         end.compact
+       end
+
        # Because we're scanning the original text, we've lost all of the character filtering we did
        # prior to tokenization, but that's important because we need the original context to be more
        # choosy. Still, we need to know what delimiter goes between collapsed tokens (since it may
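The `flat_map` version collects the same permutations as the old nested loops, still visiting larger combinations first. A toy run of the extracted helper's logic:

```ruby
seen = {}

perms = (1..2).to_a.reverse.flat_map do |n|
  %w[a b].permutation(n).map do |perm|
    next if seen.key?(perm) # skip permutations already scanned

    seen[perm] = 0
    perm
  end.compact
end

perms # => [["a", "b"], ["b", "a"], ["a"], ["b"]]
```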
@@ -179,25 +197,30 @@ module TextRank
        # modifications to the original token list yet but just keep track of what we plan
        # to collapse/remove.
        def decide_what_to_collapse_and_what_to_remove
-         non_empty_ordered = @permutations_scanned.select do |k, v|
-           v > 0
-         end.sort_by do |k, v|
-           [-v, -k.size] # reverse order
-         end
-
          tokens_encountered = []
-         non_empty_ordered.each do |perm, perm_count|
+         permutations_to_consider_collapsing.each do |perm, perm_count|
            if perm.size > 1
-             singles_to_remove = perm - tokens_encountered
-             if !singles_to_remove.empty? || combination_significant?(perm, perm_count)
-               @to_collapse << perm if perm.size > 1
-               @to_remove |= singles_to_remove
-             end
+             decide_to_collapse_or_remove(perm, perm_count, singles_to_remove: perm - tokens_encountered)
            end
            tokens_encountered += perm
          end
        end

+       def permutations_to_consider_collapsing
+         @permutations_scanned.select do |_k, v|
+           v.positive?
+         end.sort_by do |k, v|
+           [-v, -k.size] # reverse order
+         end
+       end
+
+       def decide_to_collapse_or_remove(perm, perm_count, singles_to_remove:)
+         if !singles_to_remove.empty? || combination_significant?(perm, perm_count)
+           @to_collapse << perm if perm.size > 1
+           @to_remove |= singles_to_remove
+         end
+       end
+
        # Even if we encounter a potential collapsed key which occurs less often than the single tokens that make it up,
        # we still want to add the collapsed key if it shows up "enough" times.
        def combination_significant?(perm, perm_count)
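`permutations_to_consider_collapsing` keeps the original ordering: occurrence count descending, ties broken by larger permutations first. A quick check of that sort key on toy data:

```ruby
perm_counts = { %w[a] => 2.0, %w[b c] => 2.0, %w[d] => 5.0 }

perm_counts.select { |_k, v| v.positive? }
           .sort_by { |k, v| [-v, -k.size] }
# => [[["d"], 5.0], [["b", "c"], 2.0], [["a"], 2.0]]
```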
@@ -44,8 +44,9 @@ module TextRank
        # @return [Hash<String, Float>]
        def filter!(ranks, **_)
          return if ranks.empty?
+
          total = ranks.values.reduce(:+)
-         Hash[ranks.map { |k, v| [k, v / total] }]
+         ranks.transform_values { |v| v / total }
        end

      end
@@ -45,8 +45,9 @@ module TextRank
        # @return [Hash<String, Float>]
        def filter!(ranks, **_)
          return if ranks.empty?
+
          total = Math.sqrt(ranks.values.map { |v| v * v }.reduce(:+))
-         Hash[ranks.map { |k, v| [k, v / total] }]
+         ranks.transform_values { |v| v / total }
        end

      end
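Both normalization filters replace the `Hash[...map...]` idiom with `Hash#transform_values` (Ruby 2.4+), which is equivalent here and skips rebuilding the key/value pairs by hand:

```ruby
ranks = { 'cat' => 3.0, 'mat' => 1.0 }
total = ranks.values.reduce(:+)

Hash[ranks.map { |k, v| [k, v / total] }] # => {"cat"=>0.75, "mat"=>0.25}
ranks.transform_values { |v| v / total }  # => {"cat"=>0.75, "mat"=>0.25}
```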
@@ -1,5 +1,4 @@
  require 'engtagger'
- require 'set'

  module TextRank
    module TokenFilter
@@ -1,5 +1,3 @@
- require 'set'
-
  module TextRank
    module TokenFilter
      ##
@@ -15,325 +13,7 @@ module TextRank
      class Stopwords

        # Default English stop-word list.
-       STOP_WORDS = Set.new(%w[
-         a
-         about
-         above
-         across
-         after
-         afterwards
-         again
-         against
-         all
-         almost
-         alone
-         along
-         already
-         also
-         although
-         always
-         am
-         among
-         amongst
-         amoungst
-         amount
-         an
-         and
-         another
-         any
-         anyhow
-         anyone
-         anything
-         anyway
-         anywhere
-         are
-         around
-         as
-         at
-         back
-         be
-         became
-         because
-         become
-         becomes
-         becoming
-         been
-         before
-         beforehand
-         behind
-         being
-         below
-         beside
-         besides
-         between
-         beyond
-         bill
-         both
-         bottom
-         but
-         by
-         call
-         can
-         cannot
-         cant
-         co
-         con
-         could
-         couldnt
-         cry
-         de
-         describe
-         detail
-         do
-         done
-         down
-         due
-         during
-         each
-         eg
-         eight
-         either
-         eleven
-         else
-         elsewhere
-         empty
-         enough
-         etc
-         even
-         ever
-         every
-         everyone
-         everything
-         everywhere
-         except
-         few
-         fifteen
-         fify
-         fill
-         find
-         fire
-         first
-         five
-         for
-         former
-         formerly
-         forty
-         found
-         four
-         from
-         front
-         full
-         further
-         get
-         give
-         go
-         had
-         has
-         hasnt
-         have
-         he
-         hence
-         her
-         here
-         hereafter
-         hereby
-         herein
-         hereupon
-         hers
-         herself
-         him
-         himself
-         his
-         how
-         however
-         hundred
-         ie
-         if
-         in
-         inc
-         indeed
-         interest
-         into
-         is
-         it
-         its
-         itself
-         keep
-         last
-         latter
-         latterly
-         least
-         less
-         ltd
-         made
-         many
-         may
-         me
-         meanwhile
-         might
-         mill
-         mine
-         more
-         moreover
-         most
-         mostly
-         move
-         much
-         must
-         my
-         myself
-         name
-         namely
-         neither
-         never
-         nevertheless
-         next
-         nine
-         no
-         nobody
-         none
-         noone
-         nor
-         not
-         nothing
-         now
-         nowhere
-         of
-         off
-         often
-         on
-         once
-         one
-         only
-         onto
-         or
-         other
-         others
-         otherwise
-         our
-         ours
-         ourselves
-         out
-         over
-         own
-         part
-         per
-         perhaps
-         please
-         put
-         rather
-         re
-         same
-         see
-         seem
-         seemed
-         seeming
-         seems
-         serious
-         several
-         she
-         should
-         show
-         side
-         since
-         sincere
-         six
-         sixty
-         so
-         some
-         somehow
-         someone
-         something
-         sometime
-         sometimes
-         somewhere
-         still
-         such
-         system
-         take
-         ten
-         than
-         that
-         the
-         their
-         them
-         themselves
-         then
-         thence
-         there
-         thereafter
-         thereby
-         therefore
-         therein
-         thereupon
-         these
-         they
-         thickv
-         thin
-         third
-         this
-         those
-         though
-         three
-         through
-         throughout
-         thru
-         thus
-         to
-         together
-         too
-         top
-         toward
-         towards
-         twelve
-         twenty
-         two
-         un
-         under
-         until
-         up
-         upon
-         us
-         very
-         via
-         was
-         we
-         well
-         were
-         what
-         whatever
-         when
-         whence
-         whenever
-         where
-         whereafter
-         whereas
-         whereby
-         wherein
-         whereupon
-         wherever
-         whether
-         which
-         while
-         whither
-         who
-         whoever
-         whole
-         whom
-         whose
-         why
-         will
-         with
-         within
-         without
-         would
-         yet
-         you
-         your
-         yours
-         yourself
-         yourselves
-       ])
+       STOP_WORDS = Set.new(YAML.load_file(File.expand_path('stopwords.yml', __dir__)))

        # Perform the filter
        # @param tokens [Array<String>]
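The word list itself is unchanged; it only moves out of the Ruby source into a data file. A sketch of the loading step, assuming `stopwords.yml` is a flat YAML array of words (the file is not shown in this diff):

```ruby
require 'set'
require 'yaml'

# stopwords.yml is assumed to look like:
#   - a
#   - about
#   - above
#   ...
words = YAML.load_file(File.expand_path('stopwords.yml', __dir__))
STOP_WORDS = Set.new(words)

STOP_WORDS.include?('the') # => true
```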