RubyGems - text_rank - Versions diffs - 1.2.3 → 1.3.0 - Mend

text_rank 1.2.3 → 1.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (43) hide show

checksums.yaml +4 -4
data/.codeclimate.yml +1 -1
data/.gitignore +4 -0
data/.rubocop.yml +7 -0
data/.ruby-version +1 -1
data/.travis.yml +1 -0
data/Rakefile +5 -0
data/bin/console +3 -3
data/ext/text_rank/extconf.rb +3 -0
data/ext/text_rank/page_rank_sparse_native.c +300 -0
data/ext/text_rank/page_rank_sparse_native.h +93 -0
data/ext/text_rank/text_rank.c +5 -0
data/lib/page_rank/base.rb +12 -9
data/lib/page_rank/dense.rb +3 -2
data/lib/page_rank/sparse.rb +6 -7
data/lib/page_rank/sparse_native.rb +21 -0
data/lib/page_rank.rb +7 -4
data/lib/text_rank/char_filter/ascii_folding.rb +5 -1
data/lib/text_rank/char_filter/strip_possessive.rb +2 -2
data/lib/text_rank/char_filter/undo_contractions.rb +1 -137
data/lib/text_rank/char_filter/undo_contractions.yml +135 -0
data/lib/text_rank/char_filter.rb +1 -1
data/lib/text_rank/fingerprint.rb +10 -18
data/lib/text_rank/fingerprint_overlap.rb +55 -0
data/lib/text_rank/graph_strategy/coocurrence.rb +15 -6
data/lib/text_rank/keyword_extractor.rb +32 -25
data/lib/text_rank/rank_filter/collapse_adjacent.rb +48 -25
data/lib/text_rank/rank_filter/normalize_probability.rb +2 -1
data/lib/text_rank/rank_filter/normalize_unit_vector.rb +2 -1
data/lib/text_rank/token_filter/part_of_speech.rb +0 -1
data/lib/text_rank/token_filter/stopwords.rb +1 -321
data/lib/text_rank/token_filter/stopwords.yml +317 -0
data/lib/text_rank/tokenizer/money.rb +11 -6
data/lib/text_rank/tokenizer/number.rb +4 -3
data/lib/text_rank/tokenizer/punctuation.rb +4 -1
data/lib/text_rank/tokenizer/url.rb +3 -0
data/lib/text_rank/tokenizer/whitespace.rb +4 -1
data/lib/text_rank/tokenizer/word.rb +5 -2
data/lib/text_rank/tokenizer.rb +1 -1
data/lib/text_rank/version.rb +3 -1
data/lib/text_rank.rb +14 -9
data/text_rank.gemspec +4 -1
metadata +48 -12

data/lib/text_rank/char_filter/ascii_folding.rb CHANGED Viewed

@@ -1,13 +1,17 @@
-# coding: utf-8
 module TextRank
   module CharFilter
     ##
     # Character filter to transform non-ASCII (Unicode) characters into ASCII-friendly versions.
     #
+    # rubocop:disable Style/AsciiComments
+    #
     # = Example
     #
     #  AsciiFolding.new.filter!("the Perigordian Abbé then made answer, because a poor beggar of the country of Atrébatie heard some foolish things said")
     #  => "the Perigordian Abbe then made answer, because a poor beggar of the country of Atrebatie heard some foolish things said"
+    #
+    # rubocop:enable Style/AsciiComments
+    #
     ##
     class AsciiFolding

data/lib/text_rank/char_filter/strip_possessive.rb CHANGED Viewed

@@ -5,7 +5,7 @@ module TextRank
     #
     # = Example
     #
-    #  StripPosessive.new.filter!("to loathe one’s very being and yet to hold it fast")
+    #  StripPossessive.new.filter!("to loathe one's very being and yet to hold it fast")
     #  => "to loathe one very being and yet to hold it fast"
     ##
     class StripPossessive
@@ -15,7 +15,7 @@ module TextRank
       # @return [String]
       def filter!(text)
         text.gsub!(/([a-z]+)'s\b/) do
-          $1
+          Regexp.last_match(1)
         end
       end

data/lib/text_rank/char_filter/undo_contractions.rb CHANGED Viewed

@@ -11,143 +11,7 @@ module TextRank
     class UndoContractions
       # List of English contractions to undo
-      CONTRACTIONS = {
-        "ain't"            => "am not",
-        "amn't"            => "am not",
-        "aren't"           => "are not",
-        "can't"            => "can not",
-        "could've"         => "could have",
-        "couldn't"         => "could not",
-        "couldn't've"      => "could not have",
-        "didn't"           => "did not",
-        "doesn't"          => "does not",
-        "don't"            => "do not",
-        "gonna"            => "going to",
-        "hadn't"           => "had not",
-        "hadn't've"        => "had not have",
-        "hasn't"           => "has not",
-        "haven't"          => "have not",
-        "he'd"             => "he had",
-        "he'd've"          => "he would have",
-        "he'll"            => "he shall",
-        "he's"             => "he has",
-        "he'sn't"          => "he has not",
-        "how'd"            => "how did",
-        "how'll"           => "how will",
-        "how's"            => "how has",
-        "i'd"              => "i had",
-        "i'd've"           => "i would have",
-        "i'll"             => "i shall",
-        "i'm"              => "i am",
-        "i've"             => "i have",
-        "i'ven't"          => "i have not",
-        "isn't"            => "is not",
-        "it'd"             => "it had",
-        "it'd've"          => "it would have",
-        "it'll"            => "it shall",
-        "it's"             => "it has",
-        "it'sn't"          => "it has not",
-        "let's"            => "let us",
-        "ma'am"            => "madam",
-        "mightn't"         => "might not",
-        "mightn't've"      => "might not have",
-        "might've"         => "might have",
-        "mustn't"          => "must not",
-        "must've"          => "must have",
-        "needn't"          => "need not",
-        "not've"           => "not have",
-        "o'clock"          => "of the clock",
-        "ol'"              => "old",
-        "oughtn't"         => "ought not",
-        "shan't"           => "shall not",
-        "she'd"            => "she had",
-        "she'd've"         => "she would have",
-        "she'll"           => "she shall",
-        "she's"            => "she has",
-        "she'sn't"         => "she has not",
-        "should've"        => "should have",
-        "shouldn't"        => "should not",
-        "shouldn't've"     => "should not have",
-        "somebody'd"       => "somebody had",
-        "somebody'd've"    => "somebody would have",
-        "somebody'dn't've" => "somebody would not have",
-        "somebody'll"      => "somebody shall",
-        "somebody's"       => "somebody has",
-        "someone'd"        => "someone had",
-        "someone'd've"     => "someone would have",
-        "someone'll"       => "someone shall",
-        "someone's"        => "someone has",
-        "something'd"      => "something had",
-        "something'd've"   => "something would have",
-        "something'll"     => "something shall",
-        "something's"      => "something has",
-        "'sup"             => "what's up",
-        "that'll"          => "that will",
-        "that's"           => "that has",
-        "there'd"          => "there had",
-        "there'd've"       => "there would have",
-        "there're"         => "there are",
-        "there's"          => "there has",
-        "they'd"           => "they had",
-        "they'dn't"        => "they would not",
-        "they'dn't've"     => "they would not have",
-        "they'd've"        => "they would have",
-        "they'd'ven't"     => "they would have not",
-        "they'll"          => "they shall",
-        "they'lln't've"    => "they will not have",
-        "they'll'ven't"    => "they will have not",
-        "they're"          => "they are",
-        "they've"          => "they have",
-        "they'ven't"       => "they have not",
-        "'tis"             => "it is",
-        "'twas"            => "it was",
-        "wanna"            => "want to",
-        "wasn't"           => "was not",
-        "we'd"             => "we had",
-        "we'd've"          => "we would have",
-        "we'dn't've"       => "we would not have",
-        "we'll"            => "we will",
-        "we'lln't've"      => "we will not have",
-        "we're"            => "we are",
-        "we've"            => "we have",
-        "weren't"          => "were not",
-        "what'll"          => "what shall",
-        "what're"          => "what are",
-        "what's"           => "what has",
-        "what've"          => "what have",
-        "when's"           => "when has",
-        "where'd"          => "where did",
-        "where's"          => "where has",
-        "where've"         => "where have",
-        "who'd"            => "who would",
-        "who'd've"         => "who would have",
-        "who'll"           => "who shall",
-        "who're"           => "who are",
-        "who's"            => "who has",
-        "who've"           => "who have",
-        "why'll"           => "why will",
-        "why're"           => "why are",
-        "why's"            => "why has",
-        "won't"            => "will not",
-        "won't've"         => "will not have",
-        "would've"         => "would have",
-        "wouldn't"         => "would not",
-        "wouldn't've"      => "would not have",
-        "y'all"            => "you all",
-        "y'all'd've"       => "you all would have",
-        "y'all'dn't've"    => "you all would not have",
-        "y'all'll"         => "you all will",
-        "y'all'lln't"      => "you all will not",
-        "y'all'll've"      => "you all will have",
-        "y'all'll'ven't"   => "you all will have not",
-        "you'd"            => "you had",
-        "you'd've"         => "you would have",
-        "you'll"           => "you shall",
-        "you're"           => "you are",
-        "you'ren't"        => "you are not",
-        "you've"           => "you have",
-        "you'ven't"        => "you have not",
-      }
+      CONTRACTIONS = YAML.load_file(File.expand_path('undo_contractions.yml', __dir__))
       # Perform the filter
       # @param text [String]

data/lib/text_rank/char_filter/undo_contractions.yml ADDED Viewed

@@ -0,0 +1,135 @@
+ain't:            am not
+amn't:            am not
+aren't:           are not
+can't:            can not
+could've:         could have
+couldn't:         could not
+couldn't've:      could not have
+didn't:           did not
+doesn't:          does not
+don't:            do not
+gonna:            going to
+hadn't:           had not
+hadn't've:        had not have
+hasn't:           has not
+haven't:          have not
+he'd:             he had
+he'd've:          he would have
+he'll:            he shall
+he's:             he has
+he'sn't:          he has not
+how'd:            how did
+how'll:           how will
+how's:            how has
+i'd:              i had
+i'd've:           i would have
+i'll:             i shall
+i'm:              i am
+i've:             i have
+i'ven't:          i have not
+isn't:            is not
+it'd:             it had
+it'd've:          it would have
+it'll:            it shall
+it's:             it has
+it'sn't:          it has not
+let's:            let us
+ma'am:            madam
+mightn't:         might not
+mightn't've:      might not have
+might've:         might have
+mustn't:          must not
+must've:          must have
+needn't:          need not
+not've:           not have
+o'clock:          of the clock
+ol':              old
+oughtn't:         ought not
+shan't:           shall not
+she'd:            she had
+she'd've:         she would have
+she'll:           she shall
+she's:            she has
+she'sn't:         she has not
+should've:        should have
+shouldn't:        should not
+shouldn't've:     should not have
+somebody'd:       somebody had
+somebody'd've:    somebody would have
+somebody'dn't've: somebody would not have
+somebody'll:      somebody shall
+somebody's:       somebody has
+someone'd:        someone had
+someone'd've:     someone would have
+someone'll:       someone shall
+someone's:        someone has
+something'd:      something had
+something'd've:   something would have
+something'll:     something shall
+something's:      something has
+"'sup":           "what's up"
+that'll:          that will
+that's:           that has
+there'd:          there had
+there'd've:       there would have
+there're:         there are
+there's:          there has
+they'd:           they had
+they'dn't:        they would not
+they'dn't've:     they would not have
+they'd've:        they would have
+they'd'ven't:     they would have not
+they'll:          they shall
+they'lln't've:    they will not have
+they'll'ven't:    they will have not
+they're:          they are
+they've:          they have
+they'ven't:       they have not
+"'tis":           it is
+"'twas":          it was
+wanna:            want to
+wasn't:           was not
+we'd:             we had
+we'd've:          we would have
+we'dn't've:       we would not have
+we'll:            we will
+we'lln't've:      we will not have
+we're:            we are
+we've:            we have
+weren't:          were not
+what'll:          what shall
+what're:          what are
+what's:           what has
+what've:          what have
+when's:           when has
+where'd:          where did
+where's:          where has
+where've:         where have
+who'd:            who would
+who'd've:         who would have
+who'll:           who shall
+who're:           who are
+who's:            who has
+who've:           who have
+why'll:           why will
+why're:           why are
+why's:            why has
+won't:            will not
+won't've:         will not have
+would've:         would have
+wouldn't:         would not
+wouldn't've:      would not have
+y'all:            you all
+y'all'd've:       you all would have
+y'all'dn't've:    you all would not have
+y'all'll:         you all will
+y'all'lln't:      you all will not
+y'all'll've:      you all will have
+y'all'll'ven't:   you all will have not
+you'd:            you had
+you'd've:         you would have
+you'll:           you shall
+you're:           you are
+you'ren't:        you are not
+you've:           you have
+you'ven't:        you have not

data/lib/text_rank/char_filter.rb CHANGED Viewed

@@ -7,7 +7,7 @@ module TextRank
   # converting non-ascii characters to related ascii characters, forcing text to
   # lower case, stripping out HTML, converting English contractions (e.g. "won't")
   # to the non-contracted form ("will not"), and more.
-  #
+  #
   # Character filters are applied as a chain, so care should be taken to use them
   # in the desired order.
   ##

data/lib/text_rank/fingerprint.rb CHANGED Viewed

@@ -1,5 +1,3 @@
-require 'set'
 module TextRank
   ##
   # Class used to compare documents according to TextRank. A "fingerprint"
@@ -61,28 +59,22 @@ module TextRank
     # Calculates the "similarity" between this fingerprint and another
     # @param {Fingerprint} A second fingerprint to compare
     # @return [Number] A number between 0.0 (different) and 1.0 (same)
-    def similarity(trf2)
-      return 1.0 if values == trf2.values
-      sim = 0
-      s1 = Set.new
-      s2 = Set.new
+    def similarity(other)
+      return 1.0 if values == other.values # Short-circuit for efficiency
-      [size, trf2.size].max.times.reduce(0) do |sum, i|
-        v1 = values[i]
-        v2 = trf2.values[i]
-        if v1 == v2
-          sim += 1
-        else
-          s1.delete?(v2) ? (sim += 1) : (s2 << v2)
-          s2.delete?(v1) ? (sim += 1) : (s1 << v1)
-        end
-        sum + sim * linear_transform[i]
+      sum = 0
+      overlap(other).each_with_index do |overlap_value, i|
+        sum += overlap_value * linear_transform[i]
       end
+      sum
     end
     private
+    def overlap(other)
+      FingerprintOverlap.new(values, other.values).overlap
+    end
     def linear_transform
       @linear_transform ||= size.times.map do |i|
         1.0 / Math.log(i + 2) / size.to_f / norm_factor

data/lib/text_rank/fingerprint_overlap.rb ADDED Viewed

@@ -0,0 +1,55 @@
+module TextRank
+  ##
+  # Determines the "overlap" between two fingerprints at each prefix length N
+  #
+  # For example,
+  #
+  #   FingerprintOverlap.new(
+  #     %w[a b c d],
+  #     %w[b e a c],
+  #   ).overlap
+  #
+  #   => [
+  #     0, # [a] & [b] have no overlap
+  #     1, # [a b] & [b e] have one overlap: b
+  #     2, # [a b c] & [b e a] have two overlaps: a & b
+  #     3, # [a b c d] & [b e a c] have three overlaps: a, b, & c
+  #   ]
+  ##
+  class FingerprintOverlap
+    attr_reader :overlap
+    def initialize(values1, values2)
+      raise ArgumentError, 'Value size mismatch' if values1.size != values2.size
+      @encountered1 = Set.new
+      @encountered2 = Set.new
+      @overlap_count = 0
+      @overlap = determine_overlap(values1, values2)
+    end
+    private
+    def determine_overlap(values1, values2)
+      values1.zip(values2).map do |v1, v2|
+        encounter(v1, v2)
+        @overlap_count
+      end
+    end
+    # This algorithm is a little more complex than the simplest way it could
+    # be written in Ruby, but we want to keep it as performant as possible.
+    def encounter(value1, value2)
+      if value1 == value2
+        @overlap_count += 1
+      else
+        # Delete from the set in case an element appears more than once
+        @encountered1.delete?(value2) ? (@overlap_count += 1) : (@encountered2 << value2)
+        @encountered2.delete?(value1) ? (@overlap_count += 1) : (@encountered1 << value1)
+      end
+    end
+  end
+end

data/lib/text_rank/graph_strategy/coocurrence.rb CHANGED Viewed

@@ -61,18 +61,27 @@ module TextRank
       # @return [nil]
       def build_graph(tokens, graph)
         ngram_window = @ngram_size * 2 + 1
-        tokens.each_with_index do |token_i, i|
+        tokens.size.times do |i|
           ngram_window.times do |j|
-            next if j == @ngram_size || i + j < @ngram_size
-            token_j = tokens[i - @ngram_size + j]
-            if token_j
-              graph.add(token_i, token_j, weight: 1.0 / (j - @ngram_size).abs)
-            end
+            consider_ngram_window(tokens, graph, i, j)
           end
         end
         nil
       end
+      private
+      def consider_ngram_window(tokens, graph, i, j)
+        return if j == @ngram_size || i + j < @ngram_size
+        token_i = tokens[i]
+        token_j = tokens[i - @ngram_size + j]
+        if token_j
+          graph.add(token_i, token_j, weight: 1.0 / (j - @ngram_size).abs)
+        end
+      end
     end
   end
 end

data/lib/text_rank/keyword_extractor.rb CHANGED Viewed

@@ -13,9 +13,9 @@ module TextRank
     # @return [KeywordExtractor]
     def self.basic(**options)
       new(**{
-        char_filters:   [:AsciiFolding, :Lowercase],
-        tokenizers:     [:Word],
-        token_filters:  [:Stopwords, :MinLength],
+        char_filters:   %i[AsciiFolding Lowercase],
+        tokenizers:     %i[Word],
+        token_filters:  %i[Stopwords MinLength],
         graph_strategy: :Coocurrence,
       }.merge(options))
     end
@@ -25,11 +25,11 @@ module TextRank
     # @return [KeywordExtractor]
     def self.advanced(**options)
       new(**{
-        char_filters:   [:AsciiFolding, :Lowercase, :StripHtml, :StripEmail, :UndoContractions, :StripPossessive],
-        tokenizers:     [:Url, :Money, :Number, :Word, :Punctuation],
-        token_filters:  [:PartOfSpeech, :Stopwords, :MinLength],
+        char_filters:   %i[AsciiFolding Lowercase StripHtml StripEmail UndoContractions StripPossessive],
+        tokenizers:     %i[Url Money Number Word Punctuation],
+        token_filters:  %i[PartOfSpeech Stopwords MinLength],
         graph_strategy: :Coocurrence,
-        rank_filters:   [:CollapseAdjacent, :NormalizeUnitVector, :SortByValue],
+        rank_filters:   %i[CollapseAdjacent NormalizeUnitVector SortByValue],
       }.merge(options))
     end
@@ -41,14 +41,14 @@ module TextRank
     # @option options [Array<Class, Symbol, #filter!>]  :rank_filters A list of filters to be applied to the keyword ranks after keyword extraction
     def initialize(**options)
       @page_rank_options = {
-        strategy: options[:strategy] || :sparse,
-        damping: options[:damping],
+        strategy:  options[:strategy] || :sparse,
+        damping:   options[:damping],
         tolerance: options[:tolerance],
       }
-      @char_filters   = options[:char_filters] || []
-      @tokenizers     = options[:tokenizers] || [Tokenizer::Word]
-      @token_filters  = options[:token_filters] || []
-      @rank_filters   = options[:rank_filters] || []
+      @char_filters = options[:char_filters] || []
+      @tokenizers = options[:tokenizers] || [Tokenizer::Word]
+      @token_filters = options[:token_filters] || []
+      @rank_filters = options[:rank_filters] || []
       @graph_strategy = options[:graph_strategy] || GraphStrategy::Coocurrence
     end
@@ -73,9 +73,7 @@ module TextRank
     # Sets the graph strategy for producing a graph from tokens
     # @param strategy [Class, Symbol, #build_graph] Strategy for producing a graph from tokens
     # @return [Class, Symbol, #build_graph]
-    def graph_strategy=(strategy)
-      @graph_strategy = strategy
-    end
+    attr_writer :graph_strategy
     # Add a new TokenFilter for processing tokens after tokenization
     # @param filter [Class, Symbol, #filter!] A filter to process tokens after tokenization
@@ -105,14 +103,23 @@ module TextRank
     end
     # Filter & tokenize text, and return PageRank
-    # @param text [String] unfiltered text to be processed
+    # @param text [String,Array<String>] unfiltered text to be processed
     # @return [Hash<String, Float>] tokens and page ranks (in descending order)
     def extract(text, **options)
-      tokens = tokenize(text)
+      text = Array(text)
+      tokens_per_text = text.map do |t|
+        tokenize(t)
+      end
       graph = PageRank.new(**@page_rank_options)
-      classify(@graph_strategy, context: GraphStrategy).build_graph(tokens, graph)
+      strategy = classify(@graph_strategy, context: GraphStrategy)
+      tokens_per_text.each do |tokens|
+        strategy.build_graph(tokens, graph)
+      end
       ranks = graph.calculate(**options)
-      apply_rank_filters(ranks, tokens: tokens, original_text: text)
+      tokens_per_text.each_with_index do |tokens, i|
+        ranks = apply_rank_filters(ranks, tokens: tokens, original_text: text[i])
+      end
+      ranks
     end
     private
@@ -153,14 +160,14 @@ module TextRank
       array.insert(idx, value)
     end
-    def classify(c, context: self)
-      case c
+    def classify(clazz, context: self)
+      case clazz
       when Class
-        c.new
+        clazz.new
       when Symbol
-        context.const_get(c).new
+        context.const_get(clazz).new
       else
-        c
+        clazz
       end
     end