text_rank 1.1.1 → 1.1.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/page_rank/base.rb +2 -2
- data/lib/text_rank/char_filter/ascii_folding.rb +2 -0
- data/lib/text_rank/char_filter/strip_email.rb +1 -0
- data/lib/text_rank/char_filter/undo_contractions.rb +1 -0
- data/lib/text_rank/keyword_extractor.rb +25 -12
- data/lib/text_rank/rank_filter.rb +4 -1
- data/lib/text_rank/rank_filter/collapse_adjacent.rb +7 -15
- data/lib/text_rank/rank_filter/normalize_probability.rb +53 -0
- data/lib/text_rank/rank_filter/normalize_unit_vector.rb +54 -0
- data/lib/text_rank/rank_filter/sort_by_value.rb +22 -0
- data/lib/text_rank/tokenizer.rb +24 -3
- data/lib/text_rank/tokenizer/money.rb +76 -0
- data/lib/text_rank/tokenizer/number.rb +31 -0
- data/lib/text_rank/tokenizer/punctuation.rb +11 -0
- data/lib/text_rank/tokenizer/url.rb +21 -0
- data/lib/text_rank/tokenizer/whitespace.rb +4 -12
- data/lib/text_rank/tokenizer/word.rb +14 -0
- data/lib/text_rank/version.rb +2 -1
- data/text_rank.gemspec +1 -1
- metadata +12 -6
- data/lib/text_rank/tokenizer/regex.rb +0 -26
- data/lib/text_rank/tokenizer/words_and_punctuation.rb +0 -26
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 190ea53d10c7ae49f55f0206c8c7346cea3ba4af
+  data.tar.gz: aac4c6be16b91047508af053ca48ba4fa8594f43
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 86c6007b9397e126fcadc57d73f7f6c09def32507ffd2be401f8dc2ca389bafbf9f3384e41453deec7149be5f7862a361ea59d3c362915863571c7e0e81799e8
+  data.tar.gz: 2e678435c0079ab85518f2b4b380dd46744ab5e6d386b6ed82dcbdd9d93943bd4b5e2c54d1ac63077eda6ac2f1f13ccbbb8523bdd50bdd0991fdc11d04c61b49
data/lib/page_rank/base.rb
CHANGED
@@ -33,8 +33,8 @@ module PageRank
     end
 
     # Adds a directed (and optionally weighted) edge to the graph
-    # @param
-    # @param
+    # @param _source [Object] The source node
+    # @param _dest [Object] The destination node
     # @return [nil]
     def add(_source, _dest, **_options)
       raise NotImplementedError
data/lib/text_rank/char_filter/ascii_folding.rb
CHANGED
@@ -11,7 +11,9 @@ module TextRank
     ##
     class AsciiFolding
 
+      # Non-ASCII characters to replace
       NON_ASCII_CHARS = 'ÀÁÂÃÄÅàáâãäåĀāĂ㥹ÇçĆćĈĉĊċČčÐðĎďĐđÈÉÊËèéêëĒēĔĕĖėĘęĚěĜĝĞğĠġĢģĤĥĦħÌÍÎÏìíîïĨĩĪīĬĭĮįİıĴĵĶķĸĹĺĻļĽľĿŀŁłÑñŃńŅņŇňʼnŊŋÒÓÔÕÖØòóôõöøŌōŎŏŐőŔŕŖŗŘřŚśŜŝŞşŠšſŢţŤťŦŧÙÚÛÜùúûüŨũŪūŬŭŮůŰűŲųŴŵÝýÿŶŷŸŹźŻżŽž'
+      # "Equivalent" ASCII characters
       EQUIVALENT_ASCII_CHARS = 'AAAAAAaaaaaaAaAaAaCcCcCcCcCcDdDdDdEEEEeeeeEeEeEeEeEeGgGgGgGgHhHhIIIIiiiiIiIiIiIiIiJjKkkLlLlLlLlLlNnNnNnNnnNnOOOOOOooooooOoOoOoRrRrRrSsSsSsSssTtTtTtUUUUuuuuUuUuUuUuUuUuWwYyyYyYZzZzZz'
 
       # Perform the filter
data/lib/text_rank/keyword_extractor.rb
CHANGED
@@ -14,7 +14,7 @@ module TextRank
     def self.basic(**options)
       new(**{
         char_filters: [:AsciiFolding, :Lowercase],
-
+        tokenizers: [:Word],
         token_filters: [:Stopwords, :MinLength],
         graph_strategy: :Coocurrence,
       }.merge(options))
@@ -26,27 +26,27 @@ module TextRank
     def self.advanced(**options)
       new(**{
         char_filters: [:AsciiFolding, :Lowercase, :StripHtml, :StripEmail, :UndoContractions, :StripPossessive],
-
+        tokenizers: [:Url, :Money, :Number, :Word, :Punctuation],
         token_filters: [:PartOfSpeech, :Stopwords, :MinLength],
         graph_strategy: :Coocurrence,
-        rank_filters: [:CollapseAdjacent],
+        rank_filters: [:CollapseAdjacent, :NormalizeUnitVector, :SortByValue],
       }.merge(options))
     end
 
     # @option (see PageRank.new)
     # @option options [Array<Class, Symbol, #filter!>] :char_filters A list of filters to be applied prior to tokenization
-    # @option options [
+    # @option options [Array<Symbol, Regexp, String>] :tokenizers A list of tokenizer regular expressions to perform tokenization
     # @option options [Array<Class, Symbol, #filter!>] :token_filters A list of filters to be applied to each token after tokenization
    # @option options [Class, Symbol, #build_graph] :graph_strategy A class or strategy instance for producing a graph from tokens
     # @option options [Array<Class, Symbol, #filter!>] :rank_filters A list of filters to be applied to the keyword ranks after keyword extraction
     def initialize(**options)
       @page_rank_options = {
-        strategy: options[:strategy] || :
+        strategy: options[:strategy] || :dense,
         damping: options[:damping],
         tolerance: options[:tolerance],
       }
       @char_filters = options[:char_filters] || []
-      @
+      @tokenizers = options[:tokenizers] || [Tokenizer::Word]
       @token_filters = options[:token_filters] || []
       @rank_filters = options[:rank_filters] || []
       @graph_strategy = options[:graph_strategy] || GraphStrategy::Coocurrence
@@ -61,11 +61,13 @@ module TextRank
       nil
     end
 
-    #
-    # @param tokenizer [
-    # @
-
-
+    # Add a tokenizer regular expression for producing tokens from filtered text
+    # @param tokenizer [Symbol, Regexp, String] Tokenizer regular expression
+    # @param (see #add_into)
+    # @return [nil]
+    def add_tokenizer(tokenizer, **options)
+      add_into(@tokenizers, tokenizer, **options)
+      nil
     end
 
     # Sets the graph strategy for producing a graph from tokens
@@ -98,7 +100,7 @@ module TextRank
     # @return [Array<String>] tokens
     def tokenize(text)
       filtered_text = apply_char_filters(text)
-      tokens =
+      tokens = Tokenizer.tokenize(filtered_text, *tokenizer_regular_expressions)
       apply_token_filters(tokens)
     end
 
@@ -121,6 +123,17 @@ module TextRank
       end
     end
 
+    def tokenizer_regular_expressions
+      @tokenizers.map do |t|
+        case t
+        when Symbol
+          Tokenizer.const_get(t)
+        else
+          t
+        end
+      end
+    end
+
     def apply_token_filters(tokens)
       @token_filters.reduce(tokens) do |t, f|
         classify(f, context: TokenFilter).filter!(t) || t
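In 1.1.5 the single-tokenizer configuration becomes a :tokenizers list: each entry is a regular expression (or a Symbol naming one of the built-in Tokenizer constants) that gets combined with the others at tokenization time. A minimal usage sketch under that assumption, assuming KeywordExtractor#extract behaves as in earlier 1.1.x releases; the sample text and the date pattern are invented for illustration. Note the capture group in the custom regex: Tokenizer.tokenize keeps the first non-nil group of each match, so a group-less pattern would produce no tokens.

    require 'text_rank'

    extractor = TextRank::KeywordExtractor.advanced(
      tokenizers: [:Url, :Money, :Number, :Word, :Punctuation]
    )
    extractor.add_tokenizer(/(\d{4}-\d{2}-\d{2})/) # keep ISO-style dates as single tokens
    keywords = extractor.extract('The siege of the town began on 1204-04-12')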
data/lib/text_rank/rank_filter.rb
CHANGED
@@ -12,7 +12,10 @@ module TextRank
   ##
   module RankFilter
 
-    autoload :CollapseAdjacent,
+    autoload :CollapseAdjacent,     'text_rank/rank_filter/collapse_adjacent'
+    autoload :NormalizeProbability, 'text_rank/rank_filter/normalize_probability'
+    autoload :NormalizeUnitVector,  'text_rank/rank_filter/normalize_unit_vector'
+    autoload :SortByValue,          'text_rank/rank_filter/sort_by_value'
 
   end
 end
data/lib/text_rank/rank_filter/collapse_adjacent.rb
CHANGED
@@ -62,7 +62,7 @@ module TextRank
      # @option options [Fixnum] ranks_to_collapse the top N ranks in which to look for collapsable keywords
      # @option options [Fixnum] max_tokens_to_combine the maximum number of tokens to collapse into a combined keyword
      # @option options [true, false] ignore_case whether to ignore case when finding adjacent keywords in original text
-      # @
+      # @option options [String] delimiter an optional delimiter between adjacent keywords in original text
      def initialize(**options)
        @options = options
      end
@@ -75,8 +75,6 @@ module TextRank
        TokenCollapser.new(tokens: ranks, text: original_text, **@options).collapse
      end
 
-      private
-
      class TokenCollapser
 
        def initialize(tokens:, text:, ranks_to_collapse: 10, max_tokens_to_combine: 2, ignore_case: true, delimiter: ' ', **_)
@@ -90,14 +88,16 @@ module TextRank
 
          @to_collapse = Set.new # Track the permutations we plan to collapse
          @to_remove = Set.new # Track the single tokens we plan to remove (due to being collapsed)
-          @permutations_scanned =
+          @permutations_scanned = Hash.new(0.0) # Track how many occurrences of each permutation we found in the original text
          @combination_significance_threshold = 0.3 # The percent of occurrences of a combo of tokens to the occurrences of single tokens required to force collapsing
        end
 
+        # :nodoc:
        def delimiter_re
          @delimiter_re ||= /#{@delimiter}+/
        end
 
+        # :nodoc:
        def collapse
          # We make multiple passes at collapsing because after the first pass we may have
          # replaced two or more singletons with a collapsed token, bumping up one or more
@@ -118,11 +118,7 @@ module TextRank
          end
          @tokens.reject! do |k, _|
            @to_remove.include?(k)
-          end
-
-          # Because we've made changes to the tokens hash, we need to re-normalize so that
-          # the sum of all token ranks is still 1.
-          normalize(@tokens)
+          end || @tokens
        end
 
        # We need to be efficient about how we search for the large number of possible collapsed keywords.
@@ -204,14 +200,10 @@ module TextRank
          total_single_count.zero? || (perm_count / total_single_count) > @combination_significance_threshold
        end
 
-        # Scale all of the token ranks so they add up to 1.
-        def normalize(tokens)
-          total = tokens.reduce(0.0) { |s, (_, v)| s + v }
-          Hash[tokens.map { |k, v| [k, v / total] }.sort_by { |_, v| -v }]
-        end
-
      end
 
+      private_constant :TokenCollapser
+
    end
  end
 end
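CollapseAdjacent no longer re-normalizes the ranks itself, so after collapsing the values no longer automatically sum to 1. If the old behavior is wanted, the new NormalizeProbability filter can be chained after it; a hedged sketch using only the filter names added in this release:

    extractor = TextRank::KeywordExtractor.advanced(
      rank_filters: [:CollapseAdjacent, :NormalizeProbability, :SortByValue]
    )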
data/lib/text_rank/rank_filter/normalize_probability.rb
ADDED
@@ -0,0 +1,53 @@
+module TextRank
+  module RankFilter
+    ##
+    # A rank filter which normalizes the ranked keywords so that the sum of the
+    # rank values is 1.0 (a "probability" normalization).
+    #
+    # = Example
+    #
+    #  NormalizeProbability.new.filter!(
+    #    {
+    #      "town"        => 0.6818754334834477,
+    #      "cities"      => 0.6055017128817066,
+    #      "siege"       => 0.5411519524982207,
+    #      "arts"        => 0.4907977453782612,
+    #      "envy"        => 0.4692709808107252,
+    #      "blessings"   => 0.4442147897516214,
+    #      "plagues"     => 0.3972420789430091,
+    #      "florish"     => 0.2746092797528525,
+    #      "devoured"    => 0.26867321734332237,
+    #      "anxieties"   => 0.2367731719604189,
+    #      "peace"       => 0.1905352582752693,
+    #      "inhabitants" => 0.02715120116732137,
+    #    }
+    #  )
+    #  => {
+    #    "town"        => 0.1473434248897056,
+    #    "cities"      => 0.13084016782478722,
+    #    "siege"       => 0.11693511476062682,
+    #    "arts"        => 0.10605429845557579,
+    #    "envy"        => 0.10140267579486278,
+    #    "blessings"   => 0.09598839508602595,
+    #    "plagues"     => 0.08583827125543537,
+    #    "florish"     => 0.0593390959673909,
+    #    "devoured"    => 0.058056398684529435,
+    #    "anxieties"   => 0.051163259981992296,
+    #    "peace"       => 0.041171915188530236,
+    #    "inhabitants" => 0.005866982110537665,
+    #  }
+    ##
+    class NormalizeProbability
+
+      # Perform the filter on the ranks
+      # @param ranks [Hash<String, Float>] the results of the PageRank algorithm
+      # @return [Hash<String, Float>]
+      def filter!(ranks, **_)
+        return if ranks.empty?
+        total = ranks.values.reduce(:+)
+        Hash[ranks.map { |k, v| [k, v / total] }]
+      end
+
+    end
+  end
+end
data/lib/text_rank/rank_filter/normalize_unit_vector.rb
ADDED
@@ -0,0 +1,54 @@
+module TextRank
+  module RankFilter
+    ##
+    # A rank filter which normalizes the ranked keywords so that the sum of the
+    # squares of the rank values is 1.0 (and thus the keyword rankings in an
+    # N-vector space is a unit vector).
+    #
+    # = Example
+    #
+    #  NormalizeUnitVector.new.filter!(
+    #    {
+    #      "town"        => 0.6818754334834477,
+    #      "cities"      => 0.6055017128817066,
+    #      "siege"       => 0.5411519524982207,
+    #      "arts"        => 0.4907977453782612,
+    #      "envy"        => 0.4692709808107252,
+    #      "blessings"   => 0.4442147897516214,
+    #      "plagues"     => 0.3972420789430091,
+    #      "florish"     => 0.2746092797528525,
+    #      "devoured"    => 0.26867321734332237,
+    #      "anxieties"   => 0.2367731719604189,
+    #      "peace"       => 0.1905352582752693,
+    #      "inhabitants" => 0.02715120116732137,
+    #    }
+    #  )
+    #  => {
+    #    "town"        => 0.4616807998499129,
+    #    "cities"      => 0.40997006401243896,
+    #    "siege"       => 0.3664004508761722,
+    #    "arts"        => 0.3323068767754191,
+    #    "envy"        => 0.317731642948694,
+    #    "blessings"   => 0.30076672272820315,
+    #    "plagues"     => 0.2689626751964553,
+    #    "florish"     => 0.18593107435301526,
+    #    "devoured"    => 0.1819119149778339,
+    #    "anxieties"   => 0.16031319218415677,
+    #    "peace"       => 0.12900665740478157,
+    #    "inhabitants" => 0.01838339916101275,
+    #  }
+    ##
+    class NormalizeUnitVector
+
+      # Perform the filter on the ranks
+      # @param ranks [Hash<String, Float>] the results of the PageRank algorithm
+      # @return [Hash<String, Float>]
+      def filter!(ranks, **_)
+        return if ranks.empty?
+        total = Math.sqrt(ranks.values.map { |v| v * v }.reduce(:+))
+        Hash[ranks.map { |k, v| [k, v / total] }]
+      end
+
+    end
+  end
+end
data/lib/text_rank/rank_filter/sort_by_value.rb
ADDED
@@ -0,0 +1,22 @@
+module TextRank
+  module RankFilter
+    ##
+    # A rank filter which sorts the results by value
+    ##
+    class SortByValue
+
+      # @param descending [boolean] whether to sort in descending order
+      def initialize(descending: true)
+        @descending = !!descending
+      end
+
+      # Perform the filter on the ranks
+      # @param ranks [Hash<String, Float>] the results of the PageRank algorithm
+      # @return [Hash<String, Float>]
+      def filter!(ranks, **_)
+        Hash[ranks.sort_by { |_, v| @descending ? -v : v }]
+      end
+
+    end
+  end
+end
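The three new filters compose: each takes the ranks hash and returns a transformed copy, so they can be applied in sequence. A small sketch applying them directly; the input hash is abbreviated from the doc examples above:

    ranks = {
      'town'   => 0.6818754334834477,
      'cities' => 0.6055017128817066,
    }
    ranks = TextRank::RankFilter::NormalizeUnitVector.new.filter!(ranks)
    ranks = TextRank::RankFilter::SortByValue.new.filter!(ranks)
    # NormalizeProbability would divide by the plain sum instead, so values add to 1.0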
data/lib/text_rank/tokenizer.rb
CHANGED
@@ -8,12 +8,33 @@ module TextRank
   # help inform its decision on which tokens to keep and which to drop. An example
   # of this is the part of speech token filter which uses punctuation tokens to
   # help guess the part of speech of each non-punctuation token.
+  #
+  # When tokenizing a piece of text, the Tokenizer will combine one or more
+  # regular expressions (in the order given) to scan the text for matches. As such
+  # you need only tell the tokenizer which tokens you want; everything else will
+  # be ignored.
   ##
   module Tokenizer
 
-    autoload :
-    autoload :
-    autoload :
+    autoload :Money,       'text_rank/tokenizer/money'
+    autoload :Number,      'text_rank/tokenizer/number'
+    autoload :Punctuation, 'text_rank/tokenizer/punctuation'
+    autoload :Url,         'text_rank/tokenizer/url'
+    autoload :Whitespace,  'text_rank/tokenizer/whitespace'
+    autoload :Word,        'text_rank/tokenizer/word'
+
+    # Performs tokenization of piece of text by one or more tokenizer regular expressions.
+    # @param text [String]
+    # @param regular_expressions [Array<Regexp|String>]
+    # @return [Array<String>]
+    def self.tokenize(text, *regular_expressions)
+      tokens = []
+      text.scan(Regexp.new(regular_expressions.flatten.join('|'))) do |matches|
+        m = matches.compact.first
+        tokens << m if m && m.size > 0
+      end
+      tokens
+    end
 
   end
 end
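Because Tokenizer.tokenize joins the given expressions with '|' and keeps the first non-nil capture group of each match, the order of the expressions decides which tokenizer wins when more than one could match at the same position. A quick sketch; the input string is invented and the output is the expected result, not taken from the gem's tests:

    require 'text_rank'

    TextRank::Tokenizer.tokenize(
      'It costs $5.00, maybe less.',
      TextRank::Tokenizer::Money,
      TextRank::Tokenizer::Word,
      TextRank::Tokenizer::Punctuation
    )
    # => ["It", "costs", "$5.00", ",", "maybe", "less", "."]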
data/lib/text_rank/tokenizer/money.rb
ADDED
@@ -0,0 +1,76 @@
+#encoding: UTF-8
+module TextRank
+  module Tokenizer
+
+    CURRENCY_SYMBOLS = '[' + [
+      "\u00a4", # Generic Currency Symbol
+      "\u0024", # Dollar Sign
+      "\u00a2", # Cent Sign
+      "\u00a3", # Pound Sterling
+      "\u00a5", # Yen Symbol
+      "\u20a3", # Franc Sign
+      "\u20a4", # Lira Symbol
+      "\u20a7", # Peseta Sign
+      "\u20ac", # Euro Symbol
+      "\u20B9", # Rupee
+      "\u20a9", # Won Sign
+      "\u20b4", # Hryvnia Sign
+      "\u20af", # Drachma Sign
+      "\u20ae", # Tugrik Sign
+      "\u20b0", # German Penny Sign
+      "\u20b2", # Guarani Sign
+      "\u20b1", # Peso Sign
+      "\u20b3", # Austral Sign
+      "\u20b5", # Cedi Sign
+      "\u20ad", # Kip Sign
+      "\u20aa", # New Sheqel Sign
+      "\u20ab", # Dong Sign
+      "\u0025", # Percent
+      "\u2030", # Per Million
+    ].join + ']'
+    private_constant :CURRENCY_SYMBOLS # Do not expose this to avoid confusion
+
+    ##
+    # A tokenizer regex that preserves money or formatted numbers as a single token. This
+    # currently supports 24 different currency symbols:
+    #
+    # * ¤
+    # * $
+    # * ¢
+    # * £
+    # * ¥
+    # * ₣
+    # * ₤
+    # * ₧
+    # * €
+    # * ₹
+    # * ₩
+    # * ₴
+    # * ₯
+    # * ₮
+    # * ₰
+    # * ₲
+    # * ₱
+    # * ₳
+    # * ₵
+    # * ₭
+    # * ₪
+    # * ₫
+    # * %
+    # * ‰
+    #
+    # It also supports two alternative formats for negatives as well as optional three digit comma
+    # separation and optional decimals.
+    ##
+    Money = %r{
+      (
+        #{CURRENCY_SYMBOLS} \-? #{Number}    # $-45,231.21
+        |
+        \-? #{CURRENCY_SYMBOLS} #{Number}    # -$45,231.21
+        |
+        \( #{CURRENCY_SYMBOLS} #{Number} \)  # ($45,231.21)
+      )
+    }x
+
+  end
+end
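Since Money interpolates the Number pattern, it accepts the same comma-grouped and decimal forms with a currency symbol attached, in any of the three layouts shown in the comments. Two hedged spot checks (expected matches, not from the gem's test suite):

    '-$45,231.21'[TextRank::Tokenizer::Money]  # => "-$45,231.21"
    '($45,231.21)'[TextRank::Tokenizer::Money] # => "($45,231.21)"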
data/lib/text_rank/tokenizer/number.rb
ADDED
@@ -0,0 +1,31 @@
+#encoding: UTF-8
+module TextRank
+  module Tokenizer
+
+    ##
+    # A tokenizer regex that preserves (optionally formatted) numbers as a single token.
+    ##
+    Number = %r{
+      (
+        [1-9]\d{0,2}    # 453
+        (?:,\d{3})*     # 453,231,162
+        (?:\.\d{0,2})?  # 453,231,162.17
+
+        |
+
+        [1-9]\d*        # 453231162
+        (?:\.\d{0,2})?  # 453231162.17
+
+        |
+
+        0               # 0
+        (?:\.\d{0,2})?  # 0.17
+
+        |
+
+        (?:\.\d{1,2})   # .17
+      )
+    }x
+
+  end
+end
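The four alternatives cover comma-grouped, plain, zero-led, and bare-decimal forms; since alternation is ordered, the comma-grouped branch is tried first. Expected behavior when the regex is used on its own (unverified):

    '453,231,162.17'[TextRank::Tokenizer::Number] # => "453,231,162.17"
    '.17'[TextRank::Tokenizer::Number]            # => ".17"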
data/lib/text_rank/tokenizer/punctuation.rb
ADDED
@@ -0,0 +1,11 @@
+module TextRank
+  module Tokenizer
+    ##
+    # A tokenizer regex that preserves single punctuation symbols as a token. Use
+    # this if one or more of your TokenFilter classes need punctuation in order to
+    # make decisions.
+    ##
+    Punctuation = %r{([\p{Punct}])}
+
+  end
+end
data/lib/text_rank/tokenizer/url.rb
ADDED
@@ -0,0 +1,21 @@
+module TextRank
+  module Tokenizer
+    ##
+    # A tokenizer regex that preserves entire URL's as a token (rather than split them up)
+    ##
+    Url = %r{
+      (
+        (?:[\w-]+://?|www[.])
+        [^\s()<>]+
+        (?:
+          \([\w\d]+\)
+          |
+          (?:[^[:punct:]\s]
+          |
+          /)
+        )
+      )
+    }xi
+
+  end
+end
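The pattern anchors on a scheme or a leading 'www.' and its final group forbids the token from ending in punctuation (a trailing slash or a balanced parenthesis is allowed), so trailing sentence punctuation stays out of the token. A hedged spot check:

    'see www.example.com/foo, then reply'[TextRank::Tokenizer::Url]
    # => "www.example.com/foo" (expected; the trailing comma is excluded)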
data/lib/text_rank/tokenizer/whitespace.rb
CHANGED
@@ -1,19 +1,11 @@
 module TextRank
   module Tokenizer
     ##
-    #
-    #
-    #
-    #
-    #  Whitespace.new.tokenize("i should:like to know:which is worse.")
-    #  => ["i", "should:like", "to", "know:which", "is", "worse."]
+    # A tokenizer regex that preserves single whitespace characters as a token. Use
+    # this if one or more of your TokenFilter classes need whitespace in order to
+    # make decisions.
     ##
-
+    Whitespace = %r{\s}
 
-      def initialize
-        super(/\s+/)
-      end
-
-    end
   end
 end
data/lib/text_rank/tokenizer/word.rb
ADDED
@@ -0,0 +1,14 @@
+module TextRank
+  module Tokenizer
+    ##
+    # A tokenizer regex that preserves a non-space, non-punctuation "word". It does
+    # allow hyphens and numerals, but the first character must be an A-Z character.
+    ##
+    Word = %r{
+      (
+        [a-z][a-z0-9-]*
+      )
+    }xi
+
+  end
+end
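Because the first character must be a letter, leading digits are not absorbed into the token. Two expected (unverified) examples:

    'well-known'[TextRank::Tokenizer::Word] # => "well-known"
    '3rd'[TextRank::Tokenizer::Word]        # => "rd" (the leading digit is dropped)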
data/lib/text_rank/version.rb
CHANGED
data/text_rank.gemspec
CHANGED
@@ -10,7 +10,7 @@ Gem::Specification.new do |spec|
   spec.email = ['david.mccullars@gmail.com']
 
   spec.summary = %q{Implementation of TextRank solution to ranked keyword extraction}
-  spec.description = %q{See https://
+  spec.description = %q{Implementation of TextRank solution to ranked keyword extraction. See https://web.eecs.umich.edu/~mihalcea/papers/mihalcea.emnlp04.pdf}
   spec.homepage = 'https://github.com/david-mccullars/text_rank'
   spec.license = 'MIT'
 
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: text_rank
 version: !ruby/object:Gem::Version
-  version: 1.1.1
+  version: 1.1.5
 platform: ruby
 authors:
 - David McCullars
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2016-05-
+date: 2016-05-23 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: bundler
@@ -108,7 +108,8 @@ dependencies:
   - - "~>"
   - !ruby/object:Gem::Version
     version: '1.0'
-description:
+description: Implementation of TextRank solution to ranked keyword extraction. See
+  https://web.eecs.umich.edu/~mihalcea/papers/mihalcea.emnlp04.pdf
 email:
 - david.mccullars@gmail.com
 executables: []
@@ -145,14 +146,20 @@ files:
 - lib/text_rank/keyword_extractor.rb
 - lib/text_rank/rank_filter.rb
 - lib/text_rank/rank_filter/collapse_adjacent.rb
+- lib/text_rank/rank_filter/normalize_probability.rb
+- lib/text_rank/rank_filter/normalize_unit_vector.rb
+- lib/text_rank/rank_filter/sort_by_value.rb
 - lib/text_rank/token_filter.rb
 - lib/text_rank/token_filter/min_length.rb
 - lib/text_rank/token_filter/part_of_speech.rb
 - lib/text_rank/token_filter/stopwords.rb
 - lib/text_rank/tokenizer.rb
-- lib/text_rank/tokenizer/regex.rb
+- lib/text_rank/tokenizer/money.rb
+- lib/text_rank/tokenizer/number.rb
+- lib/text_rank/tokenizer/punctuation.rb
+- lib/text_rank/tokenizer/url.rb
 - lib/text_rank/tokenizer/whitespace.rb
-- lib/text_rank/tokenizer/words_and_punctuation.rb
+- lib/text_rank/tokenizer/word.rb
 - lib/text_rank/version.rb
 - text_rank.gemspec
 homepage: https://github.com/david-mccullars/text_rank
@@ -180,4 +187,3 @@ signing_key:
 specification_version: 4
 summary: Implementation of TextRank solution to ranked keyword extraction
 test_files: []
-has_rdoc:
data/lib/text_rank/tokenizer/regex.rb
DELETED
@@ -1,26 +0,0 @@
-module TextRank
-  module Tokenizer
-    ##
-    # Base tokenizer that tokenizes on any regular expression
-    #
-    # = Example
-    #
-    #  Regex.new(/:/).tokenize("i should:like to know:which is worse.")
-    #  => ["i should", "like to know", "which is worse"]
-    ##
-    class Regex
-
-      # @param regex [Regexp] to use for string splitting
-      def initialize(regex)
-        @regex = regex
-      end
-
-      # @param text [String] string to tokenize
-      # return [Array<String>] non-empty tokens
-      def tokenize(text)
-        text.split(@regex) - ['']
-      end
-
-    end
-  end
-end
data/lib/text_rank/tokenizer/words_and_punctuation.rb
DELETED
@@ -1,26 +0,0 @@
-module TextRank
-  module Tokenizer
-    ##
-    # A tokenizer that preserves punctuation as their own tokens (which can be
-    # used, for example, by the [TokenFilter::PartOfSpeechBase] filter).
-    #
-    # = Example
-    #
-    #  WordsAndPunctuation.new.tokenize("i should:like to know:which is worse.")
-    #  => ["i", "should", ":", "like", "to", "know", ":", "which", "is", "worse", "."]
-    ##
-    class WordsAndPunctuation < Regex
-
-      def initialize
-        super(/
-          ([a-z][a-z0-9-]+)
-          |
-          ([\p{Punct}])
-          |
-          \s+
-        /xi)
-      end
-
-    end
-  end
-end