text_rank 1.1.1 → 1.1.5
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/page_rank/base.rb +2 -2
- data/lib/text_rank/char_filter/ascii_folding.rb +2 -0
- data/lib/text_rank/char_filter/strip_email.rb +1 -0
- data/lib/text_rank/char_filter/undo_contractions.rb +1 -0
- data/lib/text_rank/keyword_extractor.rb +25 -12
- data/lib/text_rank/rank_filter.rb +4 -1
- data/lib/text_rank/rank_filter/collapse_adjacent.rb +7 -15
- data/lib/text_rank/rank_filter/normalize_probability.rb +53 -0
- data/lib/text_rank/rank_filter/normalize_unit_vector.rb +54 -0
- data/lib/text_rank/rank_filter/sort_by_value.rb +22 -0
- data/lib/text_rank/tokenizer.rb +24 -3
- data/lib/text_rank/tokenizer/money.rb +76 -0
- data/lib/text_rank/tokenizer/number.rb +31 -0
- data/lib/text_rank/tokenizer/punctuation.rb +11 -0
- data/lib/text_rank/tokenizer/url.rb +21 -0
- data/lib/text_rank/tokenizer/whitespace.rb +4 -12
- data/lib/text_rank/tokenizer/word.rb +14 -0
- data/lib/text_rank/version.rb +2 -1
- data/text_rank.gemspec +1 -1
- metadata +12 -6
- data/lib/text_rank/tokenizer/regex.rb +0 -26
- data/lib/text_rank/tokenizer/words_and_punctuation.rb +0 -26
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 190ea53d10c7ae49f55f0206c8c7346cea3ba4af
|
4
|
+
data.tar.gz: aac4c6be16b91047508af053ca48ba4fa8594f43
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 86c6007b9397e126fcadc57d73f7f6c09def32507ffd2be401f8dc2ca389bafbf9f3384e41453deec7149be5f7862a361ea59d3c362915863571c7e0e81799e8
|
7
|
+
data.tar.gz: 2e678435c0079ab85518f2b4b380dd46744ab5e6d386b6ed82dcbdd9d93943bd4b5e2c54d1ac63077eda6ac2f1f13ccbbb8523bdd50bdd0991fdc11d04c61b49
|
data/lib/page_rank/base.rb
CHANGED
@@ -33,8 +33,8 @@ module PageRank
|
|
33
33
|
end
|
34
34
|
|
35
35
|
# Adds a directed (and optionally weighted) edge to the graph
|
36
|
-
# @param
|
37
|
-
# @param
|
36
|
+
# @param _source [Object] The source node
|
37
|
+
# @param _dest [Object] The destination node
|
38
38
|
# @return [nil]
|
39
39
|
def add(_source, _dest, **_options)
|
40
40
|
raise NotImplementedError
|
@@ -11,7 +11,9 @@ module TextRank
|
|
11
11
|
##
|
12
12
|
class AsciiFolding
|
13
13
|
|
14
|
+
# Non-ASCII characters to replace
|
14
15
|
NON_ASCII_CHARS = 'ÀÁÂÃÄÅàáâãäåĀāĂ㥹ÇçĆćĈĉĊċČčÐðĎďĐđÈÉÊËèéêëĒēĔĕĖėĘęĚěĜĝĞğĠġĢģĤĥĦħÌÍÎÏìíîïĨĩĪīĬĭĮįİıĴĵĶķĸĹĺĻļĽľĿŀŁłÑñŃńŅņŇňʼnŊŋÒÓÔÕÖØòóôõöøŌōŎŏŐőŔŕŖŗŘřŚśŜŝŞşŠšſŢţŤťŦŧÙÚÛÜùúûüŨũŪūŬŭŮůŰűŲųŴŵÝýÿŶŷŸŹźŻżŽž'
|
16
|
+
# "Equivalent" ASCII characters
|
15
17
|
EQUIVALENT_ASCII_CHARS = 'AAAAAAaaaaaaAaAaAaCcCcCcCcCcDdDdDdEEEEeeeeEeEeEeEeEeGgGgGgGgHhHhIIIIiiiiIiIiIiIiIiJjKkkLlLlLlLlLlNnNnNnNnnNnOOOOOOooooooOoOoOoRrRrRrSsSsSsSssTtTtTtUUUUuuuuUuUuUuUuUuUuWwYyyYyYZzZzZz'
|
16
18
|
|
17
19
|
# Perform the filter
|
@@ -14,7 +14,7 @@ module TextRank
|
|
14
14
|
def self.basic(**options)
|
15
15
|
new(**{
|
16
16
|
char_filters: [:AsciiFolding, :Lowercase],
|
17
|
-
|
17
|
+
tokenizers: [:Word],
|
18
18
|
token_filters: [:Stopwords, :MinLength],
|
19
19
|
graph_strategy: :Coocurrence,
|
20
20
|
}.merge(options))
|
@@ -26,27 +26,27 @@ module TextRank
|
|
26
26
|
def self.advanced(**options)
|
27
27
|
new(**{
|
28
28
|
char_filters: [:AsciiFolding, :Lowercase, :StripHtml, :StripEmail, :UndoContractions, :StripPossessive],
|
29
|
-
|
29
|
+
tokenizers: [:Url, :Money, :Number, :Word, :Punctuation],
|
30
30
|
token_filters: [:PartOfSpeech, :Stopwords, :MinLength],
|
31
31
|
graph_strategy: :Coocurrence,
|
32
|
-
rank_filters: [:CollapseAdjacent],
|
32
|
+
rank_filters: [:CollapseAdjacent, :NormalizeUnitVector, :SortByValue],
|
33
33
|
}.merge(options))
|
34
34
|
end
|
35
35
|
|
36
36
|
# @option (see PageRank.new)
|
37
37
|
# @option options [Array<Class, Symbol, #filter!>] :char_filters A list of filters to be applied prior to tokenization
|
38
|
-
# @option options [
|
38
|
+
# @option options [Array<Symbol, Regexp, String>] :tokenizers A list of tokenizer regular expressions to perform tokenization
|
39
39
|
# @option options [Array<Class, Symbol, #filter!>] :token_filters A list of filters to be applied to each token after tokenization
|
40
40
|
# @option options [Class, Symbol, #build_graph] :graph_strategy A class or strategy instance for producing a graph from tokens
|
41
41
|
# @option options [Array<Class, Symbol, #filter!>] :rank_filters A list of filters to be applied to the keyword ranks after keyword extraction
|
42
42
|
def initialize(**options)
|
43
43
|
@page_rank_options = {
|
44
|
-
strategy: options[:strategy] || :
|
44
|
+
strategy: options[:strategy] || :dense,
|
45
45
|
damping: options[:damping],
|
46
46
|
tolerance: options[:tolerance],
|
47
47
|
}
|
48
48
|
@char_filters = options[:char_filters] || []
|
49
|
-
@
|
49
|
+
@tokenizers = options[:tokenizers] || [Tokenizer::Word]
|
50
50
|
@token_filters = options[:token_filters] || []
|
51
51
|
@rank_filters = options[:rank_filters] || []
|
52
52
|
@graph_strategy = options[:graph_strategy] || GraphStrategy::Coocurrence
|
@@ -61,11 +61,13 @@ module TextRank
|
|
61
61
|
nil
|
62
62
|
end
|
63
63
|
|
64
|
-
#
|
65
|
-
# @param tokenizer [
|
66
|
-
# @
|
67
|
-
|
68
|
-
|
64
|
+
# Add a tokenizer regular expression for producing tokens from filtered text
|
65
|
+
# @param tokenizer [Symbol, Regexp, String] Tokenizer regular expression
|
66
|
+
# @param (see #add_into)
|
67
|
+
# @return [nil]
|
68
|
+
def add_tokenizer(tokenizer, **options)
|
69
|
+
add_into(@tokenizers, tokenizer, **options)
|
70
|
+
nil
|
69
71
|
end
|
70
72
|
|
71
73
|
# Sets the graph strategy for producing a graph from tokens
|
@@ -98,7 +100,7 @@ module TextRank
|
|
98
100
|
# @return [Array<String>] tokens
|
99
101
|
def tokenize(text)
|
100
102
|
filtered_text = apply_char_filters(text)
|
101
|
-
tokens =
|
103
|
+
tokens = Tokenizer.tokenize(filtered_text, *tokenizer_regular_expressions)
|
102
104
|
apply_token_filters(tokens)
|
103
105
|
end
|
104
106
|
|
@@ -121,6 +123,17 @@ module TextRank
|
|
121
123
|
end
|
122
124
|
end
|
123
125
|
|
126
|
+
def tokenizer_regular_expressions
|
127
|
+
@tokenizers.map do |t|
|
128
|
+
case t
|
129
|
+
when Symbol
|
130
|
+
Tokenizer.const_get(t)
|
131
|
+
else
|
132
|
+
t
|
133
|
+
end
|
134
|
+
end
|
135
|
+
end
|
136
|
+
|
124
137
|
def apply_token_filters(tokens)
|
125
138
|
@token_filters.reduce(tokens) do |t, f|
|
126
139
|
classify(f, context: TokenFilter).filter!(t) || t
|
@@ -12,7 +12,10 @@ module TextRank
|
|
12
12
|
##
|
13
13
|
module RankFilter
|
14
14
|
|
15
|
-
autoload :CollapseAdjacent,
|
15
|
+
autoload :CollapseAdjacent, 'text_rank/rank_filter/collapse_adjacent'
|
16
|
+
autoload :NormalizeProbability, 'text_rank/rank_filter/normalize_probability'
|
17
|
+
autoload :NormalizeUnitVector, 'text_rank/rank_filter/normalize_unit_vector'
|
18
|
+
autoload :SortByValue, 'text_rank/rank_filter/sort_by_value'
|
16
19
|
|
17
20
|
end
|
18
21
|
end
|
@@ -62,7 +62,7 @@ module TextRank
|
|
62
62
|
# @option options [Fixnum] ranks_to_collapse the top N ranks in which to look for collapsable keywords
|
63
63
|
# @option options [Fixnum] max_tokens_to_combine the maximum number of tokens to collapse into a combined keyword
|
64
64
|
# @option options [true, false] ignore_case whether to ignore case when finding adjacent keywords in original text
|
65
|
-
# @
|
65
|
+
# @option options [String] delimiter an optional delimiter between adjacent keywords in original text
|
66
66
|
def initialize(**options)
|
67
67
|
@options = options
|
68
68
|
end
|
@@ -75,8 +75,6 @@ module TextRank
|
|
75
75
|
TokenCollapser.new(tokens: ranks, text: original_text, **@options).collapse
|
76
76
|
end
|
77
77
|
|
78
|
-
private
|
79
|
-
|
80
78
|
class TokenCollapser
|
81
79
|
|
82
80
|
def initialize(tokens:, text:, ranks_to_collapse: 10, max_tokens_to_combine: 2, ignore_case: true, delimiter: ' ', **_)
|
@@ -90,14 +88,16 @@ module TextRank
|
|
90
88
|
|
91
89
|
@to_collapse = Set.new # Track the permutations we plan to collapse
|
92
90
|
@to_remove = Set.new # Track the single tokens we plan to remove (due to being collapsed)
|
93
|
-
@permutations_scanned =
|
91
|
+
@permutations_scanned = Hash.new(0.0) # Track how many occurrences of each permutation we found in the original text
|
94
92
|
@combination_significance_threshold = 0.3 # The percent of occurrences of a combo of tokens to the occurrences of single tokens required to force collapsing
|
95
93
|
end
|
96
94
|
|
95
|
+
# :nodoc:
|
97
96
|
def delimiter_re
|
98
97
|
@delimiter_re ||= /#{@delimiter}+/
|
99
98
|
end
|
100
99
|
|
100
|
+
# :nodoc:
|
101
101
|
def collapse
|
102
102
|
# We make multiple passes at collapsing because after the first pass we may have
|
103
103
|
# replaced two or more singletons with a collapsed token, bumping up one or more
|
@@ -118,11 +118,7 @@ module TextRank
|
|
118
118
|
end
|
119
119
|
@tokens.reject! do |k, _|
|
120
120
|
@to_remove.include?(k)
|
121
|
-
end
|
122
|
-
|
123
|
-
# Because we've made changes to the tokens hash, we need to re-normalize so that
|
124
|
-
# the sum of all token ranks is still 1.
|
125
|
-
normalize(@tokens)
|
121
|
+
end || @tokens
|
126
122
|
end
|
127
123
|
|
128
124
|
# We need to be efficient about how we search for the large number of possible collapsed keywords.
|
@@ -204,14 +200,10 @@ module TextRank
|
|
204
200
|
total_single_count.zero? || (perm_count / total_single_count) > @combination_significance_threshold
|
205
201
|
end
|
206
202
|
|
207
|
-
# Scale all of the token ranks so they add up to 1.
|
208
|
-
def normalize(tokens)
|
209
|
-
total = tokens.reduce(0.0) { |s, (_, v)| s + v }
|
210
|
-
Hash[tokens.map { |k, v| [k, v / total] }.sort_by { |_, v| -v }]
|
211
|
-
end
|
212
|
-
|
213
203
|
end
|
214
204
|
|
205
|
+
private_constant :TokenCollapser
|
206
|
+
|
215
207
|
end
|
216
208
|
end
|
217
209
|
end
|
@@ -0,0 +1,53 @@
|
|
1
|
+
module TextRank
|
2
|
+
module RankFilter
|
3
|
+
##
|
4
|
+
# A rank filter which normalizes the ranked keywords so that the sum of the
|
5
|
+
# rank values is 1.0 (a "probability" normalization).
|
6
|
+
#
|
7
|
+
# = Example
|
8
|
+
#
|
9
|
+
# NormalizeProbability.new.filter!(
|
10
|
+
# {
|
11
|
+
# "town" => 0.6818754334834477,
|
12
|
+
# "cities" => 0.6055017128817066,
|
13
|
+
# "siege" => 0.5411519524982207,
|
14
|
+
# "arts" => 0.4907977453782612,
|
15
|
+
# "envy" => 0.4692709808107252,
|
16
|
+
# "blessings" => 0.4442147897516214,
|
17
|
+
# "plagues" => 0.3972420789430091,
|
18
|
+
# "florish" => 0.2746092797528525,
|
19
|
+
# "devoured" => 0.26867321734332237,
|
20
|
+
# "anxieties" => 0.2367731719604189,
|
21
|
+
# "peace" => 0.1905352582752693,
|
22
|
+
# "inhabitants" => 0.02715120116732137,
|
23
|
+
# }
|
24
|
+
# )
|
25
|
+
# => {
|
26
|
+
# "town" => 0.1473434248897056,
|
27
|
+
# "cities" => 0.13084016782478722,
|
28
|
+
# "siege" => 0.11693511476062682,
|
29
|
+
# "arts" => 0.10605429845557579,
|
30
|
+
# "envy" => 0.10140267579486278,
|
31
|
+
# "blessings" => 0.09598839508602595,
|
32
|
+
# "plagues" => 0.08583827125543537,
|
33
|
+
# "florish" => 0.0593390959673909,
|
34
|
+
# "devoured" => 0.058056398684529435,
|
35
|
+
# "anxieties" => 0.051163259981992296,
|
36
|
+
# "peace" => 0.041171915188530236,
|
37
|
+
# "inhabitants" => 0.005866982110537665,
|
38
|
+
# }
|
39
|
+
##
|
40
|
+
class NormalizeProbability
|
41
|
+
|
42
|
+
# Perform the filter on the ranks
|
43
|
+
# @param ranks [Hash<String, Float>] the results of the PageRank algorithm
|
44
|
+
# @return [Hash<String, Float>]
|
45
|
+
def filter!(ranks, **_)
|
46
|
+
return if ranks.empty?
|
47
|
+
total = ranks.values.reduce(:+)
|
48
|
+
Hash[ranks.map { |k, v| [k, v / total] }]
|
49
|
+
end
|
50
|
+
|
51
|
+
end
|
52
|
+
end
|
53
|
+
end
|
@@ -0,0 +1,54 @@
|
|
1
|
+
module TextRank
|
2
|
+
module RankFilter
|
3
|
+
##
|
4
|
+
# A rank filter which normalizes the ranked keywords so that the sum of the
|
5
|
+
# squares of the rank values is 1.0 (and thus the keyword rankings in an
|
6
|
+
# N-vector space is a unit vector).
|
7
|
+
#
|
8
|
+
# = Example
|
9
|
+
#
|
10
|
+
# NormalizeUnitVector.new.filter!(
|
11
|
+
# {
|
12
|
+
# "town" => 0.6818754334834477,
|
13
|
+
# "cities" => 0.6055017128817066,
|
14
|
+
# "siege" => 0.5411519524982207,
|
15
|
+
# "arts" => 0.4907977453782612,
|
16
|
+
# "envy" => 0.4692709808107252,
|
17
|
+
# "blessings" => 0.4442147897516214,
|
18
|
+
# "plagues" => 0.3972420789430091,
|
19
|
+
# "florish" => 0.2746092797528525,
|
20
|
+
# "devoured" => 0.26867321734332237,
|
21
|
+
# "anxieties" => 0.2367731719604189,
|
22
|
+
# "peace" => 0.1905352582752693,
|
23
|
+
# "inhabitants" => 0.02715120116732137,
|
24
|
+
# }
|
25
|
+
# )
|
26
|
+
# => {
|
27
|
+
# "town" => 0.4616807998499129,
|
28
|
+
# "cities" => 0.40997006401243896,
|
29
|
+
# "siege" => 0.3664004508761722,
|
30
|
+
# "arts" => 0.3323068767754191,
|
31
|
+
# "envy" => 0.317731642948694,
|
32
|
+
# "blessings" => 0.30076672272820315,
|
33
|
+
# "plagues" => 0.2689626751964553,
|
34
|
+
# "florish" => 0.18593107435301526,
|
35
|
+
# "devoured" => 0.1819119149778339,
|
36
|
+
# "anxieties" => 0.16031319218415677,
|
37
|
+
# "peace" => 0.12900665740478157,
|
38
|
+
# "inhabitants" => 0.01838339916101275,
|
39
|
+
# }
|
40
|
+
##
|
41
|
+
class NormalizeUnitVector
|
42
|
+
|
43
|
+
# Perform the filter on the ranks
|
44
|
+
# @param ranks [Hash<String, Float>] the results of the PageRank algorithm
|
45
|
+
# @return [Hash<String, Float>]
|
46
|
+
def filter!(ranks, **_)
|
47
|
+
return if ranks.empty?
|
48
|
+
total = Math.sqrt(ranks.values.map { |v| v * v }.reduce(:+))
|
49
|
+
Hash[ranks.map { |k, v| [k, v / total] }]
|
50
|
+
end
|
51
|
+
|
52
|
+
end
|
53
|
+
end
|
54
|
+
end
|
@@ -0,0 +1,22 @@
|
|
1
|
+
module TextRank
|
2
|
+
module RankFilter
|
3
|
+
##
|
4
|
+
# A rank filter which sorts the results by value
|
5
|
+
##
|
6
|
+
class SortByValue
|
7
|
+
|
8
|
+
# @param descending [boolean] whether to sort in descending order
|
9
|
+
def initialize(descending: true)
|
10
|
+
@descending = !!descending
|
11
|
+
end
|
12
|
+
|
13
|
+
# Perform the filter on the ranks
|
14
|
+
# @param ranks [Hash<String, Float>] the results of the PageRank algorithm
|
15
|
+
# @return [Hash<String, Float>]
|
16
|
+
def filter!(ranks, **_)
|
17
|
+
Hash[ranks.sort_by { |_, v| @descending ? -v : v }]
|
18
|
+
end
|
19
|
+
|
20
|
+
end
|
21
|
+
end
|
22
|
+
end
|
data/lib/text_rank/tokenizer.rb
CHANGED
@@ -8,12 +8,33 @@ module TextRank
|
|
8
8
|
# help inform its decision on which tokens to keep and which to drop. An example
|
9
9
|
# of this is the part of speech token filter which uses punctuation tokens to
|
10
10
|
# help guess the part of speech of each non-punctuation token.
|
11
|
+
#
|
12
|
+
# When tokenizing a piece of text, the Tokenizer will combine one or more
|
13
|
+
# regular expressions (in the order given) to scan the text for matches. As such
|
14
|
+
# you need only tell the tokenizer which tokens you want; everything else will
|
15
|
+
# be ignored.
|
11
16
|
##
|
12
17
|
module Tokenizer
|
13
18
|
|
14
|
-
autoload :
|
15
|
-
autoload :
|
16
|
-
autoload :
|
19
|
+
autoload :Money, 'text_rank/tokenizer/money'
|
20
|
+
autoload :Number, 'text_rank/tokenizer/number'
|
21
|
+
autoload :Punctuation, 'text_rank/tokenizer/punctuation'
|
22
|
+
autoload :Url, 'text_rank/tokenizer/url'
|
23
|
+
autoload :Whitespace, 'text_rank/tokenizer/whitespace'
|
24
|
+
autoload :Word, 'text_rank/tokenizer/word'
|
25
|
+
|
26
|
+
# Performs tokenization of piece of text by one or more tokenizer regular expressions.
|
27
|
+
# @param text [String]
|
28
|
+
# @param regular_expressions [Array<Regexp|String>]
|
29
|
+
# @return [Array<String>]
|
30
|
+
def self.tokenize(text, *regular_expressions)
|
31
|
+
tokens = []
|
32
|
+
text.scan(Regexp.new(regular_expressions.flatten.join('|'))) do |matches|
|
33
|
+
m = matches.compact.first
|
34
|
+
tokens << m if m && m.size > 0
|
35
|
+
end
|
36
|
+
tokens
|
37
|
+
end
|
17
38
|
|
18
39
|
end
|
19
40
|
end
|
@@ -0,0 +1,76 @@
|
|
1
|
+
#encoding: UTF-8
|
2
|
+
module TextRank
|
3
|
+
module Tokenizer
|
4
|
+
|
5
|
+
CURRENCY_SYMBOLS = '[' + [
|
6
|
+
"\u00a4", # Generic Currency Symbol
|
7
|
+
"\u0024", # Dollar Sign
|
8
|
+
"\u00a2", # Cent Sign
|
9
|
+
"\u00a3", # Pound Sterling
|
10
|
+
"\u00a5", # Yen Symbol
|
11
|
+
"\u20a3", # Franc Sign
|
12
|
+
"\u20a4", # Lira Symbol
|
13
|
+
"\u20a7", # Peseta Sign
|
14
|
+
"\u20ac", # Euro Symbol
|
15
|
+
"\u20B9", # Rupee
|
16
|
+
"\u20a9", # Won Sign
|
17
|
+
"\u20b4", # Hryvnia Sign
|
18
|
+
"\u20af", # Drachma Sign
|
19
|
+
"\u20ae", # Tugrik Sign
|
20
|
+
"\u20b0", # German Penny Sign
|
21
|
+
"\u20b2", # Guarani Sign
|
22
|
+
"\u20b1", # Peso Sign
|
23
|
+
"\u20b3", # Austral Sign
|
24
|
+
"\u20b5", # Cedi Sign
|
25
|
+
"\u20ad", # Kip Sign
|
26
|
+
"\u20aa", # New Sheqel Sign
|
27
|
+
"\u20ab", # Dong Sign
|
28
|
+
"\u0025", # Percent
|
29
|
+
"\u2030", # Per Million
|
30
|
+
].join + ']'
|
31
|
+
private_constant :CURRENCY_SYMBOLS # Do not expose this to avoid confusion
|
32
|
+
|
33
|
+
##
|
34
|
+
# A tokenizer regex that preserves money or formatted numbers as a single token. This
|
35
|
+
# currently supports 24 different currency symbols:
|
36
|
+
#
|
37
|
+
# * ¤
|
38
|
+
# * $
|
39
|
+
# * ¢
|
40
|
+
# * £
|
41
|
+
# * ¥
|
42
|
+
# * ₣
|
43
|
+
# * ₤
|
44
|
+
# * ₧
|
45
|
+
# * €
|
46
|
+
# * ₹
|
47
|
+
# * ₩
|
48
|
+
# * ₴
|
49
|
+
# * ₯
|
50
|
+
# * ₮
|
51
|
+
# * ₰
|
52
|
+
# * ₲
|
53
|
+
# * ₱
|
54
|
+
# * ₳
|
55
|
+
# * ₵
|
56
|
+
# * ₭
|
57
|
+
# * ₪
|
58
|
+
# * ₫
|
59
|
+
# * %
|
60
|
+
# * ‰
|
61
|
+
#
|
62
|
+
# It also supports two alternative formats for negatives as well as optional three digit comma
|
63
|
+
# separation and optional decimals.
|
64
|
+
##
|
65
|
+
Money = %r{
|
66
|
+
(
|
67
|
+
#{CURRENCY_SYMBOLS} \-? #{Number} # $-45,231.21
|
68
|
+
|
|
69
|
+
\-? #{CURRENCY_SYMBOLS} #{Number} # -$45,231.21
|
70
|
+
|
|
71
|
+
\( #{CURRENCY_SYMBOLS} #{Number} \) # ($45,231.21)
|
72
|
+
)
|
73
|
+
}x
|
74
|
+
|
75
|
+
end
|
76
|
+
end
|
@@ -0,0 +1,31 @@
|
|
1
|
+
#encoding: UTF-8
|
2
|
+
module TextRank
|
3
|
+
module Tokenizer
|
4
|
+
|
5
|
+
##
|
6
|
+
# A tokenizer regex that preserves (optionally formatted) numbers as a single token.
|
7
|
+
##
|
8
|
+
Number = %r{
|
9
|
+
(
|
10
|
+
[1-9]\d{0,2} # 453
|
11
|
+
(?:,\d{3})* # 453,231,162
|
12
|
+
(?:\.\d{0,2})? # 453,231,162.17
|
13
|
+
|
14
|
+
|
|
15
|
+
|
16
|
+
[1-9]\d* # 453231162
|
17
|
+
(?:\.\d{0,2})? # 453231162.17
|
18
|
+
|
19
|
+
|
|
20
|
+
|
21
|
+
0 # 0
|
22
|
+
(?:\.\d{0,2})? # 0.17
|
23
|
+
|
24
|
+
|
|
25
|
+
|
26
|
+
(?:\.\d{1,2}) # .17
|
27
|
+
)
|
28
|
+
}x
|
29
|
+
|
30
|
+
end
|
31
|
+
end
|
@@ -0,0 +1,11 @@
|
|
1
|
+
module TextRank
|
2
|
+
module Tokenizer
|
3
|
+
##
|
4
|
+
# A tokenizer regex that preserves single punctuation symbols as a token. Use
|
5
|
+
# this if one or more of your TokenFilter classes need punctuation in order to
|
6
|
+
# make decisions.
|
7
|
+
##
|
8
|
+
Punctuation = %r{([\p{Punct}])}
|
9
|
+
|
10
|
+
end
|
11
|
+
end
|
@@ -0,0 +1,21 @@
|
|
1
|
+
module TextRank
|
2
|
+
module Tokenizer
|
3
|
+
##
|
4
|
+
# A tokenizer regex that preserves entire URL's as a token (rather than split them up)
|
5
|
+
##
|
6
|
+
Url = %r{
|
7
|
+
(
|
8
|
+
(?:[\w-]+://?|www[.])
|
9
|
+
[^\s()<>]+
|
10
|
+
(?:
|
11
|
+
\([\w\d]+\)
|
12
|
+
|
|
13
|
+
(?:[^[:punct:]\s]
|
14
|
+
|
|
15
|
+
/)
|
16
|
+
)
|
17
|
+
)
|
18
|
+
}xi
|
19
|
+
|
20
|
+
end
|
21
|
+
end
|
@@ -1,19 +1,11 @@
|
|
1
1
|
module TextRank
|
2
2
|
module Tokenizer
|
3
3
|
##
|
4
|
-
#
|
5
|
-
#
|
6
|
-
#
|
7
|
-
#
|
8
|
-
# Whitespace.new.tokenize("i should:like to know:which is worse.")
|
9
|
-
# => ["i", "should:like", "to", "know:which", "is", "worse."]
|
4
|
+
# A tokenizer regex that preserves single whitespace characters as a token. Use
|
5
|
+
# this if one or more of your TokenFilter classes need whitespace in order to
|
6
|
+
# make decisions.
|
10
7
|
##
|
11
|
-
|
8
|
+
Whitespace = %r{\s}
|
12
9
|
|
13
|
-
def initialize
|
14
|
-
super(/\s+/)
|
15
|
-
end
|
16
|
-
|
17
|
-
end
|
18
10
|
end
|
19
11
|
end
|
@@ -0,0 +1,14 @@
|
|
1
|
+
module TextRank
|
2
|
+
module Tokenizer
|
3
|
+
##
|
4
|
+
# A tokenizer regex that preserves a non-space, non-punctuation "word". It does
|
5
|
+
# allow hyphens and numerals, but the first character must be an A-Z character.
|
6
|
+
##
|
7
|
+
Word = %r{
|
8
|
+
(
|
9
|
+
[a-z][a-z0-9-]*
|
10
|
+
)
|
11
|
+
}xi
|
12
|
+
|
13
|
+
end
|
14
|
+
end
|
data/lib/text_rank/version.rb
CHANGED
data/text_rank.gemspec
CHANGED
@@ -10,7 +10,7 @@ Gem::Specification.new do |spec|
|
|
10
10
|
spec.email = ['david.mccullars@gmail.com']
|
11
11
|
|
12
12
|
spec.summary = %q{Implementation of TextRank solution to ranked keyword extraction}
|
13
|
-
spec.description = %q{See https://
|
13
|
+
spec.description = %q{Implementation of TextRank solution to ranked keyword extraction. See https://web.eecs.umich.edu/~mihalcea/papers/mihalcea.emnlp04.pdf}
|
14
14
|
spec.homepage = 'https://github.com/david-mccullars/text_rank'
|
15
15
|
spec.license = 'MIT'
|
16
16
|
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: text_rank
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.1.1
|
4
|
+
version: 1.1.5
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- David McCullars
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2016-05-
|
11
|
+
date: 2016-05-23 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -108,7 +108,8 @@ dependencies:
|
|
108
108
|
- - "~>"
|
109
109
|
- !ruby/object:Gem::Version
|
110
110
|
version: '1.0'
|
111
|
-
description:
|
111
|
+
description: Implementation of TextRank solution to ranked keyword extraction. See
|
112
|
+
https://web.eecs.umich.edu/~mihalcea/papers/mihalcea.emnlp04.pdf
|
112
113
|
email:
|
113
114
|
- david.mccullars@gmail.com
|
114
115
|
executables: []
|
@@ -145,14 +146,20 @@ files:
|
|
145
146
|
- lib/text_rank/keyword_extractor.rb
|
146
147
|
- lib/text_rank/rank_filter.rb
|
147
148
|
- lib/text_rank/rank_filter/collapse_adjacent.rb
|
149
|
+
- lib/text_rank/rank_filter/normalize_probability.rb
|
150
|
+
- lib/text_rank/rank_filter/normalize_unit_vector.rb
|
151
|
+
- lib/text_rank/rank_filter/sort_by_value.rb
|
148
152
|
- lib/text_rank/token_filter.rb
|
149
153
|
- lib/text_rank/token_filter/min_length.rb
|
150
154
|
- lib/text_rank/token_filter/part_of_speech.rb
|
151
155
|
- lib/text_rank/token_filter/stopwords.rb
|
152
156
|
- lib/text_rank/tokenizer.rb
|
153
|
-
- lib/text_rank/tokenizer/
|
157
|
+
- lib/text_rank/tokenizer/money.rb
|
158
|
+
- lib/text_rank/tokenizer/number.rb
|
159
|
+
- lib/text_rank/tokenizer/punctuation.rb
|
160
|
+
- lib/text_rank/tokenizer/url.rb
|
154
161
|
- lib/text_rank/tokenizer/whitespace.rb
|
155
|
-
- lib/text_rank/tokenizer/
|
162
|
+
- lib/text_rank/tokenizer/word.rb
|
156
163
|
- lib/text_rank/version.rb
|
157
164
|
- text_rank.gemspec
|
158
165
|
homepage: https://github.com/david-mccullars/text_rank
|
@@ -180,4 +187,3 @@ signing_key:
|
|
180
187
|
specification_version: 4
|
181
188
|
summary: Implementation of TextRank solution to ranked keyword extraction
|
182
189
|
test_files: []
|
183
|
-
has_rdoc:
|
@@ -1,26 +0,0 @@
|
|
1
|
-
module TextRank
|
2
|
-
module Tokenizer
|
3
|
-
##
|
4
|
-
# Base tokenizer that tokenizes on any regular expression
|
5
|
-
#
|
6
|
-
# = Example
|
7
|
-
#
|
8
|
-
# Regex.new(/:/).tokenize("i should:like to know:which is worse.")
|
9
|
-
# => ["i should", "like to know", "which is worse"]
|
10
|
-
##
|
11
|
-
class Regex
|
12
|
-
|
13
|
-
# @param regex [Regexp] to use for string splitting
|
14
|
-
def initialize(regex)
|
15
|
-
@regex = regex
|
16
|
-
end
|
17
|
-
|
18
|
-
# @param text [String] string to tokenize
|
19
|
-
# return [Array<String>] non-empty tokens
|
20
|
-
def tokenize(text)
|
21
|
-
text.split(@regex) - ['']
|
22
|
-
end
|
23
|
-
|
24
|
-
end
|
25
|
-
end
|
26
|
-
end
|
@@ -1,26 +0,0 @@
|
|
1
|
-
module TextRank
|
2
|
-
module Tokenizer
|
3
|
-
##
|
4
|
-
# A tokenizer that preserves punctuation as their own tokens (which can be
|
5
|
-
# used, for example, by the [TokenFilter::PartOfSpeechBase] filter).
|
6
|
-
#
|
7
|
-
# = Example
|
8
|
-
#
|
9
|
-
# WordsAndPunctuation.new.tokenize("i should:like to know:which is worse.")
|
10
|
-
# => ["i", "should", ":", "like", "to", "know", ":", "which", "is", "worse", "."]
|
11
|
-
##
|
12
|
-
class WordsAndPunctuation < Regex
|
13
|
-
|
14
|
-
def initialize
|
15
|
-
super(/
|
16
|
-
([a-z][a-z0-9-]+)
|
17
|
-
|
|
18
|
-
([\p{Punct}])
|
19
|
-
|
|
20
|
-
\s+
|
21
|
-
/xi)
|
22
|
-
end
|
23
|
-
|
24
|
-
end
|
25
|
-
end
|
26
|
-
end
|