text_rank 1.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.codeclimate.yml +29 -0
- data/.gitignore +10 -0
- data/.rspec +2 -0
- data/.rubocop.yml +1157 -0
- data/.ruby-version +1 -0
- data/.travis.yml +7 -0
- data/CODE_OF_CONDUCT.md +49 -0
- data/Gemfile +3 -0
- data/LICENSE.txt +21 -0
- data/README.md +137 -0
- data/Rakefile +12 -0
- data/bin/console +14 -0
- data/bin/setup +8 -0
- data/lib/page_rank/base.rb +89 -0
- data/lib/page_rank/dense.rb +89 -0
- data/lib/page_rank/sparse.rb +87 -0
- data/lib/page_rank.rb +39 -0
- data/lib/text_rank/char_filter/ascii_folding.rb +26 -0
- data/lib/text_rank/char_filter/lowercase.rb +22 -0
- data/lib/text_rank/char_filter/strip_email.rb +24 -0
- data/lib/text_rank/char_filter/strip_html.rb +41 -0
- data/lib/text_rank/char_filter/strip_possessive.rb +24 -0
- data/lib/text_rank/char_filter/undo_contractions.rb +162 -0
- data/lib/text_rank/char_filter.rb +24 -0
- data/lib/text_rank/graph_strategy/coocurrence.rb +78 -0
- data/lib/text_rank/graph_strategy.rb +23 -0
- data/lib/text_rank/keyword_extractor.rb +155 -0
- data/lib/text_rank/rank_filter/collapse_adjacent.rb +81 -0
- data/lib/text_rank/rank_filter.rb +18 -0
- data/lib/text_rank/token_filter/min_length.rb +33 -0
- data/lib/text_rank/token_filter/part_of_speech.rb +45 -0
- data/lib/text_rank/token_filter/stopwords.rb +349 -0
- data/lib/text_rank/token_filter.rb +18 -0
- data/lib/text_rank/tokenizer/regex.rb +26 -0
- data/lib/text_rank/tokenizer/whitespace.rb +19 -0
- data/lib/text_rank/tokenizer/words_and_punctuation.rb +26 -0
- data/lib/text_rank/tokenizer.rb +19 -0
- data/lib/text_rank/version.rb +3 -0
- data/lib/text_rank.rb +34 -0
- data/text_rank.gemspec +30 -0
- metadata +183 -0
module TextRank
  module CharFilter
    ##
    # Character filter to remove the possessive suffix ("'s") from words.
    #
    # Handles both the straight ASCII apostrophe (') and the typographic
    # right single quotation mark (’), so the documented example below works
    # regardless of which apostrophe the source text uses.
    #
    # = Example
    #
    #   StripPossessive.new.filter!("to loathe one’s very being and yet to hold it fast")
    #   => "to loathe one very being and yet to hold it fast"
    ##
    class StripPossessive

      # Perform the filter, mutating the given text in place.
      # Expects lower-case input (intended to run after the Lowercase filter).
      # @param text [String]
      # @return [String, nil] the filtered text, or nil when no possessive was
      #   found (String#gsub! semantics; the filter chain applies `filter!(t) || t`)
      def filter!(text)
        text.gsub!(/([a-z]+)['’]s\b/) do
          $1
        end
      end

    end
  end
end
|
module TextRank
  module CharFilter
    ##
    # Character filter to convert English contractions into their expanded form.
    #
    # = Example
    #
    #   UndoContractions.new.filter!("You're a bitter man. That's because I've lived.")
    #   => "You are a bitter man. That is because I have lived."
    ##
    class UndoContractions

      # Map of contraction => expanded form. All keys are lower case, so this
      # filter assumes it runs after the Lowercase char filter (the regex in
      # #filter! likewise only matches lower-case letters).
      CONTRACTIONS = {
        "ain't"            => "am not",
        "amn't"            => "am not",
        "aren't"           => "are not",
        "can't"            => "can not",
        "could've"         => "could have",
        "couldn't"         => "could not",
        "couldn't've"      => "could not have",
        "didn't"           => "did not",
        "doesn't"          => "does not",
        "don't"            => "do not",
        "gonna"            => "going to",
        "hadn't"           => "had not",
        "hadn't've"        => "had not have",
        "hasn't"           => "has not",
        "haven't"          => "have not",
        "he'd"             => "he had",
        "he'd've"          => "he would have",
        "he'll"            => "he shall",
        "he's"             => "he has",
        "he'sn't"          => "he has not",
        "how'd"            => "how did",
        "how'll"           => "how will",
        "how's"            => "how has",
        "i'd"              => "i had",
        "i'd've"           => "i would have",
        "i'll"             => "i shall",
        "i'm"              => "i am",
        "i've"             => "i have",
        "i'ven't"          => "i have not",
        "isn't"            => "is not",
        "it'd"             => "it had",
        "it'd've"          => "it would have",
        "it'll"            => "it shall",
        "it's"             => "it has",
        "it'sn't"          => "it has not",
        "let's"            => "let us",
        "ma'am"            => "madam",
        "mightn't"         => "might not",
        "mightn't've"      => "might not have",
        "might've"         => "might have",
        "mustn't"          => "must not",
        "must've"          => "must have",
        "needn't"          => "need not",
        "not've"           => "not have",
        "o'clock"          => "of the clock",
        "ol'"              => "old",
        "oughtn't"         => "ought not",
        "shan't"           => "shall not",
        "she'd"            => "she had",
        "she'd've"         => "she would have",
        "she'll"           => "she shall",
        "she's"            => "she has",
        "she'sn't"         => "she has not",
        "should've"        => "should have",
        "shouldn't"        => "should not",
        "shouldn't've"     => "should not have",
        "somebody'd"       => "somebody had",
        "somebody'd've"    => "somebody would have",
        "somebody'dn't've" => "somebody would not have",
        "somebody'll"      => "somebody shall",
        "somebody's"       => "somebody has",
        "someone'd"        => "someone had",
        "someone'd've"     => "someone would have",
        "someone'll"       => "someone shall",
        "someone's"        => "someone has",
        "something'd"      => "something had",
        "something'd've"   => "something would have",
        "something'll"     => "something shall",
        "something's"      => "something has",
        "'sup"             => "what's up",
        "that'll"          => "that will",
        "that's"           => "that has",
        "there'd"          => "there had",
        "there'd've"       => "there would have",
        "there're"         => "there are",
        "there's"          => "there has",
        "they'd"           => "they had",
        "they'dn't"        => "they would not",
        "they'dn't've"     => "they would not have",
        "they'd've"        => "they would have",
        "they'd'ven't"     => "they would have not",
        "they'll"          => "they shall",
        "they'lln't've"    => "they will not have",
        "they'll'ven't"    => "they will have not",
        "they're"          => "they are",
        "they've"          => "they have",
        "they'ven't"       => "they have not",
        "'tis"             => "it is",
        "'twas"            => "it was",
        "wanna"            => "want to",
        "wasn't"           => "was not",
        "we'd"             => "we had",
        "we'd've"          => "we would have",
        "we'dn't've"       => "we would not have",
        "we'll"            => "we will",
        "we'lln't've"      => "we will not have",
        "we're"            => "we are",
        "we've"            => "we have",
        "weren't"          => "were not",
        "what'll"          => "what shall",
        "what're"          => "what are",
        "what's"           => "what has",
        "what've"          => "what have",
        "when's"           => "when has",
        "where'd"          => "where did",
        "where's"          => "where has",
        "where've"         => "where have",
        "who'd"            => "who would",
        "who'd've"         => "who would have",
        "who'll"           => "who shall",
        "who're"           => "who are",
        "who's"            => "who has",
        "who've"           => "who have",
        "why'll"           => "why will",
        "why're"           => "why are",
        "why's"            => "why has",
        "won't"            => "will not",
        "won't've"         => "will not have",
        "would've"         => "would have",
        "wouldn't"         => "would not",
        "wouldn't've"      => "would not have",
        "y'all"            => "you all",
        "y'all'd've"       => "you all would have",
        "y'all'dn't've"    => "you all would not have",
        "y'all'll"         => "you all will",
        "y'all'lln't"      => "you all will not",
        "y'all'll've"      => "you all will have",
        "y'all'll'ven't"   => "you all will have not",
        "you'd"            => "you had",
        "you'd've"         => "you would have",
        "you'll"           => "you shall",
        "you're"           => "you are",
        "you'ren't"        => "you are not",
        "you've"           => "you have",
        "you'ven't"        => "you have not",
      }

      # Perform the filter, mutating the given text in place. Each run of
      # lower-case letters and apostrophes is looked up in CONTRACTIONS and
      # replaced by its expansion when present; all other words pass through.
      # @param text [String]
      # @return [String, nil] filtered text, or nil when no substitution
      #   occurred (String#gsub! semantics; the filter chain applies `filter!(t) || t`)
      def filter!(text)
        text.gsub!(/[a-z']+/) do |word|
          CONTRACTIONS[word] || word
        end
      end

    end
  end
end
|
module TextRank
  ##
  # Character filters pre-process text prior to tokenization. It is during
  # this phase that the text should be "cleaned up" so that the tokenizer will
  # produce valid tokens. Character filters should not attempt to remove undesired
  # tokens, however. That is the job of the token filter. Examples include
  # converting non-ascii characters to related ascii characters, forcing text to
  # lower case, stripping out HTML, converting English contractions (e.g. "won't")
  # to the non-contracted form ("will not"), and more.
  #
  # Character filters are applied as a chain, so care should be taken to use them
  # in the desired order.
  ##
  module CharFilter

    # Each filter is autoloaded on first reference, so unused filters add no
    # load-time cost.
    autoload :AsciiFolding, 'text_rank/char_filter/ascii_folding'
    autoload :Lowercase, 'text_rank/char_filter/lowercase'
    autoload :StripEmail, 'text_rank/char_filter/strip_email'
    autoload :StripHtml, 'text_rank/char_filter/strip_html'
    autoload :StripPossessive, 'text_rank/char_filter/strip_possessive'
    autoload :UndoContractions, 'text_rank/char_filter/undo_contractions'

  end
end
|
module TextRank
  module GraphStrategy
    ##
    # Co-occurrence graph strategy, following the original TextRank algorithm:
    # given a window size of N, every other token at most N positions away from
    # a token is considered co-occurrent, and an edge is drawn between the two.
    #
    # This implementation departs slightly from the original paper by weighting
    # each edge with the reciprocal of the distance between the two tokens
    # (adjacent tokens get 1.0, tokens two apart get 0.5, and so on).
    #
    # = Example
    #   Coocurrence.new(ngram_size: 1).build_graph(%w[a b c], graph)
    #   # graph.add("a", "b", 1.0)
    #   # graph.add("b", "a", 1.0)
    #   # graph.add("b", "c", 1.0)
    #   # graph.add("c", "b", 1.0)
    ##
    class Coocurrence

      # @param ngram_size [Fixnum] window size around a token considered co-occurrence
      def initialize(ngram_size: 3, **_)
        @ngram_size = ngram_size
      end

      # Build a graph for which the PageRank algorithm will be applied
      # @param tokens [Array<String>] filtered tokens from which to build a graph
      # @param graph [PageRank::Base] a PageRank graph into which to add nodes/edges
      # @return [nil]
      def build_graph(tokens, graph)
        tokens.each_with_index do |source, position|
          (-@ngram_size..@ngram_size).each do |offset|
            next if offset.zero? # a token does not co-occur with itself

            neighbor_index = position + offset
            next if neighbor_index < 0 # negative indices would wrap to the array's end

            neighbor = tokens[neighbor_index]
            graph.add(source, neighbor, weight: 1.0 / offset.abs) if neighbor
          end
        end
        nil
      end

    end
  end
end
|
module TextRank
  ##
  # The graph strategy is the heart of the TextRank algorithm. Strategies
  # determine how a stream of potential tokens are transformed into a graph of
  # unique tokens in such a way that the PageRank algorithm provides meaningful
  # results.
  #
  # The standard TextRank approach uses co-occurence of tokens within a fixed-size
  # window, and that strategy will likely suffice for most applications. However,
  # there are many variations of TextRank, e.g.:
  #
  # * SingleRank
  # * ExpandRank
  # * ClusterRank
  #
  # @see http://www.hlt.utdallas.edu/~vince/papers/coling10-keyphrase.pdf
  ##
  module GraphStrategy

    # Autoloaded on first reference; the only built-in strategy.
    autoload :Coocurrence, 'text_rank/graph_strategy/coocurrence'

  end
end
|
@@ -0,0 +1,155 @@
|
|
1
|
+
module TextRank
|
2
|
+
##
|
3
|
+
# Primary class for keyword extraction and hub for filters, tokenizers, and
|
4
|
+
# graph strategies # that customize how the text is processed and how the
|
5
|
+
# TextRank algorithm is applied.
|
6
|
+
#
|
7
|
+
# @see README
|
8
|
+
##
|
9
|
+
class KeywordExtractor
|
10
|
+
|
11
|
+
# Creates a "basic" keyword extractor with default options
|
12
|
+
# @option (see #initialize)
|
13
|
+
# @return [KeywordExtractor]
|
14
|
+
def self.basic(**options)
|
15
|
+
new(**{
|
16
|
+
char_filters: [:AsciiFolding, :Lowercase],
|
17
|
+
tokenizer: :Whitespace,
|
18
|
+
token_filters: [:Stopwords, :MinLength],
|
19
|
+
graph_strategy: :Coocurrence,
|
20
|
+
}.merge(options))
|
21
|
+
end
|
22
|
+
|
23
|
+
# Creates an "advanced" keyword extractor with a larger set of default filters
|
24
|
+
# @option (see #initialize)
|
25
|
+
# @return [KeywordExtractor]
|
26
|
+
def self.advanced(**options)
|
27
|
+
new(**{
|
28
|
+
char_filters: [:AsciiFolding, :Lowercase, :StripHtml, :StripEmail, :UndoContractions, :StripPossessive],
|
29
|
+
tokenizer: :WordsAndPunctuation,
|
30
|
+
token_filters: [:PartOfSpeech, :Stopwords, :MinLength],
|
31
|
+
graph_strategy: :Coocurrence,
|
32
|
+
rank_filters: [:CollapseAdjacent],
|
33
|
+
}.merge(options))
|
34
|
+
end
|
35
|
+
|
36
|
+
# @option (see PageRank.new)
|
37
|
+
# @option options [Array<Class, Symbol, #filter!>] :char_filters A list of filters to be applied prior to tokenization
|
38
|
+
# @option options [Class, Symbol, #tokenize] :tokenizer A class or tokenizer instance to perform tokenization
|
39
|
+
# @option options [Array<Class, Symbol, #filter!>] :token_filters A list of filters to be applied to each token after tokenization
|
40
|
+
# @option options [Class, Symbol, #build_graph] :graph_strategy A class or strategy instance for producing a graph from tokens
|
41
|
+
# @option options [Array<Class, Symbol, #filter!>] :rank_filters A list of filters to be applied to the keyword ranks after keyword extraction
|
42
|
+
def initialize(**options)
|
43
|
+
@page_rank_options = {
|
44
|
+
strategy: options[:strategy] || :sparse,
|
45
|
+
damping: options[:damping],
|
46
|
+
tolerance: options[:tolerance],
|
47
|
+
}
|
48
|
+
@char_filters = options[:char_filters] || []
|
49
|
+
@tokenizer = options[:tokenizer] || Tokenizer::Whitespace
|
50
|
+
@token_filters = options[:token_filters] || []
|
51
|
+
@rank_filters = options[:rank_filters] || []
|
52
|
+
@graph_strategy = options[:graph_strategy] || GraphStrategy::Coocurrence
|
53
|
+
end
|
54
|
+
|
55
|
+
# Add a new CharFilter for processing text before tokenization
|
56
|
+
# @param filter [Class, Symbol, #filter!] A filter to process text before tokenization
|
57
|
+
# @param (see #add_into)
|
58
|
+
# @return [nil]
|
59
|
+
def add_char_filter(filter, **options)
|
60
|
+
add_into(@char_filters, filter, **options)
|
61
|
+
nil
|
62
|
+
end
|
63
|
+
|
64
|
+
# Sets the tokenizer for producing tokens from filtered text
|
65
|
+
# @param tokenizer [Class, Symbol, #tokenize] Tokenizer
|
66
|
+
# @return [Class, Symbol, #tokenize]
|
67
|
+
def tokenizer=(tokenizer)
|
68
|
+
@tokenizer = tokenizer
|
69
|
+
end
|
70
|
+
|
71
|
+
# Sets the graph strategy for producing a graph from tokens
|
72
|
+
# @param strategy [Class, Symbol, #build_graph] Strategy for producing a graph from tokens
|
73
|
+
# @return [Class, Symbol, #build_graph]
|
74
|
+
def graph_strategy=(strategy)
|
75
|
+
@graph_strategy = strategy
|
76
|
+
end
|
77
|
+
|
78
|
+
# Add a new TokenFilter for processing tokens after tokenization
|
79
|
+
# @param filter [Class, Symbol, #filter!] A filter to process tokens after tokenization
|
80
|
+
# @param (see #add_into)
|
81
|
+
# @return [nil]
|
82
|
+
def add_token_filter(filter, **options)
|
83
|
+
add_into(@token_filters, filter, **options)
|
84
|
+
nil
|
85
|
+
end
|
86
|
+
|
87
|
+
# Add a new RankFilter for processing ranks after calculating
|
88
|
+
# @param filter [Class, Symbol, #filter!] A filter to process ranks
|
89
|
+
# @param (see #add_into)
|
90
|
+
# @return [nil]
|
91
|
+
def add_rank_filter(filter, **options)
|
92
|
+
add_into(@rank_filters, filter, **options)
|
93
|
+
nil
|
94
|
+
end
|
95
|
+
|
96
|
+
# Filters and tokenizes text
|
97
|
+
# @param text [String] unfiltered text to be tokenized
|
98
|
+
# @return [Array<String>] tokens
|
99
|
+
def tokenize(text)
|
100
|
+
filtered_text = apply_char_filters(text)
|
101
|
+
tokens = classify(@tokenizer, context: Tokenizer).tokenize(filtered_text)
|
102
|
+
apply_token_filters(tokens)
|
103
|
+
end
|
104
|
+
|
105
|
+
# Filter & tokenize text, and return PageRank
|
106
|
+
# @param text [String] unfiltered text to be processed
|
107
|
+
# @return [Hash<String, Float>] tokens and page ranks (in descending order)
|
108
|
+
def extract(text, **options)
|
109
|
+
tokens = tokenize(text)
|
110
|
+
graph = PageRank.new(**@page_rank_options)
|
111
|
+
classify(@graph_strategy, context: GraphStrategy).build_graph(tokens, graph)
|
112
|
+
ranks = graph.calculate(**options)
|
113
|
+
apply_rank_filters(ranks, tokens: tokens, original_text: text)
|
114
|
+
end
|
115
|
+
|
116
|
+
private
|
117
|
+
|
118
|
+
def apply_char_filters(text)
|
119
|
+
@char_filters.reduce(text.clone) do |t, f|
|
120
|
+
classify(f, context: CharFilter).filter!(t) || t
|
121
|
+
end
|
122
|
+
end
|
123
|
+
|
124
|
+
def apply_token_filters(tokens)
|
125
|
+
@token_filters.reduce(tokens) do |t, f|
|
126
|
+
classify(f, context: TokenFilter).filter!(t) || t
|
127
|
+
end
|
128
|
+
end
|
129
|
+
|
130
|
+
def apply_rank_filters(ranks, **options)
|
131
|
+
@rank_filters.reduce(ranks) do |t, f|
|
132
|
+
classify(f, context: RankFilter).filter!(t, **options) || t
|
133
|
+
end
|
134
|
+
end
|
135
|
+
|
136
|
+
# @param before [Class, Symbol, Object] item to add before
|
137
|
+
# @param at [Fixnum] index to insert new item
|
138
|
+
def add_into(array, value, before: nil, at: nil)
|
139
|
+
idx = array.index(before) || at || -1
|
140
|
+
array.insert(idx, value)
|
141
|
+
end
|
142
|
+
|
143
|
+
def classify(c, context: self)
|
144
|
+
case c
|
145
|
+
when Class
|
146
|
+
c.new
|
147
|
+
when Symbol
|
148
|
+
context.const_get(c).new
|
149
|
+
else
|
150
|
+
c
|
151
|
+
end
|
152
|
+
end
|
153
|
+
|
154
|
+
end
|
155
|
+
end
|
module TextRank
  module RankFilter
    ##
    # A rank filter which attempts to collapse one of the highly ranked, single
    # token keywords into a combined keyword when those keywords are adjacent
    # to each other in the original text.
    #
    # = Example
    #
    #   CollapseAdjacent.new(ranks_to_collapse: 4, max_tokens_to_combine: 2).filter!(
    #     { "town" => 0.98, "siege" => 0.74, "peace" => 0.29 },
    #     original_text: "peace ... town siege"
    #   )
    #   => { "town siege" => 0.98, "peace" => 0.29 }
    ##
    class CollapseAdjacent

      # @param ranks_to_collapse [Fixnum] the top N ranks in which to look for collapsable keywords
      # @param max_tokens_to_combine [Fixnum] the maximum number of tokens to collapse into a combined keyword
      # @param ignore_case [true, false] whether to ignore case when finding adjacent keywords in original text
      def initialize(ranks_to_collapse: 10, max_tokens_to_combine: 2, ignore_case: true, **_)
        @ranks_to_collapse = ranks_to_collapse
        @max_tokens_to_combine = max_tokens_to_combine
        @ignore_case = !!ignore_case
      end

      # Perform the filter on the ranks (mutates the given hash).
      # @param ranks [Hash<String, Float>] the results of the PageRank algorithm
      # @param original_text [String] the original text (pre-tokenization) from which to find collapsable keywords
      # @return [Hash<String, Float>] collapsed keywords merged with the remaining
      #   single-token keywords, sorted by rank descending
      def filter!(ranks, original_text:, **_)
        collapsed = {}
        loop do
          # Only consider the (shrinking) top of the rank list each pass.
          permutation = collapse_one(ranks.keys.first(@ranks_to_collapse - collapsed.size), original_text) or break
          # A collapsed keyword inherits the highest rank among its parts.
          collapsed[permutation.join(' ')] = ranks.values_at(*permutation).max
          permutation.each { |token| ranks.delete(token) }
        end
        collapsed.merge!(ranks)
        Hash[collapsed.sort_by { |_, v| -v }]
      end

      private

      # Find one combination of the given tokens that appears adjacently in the
      # original text, preferring larger combinations first.
      # @return [Array<String>, nil] the matching token permutation, or nil
      def collapse_one(tokens, original_text)
        (2..@max_tokens_to_combine).to_a.reverse_each do |tokens_to_combine|
          tokens.permutation(tokens_to_combine) do |permutation|
            re_options = 0
            re_options |= Regexp::IGNORECASE if @ignore_case
            # Escape each token so regex metacharacters in a token (e.g. "c++",
            # "node.js") cannot raise a RegexpError or subvert the match.
            pattern = permutation.map { |token| Regexp.escape(token) }.join(' +')
            re = Regexp.new("\\b#{pattern}\\b", re_options)
            return permutation if original_text =~ re
          end
        end
        nil
      end

    end
  end
end
|
module TextRank
  ##
  # Rank filters are post-process filters which can filter, enhance, or modify
  # the results of the PageRank algorithm. A common use case is to collapse highly
  # ranked tokens which are found to be adjacent in the original text. Other
  # filters might modify the PageRank scores with some sort of external modifier.
  # Another use might be to remove collapsed tokens which are not desired (since
  # token filters only operate on a single, non-collapsed token).
  #
  # Rank filters are applied as a chain, so care should be taken to use them
  # in the desired order.
  ##
  module RankFilter

    # Autoloaded on first reference; the only built-in rank filter.
    autoload :CollapseAdjacent, 'text_rank/rank_filter/collapse_adjacent'

  end
end
|
module TextRank
  module TokenFilter
    ##
    # Token filter that discards tokens shorter than a configured minimum
    # length, keeping the rest in their original order.
    #
    # = Example
    #
    #   MinLength.new(min_length: 6).filter!(%w[
    #     and ask each passenger to tell his story and if there is one of them all who has not
    #     cursed his existence many times and said to himself over and over again that he was
    #     the most miserable of men i give you permission to throw me head-first into the sea
    #   ])
    #   => ["passenger", "cursed", "existence", "himself", "miserable", "permission", "head-first"]
    ##
    class MinLength

      # @param min_length [Fixnum] minimum size of token to keep
      def initialize(min_length: 3, **_)
        @min_length = min_length
      end

      # Perform the filter, mutating the given array in place.
      # @param tokens [Array<String>]
      # @return [Array<String>] the same array, with short tokens removed
      def filter!(tokens)
        tokens.keep_if { |word| word.length >= @min_length }
      end

    end
  end
end
|
require 'engtagger'
require 'set'

module TextRank
  module TokenFilter
    ##
    # Token filter to keep only a selected set of parts of speech
    #
    # = Example
    #
    #   PartOfSpeech.new(parts_to_keep: %w[nn nns]).filter!(%w[
    #     all men are by nature free
    #   ])
    #   => ["men", "nature"]
    ##
    class PartOfSpeech

      # @param parts_to_keep [Array<String>] list of engtagger parts of speech to keep
      # @see https://github.com/yohasebe/engtagger#tag-set
      def initialize(parts_to_keep: %w[nn nnp nnps nns jj jjr jjs vb vbd vbg vbn vbp vbz], **_)
        @parts_to_keep = Set.new(parts_to_keep)
        @eng_tagger = EngTagger.new
        # Initial tag context passed to assign_tag for the very first token;
        # presumably 'pp' stands in for "previous token" context — see engtagger docs.
        @last_pos_tag = 'pp'
      end

      # Perform the filter, mutating the given array in place.
      # Tagging is stateful: each token is tagged using the previous token's
      # tag as context, so token order affects the result.
      # @param tokens [Array<String>]
      # @return [Array<String>] the same array, reduced to kept parts of speech
      def filter!(tokens)
        tokens.keep_if do |token|
          @parts_to_keep.include?(pos_tag(token))
        end
      end

      private

      # Tag a single token, remembering the tag as context for the next call.
      # Returns the tag (the assignment's value).
      # NOTE(review): the inline `rescue nil` swallows ALL StandardErrors from
      # the tagger, not just tagging failures — consider a narrower rescue.
      # Untaggable tokens (nil or empty tag) default to 'nn' (noun).
      def pos_tag(token)
        tag = @eng_tagger.assign_tag(@last_pos_tag, token) rescue nil
        tag = 'nn' if tag.nil? || tag == ''
        @last_pos_tag = tag
      end

    end
  end
end
|