text_rank 1.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (42)
  1. checksums.yaml +7 -0
  2. data/.codeclimate.yml +29 -0
  3. data/.gitignore +10 -0
  4. data/.rspec +2 -0
  5. data/.rubocop.yml +1157 -0
  6. data/.ruby-version +1 -0
  7. data/.travis.yml +7 -0
  8. data/CODE_OF_CONDUCT.md +49 -0
  9. data/Gemfile +3 -0
  10. data/LICENSE.txt +21 -0
  11. data/README.md +137 -0
  12. data/Rakefile +12 -0
  13. data/bin/console +14 -0
  14. data/bin/setup +8 -0
  15. data/lib/page_rank/base.rb +89 -0
  16. data/lib/page_rank/dense.rb +89 -0
  17. data/lib/page_rank/sparse.rb +87 -0
  18. data/lib/page_rank.rb +39 -0
  19. data/lib/text_rank/char_filter/ascii_folding.rb +26 -0
  20. data/lib/text_rank/char_filter/lowercase.rb +22 -0
  21. data/lib/text_rank/char_filter/strip_email.rb +24 -0
  22. data/lib/text_rank/char_filter/strip_html.rb +41 -0
  23. data/lib/text_rank/char_filter/strip_possessive.rb +24 -0
  24. data/lib/text_rank/char_filter/undo_contractions.rb +162 -0
  25. data/lib/text_rank/char_filter.rb +24 -0
  26. data/lib/text_rank/graph_strategy/coocurrence.rb +78 -0
  27. data/lib/text_rank/graph_strategy.rb +23 -0
  28. data/lib/text_rank/keyword_extractor.rb +155 -0
  29. data/lib/text_rank/rank_filter/collapse_adjacent.rb +81 -0
  30. data/lib/text_rank/rank_filter.rb +18 -0
  31. data/lib/text_rank/token_filter/min_length.rb +33 -0
  32. data/lib/text_rank/token_filter/part_of_speech.rb +45 -0
  33. data/lib/text_rank/token_filter/stopwords.rb +349 -0
  34. data/lib/text_rank/token_filter.rb +18 -0
  35. data/lib/text_rank/tokenizer/regex.rb +26 -0
  36. data/lib/text_rank/tokenizer/whitespace.rb +19 -0
  37. data/lib/text_rank/tokenizer/words_and_punctuation.rb +26 -0
  38. data/lib/text_rank/tokenizer.rb +19 -0
  39. data/lib/text_rank/version.rb +3 -0
  40. data/lib/text_rank.rb +34 -0
  41. data/text_rank.gemspec +30 -0
  42. metadata +183 -0
@@ -0,0 +1,24 @@
1
module TextRank
  module CharFilter
    ##
    # Character filter to remove the possessive ("'s") suffix from words.
    #
    # = Example
    #
    #   StripPossessive.new.filter!("to loathe one's very being and yet to hold it fast")
    #   => "to loathe one very being and yet to hold it fast"
    ##
    class StripPossessive

      # Perform the filter, removing "'s" possessive suffixes in place.
      # Note: only the straight ASCII apostrophe is matched; convert curly
      # apostrophes with an earlier char filter (e.g. AsciiFolding) if needed.
      # @param text [String] text to modify (mutated in place)
      # @return [String] the filtered text (returned even when no change was made)
      def filter!(text)
        # String#gsub! returns nil when no substitution occurred; fall back to
        # the unmodified text so callers always receive a String.
        text.gsub!(/([a-z]+)'s\b/) { Regexp.last_match(1) } || text
      end

    end
  end
end
@@ -0,0 +1,162 @@
1
module TextRank
  module CharFilter
    ##
    # Character filter to convert English contractions into their expanded form.
    #
    # = Example
    #
    #   UndoContractions.new.filter!("you're a bitter man. that's because i've lived.")
    #   => "you are a bitter man. that has because i have lived."
    ##
    class UndoContractions

      # Map of lowercase contractions to their expanded forms. Only exact,
      # lowercase matches are expanded (run a Lowercase filter first if needed).
      CONTRACTIONS = {
        "ain't"            => "am not",
        "amn't"            => "am not",
        "aren't"           => "are not",
        "can't"            => "can not",
        "could've"         => "could have",
        "couldn't"         => "could not",
        "couldn't've"      => "could not have",
        "didn't"           => "did not",
        "doesn't"          => "does not",
        "don't"            => "do not",
        "gonna"            => "going to",
        "hadn't"           => "had not",
        "hadn't've"        => "had not have",
        "hasn't"           => "has not",
        "haven't"          => "have not",
        "he'd"             => "he had",
        "he'd've"          => "he would have",
        "he'll"            => "he shall",
        "he's"             => "he has",
        "he'sn't"          => "he has not",
        "how'd"            => "how did",
        "how'll"           => "how will",
        "how's"            => "how has",
        "i'd"              => "i had",
        "i'd've"           => "i would have",
        "i'll"             => "i shall",
        "i'm"              => "i am",
        "i've"             => "i have",
        "i'ven't"          => "i have not",
        "isn't"            => "is not",
        "it'd"             => "it had",
        "it'd've"          => "it would have",
        "it'll"            => "it shall",
        "it's"             => "it has",
        "it'sn't"          => "it has not",
        "let's"            => "let us",
        "ma'am"            => "madam",
        "mightn't"         => "might not",
        "mightn't've"      => "might not have",
        "might've"         => "might have",
        "mustn't"          => "must not",
        "must've"          => "must have",
        "needn't"          => "need not",
        "not've"           => "not have",
        "o'clock"          => "of the clock",
        "ol'"              => "old",
        "oughtn't"         => "ought not",
        "shan't"           => "shall not",
        "she'd"            => "she had",
        "she'd've"         => "she would have",
        "she'll"           => "she shall",
        "she's"            => "she has",
        "she'sn't"         => "she has not",
        "should've"        => "should have",
        "shouldn't"        => "should not",
        "shouldn't've"     => "should not have",
        "somebody'd"       => "somebody had",
        "somebody'd've"    => "somebody would have",
        "somebody'dn't've" => "somebody would not have",
        "somebody'll"      => "somebody shall",
        "somebody's"       => "somebody has",
        "someone'd"        => "someone had",
        "someone'd've"     => "someone would have",
        "someone'll"       => "someone shall",
        "someone's"        => "someone has",
        "something'd"      => "something had",
        "something'd've"   => "something would have",
        "something'll"     => "something shall",
        "something's"      => "something has",
        "'sup"             => "what's up",
        "that'll"          => "that will",
        "that's"           => "that has",
        "there'd"          => "there had",
        "there'd've"       => "there would have",
        "there're"         => "there are",
        "there's"          => "there has",
        "they'd"           => "they had",
        "they'dn't"        => "they would not",
        "they'dn't've"     => "they would not have",
        "they'd've"        => "they would have",
        "they'd'ven't"     => "they would have not",
        "they'll"          => "they shall",
        "they'lln't've"    => "they will not have",
        "they'll'ven't"    => "they will have not",
        "they're"          => "they are",
        "they've"          => "they have",
        "they'ven't"       => "they have not",
        "'tis"             => "it is",
        "'twas"            => "it was",
        "wanna"            => "want to",
        "wasn't"           => "was not",
        "we'd"             => "we had",
        "we'd've"          => "we would have",
        "we'dn't've"       => "we would not have",
        "we'll"            => "we will",
        "we'lln't've"      => "we will not have",
        "we're"            => "we are",
        "we've"            => "we have",
        "weren't"          => "were not",
        "what'll"          => "what shall",
        "what're"          => "what are",
        "what's"           => "what has",
        "what've"          => "what have",
        "when's"           => "when has",
        "where'd"          => "where did",
        "where's"          => "where has",
        "where've"         => "where have",
        "who'd"            => "who would",
        "who'd've"         => "who would have",
        "who'll"           => "who shall",
        "who're"           => "who are",
        "who's"            => "who has",
        "who've"           => "who have",
        "why'll"           => "why will",
        "why're"           => "why are",
        "why's"            => "why has",
        "won't"            => "will not",
        "won't've"         => "will not have",
        "would've"         => "would have",
        "wouldn't"         => "would not",
        "wouldn't've"      => "would not have",
        "y'all"            => "you all",
        "y'all'd've"       => "you all would have",
        "y'all'dn't've"    => "you all would not have",
        "y'all'll"         => "you all will",
        "y'all'lln't"      => "you all will not",
        "y'all'll've"      => "you all will have",
        "y'all'll'ven't"   => "you all will have not",
        "you'd"            => "you had",
        "you'd've"         => "you would have",
        "you'll"           => "you shall",
        "you're"           => "you are",
        "you'ren't"        => "you are not",
        "you've"           => "you have",
        "you'ven't"        => "you have not",
      }.freeze

      # Perform the filter, expanding known contractions in place.
      # @param text [String] text to modify (mutated in place)
      # @return [String] the filtered text (returned even when no change was made)
      def filter!(text)
        # String#gsub! returns nil when nothing matched at all; fall back to
        # the unmodified text so callers always receive a String.
        text.gsub!(/[a-z']+/) do |word|
          CONTRACTIONS[word] || word
        end || text
      end

    end
  end
end
@@ -0,0 +1,24 @@
1
module TextRank
  ##
  # Character filters pre-process raw text before tokenization. Their job is to
  # "clean up" the input so the tokenizer produces valid tokens — e.g. folding
  # non-ASCII characters to ASCII equivalents, lowercasing, stripping HTML, or
  # expanding English contractions ("won't" -> "will not"). Removing unwanted
  # tokens is *not* their job; that belongs to token filters.
  #
  # Filters run as an ordered chain, so choose their order deliberately.
  ##
  module CharFilter

    # Register each filter class for lazy loading on first constant reference.
    {
      AsciiFolding:    'text_rank/char_filter/ascii_folding',
      Lowercase:       'text_rank/char_filter/lowercase',
      StripEmail:      'text_rank/char_filter/strip_email',
      StripHtml:       'text_rank/char_filter/strip_html',
      StripPossessive: 'text_rank/char_filter/strip_possessive',
      UndoContractions: 'text_rank/char_filter/undo_contractions',
    }.each do |const_name, path|
      autoload const_name, path
    end

  end
end
@@ -0,0 +1,78 @@
1
module TextRank
  module GraphStrategy
    ##
    # Builds the co-occurrence graph used by the original TextRank algorithm.
    # Given a window size of N, every token at most N positions away from a
    # token is considered co-occurrent, and an edge is drawn between the two.
    #
    # As a slight departure from the original paper, each edge is weighted by
    # the reciprocal of the distance between the tokens: 1.0 for adjacent
    # tokens, 0.5 for tokens two positions apart, 1/3 for three apart, etc.
    #
    # = Example
    #   Coocurrence.new(ngram_size: 1).build_graph(%w[a b c], graph)
    #   # graph.add("a", "b", weight: 1.0)
    #   # graph.add("b", "a", weight: 1.0)
    #   # graph.add("b", "c", weight: 1.0)
    #   # graph.add("c", "b", weight: 1.0)
    ##
    class Coocurrence

      # @param ngram_size [Fixnum] Window size around a token considered co-occurrence
      def initialize(ngram_size: 3, **_)
        @ngram_size = ngram_size
      end

      # Build a graph for which the PageRank algorithm will be applied
      # @param tokens [Array<String>] filtered tokens from which to build a graph
      # @param graph [PageRank::Base] a PageRank graph into which to add nodes/edges
      # @return [nil]
      def build_graph(tokens, graph)
        window = @ngram_size * 2 + 1
        tokens.each_with_index do |source, position|
          window.times do |offset|
            # Skip the token itself, and any window slot that would fall
            # before the start of the token list.
            next if offset == @ngram_size || position + offset < @ngram_size
            neighbor = tokens[position - @ngram_size + offset]
            # Slots past the end of the list yield nil and are ignored.
            graph.add(source, neighbor, weight: 1.0 / (offset - @ngram_size).abs) if neighbor
          end
        end
        nil
      end

    end
  end
end
@@ -0,0 +1,23 @@
1
module TextRank
  ##
  # Namespace for graph strategies — the heart of TextRank. A strategy turns a
  # stream of candidate tokens into a graph of unique tokens structured so that
  # running PageRank over it yields meaningful keyword scores.
  #
  # The classic approach (co-occurrence within a fixed-size window) suffices
  # for most applications, but many TextRank variants exist, including:
  #
  # * SingleRank
  # * ExpandRank
  # * ClusterRank
  #
  # @see http://www.hlt.utdallas.edu/~vince/papers/coling10-keyphrase.pdf
  ##
  module GraphStrategy

    # Lazily load the default strategy on first constant reference.
    autoload :Coocurrence, 'text_rank/graph_strategy/coocurrence'

  end
end
@@ -0,0 +1,155 @@
1
module TextRank
  ##
  # Primary class for keyword extraction and hub for filters, tokenizers, and
  # graph strategies that customize how the text is processed and how the
  # TextRank algorithm is applied.
  #
  # @see README
  ##
  class KeywordExtractor

    # Creates a "basic" keyword extractor with default options
    # @option (see #initialize)
    # @return [KeywordExtractor]
    def self.basic(**options)
      new(**{
        char_filters: [:AsciiFolding, :Lowercase],
        tokenizer: :Whitespace,
        token_filters: [:Stopwords, :MinLength],
        graph_strategy: :Coocurrence,
      }.merge(options))
    end

    # Creates an "advanced" keyword extractor with a larger set of default filters
    # @option (see #initialize)
    # @return [KeywordExtractor]
    def self.advanced(**options)
      new(**{
        char_filters: [:AsciiFolding, :Lowercase, :StripHtml, :StripEmail, :UndoContractions, :StripPossessive],
        tokenizer: :WordsAndPunctuation,
        token_filters: [:PartOfSpeech, :Stopwords, :MinLength],
        graph_strategy: :Coocurrence,
        rank_filters: [:CollapseAdjacent],
      }.merge(options))
    end

    # @option (see PageRank.new)
    # @option options [Array<Class, Symbol, #filter!>] :char_filters A list of filters to be applied prior to tokenization
    # @option options [Class, Symbol, #tokenize] :tokenizer A class or tokenizer instance to perform tokenization
    # @option options [Array<Class, Symbol, #filter!>] :token_filters A list of filters to be applied to each token after tokenization
    # @option options [Class, Symbol, #build_graph] :graph_strategy A class or strategy instance for producing a graph from tokens
    # @option options [Array<Class, Symbol, #filter!>] :rank_filters A list of filters to be applied to the keyword ranks after keyword extraction
    def initialize(**options)
      # Options forwarded verbatim to PageRank.new at extraction time.
      @page_rank_options = {
        strategy: options[:strategy] || :sparse,
        damping: options[:damping],
        tolerance: options[:tolerance],
      }
      @char_filters = options[:char_filters] || []
      @tokenizer = options[:tokenizer] || Tokenizer::Whitespace
      @token_filters = options[:token_filters] || []
      @rank_filters = options[:rank_filters] || []
      @graph_strategy = options[:graph_strategy] || GraphStrategy::Coocurrence
    end

    # Sets the tokenizer for producing tokens from filtered text
    # @return [Class, Symbol, #tokenize]
    attr_writer :tokenizer

    # Sets the graph strategy for producing a graph from tokens
    # @return [Class, Symbol, #build_graph]
    attr_writer :graph_strategy

    # Add a new CharFilter for processing text before tokenization
    # @param filter [Class, Symbol, #filter!] A filter to process text before tokenization
    # @param (see #add_into)
    # @return [nil]
    def add_char_filter(filter, **options)
      add_into(@char_filters, filter, **options)
      nil
    end

    # Add a new TokenFilter for processing tokens after tokenization
    # @param filter [Class, Symbol, #filter!] A filter to process tokens after tokenization
    # @param (see #add_into)
    # @return [nil]
    def add_token_filter(filter, **options)
      add_into(@token_filters, filter, **options)
      nil
    end

    # Add a new RankFilter for processing ranks after calculating
    # @param filter [Class, Symbol, #filter!] A filter to process ranks
    # @param (see #add_into)
    # @return [nil]
    def add_rank_filter(filter, **options)
      add_into(@rank_filters, filter, **options)
      nil
    end

    # Filters and tokenizes text
    # @param text [String] unfiltered text to be tokenized
    # @return [Array<String>] tokens
    def tokenize(text)
      filtered_text = apply_char_filters(text)
      tokens = classify(@tokenizer, context: Tokenizer).tokenize(filtered_text)
      apply_token_filters(tokens)
    end

    # Filter & tokenize text, and return PageRank
    # @param text [String] unfiltered text to be processed
    # @return [Hash<String, Float>] tokens and page ranks (in descending order)
    def extract(text, **options)
      tokens = tokenize(text)
      graph = PageRank.new(**@page_rank_options)
      classify(@graph_strategy, context: GraphStrategy).build_graph(tokens, graph)
      ranks = graph.calculate(**options)
      apply_rank_filters(ranks, tokens: tokens, original_text: text)
    end

    private

    # Run the char filter chain over a copy of the text. Filters may mutate in
    # place and return nil (bang convention), hence the `|| t` fallback.
    def apply_char_filters(text)
      @char_filters.reduce(text.clone) do |t, f|
        classify(f, context: CharFilter).filter!(t) || t
      end
    end

    # Run the token filter chain; same nil-fallback convention as above.
    def apply_token_filters(tokens)
      @token_filters.reduce(tokens) do |t, f|
        classify(f, context: TokenFilter).filter!(t) || t
      end
    end

    # Run the rank filter chain; same nil-fallback convention as above.
    def apply_rank_filters(ranks, **options)
      @rank_filters.reduce(ranks) do |t, f|
        classify(f, context: RankFilter).filter!(t, **options) || t
      end
    end

    # Insert value into array: before a given element if found, else at the
    # given index, else appended at the end (insert(-1) appends).
    # @param before [Class, Symbol, Object] item to add before
    # @param at [Fixnum] index to insert new item
    def add_into(array, value, before: nil, at: nil)
      idx = array.index(before) || at || -1
      array.insert(idx, value)
    end

    # Resolve a Class (instantiate), Symbol (constant lookup in context, then
    # instantiate), or pass through an already-built instance.
    def classify(c, context: self)
      case c
      when Class
        c.new
      when Symbol
        context.const_get(c).new
      else
        c
      end
    end

  end
end
@@ -0,0 +1,81 @@
1
module TextRank
  module RankFilter
    ##
    # A rank filter which attempts to collapse highly ranked, single-token
    # keywords into a combined keyword when those keywords are adjacent
    # to each other in the original text.
    #
    # = Example
    #
    #   CollapseAdjacent.new(ranks_to_collapse: 4, max_tokens_to_combine: 2).filter!(
    #     { "town" => 0.98, "siege" => 0.74, "peace" => 0.29 },
    #     original_text: "the town siege began in peace"
    #   )
    #   => { "town siege" => 0.98, "peace" => 0.29 }
    ##
    class CollapseAdjacent

      # @param ranks_to_collapse [Fixnum] the top N ranks in which to look for collapsable keywords
      # @param max_tokens_to_combine [Fixnum] the maximum number of tokens to collapse into a combined keyword
      # @param ignore_case [true, false] whether to ignore case when finding adjacent keywords in original text
      def initialize(ranks_to_collapse: 10, max_tokens_to_combine: 2, ignore_case: true, **_)
        @ranks_to_collapse = ranks_to_collapse
        @max_tokens_to_combine = max_tokens_to_combine
        @ignore_case = !!ignore_case
      end

      # Perform the filter on the ranks. The ranks hash is mutated: collapsed
      # tokens are removed and replaced by their combined keyword, which keeps
      # the highest rank of its constituents.
      # @param ranks [Hash<String, Float>] the results of the PageRank algorithm
      # @param original_text [String] the original text (pre-tokenization) from which to find collapsable keywords
      # @return [Hash<String, Float>] collapsed ranks in descending rank order
      def filter!(ranks, original_text:, **_)
        collapsed = {}
        loop do
          permutation = collapse_one(ranks.keys.first(@ranks_to_collapse - collapsed.size), original_text) or break
          collapsed[permutation.join(' ')] = ranks.values_at(*permutation).max
          permutation.each { |token| ranks.delete(token) }
        end
        collapsed.merge!(ranks)
        Hash[collapsed.sort_by { |_, v| -v }]
      end

      private

      # Find the first (largest) combination of tokens that appears adjacently
      # in the original text, or nil if none does.
      def collapse_one(tokens, original_text)
        (2..@max_tokens_to_combine).to_a.reverse_each do |tokens_to_combine|
          tokens.permutation(tokens_to_combine) do |permutation|
            re_options = 0
            re_options |= Regexp::IGNORECASE if @ignore_case
            # Escape each token so regex metacharacters in a keyword (e.g.
            # ".", "+", "(") cannot break the pattern or falsely match.
            pattern = permutation.map { |token| Regexp.escape(token) }.join(" +")
            re = Regexp.new("\\b#{pattern}\\b", re_options)
            return permutation if original_text =~ re
          end
        end
        nil
      end

    end
  end
end
@@ -0,0 +1,18 @@
1
module TextRank
  ##
  # Namespace for rank filters: post-processing steps applied to the raw
  # PageRank results. Typical uses include collapsing highly ranked tokens
  # that are adjacent in the original text, adjusting scores with external
  # signals, or dropping unwanted collapsed keywords (token filters only ever
  # see single, non-collapsed tokens).
  #
  # Filters run as an ordered chain, so choose their order deliberately.
  ##
  module RankFilter

    # Lazily load the bundled filter on first constant reference.
    autoload :CollapseAdjacent, 'text_rank/rank_filter/collapse_adjacent'

  end
end
@@ -0,0 +1,33 @@
1
module TextRank
  module TokenFilter
    ##
    # Token filter that discards tokens shorter than a minimum length.
    #
    # = Example
    #
    #   MinLength.new(min_length: 6).filter!(%w[
    #     and ask each passenger to tell his story
    #   ])
    #   => ["passenger"]
    ##
    class MinLength

      # @param min_length [Fixnum] minimum size of token to keep
      def initialize(min_length: 3, **_)
        @min_length = min_length
      end

      # Perform the filter, mutating the token list in place.
      # @param tokens [Array<String>]
      # @return [Array<String>] the same array with short tokens removed
      def filter!(tokens)
        tokens.delete_if { |token| token.length < @min_length }
      end

    end
  end
end
@@ -0,0 +1,45 @@
1
require 'engtagger'
require 'set'

module TextRank
  module TokenFilter
    ##
    # Token filter to keep only a selected set of parts of speech
    #
    # = Example
    #
    #   PartOfSpeech.new(parts_to_keep: %w[nn nns]).filter!(%w[
    #     all men are by nature free
    #   ])
    #   => ["men", "nature"]
    ##
    class PartOfSpeech

      # @param parts_to_keep [Array<String>] list of engtagger parts of speech to keep
      # @see https://github.com/yohasebe/engtagger#tag-set
      def initialize(parts_to_keep: %w[nn nnp nnps nns jj jjr jjs vb vbd vbg vbn vbp vbz], **_)
        @parts_to_keep = Set.new(parts_to_keep)
        @eng_tagger = EngTagger.new
        # Seed the tagger's context with a preposition tag; updated after
        # every token so tagging is context-sensitive (and thus stateful).
        @last_pos_tag = 'pp'
      end

      # Perform the filter, mutating the token list in place.
      # @param tokens [Array<String>]
      # @return [Array<String>] tokens whose tag is in the keep set
      def filter!(tokens)
        tokens.keep_if do |token|
          @parts_to_keep.include?(pos_tag(token))
        end
      end

      private

      # Tag a single token, defaulting to noun ('nn') when the tagger fails or
      # returns nothing. Returns the tag and remembers it as context for the
      # next token.
      def pos_tag(token)
        tag = begin
          @eng_tagger.assign_tag(@last_pos_tag, token)
        rescue StandardError
          # Tagger errors are treated the same as "no tag": fall back to noun
          # below rather than aborting the whole filter chain.
          nil
        end
        tag = 'nn' if tag.nil? || tag == ''
        @last_pos_tag = tag
      end

    end
  end
end