text_rank 1.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (42) hide show
  1. checksums.yaml +7 -0
  2. data/.codeclimate.yml +29 -0
  3. data/.gitignore +10 -0
  4. data/.rspec +2 -0
  5. data/.rubocop.yml +1157 -0
  6. data/.ruby-version +1 -0
  7. data/.travis.yml +7 -0
  8. data/CODE_OF_CONDUCT.md +49 -0
  9. data/Gemfile +3 -0
  10. data/LICENSE.txt +21 -0
  11. data/README.md +137 -0
  12. data/Rakefile +12 -0
  13. data/bin/console +14 -0
  14. data/bin/setup +8 -0
  15. data/lib/page_rank/base.rb +89 -0
  16. data/lib/page_rank/dense.rb +89 -0
  17. data/lib/page_rank/sparse.rb +87 -0
  18. data/lib/page_rank.rb +39 -0
  19. data/lib/text_rank/char_filter/ascii_folding.rb +26 -0
  20. data/lib/text_rank/char_filter/lowercase.rb +22 -0
  21. data/lib/text_rank/char_filter/strip_email.rb +24 -0
  22. data/lib/text_rank/char_filter/strip_html.rb +41 -0
  23. data/lib/text_rank/char_filter/strip_possessive.rb +24 -0
  24. data/lib/text_rank/char_filter/undo_contractions.rb +162 -0
  25. data/lib/text_rank/char_filter.rb +24 -0
  26. data/lib/text_rank/graph_strategy/coocurrence.rb +78 -0
  27. data/lib/text_rank/graph_strategy.rb +23 -0
  28. data/lib/text_rank/keyword_extractor.rb +155 -0
  29. data/lib/text_rank/rank_filter/collapse_adjacent.rb +81 -0
  30. data/lib/text_rank/rank_filter.rb +18 -0
  31. data/lib/text_rank/token_filter/min_length.rb +33 -0
  32. data/lib/text_rank/token_filter/part_of_speech.rb +45 -0
  33. data/lib/text_rank/token_filter/stopwords.rb +349 -0
  34. data/lib/text_rank/token_filter.rb +18 -0
  35. data/lib/text_rank/tokenizer/regex.rb +26 -0
  36. data/lib/text_rank/tokenizer/whitespace.rb +19 -0
  37. data/lib/text_rank/tokenizer/words_and_punctuation.rb +26 -0
  38. data/lib/text_rank/tokenizer.rb +19 -0
  39. data/lib/text_rank/version.rb +3 -0
  40. data/lib/text_rank.rb +34 -0
  41. data/text_rank.gemspec +30 -0
  42. metadata +183 -0
module TextRank
  module CharFilter
    ##
    # Character filter which strips the possessive suffix ("'s") from words,
    # leaving the bare noun behind. Only straight apostrophes are recognized;
    # convert curly quotes beforehand (e.g. with AsciiFolding) if needed.
    #
    # = Example
    #
    #   StripPossessive.new.filter!("to loathe one's very being and yet to hold it fast")
    #   => "to loathe one very being and yet to hold it fast"
    ##
    class StripPossessive

      # Perform the filter, mutating the text in place
      # @param text [String]
      # @return [String, nil] the filtered text, or nil if no possessive was found
      def filter!(text)
        text.gsub!(/([a-z]+)'s\b/, '\1')
      end

    end
  end
end
module TextRank
  module CharFilter
    ##
    # Character filter to convert English contractions into their expanded form.
    #
    # Note: the match regex only considers lower-case letters, so text should be
    # lower-cased first (e.g. by the Lowercase char filter).
    #
    # = Example
    #
    #   UndoContractions.new.filter!("you're a bitter man. that's because i've lived.")
    #   => "you are a bitter man. that has because i have lived."
    ##
    class UndoContractions

      # Map of contraction => expansion. Frozen so the lookup table cannot be
      # mutated accidentally at runtime.
      CONTRACTIONS = {
        "ain't" => "am not",
        "amn't" => "am not",
        "aren't" => "are not",
        "can't" => "can not",
        "could've" => "could have",
        "couldn't" => "could not",
        "couldn't've" => "could not have",
        "didn't" => "did not",
        "doesn't" => "does not",
        "don't" => "do not",
        "gonna" => "going to",
        "hadn't" => "had not",
        "hadn't've" => "had not have",
        "hasn't" => "has not",
        "haven't" => "have not",
        "he'd" => "he had",
        "he'd've" => "he would have",
        "he'll" => "he shall",
        "he's" => "he has",
        "he'sn't" => "he has not",
        "how'd" => "how did",
        "how'll" => "how will",
        "how's" => "how has",
        "i'd" => "i had",
        "i'd've" => "i would have",
        "i'll" => "i shall",
        "i'm" => "i am",
        "i've" => "i have",
        "i'ven't" => "i have not",
        "isn't" => "is not",
        "it'd" => "it had",
        "it'd've" => "it would have",
        "it'll" => "it shall",
        "it's" => "it has",
        "it'sn't" => "it has not",
        "let's" => "let us",
        "ma'am" => "madam",
        "mightn't" => "might not",
        "mightn't've" => "might not have",
        "might've" => "might have",
        "mustn't" => "must not",
        "must've" => "must have",
        "needn't" => "need not",
        "not've" => "not have",
        "o'clock" => "of the clock",
        "ol'" => "old",
        "oughtn't" => "ought not",
        "shan't" => "shall not",
        "she'd" => "she had",
        "she'd've" => "she would have",
        "she'll" => "she shall",
        "she's" => "she has",
        "she'sn't" => "she has not",
        "should've" => "should have",
        "shouldn't" => "should not",
        "shouldn't've" => "should not have",
        "somebody'd" => "somebody had",
        "somebody'd've" => "somebody would have",
        "somebody'dn't've" => "somebody would not have",
        "somebody'll" => "somebody shall",
        "somebody's" => "somebody has",
        "someone'd" => "someone had",
        "someone'd've" => "someone would have",
        "someone'll" => "someone shall",
        "someone's" => "someone has",
        "something'd" => "something had",
        "something'd've" => "something would have",
        "something'll" => "something shall",
        "something's" => "something has",
        "'sup" => "what's up",
        "that'll" => "that will",
        "that's" => "that has",
        "there'd" => "there had",
        "there'd've" => "there would have",
        "there're" => "there are",
        "there's" => "there has",
        "they'd" => "they had",
        "they'dn't" => "they would not",
        "they'dn't've" => "they would not have",
        "they'd've" => "they would have",
        "they'd'ven't" => "they would have not",
        "they'll" => "they shall",
        "they'lln't've" => "they will not have",
        "they'll'ven't" => "they will have not",
        "they're" => "they are",
        "they've" => "they have",
        "they'ven't" => "they have not",
        "'tis" => "it is",
        "'twas" => "it was",
        "wanna" => "want to",
        "wasn't" => "was not",
        "we'd" => "we had",
        "we'd've" => "we would have",
        "we'dn't've" => "we would not have",
        "we'll" => "we will",
        "we'lln't've" => "we will not have",
        "we're" => "we are",
        "we've" => "we have",
        "weren't" => "were not",
        "what'll" => "what shall",
        "what're" => "what are",
        "what's" => "what has",
        "what've" => "what have",
        "when's" => "when has",
        "where'd" => "where did",
        "where's" => "where has",
        "where've" => "where have",
        "who'd" => "who would",
        "who'd've" => "who would have",
        "who'll" => "who shall",
        "who're" => "who are",
        "who's" => "who has",
        "who've" => "who have",
        "why'll" => "why will",
        "why're" => "why are",
        "why's" => "why has",
        "won't" => "will not",
        "won't've" => "will not have",
        "would've" => "would have",
        "wouldn't" => "would not",
        "wouldn't've" => "would not have",
        "y'all" => "you all",
        "y'all'd've" => "you all would have",
        "y'all'dn't've" => "you all would not have",
        "y'all'll" => "you all will",
        "y'all'lln't" => "you all will not",
        "y'all'll've" => "you all will have",
        "y'all'll'ven't" => "you all will have not",
        "you'd" => "you had",
        "you'd've" => "you would have",
        "you'll" => "you shall",
        "you're" => "you are",
        "you'ren't" => "you are not",
        "you've" => "you have",
        "you'ven't" => "you have not",
      }.freeze

      # Perform the filter, mutating the text in place. Each run of lower-case
      # letters and apostrophes is looked up in CONTRACTIONS; unknown words are
      # left untouched.
      # @param text [String]
      # @return [String, nil] the filtered text, or nil if no word was replaced
      def filter!(text)
        text.gsub!(/[a-z']+/) do |word|
          CONTRACTIONS[word] || word
        end
      end

    end
  end
end
module TextRank
  ##
  # Character filters pre-process text prior to tokenization. It is during
  # this phase that the text should be "cleaned up" so that the tokenizer will
  # produce valid tokens. Character filters should not attempt to remove undesired
  # tokens, however — that is the job of the token filter. Examples include
  # converting non-ascii characters to related ascii characters, forcing text to
  # lower case, stripping out HTML, converting English contractions (e.g. "won't")
  # to the non-contracted form ("will not"), and more.
  #
  # Character filters are applied as a chain, so care should be taken to use them
  # in the desired order.
  ##
  module CharFilter

    # Lazily register each bundled filter so it is only loaded on first use.
    {
      AsciiFolding:     'text_rank/char_filter/ascii_folding',
      Lowercase:        'text_rank/char_filter/lowercase',
      StripEmail:       'text_rank/char_filter/strip_email',
      StripHtml:        'text_rank/char_filter/strip_html',
      StripPossessive:  'text_rank/char_filter/strip_possessive',
      UndoContractions: 'text_rank/char_filter/undo_contractions',
    }.each do |filter_const, path|
      autoload filter_const, path
    end

  end
end
module TextRank
  module GraphStrategy
    ##
    # Builds the co-occurrence graph used by the original TextRank algorithm.
    # Given a window size of N, any other token at most N positions away from
    # a token is considered co-occurrent, and a directed edge is drawn between
    # them. This implementation departs slightly from the original algorithm by
    # weighting each edge as 1/distance between the two tokens instead of a
    # constant. (Class name spelling is kept as-is for API compatibility.)
    #
    # = Example
    #   Coocurrence.new(ngram_size: 1).build_graph(%w[a b c], graph)
    #   # graph.add("a", "b", weight: 1.0)
    #   # graph.add("b", "a", weight: 1.0)
    #   # graph.add("b", "c", weight: 1.0)
    #   # graph.add("c", "b", weight: 1.0)
    ##
    class Coocurrence

      # @param ngram_size [Fixnum] Window size around a token considered co-occurrence
      def initialize(ngram_size: 3, **_)
        @ngram_size = ngram_size
      end

      # Build a graph for which the PageRank algorithm will be applied
      # @param tokens [Array<String>] filtered tokens from which to build a graph
      # @param graph [PageRank::Base] a PageRank graph into which to add nodes/edges
      # return [nil]
      def build_graph(tokens, graph)
        tokens.each_with_index do |token, idx|
          # Walk the window from -N to +N around the current token.
          (-@ngram_size..@ngram_size).each do |offset|
            next if offset.zero? # a token does not co-occur with itself

            neighbor_idx = idx + offset
            next if neighbor_idx < 0 # avoid wrapping around via negative indices

            neighbor = tokens[neighbor_idx]
            graph.add(token, neighbor, weight: 1.0 / offset.abs) if neighbor
          end
        end
        nil
      end

    end
  end
end
module TextRank
  ##
  # The graph strategy is the heart of the TextRank algorithm. A strategy
  # determines how a stream of potential tokens is transformed into a graph of
  # unique tokens in such a way that the PageRank algorithm produces meaningful
  # results.
  #
  # The standard TextRank approach uses co-occurrence of tokens within a
  # fixed-size window, and that strategy will likely suffice for most
  # applications. Many variations of TextRank exist, however, for example:
  #
  # * SingleRank
  # * ExpandRank
  # * ClusterRank
  #
  # @see http://www.hlt.utdallas.edu/~vince/papers/coling10-keyphrase.pdf
  ##
  module GraphStrategy

    # Lazily loaded on first reference.
    autoload(:Coocurrence, 'text_rank/graph_strategy/coocurrence')

  end
end
module TextRank
  ##
  # Primary class for keyword extraction and hub for filters, tokenizers, and
  # graph strategies that customize how the text is processed and how the
  # TextRank algorithm is applied.
  #
  # @see README
  ##
  class KeywordExtractor

    # Creates a "basic" keyword extractor with default options
    # @option (see #initialize)
    # @return [KeywordExtractor]
    def self.basic(**options)
      new(**{
        char_filters: [:AsciiFolding, :Lowercase],
        tokenizer: :Whitespace,
        token_filters: [:Stopwords, :MinLength],
        graph_strategy: :Coocurrence,
      }.merge(options))
    end

    # Creates an "advanced" keyword extractor with a larger set of default filters
    # @option (see #initialize)
    # @return [KeywordExtractor]
    def self.advanced(**options)
      new(**{
        char_filters: [:AsciiFolding, :Lowercase, :StripHtml, :StripEmail, :UndoContractions, :StripPossessive],
        tokenizer: :WordsAndPunctuation,
        token_filters: [:PartOfSpeech, :Stopwords, :MinLength],
        graph_strategy: :Coocurrence,
        rank_filters: [:CollapseAdjacent],
      }.merge(options))
    end

    # @option (see PageRank.new)
    # @option options [Array<Class, Symbol, #filter!>] :char_filters A list of filters to be applied prior to tokenization
    # @option options [Class, Symbol, #tokenize] :tokenizer A class or tokenizer instance to perform tokenization
    # @option options [Array<Class, Symbol, #filter!>] :token_filters A list of filters to be applied to each token after tokenization
    # @option options [Class, Symbol, #build_graph] :graph_strategy A class or strategy instance for producing a graph from tokens
    # @option options [Array<Class, Symbol, #filter!>] :rank_filters A list of filters to be applied to the keyword ranks after keyword extraction
    def initialize(**options)
      @page_rank_options = {
        strategy: options[:strategy] || :sparse,
        damping: options[:damping],
        tolerance: options[:tolerance],
      }
      @char_filters = options[:char_filters] || []
      @tokenizer = options[:tokenizer] || Tokenizer::Whitespace
      @token_filters = options[:token_filters] || []
      @rank_filters = options[:rank_filters] || []
      @graph_strategy = options[:graph_strategy] || GraphStrategy::Coocurrence
    end

    # Sets the tokenizer for producing tokens from filtered text
    # @return [Class, Symbol, #tokenize]
    attr_writer :tokenizer

    # Sets the graph strategy for producing a graph from tokens
    # @return [Class, Symbol, #build_graph]
    attr_writer :graph_strategy

    # Add a new CharFilter for processing text before tokenization
    # @param filter [Class, Symbol, #filter!] A filter to process text before tokenization
    # @param (see #add_into)
    # @return [nil]
    def add_char_filter(filter, **options)
      add_into(@char_filters, filter, **options)
      nil
    end

    # Add a new TokenFilter for processing tokens after tokenization
    # @param filter [Class, Symbol, #filter!] A filter to process tokens after tokenization
    # @param (see #add_into)
    # @return [nil]
    def add_token_filter(filter, **options)
      add_into(@token_filters, filter, **options)
      nil
    end

    # Add a new RankFilter for processing ranks after calculating
    # @param filter [Class, Symbol, #filter!] A filter to process ranks
    # @param (see #add_into)
    # @return [nil]
    def add_rank_filter(filter, **options)
      add_into(@rank_filters, filter, **options)
      nil
    end

    # Filters and tokenizes text
    # @param text [String] unfiltered text to be tokenized
    # @return [Array<String>] tokens
    def tokenize(text)
      filtered_text = apply_char_filters(text)
      tokens = classify(@tokenizer, context: Tokenizer).tokenize(filtered_text)
      apply_token_filters(tokens)
    end

    # Filter & tokenize text, and return PageRank
    # @param text [String] unfiltered text to be processed
    # @return [Hash<String, Float>] tokens and page ranks (in descending order)
    def extract(text, **options)
      tokens = tokenize(text)
      graph = PageRank.new(**@page_rank_options)
      classify(@graph_strategy, context: GraphStrategy).build_graph(tokens, graph)
      ranks = graph.calculate(**options)
      apply_rank_filters(ranks, tokens: tokens, original_text: text)
    end

    private

    # Run the char filter chain over a copy of the text. Each filter may mutate
    # in place and return nil (bang-method convention), hence the `|| t`.
    def apply_char_filters(text)
      @char_filters.reduce(text.clone) do |t, f|
        classify(f, context: CharFilter).filter!(t) || t
      end
    end

    # Run the token filter chain over the token array (same nil-return convention).
    def apply_token_filters(tokens)
      @token_filters.reduce(tokens) do |t, f|
        classify(f, context: TokenFilter).filter!(t) || t
      end
    end

    # Run the rank filter chain over the rank hash (same nil-return convention).
    def apply_rank_filters(ranks, **options)
      @rank_filters.reduce(ranks) do |t, f|
        classify(f, context: RankFilter).filter!(t, **options) || t
      end
    end

    # Insert a value into one of the filter chains.
    # @param before [Class, Symbol, Object] existing item to insert in front of
    # @param at [Fixnum] index to insert new item (used if `before` not found)
    # Defaults to appending at the end (insert at -1).
    def add_into(array, value, before: nil, at: nil)
      idx = array.index(before) || at || -1
      array.insert(idx, value)
    end

    # Resolve a filter/strategy/tokenizer reference into a usable instance:
    # a Class is instantiated, a Symbol is looked up in the given namespace
    # and instantiated, and anything else is assumed to already be an instance.
    def classify(c, context: self)
      case c
      when Class
        c.new
      when Symbol
        context.const_get(c).new
      else
        c
      end
    end

  end
end
module TextRank
  module RankFilter
    ##
    # A rank filter which attempts to collapse one of the highly ranked, single
    # token keywords into a combined keyword when those keywords are adjacent
    # to each other in the original text.
    #
    # = Example
    #
    #   CollapseAdjacent.new(ranks_to_collapse: 6, max_tokens_to_combine: 2).filter!(
    #     {
    #       "town"        => 0.9818754334834477,
    #       "cities"      => 0.9055017128817066,
    #       "siege"       => 0.7411519524982207,
    #       "arts"        => 0.6907977453782612,
    #       "envy"        => 0.6692709808107252,
    #       "blessings"   => 0.6442147897516214,
    #       "plagues"     => 0.5972420789430091,
    #       "florish"     => 0.3746092797528525,
    #       "devoured"    => 0.36867321734332237,
    #       "anxieties"   => 0.3367731719604189,
    #       "peace"       => 0.2905352582752693,
    #       "inhabitants" => 0.12715120116732137,
    #       "cares"       => 0.0697383057947685,
    #     },
    #     original_text: "cities blessings peace arts florish inhabitants devoured envy cares anxieties plagues town siege"
    #   )
    #   => {
    #     "town siege"        => 0.9818754334834477,
    #     "cities blessings"  => 0.9055017128817066,
    #     "arts florish"      => 0.6907977453782612,
    #     "devoured envy"     => 0.6692709808107252,
    #     "anxieties plagues" => 0.5972420789430091,
    #     "peace"             => 0.2905352582752693,
    #     "inhabitants"       => 0.12715120116732137,
    #     "cares"             => 0.0697383057947685,
    #   }
    ##
    class CollapseAdjacent

      # @param ranks_to_collapse [Fixnum] the top N ranks in which to look for collapsable keywords
      # @param max_tokens_to_combine [Fixnum] the maximum number of tokens to collapse into a combined keyword
      # @param ignore_case [true, false] whether to ignore case when finding adjacent keywords in original text
      def initialize(ranks_to_collapse: 10, max_tokens_to_combine: 2, ignore_case: true, **_)
        @ranks_to_collapse = ranks_to_collapse
        @max_tokens_to_combine = max_tokens_to_combine
        @ignore_case = !!ignore_case
      end

      # Perform the filter on the ranks. The collapsed keyword takes the
      # maximum rank among its component tokens, and the components are
      # removed (ranks is mutated). Results are returned in descending order.
      # @param ranks [Hash<String, Float>] the results of the PageRank algorithm
      # @param original_text [String] the original text (pre-tokenization) from which to find collapsable keywords
      # @return [Hash<String, Float>]
      def filter!(ranks, original_text:, **_)
        collapsed = {}
        loop do
          permutation = collapse_one(ranks.keys.first(@ranks_to_collapse - collapsed.size), original_text) or break
          collapsed[permutation.join(' ')] = ranks.values_at(*permutation).max
          permutation.each { |token| ranks.delete(token) }
        end
        collapsed.merge!(ranks)
        Hash[collapsed.sort_by { |_, v| -v }]
      end

      private

      # Find one combinable permutation of the given tokens, preferring longer
      # combinations. Returns the permutation found adjacent in the original
      # text, or nil if none.
      def collapse_one(tokens, original_text)
        (2..@max_tokens_to_combine).to_a.reverse_each do |tokens_to_combine|
          tokens.permutation(tokens_to_combine) do |permutation|
            re_options = 0
            re_options |= Regexp::IGNORECASE if @ignore_case
            # Escape each token: without this, a token containing regex
            # metacharacters (e.g. "c++") would raise RegexpError or match
            # incorrectly when interpolated into the pattern.
            escaped = permutation.map { |token| Regexp.escape(token) }
            re = Regexp.new("\\b#{escaped.join(" +")}\\b", re_options)
            return permutation if original_text =~ re
          end
        end
        nil
      end

    end
  end
end
module TextRank
  ##
  # Rank filters are post-process filters which can filter, enhance, or modify
  # the results of the PageRank algorithm. A common use case is to collapse
  # highly ranked tokens which are found to be adjacent in the original text.
  # Other filters might modify the PageRank scores with some sort of external
  # modifier, or remove collapsed tokens which are not desired (since token
  # filters only operate on a single, non-collapsed token).
  #
  # Rank filters are applied as a chain, so care should be taken to use them
  # in the desired order.
  ##
  module RankFilter

    # Lazily loaded on first reference.
    autoload(:CollapseAdjacent, 'text_rank/rank_filter/collapse_adjacent')

  end
end
module TextRank
  module TokenFilter
    ##
    # Token filter which drops tokens shorter than a minimum character length.
    #
    # = Example
    #
    #   MinLength.new(min_length: 6).filter!(%w[
    #     and ask each passenger to tell his story and if there is one of them all who has not
    #     cursed his existence many times and said to himself over and over again that he was
    #     the most miserable of men i give you permission to throw me head-first into the sea
    #   ])
    #   => ["passenger", "cursed", "existence", "himself", "miserable", "permission", "head-first"]
    ##
    class MinLength

      # @param min_length [Fixnum] minimum size of token to keep
      def initialize(min_length: 3, **_)
        @min_length = min_length
      end

      # Perform the filter, mutating the token array in place
      # @param tokens [Array<String>]
      # @return [Array<String>] the surviving tokens
      def filter!(tokens)
        tokens.delete_if { |word| word.length < @min_length }
      end

    end
  end
end
require 'engtagger'
require 'set'

module TextRank
  module TokenFilter
    ##
    # Token filter to keep only a selected set of parts of speech, as tagged
    # by the engtagger gem. Tokens whose tag is not in the keep-set are removed.
    #
    # = Example
    #
    #   PartOfSpeech.new(parts_to_keep: %w[nn nns]).filter!(%w[
    #     all men are by nature free
    #   ])
    #   => ["men", "nature"]
    ##
    class PartOfSpeech

      # @param parts_to_keep [Array<String>] list of engtagger parts of speech to keep
      #   (defaults cover nouns, adjectives, and verbs in the engtagger tag set)
      # @see https://github.com/yohasebe/engtagger#tag-set
      def initialize(parts_to_keep: %w[nn nnp nnps nns jj jjr jjs vb vbd vbg vbn vbp vbz], **_)
        # Set gives O(1) membership checks in the per-token filter loop.
        @parts_to_keep = Set.new(parts_to_keep)
        @eng_tagger = EngTagger.new
        # Seed tag for the first token; 'pp' is presumably engtagger's
        # sentence-boundary/punctuation context — TODO confirm against engtagger docs.
        @last_pos_tag = 'pp'
      end

      # Perform the filter, mutating the token array in place.
      # Tagging is stateful: each token is tagged in the context of the
      # previous token's tag (@last_pos_tag).
      # @param tokens [Array<String>]
      # @return [Array<String>] the tokens that survived the filter
      def filter!(tokens)
        tokens.keep_if do |token|
          @parts_to_keep.include?(pos_tag(token))
        end
      end

      private

      # Tag a single token, falling back to 'nn' (noun) when engtagger returns
      # nil/empty or raises. NOTE(review): the `rescue nil` modifier swallows
      # all StandardErrors from assign_tag — deliberate best-effort, but it
      # would also hide genuine bugs; consider narrowing.
      # Returns the tag (the assignment expression's value) and records it as
      # context for the next token.
      def pos_tag(token)
        tag = @eng_tagger.assign_tag(@last_pos_tag, token) rescue nil
        tag = 'nn' if tag.nil? || tag == ''
        @last_pos_tag = tag
      end

    end
  end
end