text_rank 1.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.codeclimate.yml +29 -0
- data/.gitignore +10 -0
- data/.rspec +2 -0
- data/.rubocop.yml +1157 -0
- data/.ruby-version +1 -0
- data/.travis.yml +7 -0
- data/CODE_OF_CONDUCT.md +49 -0
- data/Gemfile +3 -0
- data/LICENSE.txt +21 -0
- data/README.md +137 -0
- data/Rakefile +12 -0
- data/bin/console +14 -0
- data/bin/setup +8 -0
- data/lib/page_rank/base.rb +89 -0
- data/lib/page_rank/dense.rb +89 -0
- data/lib/page_rank/sparse.rb +87 -0
- data/lib/page_rank.rb +39 -0
- data/lib/text_rank/char_filter/ascii_folding.rb +26 -0
- data/lib/text_rank/char_filter/lowercase.rb +22 -0
- data/lib/text_rank/char_filter/strip_email.rb +24 -0
- data/lib/text_rank/char_filter/strip_html.rb +41 -0
- data/lib/text_rank/char_filter/strip_possessive.rb +24 -0
- data/lib/text_rank/char_filter/undo_contractions.rb +162 -0
- data/lib/text_rank/char_filter.rb +24 -0
- data/lib/text_rank/graph_strategy/coocurrence.rb +78 -0
- data/lib/text_rank/graph_strategy.rb +23 -0
- data/lib/text_rank/keyword_extractor.rb +155 -0
- data/lib/text_rank/rank_filter/collapse_adjacent.rb +81 -0
- data/lib/text_rank/rank_filter.rb +18 -0
- data/lib/text_rank/token_filter/min_length.rb +33 -0
- data/lib/text_rank/token_filter/part_of_speech.rb +45 -0
- data/lib/text_rank/token_filter/stopwords.rb +349 -0
- data/lib/text_rank/token_filter.rb +18 -0
- data/lib/text_rank/tokenizer/regex.rb +26 -0
- data/lib/text_rank/tokenizer/whitespace.rb +19 -0
- data/lib/text_rank/tokenizer/words_and_punctuation.rb +26 -0
- data/lib/text_rank/tokenizer.rb +19 -0
- data/lib/text_rank/version.rb +3 -0
- data/lib/text_rank.rb +34 -0
- data/text_rank.gemspec +30 -0
- metadata +183 -0
@@ -0,0 +1,24 @@
|
|
1
|
+
module TextRank
  module CharFilter
    ##
    # Character filter to remove the possessive suffix ("'s") from words.
    #
    # = Example
    #
    #   StripPossessive.new.filter!("to loathe one’s very being and yet to hold it fast")
    #   => "to loathe one very being and yet to hold it fast"
    ##
    class StripPossessive

      # Perform the filter, mutating the text in place.
      #
      # Accepts both the ASCII apostrophe (') and the typographic apostrophe
      # (’) — the documented example uses the curly form, which the previous
      # ASCII-only pattern could never match.
      #
      # @param text [String] text to modify in place
      # @return [String, nil] the modified text, or nil if nothing changed
      #   (filter chains call this as `filter!(t) || t`, so nil is safe)
      def filter!(text)
        text.gsub!(/([a-z]+)['’]s\b/) do
          Regexp.last_match(1)
        end
      end

    end
  end
end
|
@@ -0,0 +1,162 @@
|
|
1
|
+
module TextRank
  module CharFilter
    ##
    # Character filter to convert English contractions into their expanded form.
    #
    # NOTE: both the CONTRACTIONS keys and the matching regex are lower-case
    # only, so the text should already be lower-cased (e.g. by the Lowercase
    # filter) for capitalized contractions to be expanded.
    #
    # = Example
    #
    #   UndoContractions.new.filter!("you're a bitter man. that's because i've lived.")
    #   => "you are a bitter man. that has because i have lived."
    ##
    class UndoContractions

      # Mapping of contraction => expansion. Frozen so the shared constant
      # cannot be mutated accidentally by callers.
      CONTRACTIONS = {
        "ain't" => "am not",
        "amn't" => "am not",
        "aren't" => "are not",
        "can't" => "can not",
        "could've" => "could have",
        "couldn't" => "could not",
        "couldn't've" => "could not have",
        "didn't" => "did not",
        "doesn't" => "does not",
        "don't" => "do not",
        "gonna" => "going to",
        "hadn't" => "had not",
        "hadn't've" => "had not have",
        "hasn't" => "has not",
        "haven't" => "have not",
        "he'd" => "he had",
        "he'd've" => "he would have",
        "he'll" => "he shall",
        "he's" => "he has",
        "he'sn't" => "he has not",
        "how'd" => "how did",
        "how'll" => "how will",
        "how's" => "how has",
        "i'd" => "i had",
        "i'd've" => "i would have",
        "i'll" => "i shall",
        "i'm" => "i am",
        "i've" => "i have",
        "i'ven't" => "i have not",
        "isn't" => "is not",
        "it'd" => "it had",
        "it'd've" => "it would have",
        "it'll" => "it shall",
        "it's" => "it has",
        "it'sn't" => "it has not",
        "let's" => "let us",
        "ma'am" => "madam",
        "mightn't" => "might not",
        "mightn't've" => "might not have",
        "might've" => "might have",
        "mustn't" => "must not",
        "must've" => "must have",
        "needn't" => "need not",
        "not've" => "not have",
        "o'clock" => "of the clock",
        "ol'" => "old",
        "oughtn't" => "ought not",
        "shan't" => "shall not",
        "she'd" => "she had",
        "she'd've" => "she would have",
        "she'll" => "she shall",
        "she's" => "she has",
        "she'sn't" => "she has not",
        "should've" => "should have",
        "shouldn't" => "should not",
        "shouldn't've" => "should not have",
        "somebody'd" => "somebody had",
        "somebody'd've" => "somebody would have",
        "somebody'dn't've" => "somebody would not have",
        "somebody'll" => "somebody shall",
        "somebody's" => "somebody has",
        "someone'd" => "someone had",
        "someone'd've" => "someone would have",
        "someone'll" => "someone shall",
        "someone's" => "someone has",
        "something'd" => "something had",
        "something'd've" => "something would have",
        "something'll" => "something shall",
        "something's" => "something has",
        "'sup" => "what's up",
        "that'll" => "that will",
        "that's" => "that has",
        "there'd" => "there had",
        "there'd've" => "there would have",
        "there're" => "there are",
        "there's" => "there has",
        "they'd" => "they had",
        "they'dn't" => "they would not",
        "they'dn't've" => "they would not have",
        "they'd've" => "they would have",
        "they'd'ven't" => "they would have not",
        "they'll" => "they shall",
        "they'lln't've" => "they will not have",
        "they'll'ven't" => "they will have not",
        "they're" => "they are",
        "they've" => "they have",
        "they'ven't" => "they have not",
        "'tis" => "it is",
        "'twas" => "it was",
        "wanna" => "want to",
        "wasn't" => "was not",
        "we'd" => "we had",
        "we'd've" => "we would have",
        "we'dn't've" => "we would not have",
        "we'll" => "we will",
        "we'lln't've" => "we will not have",
        "we're" => "we are",
        "we've" => "we have",
        "weren't" => "were not",
        "what'll" => "what shall",
        "what're" => "what are",
        "what's" => "what has",
        "what've" => "what have",
        "when's" => "when has",
        "where'd" => "where did",
        "where's" => "where has",
        "where've" => "where have",
        "who'd" => "who would",
        "who'd've" => "who would have",
        "who'll" => "who shall",
        "who're" => "who are",
        "who's" => "who has",
        "who've" => "who have",
        "why'll" => "why will",
        "why're" => "why are",
        "why's" => "why has",
        "won't" => "will not",
        "won't've" => "will not have",
        "would've" => "would have",
        "wouldn't" => "would not",
        "wouldn't've" => "would not have",
        "y'all" => "you all",
        "y'all'd've" => "you all would have",
        "y'all'dn't've" => "you all would not have",
        "y'all'll" => "you all will",
        "y'all'lln't" => "you all will not",
        "y'all'll've" => "you all will have",
        "y'all'll'ven't" => "you all will have not",
        "you'd" => "you had",
        "you'd've" => "you would have",
        "you'll" => "you shall",
        "you're" => "you are",
        "you'ren't" => "you are not",
        "you've" => "you have",
        "you'ven't" => "you have not",
      }.freeze

      # Perform the filter, expanding every known contraction in place.
      # @param text [String] text to modify in place (expected lower-case)
      # @return [String, nil] the modified text, or nil if nothing changed
      #   (filter chains call this as `filter!(t) || t`, so nil is safe)
      def filter!(text)
        text.gsub!(/[a-z']+/) do |word|
          CONTRACTIONS[word] || word
        end
      end

    end
  end
end
|
@@ -0,0 +1,24 @@
|
|
1
|
+
module TextRank
  ##
  # Character filters pre-process text prior to tokenization. During this
  # phase the text is "cleaned up" so the tokenizer produces valid tokens.
  # Character filters should not attempt to remove undesired tokens — that is
  # the job of a token filter. Examples include folding non-ASCII characters
  # to ASCII, lower-casing, stripping HTML, and expanding English
  # contractions (e.g. "won't" => "will not").
  #
  # Character filters are applied as a chain, so care should be taken to use
  # them in the desired order.
  ##
  module CharFilter

    # Lazily load each filter implementation on first constant reference.
    {
      AsciiFolding:     'text_rank/char_filter/ascii_folding',
      Lowercase:        'text_rank/char_filter/lowercase',
      StripEmail:       'text_rank/char_filter/strip_email',
      StripHtml:        'text_rank/char_filter/strip_html',
      StripPossessive:  'text_rank/char_filter/strip_possessive',
      UndoContractions: 'text_rank/char_filter/undo_contractions',
    }.each do |const_name, path|
      autoload const_name, path
    end

  end
end
|
@@ -0,0 +1,78 @@
|
|
1
|
+
module TextRank
  module GraphStrategy
    ##
    # The original TextRank algorithm uses co-occurrence to build the keyword
    # graph: given a window size of N, every token within N positions of a
    # token is considered co-occurrent and an edge is drawn between them.
    #
    # This implementation deviates slightly from the original paper by
    # weighting each edge as 1.0 / distance_between_tokens.
    #
    # = Example
    #
    #   Coocurrence.new(ngram_size: 1).build_graph(%w[a b c], graph)
    #   # graph.add("a", "b", weight: 1.0)
    #   # graph.add("b", "a", weight: 1.0)
    #   # graph.add("b", "c", weight: 1.0)
    #   # graph.add("c", "b", weight: 1.0)
    ##
    class Coocurrence

      # @param ngram_size [Fixnum] window size around a token considered co-occurrence
      def initialize(ngram_size: 3, **_)
        @ngram_size = ngram_size
      end

      # Build a graph for which the PageRank algorithm will be applied.
      # For each token, every neighbor within +/- ngram_size positions gets
      # an edge weighted by the inverse of its distance.
      # @param tokens [Array<String>] filtered tokens from which to build a graph
      # @param graph [PageRank::Base] a PageRank graph into which to add nodes/edges
      # @return [nil]
      def build_graph(tokens, graph)
        tokens.each_with_index do |token, idx|
          (-@ngram_size..@ngram_size).each do |offset|
            next if offset.zero?              # a token is not its own neighbor
            neighbor_idx = idx + offset
            next if neighbor_idx < 0          # avoid Ruby's negative-index wraparound
            neighbor = tokens[neighbor_idx]
            graph.add(token, neighbor, weight: 1.0 / offset.abs) if neighbor
          end
        end
        nil
      end

    end
  end
end
|
@@ -0,0 +1,23 @@
|
|
1
|
+
module TextRank
  ##
  # The graph strategy is the heart of the TextRank algorithm. A strategy
  # turns a stream of candidate tokens into a graph of unique tokens in such
  # a way that PageRank produces meaningful results.
  #
  # The standard TextRank approach uses co-occurrence of tokens within a
  # fixed-size window, which should suffice for most applications, but many
  # variations exist, e.g.:
  #
  # * SingleRank
  # * ExpandRank
  # * ClusterRank
  #
  # @see http://www.hlt.utdallas.edu/~vince/papers/coling10-keyphrase.pdf
  ##
  module GraphStrategy

    # Lazily load the bundled strategy on first constant reference.
    autoload :Coocurrence, 'text_rank/graph_strategy/coocurrence'

  end
end
|
@@ -0,0 +1,155 @@
|
|
1
|
+
module TextRank
  ##
  # Primary class for keyword extraction. Acts as the hub which wires
  # together char filters, a tokenizer, token filters, a graph strategy, and
  # rank filters, then applies the TextRank (PageRank) algorithm.
  #
  # @see README
  ##
  class KeywordExtractor

    # Creates a "basic" keyword extractor with default options
    # @option (see #initialize)
    # @return [KeywordExtractor]
    def self.basic(**options)
      defaults = {
        char_filters: [:AsciiFolding, :Lowercase],
        tokenizer: :Whitespace,
        token_filters: [:Stopwords, :MinLength],
        graph_strategy: :Coocurrence,
      }
      new(**defaults.merge(options))
    end

    # Creates an "advanced" keyword extractor with a larger set of default filters
    # @option (see #initialize)
    # @return [KeywordExtractor]
    def self.advanced(**options)
      defaults = {
        char_filters: [:AsciiFolding, :Lowercase, :StripHtml, :StripEmail, :UndoContractions, :StripPossessive],
        tokenizer: :WordsAndPunctuation,
        token_filters: [:PartOfSpeech, :Stopwords, :MinLength],
        graph_strategy: :Coocurrence,
        rank_filters: [:CollapseAdjacent],
      }
      new(**defaults.merge(options))
    end

    # @option (see PageRank.new)
    # @option options [Array<Class, Symbol, #filter!>] :char_filters A list of filters to be applied prior to tokenization
    # @option options [Class, Symbol, #tokenize] :tokenizer A class or tokenizer instance to perform tokenization
    # @option options [Array<Class, Symbol, #filter!>] :token_filters A list of filters to be applied to each token after tokenization
    # @option options [Class, Symbol, #build_graph] :graph_strategy A class or strategy instance for producing a graph from tokens
    # @option options [Array<Class, Symbol, #filter!>] :rank_filters A list of filters to be applied to the keyword ranks after keyword extraction
    def initialize(**options)
      @page_rank_options = {
        strategy:  options[:strategy] || :sparse,
        damping:   options[:damping],
        tolerance: options[:tolerance],
      }
      @char_filters   = options[:char_filters] || []
      @tokenizer      = options[:tokenizer] || Tokenizer::Whitespace
      @token_filters  = options[:token_filters] || []
      @rank_filters   = options[:rank_filters] || []
      @graph_strategy = options[:graph_strategy] || GraphStrategy::Coocurrence
    end

    # Sets the tokenizer for producing tokens from filtered text
    # @return [Class, Symbol, #tokenize]
    attr_writer :tokenizer

    # Sets the graph strategy for producing a graph from tokens
    # @return [Class, Symbol, #build_graph]
    attr_writer :graph_strategy

    # Add a new CharFilter for processing text before tokenization
    # @param filter [Class, Symbol, #filter!] A filter to process text before tokenization
    # @param (see #add_into)
    # @return [nil]
    def add_char_filter(filter, **options)
      add_into(@char_filters, filter, **options)
      nil
    end

    # Add a new TokenFilter for processing tokens after tokenization
    # @param filter [Class, Symbol, #filter!] A filter to process tokens after tokenization
    # @param (see #add_into)
    # @return [nil]
    def add_token_filter(filter, **options)
      add_into(@token_filters, filter, **options)
      nil
    end

    # Add a new RankFilter for processing ranks after calculating
    # @param filter [Class, Symbol, #filter!] A filter to process ranks
    # @param (see #add_into)
    # @return [nil]
    def add_rank_filter(filter, **options)
      add_into(@rank_filters, filter, **options)
      nil
    end

    # Filters and tokenizes text
    # @param text [String] unfiltered text to be tokenized
    # @return [Array<String>] tokens
    def tokenize(text)
      filtered = apply_char_filters(text)
      raw_tokens = classify(@tokenizer, context: Tokenizer).tokenize(filtered)
      apply_token_filters(raw_tokens)
    end

    # Filter & tokenize text, and return PageRank
    # @param text [String] unfiltered text to be processed
    # @return [Hash<String, Float>] tokens and page ranks (in descending order)
    def extract(text, **options)
      tokens = tokenize(text)
      graph = PageRank.new(**@page_rank_options)
      classify(@graph_strategy, context: GraphStrategy).build_graph(tokens, graph)
      ranks = graph.calculate(**options)
      apply_rank_filters(ranks, tokens: tokens, original_text: text)
    end

    private

    # Run the char filter chain over a copy of the text. Destructive filters
    # may return nil (e.g. from gsub!), in which case the accumulator from
    # the previous step is carried forward.
    def apply_char_filters(text)
      @char_filters.reduce(text.clone) do |acc, filter|
        classify(filter, context: CharFilter).filter!(acc) || acc
      end
    end

    # Run the token filter chain over the token array (see note above about
    # nil returns from destructive filters).
    def apply_token_filters(tokens)
      @token_filters.reduce(tokens) do |acc, filter|
        classify(filter, context: TokenFilter).filter!(acc) || acc
      end
    end

    # Run the rank filter chain over the rank hash (see note above about nil
    # returns from destructive filters).
    def apply_rank_filters(ranks, **options)
      @rank_filters.reduce(ranks) do |acc, filter|
        classify(filter, context: RankFilter).filter!(acc, **options) || acc
      end
    end

    # Insert value into array, optionally before an existing item or at a
    # given index; defaults to appending.
    # @param before [Class, Symbol, Object] item to add before
    # @param at [Fixnum] index to insert new item
    def add_into(array, value, before: nil, at: nil)
      position = array.index(before) || at || -1
      array.insert(position, value)
    end

    # Resolve a filter/strategy/tokenizer reference into an instance:
    # a Class is instantiated, a Symbol is looked up in the given namespace
    # and instantiated, anything else is assumed to already be an instance.
    def classify(ref, context: self)
      if ref.is_a?(Class)
        ref.new
      elsif ref.is_a?(Symbol)
        context.const_get(ref).new
      else
        ref
      end
    end

  end
end
|
@@ -0,0 +1,81 @@
|
|
1
|
+
module TextRank
  module RankFilter
    ##
    # A rank filter which attempts to collapse highly ranked, single-token
    # keywords into a combined keyword when those keywords are adjacent to
    # each other in the original text.
    #
    # = Example
    #
    #   CollapseAdjacent.new(ranks_to_collapse: 6, max_tokens_to_combine: 2).filter!(
    #     { "town" => 0.98, "cities" => 0.90, "siege" => 0.74, ... },
    #     original_text: "cities blessings peace arts florish ... town siege"
    #   )
    #   => { "town siege" => 0.98, "cities blessings" => 0.90, ... }
    ##
    class CollapseAdjacent

      # @param ranks_to_collapse [Fixnum] the top N ranks in which to look for collapsable keywords
      # @param max_tokens_to_combine [Fixnum] the maximum number of tokens to collapse into a combined keyword
      # @param ignore_case [true, false] whether to ignore case when finding adjacent keywords in original text
      def initialize(ranks_to_collapse: 10, max_tokens_to_combine: 2, ignore_case: true, **_)
        @ranks_to_collapse = ranks_to_collapse
        @max_tokens_to_combine = max_tokens_to_combine
        @ignore_case = !!ignore_case
      end

      # Perform the filter on the ranks. Repeatedly collapses one adjacent
      # combination at a time (longest combinations first) until no more are
      # found, then merges the remaining single-token ranks back in.
      # @param ranks [Hash<String, Float>] the results of the PageRank algorithm (mutated in place)
      # @param original_text [String] the original text (pre-tokenization) from which to find collapsable keywords
      # @return [Hash<String, Float>] collapsed ranks, sorted by rank descending
      def filter!(ranks, original_text:, **_)
        collapsed = {}
        loop do
          candidates = ranks.keys.first(@ranks_to_collapse - collapsed.size)
          permutation = collapse_one(candidates, original_text) or break
          # The combined keyword inherits the best rank among its parts.
          collapsed[permutation.join(' ')] = ranks.values_at(*permutation).max
          permutation.each { |token| ranks.delete(token) }
        end
        collapsed.merge!(ranks)
        Hash[collapsed.sort_by { |_, v| -v }]
      end

      private

      # Find one permutation of tokens (largest size first) that appears
      # adjacently (separated by spaces) in the original text, or nil.
      # Tokens are Regexp.escape'd so that tokens containing regex
      # metacharacters (e.g. "c++") cannot break or distort the pattern.
      def collapse_one(tokens, original_text)
        (2..@max_tokens_to_combine).to_a.reverse_each do |tokens_to_combine|
          tokens.permutation(tokens_to_combine) do |permutation|
            re_options = 0
            re_options |= Regexp::IGNORECASE if @ignore_case
            pattern = permutation.map { |t| Regexp.escape(t) }.join(' +')
            re = Regexp.new("\\b#{pattern}\\b", re_options)
            return permutation if original_text =~ re
          end
        end
        nil
      end

    end
  end
end
|
@@ -0,0 +1,18 @@
|
|
1
|
+
module TextRank
  ##
  # Rank filters post-process the results of the PageRank algorithm: they can
  # filter, enhance, or modify the ranked keywords. A common use case is to
  # collapse highly ranked tokens which are adjacent in the original text.
  # Other filters might adjust the PageRank scores with an external modifier,
  # or remove undesired collapsed tokens (token filters only see single,
  # non-collapsed tokens).
  #
  # Rank filters are applied as a chain, so care should be taken to use them
  # in the desired order.
  ##
  module RankFilter

    # Lazily load the bundled filter on first constant reference.
    autoload :CollapseAdjacent, 'text_rank/rank_filter/collapse_adjacent'

  end
end
|
@@ -0,0 +1,33 @@
|
|
1
|
+
module TextRank
  module TokenFilter
    ##
    # Token filter which drops tokens shorter than a minimum length.
    #
    # = Example
    #
    #   MinLength.new(min_length: 6).filter!(%w[
    #     and ask each passenger to tell his story
    #   ])
    #   => ["passenger"]
    ##
    class MinLength

      # @param min_length [Fixnum] minimum size of token to keep
      def initialize(min_length: 3, **_)
        @min_length = min_length
      end

      # Perform the filter, removing short tokens from the array in place.
      # @param tokens [Array<String>]
      # @return [Array<String>] the same array instance, short tokens removed
      def filter!(tokens)
        tokens.keep_if { |word| word.size >= @min_length }
      end

    end
  end
end
|
@@ -0,0 +1,45 @@
|
|
1
|
+
require 'engtagger'
require 'set'

module TextRank
  module TokenFilter
    ##
    # Token filter which keeps only tokens whose engtagger-assigned part of
    # speech is in a configured allow-list.
    #
    # = Example
    #
    #   PartOfSpeech.new(parts_to_keep: %w[nn nns]).filter!(%w[
    #     all men are by nature free
    #   ])
    #   => ["men", "nature"]
    ##
    class PartOfSpeech

      # @param parts_to_keep [Array<String>] list of engtagger parts of speech to keep
      # @see https://github.com/yohasebe/engtagger#tag-set
      def initialize(parts_to_keep: %w[nn nnp nnps nns jj jjr jjs vb vbd vbg vbn vbp vbz], **_)
        @parts_to_keep = Set.new(parts_to_keep)
        @eng_tagger = EngTagger.new
        @last_pos_tag = 'pp' # seed context tag for tagging the first token
      end

      # Perform the filter, removing disallowed tokens from the array in place.
      # @param tokens [Array<String>]
      # @return [Array<String>] the same array, restricted to allowed parts of speech
      def filter!(tokens)
        tokens.keep_if { |token| @parts_to_keep.include?(pos_tag(token)) }
      end

      private

      # Tag a single token, using the previous token's tag as context.
      # Falls back (best-effort) to 'nn' (noun) when engtagger raises or
      # returns nothing; the result is remembered for the next token.
      def pos_tag(token)
        tag =
          begin
            @eng_tagger.assign_tag(@last_pos_tag, token)
          rescue StandardError
            nil
          end
        tag = 'nn' if tag.nil? || tag == ''
        @last_pos_tag = tag
      end

    end
  end
end
|