text_rank 1.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.codeclimate.yml +29 -0
- data/.gitignore +10 -0
- data/.rspec +2 -0
- data/.rubocop.yml +1157 -0
- data/.ruby-version +1 -0
- data/.travis.yml +7 -0
- data/CODE_OF_CONDUCT.md +49 -0
- data/Gemfile +3 -0
- data/LICENSE.txt +21 -0
- data/README.md +137 -0
- data/Rakefile +12 -0
- data/bin/console +14 -0
- data/bin/setup +8 -0
- data/lib/page_rank/base.rb +89 -0
- data/lib/page_rank/dense.rb +89 -0
- data/lib/page_rank/sparse.rb +87 -0
- data/lib/page_rank.rb +39 -0
- data/lib/text_rank/char_filter/ascii_folding.rb +26 -0
- data/lib/text_rank/char_filter/lowercase.rb +22 -0
- data/lib/text_rank/char_filter/strip_email.rb +24 -0
- data/lib/text_rank/char_filter/strip_html.rb +41 -0
- data/lib/text_rank/char_filter/strip_possessive.rb +24 -0
- data/lib/text_rank/char_filter/undo_contractions.rb +162 -0
- data/lib/text_rank/char_filter.rb +24 -0
- data/lib/text_rank/graph_strategy/coocurrence.rb +78 -0
- data/lib/text_rank/graph_strategy.rb +23 -0
- data/lib/text_rank/keyword_extractor.rb +155 -0
- data/lib/text_rank/rank_filter/collapse_adjacent.rb +81 -0
- data/lib/text_rank/rank_filter.rb +18 -0
- data/lib/text_rank/token_filter/min_length.rb +33 -0
- data/lib/text_rank/token_filter/part_of_speech.rb +45 -0
- data/lib/text_rank/token_filter/stopwords.rb +349 -0
- data/lib/text_rank/token_filter.rb +18 -0
- data/lib/text_rank/tokenizer/regex.rb +26 -0
- data/lib/text_rank/tokenizer/whitespace.rb +19 -0
- data/lib/text_rank/tokenizer/words_and_punctuation.rb +26 -0
- data/lib/text_rank/tokenizer.rb +19 -0
- data/lib/text_rank/version.rb +3 -0
- data/lib/text_rank.rb +34 -0
- data/text_rank.gemspec +30 -0
- metadata +183 -0
module TextRank
  module CharFilter
    ##
    # Character filter to remove the possessive suffix ("'s") from words.
    #
    # Handles both the straight ASCII apostrophe (') and the typographic
    # right single quotation mark (’), so the documented example below works
    # regardless of which apostrophe the source text uses.
    #
    # = Example
    #
    #   StripPossessive.new.filter!("to loathe one’s very being and yet to hold it fast")
    #   => "to loathe one very being and yet to hold it fast"
    ##
    class StripPossessive

      # Perform the filter, mutating the given text in place.
      # Expects lower-case input (intended to run after the Lowercase filter).
      # @param text [String]
      # @return [String, nil] the filtered text, or nil when no possessive was
      #   found (String#gsub! semantics; the filter chain applies `filter!(t) || t`)
      def filter!(text)
        text.gsub!(/([a-z]+)['’]s\b/) do
          $1
        end
      end

    end
  end
end
|
module TextRank
  module CharFilter
    ##
    # Character filter to convert English contractions into their expanded form.
    #
    # = Example
    #
    #   UndoContractions.new.filter!("You're a bitter man. That's because I've lived.")
    #   => "You are a bitter man. That is because I have lived."
    ##
    class UndoContractions

      # Map of contraction => expanded form. All keys are lower case, so this
      # filter assumes it runs after the Lowercase char filter (the regex in
      # #filter! likewise only matches lower-case letters).
      CONTRACTIONS = {
        "ain't"            => "am not",
        "amn't"            => "am not",
        "aren't"           => "are not",
        "can't"            => "can not",
        "could've"         => "could have",
        "couldn't"         => "could not",
        "couldn't've"      => "could not have",
        "didn't"           => "did not",
        "doesn't"          => "does not",
        "don't"            => "do not",
        "gonna"            => "going to",
        "hadn't"           => "had not",
        "hadn't've"        => "had not have",
        "hasn't"           => "has not",
        "haven't"          => "have not",
        "he'd"             => "he had",
        "he'd've"          => "he would have",
        "he'll"            => "he shall",
        "he's"             => "he has",
        "he'sn't"          => "he has not",
        "how'd"            => "how did",
        "how'll"           => "how will",
        "how's"            => "how has",
        "i'd"              => "i had",
        "i'd've"           => "i would have",
        "i'll"             => "i shall",
        "i'm"              => "i am",
        "i've"             => "i have",
        "i'ven't"          => "i have not",
        "isn't"            => "is not",
        "it'd"             => "it had",
        "it'd've"          => "it would have",
        "it'll"            => "it shall",
        "it's"             => "it has",
        "it'sn't"          => "it has not",
        "let's"            => "let us",
        "ma'am"            => "madam",
        "mightn't"         => "might not",
        "mightn't've"      => "might not have",
        "might've"         => "might have",
        "mustn't"          => "must not",
        "must've"          => "must have",
        "needn't"          => "need not",
        "not've"           => "not have",
        "o'clock"          => "of the clock",
        "ol'"              => "old",
        "oughtn't"         => "ought not",
        "shan't"           => "shall not",
        "she'd"            => "she had",
        "she'd've"         => "she would have",
        "she'll"           => "she shall",
        "she's"            => "she has",
        "she'sn't"         => "she has not",
        "should've"        => "should have",
        "shouldn't"        => "should not",
        "shouldn't've"     => "should not have",
        "somebody'd"       => "somebody had",
        "somebody'd've"    => "somebody would have",
        "somebody'dn't've" => "somebody would not have",
        "somebody'll"      => "somebody shall",
        "somebody's"       => "somebody has",
        "someone'd"        => "someone had",
        "someone'd've"     => "someone would have",
        "someone'll"       => "someone shall",
        "someone's"        => "someone has",
        "something'd"      => "something had",
        "something'd've"   => "something would have",
        "something'll"     => "something shall",
        "something's"      => "something has",
        "'sup"             => "what's up",
        "that'll"          => "that will",
        "that's"           => "that has",
        "there'd"          => "there had",
        "there'd've"       => "there would have",
        "there're"         => "there are",
        "there's"          => "there has",
        "they'd"           => "they had",
        "they'dn't"        => "they would not",
        "they'dn't've"     => "they would not have",
        "they'd've"        => "they would have",
        "they'd'ven't"     => "they would have not",
        "they'll"          => "they shall",
        "they'lln't've"    => "they will not have",
        "they'll'ven't"    => "they will have not",
        "they're"          => "they are",
        "they've"          => "they have",
        "they'ven't"       => "they have not",
        "'tis"             => "it is",
        "'twas"            => "it was",
        "wanna"            => "want to",
        "wasn't"           => "was not",
        "we'd"             => "we had",
        "we'd've"          => "we would have",
        "we'dn't've"       => "we would not have",
        "we'll"            => "we will",
        "we'lln't've"      => "we will not have",
        "we're"            => "we are",
        "we've"            => "we have",
        "weren't"          => "were not",
        "what'll"          => "what shall",
        "what're"          => "what are",
        "what's"           => "what has",
        "what've"          => "what have",
        "when's"           => "when has",
        "where'd"          => "where did",
        "where's"          => "where has",
        "where've"         => "where have",
        "who'd"            => "who would",
        "who'd've"         => "who would have",
        "who'll"           => "who shall",
        "who're"           => "who are",
        "who's"            => "who has",
        "who've"           => "who have",
        "why'll"           => "why will",
        "why're"           => "why are",
        "why's"            => "why has",
        "won't"            => "will not",
        "won't've"         => "will not have",
        "would've"         => "would have",
        "wouldn't"         => "would not",
        "wouldn't've"      => "would not have",
        "y'all"            => "you all",
        "y'all'd've"       => "you all would have",
        "y'all'dn't've"    => "you all would not have",
        "y'all'll"         => "you all will",
        "y'all'lln't"      => "you all will not",
        "y'all'll've"      => "you all will have",
        "y'all'll'ven't"   => "you all will have not",
        "you'd"            => "you had",
        "you'd've"         => "you would have",
        "you'll"           => "you shall",
        "you're"           => "you are",
        "you'ren't"        => "you are not",
        "you've"           => "you have",
        "you'ven't"        => "you have not",
      }

      # Perform the filter, mutating the given text in place. Each run of
      # lower-case letters and apostrophes is looked up in CONTRACTIONS and
      # replaced by its expansion when present; all other words pass through.
      # @param text [String]
      # @return [String, nil] filtered text, or nil when no substitution
      #   occurred (String#gsub! semantics; the filter chain applies `filter!(t) || t`)
      def filter!(text)
        text.gsub!(/[a-z']+/) do |word|
          CONTRACTIONS[word] || word
        end
      end

    end
  end
end
|
module TextRank
  ##
  # Character filters pre-process text prior to tokenization. It is during
  # this phase that the text should be "cleaned up" so that the tokenizer will
  # produce valid tokens. Character filters should not attempt to remove undesired
  # tokens, however. That is the job of the token filter. Examples include
  # converting non-ascii characters to related ascii characters, forcing text to
  # lower case, stripping out HTML, converting English contractions (e.g. "won't")
  # to the non-contracted form ("will not"), and more.
  #
  # Character filters are applied as a chain, so care should be taken to use them
  # in the desired order.
  ##
  module CharFilter

    # Each filter is autoloaded on first reference, so unused filters add no
    # load-time cost.
    autoload :AsciiFolding, 'text_rank/char_filter/ascii_folding'
    autoload :Lowercase, 'text_rank/char_filter/lowercase'
    autoload :StripEmail, 'text_rank/char_filter/strip_email'
    autoload :StripHtml, 'text_rank/char_filter/strip_html'
    autoload :StripPossessive, 'text_rank/char_filter/strip_possessive'
    autoload :UndoContractions, 'text_rank/char_filter/undo_contractions'

  end
end
|
module TextRank
  module GraphStrategy
    ##
    # Co-occurrence graph strategy, following the original TextRank algorithm:
    # given a window size of N, every other token at most N positions away from
    # a token is considered co-occurrent, and an edge is drawn between the two.
    #
    # This implementation departs slightly from the original paper by weighting
    # each edge with the reciprocal of the distance between the two tokens
    # (adjacent tokens get 1.0, tokens two apart get 0.5, and so on).
    #
    # = Example
    #   Coocurrence.new(ngram_size: 1).build_graph(%w[a b c], graph)
    #   # graph.add("a", "b", 1.0)
    #   # graph.add("b", "a", 1.0)
    #   # graph.add("b", "c", 1.0)
    #   # graph.add("c", "b", 1.0)
    ##
    class Coocurrence

      # @param ngram_size [Fixnum] window size around a token considered co-occurrence
      def initialize(ngram_size: 3, **_)
        @ngram_size = ngram_size
      end

      # Build a graph for which the PageRank algorithm will be applied
      # @param tokens [Array<String>] filtered tokens from which to build a graph
      # @param graph [PageRank::Base] a PageRank graph into which to add nodes/edges
      # @return [nil]
      def build_graph(tokens, graph)
        tokens.each_with_index do |source, position|
          (-@ngram_size..@ngram_size).each do |offset|
            next if offset.zero? # a token does not co-occur with itself

            neighbor_index = position + offset
            next if neighbor_index < 0 # negative indices would wrap to the array's end

            neighbor = tokens[neighbor_index]
            graph.add(source, neighbor, weight: 1.0 / offset.abs) if neighbor
          end
        end
        nil
      end

    end
  end
end
|
module TextRank
  ##
  # The graph strategy is the heart of the TextRank algorithm. Strategies
  # determine how a stream of potential tokens are transformed into a graph of
  # unique tokens in such a way that the PageRank algorithm provides meaningful
  # results.
  #
  # The standard TextRank approach uses co-occurence of tokens within a fixed-size
  # window, and that strategy will likely suffice for most applications. However,
  # there are many variations of TextRank, e.g.:
  #
  # * SingleRank
  # * ExpandRank
  # * ClusterRank
  #
  # @see http://www.hlt.utdallas.edu/~vince/papers/coling10-keyphrase.pdf
  ##
  module GraphStrategy

    # Autoloaded on first reference; the only built-in strategy.
    autoload :Coocurrence, 'text_rank/graph_strategy/coocurrence'

  end
end
|
@@ -0,0 +1,155 @@
|
|
1
|
+
module TextRank
|
2
|
+
##
|
3
|
+
# Primary class for keyword extraction and hub for filters, tokenizers, and
|
4
|
+
# graph strategies # that customize how the text is processed and how the
|
5
|
+
# TextRank algorithm is applied.
|
6
|
+
#
|
7
|
+
# @see README
|
8
|
+
##
|
9
|
+
class KeywordExtractor
|
10
|
+
|
11
|
+
# Creates a "basic" keyword extractor with default options
|
12
|
+
# @option (see #initialize)
|
13
|
+
# @return [KeywordExtractor]
|
14
|
+
def self.basic(**options)
|
15
|
+
new(**{
|
16
|
+
char_filters: [:AsciiFolding, :Lowercase],
|
17
|
+
tokenizer: :Whitespace,
|
18
|
+
token_filters: [:Stopwords, :MinLength],
|
19
|
+
graph_strategy: :Coocurrence,
|
20
|
+
}.merge(options))
|
21
|
+
end
|
22
|
+
|
23
|
+
# Creates an "advanced" keyword extractor with a larger set of default filters
|
24
|
+
# @option (see #initialize)
|
25
|
+
# @return [KeywordExtractor]
|
26
|
+
def self.advanced(**options)
|
27
|
+
new(**{
|
28
|
+
char_filters: [:AsciiFolding, :Lowercase, :StripHtml, :StripEmail, :UndoContractions, :StripPossessive],
|
29
|
+
tokenizer: :WordsAndPunctuation,
|
30
|
+
token_filters: [:PartOfSpeech, :Stopwords, :MinLength],
|
31
|
+
graph_strategy: :Coocurrence,
|
32
|
+
rank_filters: [:CollapseAdjacent],
|
33
|
+
}.merge(options))
|
34
|
+
end
|
35
|
+
|
36
|
+
# @option (see PageRank.new)
|
37
|
+
# @option options [Array<Class, Symbol, #filter!>] :char_filters A list of filters to be applied prior to tokenization
|
38
|
+
# @option options [Class, Symbol, #tokenize] :tokenizer A class or tokenizer instance to perform tokenization
|
39
|
+
# @option options [Array<Class, Symbol, #filter!>] :token_filters A list of filters to be applied to each token after tokenization
|
40
|
+
# @option options [Class, Symbol, #build_graph] :graph_strategy A class or strategy instance for producing a graph from tokens
|
41
|
+
# @option options [Array<Class, Symbol, #filter!>] :rank_filters A list of filters to be applied to the keyword ranks after keyword extraction
|
42
|
+
def initialize(**options)
|
43
|
+
@page_rank_options = {
|
44
|
+
strategy: options[:strategy] || :sparse,
|
45
|
+
damping: options[:damping],
|
46
|
+
tolerance: options[:tolerance],
|
47
|
+
}
|
48
|
+
@char_filters = options[:char_filters] || []
|
49
|
+
@tokenizer = options[:tokenizer] || Tokenizer::Whitespace
|
50
|
+
@token_filters = options[:token_filters] || []
|
51
|
+
@rank_filters = options[:rank_filters] || []
|
52
|
+
@graph_strategy = options[:graph_strategy] || GraphStrategy::Coocurrence
|
53
|
+
end
|
54
|
+
|
55
|
+
# Add a new CharFilter for processing text before tokenization
|
56
|
+
# @param filter [Class, Symbol, #filter!] A filter to process text before tokenization
|
57
|
+
# @param (see #add_into)
|
58
|
+
# @return [nil]
|
59
|
+
def add_char_filter(filter, **options)
|
60
|
+
add_into(@char_filters, filter, **options)
|
61
|
+
nil
|
62
|
+
end
|
63
|
+
|
64
|
+
# Sets the tokenizer for producing tokens from filtered text
|
65
|
+
# @param tokenizer [Class, Symbol, #tokenize] Tokenizer
|
66
|
+
# @return [Class, Symbol, #tokenize]
|
67
|
+
def tokenizer=(tokenizer)
|
68
|
+
@tokenizer = tokenizer
|
69
|
+
end
|
70
|
+
|
71
|
+
# Sets the graph strategy for producing a graph from tokens
|
72
|
+
# @param strategy [Class, Symbol, #build_graph] Strategy for producing a graph from tokens
|
73
|
+
# @return [Class, Symbol, #build_graph]
|
74
|
+
def graph_strategy=(strategy)
|
75
|
+
@graph_strategy = strategy
|
76
|
+
end
|
77
|
+
|
78
|
+
# Add a new TokenFilter for processing tokens after tokenization
|
79
|
+
# @param filter [Class, Symbol, #filter!] A filter to process tokens after tokenization
|
80
|
+
# @param (see #add_into)
|
81
|
+
# @return [nil]
|
82
|
+
def add_token_filter(filter, **options)
|
83
|
+
add_into(@token_filters, filter, **options)
|
84
|
+
nil
|
85
|
+
end
|
86
|
+
|
87
|
+
# Add a new RankFilter for processing ranks after calculating
|
88
|
+
# @param filter [Class, Symbol, #filter!] A filter to process ranks
|
89
|
+
# @param (see #add_into)
|
90
|
+
# @return [nil]
|
91
|
+
def add_rank_filter(filter, **options)
|
92
|
+
add_into(@rank_filters, filter, **options)
|
93
|
+
nil
|
94
|
+
end
|
95
|
+
|
96
|
+
# Filters and tokenizes text
|
97
|
+
# @param text [String] unfiltered text to be tokenized
|
98
|
+
# @return [Array<String>] tokens
|
99
|
+
def tokenize(text)
|
100
|
+
filtered_text = apply_char_filters(text)
|
101
|
+
tokens = classify(@tokenizer, context: Tokenizer).tokenize(filtered_text)
|
102
|
+
apply_token_filters(tokens)
|
103
|
+
end
|
104
|
+
|
105
|
+
# Filter & tokenize text, and return PageRank
|
106
|
+
# @param text [String] unfiltered text to be processed
|
107
|
+
# @return [Hash<String, Float>] tokens and page ranks (in descending order)
|
108
|
+
def extract(text, **options)
|
109
|
+
tokens = tokenize(text)
|
110
|
+
graph = PageRank.new(**@page_rank_options)
|
111
|
+
classify(@graph_strategy, context: GraphStrategy).build_graph(tokens, graph)
|
112
|
+
ranks = graph.calculate(**options)
|
113
|
+
apply_rank_filters(ranks, tokens: tokens, original_text: text)
|
114
|
+
end
|
115
|
+
|
116
|
+
private
|
117
|
+
|
118
|
+
def apply_char_filters(text)
|
119
|
+
@char_filters.reduce(text.clone) do |t, f|
|
120
|
+
classify(f, context: CharFilter).filter!(t) || t
|
121
|
+
end
|
122
|
+
end
|
123
|
+
|
124
|
+
def apply_token_filters(tokens)
|
125
|
+
@token_filters.reduce(tokens) do |t, f|
|
126
|
+
classify(f, context: TokenFilter).filter!(t) || t
|
127
|
+
end
|
128
|
+
end
|
129
|
+
|
130
|
+
def apply_rank_filters(ranks, **options)
|
131
|
+
@rank_filters.reduce(ranks) do |t, f|
|
132
|
+
classify(f, context: RankFilter).filter!(t, **options) || t
|
133
|
+
end
|
134
|
+
end
|
135
|
+
|
136
|
+
# @param before [Class, Symbol, Object] item to add before
|
137
|
+
# @param at [Fixnum] index to insert new item
|
138
|
+
def add_into(array, value, before: nil, at: nil)
|
139
|
+
idx = array.index(before) || at || -1
|
140
|
+
array.insert(idx, value)
|
141
|
+
end
|
142
|
+
|
143
|
+
def classify(c, context: self)
|
144
|
+
case c
|
145
|
+
when Class
|
146
|
+
c.new
|
147
|
+
when Symbol
|
148
|
+
context.const_get(c).new
|
149
|
+
else
|
150
|
+
c
|
151
|
+
end
|
152
|
+
end
|
153
|
+
|
154
|
+
end
|
155
|
+
end
|
module TextRank
  module RankFilter
    ##
    # A rank filter which attempts to collapse one of the highly ranked, single
    # token keywords into a combined keyword when those keywords are adjacent
    # to each other in the original text.
    #
    # = Example
    #
    #   CollapseAdjacent.new(ranks_to_collapse: 4, max_tokens_to_combine: 2).filter!(
    #     { "town" => 0.98, "siege" => 0.74, "peace" => 0.29 },
    #     original_text: "peace ... town siege"
    #   )
    #   => { "town siege" => 0.98, "peace" => 0.29 }
    ##
    class CollapseAdjacent

      # @param ranks_to_collapse [Fixnum] the top N ranks in which to look for collapsable keywords
      # @param max_tokens_to_combine [Fixnum] the maximum number of tokens to collapse into a combined keyword
      # @param ignore_case [true, false] whether to ignore case when finding adjacent keywords in original text
      def initialize(ranks_to_collapse: 10, max_tokens_to_combine: 2, ignore_case: true, **_)
        @ranks_to_collapse = ranks_to_collapse
        @max_tokens_to_combine = max_tokens_to_combine
        @ignore_case = !!ignore_case
      end

      # Perform the filter on the ranks (mutates the given hash).
      # @param ranks [Hash<String, Float>] the results of the PageRank algorithm
      # @param original_text [String] the original text (pre-tokenization) from which to find collapsable keywords
      # @return [Hash<String, Float>] collapsed keywords merged with the remaining
      #   single-token keywords, sorted by rank descending
      def filter!(ranks, original_text:, **_)
        collapsed = {}
        loop do
          # Only consider the (shrinking) top of the rank list each pass.
          permutation = collapse_one(ranks.keys.first(@ranks_to_collapse - collapsed.size), original_text) or break
          # A collapsed keyword inherits the highest rank among its parts.
          collapsed[permutation.join(' ')] = ranks.values_at(*permutation).max
          permutation.each { |token| ranks.delete(token) }
        end
        collapsed.merge!(ranks)
        Hash[collapsed.sort_by { |_, v| -v }]
      end

      private

      # Find one combination of the given tokens that appears adjacently in the
      # original text, preferring larger combinations first.
      # @return [Array<String>, nil] the matching token permutation, or nil
      def collapse_one(tokens, original_text)
        (2..@max_tokens_to_combine).to_a.reverse_each do |tokens_to_combine|
          tokens.permutation(tokens_to_combine) do |permutation|
            re_options = 0
            re_options |= Regexp::IGNORECASE if @ignore_case
            # Escape each token so regex metacharacters in a token (e.g. "c++",
            # "node.js") cannot raise a RegexpError or subvert the match.
            pattern = permutation.map { |token| Regexp.escape(token) }.join(' +')
            re = Regexp.new("\\b#{pattern}\\b", re_options)
            return permutation if original_text =~ re
          end
        end
        nil
      end

    end
  end
end
|
module TextRank
  ##
  # Rank filters are post-process filters which can filter, enhance, or modify
  # the results of the PageRank algorithm. A common use case is to collapse highly
  # ranked tokens which are found to be adjacent in the original text. Other
  # filters might modify the PageRank scores with some sort of external modifier.
  # Another use might be to remove collapsed tokens which are not desired (since
  # token filters only operate on a single, non-collapsed token).
  #
  # Rank filters are applied as a chain, so care should be taken to use them
  # in the desired order.
  ##
  module RankFilter

    # Autoloaded on first reference; the only built-in rank filter.
    autoload :CollapseAdjacent, 'text_rank/rank_filter/collapse_adjacent'

  end
end
|
module TextRank
  module TokenFilter
    ##
    # Token filter that discards tokens shorter than a configured minimum
    # length, keeping the rest in their original order.
    #
    # = Example
    #
    #   MinLength.new(min_length: 6).filter!(%w[
    #     and ask each passenger to tell his story and if there is one of them all who has not
    #     cursed his existence many times and said to himself over and over again that he was
    #     the most miserable of men i give you permission to throw me head-first into the sea
    #   ])
    #   => ["passenger", "cursed", "existence", "himself", "miserable", "permission", "head-first"]
    ##
    class MinLength

      # @param min_length [Fixnum] minimum size of token to keep
      def initialize(min_length: 3, **_)
        @min_length = min_length
      end

      # Perform the filter, mutating the given array in place.
      # @param tokens [Array<String>]
      # @return [Array<String>] the same array, with short tokens removed
      def filter!(tokens)
        tokens.keep_if { |word| word.length >= @min_length }
      end

    end
  end
end
|
require 'engtagger'
require 'set'

module TextRank
  module TokenFilter
    ##
    # Token filter to keep only a selected set of parts of speech
    #
    # = Example
    #
    #   PartOfSpeech.new(parts_to_keep: %w[nn nns]).filter!(%w[
    #     all men are by nature free
    #   ])
    #   => ["men", "nature"]
    ##
    class PartOfSpeech

      # @param parts_to_keep [Array<String>] list of engtagger parts of speech to keep
      # @see https://github.com/yohasebe/engtagger#tag-set
      def initialize(parts_to_keep: %w[nn nnp nnps nns jj jjr jjs vb vbd vbg vbn vbp vbz], **_)
        @parts_to_keep = Set.new(parts_to_keep)
        @eng_tagger = EngTagger.new
        # Initial tag context passed to assign_tag for the very first token;
        # presumably 'pp' stands in for "previous token" context — see engtagger docs.
        @last_pos_tag = 'pp'
      end

      # Perform the filter, mutating the given array in place.
      # Tagging is stateful: each token is tagged using the previous token's
      # tag as context, so token order affects the result.
      # @param tokens [Array<String>]
      # @return [Array<String>] the same array, reduced to kept parts of speech
      def filter!(tokens)
        tokens.keep_if do |token|
          @parts_to_keep.include?(pos_tag(token))
        end
      end

      private

      # Tag a single token, remembering the tag as context for the next call.
      # Returns the tag (the assignment's value).
      # NOTE(review): the inline `rescue nil` swallows ALL StandardErrors from
      # the tagger, not just tagging failures — consider a narrower rescue.
      # Untaggable tokens (nil or empty tag) default to 'nn' (noun).
      def pos_tag(token)
        tag = @eng_tagger.assign_tag(@last_pos_tag, token) rescue nil
        tag = 'nn' if tag.nil? || tag == ''
        @last_pos_tag = tag
      end

    end
  end
end
|