text_rank 1.1.7 → 1.2.5
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +5 -5
- data/.codeclimate.yml +1 -6
- data/.rubocop.yml +60 -1075
- data/.ruby-version +1 -1
- data/.travis.yml +13 -5
- data/{LICENSE.txt → LICENSE} +0 -0
- data/README.md +2 -1
- data/bin/console +3 -3
- data/lib/page_rank.rb +2 -0
- data/lib/page_rank/base.rb +9 -8
- data/lib/page_rank/dense.rb +2 -1
- data/lib/page_rank/sparse.rb +6 -7
- data/lib/text_rank.rb +12 -9
- data/lib/text_rank/char_filter.rb +1 -1
- data/lib/text_rank/char_filter/ascii_folding.rb +5 -1
- data/lib/text_rank/char_filter/strip_possessive.rb +2 -2
- data/lib/text_rank/char_filter/undo_contractions.rb +1 -137
- data/lib/text_rank/char_filter/undo_contractions.yml +135 -0
- data/lib/text_rank/fingerprint.rb +20 -28
- data/lib/text_rank/fingerprint_overlap.rb +55 -0
- data/lib/text_rank/graph_strategy/coocurrence.rb +15 -6
- data/lib/text_rank/keyword_extractor.rb +32 -25
- data/lib/text_rank/rank_filter/collapse_adjacent.rb +53 -25
- data/lib/text_rank/rank_filter/normalize_probability.rb +2 -1
- data/lib/text_rank/rank_filter/normalize_unit_vector.rb +2 -1
- data/lib/text_rank/token_filter/part_of_speech.rb +0 -1
- data/lib/text_rank/token_filter/stopwords.rb +1 -321
- data/lib/text_rank/token_filter/stopwords.yml +317 -0
- data/lib/text_rank/tokenizer.rb +1 -1
- data/lib/text_rank/tokenizer/money.rb +11 -6
- data/lib/text_rank/tokenizer/number.rb +4 -3
- data/lib/text_rank/tokenizer/punctuation.rb +4 -1
- data/lib/text_rank/tokenizer/url.rb +3 -0
- data/lib/text_rank/tokenizer/whitespace.rb +4 -1
- data/lib/text_rank/tokenizer/word.rb +5 -2
- data/lib/text_rank/version.rb +3 -1
- data/text_rank.gemspec +10 -10
- metadata +48 -32
@@ -1,5 +1,3 @@
|
|
1
|
-
require 'set'
|
2
|
-
|
3
1
|
module TextRank
|
4
2
|
##
|
5
3
|
# Class used to compare documents according to TextRank. A "fingerprint"
|
@@ -10,35 +8,35 @@ module TextRank
|
|
10
8
|
# significant keywords. But to prevent less significant keywords from being
|
11
9
|
# completely ignored we apply an inverse log linear transformation to each of the
|
12
10
|
# N prefixes.
|
13
|
-
#
|
11
|
+
#
|
14
12
|
# For example, consider the following comparison:
|
15
|
-
#
|
13
|
+
#
|
16
14
|
# town man empty found
|
17
15
|
# vs.
|
18
16
|
# general empty found jar
|
19
|
-
#
|
17
|
+
#
|
20
18
|
# The first pass considers just the first keywords: town vs. general. As these
|
21
19
|
# are different, they contribute 0.
|
22
|
-
#
|
20
|
+
#
|
23
21
|
# The second pass considers the first two keywords: town man vs general empty.
|
24
22
|
# Again, no overlap, so they contribute 0.
|
25
|
-
#
|
23
|
+
#
|
26
24
|
# The third pass considers the first three keywords: town man empty vs general
|
27
25
|
# empty found. Here we have one overlap: empty. This contributes 1.
|
28
|
-
#
|
26
|
+
#
|
29
27
|
# The fourth pass considers all, and there are two overlaps: empty & found. This
|
30
28
|
# contributes 2.
|
31
|
-
#
|
29
|
+
#
|
32
30
|
# We can represent the overlaps as the vector [0, 0, 1, 2]. Then we will apply
|
33
31
|
# the inverse log linear transformation defined by:
|
34
|
-
#
|
32
|
+
#
|
35
33
|
# f(x_i) = x_i / ln(i + 1)
|
36
34
|
# = [0, 0, 1 / ln(4), 2 / ln(5)]
|
37
35
|
# = [0, 0, 0.7213475204444817, 1.2426698691192237]
|
38
|
-
#
|
36
|
+
#
|
39
37
|
# Finally we take the average of the transformed vector and normalize it (to
|
40
38
|
# ensure a final value between 0.0 and 1.0):
|
41
|
-
#
|
39
|
+
#
|
42
40
|
# norm(avg(SUM f(x_i))) = norm( avg(1.9640173895637054) )
|
43
41
|
# = norm( 0.49100434739092635 )
|
44
42
|
# = 0.49100434739092635 / avg(SUM f(1, 2, 3, 4))
|
@@ -61,28 +59,22 @@ module TextRank
|
|
61
59
|
# Calculates the "similarity" between this fingerprint and another
|
62
60
|
# @param other [Fingerprint] A second fingerprint to compare
|
63
61
|
# @return [Number] A number between 0.0 (different) and 1.0 (same)
|
64
|
-
def similarity(
|
65
|
-
return 1.0 if values ==
|
66
|
-
|
67
|
-
sim = 0
|
68
|
-
s1 = Set.new
|
69
|
-
s2 = Set.new
|
62
|
+
def similarity(other)
|
63
|
+
return 1.0 if values == other.values # Short-circuit for efficiency
|
70
64
|
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
if v1 == v2
|
75
|
-
sim += 1
|
76
|
-
else
|
77
|
-
s1.delete?(v2) ? (sim += 1) : (s2 << v2)
|
78
|
-
s2.delete?(v1) ? (sim += 1) : (s1 << v1)
|
79
|
-
end
|
80
|
-
sum + sim * linear_transform[i]
|
65
|
+
sum = 0
|
66
|
+
overlap(other).each_with_index do |overlap_value, i|
|
67
|
+
sum += overlap_value * linear_transform[i]
|
81
68
|
end
|
69
|
+
sum
|
82
70
|
end
|
83
71
|
|
84
72
|
private
|
85
73
|
|
74
|
+
def overlap(other)
|
75
|
+
FingerprintOverlap.new(values, other.values).overlap
|
76
|
+
end
|
77
|
+
|
86
78
|
def linear_transform
|
87
79
|
@linear_transform ||= size.times.map do |i|
|
88
80
|
1.0 / Math.log(i + 2) / size.to_f / norm_factor
|
@@ -0,0 +1,55 @@
|
|
1
|
+
require 'set'

module TextRank
  ##
  # Determines the "overlap" between two fingerprints at each of the N prefixes
  #
  # For example,
  #
  #   FingerprintOverlap.new(
  #     %w[a b c d],
  #     %w[b e a c],
  #   ).overlap
  #
  #   => [
  #     0, # [a] & [b] have no overlap
  #     1, # [a b] & [b e] have one overlap: b
  #     2, # [a b c] & [b e a] have two overlaps: a & b
  #     3, # [a b c d] & [b e a c] have three overlaps: a, b, & c
  #   ]
  ##
  class FingerprintOverlap

    # @return [Array<Integer>] running overlap count, one entry per prefix length
    attr_reader :overlap

    # @param values1 [Array] values of the first fingerprint
    # @param values2 [Array] values of the second fingerprint (must be the same size)
    # @raise [ArgumentError] if the two lists differ in size
    def initialize(values1, values2)
      raise ArgumentError, 'Value size mismatch' if values1.size != values2.size

      # Values seen on one side that have not yet been matched by the other side
      @encountered1 = Set.new
      @encountered2 = Set.new
      @overlap_count = 0

      @overlap = determine_overlap(values1, values2)
    end

    private

    # Walks both lists in lockstep and records the running overlap count after
    # each position, i.e. the overlap of the two prefixes ending at that position.
    def determine_overlap(values1, values2)
      values1.zip(values2).map do |v1, v2|
        encounter(v1, v2)
        @overlap_count
      end
    end

    # This is a little more involved than the most direct way of expressing the
    # idea in Ruby, but we want to keep it as performant as possible.
    def encounter(value1, value2)
      if value1 == value2
        @overlap_count += 1
      else
        # Delete from the set in case an element appears more than once
        @encountered1.delete?(value2) ? (@overlap_count += 1) : (@encountered2 << value2)
        @encountered2.delete?(value1) ? (@overlap_count += 1) : (@encountered1 << value1)
      end
    end

  end
end
|
@@ -61,18 +61,27 @@ module TextRank
|
|
61
61
|
# @return [nil]
|
62
62
|
def build_graph(tokens, graph)
|
63
63
|
ngram_window = @ngram_size * 2 + 1
|
64
|
-
tokens.
|
64
|
+
tokens.size.times do |i|
|
65
65
|
ngram_window.times do |j|
|
66
|
-
|
67
|
-
token_j = tokens[i - @ngram_size + j]
|
68
|
-
if token_j
|
69
|
-
graph.add(token_i, token_j, weight: 1.0 / (j - @ngram_size).abs)
|
70
|
-
end
|
66
|
+
consider_ngram_window(tokens, graph, i, j)
|
71
67
|
end
|
72
68
|
end
|
73
69
|
nil
|
74
70
|
end
|
75
71
|
|
72
|
+
private
|
73
|
+
|
74
|
+
def consider_ngram_window(tokens, graph, i, j)
|
75
|
+
return if j == @ngram_size || i + j < @ngram_size
|
76
|
+
|
77
|
+
token_i = tokens[i]
|
78
|
+
token_j = tokens[i - @ngram_size + j]
|
79
|
+
|
80
|
+
if token_j
|
81
|
+
graph.add(token_i, token_j, weight: 1.0 / (j - @ngram_size).abs)
|
82
|
+
end
|
83
|
+
end
|
84
|
+
|
76
85
|
end
|
77
86
|
end
|
78
87
|
end
|
@@ -13,9 +13,9 @@ module TextRank
|
|
13
13
|
# @return [KeywordExtractor]
|
14
14
|
def self.basic(**options)
|
15
15
|
new(**{
|
16
|
-
char_filters: [
|
17
|
-
tokenizers: [
|
18
|
-
token_filters: [
|
16
|
+
char_filters: %i[AsciiFolding Lowercase],
|
17
|
+
tokenizers: %i[Word],
|
18
|
+
token_filters: %i[Stopwords MinLength],
|
19
19
|
graph_strategy: :Coocurrence,
|
20
20
|
}.merge(options))
|
21
21
|
end
|
@@ -25,11 +25,11 @@ module TextRank
|
|
25
25
|
# @return [KeywordExtractor]
|
26
26
|
def self.advanced(**options)
|
27
27
|
new(**{
|
28
|
-
char_filters: [
|
29
|
-
tokenizers: [
|
30
|
-
token_filters: [
|
28
|
+
char_filters: %i[AsciiFolding Lowercase StripHtml StripEmail UndoContractions StripPossessive],
|
29
|
+
tokenizers: %i[Url Money Number Word Punctuation],
|
30
|
+
token_filters: %i[PartOfSpeech Stopwords MinLength],
|
31
31
|
graph_strategy: :Coocurrence,
|
32
|
-
rank_filters: [
|
32
|
+
rank_filters: %i[CollapseAdjacent NormalizeUnitVector SortByValue],
|
33
33
|
}.merge(options))
|
34
34
|
end
|
35
35
|
|
@@ -41,14 +41,14 @@ module TextRank
|
|
41
41
|
# @option options [Array<Class, Symbol, #filter!>] :rank_filters A list of filters to be applied to the keyword ranks after keyword extraction
|
42
42
|
def initialize(**options)
|
43
43
|
@page_rank_options = {
|
44
|
-
strategy:
|
45
|
-
damping:
|
44
|
+
strategy: options[:strategy] || :sparse,
|
45
|
+
damping: options[:damping],
|
46
46
|
tolerance: options[:tolerance],
|
47
47
|
}
|
48
|
-
@char_filters
|
49
|
-
@tokenizers
|
50
|
-
@token_filters
|
51
|
-
@rank_filters
|
48
|
+
@char_filters = options[:char_filters] || []
|
49
|
+
@tokenizers = options[:tokenizers] || [Tokenizer::Word]
|
50
|
+
@token_filters = options[:token_filters] || []
|
51
|
+
@rank_filters = options[:rank_filters] || []
|
52
52
|
@graph_strategy = options[:graph_strategy] || GraphStrategy::Coocurrence
|
53
53
|
end
|
54
54
|
|
@@ -73,9 +73,7 @@ module TextRank
|
|
73
73
|
# Sets the graph strategy for producing a graph from tokens
|
74
74
|
# @param strategy [Class, Symbol, #build_graph] Strategy for producing a graph from tokens
|
75
75
|
# @return [Class, Symbol, #build_graph]
|
76
|
-
|
77
|
-
@graph_strategy = strategy
|
78
|
-
end
|
76
|
+
attr_writer :graph_strategy
|
79
77
|
|
80
78
|
# Add a new TokenFilter for processing tokens after tokenization
|
81
79
|
# @param filter [Class, Symbol, #filter!] A filter to process tokens after tokenization
|
@@ -105,14 +103,23 @@ module TextRank
|
|
105
103
|
end
|
106
104
|
|
107
105
|
# Filter & tokenize text, and return PageRank
|
108
|
-
# @param text [String] unfiltered text to be processed
|
106
|
+
# @param text [String,Array<String>] unfiltered text to be processed
|
109
107
|
# @return [Hash<String, Float>] tokens and page ranks (in descending order)
|
110
108
|
def extract(text, **options)
|
111
|
-
|
109
|
+
text = Array(text)
|
110
|
+
tokens_per_text = text.map do |t|
|
111
|
+
tokenize(t)
|
112
|
+
end
|
112
113
|
graph = PageRank.new(**@page_rank_options)
|
113
|
-
classify(@graph_strategy, context: GraphStrategy)
|
114
|
+
strategy = classify(@graph_strategy, context: GraphStrategy)
|
115
|
+
tokens_per_text.each do |tokens|
|
116
|
+
strategy.build_graph(tokens, graph)
|
117
|
+
end
|
114
118
|
ranks = graph.calculate(**options)
|
115
|
-
|
119
|
+
tokens_per_text.each_with_index do |tokens, i|
|
120
|
+
ranks = apply_rank_filters(ranks, tokens: tokens, original_text: text[i])
|
121
|
+
end
|
122
|
+
ranks
|
116
123
|
end
|
117
124
|
|
118
125
|
private
|
@@ -153,14 +160,14 @@ module TextRank
|
|
153
160
|
array.insert(idx, value)
|
154
161
|
end
|
155
162
|
|
156
|
-
def classify(
|
157
|
-
case
|
163
|
+
def classify(clazz, context: self)
|
164
|
+
case clazz
|
158
165
|
when Class
|
159
|
-
|
166
|
+
clazz.new
|
160
167
|
when Symbol
|
161
|
-
context.const_get(
|
168
|
+
context.const_get(clazz).new
|
162
169
|
else
|
163
|
-
|
170
|
+
clazz
|
164
171
|
end
|
165
172
|
end
|
166
173
|
|
@@ -77,6 +77,7 @@ module TextRank
|
|
77
77
|
|
78
78
|
class TokenCollapser
|
79
79
|
|
80
|
+
# rubocop:disable Metrics/ParameterLists
|
80
81
|
def initialize(tokens:, text:, ranks_to_collapse: 10, max_tokens_to_combine: 2, ignore_case: true, delimiter: ' ', **_)
|
81
82
|
@tokens = tokens
|
82
83
|
@text = text
|
@@ -91,6 +92,7 @@ module TextRank
|
|
91
92
|
@permutations_scanned = Hash.new(0.0) # Track how many occurrences of each permutation we found in the original text
|
92
93
|
@combination_significance_threshold = 0.3 # The percent of occurrences of a combo of tokens to the occurrences of single tokens required to force collapsing
|
93
94
|
end
|
95
|
+
# rubocop:enable Metrics/ParameterLists
|
94
96
|
|
95
97
|
# :nodoc:
|
96
98
|
def delimiter_re
|
@@ -104,18 +106,36 @@ module TextRank
|
|
104
106
|
# single tokens from below the cut to above it. So we'll continue searching
|
105
107
|
# until all of the top N final keywords (single or collapsed) have been
|
106
108
|
# considered.
|
107
|
-
|
108
|
-
|
109
|
-
scan_text_for_all_permutations_of(single_tokens_to_consider) or break
|
110
|
-
decide_what_to_collapse_and_what_to_remove
|
109
|
+
while collapse_attempt
|
110
|
+
# keep trying
|
111
111
|
end
|
112
112
|
|
113
113
|
# We now know what to collapse and what to remove, so we can start safely
|
114
114
|
# modifying the tokens hash
|
115
|
+
apply_collapse
|
116
|
+
end
|
117
|
+
|
118
|
+
# :nodoc:
|
119
|
+
def collapse_attempt
|
120
|
+
regexp_safe_tokens = @tokens.keys.select { |s| Regexp.escape(s) == s }
|
121
|
+
single_tokens_to_consider = regexp_safe_tokens.first(@ranks_to_collapse + @to_remove.size - @to_collapse.size) - @to_remove.to_a
|
122
|
+
scan_text_for_all_permutations_of(single_tokens_to_consider) or return false
|
123
|
+
decide_what_to_collapse_and_what_to_remove
|
124
|
+
true
|
125
|
+
end
|
126
|
+
|
127
|
+
# :nodoc:
|
128
|
+
def apply_collapse
|
115
129
|
@to_collapse.each do |perm|
|
116
|
-
values = @tokens.values_at(*perm)
|
130
|
+
values = @tokens.values_at(*perm).compact
|
131
|
+
# This might be empty if somehow the scanned permutation doesn't
|
132
|
+
# exactly match one of the tokens (e.g. ASCII-folding gone awry).
|
133
|
+
# The goal is to do the best we can, so if we can't find it, ignore.
|
134
|
+
next if values.empty?
|
135
|
+
|
117
136
|
@tokens[perm.join(@delimiter)] = values.reduce(:+) / values.size
|
118
137
|
end
|
138
|
+
|
119
139
|
@tokens.reject! do |k, _|
|
120
140
|
@to_remove.include?(k)
|
121
141
|
end || @tokens
|
@@ -131,16 +151,10 @@ module TextRank
|
|
131
151
|
# tokenization (e.g. ASCII folding). That's okay. We're just making the best effort we can
|
132
152
|
# to find what we can.
|
133
153
|
def scan_text_for_all_permutations_of(single_tokens)
|
134
|
-
perms = []
|
135
154
|
# NOTE that by reversing the order we craft the regex to prefer larger combinations over
|
136
155
|
# smaller combinations (or singletons).
|
137
|
-
(1..@max_tokens_to_combine).to_a.reverse.
|
138
|
-
single_tokens
|
139
|
-
unless @permutations_scanned.key?(perm)
|
140
|
-
@permutations_scanned[perm] = 0
|
141
|
-
perms << perm
|
142
|
-
end
|
143
|
-
end
|
156
|
+
perms = (1..@max_tokens_to_combine).to_a.reverse.flat_map do |n|
|
157
|
+
scan_text_for_n_permutations_of(single_tokens, n)
|
144
158
|
end
|
145
159
|
scan_text_for(perms) do |s|
|
146
160
|
s = s.downcase if @ignore_case
|
@@ -148,6 +162,15 @@ module TextRank
|
|
148
162
|
end unless perms.empty?
|
149
163
|
end
|
150
164
|
|
165
|
+
def scan_text_for_n_permutations_of(single_tokens, n)
|
166
|
+
single_tokens.permutation(n).map do |perm|
|
167
|
+
unless @permutations_scanned.key?(perm)
|
168
|
+
@permutations_scanned[perm] = 0
|
169
|
+
perm
|
170
|
+
end
|
171
|
+
end.compact
|
172
|
+
end
|
173
|
+
|
151
174
|
# Because we're scanning the original text, we've lost all of the character filtering we did
|
152
175
|
# prior to tokenization, but that's important because we need the original context to be more
|
153
176
|
# choosy. Still, we need to know what delimiter goes between collapsed tokens (since it may
|
@@ -174,25 +197,30 @@ module TextRank
|
|
174
197
|
# modifications to the original token list yet but just keep track of what we plan
|
175
198
|
# to collapse/remove.
|
176
199
|
def decide_what_to_collapse_and_what_to_remove
|
177
|
-
non_empty_ordered = @permutations_scanned.select do |k, v|
|
178
|
-
v > 0
|
179
|
-
end.sort_by do |k, v|
|
180
|
-
[-v, -k.size] # reverse order
|
181
|
-
end
|
182
|
-
|
183
200
|
tokens_encountered = []
|
184
|
-
|
201
|
+
permutations_to_consider_collapsing.each do |perm, perm_count|
|
185
202
|
if perm.size > 1
|
186
|
-
singles_to_remove
|
187
|
-
if !singles_to_remove.empty? || combination_significant?(perm, perm_count)
|
188
|
-
@to_collapse << perm if perm.size > 1
|
189
|
-
@to_remove |= singles_to_remove
|
190
|
-
end
|
203
|
+
decide_to_collapse_or_remove(perm, perm_count, singles_to_remove: perm - tokens_encountered)
|
191
204
|
end
|
192
205
|
tokens_encountered += perm
|
193
206
|
end
|
194
207
|
end
|
195
208
|
|
209
|
+
def permutations_to_consider_collapsing
|
210
|
+
@permutations_scanned.select do |_k, v|
|
211
|
+
v.positive?
|
212
|
+
end.sort_by do |k, v|
|
213
|
+
[-v, -k.size] # reverse order
|
214
|
+
end
|
215
|
+
end
|
216
|
+
|
217
|
+
def decide_to_collapse_or_remove(perm, perm_count, singles_to_remove:)
|
218
|
+
if !singles_to_remove.empty? || combination_significant?(perm, perm_count)
|
219
|
+
@to_collapse << perm if perm.size > 1
|
220
|
+
@to_remove |= singles_to_remove
|
221
|
+
end
|
222
|
+
end
|
223
|
+
|
196
224
|
# Even if we encounter a potential collapsed key which occurs less often than the single tokens that make it up,
|
197
225
|
# we still want to add the collapsed key if it shows up "enough" times.
|
198
226
|
def combination_significant?(perm, perm_count)
|
@@ -45,8 +45,9 @@ module TextRank
|
|
45
45
|
# @return [Hash<String, Float>]
|
46
46
|
def filter!(ranks, **_)
|
47
47
|
return if ranks.empty?
|
48
|
+
|
48
49
|
total = Math.sqrt(ranks.values.map { |v| v * v }.reduce(:+))
|
49
|
-
|
50
|
+
ranks.transform_values { |v| v / total }
|
50
51
|
end
|
51
52
|
|
52
53
|
end
|