text_rank 1.1.7 → 1.2.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +5 -5
- data/.codeclimate.yml +1 -6
- data/.rubocop.yml +60 -1075
- data/.ruby-version +1 -1
- data/.travis.yml +13 -5
- data/{LICENSE.txt → LICENSE} +0 -0
- data/README.md +2 -1
- data/bin/console +3 -3
- data/lib/page_rank.rb +2 -0
- data/lib/page_rank/base.rb +9 -8
- data/lib/page_rank/dense.rb +2 -1
- data/lib/page_rank/sparse.rb +6 -7
- data/lib/text_rank.rb +12 -9
- data/lib/text_rank/char_filter.rb +1 -1
- data/lib/text_rank/char_filter/ascii_folding.rb +5 -1
- data/lib/text_rank/char_filter/strip_possessive.rb +2 -2
- data/lib/text_rank/char_filter/undo_contractions.rb +1 -137
- data/lib/text_rank/char_filter/undo_contractions.yml +135 -0
- data/lib/text_rank/fingerprint.rb +20 -28
- data/lib/text_rank/fingerprint_overlap.rb +55 -0
- data/lib/text_rank/graph_strategy/coocurrence.rb +15 -6
- data/lib/text_rank/keyword_extractor.rb +32 -25
- data/lib/text_rank/rank_filter/collapse_adjacent.rb +53 -25
- data/lib/text_rank/rank_filter/normalize_probability.rb +2 -1
- data/lib/text_rank/rank_filter/normalize_unit_vector.rb +2 -1
- data/lib/text_rank/token_filter/part_of_speech.rb +0 -1
- data/lib/text_rank/token_filter/stopwords.rb +1 -321
- data/lib/text_rank/token_filter/stopwords.yml +317 -0
- data/lib/text_rank/tokenizer.rb +1 -1
- data/lib/text_rank/tokenizer/money.rb +11 -6
- data/lib/text_rank/tokenizer/number.rb +4 -3
- data/lib/text_rank/tokenizer/punctuation.rb +4 -1
- data/lib/text_rank/tokenizer/url.rb +3 -0
- data/lib/text_rank/tokenizer/whitespace.rb +4 -1
- data/lib/text_rank/tokenizer/word.rb +5 -2
- data/lib/text_rank/version.rb +3 -1
- data/text_rank.gemspec +10 -10
- metadata +48 -32
data/lib/text_rank/fingerprint.rb

@@ -1,5 +1,3 @@
-require 'set'
-
 module TextRank
   ##
   # Class used to compare documents according to TextRank. A "fingerprint"
@@ -10,35 +8,35 @@ module TextRank
   # significant keywords. But to prevent less significant keywords from being
   # completely ignored we apply an inverse log linear transformation to each of the
   # N prefixes.
-  #
+  #
   # For example, consider the following comparison:
-  #
+  #
   #   town man empty found
   # vs.
   #   general empty found jar
-  #
+  #
   # The first pass considers just the first keywords: town vs. general. As these
   # are different, they contribute 0.
-  #
+  #
   # The second pass considers the first two keywords: town man vs general empty.
   # Again, no overlap, so they contribute 0.
-  #
+  #
   # The third pass considers the first three keywords: town man empty vs general
   # empty found. Here we have one overlap: empty. This contributes 1.
-  #
+  #
   # The fourth pass considers all, and there is two overlaps: empty & found. This
   # contributes 2.
-  #
+  #
   # We can represent the overlaps as the vector [0, 0, 1, 2]. Then we will apply
   # the inverse log linear transformation defined by:
-  #
+  #
   #   f(x_i) = x_i / ln(i + 1)
   #          = [0, 0, 1 / ln(4), 2 / ln(5)]
   #          = [0, 0, 0.7213475204444817, 1.2426698691192237]
-  #
+  #
   # Finally we take the average of the transformed vector and normalize it (to
   # ensure a final value between 0.0 and 1.0):
-  #
+  #
   #   norm(avg(SUM f(x_i))) = norm( avg(1.9640173895637054) )
   #                         = norm( 0.49100434739092635 )
   #                         = 0.49100434739092635 / avg(SUM f(1, 2, 3, 4))
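The arithmetic in this doc comment checks out and can be reproduced with a few lines of standalone Ruby (an illustrative sketch, not code from the gem):

```ruby
# Reproduce the [0, 0, 1, 2] worked example from the doc comment.
overlaps = [0, 0, 1, 2]
n        = overlaps.size

# f(x_i) = x_i / ln(i + 1), where i is the 1-based prefix position
f = ->(x, i) { x / Math.log(i + 1) }

raw  = overlaps.each_with_index.sum { |x, idx| f.call(x.to_f, idx + 1) }
best = (1..n).each_with_index.sum { |x, idx| f.call(x.to_f, idx + 1) } # perfect overlap: [1, 2, 3, 4]

puts raw / n                # => 0.49100434739092635, matching the comment
puts (raw / n) / (best / n) # final normalized similarity in [0.0, 1.0]
```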
@@ -61,28 +59,22 @@ module TextRank
     # Calculates the "similarity" between this fingerprint and another
     # @param {Fingerprint} A second fingerprint to compare
     # @return [Number] A number between 0.0 (different) and 1.0 (same)
-    def similarity(other)
-      return 1.0 if values == other.values # Short-circuit for efficiency
-
-      sim = 0
-      s1 = Set.new
-      s2 = Set.new
+    def similarity(other)
+      return 1.0 if values == other.values # Short-circuit for efficiency
 
-      size.times.reduce(0) do |sum, i|
-        v1 = values[i]
-        v2 = other.values[i]
-        if v1 == v2
-          sim += 1
-        else
-          s1.delete?(v2) ? (sim += 1) : (s2 << v2)
-          s2.delete?(v1) ? (sim += 1) : (s1 << v1)
-        end
-        sum + sim * linear_transform[i]
+      sum = 0
+      overlap(other).each_with_index do |overlap_value, i|
+        sum += overlap_value * linear_transform[i]
       end
+      sum
     end
 
     private
 
+    def overlap(other)
+      FingerprintOverlap.new(values, other.values).overlap
+    end
+
     def linear_transform
       @linear_transform ||= size.times.map do |i|
         1.0 / Math.log(i + 2) / size.to_f / norm_factor
data/lib/text_rank/fingerprint_overlap.rb (new file)

@@ -0,0 +1,55 @@
+module TextRank
+  ##
+  # Determines "overlap" between two fingerprints at each N prefixes
+  #
+  # For example,
+  #
+  #   FingerprintOverlap.new(
+  #     %w[a b c d],
+  #     %w[b e a c],
+  #   ).overlap
+  #
+  #   => [
+  #     0, # [a] & (b) have no overlap
+  #     1, # [a b] & [b e] have one overlap: b
+  #     2, # [a b c] & [b e a] have two overlap: a & b
+  #     3, # [a b c d] & [b e a c] have three overlap: a, b, & c
+  #   ]
+  ##
+  class FingerprintOverlap
+
+    attr_reader :overlap
+
+    def initialize(values1, values2)
+      raise ArgumentError, 'Value size mismatch' if values1.size != values2.size
+
+      @encountered1 = Set.new
+      @encountered2 = Set.new
+      @overlap_count = 0
+
+      @overlap = determine_overlap(values1, values2)
+    end
+
+    private
+
+    def determine_overlap(values1, values2)
+      values1.zip(values2).map do |v1, v2|
+        encounter(v1, v2)
+        @overlap_count
+      end
+    end
+
+    # This algorithm is a little more complex than could be represented in Ruby,
+    # but we want to keep it as performant as possible.
+    def encounter(value1, value2)
+      if value1 == value2
+        @overlap_count += 1
+      else
+        # Delete from the set in case an element appears more than once
+        @encountered1.delete?(value2) ? (@overlap_count += 1) : (@encountered2 << value2)
+        @encountered2.delete?(value1) ? (@overlap_count += 1) : (@encountered1 << value1)
+      end
+    end
+
+  end
+end
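A usage sketch of the new class, tracing the same values as the Fingerprint doc comment earlier in this diff (assumes the gem is loaded; `Set` is required explicitly since `require 'set'` moved out of fingerprint.rb):

```ruby
require 'set'
require 'text_rank'

TextRank::FingerprintOverlap.new(
  %w[town man empty found],
  %w[general empty found jar],
).overlap
# => [0, 0, 1, 2] (the overlap vector from the Fingerprint example)
```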
data/lib/text_rank/graph_strategy/coocurrence.rb

@@ -61,18 +61,27 @@ module TextRank
       # @return [nil]
       def build_graph(tokens, graph)
         ngram_window = @ngram_size * 2 + 1
-        tokens.each_with_index do |token_i, i|
+        tokens.size.times do |i|
           ngram_window.times do |j|
-            next if j == @ngram_size || i + j < @ngram_size
-            token_j = tokens[i - @ngram_size + j]
-            if token_j
-              graph.add(token_i, token_j, weight: 1.0 / (j - @ngram_size).abs)
-            end
+            consider_ngram_window(tokens, graph, i, j)
           end
         end
         nil
       end
 
+      private
+
+      def consider_ngram_window(tokens, graph, i, j)
+        return if j == @ngram_size || i + j < @ngram_size
+
+        token_i = tokens[i]
+        token_j = tokens[i - @ngram_size + j]
+
+        if token_j
+          graph.add(token_i, token_j, weight: 1.0 / (j - @ngram_size).abs)
+        end
+      end
+
     end
   end
 end
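The extracted `consider_ngram_window` preserves the original weighting: two tokens `d` positions apart get an edge of weight `1/d`. A standalone sketch of the window arithmetic (illustrative only, with an assumed n-gram size of 3):

```ruby
ngram_size   = 3
ngram_window = ngram_size * 2 + 1 # 3 tokens on each side, plus the token itself

ngram_window.times do |j|
  next if j == ngram_size # a token does not co-occur with itself

  offset = j - ngram_size
  puts format('offset %+d => weight %.3f', offset, 1.0 / offset.abs)
end
# offsets -1/+1 get weight 1.000, -2/+2 get 0.500, -3/+3 get 0.333
```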
data/lib/text_rank/keyword_extractor.rb

@@ -13,9 +13,9 @@ module TextRank
    # @return [KeywordExtractor]
    def self.basic(**options)
      new(**{
-        char_filters:   [:AsciiFolding, :Lowercase],
-        tokenizers:     [:Word],
-        token_filters:  [:Stopwords, :MinLength],
+        char_filters: %i[AsciiFolding Lowercase],
+        tokenizers: %i[Word],
+        token_filters: %i[Stopwords MinLength],
         graph_strategy: :Coocurrence,
       }.merge(options))
     end
@@ -25,11 +25,11 @@ module TextRank
    # @return [KeywordExtractor]
    def self.advanced(**options)
      new(**{
-        char_filters:   [:AsciiFolding, :Lowercase, :StripHtml, :StripEmail, :UndoContractions, :StripPossessive],
-        tokenizers:     [:Url, :Money, :Number, :Word, :Punctuation],
-        token_filters:  [:PartOfSpeech, :Stopwords, :MinLength],
+        char_filters: %i[AsciiFolding Lowercase StripHtml StripEmail UndoContractions StripPossessive],
+        tokenizers: %i[Url Money Number Word Punctuation],
+        token_filters: %i[PartOfSpeech Stopwords MinLength],
         graph_strategy: :Coocurrence,
-        rank_filters:   [:CollapseAdjacent, :NormalizeUnitVector, :SortByValue],
+        rank_filters: %i[CollapseAdjacent NormalizeUnitVector SortByValue],
       }.merge(options))
     end
 
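Behavior is unchanged here; the symbol arrays are just rewritten in `%i[]` literal style. For context, a usage sketch of these constructors (the keyword output is illustrative, not actual gem output):

```ruby
require 'text_rank'

extractor = TextRank::KeywordExtractor.advanced
extractor.extract('In a town in Spain, a man found an empty jar.')
# => e.g. { "empty jar" => 1.0, "town" => 0.84, ... } (keyword => rank, descending)
```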
@@ -41,14 +41,14 @@ module TextRank
    # @option options [Array<Class, Symbol, #filter!>] :rank_filters A list of filters to be applied to the keyword ranks after keyword extraction
    def initialize(**options)
      @page_rank_options = {
-        strategy:  options[:strategy] || :sparse,
-        damping:   options[:damping],
+        strategy: options[:strategy] || :sparse,
+        damping: options[:damping],
         tolerance: options[:tolerance],
       }
-      @char_filters   = options[:char_filters] || []
-      @tokenizers     = options[:tokenizers] || [Tokenizer::Word]
-      @token_filters  = options[:token_filters] || []
-      @rank_filters   = options[:rank_filters] || []
+      @char_filters = options[:char_filters] || []
+      @tokenizers = options[:tokenizers] || [Tokenizer::Word]
+      @token_filters = options[:token_filters] || []
+      @rank_filters = options[:rank_filters] || []
       @graph_strategy = options[:graph_strategy] || GraphStrategy::Coocurrence
     end
 
@@ -73,9 +73,7 @@ module TextRank
    # Sets the graph strategy for producing a graph from tokens
    # @param strategy [Class, Symbol, #build_graph] Strategy for producing a graph from tokens
    # @return [Class, Symbol, #build_graph]
-    def graph_strategy=(strategy)
-      @graph_strategy = strategy
-    end
+    attr_writer :graph_strategy
 
    # Add a new TokenFilter for processing tokens after tokenization
    # @param filter [Class, Symbol, #filter!] A filter to process tokens after tokenization
@@ -105,14 +103,23 @@ module TextRank
     end
 
    # Filter & tokenize text, and return PageRank
-    # @param text [String] unfiltered text to be processed
+    # @param text [String,Array<String>] unfiltered text to be processed
    # @return [Hash<String, Float>] tokens and page ranks (in descending order)
    def extract(text, **options)
-      tokens = tokenize(text)
+      text = Array(text)
+      tokens_per_text = text.map do |t|
+        tokenize(t)
+      end
       graph = PageRank.new(**@page_rank_options)
-      classify(@graph_strategy, context: GraphStrategy).build_graph(tokens, graph)
+      strategy = classify(@graph_strategy, context: GraphStrategy)
+      tokens_per_text.each do |tokens|
+        strategy.build_graph(tokens, graph)
+      end
       ranks = graph.calculate(**options)
-      apply_rank_filters(ranks, tokens: tokens, original_text: text)
+      tokens_per_text.each_with_index do |tokens, i|
+        ranks = apply_rank_filters(ranks, tokens: tokens, original_text: text[i])
+      end
+      ranks
     end
 
     private
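This rework of `extract` is the main functional change behind the `@param` update: `text` may now be a single string or an array of strings, every text feeds tokens into one shared PageRank graph, and the rank filters then run once per text. A sketch of the new call shape (assuming the gem is loaded):

```ruby
extractor = TextRank::KeywordExtractor.basic
extractor.extract([
  'The town was empty when the man arrived.',
  'A general found an empty jar.',
])
# Both texts contribute edges to a single graph before ranks are calculated.
```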
@@ -153,14 +160,14 @@ module TextRank
       array.insert(idx, value)
     end
 
-    def classify(c, context: self)
-      case c
+    def classify(clazz, context: self)
+      case clazz
       when Class
-        c.new
+        clazz.new
       when Symbol
-        context.const_get(c).new
+        context.const_get(clazz).new
       else
-        c
+        clazz
       end
     end
 
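The parameter rename to `clazz` (presumably to satisfy RuboCop's minimum parameter-name length) does not change the resolution rules. A behavioral sketch of the private helper, exercised via `send` since `classify` is not public:

```ruby
extractor = TextRank::KeywordExtractor.basic

extractor.send(:classify, TextRank::CharFilter::Lowercase)
# a Class: instantiated via .new

extractor.send(:classify, :Lowercase, context: TextRank::CharFilter)
# a Symbol: context.const_get(:Lowercase).new

extractor.send(:classify, 42)
# anything else: returned as-is (=> 42)
```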
data/lib/text_rank/rank_filter/collapse_adjacent.rb

@@ -77,6 +77,7 @@ module TextRank
 
     class TokenCollapser
 
+      # rubocop:disable Metrics/ParameterLists
       def initialize(tokens:, text:, ranks_to_collapse: 10, max_tokens_to_combine: 2, ignore_case: true, delimiter: ' ', **_)
         @tokens = tokens
         @text = text
@@ -91,6 +92,7 @@ module TextRank
         @permutations_scanned = Hash.new(0.0) # Track how many occurrences of each permutation we found in the original text
         @combination_significance_threshold = 0.3 # The percent of occurrences of a combo of tokens to the occurrences of single tokens required to force collapsing
       end
+      # rubocop:enable Metrics/ParameterLists
 
       # :nodoc:
       def delimiter_re
@@ -104,18 +106,36 @@ module TextRank
       # single tokens from below the cut to above it. So we'll continue searching
       # until all of the top N final keywords (single or collapsed) have been
       # considered.
-        loop do
-          single_tokens_to_consider = @tokens.keys.first(@ranks_to_collapse + @to_remove.size - @to_collapse.size) - @to_remove.to_a
-          scan_text_for_all_permutations_of(single_tokens_to_consider) or break
-          decide_what_to_collapse_and_what_to_remove
+        while collapse_attempt
+          # keep trying
         end
 
         # We now know what to collapse and what to remove, so we can start safely
         # modifying the tokens hash
+        apply_collapse
+      end
+
+      # :nodoc:
+      def collapse_attempt
+        regexp_safe_tokens = @tokens.keys.select { |s| Regexp.escape(s) == s }
+        single_tokens_to_consider = regexp_safe_tokens.first(@ranks_to_collapse + @to_remove.size - @to_collapse.size) - @to_remove.to_a
+        scan_text_for_all_permutations_of(single_tokens_to_consider) or return false
+        decide_what_to_collapse_and_what_to_remove
+        true
+      end
+
+      # :nodoc:
+      def apply_collapse
         @to_collapse.each do |perm|
-          values = @tokens.values_at(*perm)
+          values = @tokens.values_at(*perm).compact
+          # This might be empty if somehow the scanned permutation doesn't
+          # exactly match one of the tokens (e.g. ASCII-folding gone awry).
+          # The goal is to do the best we can, so if we can't find it, ignore.
+          next if values.empty?
+
           @tokens[perm.join(@delimiter)] = values.reduce(:+) / values.size
         end
+
         @tokens.reject! do |k, _|
           @to_remove.include?(k)
         end || @tokens
@@ -131,16 +151,10 @@ module TextRank
       # tokenization (e.g. ASCII folding). That's okay. We're just making the best effort we can
      # to find what we can.
      def scan_text_for_all_permutations_of(single_tokens)
-        perms = []
         # NOTE that by reversing the order we craft the regex to prefer larger combinations over
         # smaller combinations (or singletons).
-        (1..@max_tokens_to_combine).to_a.reverse.each do |n|
-          single_tokens.permutation(n).each do |perm|
-            unless @permutations_scanned.key?(perm)
-              @permutations_scanned[perm] = 0
-              perms << perm
-            end
-          end
+        perms = (1..@max_tokens_to_combine).to_a.reverse.flat_map do |n|
+          scan_text_for_n_permutations_of(single_tokens, n)
         end
         scan_text_for(perms) do |s|
           s = s.downcase if @ignore_case
@@ -148,6 +162,15 @@ module TextRank
         end unless perms.empty?
       end
 
+      def scan_text_for_n_permutations_of(single_tokens, n)
+        single_tokens.permutation(n).map do |perm|
+          unless @permutations_scanned.key?(perm)
+            @permutations_scanned[perm] = 0
+            perm
+          end
+        end.compact
+      end
+
       # Because we're scanning the original text, we've lost all of the character filtering we did
       # prior to tokenization, but that's important because we need the original context to be more
       # choosy. Still, we need to know what delimiter goes between collapsed tokens (since it may
@@ -174,25 +197,30 @@ module TextRank
       # modifications to the original token list yet but just keep track of what we plan
       # to collapse/remove.
       def decide_what_to_collapse_and_what_to_remove
-        non_empty_ordered = @permutations_scanned.select do |k, v|
-          v > 0
-        end.sort_by do |k, v|
-          [-v, -k.size] # reverse order
-        end
-
         tokens_encountered = []
-        non_empty_ordered.each do |perm, perm_count|
+        permutations_to_consider_collapsing.each do |perm, perm_count|
           if perm.size > 1
-            singles_to_remove = perm - tokens_encountered
-            if !singles_to_remove.empty? || combination_significant?(perm, perm_count)
-              @to_collapse << perm if perm.size > 1
-              @to_remove |= singles_to_remove
-            end
+            decide_to_collapse_or_remove(perm, perm_count, singles_to_remove: perm - tokens_encountered)
           end
           tokens_encountered += perm
         end
       end
 
+      def permutations_to_consider_collapsing
+        @permutations_scanned.select do |_k, v|
+          v.positive?
+        end.sort_by do |k, v|
+          [-v, -k.size] # reverse order
+        end
+      end
+
+      def decide_to_collapse_or_remove(perm, perm_count, singles_to_remove:)
+        if !singles_to_remove.empty? || combination_significant?(perm, perm_count)
+          @to_collapse << perm if perm.size > 1
+          @to_remove |= singles_to_remove
+        end
+      end
+
       # Even if we encounter a potential collapsed key which occurs less often than the single tokens that make it up,
       # we still want to add the collapsed key if it shows up "enough" times.
       def combination_significant?(perm, perm_count)
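The new `permutations_to_consider_collapsing` keeps the original ordering trick: highest scan count first, with longer permutations winning ties. A quick sketch with made-up counts:

```ruby
scanned = { %w[empty jar] => 3.0, %w[empty] => 3.0, %w[man] => 1.0, %w[dusty] => 0.0 }

scanned.select { |_k, v| v.positive? }
       .sort_by { |k, v| [-v, -k.size] }
# => [[["empty", "jar"], 3.0], [["empty"], 3.0], [["man"], 1.0]]
# The two-token permutation beats the singleton "empty" on the tie.
```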
data/lib/text_rank/rank_filter/normalize_unit_vector.rb

@@ -45,8 +45,9 @@ module TextRank
      # @return [Hash<String, Float>]
      def filter!(ranks, **_)
        return if ranks.empty?
+
        total = Math.sqrt(ranks.values.map { |v| v * v }.reduce(:+))
-        Hash[ranks.map { |k, v| [k, v / total] }]
+        ranks.transform_values { |v| v / total }
      end
 
    end
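The `transform_values` rewrite keeps the math identical: every rank is divided by the Euclidean norm, so the resulting rank vector has unit length. A worked sketch:

```ruby
ranks = { 'town' => 3.0, 'empty' => 4.0 }

total = Math.sqrt(ranks.values.map { |v| v * v }.reduce(:+)) # => 5.0
ranks.transform_values { |v| v / total }
# => { "town" => 0.6, "empty" => 0.8 }  (0.6**2 + 0.8**2 == 1.0)
```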