text_rank 1.1.0 → 1.1.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/text_rank/rank_filter/collapse_adjacent.rb +160 -24
- data/lib/text_rank/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: f76e0559c4da8e9461b2e89139746e52bcc1dd56
|
4
|
+
data.tar.gz: 81404e9523ff19eaec3839859f8a83db9690274e
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: fd62d702fea2f2ba86fb84bb701201abbf0a5ae647734b87506b40d83f172f95661ebfc1c4d5eb702db2ad7284cd3a4ca5d90db809fa8841e6ee3b141dc8498d
|
7
|
+
data.tar.gz: 2568eae71807a231d34a6db0fd6549a22cd4b7e32991048b01c57918b4875facebd63aea665080c487cc492292bc76ede9a9ee82d851d8abd1f72c66a903cd81
|
@@ -5,6 +5,19 @@ module TextRank
|
|
5
5
|
# token keywords into a combined keyword when those keywords are adjacent
|
6
6
|
# to each other in the original text.
|
7
7
|
#
|
8
|
+
# It tries to do this in as intelligent a manner as possible, keeping the single
|
9
|
+
# tokens that comprise a combination when one or more of the single tokens occur
|
10
|
+
# more often than the combination.
|
11
|
+
#
|
12
|
+
# This filter operates on the original (non-filtered) text in order to more
|
13
|
+
# intelligently determine true text adjacency versus token adjacency (e.g.
|
14
|
+
# two tokens can be adjacent even though they appeared in the original text
|
15
|
+
# on separate lines with punctuation in between). However, because it operates
|
16
|
+
# on the original text we may fail to find some combinations due to the
|
17
|
+
# keyword tokens not exactly matching the original text any more (e.g. if
|
18
|
+
# ASCII folding has occurred). The goal is to err on the side of caution:
|
19
|
+
# it is better to not suggest a combination than to suggest a bad combination.
|
20
|
+
#
|
8
21
|
# = Example
|
9
22
|
#
|
10
23
|
# CollapseAdjacent.new(ranks_to_collapse: 6, max_tokens_to_combine: 2).filter!(
|
@@ -34,17 +47,24 @@ module TextRank
|
|
34
47
|
# "peace" => 0.2905352582752693,
|
35
48
|
# "inhabitants" => 0.12715120116732137,
|
36
49
|
# "cares" => 0.0697383057947685,
|
37
|
-
#
|
50
|
+
# "town siege" => 0.2365184450186848,
|
51
|
+
# "cities blessings" => 0.21272821337880285,
|
52
|
+
# "arts florish" => 0.146247479840506,
|
53
|
+
# "devoured envy" => 0.1424776818760168,
|
54
|
+
# "anxieties plagues" => 0.12821144722639122,
|
55
|
+
# "peace" => 0.07976303576999531,
|
56
|
+
# "inhabitants" => 0.03490786580297893,
|
57
|
+
# "cares" => 0.019145831086624026,
|
58
|
+
# }
|
38
59
|
##
|
39
60
|
class CollapseAdjacent
|
40
61
|
|
41
|
-
# @
|
42
|
-
# @
|
43
|
-
# @
|
44
|
-
|
45
|
-
|
46
|
-
@
|
47
|
-
@ignore_case = !!ignore_case
|
62
|
+
# @option options [Fixnum] ranks_to_collapse the top N ranks in which to look for collapsable keywords
|
63
|
+
# @option options [Fixnum] max_tokens_to_combine the maximum number of tokens to collapse into a combined keyword
|
64
|
+
# @option options [true, false] ignore_case whether to ignore case when finding adjacent keywords in original text
|
65
|
+
# @option options [String] delimiter an optional delimiter between adjacent keywords in original text
|
66
|
+
def initialize(**options)
|
67
|
+
@options = options
|
48
68
|
end
|
49
69
|
|
50
70
|
# Perform the filter on the ranks
|
@@ -52,28 +72,144 @@ module TextRank
|
|
52
72
|
# @param original_text [String] the original text (pre-tokenization) from which to find collapsable keywords
|
53
73
|
# @return [Hash<String, Float>]
|
54
74
|
def filter!(ranks, original_text:, **_)
|
55
|
-
|
56
|
-
loop do
|
57
|
-
permutation = collapse_one(ranks.keys.first(@ranks_to_collapse - collapsed.size), original_text) or break
|
58
|
-
collapsed[permutation.join(' ')] = ranks.values_at(*permutation).max
|
59
|
-
permutation.each { |token| ranks.delete(token) }
|
60
|
-
end
|
61
|
-
collapsed.merge!(ranks)
|
62
|
-
Hash[collapsed.sort_by { |_, v| -v }]
|
75
|
+
TokenCollapser.new(tokens: ranks, text: original_text, **@options).collapse
|
63
76
|
end
|
64
77
|
|
65
78
|
private
|
66
79
|
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
80
|
+
class TokenCollapser
|
81
|
+
|
82
|
+
def initialize(tokens:, text:, ranks_to_collapse: 10, max_tokens_to_combine: 2, ignore_case: true, delimiter: ' ', **_)
|
83
|
+
@tokens = tokens
|
84
|
+
@text = text
|
85
|
+
|
86
|
+
@ranks_to_collapse = ranks_to_collapse
|
87
|
+
@max_tokens_to_combine = max_tokens_to_combine
|
88
|
+
@ignore_case = !!ignore_case
|
89
|
+
@delimiter = delimiter.to_s == '' ? ' ' : delimiter
|
90
|
+
|
91
|
+
@to_collapse = Set.new # Track the permutations we plan to collapse
|
92
|
+
@to_remove = Set.new # Track the single tokens we plan to remove (due to being collapsed)
|
93
|
+
@permutations_scanned = {} # Track how many occurrences of each permutation we found in the original text
|
94
|
+
@combination_significance_threshold = 0.3 # The percent of occurrences of a combo of tokens to the occurrences of single tokens required to force collapsing
|
95
|
+
end
|
96
|
+
|
97
|
+
def delimiter_re
|
98
|
+
@delimiter_re ||= /#{@delimiter}+/
|
99
|
+
end
|
100
|
+
|
101
|
+
def collapse
|
102
|
+
# We make multiple passes at collapsing because after the first pass we may have
|
103
|
+
# replaced two or more singletons with a collapsed token, bumping up one or more
|
104
|
+
# single tokens from below the cut to above it. So we'll continue searching
|
105
|
+
# until all of the top N final keywords (single or collapsed) have been
|
106
|
+
# considered.
|
107
|
+
loop do
|
108
|
+
single_tokens_to_consider = @tokens.keys.first(@ranks_to_collapse + @to_remove.size - @to_collapse.size) - @to_remove.to_a
|
109
|
+
scan_text_for_all_permutations_of(single_tokens_to_consider) or break
|
110
|
+
decide_what_to_collapse_and_what_to_remove
|
111
|
+
end
|
112
|
+
|
113
|
+
# We now know what to collapse and what to remove, so we can start safely
|
114
|
+
# modifying the tokens hash
|
115
|
+
@to_collapse.each do |perm|
|
116
|
+
values = @tokens.values_at(*perm)
|
117
|
+
@tokens[perm.join(@delimiter)] = values.reduce(:+) / values.size
|
118
|
+
end
|
119
|
+
@tokens.reject! do |k, _|
|
120
|
+
@to_remove.include?(k)
|
121
|
+
end
|
122
|
+
|
123
|
+
# Because we've made changes to the tokens hash, we need to re-normalize so that
|
124
|
+
# the sum of all token ranks is still 1.
|
125
|
+
normalize(@tokens)
|
126
|
+
end
|
127
|
+
|
128
|
+
# We need to be efficient about how we search for the large number of possible collapsed keywords.
|
129
|
+
# Doing them one at a time is very expensive and performs at least 20 times slower in my tests.
|
130
|
+
# And since we do multiple passes we need to be careful about not searching for the same combo
|
131
|
+
# more than once. So for every combo (and the single tokens themselves) we've searched for we
|
132
|
+
# keep track of the number of times we've found them.
|
133
|
+
#
|
134
|
+
# Even for single tokens this may be zero due to some modification from the original text before
|
135
|
+
# tokenization (e.g. ASCII folding). That's okay. We're just making the best effort we can
|
136
|
+
# to find what we can.
|
137
|
+
def scan_text_for_all_permutations_of(single_tokens)
|
138
|
+
perms = []
|
139
|
+
# NOTE that by reversing the order we craft the regex to prefer larger combinations over
|
140
|
+
# smaller combinations (or singletons).
|
141
|
+
(1..@max_tokens_to_combine).to_a.reverse.map do |nn|
|
142
|
+
single_tokens.permutation(nn).each do |perm|
|
143
|
+
unless @permutations_scanned.key?(perm)
|
144
|
+
@permutations_scanned[perm] = 0
|
145
|
+
perms << perm
|
146
|
+
end
|
147
|
+
end
|
148
|
+
end
|
149
|
+
scan_text_for(perms) do |s|
|
150
|
+
s = s.downcase if @ignore_case
|
151
|
+
@permutations_scanned[s.split(delimiter_re)] += 1
|
152
|
+
end unless perms.empty?
|
153
|
+
end
|
154
|
+
|
155
|
+
# Because we're scanning the original text, we've lost all of the character filtering we did
|
156
|
+
# prior to tokenization, but that's important because we need the original context to be more
|
157
|
+
# choosy. Still, we need to know what delimiter goes between collapsed tokens (since it may
|
158
|
+
# not always be a space). Likewise, we can't always assume the Lowercase filter has been
|
159
|
+
# used, so we allow for customization with the :ignore_case & :delimiter options.
|
160
|
+
def scan_text_for(all)
|
161
|
+
flags = 0
|
162
|
+
flags |= Regexp::IGNORECASE if @ignore_case
|
163
|
+
searches = all.map do |a|
|
164
|
+
a.is_a?(Array) ? a.join(delimiter_re.to_s) : a
|
165
|
+
end
|
166
|
+
re = Regexp.new("\\b(#{searches.join('|')})\\b", flags)
|
167
|
+
|
168
|
+
any_found = false
|
169
|
+
@text.scan(re) do |s, _|
|
170
|
+
yield s
|
171
|
+
any_found = true
|
172
|
+
end
|
173
|
+
any_found
|
174
|
+
end
|
175
|
+
|
176
|
+
# Once we have the number of occurrences for every permutation (including singletons)
|
177
|
+
# we can make choices about what to collapse and what to remove. We won't make any
|
178
|
+
# modifications to the original token list yet but just keep track of what we plan
|
179
|
+
# to collapse/remove.
|
180
|
+
def decide_what_to_collapse_and_what_to_remove
|
181
|
+
non_empty_ordered = @permutations_scanned.select do |k, v|
|
182
|
+
v > 0
|
183
|
+
end.sort_by do |k, v|
|
184
|
+
[-v, -k.size] # reverse order
|
185
|
+
end
|
186
|
+
|
187
|
+
tokens_encountered = []
|
188
|
+
non_empty_ordered.each do |perm, perm_count|
|
189
|
+
if perm.size > 1
|
190
|
+
singles_to_remove = perm - tokens_encountered
|
191
|
+
if !singles_to_remove.empty? || combination_significant?(perm, perm_count)
|
192
|
+
@to_collapse << perm if perm.size > 1
|
193
|
+
@to_remove |= singles_to_remove
|
194
|
+
end
|
195
|
+
end
|
196
|
+
tokens_encountered += perm
|
74
197
|
end
|
75
198
|
end
|
76
|
-
|
199
|
+
|
200
|
+
# Even if we encounter a potential collapsed key which occurs less often than the single tokens that make it up,
|
201
|
+
# we still want to add the collapsed key if it shows up "enough" times.
|
202
|
+
def combination_significant?(perm, perm_count)
|
203
|
+
total_single_count = perm.reduce(0) { |s, t| s + @permutations_scanned[[t]] } / perm.size.to_f
|
204
|
+
total_single_count.zero? || (perm_count / total_single_count) > @combination_significance_threshold
|
205
|
+
end
|
206
|
+
|
207
|
+
# Scale all of the token ranks so they add up to 1.
|
208
|
+
def normalize(tokens)
|
209
|
+
total = tokens.reduce(0.0) { |s, (_, v)| s + v }
|
210
|
+
Hash[tokens.map { |k, v| [k, v / total] }.sort_by { |_, v| -v }]
|
211
|
+
end
|
212
|
+
|
77
213
|
end
|
78
214
|
|
79
215
|
end
|
data/lib/text_rank/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: text_rank
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.1.0
|
4
|
+
version: 1.1.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- David McCullars
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2016-05-
|
11
|
+
date: 2016-05-12 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|