text_rank 1.2.3 → 1.3.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.codeclimate.yml +1 -1
- data/.gitignore +4 -0
- data/.rubocop.yml +7 -0
- data/.ruby-version +1 -1
- data/.travis.yml +1 -0
- data/Rakefile +5 -0
- data/bin/console +3 -3
- data/ext/text_rank/extconf.rb +3 -0
- data/ext/text_rank/page_rank_sparse_native.c +300 -0
- data/ext/text_rank/page_rank_sparse_native.h +93 -0
- data/ext/text_rank/text_rank.c +5 -0
- data/lib/page_rank/base.rb +12 -9
- data/lib/page_rank/dense.rb +3 -2
- data/lib/page_rank/sparse.rb +6 -7
- data/lib/page_rank/sparse_native.rb +21 -0
- data/lib/page_rank.rb +7 -4
- data/lib/text_rank/char_filter/ascii_folding.rb +5 -1
- data/lib/text_rank/char_filter/strip_possessive.rb +2 -2
- data/lib/text_rank/char_filter/undo_contractions.rb +1 -137
- data/lib/text_rank/char_filter/undo_contractions.yml +135 -0
- data/lib/text_rank/char_filter.rb +1 -1
- data/lib/text_rank/fingerprint.rb +10 -18
- data/lib/text_rank/fingerprint_overlap.rb +55 -0
- data/lib/text_rank/graph_strategy/coocurrence.rb +15 -6
- data/lib/text_rank/keyword_extractor.rb +32 -25
- data/lib/text_rank/rank_filter/collapse_adjacent.rb +48 -25
- data/lib/text_rank/rank_filter/normalize_probability.rb +2 -1
- data/lib/text_rank/rank_filter/normalize_unit_vector.rb +2 -1
- data/lib/text_rank/token_filter/part_of_speech.rb +0 -1
- data/lib/text_rank/token_filter/stopwords.rb +1 -321
- data/lib/text_rank/token_filter/stopwords.yml +317 -0
- data/lib/text_rank/tokenizer/money.rb +11 -6
- data/lib/text_rank/tokenizer/number.rb +4 -3
- data/lib/text_rank/tokenizer/punctuation.rb +4 -1
- data/lib/text_rank/tokenizer/url.rb +3 -0
- data/lib/text_rank/tokenizer/whitespace.rb +4 -1
- data/lib/text_rank/tokenizer/word.rb +5 -2
- data/lib/text_rank/tokenizer.rb +1 -1
- data/lib/text_rank/version.rb +3 -1
- data/lib/text_rank.rb +14 -9
- data/text_rank.gemspec +4 -1
- metadata +48 -12
@@ -77,6 +77,7 @@ module TextRank
|
|
77
77
|
|
78
78
|
class TokenCollapser
|
79
79
|
|
80
|
+
# rubocop:disable Metrics/ParameterLists
|
80
81
|
def initialize(tokens:, text:, ranks_to_collapse: 10, max_tokens_to_combine: 2, ignore_case: true, delimiter: ' ', **_)
|
81
82
|
@tokens = tokens
|
82
83
|
@text = text
|
@@ -91,6 +92,7 @@ module TextRank
|
|
91
92
|
@permutations_scanned = Hash.new(0.0) # Track how many occurrences of each permutation we found in the original text
|
92
93
|
@combination_significance_threshold = 0.3 # The percent of occurrences of a combo of tokens to the occurrences of single tokens required to force collapsing
|
93
94
|
end
|
95
|
+
# rubocop:enable Metrics/ParameterLists
|
94
96
|
|
95
97
|
# :nodoc:
|
96
98
|
def delimiter_re
|
@@ -104,23 +106,36 @@ module TextRank
|
|
104
106
|
# single tokens from below the cut to above it. So we'll continue searching
|
105
107
|
# until all of the top N final keywords (single or collapsed) have been
|
106
108
|
# considered.
|
107
|
-
|
108
|
-
|
109
|
-
single_tokens_to_consider = regexp_safe_tokens.first(@ranks_to_collapse + @to_remove.size - @to_collapse.size) - @to_remove.to_a
|
110
|
-
scan_text_for_all_permutations_of(single_tokens_to_consider) or break
|
111
|
-
decide_what_to_collapse_and_what_to_remove
|
109
|
+
while collapse_attempt
|
110
|
+
# keep trying
|
112
111
|
end
|
113
112
|
|
114
113
|
# We now know what to collapse and what to remove, so we can start safely
|
115
114
|
# modifying the tokens hash
|
115
|
+
apply_collapse
|
116
|
+
end
|
117
|
+
|
118
|
+
# :nodoc:
|
119
|
+
def collapse_attempt
|
120
|
+
regexp_safe_tokens = @tokens.keys.select { |s| Regexp.escape(s) == s }
|
121
|
+
single_tokens_to_consider = regexp_safe_tokens.first(@ranks_to_collapse + @to_remove.size - @to_collapse.size) - @to_remove.to_a
|
122
|
+
scan_text_for_all_permutations_of(single_tokens_to_consider) or return false
|
123
|
+
decide_what_to_collapse_and_what_to_remove
|
124
|
+
true
|
125
|
+
end
|
126
|
+
|
127
|
+
# :nodoc:
|
128
|
+
def apply_collapse
|
116
129
|
@to_collapse.each do |perm|
|
117
130
|
values = @tokens.values_at(*perm).compact
|
118
131
|
# This might be empty if somehow the scanned permutation doesn't
|
119
132
|
# exactly match one of the tokens (e.g. ASCII-folding gone awry).
|
120
133
|
# The goal is to do the best we can, so if we can't find it, ignore.
|
121
134
|
next if values.empty?
|
135
|
+
|
122
136
|
@tokens[perm.join(@delimiter)] = values.reduce(:+) / values.size
|
123
137
|
end
|
138
|
+
|
124
139
|
@tokens.reject! do |k, _|
|
125
140
|
@to_remove.include?(k)
|
126
141
|
end || @tokens
|
@@ -136,16 +151,10 @@ module TextRank
|
|
136
151
|
# tokenization (e.g. ASCII folding). That's okay. We're just making the best effort we can
|
137
152
|
# to find what we can.
|
138
153
|
def scan_text_for_all_permutations_of(single_tokens)
|
139
|
-
perms = []
|
140
154
|
# NOTE that by reversing the order we craft the regex to prefer larger combinations over
|
141
155
|
# smaller combinations (or singletons).
|
142
|
-
(1..@max_tokens_to_combine).to_a.reverse.
|
143
|
-
single_tokens
|
144
|
-
unless @permutations_scanned.key?(perm)
|
145
|
-
@permutations_scanned[perm] = 0
|
146
|
-
perms << perm
|
147
|
-
end
|
148
|
-
end
|
156
|
+
perms = (1..@max_tokens_to_combine).to_a.reverse.flat_map do |n|
|
157
|
+
scan_text_for_n_permutations_of(single_tokens, n)
|
149
158
|
end
|
150
159
|
scan_text_for(perms) do |s|
|
151
160
|
s = s.downcase if @ignore_case
|
@@ -153,6 +162,15 @@ module TextRank
|
|
153
162
|
end unless perms.empty?
|
154
163
|
end
|
155
164
|
|
165
|
+
def scan_text_for_n_permutations_of(single_tokens, n)
|
166
|
+
single_tokens.permutation(n).map do |perm|
|
167
|
+
unless @permutations_scanned.key?(perm)
|
168
|
+
@permutations_scanned[perm] = 0
|
169
|
+
perm
|
170
|
+
end
|
171
|
+
end.compact
|
172
|
+
end
|
173
|
+
|
156
174
|
# Because we're scanning the original text, we've lost all of the character filtering we did
|
157
175
|
# prior to tokenization, but that's important because we need the original context to be more
|
158
176
|
# choosy. Still, we need to know what delimiter goes between collapsed tokens (since it may
|
@@ -179,25 +197,30 @@ module TextRank
|
|
179
197
|
# modifications to the original token list yet but just keep track of what we plan
|
180
198
|
# to collapse/remove.
|
181
199
|
def decide_what_to_collapse_and_what_to_remove
|
182
|
-
non_empty_ordered = @permutations_scanned.select do |k, v|
|
183
|
-
v > 0
|
184
|
-
end.sort_by do |k, v|
|
185
|
-
[-v, -k.size] # reverse order
|
186
|
-
end
|
187
|
-
|
188
200
|
tokens_encountered = []
|
189
|
-
|
201
|
+
permutations_to_consider_collapsing.each do |perm, perm_count|
|
190
202
|
if perm.size > 1
|
191
|
-
singles_to_remove
|
192
|
-
if !singles_to_remove.empty? || combination_significant?(perm, perm_count)
|
193
|
-
@to_collapse << perm if perm.size > 1
|
194
|
-
@to_remove |= singles_to_remove
|
195
|
-
end
|
203
|
+
decide_to_collapse_or_remove(perm, perm_count, singles_to_remove: perm - tokens_encountered)
|
196
204
|
end
|
197
205
|
tokens_encountered += perm
|
198
206
|
end
|
199
207
|
end
|
200
208
|
|
209
|
+
def permutations_to_consider_collapsing
|
210
|
+
@permutations_scanned.select do |_k, v|
|
211
|
+
v.positive?
|
212
|
+
end.sort_by do |k, v|
|
213
|
+
[-v, -k.size] # reverse order
|
214
|
+
end
|
215
|
+
end
|
216
|
+
|
217
|
+
def decide_to_collapse_or_remove(perm, perm_count, singles_to_remove:)
|
218
|
+
if !singles_to_remove.empty? || combination_significant?(perm, perm_count)
|
219
|
+
@to_collapse << perm if perm.size > 1
|
220
|
+
@to_remove |= singles_to_remove
|
221
|
+
end
|
222
|
+
end
|
223
|
+
|
201
224
|
# Even if we encounter a potential collapsed key which occurs less often than the single tokens that make it up,
|
202
225
|
# we still want to add the collapsed key if it shows up "enough" times.
|
203
226
|
def combination_significant?(perm, perm_count)
|
@@ -45,8 +45,9 @@ module TextRank
|
|
45
45
|
# @return [Hash<String, Float>]
|
46
46
|
def filter!(ranks, **_)
|
47
47
|
return if ranks.empty?
|
48
|
+
|
48
49
|
total = Math.sqrt(ranks.values.map { |v| v * v }.reduce(:+))
|
49
|
-
|
50
|
+
ranks.transform_values { |v| v / total }
|
50
51
|
end
|
51
52
|
|
52
53
|
end
|
@@ -1,5 +1,3 @@
|
|
1
|
-
require 'set'
|
2
|
-
|
3
1
|
module TextRank
|
4
2
|
module TokenFilter
|
5
3
|
##
|
@@ -15,325 +13,7 @@ module TextRank
|
|
15
13
|
class Stopwords
|
16
14
|
|
17
15
|
# Default English stop-word list.
|
18
|
-
STOP_WORDS = Set.new(
|
19
|
-
a
|
20
|
-
about
|
21
|
-
above
|
22
|
-
across
|
23
|
-
after
|
24
|
-
afterwards
|
25
|
-
again
|
26
|
-
against
|
27
|
-
all
|
28
|
-
almost
|
29
|
-
alone
|
30
|
-
along
|
31
|
-
already
|
32
|
-
also
|
33
|
-
although
|
34
|
-
always
|
35
|
-
am
|
36
|
-
among
|
37
|
-
amongst
|
38
|
-
amoungst
|
39
|
-
amount
|
40
|
-
an
|
41
|
-
and
|
42
|
-
another
|
43
|
-
any
|
44
|
-
anyhow
|
45
|
-
anyone
|
46
|
-
anything
|
47
|
-
anyway
|
48
|
-
anywhere
|
49
|
-
are
|
50
|
-
around
|
51
|
-
as
|
52
|
-
at
|
53
|
-
back
|
54
|
-
be
|
55
|
-
became
|
56
|
-
because
|
57
|
-
become
|
58
|
-
becomes
|
59
|
-
becoming
|
60
|
-
been
|
61
|
-
before
|
62
|
-
beforehand
|
63
|
-
behind
|
64
|
-
being
|
65
|
-
below
|
66
|
-
beside
|
67
|
-
besides
|
68
|
-
between
|
69
|
-
beyond
|
70
|
-
bill
|
71
|
-
both
|
72
|
-
bottom
|
73
|
-
but
|
74
|
-
by
|
75
|
-
call
|
76
|
-
can
|
77
|
-
cannot
|
78
|
-
cant
|
79
|
-
co
|
80
|
-
con
|
81
|
-
could
|
82
|
-
couldnt
|
83
|
-
cry
|
84
|
-
de
|
85
|
-
describe
|
86
|
-
detail
|
87
|
-
do
|
88
|
-
done
|
89
|
-
down
|
90
|
-
due
|
91
|
-
during
|
92
|
-
each
|
93
|
-
eg
|
94
|
-
eight
|
95
|
-
either
|
96
|
-
eleven
|
97
|
-
else
|
98
|
-
elsewhere
|
99
|
-
empty
|
100
|
-
enough
|
101
|
-
etc
|
102
|
-
even
|
103
|
-
ever
|
104
|
-
every
|
105
|
-
everyone
|
106
|
-
everything
|
107
|
-
everywhere
|
108
|
-
except
|
109
|
-
few
|
110
|
-
fifteen
|
111
|
-
fify
|
112
|
-
fill
|
113
|
-
find
|
114
|
-
fire
|
115
|
-
first
|
116
|
-
five
|
117
|
-
for
|
118
|
-
former
|
119
|
-
formerly
|
120
|
-
forty
|
121
|
-
found
|
122
|
-
four
|
123
|
-
from
|
124
|
-
front
|
125
|
-
full
|
126
|
-
further
|
127
|
-
get
|
128
|
-
give
|
129
|
-
go
|
130
|
-
had
|
131
|
-
has
|
132
|
-
hasnt
|
133
|
-
have
|
134
|
-
he
|
135
|
-
hence
|
136
|
-
her
|
137
|
-
here
|
138
|
-
hereafter
|
139
|
-
hereby
|
140
|
-
herein
|
141
|
-
hereupon
|
142
|
-
hers
|
143
|
-
herself
|
144
|
-
him
|
145
|
-
himself
|
146
|
-
his
|
147
|
-
how
|
148
|
-
however
|
149
|
-
hundred
|
150
|
-
ie
|
151
|
-
if
|
152
|
-
in
|
153
|
-
inc
|
154
|
-
indeed
|
155
|
-
interest
|
156
|
-
into
|
157
|
-
is
|
158
|
-
it
|
159
|
-
its
|
160
|
-
itself
|
161
|
-
keep
|
162
|
-
last
|
163
|
-
latter
|
164
|
-
latterly
|
165
|
-
least
|
166
|
-
less
|
167
|
-
ltd
|
168
|
-
made
|
169
|
-
many
|
170
|
-
may
|
171
|
-
me
|
172
|
-
meanwhile
|
173
|
-
might
|
174
|
-
mill
|
175
|
-
mine
|
176
|
-
more
|
177
|
-
moreover
|
178
|
-
most
|
179
|
-
mostly
|
180
|
-
move
|
181
|
-
much
|
182
|
-
must
|
183
|
-
my
|
184
|
-
myself
|
185
|
-
name
|
186
|
-
namely
|
187
|
-
neither
|
188
|
-
never
|
189
|
-
nevertheless
|
190
|
-
next
|
191
|
-
nine
|
192
|
-
no
|
193
|
-
nobody
|
194
|
-
none
|
195
|
-
noone
|
196
|
-
nor
|
197
|
-
not
|
198
|
-
nothing
|
199
|
-
now
|
200
|
-
nowhere
|
201
|
-
of
|
202
|
-
off
|
203
|
-
often
|
204
|
-
on
|
205
|
-
once
|
206
|
-
one
|
207
|
-
only
|
208
|
-
onto
|
209
|
-
or
|
210
|
-
other
|
211
|
-
others
|
212
|
-
otherwise
|
213
|
-
our
|
214
|
-
ours
|
215
|
-
ourselves
|
216
|
-
out
|
217
|
-
over
|
218
|
-
own
|
219
|
-
part
|
220
|
-
per
|
221
|
-
perhaps
|
222
|
-
please
|
223
|
-
put
|
224
|
-
rather
|
225
|
-
re
|
226
|
-
same
|
227
|
-
see
|
228
|
-
seem
|
229
|
-
seemed
|
230
|
-
seeming
|
231
|
-
seems
|
232
|
-
serious
|
233
|
-
several
|
234
|
-
she
|
235
|
-
should
|
236
|
-
show
|
237
|
-
side
|
238
|
-
since
|
239
|
-
sincere
|
240
|
-
six
|
241
|
-
sixty
|
242
|
-
so
|
243
|
-
some
|
244
|
-
somehow
|
245
|
-
someone
|
246
|
-
something
|
247
|
-
sometime
|
248
|
-
sometimes
|
249
|
-
somewhere
|
250
|
-
still
|
251
|
-
such
|
252
|
-
system
|
253
|
-
take
|
254
|
-
ten
|
255
|
-
than
|
256
|
-
that
|
257
|
-
the
|
258
|
-
their
|
259
|
-
them
|
260
|
-
themselves
|
261
|
-
then
|
262
|
-
thence
|
263
|
-
there
|
264
|
-
thereafter
|
265
|
-
thereby
|
266
|
-
therefore
|
267
|
-
therein
|
268
|
-
thereupon
|
269
|
-
these
|
270
|
-
they
|
271
|
-
thickv
|
272
|
-
thin
|
273
|
-
third
|
274
|
-
this
|
275
|
-
those
|
276
|
-
though
|
277
|
-
three
|
278
|
-
through
|
279
|
-
throughout
|
280
|
-
thru
|
281
|
-
thus
|
282
|
-
to
|
283
|
-
together
|
284
|
-
too
|
285
|
-
top
|
286
|
-
toward
|
287
|
-
towards
|
288
|
-
twelve
|
289
|
-
twenty
|
290
|
-
two
|
291
|
-
un
|
292
|
-
under
|
293
|
-
until
|
294
|
-
up
|
295
|
-
upon
|
296
|
-
us
|
297
|
-
very
|
298
|
-
via
|
299
|
-
was
|
300
|
-
we
|
301
|
-
well
|
302
|
-
were
|
303
|
-
what
|
304
|
-
whatever
|
305
|
-
when
|
306
|
-
whence
|
307
|
-
whenever
|
308
|
-
where
|
309
|
-
whereafter
|
310
|
-
whereas
|
311
|
-
whereby
|
312
|
-
wherein
|
313
|
-
whereupon
|
314
|
-
wherever
|
315
|
-
whether
|
316
|
-
which
|
317
|
-
while
|
318
|
-
whither
|
319
|
-
who
|
320
|
-
whoever
|
321
|
-
whole
|
322
|
-
whom
|
323
|
-
whose
|
324
|
-
why
|
325
|
-
will
|
326
|
-
with
|
327
|
-
within
|
328
|
-
without
|
329
|
-
would
|
330
|
-
yet
|
331
|
-
you
|
332
|
-
your
|
333
|
-
yours
|
334
|
-
yourself
|
335
|
-
yourselves
|
336
|
-
])
|
16
|
+
STOP_WORDS = Set.new(YAML.load_file(File.expand_path('stopwords.yml', __dir__)))
|
337
17
|
|
338
18
|
# Perform the filter
|
339
19
|
# @param tokens [Array<String>]
|