text_rank 1.2.3 → 1.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.codeclimate.yml +1 -1
- data/.gitignore +4 -0
- data/.rubocop.yml +7 -0
- data/.ruby-version +1 -1
- data/.travis.yml +1 -0
- data/Rakefile +5 -0
- data/bin/console +3 -3
- data/ext/text_rank/extconf.rb +3 -0
- data/ext/text_rank/page_rank_sparse_native.c +300 -0
- data/ext/text_rank/page_rank_sparse_native.h +93 -0
- data/ext/text_rank/text_rank.c +5 -0
- data/lib/page_rank/base.rb +12 -9
- data/lib/page_rank/dense.rb +3 -2
- data/lib/page_rank/sparse.rb +6 -7
- data/lib/page_rank/sparse_native.rb +21 -0
- data/lib/page_rank.rb +7 -4
- data/lib/text_rank/char_filter/ascii_folding.rb +5 -1
- data/lib/text_rank/char_filter/strip_possessive.rb +2 -2
- data/lib/text_rank/char_filter/undo_contractions.rb +1 -137
- data/lib/text_rank/char_filter/undo_contractions.yml +135 -0
- data/lib/text_rank/char_filter.rb +1 -1
- data/lib/text_rank/fingerprint.rb +10 -18
- data/lib/text_rank/fingerprint_overlap.rb +55 -0
- data/lib/text_rank/graph_strategy/coocurrence.rb +15 -6
- data/lib/text_rank/keyword_extractor.rb +32 -25
- data/lib/text_rank/rank_filter/collapse_adjacent.rb +48 -25
- data/lib/text_rank/rank_filter/normalize_probability.rb +2 -1
- data/lib/text_rank/rank_filter/normalize_unit_vector.rb +2 -1
- data/lib/text_rank/token_filter/part_of_speech.rb +0 -1
- data/lib/text_rank/token_filter/stopwords.rb +1 -321
- data/lib/text_rank/token_filter/stopwords.yml +317 -0
- data/lib/text_rank/tokenizer/money.rb +11 -6
- data/lib/text_rank/tokenizer/number.rb +4 -3
- data/lib/text_rank/tokenizer/punctuation.rb +4 -1
- data/lib/text_rank/tokenizer/url.rb +3 -0
- data/lib/text_rank/tokenizer/whitespace.rb +4 -1
- data/lib/text_rank/tokenizer/word.rb +5 -2
- data/lib/text_rank/tokenizer.rb +1 -1
- data/lib/text_rank/version.rb +3 -1
- data/lib/text_rank.rb +14 -9
- data/text_rank.gemspec +4 -1
- metadata +48 -12
@@ -77,6 +77,7 @@ module TextRank
|
|
77
77
|
|
78
78
|
class TokenCollapser
|
79
79
|
|
80
|
+
# rubocop:disable Metrics/ParameterLists
|
80
81
|
def initialize(tokens:, text:, ranks_to_collapse: 10, max_tokens_to_combine: 2, ignore_case: true, delimiter: ' ', **_)
|
81
82
|
@tokens = tokens
|
82
83
|
@text = text
|
@@ -91,6 +92,7 @@ module TextRank
|
|
91
92
|
@permutations_scanned = Hash.new(0.0) # Track how many occurrences of each permutation we found in the original text
|
92
93
|
@combination_significance_threshold = 0.3 # The percent of occurrences of a combo of tokens to the occurrences of single tokens required to force collapsing
|
93
94
|
end
|
95
|
+
# rubocop:enable Metrics/ParameterLists
|
94
96
|
|
95
97
|
# :nodoc:
|
96
98
|
def delimiter_re
|
@@ -104,23 +106,36 @@ module TextRank
|
|
104
106
|
# single tokens from below the cut to above it. So we'll continue searching
|
105
107
|
# until all of the top N final keywords (single or collapsed) have been
|
106
108
|
# considered.
|
107
|
-
|
108
|
-
|
109
|
-
single_tokens_to_consider = regexp_safe_tokens.first(@ranks_to_collapse + @to_remove.size - @to_collapse.size) - @to_remove.to_a
|
110
|
-
scan_text_for_all_permutations_of(single_tokens_to_consider) or break
|
111
|
-
decide_what_to_collapse_and_what_to_remove
|
109
|
+
while collapse_attempt
|
110
|
+
# keep trying
|
112
111
|
end
|
113
112
|
|
114
113
|
# We now know what to collapse and what to remove, so we can start safely
|
115
114
|
# modifying the tokens hash
|
115
|
+
apply_collapse
|
116
|
+
end
|
117
|
+
|
118
|
+
# :nodoc:
|
119
|
+
def collapse_attempt
|
120
|
+
regexp_safe_tokens = @tokens.keys.select { |s| Regexp.escape(s) == s }
|
121
|
+
single_tokens_to_consider = regexp_safe_tokens.first(@ranks_to_collapse + @to_remove.size - @to_collapse.size) - @to_remove.to_a
|
122
|
+
scan_text_for_all_permutations_of(single_tokens_to_consider) or return false
|
123
|
+
decide_what_to_collapse_and_what_to_remove
|
124
|
+
true
|
125
|
+
end
|
126
|
+
|
127
|
+
# :nodoc:
|
128
|
+
def apply_collapse
|
116
129
|
@to_collapse.each do |perm|
|
117
130
|
values = @tokens.values_at(*perm).compact
|
118
131
|
# This might be empty if somehow the scanned permutation doesn't
|
119
132
|
# exactly match one of the tokens (e.g. ASCII-folding gone awry).
|
120
133
|
# The goal is to do the best we can, so if we can't find it, ignore.
|
121
134
|
next if values.empty?
|
135
|
+
|
122
136
|
@tokens[perm.join(@delimiter)] = values.reduce(:+) / values.size
|
123
137
|
end
|
138
|
+
|
124
139
|
@tokens.reject! do |k, _|
|
125
140
|
@to_remove.include?(k)
|
126
141
|
end || @tokens
|
@@ -136,16 +151,10 @@ module TextRank
|
|
136
151
|
# tokenization (e.g. ASCII folding). That's okay. We're just making the best effort we can
|
137
152
|
# to find what we can.
|
138
153
|
def scan_text_for_all_permutations_of(single_tokens)
|
139
|
-
perms = []
|
140
154
|
# NOTE that by reversing the order we craft the regex to prefer larger combinations over
|
141
155
|
# smaller combinations (or singletons).
|
142
|
-
(1..@max_tokens_to_combine).to_a.reverse.
|
143
|
-
single_tokens
|
144
|
-
unless @permutations_scanned.key?(perm)
|
145
|
-
@permutations_scanned[perm] = 0
|
146
|
-
perms << perm
|
147
|
-
end
|
148
|
-
end
|
156
|
+
perms = (1..@max_tokens_to_combine).to_a.reverse.flat_map do |n|
|
157
|
+
scan_text_for_n_permutations_of(single_tokens, n)
|
149
158
|
end
|
150
159
|
scan_text_for(perms) do |s|
|
151
160
|
s = s.downcase if @ignore_case
|
@@ -153,6 +162,15 @@ module TextRank
|
|
153
162
|
end unless perms.empty?
|
154
163
|
end
|
155
164
|
|
165
|
+
def scan_text_for_n_permutations_of(single_tokens, n)
|
166
|
+
single_tokens.permutation(n).map do |perm|
|
167
|
+
unless @permutations_scanned.key?(perm)
|
168
|
+
@permutations_scanned[perm] = 0
|
169
|
+
perm
|
170
|
+
end
|
171
|
+
end.compact
|
172
|
+
end
|
173
|
+
|
156
174
|
# Because we're scanning the original text, we've lost all of the character filtering we did
|
157
175
|
# prior to tokenization, but that's important because we need the original context to be more
|
158
176
|
# choosy. Still, we need to know what delimiter goes between collapsed tokens (since it may
|
@@ -179,25 +197,30 @@ module TextRank
|
|
179
197
|
# modifications to the original token list yet but just keep track of what we plan
|
180
198
|
# to collapse/remove.
|
181
199
|
def decide_what_to_collapse_and_what_to_remove
|
182
|
-
non_empty_ordered = @permutations_scanned.select do |k, v|
|
183
|
-
v > 0
|
184
|
-
end.sort_by do |k, v|
|
185
|
-
[-v, -k.size] # reverse order
|
186
|
-
end
|
187
|
-
|
188
200
|
tokens_encountered = []
|
189
|
-
|
201
|
+
permutations_to_consider_collapsing.each do |perm, perm_count|
|
190
202
|
if perm.size > 1
|
191
|
-
singles_to_remove
|
192
|
-
if !singles_to_remove.empty? || combination_significant?(perm, perm_count)
|
193
|
-
@to_collapse << perm if perm.size > 1
|
194
|
-
@to_remove |= singles_to_remove
|
195
|
-
end
|
203
|
+
decide_to_collapse_or_remove(perm, perm_count, singles_to_remove: perm - tokens_encountered)
|
196
204
|
end
|
197
205
|
tokens_encountered += perm
|
198
206
|
end
|
199
207
|
end
|
200
208
|
|
209
|
+
def permutations_to_consider_collapsing
|
210
|
+
@permutations_scanned.select do |_k, v|
|
211
|
+
v.positive?
|
212
|
+
end.sort_by do |k, v|
|
213
|
+
[-v, -k.size] # reverse order
|
214
|
+
end
|
215
|
+
end
|
216
|
+
|
217
|
+
def decide_to_collapse_or_remove(perm, perm_count, singles_to_remove:)
|
218
|
+
if !singles_to_remove.empty? || combination_significant?(perm, perm_count)
|
219
|
+
@to_collapse << perm if perm.size > 1
|
220
|
+
@to_remove |= singles_to_remove
|
221
|
+
end
|
222
|
+
end
|
223
|
+
|
201
224
|
# Even if we encounter a potential collapsed key which occurs less often than the single tokens that make it up,
|
202
225
|
# we still want to add the collapsed key if it shows up "enough" times.
|
203
226
|
def combination_significant?(perm, perm_count)
|
@@ -45,8 +45,9 @@ module TextRank
|
|
45
45
|
# @return [Hash<String, Float>]
|
46
46
|
def filter!(ranks, **_)
|
47
47
|
return if ranks.empty?
|
48
|
+
|
48
49
|
total = Math.sqrt(ranks.values.map { |v| v * v }.reduce(:+))
|
49
|
-
|
50
|
+
ranks.transform_values { |v| v / total }
|
50
51
|
end
|
51
52
|
|
52
53
|
end
|
@@ -1,5 +1,3 @@
|
|
1
|
-
require 'set'
|
2
|
-
|
3
1
|
module TextRank
|
4
2
|
module TokenFilter
|
5
3
|
##
|
@@ -15,325 +13,7 @@ module TextRank
|
|
15
13
|
class Stopwords
|
16
14
|
|
17
15
|
# Default English stop-word list.
|
18
|
-
STOP_WORDS = Set.new(
|
19
|
-
a
|
20
|
-
about
|
21
|
-
above
|
22
|
-
across
|
23
|
-
after
|
24
|
-
afterwards
|
25
|
-
again
|
26
|
-
against
|
27
|
-
all
|
28
|
-
almost
|
29
|
-
alone
|
30
|
-
along
|
31
|
-
already
|
32
|
-
also
|
33
|
-
although
|
34
|
-
always
|
35
|
-
am
|
36
|
-
among
|
37
|
-
amongst
|
38
|
-
amoungst
|
39
|
-
amount
|
40
|
-
an
|
41
|
-
and
|
42
|
-
another
|
43
|
-
any
|
44
|
-
anyhow
|
45
|
-
anyone
|
46
|
-
anything
|
47
|
-
anyway
|
48
|
-
anywhere
|
49
|
-
are
|
50
|
-
around
|
51
|
-
as
|
52
|
-
at
|
53
|
-
back
|
54
|
-
be
|
55
|
-
became
|
56
|
-
because
|
57
|
-
become
|
58
|
-
becomes
|
59
|
-
becoming
|
60
|
-
been
|
61
|
-
before
|
62
|
-
beforehand
|
63
|
-
behind
|
64
|
-
being
|
65
|
-
below
|
66
|
-
beside
|
67
|
-
besides
|
68
|
-
between
|
69
|
-
beyond
|
70
|
-
bill
|
71
|
-
both
|
72
|
-
bottom
|
73
|
-
but
|
74
|
-
by
|
75
|
-
call
|
76
|
-
can
|
77
|
-
cannot
|
78
|
-
cant
|
79
|
-
co
|
80
|
-
con
|
81
|
-
could
|
82
|
-
couldnt
|
83
|
-
cry
|
84
|
-
de
|
85
|
-
describe
|
86
|
-
detail
|
87
|
-
do
|
88
|
-
done
|
89
|
-
down
|
90
|
-
due
|
91
|
-
during
|
92
|
-
each
|
93
|
-
eg
|
94
|
-
eight
|
95
|
-
either
|
96
|
-
eleven
|
97
|
-
else
|
98
|
-
elsewhere
|
99
|
-
empty
|
100
|
-
enough
|
101
|
-
etc
|
102
|
-
even
|
103
|
-
ever
|
104
|
-
every
|
105
|
-
everyone
|
106
|
-
everything
|
107
|
-
everywhere
|
108
|
-
except
|
109
|
-
few
|
110
|
-
fifteen
|
111
|
-
fify
|
112
|
-
fill
|
113
|
-
find
|
114
|
-
fire
|
115
|
-
first
|
116
|
-
five
|
117
|
-
for
|
118
|
-
former
|
119
|
-
formerly
|
120
|
-
forty
|
121
|
-
found
|
122
|
-
four
|
123
|
-
from
|
124
|
-
front
|
125
|
-
full
|
126
|
-
further
|
127
|
-
get
|
128
|
-
give
|
129
|
-
go
|
130
|
-
had
|
131
|
-
has
|
132
|
-
hasnt
|
133
|
-
have
|
134
|
-
he
|
135
|
-
hence
|
136
|
-
her
|
137
|
-
here
|
138
|
-
hereafter
|
139
|
-
hereby
|
140
|
-
herein
|
141
|
-
hereupon
|
142
|
-
hers
|
143
|
-
herself
|
144
|
-
him
|
145
|
-
himself
|
146
|
-
his
|
147
|
-
how
|
148
|
-
however
|
149
|
-
hundred
|
150
|
-
ie
|
151
|
-
if
|
152
|
-
in
|
153
|
-
inc
|
154
|
-
indeed
|
155
|
-
interest
|
156
|
-
into
|
157
|
-
is
|
158
|
-
it
|
159
|
-
its
|
160
|
-
itself
|
161
|
-
keep
|
162
|
-
last
|
163
|
-
latter
|
164
|
-
latterly
|
165
|
-
least
|
166
|
-
less
|
167
|
-
ltd
|
168
|
-
made
|
169
|
-
many
|
170
|
-
may
|
171
|
-
me
|
172
|
-
meanwhile
|
173
|
-
might
|
174
|
-
mill
|
175
|
-
mine
|
176
|
-
more
|
177
|
-
moreover
|
178
|
-
most
|
179
|
-
mostly
|
180
|
-
move
|
181
|
-
much
|
182
|
-
must
|
183
|
-
my
|
184
|
-
myself
|
185
|
-
name
|
186
|
-
namely
|
187
|
-
neither
|
188
|
-
never
|
189
|
-
nevertheless
|
190
|
-
next
|
191
|
-
nine
|
192
|
-
no
|
193
|
-
nobody
|
194
|
-
none
|
195
|
-
noone
|
196
|
-
nor
|
197
|
-
not
|
198
|
-
nothing
|
199
|
-
now
|
200
|
-
nowhere
|
201
|
-
of
|
202
|
-
off
|
203
|
-
often
|
204
|
-
on
|
205
|
-
once
|
206
|
-
one
|
207
|
-
only
|
208
|
-
onto
|
209
|
-
or
|
210
|
-
other
|
211
|
-
others
|
212
|
-
otherwise
|
213
|
-
our
|
214
|
-
ours
|
215
|
-
ourselves
|
216
|
-
out
|
217
|
-
over
|
218
|
-
own
|
219
|
-
part
|
220
|
-
per
|
221
|
-
perhaps
|
222
|
-
please
|
223
|
-
put
|
224
|
-
rather
|
225
|
-
re
|
226
|
-
same
|
227
|
-
see
|
228
|
-
seem
|
229
|
-
seemed
|
230
|
-
seeming
|
231
|
-
seems
|
232
|
-
serious
|
233
|
-
several
|
234
|
-
she
|
235
|
-
should
|
236
|
-
show
|
237
|
-
side
|
238
|
-
since
|
239
|
-
sincere
|
240
|
-
six
|
241
|
-
sixty
|
242
|
-
so
|
243
|
-
some
|
244
|
-
somehow
|
245
|
-
someone
|
246
|
-
something
|
247
|
-
sometime
|
248
|
-
sometimes
|
249
|
-
somewhere
|
250
|
-
still
|
251
|
-
such
|
252
|
-
system
|
253
|
-
take
|
254
|
-
ten
|
255
|
-
than
|
256
|
-
that
|
257
|
-
the
|
258
|
-
their
|
259
|
-
them
|
260
|
-
themselves
|
261
|
-
then
|
262
|
-
thence
|
263
|
-
there
|
264
|
-
thereafter
|
265
|
-
thereby
|
266
|
-
therefore
|
267
|
-
therein
|
268
|
-
thereupon
|
269
|
-
these
|
270
|
-
they
|
271
|
-
thickv
|
272
|
-
thin
|
273
|
-
third
|
274
|
-
this
|
275
|
-
those
|
276
|
-
though
|
277
|
-
three
|
278
|
-
through
|
279
|
-
throughout
|
280
|
-
thru
|
281
|
-
thus
|
282
|
-
to
|
283
|
-
together
|
284
|
-
too
|
285
|
-
top
|
286
|
-
toward
|
287
|
-
towards
|
288
|
-
twelve
|
289
|
-
twenty
|
290
|
-
two
|
291
|
-
un
|
292
|
-
under
|
293
|
-
until
|
294
|
-
up
|
295
|
-
upon
|
296
|
-
us
|
297
|
-
very
|
298
|
-
via
|
299
|
-
was
|
300
|
-
we
|
301
|
-
well
|
302
|
-
were
|
303
|
-
what
|
304
|
-
whatever
|
305
|
-
when
|
306
|
-
whence
|
307
|
-
whenever
|
308
|
-
where
|
309
|
-
whereafter
|
310
|
-
whereas
|
311
|
-
whereby
|
312
|
-
wherein
|
313
|
-
whereupon
|
314
|
-
wherever
|
315
|
-
whether
|
316
|
-
which
|
317
|
-
while
|
318
|
-
whither
|
319
|
-
who
|
320
|
-
whoever
|
321
|
-
whole
|
322
|
-
whom
|
323
|
-
whose
|
324
|
-
why
|
325
|
-
will
|
326
|
-
with
|
327
|
-
within
|
328
|
-
without
|
329
|
-
would
|
330
|
-
yet
|
331
|
-
you
|
332
|
-
your
|
333
|
-
yours
|
334
|
-
yourself
|
335
|
-
yourselves
|
336
|
-
])
|
16
|
+
STOP_WORDS = Set.new(YAML.load_file(File.expand_path('stopwords.yml', __dir__)))
|
337
17
|
|
338
18
|
# Perform the filter
|
339
19
|
# @param tokens [Array<String>]
|