text_rank 1.2.3 → 1.2.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.rubocop.yml +7 -0
- data/bin/console +3 -3
- data/lib/page_rank.rb +2 -0
- data/lib/page_rank/base.rb +9 -8
- data/lib/page_rank/dense.rb +2 -1
- data/lib/page_rank/sparse.rb +6 -7
- data/lib/text_rank.rb +11 -8
- data/lib/text_rank/char_filter.rb +1 -1
- data/lib/text_rank/char_filter/ascii_folding.rb +5 -1
- data/lib/text_rank/char_filter/strip_possessive.rb +2 -2
- data/lib/text_rank/char_filter/undo_contractions.rb +1 -137
- data/lib/text_rank/char_filter/undo_contractions.yml +135 -0
- data/lib/text_rank/fingerprint.rb +10 -18
- data/lib/text_rank/fingerprint_overlap.rb +55 -0
- data/lib/text_rank/graph_strategy/coocurrence.rb +15 -6
- data/lib/text_rank/keyword_extractor.rb +19 -21
- data/lib/text_rank/rank_filter/collapse_adjacent.rb +48 -25
- data/lib/text_rank/rank_filter/normalize_probability.rb +2 -1
- data/lib/text_rank/rank_filter/normalize_unit_vector.rb +2 -1
- data/lib/text_rank/token_filter/part_of_speech.rb +0 -1
- data/lib/text_rank/token_filter/stopwords.rb +1 -321
- data/lib/text_rank/token_filter/stopwords.yml +317 -0
- data/lib/text_rank/tokenizer.rb +1 -1
- data/lib/text_rank/tokenizer/money.rb +11 -6
- data/lib/text_rank/tokenizer/number.rb +4 -3
- data/lib/text_rank/tokenizer/punctuation.rb +4 -1
- data/lib/text_rank/tokenizer/url.rb +3 -0
- data/lib/text_rank/tokenizer/whitespace.rb +4 -1
- data/lib/text_rank/tokenizer/word.rb +5 -2
- data/lib/text_rank/version.rb +3 -1
- metadata +4 -1
data/lib/text_rank/fingerprint.rb
@@ -1,5 +1,3 @@
-require 'set'
-
 module TextRank
   ##
   # Class used to compare documents according to TextRank. A "fingerprint"
@@ -61,28 +59,22 @@ module TextRank
     # Calculates the "similarity" between this fingerprint and another
     # @param {Fingerprint} A second fingerprint to compare
     # @return [Number] A number between 0.0 (different) and 1.0 (same)
-    def similarity(fingerprint)
-      return 1.0 if values == fingerprint.values # Short-circuit for efficiency
-
-      sim = 0
-      s1 = Set.new
-      s2 = Set.new
+    def similarity(other)
+      return 1.0 if values == other.values # Short-circuit for efficiency
 
-      size.times.reduce(0) do |sum, i|
-        v1 = values[i]
-        v2 = fingerprint.values[i]
-        if v1 == v2
-          sim += 1
-        else
-          s1.delete?(v2) ? (sim += 1) : (s2 << v2)
-          s2.delete?(v1) ? (sim += 1) : (s1 << v1)
-        end
-        sum + sim * linear_transform[i]
+      sum = 0
+      overlap(other).each_with_index do |overlap_value, i|
+        sum += overlap_value * linear_transform[i]
       end
+      sum
     end
 
     private
 
+    def overlap(other)
+      FingerprintOverlap.new(values, other.values).overlap
+    end
+
     def linear_transform
       @linear_transform ||= size.times.map do |i|
         1.0 / Math.log(i + 2) / size.to_f / norm_factor
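An aside on the refactor above: `similarity` is now a weighted sum of per-prefix overlap counts. A hedged sketch of the arithmetic with made-up values; the real `norm_factor` is defined elsewhere in fingerprint.rb, so the normalization below is only an assumption, chosen so that identical fingerprints score exactly 1.0:

    # Illustrative Ruby, not gem source.
    overlap = [0, 1, 2, 3] # per-prefix overlap counts (see FingerprintOverlap below)
    size = overlap.size

    # Weights decay logarithmically: agreement among top-ranked keywords counts most.
    norm_factor = size.times.reduce(0.0) { |s, i| s + (i + 1) / Math.log(i + 2) / size }
    linear_transform = size.times.map { |i| 1.0 / Math.log(i + 2) / size / norm_factor }

    overlap.each_with_index.reduce(0.0) { |sum, (o, i)| sum + o * linear_transform[i] }
    # => ~0.53 here; identical fingerprints (overlap == [1, 2, 3, 4]) give exactly 1.0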
data/lib/text_rank/fingerprint_overlap.rb
@@ -0,0 +1,55 @@
+module TextRank
+  ##
+  # Determines "overlap" between two fingerprints at each N prefixes
+  #
+  # For example,
+  #
+  #   FingerprintOverlap.new(
+  #     %w[a b c d],
+  #     %w[b e a c],
+  #   ).overlap
+  #
+  #   => [
+  #     0, # [a] & [b] have no overlap
+  #     1, # [a b] & [b e] have one overlap: b
+  #     2, # [a b c] & [b e a] have two overlap: a & b
+  #     3, # [a b c d] & [b e a c] have three overlap: a, b, & c
+  #   ]
+  ##
+  class FingerprintOverlap
+
+    attr_reader :overlap
+
+    def initialize(values1, values2)
+      raise ArgumentError, 'Value size mismatch' if values1.size != values2.size
+
+      @encountered1 = Set.new
+      @encountered2 = Set.new
+      @overlap_count = 0
+
+      @overlap = determine_overlap(values1, values2)
+    end
+
+    private
+
+    def determine_overlap(values1, values2)
+      values1.zip(values2).map do |v1, v2|
+        encounter(v1, v2)
+        @overlap_count
+      end
+    end
+
+    # This algorithm is a little more complex than could be represented in Ruby,
+    # but we want to keep it as performant as possible.
+    def encounter(value1, value2)
+      if value1 == value2
+        @overlap_count += 1
+      else
+        # Delete from the set in case an element appears more than once
+        @encountered1.delete?(value2) ? (@overlap_count += 1) : (@encountered2 << value2)
+        @encountered2.delete?(value1) ? (@overlap_count += 1) : (@encountered1 << value1)
+      end
+    end
+
+  end
+end
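One behavior of the class above worth calling out: the `delete?` bookkeeping treats each prefix as a multiset, so a duplicated value only counts as often as it appears on both sides. A small hedged example (values invented):

    require 'set'

    TextRank::FingerprintOverlap.new(
      %w[a a b],
      %w[b a a],
    ).overlap
    # => [0, 1, 3]
    # [a] & [b] share nothing; [a a] & [b a] share one 'a';
    # [a a b] & [b a a] share all three elements.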
data/lib/text_rank/graph_strategy/coocurrence.rb
@@ -61,18 +61,27 @@ module TextRank
       # return [nil]
       def build_graph(tokens, graph)
         ngram_window = @ngram_size * 2 + 1
-        tokens.each_with_index do |token_i, i|
+        tokens.size.times do |i|
           ngram_window.times do |j|
-            next if j == @ngram_size || i + j < @ngram_size
-            token_j = tokens[i - @ngram_size + j]
-            if token_j
-              graph.add(token_i, token_j, weight: 1.0 / (j - @ngram_size).abs)
-            end
+            consider_ngram_window(tokens, graph, i, j)
           end
         end
         nil
       end
 
+      private
+
+      def consider_ngram_window(tokens, graph, i, j)
+        return if j == @ngram_size || i + j < @ngram_size
+
+        token_i = tokens[i]
+        token_j = tokens[i - @ngram_size + j]
+
+        if token_j
+          graph.add(token_i, token_j, weight: 1.0 / (j - @ngram_size).abs)
+        end
+      end
+
     end
   end
 end
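To make the window arithmetic above concrete: every token is linked to the tokens up to `@ngram_size` positions away on either side, with edge weight falling off as the reciprocal of the distance. A hedged standalone sketch with ngram_size = 2 (not gem source):

    ngram_size = 2
    ngram_window = ngram_size * 2 + 1 # 5: two neighbors on each side, plus self

    ngram_window.times do |j|
      next if j == ngram_size # offset 0 is the token itself, skipped

      offset = j - ngram_size
      puts format('offset %+d -> weight %.1f', offset, 1.0 / offset.abs)
    end
    # offset -2 -> weight 0.5
    # offset -1 -> weight 1.0
    # offset +1 -> weight 1.0
    # offset +2 -> weight 0.5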
data/lib/text_rank/keyword_extractor.rb
@@ -13,9 +13,9 @@ module TextRank
     # @return [KeywordExtractor]
     def self.basic(**options)
       new(**{
-        char_filters: [:AsciiFolding, :Lowercase],
-        tokenizers: [:Word],
-        token_filters: [:Stopwords, :MinLength],
+        char_filters: %i[AsciiFolding Lowercase],
+        tokenizers: %i[Word],
+        token_filters: %i[Stopwords MinLength],
         graph_strategy: :Coocurrence,
       }.merge(options))
     end
@@ -25,11 +25,11 @@ module TextRank
     # @return [KeywordExtractor]
     def self.advanced(**options)
       new(**{
-        char_filters: [:AsciiFolding, :Lowercase, :StripHtml, :StripEmail, :UndoContractions, :StripPossessive],
-        tokenizers: [:Url, :Money, :Number, :Word, :Punctuation],
-        token_filters: [:PartOfSpeech, :Stopwords, :MinLength],
+        char_filters: %i[AsciiFolding Lowercase StripHtml StripEmail UndoContractions StripPossessive],
+        tokenizers: %i[Url Money Number Word Punctuation],
+        token_filters: %i[PartOfSpeech Stopwords MinLength],
         graph_strategy: :Coocurrence,
-        rank_filters: [:CollapseAdjacent, :NormalizeUnitVector, :SortByValue],
+        rank_filters: %i[CollapseAdjacent NormalizeUnitVector SortByValue],
       }.merge(options))
     end
 
@@ -41,14 +41,14 @@ module TextRank
     # @option options [Array<Class, Symbol, #filter!>] :rank_filters A list of filters to be applied to the keyword ranks after keyword extraction
     def initialize(**options)
       @page_rank_options = {
-        strategy:  options[:strategy] || :sparse,
-        damping:   options[:damping],
+        strategy: options[:strategy] || :sparse,
+        damping: options[:damping],
         tolerance: options[:tolerance],
       }
-      @char_filters   = options[:char_filters] || []
-      @tokenizers     = options[:tokenizers] || [Tokenizer::Word]
-      @token_filters  = options[:token_filters] || []
-      @rank_filters   = options[:rank_filters] || []
+      @char_filters = options[:char_filters] || []
+      @tokenizers = options[:tokenizers] || [Tokenizer::Word]
+      @token_filters = options[:token_filters] || []
+      @rank_filters = options[:rank_filters] || []
       @graph_strategy = options[:graph_strategy] || GraphStrategy::Coocurrence
     end
 
@@ -73,9 +73,7 @@ module TextRank
     # Sets the graph strategy for producing a graph from tokens
     # @param strategy [Class, Symbol, #build_graph] Strategy for producing a graph from tokens
     # @return [Class, Symbol, #build_graph]
-    def graph_strategy=(strategy)
-      @graph_strategy = strategy
-    end
+    attr_writer :graph_strategy
 
     # Add a new TokenFilter for processing tokens after tokenization
     # @param filter [Class, Symbol, #filter!] A filter to process tokens after tokenization
@@ -153,14 +151,14 @@ module TextRank
       array.insert(idx, value)
     end
 
-    def classify(c, context: self)
-      case c
+    def classify(clazz, context: self)
+      case clazz
       when Class
-        c.new
+        clazz.new
       when Symbol
-        context.const_get(c).new
+        context.const_get(clazz).new
       else
-        c
+        clazz
       end
     end
 
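Taken together, the hunks above mean each pipeline slot accepts a symbol (resolved with `const_get` under the appropriate namespace), a class (instantiated), or a prebuilt instance (passed through as-is). A hedged usage sketch; `KeywordExtractor#extract(text)` is assumed as the entry point and the token-to-score result shape is inferred from the surrounding code, with arbitrary sample text:

    extractor = TextRank::KeywordExtractor.basic(
      token_filters: %i[Stopwords MinLength], # symbols are classified via const_get
    )
    extractor.extract('The quick brown fox jumps over the lazy dog')
    # => hash mapping keyword tokens to their PageRank scores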
data/lib/text_rank/rank_filter/collapse_adjacent.rb
@@ -77,6 +77,7 @@ module TextRank
 
       class TokenCollapser
 
+        # rubocop:disable Metrics/ParameterLists
         def initialize(tokens:, text:, ranks_to_collapse: 10, max_tokens_to_combine: 2, ignore_case: true, delimiter: ' ', **_)
           @tokens = tokens
           @text = text
@@ -91,6 +92,7 @@ module TextRank
           @permutations_scanned = Hash.new(0.0) # Track how many occurrences of each permutation we found in the original text
           @combination_significance_threshold = 0.3 # The percent of occurrences of a combo of tokens to the occurrences of single tokens required to force collapsing
         end
+        # rubocop:enable Metrics/ParameterLists
 
         # :nodoc:
         def delimiter_re
@@ -104,23 +106,36 @@ module TextRank
           # single tokens from below the cut to above it. So we'll continue searching
           # until all of the top N final keywords (single or collapsed) have been
           # considered.
-          loop do
-            regexp_safe_tokens = @tokens.keys.select { |s| Regexp.escape(s) == s }
-            single_tokens_to_consider = regexp_safe_tokens.first(@ranks_to_collapse + @to_remove.size - @to_collapse.size) - @to_remove.to_a
-            scan_text_for_all_permutations_of(single_tokens_to_consider) or break
-            decide_what_to_collapse_and_what_to_remove
+          while collapse_attempt
+            # keep trying
           end
 
           # We now know what to collapse and what to remove, so we can start safely
           # modifying the tokens hash
+          apply_collapse
+        end
+
+        # :nodoc:
+        def collapse_attempt
+          regexp_safe_tokens = @tokens.keys.select { |s| Regexp.escape(s) == s }
+          single_tokens_to_consider = regexp_safe_tokens.first(@ranks_to_collapse + @to_remove.size - @to_collapse.size) - @to_remove.to_a
+          scan_text_for_all_permutations_of(single_tokens_to_consider) or return false
+          decide_what_to_collapse_and_what_to_remove
+          true
+        end
+
+        # :nodoc:
+        def apply_collapse
           @to_collapse.each do |perm|
             values = @tokens.values_at(*perm).compact
             # This might be empty if somehow the scanned permutation doesn't
             # exactly match one of the tokens (e.g. ASCII-folding gone awry).
             # The goal is to do the best we can, so if we can't find it, ignore.
             next if values.empty?
+
             @tokens[perm.join(@delimiter)] = values.reduce(:+) / values.size
           end
+
           @tokens.reject! do |k, _|
             @to_remove.include?(k)
           end || @tokens
@@ -136,16 +151,10 @@ module TextRank
        # tokenization (e.g. ASCII folding). That's okay. We're just making the best effort we can
        # to find what we can.
        def scan_text_for_all_permutations_of(single_tokens)
-         perms = []
          # NOTE that by reversing the order we craft the regex to prefer larger combinations over
          # smaller combinations (or singletons).
-         (1..@max_tokens_to_combine).to_a.reverse.each do |n|
-           single_tokens.permutation(n).each do |perm|
-             unless @permutations_scanned.key?(perm)
-               @permutations_scanned[perm] = 0
-               perms << perm
-             end
-           end
+         perms = (1..@max_tokens_to_combine).to_a.reverse.flat_map do |n|
+           scan_text_for_n_permutations_of(single_tokens, n)
          end
          scan_text_for(perms) do |s|
            s = s.downcase if @ignore_case
@@ -153,6 +162,15 @@ module TextRank
           end unless perms.empty?
         end
 
+        def scan_text_for_n_permutations_of(single_tokens, n)
+          single_tokens.permutation(n).map do |perm|
+            unless @permutations_scanned.key?(perm)
+              @permutations_scanned[perm] = 0
+              perm
+            end
+          end.compact
+        end
+
         # Because we're scanning the original text, we've lost all of the character filtering we did
         # prior to tokenization, but that's important because we need the original context to be more
         # choosy. Still, we need to know what delimiter goes between collapsed tokens (since it may
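The NOTE about preferring larger combinations is easiest to see by running the collection step on its own. A hedged sketch (tokens invented; the permutation order shown is MRI's):

    single_tokens = %w[new york city]
    max_tokens_to_combine = 2

    perms = (1..max_tokens_to_combine).to_a.reverse.flat_map do |n|
      single_tokens.permutation(n).to_a
    end

    perms.first(3) # => [["new", "york"], ["new", "city"], ["york", "new"]]
    perms.last(3)  # => [["new"], ["york"], ["city"]] -- singletons come last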
@@ -179,25 +197,30 @@ module TextRank
         # modifications to the original token list yet but just keep track of what we plan
         # to collapse/remove.
         def decide_what_to_collapse_and_what_to_remove
-          non_empty_ordered = @permutations_scanned.select do |k, v|
-            v > 0
-          end.sort_by do |k, v|
-            [-v, -k.size] # reverse order
-          end
-
           tokens_encountered = []
-          non_empty_ordered.each do |perm, perm_count|
+          permutations_to_consider_collapsing.each do |perm, perm_count|
             if perm.size > 1
-              singles_to_remove = perm - tokens_encountered
-              if !singles_to_remove.empty? || combination_significant?(perm, perm_count)
-                @to_collapse << perm if perm.size > 1
-                @to_remove |= singles_to_remove
-              end
+              decide_to_collapse_or_remove(perm, perm_count, singles_to_remove: perm - tokens_encountered)
             end
             tokens_encountered += perm
           end
         end
 
+        def permutations_to_consider_collapsing
+          @permutations_scanned.select do |_k, v|
+            v.positive?
+          end.sort_by do |k, v|
+            [-v, -k.size] # reverse order
+          end
+        end
+
+        def decide_to_collapse_or_remove(perm, perm_count, singles_to_remove:)
+          if !singles_to_remove.empty? || combination_significant?(perm, perm_count)
+            @to_collapse << perm if perm.size > 1
+            @to_remove |= singles_to_remove
+          end
+        end
+
         # Even if we encounter a potential collapsed key which occurs less often than the single tokens that make it up,
         # we still want to add the collapsed key if it shows up "enough" times.
         def combination_significant?(perm, perm_count)
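A hedged illustration of `permutations_to_consider_collapsing` above: permutations never found in the text are dropped, and the rest are ordered by descending count, ties broken in favor of longer permutations (counts invented):

    permutations_scanned = {
      %w[bank]            => 1.0,
      %w[of america]      => 2.0,
      %w[bank of america] => 2.0,
      %w[vice]            => 0.0, # never seen; filtered out by v.positive?
    }

    permutations_scanned.select { |_k, v| v.positive? }
                        .sort_by { |k, v| [-v, -k.size] }
                        .map(&:first)
    # => [["bank", "of", "america"], ["of", "america"], ["bank"]]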
data/lib/text_rank/rank_filter/normalize_unit_vector.rb
@@ -45,8 +45,9 @@ module TextRank
       # @return [Hash<String, Float>]
       def filter!(ranks, **_)
         return if ranks.empty?
+
         total = Math.sqrt(ranks.values.map { |v| v * v }.reduce(:+))
-        Hash[ranks.map { |k, v| [k, v / total] }]
+        ranks.transform_values { |v| v / total }
       end
 
     end
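The filter above divides every rank by the Euclidean norm of the rank vector, leaving a vector of unit length. A worked example with invented ranks:

    ranks = { 'alpha' => 3.0, 'beta' => 4.0 }

    total = Math.sqrt(ranks.values.map { |v| v * v }.reduce(:+)) # => 5.0
    ranks.transform_values { |v| v / total }
    # => {"alpha"=>0.6, "beta"=>0.8}, and 0.6**2 + 0.8**2 == 1.0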
data/lib/text_rank/token_filter/stopwords.rb
@@ -1,5 +1,3 @@
-require 'set'
-
 module TextRank
   module TokenFilter
     ##
@@ -15,325 +13,7 @@ module TextRank
     class Stopwords
 
       # Default English stop-word list.
-      STOP_WORDS = Set.new(%w[
-        a
-        about
-        above
-        across
-        after
-        afterwards
-        again
-        against
-        all
-        almost
-        alone
-        along
-        already
-        also
-        although
-        always
-        am
-        among
-        amongst
-        amoungst
-        amount
-        an
-        and
-        another
-        any
-        anyhow
-        anyone
-        anything
-        anyway
-        anywhere
-        are
-        around
-        as
-        at
-        back
-        be
-        became
-        because
-        become
-        becomes
-        becoming
-        been
-        before
-        beforehand
-        behind
-        being
-        below
-        beside
-        besides
-        between
-        beyond
-        bill
-        both
-        bottom
-        but
-        by
-        call
-        can
-        cannot
-        cant
-        co
-        con
-        could
-        couldnt
-        cry
-        de
-        describe
-        detail
-        do
-        done
-        down
-        due
-        during
-        each
-        eg
-        eight
-        either
-        eleven
-        else
-        elsewhere
-        empty
-        enough
-        etc
-        even
-        ever
-        every
-        everyone
-        everything
-        everywhere
-        except
-        few
-        fifteen
-        fify
-        fill
-        find
-        fire
-        first
-        five
-        for
-        former
-        formerly
-        forty
-        found
-        four
-        from
-        front
-        full
-        further
-        get
-        give
-        go
-        had
-        has
-        hasnt
-        have
-        he
-        hence
-        her
-        here
-        hereafter
-        hereby
-        herein
-        hereupon
-        hers
-        herself
-        him
-        himself
-        his
-        how
-        however
-        hundred
-        ie
-        if
-        in
-        inc
-        indeed
-        interest
-        into
-        is
-        it
-        its
-        itself
-        keep
-        last
-        latter
-        latterly
-        least
-        less
-        ltd
-        made
-        many
-        may
-        me
-        meanwhile
-        might
-        mill
-        mine
-        more
-        moreover
-        most
-        mostly
-        move
-        much
-        must
-        my
-        myself
-        name
-        namely
-        neither
-        never
-        nevertheless
-        next
-        nine
-        no
-        nobody
-        none
-        noone
-        nor
-        not
-        nothing
-        now
-        nowhere
-        of
-        off
-        often
-        on
-        once
-        one
-        only
-        onto
-        or
-        other
-        others
-        otherwise
-        our
-        ours
-        ourselves
-        out
-        over
-        own
-        part
-        per
-        perhaps
-        please
-        put
-        rather
-        re
-        same
-        see
-        seem
-        seemed
-        seeming
-        seems
-        serious
-        several
-        she
-        should
-        show
-        side
-        since
-        sincere
-        six
-        sixty
-        so
-        some
-        somehow
-        someone
-        something
-        sometime
-        sometimes
-        somewhere
-        still
-        such
-        system
-        take
-        ten
-        than
-        that
-        the
-        their
-        them
-        themselves
-        then
-        thence
-        there
-        thereafter
-        thereby
-        therefore
-        therein
-        thereupon
-        these
-        they
-        thickv
-        thin
-        third
-        this
-        those
-        though
-        three
-        through
-        throughout
-        thru
-        thus
-        to
-        together
-        too
-        top
-        toward
-        towards
-        twelve
-        twenty
-        two
-        un
-        under
-        until
-        up
-        upon
-        us
-        very
-        via
-        was
-        we
-        well
-        were
-        what
-        whatever
-        when
-        whence
-        whenever
-        where
-        whereafter
-        whereas
-        whereby
-        wherein
-        whereupon
-        wherever
-        whether
-        which
-        while
-        whither
-        who
-        whoever
-        whole
-        whom
-        whose
-        why
-        will
-        with
-        within
-        without
-        would
-        yet
-        you
-        your
-        yours
-        yourself
-        yourselves
-      ])
+      STOP_WORDS = Set.new(YAML.load_file(File.expand_path('stopwords.yml', __dir__)))
 
       # Perform the filter
       # @param tokens [Array<String>]
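Net effect of the stopwords hunks: the 317-entry word list moves out of the Ruby source into stopwords.yml, and the constant is rebuilt from that file at load time. A hedged standalone sketch of the equivalent behavior (the YAML file is assumed to be a flat array of strings):

    require 'set'
    require 'yaml'

    # stopwords.yml, abridged:
    #   - a
    #   - about
    #   - above
    words = YAML.load_file(File.expand_path('stopwords.yml', __dir__))
    STOP_WORDS = Set.new(words)

    STOP_WORDS.include?('about') # => true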