text_rank 1.2.0 → 1.2.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45)
  1. checksums.yaml +5 -5
  2. data/.codeclimate.yml +1 -6
  3. data/.gitignore +4 -0
  4. data/.rubocop.yml +60 -1075
  5. data/.ruby-version +1 -1
  6. data/.travis.yml +14 -5
  7. data/{LICENSE.txt → LICENSE} +0 -0
  8. data/README.md +2 -1
  9. data/Rakefile +5 -0
  10. data/bin/console +3 -3
  11. data/ext/text_rank/extconf.rb +3 -0
  12. data/ext/text_rank/page_rank_sparse_native.c +296 -0
  13. data/ext/text_rank/page_rank_sparse_native.h +93 -0
  14. data/ext/text_rank/text_rank.c +5 -0
  15. data/lib/page_rank.rb +7 -4
  16. data/lib/page_rank/base.rb +12 -9
  17. data/lib/page_rank/dense.rb +3 -2
  18. data/lib/page_rank/sparse.rb +6 -7
  19. data/lib/page_rank/sparse_native.rb +21 -0
  20. data/lib/text_rank.rb +14 -9
  21. data/lib/text_rank/char_filter.rb +1 -1
  22. data/lib/text_rank/char_filter/ascii_folding.rb +5 -1
  23. data/lib/text_rank/char_filter/strip_possessive.rb +2 -2
  24. data/lib/text_rank/char_filter/undo_contractions.rb +1 -137
  25. data/lib/text_rank/char_filter/undo_contractions.yml +135 -0
  26. data/lib/text_rank/fingerprint.rb +10 -18
  27. data/lib/text_rank/fingerprint_overlap.rb +55 -0
  28. data/lib/text_rank/graph_strategy/coocurrence.rb +15 -6
  29. data/lib/text_rank/keyword_extractor.rb +32 -25
  30. data/lib/text_rank/rank_filter/collapse_adjacent.rb +53 -26
  31. data/lib/text_rank/rank_filter/normalize_probability.rb +2 -1
  32. data/lib/text_rank/rank_filter/normalize_unit_vector.rb +2 -1
  33. data/lib/text_rank/token_filter/part_of_speech.rb +0 -1
  34. data/lib/text_rank/token_filter/stopwords.rb +1 -321
  35. data/lib/text_rank/token_filter/stopwords.yml +317 -0
  36. data/lib/text_rank/tokenizer.rb +1 -1
  37. data/lib/text_rank/tokenizer/money.rb +11 -6
  38. data/lib/text_rank/tokenizer/number.rb +4 -3
  39. data/lib/text_rank/tokenizer/punctuation.rb +4 -1
  40. data/lib/text_rank/tokenizer/url.rb +3 -0
  41. data/lib/text_rank/tokenizer/whitespace.rb +4 -1
  42. data/lib/text_rank/tokenizer/word.rb +5 -2
  43. data/lib/text_rank/version.rb +3 -1
  44. data/text_rank.gemspec +12 -10
  45. metadata +69 -33
data/lib/text_rank/fingerprint.rb
@@ -1,5 +1,3 @@
- require 'set'
-
  module TextRank
    ##
    # Class used to compare documents according to TextRank. A "fingerprint"
@@ -61,28 +59,22 @@ module TextRank
      # Calculates the "similarity" between this fingerprint and another
      # @param {Fingerprint} A second fingerprint to compare
      # @return [Number] A number between 0.0 (different) and 1.0 (same)
-     def similarity(trf2)
-       return 1.0 if values == trf2.values
-
-       sim = 0
-       s1 = Set.new
-       s2 = Set.new
+     def similarity(other)
+       return 1.0 if values == other.values # Short-circuit for efficiency

-       [size, trf2.size].max.times.reduce(0) do |sum, i|
-         v1 = values[i]
-         v2 = trf2.values[i]
-         if v1 == v2
-           sim += 1
-         else
-           s1.delete?(v2) ? (sim += 1) : (s2 << v2)
-           s2.delete?(v1) ? (sim += 1) : (s1 << v1)
-         end
-         sum + sim * linear_transform[i]
+       sum = 0
+       overlap(other).each_with_index do |overlap_value, i|
+         sum += overlap_value * linear_transform[i]
        end
+       sum
      end

      private

+     def overlap(other)
+       FingerprintOverlap.new(values, other.values).overlap
+     end
+
      def linear_transform
        @linear_transform ||= size.times.map do |i|
          1.0 / Math.log(i + 2) / size.to_f / norm_factor
data/lib/text_rank/fingerprint_overlap.rb
@@ -0,0 +1,55 @@
+ module TextRank
+   ##
+   # Determines "overlap" between two fingerprints at each N prefixes
+   #
+   # For example,
+   #
+   #   FingerprintOverlap.new(
+   #     %w[a b c d],
+   #     %w[b e a c],
+   #   ).overlap
+   #
+   #   => [
+   #     0, # [a] & (b) have no overlap
+   #     1, # [a b] & [b e] have one overlap: b
+   #     2, # [a b c] & [b e a] have two overlap: a & b
+   #     3, # [a b c d] & [b e a c] have three overlap: a, b, & c
+   #   ]
+   ##
+   class FingerprintOverlap
+
+     attr_reader :overlap
+
+     def initialize(values1, values2)
+       raise ArgumentError, 'Value size mismatch' if values1.size != values2.size
+
+       @encountered1 = Set.new
+       @encountered2 = Set.new
+       @overlap_count = 0
+
+       @overlap = determine_overlap(values1, values2)
+     end
+
+     private
+
+     def determine_overlap(values1, values2)
+       values1.zip(values2).map do |v1, v2|
+         encounter(v1, v2)
+         @overlap_count
+       end
+     end
+
+     # This algorithm is a little more complex than could be represented in Ruby,
+     # but we want to keep it as performant as possible.
+     def encounter(value1, value2)
+       if value1 == value2
+         @overlap_count += 1
+       else
+         # Delete from the set in case an element appears more than once
+         @encountered1.delete?(value2) ? (@overlap_count += 1) : (@encountered2 << value2)
+         @encountered2.delete?(value1) ? (@overlap_count += 1) : (@encountered1 << value1)
+       end
+     end
+
+   end
+ end
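The incremental two-set trick in encounter computes, in a single O(n) pass, what a naive implementation would get by intersecting multisets at every prefix length. A brute-force reference of that equivalence, useful as a cross-check in tests (hypothetical helper, not part of the gem):

    # overlap at prefix length n == size of the multiset intersection of
    # the first n values of each fingerprint
    def brute_force_overlap(values1, values2)
      (1..values1.size).map do |n|
        remaining = values1.first(n)
        values2.first(n).count do |v|
          idx = remaining.index(v)
          # consume matched elements so duplicates are only counted once
          idx ? (remaining.delete_at(idx); true) : false
        end
      end
    end

    brute_force_overlap(%w[a b c d], %w[b e a c]) # => [0, 1, 2, 3]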
data/lib/text_rank/graph_strategy/coocurrence.rb
@@ -61,18 +61,27 @@ module TextRank
        # return [nil]
        def build_graph(tokens, graph)
          ngram_window = @ngram_size * 2 + 1
-         tokens.each_with_index do |token_i, i|
+         tokens.size.times do |i|
            ngram_window.times do |j|
-             next if j == @ngram_size || i + j < @ngram_size
-             token_j = tokens[i - @ngram_size + j]
-             if token_j
-               graph.add(token_i, token_j, weight: 1.0 / (j - @ngram_size).abs)
-             end
+             consider_ngram_window(tokens, graph, i, j)
            end
          end
          nil
        end

+       private
+
+       def consider_ngram_window(tokens, graph, i, j)
+         return if j == @ngram_size || i + j < @ngram_size
+
+         token_i = tokens[i]
+         token_j = tokens[i - @ngram_size + j]
+
+         if token_j
+           graph.add(token_i, token_j, weight: 1.0 / (j - @ngram_size).abs)
+         end
+       end
+
      end
    end
  end
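The extracted consider_ngram_window makes the window arithmetic easier to see: for the token at position i, offset j in a window of ngram_size * 2 + 1 positions addresses the neighbor at i - ngram_size + j, and the edge weight is the inverse of that distance. The i + j < @ngram_size guard matters because a negative Ruby array index would silently wrap around to the end of the token list. A quick illustration of the weights, assuming ngram_size = 3:

    ngram_size = 3
    window = ngram_size * 2 + 1
    weights = window.times.map do |j|
      next if j == ngram_size # a token is not linked to itself
      [j - ngram_size, 1.0 / (j - ngram_size).abs]
    end.compact
    # offset from current token => co-occurrence edge weight
    # => [[-3, 0.333], [-2, 0.5], [-1, 1.0], [1, 1.0], [2, 0.5], [3, 0.333]]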
data/lib/text_rank/keyword_extractor.rb
@@ -13,9 +13,9 @@ module TextRank
      # @return [KeywordExtractor]
      def self.basic(**options)
        new(**{
-         char_filters: [:AsciiFolding, :Lowercase],
-         tokenizers: [:Word],
-         token_filters: [:Stopwords, :MinLength],
+         char_filters: %i[AsciiFolding Lowercase],
+         tokenizers: %i[Word],
+         token_filters: %i[Stopwords MinLength],
          graph_strategy: :Coocurrence,
        }.merge(options))
      end
@@ -25,11 +25,11 @@ module TextRank
      # @return [KeywordExtractor]
      def self.advanced(**options)
        new(**{
-         char_filters: [:AsciiFolding, :Lowercase, :StripHtml, :StripEmail, :UndoContractions, :StripPossessive],
-         tokenizers: [:Url, :Money, :Number, :Word, :Punctuation],
-         token_filters: [:PartOfSpeech, :Stopwords, :MinLength],
+         char_filters: %i[AsciiFolding Lowercase StripHtml StripEmail UndoContractions StripPossessive],
+         tokenizers: %i[Url Money Number Word Punctuation],
+         token_filters: %i[PartOfSpeech Stopwords MinLength],
          graph_strategy: :Coocurrence,
-         rank_filters: [:CollapseAdjacent, :NormalizeUnitVector, :SortByValue],
+         rank_filters: %i[CollapseAdjacent NormalizeUnitVector SortByValue],
        }.merge(options))
      end

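The switch to %i[] literals is purely stylistic; each symbol still resolves to a class under the corresponding namespace (TextRank::CharFilter::AsciiFolding, TextRank::Tokenizer::Word, and so on). A hedged usage sketch of the two presets, with illustrative input text and an option override:

    require 'text_rank'

    basic    = TextRank::KeywordExtractor.basic
    advanced = TextRank::KeywordExtractor.advanced(damping: 0.85) # options merge over the preset

    advanced.extract('TextRank builds a graph of co-occurring words and ranks them.')
    # => Hash of keyword => rank, in descending rank order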
@@ -41,14 +41,14 @@ module TextRank
      # @option options [Array<Class, Symbol, #filter!>] :rank_filters A list of filters to be applied to the keyword ranks after keyword extraction
      def initialize(**options)
        @page_rank_options = {
-         strategy: options[:strategy] || :sparse,
-         damping: options[:damping],
+         strategy:  options[:strategy] || :sparse,
+         damping:   options[:damping],
          tolerance: options[:tolerance],
        }
-       @char_filters = options[:char_filters] || []
-       @tokenizers = options[:tokenizers] || [Tokenizer::Word]
-       @token_filters = options[:token_filters] || []
-       @rank_filters = options[:rank_filters] || []
+       @char_filters   = options[:char_filters] || []
+       @tokenizers     = options[:tokenizers] || [Tokenizer::Word]
+       @token_filters  = options[:token_filters] || []
+       @rank_filters   = options[:rank_filters] || []
        @graph_strategy = options[:graph_strategy] || GraphStrategy::Coocurrence
      end

@@ -73,9 +73,7 @@ module TextRank
      # Sets the graph strategy for producing a graph from tokens
      # @param strategy [Class, Symbol, #build_graph] Strategy for producing a graph from tokens
      # @return [Class, Symbol, #build_graph]
-     def graph_strategy=(strategy)
-       @graph_strategy = strategy
-     end
+     attr_writer :graph_strategy

      # Add a new TokenFilter for processing tokens after tokenization
      # @param filter [Class, Symbol, #filter!] A filter to process tokens after tokenization
@@ -105,14 +103,23 @@ module TextRank
      end

      # Filter & tokenize text, and return PageRank
-     # @param text [String] unfiltered text to be processed
+     # @param text [String,Array<String>] unfiltered text to be processed
      # @return [Hash<String, Float>] tokens and page ranks (in descending order)
      def extract(text, **options)
-       tokens = tokenize(text)
+       text = Array(text)
+       tokens_per_text = text.map do |t|
+         tokenize(t)
+       end
        graph = PageRank.new(**@page_rank_options)
-       classify(@graph_strategy, context: GraphStrategy).build_graph(tokens, graph)
+       strategy = classify(@graph_strategy, context: GraphStrategy)
+       tokens_per_text.each do |tokens|
+         strategy.build_graph(tokens, graph)
+       end
        ranks = graph.calculate(**options)
-       apply_rank_filters(ranks, tokens: tokens, original_text: text)
+       tokens_per_text.each_with_index do |tokens, i|
+         ranks = apply_rank_filters(ranks, tokens: tokens, original_text: text[i])
+       end
+       ranks
      end

      private
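The behavioral change in this hunk: extract now also accepts an array of texts, tokenizing each one separately but feeding them all into a single shared graph before ranking. A hedged sketch of the new call pattern (input texts are illustrative):

    extractor = TextRank::KeywordExtractor.basic

    # Single string, as before
    extractor.extract('The quick brown fox jumps over the lazy dog')

    # New: multiple documents contribute co-occurrence edges to one graph,
    # so the returned ranks reflect the whole collection
    extractor.extract([
      'Ruby implementation of TextRank',
      'TextRank is a graph-based ranking model for text processing',
    ])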
@@ -153,14 +160,14 @@ module TextRank
        array.insert(idx, value)
      end

-     def classify(c, context: self)
-       case c
+     def classify(clazz, context: self)
+       case clazz
        when Class
-         c.new
+         clazz.new
        when Symbol
-         context.const_get(c).new
+         context.const_get(clazz).new
        else
-         c
+         clazz
        end
      end

data/lib/text_rank/rank_filter/collapse_adjacent.rb
@@ -77,6 +77,7 @@ module TextRank

        class TokenCollapser

+         # rubocop:disable Metrics/ParameterLists
          def initialize(tokens:, text:, ranks_to_collapse: 10, max_tokens_to_combine: 2, ignore_case: true, delimiter: ' ', **_)
            @tokens = tokens
            @text = text
@@ -91,6 +92,7 @@ module TextRank
            @permutations_scanned = Hash.new(0.0) # Track how many occurrences of each permutation we found in the original text
            @combination_significance_threshold = 0.3 # The percent of occurrences of a combo of tokens to the occurrences of single tokens required to force collapsing
          end
+         # rubocop:enable Metrics/ParameterLists

          # :nodoc:
          def delimiter_re
@@ -104,19 +106,36 @@ module TextRank
            # single tokens from below the cut to above it. So we'll continue searching
            # until all of the top N final keywords (single or collapsed) have been
            # considered.
-           loop do
-             regexp_safe_tokens = @tokens.keys.select { |s| Regexp.escape(s) == s }
-             single_tokens_to_consider = regexp_safe_tokens.first(@ranks_to_collapse + @to_remove.size - @to_collapse.size) - @to_remove.to_a
-             scan_text_for_all_permutations_of(single_tokens_to_consider) or break
-             decide_what_to_collapse_and_what_to_remove
+           while collapse_attempt
+             # keep trying
            end

            # We now know what to collapse and what to remove, so we can start safely
            # modifying the tokens hash
+           apply_collapse
+         end
+
+         # :nodoc:
+         def collapse_attempt
+           regexp_safe_tokens = @tokens.keys.select { |s| Regexp.escape(s) == s }
+           single_tokens_to_consider = regexp_safe_tokens.first(@ranks_to_collapse + @to_remove.size - @to_collapse.size) - @to_remove.to_a
+           scan_text_for_all_permutations_of(single_tokens_to_consider) or return false
+           decide_what_to_collapse_and_what_to_remove
+           true
+         end
+
+         # :nodoc:
+         def apply_collapse
            @to_collapse.each do |perm|
-             values = @tokens.values_at(*perm)
+             values = @tokens.values_at(*perm).compact
+             # This might be empty if somehow the scanned permutation doesn't
+             # exactly match one of the tokens (e.g. ASCII-folding gone awry).
+             # The goal is to do the best we can, so if we can't find it, ignore.
+             next if values.empty?
+
              @tokens[perm.join(@delimiter)] = values.reduce(:+) / values.size
            end
+
            @tokens.reject! do |k, _|
              @to_remove.include?(k)
            end || @tokens
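apply_collapse gives each combined key the arithmetic mean of its constituents' ranks, then drops the singles marked for removal. Roughly, with made-up numbers:

    tokens = { 'new' => 0.30, 'york' => 0.25, 'city' => 0.11 }
    perm = %w[new york]
    values = tokens.values_at(*perm).compact   # => [0.30, 0.25]
    tokens[perm.join(' ')] = values.reduce(:+) / values.size
    tokens['new york']                         # => 0.275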
@@ -132,16 +151,10 @@ module TextRank
          # tokenization (e.g. ASCII folding). That's okay. We're just making the best effort we can
          # to find what we can.
          def scan_text_for_all_permutations_of(single_tokens)
-           perms = []
            # NOTE that by reversing the order we craft the regex to prefer larger combinations over
            # smaller combinations (or singletons).
-           (1..@max_tokens_to_combine).to_a.reverse.map do |nn|
-             single_tokens.permutation(nn).each do |perm|
-               unless @permutations_scanned.key?(perm)
-                 @permutations_scanned[perm] = 0
-                 perms << perm
-               end
-             end
+           perms = (1..@max_tokens_to_combine).to_a.reverse.flat_map do |n|
+             scan_text_for_n_permutations_of(single_tokens, n)
            end
            scan_text_for(perms) do |s|
              s = s.downcase if @ignore_case
@@ -149,6 +162,15 @@ module TextRank
            end unless perms.empty?
          end

+         def scan_text_for_n_permutations_of(single_tokens, n)
+           single_tokens.permutation(n).map do |perm|
+             unless @permutations_scanned.key?(perm)
+               @permutations_scanned[perm] = 0
+               perm
+             end
+           end.compact
+         end
+
          # Because we're scanning the original text, we've lost all of the character filtering we did
          # prior to tokenization, but that's important because we need the original context to be more
          # choosy. Still, we need to know what delimiter goes between collapsed tokens (since it may
@@ -175,25 +197,30 @@ module TextRank
          # modifications to the original token list yet but just keep track of what we plan
          # to collapse/remove.
          def decide_what_to_collapse_and_what_to_remove
-           non_empty_ordered = @permutations_scanned.select do |k, v|
-             v > 0
-           end.sort_by do |k, v|
-             [-v, -k.size] # reverse order
-           end
-
            tokens_encountered = []
-           non_empty_ordered.each do |perm, perm_count|
+           permutations_to_consider_collapsing.each do |perm, perm_count|
              if perm.size > 1
-               singles_to_remove = perm - tokens_encountered
-               if !singles_to_remove.empty? || combination_significant?(perm, perm_count)
-                 @to_collapse << perm if perm.size > 1
-                 @to_remove |= singles_to_remove
-               end
+               decide_to_collapse_or_remove(perm, perm_count, singles_to_remove: perm - tokens_encountered)
              end
              tokens_encountered += perm
            end
          end

+         def permutations_to_consider_collapsing
+           @permutations_scanned.select do |_k, v|
+             v.positive?
+           end.sort_by do |k, v|
+             [-v, -k.size] # reverse order
+           end
+         end
+
+         def decide_to_collapse_or_remove(perm, perm_count, singles_to_remove:)
+           if !singles_to_remove.empty? || combination_significant?(perm, perm_count)
+             @to_collapse << perm if perm.size > 1
+             @to_remove |= singles_to_remove
+           end
+         end
+
          # Even if we encounter a potential collapsed key which occurs less often than the single tokens that make it up,
          # we still want to add the collapsed key if it shows up "enough" times.
          def combination_significant?(perm, perm_count)
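permutations_to_consider_collapsing orders candidates by descending occurrence count, breaking ties in favor of longer permutations, so multi-word keys win when counts are equal. For example (counts are illustrative):

    scanned = { %w[city] => 4.0, %w[new york] => 3.0, %w[york] => 3.0, %w[empty combo] => 0.0 }
    scanned.select { |_k, v| v.positive? }
           .sort_by { |k, v| [-v, -k.size] }
           .map(&:first)
    # => [["city"], ["new", "york"], ["york"]]  (the zero-count permutation is dropped)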
data/lib/text_rank/rank_filter/normalize_probability.rb
@@ -44,8 +44,9 @@ module TextRank
      # @return [Hash<String, Float>]
      def filter!(ranks, **_)
        return if ranks.empty?
+
        total = ranks.values.reduce(:+)
-       Hash[ranks.map { |k, v| [k, v / total] }]
+       ranks.transform_values { |v| v / total }
      end

    end
data/lib/text_rank/rank_filter/normalize_unit_vector.rb
@@ -45,8 +45,9 @@ module TextRank
      # @return [Hash<String, Float>]
      def filter!(ranks, **_)
        return if ranks.empty?
+
        total = Math.sqrt(ranks.values.map { |v| v * v }.reduce(:+))
-       Hash[ranks.map { |k, v| [k, v / total] }]
+       ranks.transform_values { |v| v / total }
      end

    end
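Both filters replace the Hash[ranks.map ...] construction with transform_values, which keeps the keys and reads as what it is: value-wise normalization. The two norms side by side, with toy numbers:

    ranks = { 'alpha' => 3.0, 'beta' => 1.0 }

    # NormalizeProbability: divide by the L1 norm so values sum to 1.0
    total = ranks.values.reduce(:+)
    ranks.transform_values { |v| v / total }  # => {"alpha"=>0.75, "beta"=>0.25}

    # NormalizeUnitVector: divide by the L2 norm so the values form a unit vector
    norm = Math.sqrt(ranks.values.map { |v| v * v }.reduce(:+))
    ranks.transform_values { |v| v / norm }   # => {"alpha"=>0.949, "beta"=>0.316}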
data/lib/text_rank/token_filter/part_of_speech.rb
@@ -1,5 +1,4 @@
  require 'engtagger'
- require 'set'

  module TextRank
    module TokenFilter
data/lib/text_rank/token_filter/stopwords.rb
@@ -1,5 +1,3 @@
- require 'set'
-
  module TextRank
    module TokenFilter
      ##
@@ -15,325 +13,7 @@ module TextRank
      class Stopwords

        # Default English stop-word list.
-       STOP_WORDS = Set.new(%w[
-         a
-         about
-         above
-         across
-         after
-         afterwards
-         again
-         against
-         all
-         almost
-         alone
-         along
-         already
-         also
-         although
-         always
-         am
-         among
-         amongst
-         amoungst
-         amount
-         an
-         and
-         another
-         any
-         anyhow
-         anyone
-         anything
-         anyway
-         anywhere
-         are
-         around
-         as
-         at
-         back
-         be
-         became
-         because
-         become
-         becomes
-         becoming
-         been
-         before
-         beforehand
-         behind
-         being
-         below
-         beside
-         besides
-         between
-         beyond
-         bill
-         both
-         bottom
-         but
-         by
-         call
-         can
-         cannot
-         cant
-         co
-         con
-         could
-         couldnt
-         cry
-         de
-         describe
-         detail
-         do
-         done
-         down
-         due
-         during
-         each
-         eg
-         eight
-         either
-         eleven
-         else
-         elsewhere
-         empty
-         enough
-         etc
-         even
-         ever
-         every
-         everyone
-         everything
-         everywhere
-         except
-         few
-         fifteen
-         fify
-         fill
-         find
-         fire
-         first
-         five
-         for
-         former
-         formerly
-         forty
-         found
-         four
-         from
-         front
-         full
-         further
-         get
-         give
-         go
-         had
-         has
-         hasnt
-         have
-         he
-         hence
-         her
-         here
-         hereafter
-         hereby
-         herein
-         hereupon
-         hers
-         herself
-         him
-         himself
-         his
-         how
-         however
-         hundred
-         ie
-         if
-         in
-         inc
-         indeed
-         interest
-         into
-         is
-         it
-         its
-         itself
-         keep
-         last
-         latter
-         latterly
-         least
-         less
-         ltd
-         made
-         many
-         may
-         me
-         meanwhile
-         might
-         mill
-         mine
-         more
-         moreover
-         most
-         mostly
-         move
-         much
-         must
-         my
-         myself
-         name
-         namely
-         neither
-         never
-         nevertheless
-         next
-         nine
-         no
-         nobody
-         none
-         noone
-         nor
-         not
-         nothing
-         now
-         nowhere
-         of
-         off
-         often
-         on
-         once
-         one
-         only
-         onto
-         or
-         other
-         others
-         otherwise
-         our
-         ours
-         ourselves
-         out
-         over
-         own
-         part
-         per
-         perhaps
-         please
-         put
-         rather
-         re
-         same
-         see
-         seem
-         seemed
-         seeming
-         seems
-         serious
-         several
-         she
-         should
-         show
-         side
-         since
-         sincere
-         six
-         sixty
-         so
-         some
-         somehow
-         someone
-         something
-         sometime
-         sometimes
-         somewhere
-         still
-         such
-         system
-         take
-         ten
-         than
-         that
-         the
-         their
-         them
-         themselves
-         then
-         thence
-         there
-         thereafter
-         thereby
-         therefore
-         therein
-         thereupon
-         these
-         they
-         thickv
-         thin
-         third
-         this
-         those
-         though
-         three
-         through
-         throughout
-         thru
-         thus
-         to
-         together
-         too
-         top
-         toward
-         towards
-         twelve
-         twenty
-         two
-         un
-         under
-         until
-         up
-         upon
-         us
-         very
-         via
-         was
-         we
-         well
-         were
-         what
-         whatever
-         when
-         whence
-         whenever
-         where
-         whereafter
-         whereas
-         whereby
-         wherein
-         whereupon
-         wherever
-         whether
-         which
-         while
-         whither
-         who
-         whoever
-         whole
-         whom
-         whose
-         why
-         will
-         with
-         within
-         without
-         would
-         yet
-         you
-         your
-         yours
-         yourself
-         yourselves
-       ])
+       STOP_WORDS = Set.new(YAML.load_file(File.expand_path('stopwords.yml', __dir__)))

        # Perform the filter
        # @param tokens [Array<String>]
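The 317 words now live in stopwords.yml alongside the Ruby file (file 35 in the list above), the same extraction applied to undo_contractions.yml (files 24 and 25). Assuming stopwords.yml is a flat YAML sequence, the constant round-trips like this:

    require 'set'
    require 'yaml'

    # stopwords.yml is assumed to be a flat list, one word per entry:
    #   - a
    #   - about
    #   - ...
    STOP_WORDS = Set.new(YAML.load_file(File.expand_path('stopwords.yml', __dir__)))
    STOP_WORDS.include?('the') # => true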