text_rank 1.2.3 → 1.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43)
  1. checksums.yaml +4 -4
  2. data/.codeclimate.yml +1 -1
  3. data/.gitignore +4 -0
  4. data/.rubocop.yml +7 -0
  5. data/.ruby-version +1 -1
  6. data/.travis.yml +1 -0
  7. data/Rakefile +5 -0
  8. data/bin/console +3 -3
  9. data/ext/text_rank/extconf.rb +3 -0
  10. data/ext/text_rank/page_rank_sparse_native.c +300 -0
  11. data/ext/text_rank/page_rank_sparse_native.h +93 -0
  12. data/ext/text_rank/text_rank.c +5 -0
  13. data/lib/page_rank/base.rb +12 -9
  14. data/lib/page_rank/dense.rb +3 -2
  15. data/lib/page_rank/sparse.rb +6 -7
  16. data/lib/page_rank/sparse_native.rb +21 -0
  17. data/lib/page_rank.rb +7 -4
  18. data/lib/text_rank/char_filter/ascii_folding.rb +5 -1
  19. data/lib/text_rank/char_filter/strip_possessive.rb +2 -2
  20. data/lib/text_rank/char_filter/undo_contractions.rb +1 -137
  21. data/lib/text_rank/char_filter/undo_contractions.yml +135 -0
  22. data/lib/text_rank/char_filter.rb +1 -1
  23. data/lib/text_rank/fingerprint.rb +10 -18
  24. data/lib/text_rank/fingerprint_overlap.rb +55 -0
  25. data/lib/text_rank/graph_strategy/coocurrence.rb +15 -6
  26. data/lib/text_rank/keyword_extractor.rb +32 -25
  27. data/lib/text_rank/rank_filter/collapse_adjacent.rb +48 -25
  28. data/lib/text_rank/rank_filter/normalize_probability.rb +2 -1
  29. data/lib/text_rank/rank_filter/normalize_unit_vector.rb +2 -1
  30. data/lib/text_rank/token_filter/part_of_speech.rb +0 -1
  31. data/lib/text_rank/token_filter/stopwords.rb +1 -321
  32. data/lib/text_rank/token_filter/stopwords.yml +317 -0
  33. data/lib/text_rank/tokenizer/money.rb +11 -6
  34. data/lib/text_rank/tokenizer/number.rb +4 -3
  35. data/lib/text_rank/tokenizer/punctuation.rb +4 -1
  36. data/lib/text_rank/tokenizer/url.rb +3 -0
  37. data/lib/text_rank/tokenizer/whitespace.rb +4 -1
  38. data/lib/text_rank/tokenizer/word.rb +5 -2
  39. data/lib/text_rank/tokenizer.rb +1 -1
  40. data/lib/text_rank/version.rb +3 -1
  41. data/lib/text_rank.rb +14 -9
  42. data/text_rank.gemspec +4 -1
  43. metadata +48 -12
@@ -77,6 +77,7 @@ module TextRank
77
77
 
78
78
  class TokenCollapser
79
79
 
80
+ # rubocop:disable Metrics/ParameterLists
80
81
  def initialize(tokens:, text:, ranks_to_collapse: 10, max_tokens_to_combine: 2, ignore_case: true, delimiter: ' ', **_)
81
82
  @tokens = tokens
82
83
  @text = text
@@ -91,6 +92,7 @@ module TextRank
91
92
  @permutations_scanned = Hash.new(0.0) # Track how many occurrences of each permutation we found in the original text
92
93
  @combination_significance_threshold = 0.3 # The percent of occurrences of a combo of tokens to the occurrences of single tokens required to force collapsing
93
94
  end
95
+ # rubocop:enable Metrics/ParameterLists
94
96
 
95
97
  # :nodoc:
96
98
  def delimiter_re
@@ -104,23 +106,36 @@ module TextRank
104
106
  # single tokens from below the cut to above it. So we'll continue searching
105
107
  # until all of the top N final keywords (single or collapsed) have been
106
108
  # considered.
107
- loop do
108
- regexp_safe_tokens = @tokens.keys.select { |s| Regexp.escape(s) == s }
109
- single_tokens_to_consider = regexp_safe_tokens.first(@ranks_to_collapse + @to_remove.size - @to_collapse.size) - @to_remove.to_a
110
- scan_text_for_all_permutations_of(single_tokens_to_consider) or break
111
- decide_what_to_collapse_and_what_to_remove
109
+ while collapse_attempt
110
+ # keep trying
112
111
  end
113
112
 
114
113
  # We now know what to collapse and what to remove, so we can start safely
115
114
  # modifying the tokens hash
115
+ apply_collapse
116
+ end
117
+
118
+ # :nodoc:
119
+ def collapse_attempt
120
+ regexp_safe_tokens = @tokens.keys.select { |s| Regexp.escape(s) == s }
121
+ single_tokens_to_consider = regexp_safe_tokens.first(@ranks_to_collapse + @to_remove.size - @to_collapse.size) - @to_remove.to_a
122
+ scan_text_for_all_permutations_of(single_tokens_to_consider) or return false
123
+ decide_what_to_collapse_and_what_to_remove
124
+ true
125
+ end
126
+
127
+ # :nodoc:
128
+ def apply_collapse
116
129
  @to_collapse.each do |perm|
117
130
  values = @tokens.values_at(*perm).compact
118
131
  # This might be empty if somehow the scanned permutation doesn't
119
132
  # exactly match one of the tokens (e.g. ASCII-folding gone awry).
120
133
  # The goal is to do the best we can, so if we can't find it, ignore.
121
134
  next if values.empty?
135
+
122
136
  @tokens[perm.join(@delimiter)] = values.reduce(:+) / values.size
123
137
  end
138
+
124
139
  @tokens.reject! do |k, _|
125
140
  @to_remove.include?(k)
126
141
  end || @tokens
@@ -136,16 +151,10 @@ module TextRank
136
151
  # tokenization (e.g. ASCII folding). That's okay. We're just making the best effort we can
137
152
  # to find what we can.
138
153
  def scan_text_for_all_permutations_of(single_tokens)
139
- perms = []
140
154
  # NOTE that by reversing the order we craft the regex to prefer larger combinations over
141
155
  # smaller combinations (or singletons).
142
- (1..@max_tokens_to_combine).to_a.reverse.map do |nn|
143
- single_tokens.permutation(nn).each do |perm|
144
- unless @permutations_scanned.key?(perm)
145
- @permutations_scanned[perm] = 0
146
- perms << perm
147
- end
148
- end
156
+ perms = (1..@max_tokens_to_combine).to_a.reverse.flat_map do |n|
157
+ scan_text_for_n_permutations_of(single_tokens, n)
149
158
  end
150
159
  scan_text_for(perms) do |s|
151
160
  s = s.downcase if @ignore_case
@@ -153,6 +162,15 @@ module TextRank
153
162
  end unless perms.empty?
154
163
  end
155
164
 
165
+ def scan_text_for_n_permutations_of(single_tokens, n)
166
+ single_tokens.permutation(n).map do |perm|
167
+ unless @permutations_scanned.key?(perm)
168
+ @permutations_scanned[perm] = 0
169
+ perm
170
+ end
171
+ end.compact
172
+ end
173
+
156
174
  # Because we're scanning the original text, we've lost all of the character filtering we did
157
175
  # prior to tokenization, but that's important because we need the original context to be more
158
176
  # choosy. Still, we need to know what delimiter goes between collapsed tokens (since it may
@@ -179,25 +197,30 @@ module TextRank
179
197
  # modifications to the original token list yet but just keep track of what we plan
180
198
  # to collapse/remove.
181
199
  def decide_what_to_collapse_and_what_to_remove
182
- non_empty_ordered = @permutations_scanned.select do |k, v|
183
- v > 0
184
- end.sort_by do |k, v|
185
- [-v, -k.size] # reverse order
186
- end
187
-
188
200
  tokens_encountered = []
189
- non_empty_ordered.each do |perm, perm_count|
201
+ permutations_to_consider_collapsing.each do |perm, perm_count|
190
202
  if perm.size > 1
191
- singles_to_remove = perm - tokens_encountered
192
- if !singles_to_remove.empty? || combination_significant?(perm, perm_count)
193
- @to_collapse << perm if perm.size > 1
194
- @to_remove |= singles_to_remove
195
- end
203
+ decide_to_collapse_or_remove(perm, perm_count, singles_to_remove: perm - tokens_encountered)
196
204
  end
197
205
  tokens_encountered += perm
198
206
  end
199
207
  end
200
208
 
209
+ def permutations_to_consider_collapsing
210
+ @permutations_scanned.select do |_k, v|
211
+ v.positive?
212
+ end.sort_by do |k, v|
213
+ [-v, -k.size] # reverse order
214
+ end
215
+ end
216
+
217
+ def decide_to_collapse_or_remove(perm, perm_count, singles_to_remove:)
218
+ if !singles_to_remove.empty? || combination_significant?(perm, perm_count)
219
+ @to_collapse << perm if perm.size > 1
220
+ @to_remove |= singles_to_remove
221
+ end
222
+ end
223
+
201
224
  # Even if we encounter a potential collapsed key which occurs less often than the single tokens that make it up,
202
225
  # we still want to add the collapsed key if it shows up "enough" times.
203
226
  def combination_significant?(perm, perm_count)
@@ -44,8 +44,9 @@ module TextRank
44
44
  # @return [Hash<String, Float>]
45
45
  def filter!(ranks, **_)
46
46
  return if ranks.empty?
47
+
47
48
  total = ranks.values.reduce(:+)
48
- Hash[ranks.map { |k, v| [k, v / total] }]
49
+ ranks.transform_values { |v| v / total }
49
50
  end
50
51
 
51
52
  end
@@ -45,8 +45,9 @@ module TextRank
45
45
  # @return [Hash<String, Float>]
46
46
  def filter!(ranks, **_)
47
47
  return if ranks.empty?
48
+
48
49
  total = Math.sqrt(ranks.values.map { |v| v * v }.reduce(:+))
49
- Hash[ranks.map { |k, v| [k, v / total] }]
50
+ ranks.transform_values { |v| v / total }
50
51
  end
51
52
 
52
53
  end
@@ -1,5 +1,4 @@
1
1
  require 'engtagger'
2
- require 'set'
3
2
 
4
3
  module TextRank
5
4
  module TokenFilter
@@ -1,5 +1,3 @@
1
- require 'set'
2
-
3
1
  module TextRank
4
2
  module TokenFilter
5
3
  ##
@@ -15,325 +13,7 @@ module TextRank
15
13
  class Stopwords
16
14
 
17
15
  # Default English stop-word list.
18
- STOP_WORDS = Set.new(%w[
19
- a
20
- about
21
- above
22
- across
23
- after
24
- afterwards
25
- again
26
- against
27
- all
28
- almost
29
- alone
30
- along
31
- already
32
- also
33
- although
34
- always
35
- am
36
- among
37
- amongst
38
- amoungst
39
- amount
40
- an
41
- and
42
- another
43
- any
44
- anyhow
45
- anyone
46
- anything
47
- anyway
48
- anywhere
49
- are
50
- around
51
- as
52
- at
53
- back
54
- be
55
- became
56
- because
57
- become
58
- becomes
59
- becoming
60
- been
61
- before
62
- beforehand
63
- behind
64
- being
65
- below
66
- beside
67
- besides
68
- between
69
- beyond
70
- bill
71
- both
72
- bottom
73
- but
74
- by
75
- call
76
- can
77
- cannot
78
- cant
79
- co
80
- con
81
- could
82
- couldnt
83
- cry
84
- de
85
- describe
86
- detail
87
- do
88
- done
89
- down
90
- due
91
- during
92
- each
93
- eg
94
- eight
95
- either
96
- eleven
97
- else
98
- elsewhere
99
- empty
100
- enough
101
- etc
102
- even
103
- ever
104
- every
105
- everyone
106
- everything
107
- everywhere
108
- except
109
- few
110
- fifteen
111
- fify
112
- fill
113
- find
114
- fire
115
- first
116
- five
117
- for
118
- former
119
- formerly
120
- forty
121
- found
122
- four
123
- from
124
- front
125
- full
126
- further
127
- get
128
- give
129
- go
130
- had
131
- has
132
- hasnt
133
- have
134
- he
135
- hence
136
- her
137
- here
138
- hereafter
139
- hereby
140
- herein
141
- hereupon
142
- hers
143
- herself
144
- him
145
- himself
146
- his
147
- how
148
- however
149
- hundred
150
- ie
151
- if
152
- in
153
- inc
154
- indeed
155
- interest
156
- into
157
- is
158
- it
159
- its
160
- itself
161
- keep
162
- last
163
- latter
164
- latterly
165
- least
166
- less
167
- ltd
168
- made
169
- many
170
- may
171
- me
172
- meanwhile
173
- might
174
- mill
175
- mine
176
- more
177
- moreover
178
- most
179
- mostly
180
- move
181
- much
182
- must
183
- my
184
- myself
185
- name
186
- namely
187
- neither
188
- never
189
- nevertheless
190
- next
191
- nine
192
- no
193
- nobody
194
- none
195
- noone
196
- nor
197
- not
198
- nothing
199
- now
200
- nowhere
201
- of
202
- off
203
- often
204
- on
205
- once
206
- one
207
- only
208
- onto
209
- or
210
- other
211
- others
212
- otherwise
213
- our
214
- ours
215
- ourselves
216
- out
217
- over
218
- own
219
- part
220
- per
221
- perhaps
222
- please
223
- put
224
- rather
225
- re
226
- same
227
- see
228
- seem
229
- seemed
230
- seeming
231
- seems
232
- serious
233
- several
234
- she
235
- should
236
- show
237
- side
238
- since
239
- sincere
240
- six
241
- sixty
242
- so
243
- some
244
- somehow
245
- someone
246
- something
247
- sometime
248
- sometimes
249
- somewhere
250
- still
251
- such
252
- system
253
- take
254
- ten
255
- than
256
- that
257
- the
258
- their
259
- them
260
- themselves
261
- then
262
- thence
263
- there
264
- thereafter
265
- thereby
266
- therefore
267
- therein
268
- thereupon
269
- these
270
- they
271
- thickv
272
- thin
273
- third
274
- this
275
- those
276
- though
277
- three
278
- through
279
- throughout
280
- thru
281
- thus
282
- to
283
- together
284
- too
285
- top
286
- toward
287
- towards
288
- twelve
289
- twenty
290
- two
291
- un
292
- under
293
- until
294
- up
295
- upon
296
- us
297
- very
298
- via
299
- was
300
- we
301
- well
302
- were
303
- what
304
- whatever
305
- when
306
- whence
307
- whenever
308
- where
309
- whereafter
310
- whereas
311
- whereby
312
- wherein
313
- whereupon
314
- wherever
315
- whether
316
- which
317
- while
318
- whither
319
- who
320
- whoever
321
- whole
322
- whom
323
- whose
324
- why
325
- will
326
- with
327
- within
328
- without
329
- would
330
- yet
331
- you
332
- your
333
- yours
334
- yourself
335
- yourselves
336
- ])
16
+ STOP_WORDS = Set.new(YAML.load_file(File.expand_path('stopwords.yml', __dir__)))
337
17
 
338
18
  # Perform the filter
339
19
  # @param tokens [Array<String>]