text_rank 1.2.3 → 1.3.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (43)
  1. checksums.yaml +4 -4
  2. data/.codeclimate.yml +1 -1
  3. data/.gitignore +4 -0
  4. data/.rubocop.yml +7 -0
  5. data/.ruby-version +1 -1
  6. data/.travis.yml +1 -0
  7. data/Rakefile +5 -0
  8. data/bin/console +3 -3
  9. data/ext/text_rank/extconf.rb +3 -0
  10. data/ext/text_rank/page_rank_sparse_native.c +300 -0
  11. data/ext/text_rank/page_rank_sparse_native.h +93 -0
  12. data/ext/text_rank/text_rank.c +5 -0
  13. data/lib/page_rank/base.rb +12 -9
  14. data/lib/page_rank/dense.rb +3 -2
  15. data/lib/page_rank/sparse.rb +6 -7
  16. data/lib/page_rank/sparse_native.rb +21 -0
  17. data/lib/page_rank.rb +7 -4
  18. data/lib/text_rank/char_filter/ascii_folding.rb +5 -1
  19. data/lib/text_rank/char_filter/strip_possessive.rb +2 -2
  20. data/lib/text_rank/char_filter/undo_contractions.rb +1 -137
  21. data/lib/text_rank/char_filter/undo_contractions.yml +135 -0
  22. data/lib/text_rank/char_filter.rb +1 -1
  23. data/lib/text_rank/fingerprint.rb +10 -18
  24. data/lib/text_rank/fingerprint_overlap.rb +55 -0
  25. data/lib/text_rank/graph_strategy/coocurrence.rb +15 -6
  26. data/lib/text_rank/keyword_extractor.rb +32 -25
  27. data/lib/text_rank/rank_filter/collapse_adjacent.rb +48 -25
  28. data/lib/text_rank/rank_filter/normalize_probability.rb +2 -1
  29. data/lib/text_rank/rank_filter/normalize_unit_vector.rb +2 -1
  30. data/lib/text_rank/token_filter/part_of_speech.rb +0 -1
  31. data/lib/text_rank/token_filter/stopwords.rb +1 -321
  32. data/lib/text_rank/token_filter/stopwords.yml +317 -0
  33. data/lib/text_rank/tokenizer/money.rb +11 -6
  34. data/lib/text_rank/tokenizer/number.rb +4 -3
  35. data/lib/text_rank/tokenizer/punctuation.rb +4 -1
  36. data/lib/text_rank/tokenizer/url.rb +3 -0
  37. data/lib/text_rank/tokenizer/whitespace.rb +4 -1
  38. data/lib/text_rank/tokenizer/word.rb +5 -2
  39. data/lib/text_rank/tokenizer.rb +1 -1
  40. data/lib/text_rank/version.rb +3 -1
  41. data/lib/text_rank.rb +14 -9
  42. data/text_rank.gemspec +4 -1
  43. metadata +48 -12
@@ -77,6 +77,7 @@ module TextRank
77
77
 
78
78
  class TokenCollapser
79
79
 
80
+ # rubocop:disable Metrics/ParameterLists
80
81
  def initialize(tokens:, text:, ranks_to_collapse: 10, max_tokens_to_combine: 2, ignore_case: true, delimiter: ' ', **_)
81
82
  @tokens = tokens
82
83
  @text = text
@@ -91,6 +92,7 @@ module TextRank
91
92
  @permutations_scanned = Hash.new(0.0) # Track how many occurrences of each permutation we found in the original text
92
93
  @combination_significance_threshold = 0.3 # The percent of occurrences of a combo of tokens to the occurrences of single tokens required to force collapsing
93
94
  end
95
+ # rubocop:enable Metrics/ParameterLists
94
96
 
95
97
  # :nodoc:
96
98
  def delimiter_re
@@ -104,23 +106,36 @@ module TextRank
104
106
  # single tokens from below the cut to above it. So we'll continue searching
105
107
  # until all of the top N final keywords (single or collapsed) have been
106
108
  # considered.
107
- loop do
108
- regexp_safe_tokens = @tokens.keys.select { |s| Regexp.escape(s) == s }
109
- single_tokens_to_consider = regexp_safe_tokens.first(@ranks_to_collapse + @to_remove.size - @to_collapse.size) - @to_remove.to_a
110
- scan_text_for_all_permutations_of(single_tokens_to_consider) or break
111
- decide_what_to_collapse_and_what_to_remove
109
+ while collapse_attempt
110
+ # keep trying
112
111
  end
113
112
 
114
113
  # We now know what to collapse and what to remove, so we can start safely
115
114
  # modifying the tokens hash
115
+ apply_collapse
116
+ end
117
+
118
+ # :nodoc:
119
+ def collapse_attempt
120
+ regexp_safe_tokens = @tokens.keys.select { |s| Regexp.escape(s) == s }
121
+ single_tokens_to_consider = regexp_safe_tokens.first(@ranks_to_collapse + @to_remove.size - @to_collapse.size) - @to_remove.to_a
122
+ scan_text_for_all_permutations_of(single_tokens_to_consider) or return false
123
+ decide_what_to_collapse_and_what_to_remove
124
+ true
125
+ end
126
+
127
+ # :nodoc:
128
+ def apply_collapse
116
129
  @to_collapse.each do |perm|
117
130
  values = @tokens.values_at(*perm).compact
118
131
  # This might be empty if somehow the scanned permutation doesn't
119
132
  # exactly match one of the tokens (e.g. ASCII-folding gone awry).
120
133
  # The goal is to do the best we can, so if we can't find it, ignore.
121
134
  next if values.empty?
135
+
122
136
  @tokens[perm.join(@delimiter)] = values.reduce(:+) / values.size
123
137
  end
138
+
124
139
  @tokens.reject! do |k, _|
125
140
  @to_remove.include?(k)
126
141
  end || @tokens
@@ -136,16 +151,10 @@ module TextRank
136
151
  # tokenization (e.g. ASCII folding). That's okay. We're just making the best effort we can
137
152
  # to find what we can.
138
153
  def scan_text_for_all_permutations_of(single_tokens)
139
- perms = []
140
154
  # NOTE that by reversing the order we craft the regex to prefer larger combinations over
141
155
  # smaller combinations (or singletons).
142
- (1..@max_tokens_to_combine).to_a.reverse.map do |nn|
143
- single_tokens.permutation(nn).each do |perm|
144
- unless @permutations_scanned.key?(perm)
145
- @permutations_scanned[perm] = 0
146
- perms << perm
147
- end
148
- end
156
+ perms = (1..@max_tokens_to_combine).to_a.reverse.flat_map do |n|
157
+ scan_text_for_n_permutations_of(single_tokens, n)
149
158
  end
150
159
  scan_text_for(perms) do |s|
151
160
  s = s.downcase if @ignore_case
@@ -153,6 +162,15 @@ module TextRank
153
162
  end unless perms.empty?
154
163
  end
155
164
 
165
+ def scan_text_for_n_permutations_of(single_tokens, n)
166
+ single_tokens.permutation(n).map do |perm|
167
+ unless @permutations_scanned.key?(perm)
168
+ @permutations_scanned[perm] = 0
169
+ perm
170
+ end
171
+ end.compact
172
+ end
173
+
156
174
  # Because we're scanning the original text, we've lost all of the character filtering we did
157
175
  # prior to tokenization, but that's important because we need the original context to be more
158
176
  # choosy. Still, we need to know what delimiter goes between collapsed tokens (since it may
@@ -179,25 +197,30 @@ module TextRank
179
197
  # modifications to the original token list yet but just keep track of what we plan
180
198
  # to collapse/remove.
181
199
  def decide_what_to_collapse_and_what_to_remove
182
- non_empty_ordered = @permutations_scanned.select do |k, v|
183
- v > 0
184
- end.sort_by do |k, v|
185
- [-v, -k.size] # reverse order
186
- end
187
-
188
200
  tokens_encountered = []
189
- non_empty_ordered.each do |perm, perm_count|
201
+ permutations_to_consider_collapsing.each do |perm, perm_count|
190
202
  if perm.size > 1
191
- singles_to_remove = perm - tokens_encountered
192
- if !singles_to_remove.empty? || combination_significant?(perm, perm_count)
193
- @to_collapse << perm if perm.size > 1
194
- @to_remove |= singles_to_remove
195
- end
203
+ decide_to_collapse_or_remove(perm, perm_count, singles_to_remove: perm - tokens_encountered)
196
204
  end
197
205
  tokens_encountered += perm
198
206
  end
199
207
  end
200
208
 
209
+ def permutations_to_consider_collapsing
210
+ @permutations_scanned.select do |_k, v|
211
+ v.positive?
212
+ end.sort_by do |k, v|
213
+ [-v, -k.size] # reverse order
214
+ end
215
+ end
216
+
217
+ def decide_to_collapse_or_remove(perm, perm_count, singles_to_remove:)
218
+ if !singles_to_remove.empty? || combination_significant?(perm, perm_count)
219
+ @to_collapse << perm if perm.size > 1
220
+ @to_remove |= singles_to_remove
221
+ end
222
+ end
223
+
201
224
  # Even if we encounter a potential collapsed key which occurs less often than the single tokens that make it up,
202
225
  # we still want to add the collapsed key if it shows up "enough" times.
203
226
  def combination_significant?(perm, perm_count)
@@ -44,8 +44,9 @@ module TextRank
44
44
  # @return [Hash<String, Float>]
45
45
  def filter!(ranks, **_)
46
46
  return if ranks.empty?
47
+
47
48
  total = ranks.values.reduce(:+)
48
- Hash[ranks.map { |k, v| [k, v / total] }]
49
+ ranks.transform_values { |v| v / total }
49
50
  end
50
51
 
51
52
  end
@@ -45,8 +45,9 @@ module TextRank
45
45
  # @return [Hash<String, Float>]
46
46
  def filter!(ranks, **_)
47
47
  return if ranks.empty?
48
+
48
49
  total = Math.sqrt(ranks.values.map { |v| v * v }.reduce(:+))
49
- Hash[ranks.map { |k, v| [k, v / total] }]
50
+ ranks.transform_values { |v| v / total }
50
51
  end
51
52
 
52
53
  end
@@ -1,5 +1,4 @@
1
1
  require 'engtagger'
2
- require 'set'
3
2
 
4
3
  module TextRank
5
4
  module TokenFilter
@@ -1,5 +1,3 @@
1
- require 'set'
2
-
3
1
  module TextRank
4
2
  module TokenFilter
5
3
  ##
@@ -15,325 +13,7 @@ module TextRank
15
13
  class Stopwords
16
14
 
17
15
  # Default English stop-word list.
18
- STOP_WORDS = Set.new(%w[
19
- a
20
- about
21
- above
22
- across
23
- after
24
- afterwards
25
- again
26
- against
27
- all
28
- almost
29
- alone
30
- along
31
- already
32
- also
33
- although
34
- always
35
- am
36
- among
37
- amongst
38
- amoungst
39
- amount
40
- an
41
- and
42
- another
43
- any
44
- anyhow
45
- anyone
46
- anything
47
- anyway
48
- anywhere
49
- are
50
- around
51
- as
52
- at
53
- back
54
- be
55
- became
56
- because
57
- become
58
- becomes
59
- becoming
60
- been
61
- before
62
- beforehand
63
- behind
64
- being
65
- below
66
- beside
67
- besides
68
- between
69
- beyond
70
- bill
71
- both
72
- bottom
73
- but
74
- by
75
- call
76
- can
77
- cannot
78
- cant
79
- co
80
- con
81
- could
82
- couldnt
83
- cry
84
- de
85
- describe
86
- detail
87
- do
88
- done
89
- down
90
- due
91
- during
92
- each
93
- eg
94
- eight
95
- either
96
- eleven
97
- else
98
- elsewhere
99
- empty
100
- enough
101
- etc
102
- even
103
- ever
104
- every
105
- everyone
106
- everything
107
- everywhere
108
- except
109
- few
110
- fifteen
111
- fify
112
- fill
113
- find
114
- fire
115
- first
116
- five
117
- for
118
- former
119
- formerly
120
- forty
121
- found
122
- four
123
- from
124
- front
125
- full
126
- further
127
- get
128
- give
129
- go
130
- had
131
- has
132
- hasnt
133
- have
134
- he
135
- hence
136
- her
137
- here
138
- hereafter
139
- hereby
140
- herein
141
- hereupon
142
- hers
143
- herself
144
- him
145
- himself
146
- his
147
- how
148
- however
149
- hundred
150
- ie
151
- if
152
- in
153
- inc
154
- indeed
155
- interest
156
- into
157
- is
158
- it
159
- its
160
- itself
161
- keep
162
- last
163
- latter
164
- latterly
165
- least
166
- less
167
- ltd
168
- made
169
- many
170
- may
171
- me
172
- meanwhile
173
- might
174
- mill
175
- mine
176
- more
177
- moreover
178
- most
179
- mostly
180
- move
181
- much
182
- must
183
- my
184
- myself
185
- name
186
- namely
187
- neither
188
- never
189
- nevertheless
190
- next
191
- nine
192
- no
193
- nobody
194
- none
195
- noone
196
- nor
197
- not
198
- nothing
199
- now
200
- nowhere
201
- of
202
- off
203
- often
204
- on
205
- once
206
- one
207
- only
208
- onto
209
- or
210
- other
211
- others
212
- otherwise
213
- our
214
- ours
215
- ourselves
216
- out
217
- over
218
- own
219
- part
220
- per
221
- perhaps
222
- please
223
- put
224
- rather
225
- re
226
- same
227
- see
228
- seem
229
- seemed
230
- seeming
231
- seems
232
- serious
233
- several
234
- she
235
- should
236
- show
237
- side
238
- since
239
- sincere
240
- six
241
- sixty
242
- so
243
- some
244
- somehow
245
- someone
246
- something
247
- sometime
248
- sometimes
249
- somewhere
250
- still
251
- such
252
- system
253
- take
254
- ten
255
- than
256
- that
257
- the
258
- their
259
- them
260
- themselves
261
- then
262
- thence
263
- there
264
- thereafter
265
- thereby
266
- therefore
267
- therein
268
- thereupon
269
- these
270
- they
271
- thickv
272
- thin
273
- third
274
- this
275
- those
276
- though
277
- three
278
- through
279
- throughout
280
- thru
281
- thus
282
- to
283
- together
284
- too
285
- top
286
- toward
287
- towards
288
- twelve
289
- twenty
290
- two
291
- un
292
- under
293
- until
294
- up
295
- upon
296
- us
297
- very
298
- via
299
- was
300
- we
301
- well
302
- were
303
- what
304
- whatever
305
- when
306
- whence
307
- whenever
308
- where
309
- whereafter
310
- whereas
311
- whereby
312
- wherein
313
- whereupon
314
- wherever
315
- whether
316
- which
317
- while
318
- whither
319
- who
320
- whoever
321
- whole
322
- whom
323
- whose
324
- why
325
- will
326
- with
327
- within
328
- without
329
- would
330
- yet
331
- you
332
- your
333
- yours
334
- yourself
335
- yourselves
336
- ])
16
+ STOP_WORDS = Set.new(YAML.load_file(File.expand_path('stopwords.yml', __dir__)))
337
17
 
338
18
  # Perform the filter
339
19
  # @param tokens [Array<String>]