text_rank 1.2.3 → 1.2.4

@@ -1,5 +1,3 @@
- require 'set'
-
  module TextRank
    ##
    # Class used to compare documents according to TextRank. A "fingerprint"
@@ -61,28 +59,22 @@ module TextRank
      # Calculates the "similarity" between this fingerprint and another
      # @param {Fingerprint} A second fingerprint to compare
      # @return [Number] A number between 0.0 (different) and 1.0 (same)
-     def similarity(trf2)
-       return 1.0 if values == trf2.values
-
-       sim = 0
-       s1 = Set.new
-       s2 = Set.new
+     def similarity(other)
+       return 1.0 if values == other.values # Short-circuit for efficiency

-       [size, trf2.size].max.times.reduce(0) do |sum, i|
-         v1 = values[i]
-         v2 = trf2.values[i]
-         if v1 == v2
-           sim += 1
-         else
-           s1.delete?(v2) ? (sim += 1) : (s2 << v2)
-           s2.delete?(v1) ? (sim += 1) : (s1 << v1)
-         end
-         sum + sim * linear_transform[i]
+       sum = 0
+       overlap(other).each_with_index do |overlap_value, i|
+         sum += overlap_value * linear_transform[i]
        end
+       sum
      end

      private

+     def overlap(other)
+       FingerprintOverlap.new(values, other.values).overlap
+     end
+
      def linear_transform
        @linear_transform ||= size.times.map do |i|
          1.0 / Math.log(i + 2) / size.to_f / norm_factor
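The rewritten `similarity` delegates the pairwise bookkeeping to the new `FingerprintOverlap` class (added in the next hunk) and keeps only the weighted sum. A hedged usage sketch; `Fingerprint.new(*values)` is an assumption about the constructor, since this hunk only shows `#values` and `#similarity`:

```ruby
require 'text_rank'

# Hypothetical fingerprints built from ranked keyword lists; the constructor
# call is assumed, not confirmed by this diff.
fp1 = TextRank::Fingerprint.new(*%w[ruby rails gem bundler])
fp2 = TextRank::Fingerprint.new(*%w[ruby gem rails rake])

fp1.similarity(fp2) # => Float between 0.0 (different) and 1.0 (same)
```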
@@ -0,0 +1,55 @@
+ module TextRank
+   ##
+   # Determines "overlap" between two fingerprints at each N prefixes
+   #
+   # For example,
+   #
+   #   FingerprintOverlap.new(
+   #     %w[a b c d],
+   #     %w[b e a c],
+   #   ).overlap
+   #
+   #   => [
+   #     0, # [a] & [b] have no overlap
+   #     1, # [a b] & [b e] have one overlap: b
+   #     2, # [a b c] & [b e a] have two overlaps: a & b
+   #     3, # [a b c d] & [b e a c] have three overlaps: a, b, & c
+   #   ]
+   ##
+   class FingerprintOverlap
+
+     attr_reader :overlap
+
+     def initialize(values1, values2)
+       raise ArgumentError, 'Value size mismatch' if values1.size != values2.size
+
+       @encountered1 = Set.new
+       @encountered2 = Set.new
+       @overlap_count = 0
+
+       @overlap = determine_overlap(values1, values2)
+     end
+
+     private
+
+     def determine_overlap(values1, values2)
+       values1.zip(values2).map do |v1, v2|
+         encounter(v1, v2)
+         @overlap_count
+       end
+     end
+
+     # This algorithm is a little more complex than idiomatic Ruby might
+     # suggest, but we want to keep it as performant as possible.
+     def encounter(value1, value2)
+       if value1 == value2
+         @overlap_count += 1
+       else
+         # Delete from the set in case an element appears more than once
+         @encountered1.delete?(value2) ? (@overlap_count += 1) : (@encountered2 << value2)
+         @encountered2.delete?(value1) ? (@overlap_count += 1) : (@encountered1 << value1)
+       end
+     end
+
+   end
+ end
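For intuition, the prefix-overlap values documented above can be reproduced with a naive quadratic version using `Array#&`. This reference sketch is illustrative only; the gem's set-based version avoids re-scanning every prefix:

```ruby
# Overlap of the first n elements of each list, for every prefix length n.
def naive_overlap(values1, values2)
  (1..values1.size).map do |n|
    (values1.first(n) & values2.first(n)).size
  end
end

naive_overlap(%w[a b c d], %w[b e a c]) # => [0, 1, 2, 3]
```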
@@ -61,18 +61,27 @@ module TextRank
        # @return [nil]
        def build_graph(tokens, graph)
          ngram_window = @ngram_size * 2 + 1
-         tokens.each_with_index do |token_i, i|
+         tokens.size.times do |i|
            ngram_window.times do |j|
-             next if j == @ngram_size || i + j < @ngram_size
-             token_j = tokens[i - @ngram_size + j]
-             if token_j
-               graph.add(token_i, token_j, weight: 1.0 / (j - @ngram_size).abs)
-             end
+             consider_ngram_window(tokens, graph, i, j)
            end
          end
          nil
        end

+       private
+
+       def consider_ngram_window(tokens, graph, i, j)
+         return if j == @ngram_size || i + j < @ngram_size
+
+         token_i = tokens[i]
+         token_j = tokens[i - @ngram_size + j]
+
+         if token_j
+           graph.add(token_i, token_j, weight: 1.0 / (j - @ngram_size).abs)
+         end
+       end
+
      end
    end
  end
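The extracted `consider_ngram_window` preserves the original semantics: each token links to up to `@ngram_size` neighbors on each side, weighted by inverse distance. A standalone sketch of the window arithmetic with toy data and `puts` in place of a real graph:

```ruby
tokens = %w[the quick brown fox]
ngram_size = 2
window = ngram_size * 2 + 1

tokens.size.times do |i|
  window.times do |j|
    next if j == ngram_size || i + j < ngram_size # skip self and the out-of-range left edge

    neighbor = tokens[i - ngram_size + j]
    next unless neighbor # nil past the right edge

    puts "#{tokens[i]} -> #{neighbor} (weight #{1.0 / (j - ngram_size).abs})"
  end
end
# the -> quick (weight 1.0)
# the -> brown (weight 0.5)
# ...
```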
@@ -13,9 +13,9 @@ module TextRank
      # @return [KeywordExtractor]
      def self.basic(**options)
        new(**{
-         char_filters: [:AsciiFolding, :Lowercase],
-         tokenizers: [:Word],
-         token_filters: [:Stopwords, :MinLength],
+         char_filters: %i[AsciiFolding Lowercase],
+         tokenizers: %i[Word],
+         token_filters: %i[Stopwords MinLength],
          graph_strategy: :Coocurrence,
        }.merge(options))
      end
@@ -25,11 +25,11 @@ module TextRank
      # @return [KeywordExtractor]
      def self.advanced(**options)
        new(**{
-         char_filters: [:AsciiFolding, :Lowercase, :StripHtml, :StripEmail, :UndoContractions, :StripPossessive],
-         tokenizers: [:Url, :Money, :Number, :Word, :Punctuation],
-         token_filters: [:PartOfSpeech, :Stopwords, :MinLength],
+         char_filters: %i[AsciiFolding Lowercase StripHtml StripEmail UndoContractions StripPossessive],
+         tokenizers: %i[Url Money Number Word Punctuation],
+         token_filters: %i[PartOfSpeech Stopwords MinLength],
          graph_strategy: :Coocurrence,
-         rank_filters: [:CollapseAdjacent, :NormalizeUnitVector, :SortByValue],
+         rank_filters: %i[CollapseAdjacent NormalizeUnitVector SortByValue],
        }.merge(options))
      end

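Both factory hunks are pure style changes to `%i[...]` symbol-array literals; behavior is identical. Usage, per the gem's documented API (the return value here is illustrative, not real output):

```ruby
require 'text_rank'

extractor = TextRank::KeywordExtractor.advanced(damping: 0.85) # defaults remain overridable
extractor.extract('TextRank ranks keywords by co-occurrence.')
# => { "keyword" => rank, ... } (a Hash of keyword to PageRank score)
```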
@@ -41,14 +41,14 @@ module TextRank
      # @option options [Array<Class, Symbol, #filter!>] :rank_filters A list of filters to be applied to the keyword ranks after keyword extraction
      def initialize(**options)
        @page_rank_options = {
-         strategy: options[:strategy] || :sparse,
-         damping: options[:damping],
+         strategy:  options[:strategy] || :sparse,
+         damping:   options[:damping],
          tolerance: options[:tolerance],
        }
-       @char_filters = options[:char_filters] || []
-       @tokenizers = options[:tokenizers] || [Tokenizer::Word]
-       @token_filters = options[:token_filters] || []
-       @rank_filters = options[:rank_filters] || []
+       @char_filters   = options[:char_filters] || []
+       @tokenizers     = options[:tokenizers] || [Tokenizer::Word]
+       @token_filters  = options[:token_filters] || []
+       @rank_filters   = options[:rank_filters] || []
        @graph_strategy = options[:graph_strategy] || GraphStrategy::Coocurrence
      end

@@ -73,9 +73,7 @@ module TextRank
      # Sets the graph strategy for producing a graph from tokens
      # @param strategy [Class, Symbol, #build_graph] Strategy for producing a graph from tokens
      # @return [Class, Symbol, #build_graph]
-     def graph_strategy=(strategy)
-       @graph_strategy = strategy
-     end
+     attr_writer :graph_strategy

      # Add a new TokenFilter for processing tokens after tokenization
      # @param filter [Class, Symbol, #filter!] A filter to process tokens after tokenization
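`attr_writer` generates exactly the writer method being deleted, so this is behavior-preserving. A quick standalone check:

```ruby
class Example
  attr_writer :graph_strategy
end

Example.new.respond_to?(:graph_strategy=) # => true
```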
@@ -153,14 +151,14 @@ module TextRank
        array.insert(idx, value)
      end

-     def classify(c, context: self)
-       case c
+     def classify(clazz, context: self)
+       case clazz
        when Class
-         c.new
+         clazz.new
        when Symbol
-         context.const_get(c).new
+         context.const_get(clazz).new
        else
-         c
+         clazz
        end
      end

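The `c` to `clazz` rename is cosmetic. What `classify` does, as a self-contained sketch with a hypothetical `Widget` class:

```ruby
module Toy
  class Widget; end

  def self.classify(clazz, context: self)
    case clazz
    when Class  then clazz.new                    # a class: instantiate it
    when Symbol then context.const_get(clazz).new # a name: look it up, then instantiate
    else clazz                                    # anything else: assume it's already usable
    end
  end
end

Toy.classify(Toy::Widget).class # => Toy::Widget
Toy.classify(:Widget).class     # => Toy::Widget
Toy.classify('custom filter')   # => "custom filter" (passed through)
```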
@@ -77,6 +77,7 @@ module TextRank

      class TokenCollapser

+       # rubocop:disable Metrics/ParameterLists
        def initialize(tokens:, text:, ranks_to_collapse: 10, max_tokens_to_combine: 2, ignore_case: true, delimiter: ' ', **_)
          @tokens = tokens
          @text = text
@@ -91,6 +92,7 @@ module TextRank
          @permutations_scanned = Hash.new(0.0) # Track how many occurrences of each permutation we found in the original text
          @combination_significance_threshold = 0.3 # The percent of occurrences of a combo of tokens to the occurrences of single tokens required to force collapsing
        end
+       # rubocop:enable Metrics/ParameterLists

        # :nodoc:
        def delimiter_re
@@ -104,23 +106,36 @@ module TextRank
          # single tokens from below the cut to above it. So we'll continue searching
          # until all of the top N final keywords (single or collapsed) have been
          # considered.
-         loop do
-           regexp_safe_tokens = @tokens.keys.select { |s| Regexp.escape(s) == s }
-           single_tokens_to_consider = regexp_safe_tokens.first(@ranks_to_collapse + @to_remove.size - @to_collapse.size) - @to_remove.to_a
-           scan_text_for_all_permutations_of(single_tokens_to_consider) or break
-           decide_what_to_collapse_and_what_to_remove
+         while collapse_attempt
+           # keep trying
          end

          # We now know what to collapse and what to remove, so we can start safely
          # modifying the tokens hash
+         apply_collapse
+       end
+
+       # :nodoc:
+       def collapse_attempt
+         regexp_safe_tokens = @tokens.keys.select { |s| Regexp.escape(s) == s }
+         single_tokens_to_consider = regexp_safe_tokens.first(@ranks_to_collapse + @to_remove.size - @to_collapse.size) - @to_remove.to_a
+         scan_text_for_all_permutations_of(single_tokens_to_consider) or return false
+         decide_what_to_collapse_and_what_to_remove
+         true
+       end
+
+       # :nodoc:
+       def apply_collapse
          @to_collapse.each do |perm|
            values = @tokens.values_at(*perm).compact
            # This might be empty if somehow the scanned permutation doesn't
            # exactly match one of the tokens (e.g. ASCII-folding gone awry).
            # The goal is to do the best we can, so if we can't find it, ignore.
            next if values.empty?
+
            @tokens[perm.join(@delimiter)] = values.reduce(:+) / values.size
          end
+
          @tokens.reject! do |k, _|
            @to_remove.include?(k)
          end || @tokens
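The `loop do ... or break` construct becomes a `while` driven by a predicate method that returns false once there is nothing left to scan. The shape of the pattern, reduced to a toy example (not the gem's logic):

```ruby
queue = [3, 2, 1]

# Returns false to end the loop, mirroring `or return false` above.
def attempt(queue)
  queue.pop ? true : false
end

while attempt(queue)
  # keep trying
end

queue # => []
```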
@@ -136,16 +151,10 @@ module TextRank
        # tokenization (e.g. ASCII folding). That's okay. We're just making the best effort we can
        # to find what we can.
        def scan_text_for_all_permutations_of(single_tokens)
-         perms = []
          # NOTE that by reversing the order we craft the regex to prefer larger combinations over
          # smaller combinations (or singletons).
-         (1..@max_tokens_to_combine).to_a.reverse.map do |nn|
-           single_tokens.permutation(nn).each do |perm|
-             unless @permutations_scanned.key?(perm)
-               @permutations_scanned[perm] = 0
-               perms << perm
-             end
-           end
+         perms = (1..@max_tokens_to_combine).to_a.reverse.flat_map do |n|
+           scan_text_for_n_permutations_of(single_tokens, n)
          end
          scan_text_for(perms) do |s|
            s = s.downcase if @ignore_case
@@ -153,6 +162,15 @@ module TextRank
          end unless perms.empty?
        end

+       def scan_text_for_n_permutations_of(single_tokens, n)
+         single_tokens.permutation(n).map do |perm|
+           unless @permutations_scanned.key?(perm)
+             @permutations_scanned[perm] = 0
+             perm
+           end
+         end.compact
+       end
+
        # Because we're scanning the original text, we've lost all of the character filtering we did
        # prior to tokenization, but that's important because we need the original context to be more
        # choosy. Still, we need to know what delimiter goes between collapsed tokens (since it may
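The `flat_map` version collects the same permutations as the old nested loops, still visiting larger combinations first. A toy run of the extracted helper's logic:

```ruby
seen = {}

perms = (1..2).to_a.reverse.flat_map do |n|
  %w[a b].permutation(n).map do |perm|
    next if seen.key?(perm) # skip permutations already scanned

    seen[perm] = 0
    perm
  end.compact
end

perms # => [["a", "b"], ["b", "a"], ["a"], ["b"]]
```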
@@ -179,25 +197,30 @@ module TextRank
        # modifications to the original token list yet but just keep track of what we plan
        # to collapse/remove.
        def decide_what_to_collapse_and_what_to_remove
-         non_empty_ordered = @permutations_scanned.select do |k, v|
-           v > 0
-         end.sort_by do |k, v|
-           [-v, -k.size] # reverse order
-         end
-
          tokens_encountered = []
-         non_empty_ordered.each do |perm, perm_count|
+         permutations_to_consider_collapsing.each do |perm, perm_count|
            if perm.size > 1
-             singles_to_remove = perm - tokens_encountered
-             if !singles_to_remove.empty? || combination_significant?(perm, perm_count)
-               @to_collapse << perm if perm.size > 1
-               @to_remove |= singles_to_remove
-             end
+             decide_to_collapse_or_remove(perm, perm_count, singles_to_remove: perm - tokens_encountered)
            end
            tokens_encountered += perm
          end
        end

+       def permutations_to_consider_collapsing
+         @permutations_scanned.select do |_k, v|
+           v.positive?
+         end.sort_by do |k, v|
+           [-v, -k.size] # reverse order
+         end
+       end
+
+       def decide_to_collapse_or_remove(perm, perm_count, singles_to_remove:)
+         if !singles_to_remove.empty? || combination_significant?(perm, perm_count)
+           @to_collapse << perm if perm.size > 1
+           @to_remove |= singles_to_remove
+         end
+       end
+
        # Even if we encounter a potential collapsed key which occurs less often than the single tokens that make it up,
        # we still want to add the collapsed key if it shows up "enough" times.
        def combination_significant?(perm, perm_count)
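`permutations_to_consider_collapsing` keeps the original ordering: occurrence count descending, ties broken by larger permutations first. A quick check of that sort key on toy data:

```ruby
perm_counts = { %w[a] => 2.0, %w[b c] => 2.0, %w[d] => 5.0 }

perm_counts.select { |_k, v| v.positive? }
           .sort_by { |k, v| [-v, -k.size] }
# => [[["d"], 5.0], [["b", "c"], 2.0], [["a"], 2.0]]
```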
@@ -44,8 +44,9 @@ module TextRank
        # @return [Hash<String, Float>]
        def filter!(ranks, **_)
          return if ranks.empty?
+
          total = ranks.values.reduce(:+)
-         Hash[ranks.map { |k, v| [k, v / total] }]
+         ranks.transform_values { |v| v / total }
        end

      end
@@ -45,8 +45,9 @@ module TextRank
        # @return [Hash<String, Float>]
        def filter!(ranks, **_)
          return if ranks.empty?
+
          total = Math.sqrt(ranks.values.map { |v| v * v }.reduce(:+))
-         Hash[ranks.map { |k, v| [k, v / total] }]
+         ranks.transform_values { |v| v / total }
        end

      end
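Both normalization filters replace the `Hash[...map...]` idiom with `Hash#transform_values` (Ruby 2.4+), which is equivalent here and skips rebuilding the key/value pairs by hand:

```ruby
ranks = { 'cat' => 3.0, 'mat' => 1.0 }
total = ranks.values.reduce(:+)

Hash[ranks.map { |k, v| [k, v / total] }] # => {"cat"=>0.75, "mat"=>0.25}
ranks.transform_values { |v| v / total }  # => {"cat"=>0.75, "mat"=>0.25}
```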
@@ -1,5 +1,4 @@
  require 'engtagger'
- require 'set'

  module TextRank
    module TokenFilter
@@ -1,5 +1,3 @@
- require 'set'
-
  module TextRank
    module TokenFilter
      ##
@@ -15,325 +13,7 @@ module TextRank
      class Stopwords

        # Default English stop-word list.
-       STOP_WORDS = Set.new(%w[
-         a
-         about
-         above
-         across
-         after
-         afterwards
-         again
-         against
-         all
-         almost
-         alone
-         along
-         already
-         also
-         although
-         always
-         am
-         among
-         amongst
-         amoungst
-         amount
-         an
-         and
-         another
-         any
-         anyhow
-         anyone
-         anything
-         anyway
-         anywhere
-         are
-         around
-         as
-         at
-         back
-         be
-         became
-         because
-         become
-         becomes
-         becoming
-         been
-         before
-         beforehand
-         behind
-         being
-         below
-         beside
-         besides
-         between
-         beyond
-         bill
-         both
-         bottom
-         but
-         by
-         call
-         can
-         cannot
-         cant
-         co
-         con
-         could
-         couldnt
-         cry
-         de
-         describe
-         detail
-         do
-         done
-         down
-         due
-         during
-         each
-         eg
-         eight
-         either
-         eleven
-         else
-         elsewhere
-         empty
-         enough
-         etc
-         even
-         ever
-         every
-         everyone
-         everything
-         everywhere
-         except
-         few
-         fifteen
-         fify
-         fill
-         find
-         fire
-         first
-         five
-         for
-         former
-         formerly
-         forty
-         found
-         four
-         from
-         front
-         full
-         further
-         get
-         give
-         go
-         had
-         has
-         hasnt
-         have
-         he
-         hence
-         her
-         here
-         hereafter
-         hereby
-         herein
-         hereupon
-         hers
-         herself
-         him
-         himself
-         his
-         how
-         however
-         hundred
-         ie
-         if
-         in
-         inc
-         indeed
-         interest
-         into
-         is
-         it
-         its
-         itself
-         keep
-         last
-         latter
-         latterly
-         least
-         less
-         ltd
-         made
-         many
-         may
-         me
-         meanwhile
-         might
-         mill
-         mine
-         more
-         moreover
-         most
-         mostly
-         move
-         much
-         must
-         my
-         myself
-         name
-         namely
-         neither
-         never
-         nevertheless
-         next
-         nine
-         no
-         nobody
-         none
-         noone
-         nor
-         not
-         nothing
-         now
-         nowhere
-         of
-         off
-         often
-         on
-         once
-         one
-         only
-         onto
-         or
-         other
-         others
-         otherwise
-         our
-         ours
-         ourselves
-         out
-         over
-         own
-         part
-         per
-         perhaps
-         please
-         put
-         rather
-         re
-         same
-         see
-         seem
-         seemed
-         seeming
-         seems
-         serious
-         several
-         she
-         should
-         show
-         side
-         since
-         sincere
-         six
-         sixty
-         so
-         some
-         somehow
-         someone
-         something
-         sometime
-         sometimes
-         somewhere
-         still
-         such
-         system
-         take
-         ten
-         than
-         that
-         the
-         their
-         them
-         themselves
-         then
-         thence
-         there
-         thereafter
-         thereby
-         therefore
-         therein
-         thereupon
-         these
-         they
-         thickv
-         thin
-         third
-         this
-         those
-         though
-         three
-         through
-         throughout
-         thru
-         thus
-         to
-         together
-         too
-         top
-         toward
-         towards
-         twelve
-         twenty
-         two
-         un
-         under
-         until
-         up
-         upon
-         us
-         very
-         via
-         was
-         we
-         well
-         were
-         what
-         whatever
-         when
-         whence
-         whenever
-         where
-         whereafter
-         whereas
-         whereby
-         wherein
-         whereupon
-         wherever
-         whether
-         which
-         while
-         whither
-         who
-         whoever
-         whole
-         whom
-         whose
-         why
-         will
-         with
-         within
-         without
-         would
-         yet
-         you
-         your
-         yours
-         yourself
-         yourselves
-       ])
+       STOP_WORDS = Set.new(YAML.load_file(File.expand_path('stopwords.yml', __dir__)))

        # Perform the filter
        # @param tokens [Array<String>]
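The word list itself is unchanged; it only moves out of the Ruby source into a data file. A sketch of the loading step, assuming `stopwords.yml` is a flat YAML array of words (the file is not shown in this diff):

```ruby
require 'set'
require 'yaml'

# stopwords.yml is assumed to look like:
#   - a
#   - about
#   - above
#   ...
words = YAML.load_file(File.expand_path('stopwords.yml', __dir__))
STOP_WORDS = Set.new(words)

STOP_WORDS.include?('the') # => true
```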