text_rank 1.1.7 → 1.2.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38)
  1. checksums.yaml +5 -5
  2. data/.codeclimate.yml +1 -6
  3. data/.rubocop.yml +60 -1075
  4. data/.ruby-version +1 -1
  5. data/.travis.yml +13 -5
  6. data/{LICENSE.txt → LICENSE} +0 -0
  7. data/README.md +2 -1
  8. data/bin/console +3 -3
  9. data/lib/page_rank.rb +2 -0
  10. data/lib/page_rank/base.rb +9 -8
  11. data/lib/page_rank/dense.rb +2 -1
  12. data/lib/page_rank/sparse.rb +6 -7
  13. data/lib/text_rank.rb +12 -9
  14. data/lib/text_rank/char_filter.rb +1 -1
  15. data/lib/text_rank/char_filter/ascii_folding.rb +5 -1
  16. data/lib/text_rank/char_filter/strip_possessive.rb +2 -2
  17. data/lib/text_rank/char_filter/undo_contractions.rb +1 -137
  18. data/lib/text_rank/char_filter/undo_contractions.yml +135 -0
  19. data/lib/text_rank/fingerprint.rb +20 -28
  20. data/lib/text_rank/fingerprint_overlap.rb +55 -0
  21. data/lib/text_rank/graph_strategy/coocurrence.rb +15 -6
  22. data/lib/text_rank/keyword_extractor.rb +32 -25
  23. data/lib/text_rank/rank_filter/collapse_adjacent.rb +53 -25
  24. data/lib/text_rank/rank_filter/normalize_probability.rb +2 -1
  25. data/lib/text_rank/rank_filter/normalize_unit_vector.rb +2 -1
  26. data/lib/text_rank/token_filter/part_of_speech.rb +0 -1
  27. data/lib/text_rank/token_filter/stopwords.rb +1 -321
  28. data/lib/text_rank/token_filter/stopwords.yml +317 -0
  29. data/lib/text_rank/tokenizer.rb +1 -1
  30. data/lib/text_rank/tokenizer/money.rb +11 -6
  31. data/lib/text_rank/tokenizer/number.rb +4 -3
  32. data/lib/text_rank/tokenizer/punctuation.rb +4 -1
  33. data/lib/text_rank/tokenizer/url.rb +3 -0
  34. data/lib/text_rank/tokenizer/whitespace.rb +4 -1
  35. data/lib/text_rank/tokenizer/word.rb +5 -2
  36. data/lib/text_rank/version.rb +3 -1
  37. data/text_rank.gemspec +10 -10
  38. metadata +48 -32
data/lib/text_rank/fingerprint.rb

@@ -1,5 +1,3 @@
-require 'set'
-
 module TextRank
   ##
   # Class used to compare documents according to TextRank. A "fingerprint"
@@ -10,35 +8,35 @@ module TextRank
   # significant keywords. But to prevent less significant keywords from being
   # completely ignored we apply an inverse log linear transformation to each of the
   # N prefixes.
-  # 
+  #
   # For example, consider the following comparison:
-  # 
+  #
   #   town man empty found
   #     vs.
   #   general empty found jar
-  # 
+  #
   # The first pass considers just the first keywords: town vs. general. As these
   # are different, they contribute 0.
-  # 
+  #
   # The second pass considers the first two keywords: town man vs general empty.
   # Again, no overlap, so they contribute 0.
-  # 
+  #
   # The third pass considers the first three keywords: town man empty vs general
   # empty found. Here we have one overlap: empty. This contributes 1.
-  # 
+  #
   # The fourth pass considers all, and there are two overlaps: empty & found. This
   # contributes 2.
-  # 
+  #
   # We can represent the overlaps as the vector [0, 0, 1, 2]. Then we will apply
   # the inverse log linear transformation defined by:
-  # 
+  #
   #   f(x_i) = x_i / ln(i + 1)
   #          = [0, 0, 1 / ln(4), 2 / ln(5)]
   #          = [0, 0, 0.7213475204444817, 1.2426698691192237]
-  # 
+  #
   # Finally we take the average of the transformed vector and normalize it (to
   # ensure a final value between 0.0 and 1.0):
-  # 
+  #
   #   norm(avg(SUM f(x_i))) = norm( avg(1.9640173895637054) )
   #                         = norm( 0.49100434739092635 )
   #                         = 0.49100434739092635 / avg(SUM f(1, 2, 3, 4))
@@ -61,28 +59,22 @@ module TextRank
     # Calculates the "similarity" between this fingerprint and another
     # @param {Fingerprint} A second fingerprint to compare
     # @return [Number] A number between 0.0 (different) and 1.0 (same)
-    def similarity(trf2)
-      return 1.0 if values == trf2.values
-
-      sim = 0
-      s1 = Set.new
-      s2 = Set.new
+    def similarity(other)
+      return 1.0 if values == other.values # Short-circuit for efficiency
 
-      [size, trf2.size].max.times.reduce(0) do |sum, i|
-        v1 = values[i]
-        v2 = trf2.values[i]
-        if v1 == v2
-          sim += 1
-        else
-          s1.delete?(v2) ? (sim += 1) : (s2 << v2)
-          s2.delete?(v1) ? (sim += 1) : (s1 << v1)
-        end
-        sum + sim * linear_transform[i]
+      sum = 0
+      overlap(other).each_with_index do |overlap_value, i|
+        sum += overlap_value * linear_transform[i]
       end
+      sum
     end
 
     private
 
+    def overlap(other)
+      FingerprintOverlap.new(values, other.values).overlap
+    end
+
    def linear_transform
      @linear_transform ||= size.times.map do |i|
        1.0 / Math.log(i + 2) / size.to_f / norm_factor
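The doc comment fully determines the arithmetic, so it can be checked by hand. Below is a minimal Ruby sketch of the calculation the comment describes (not the gem's exact internals), using the overlap vector [0, 0, 1, 2] from the example; note the implementation indexes from zero, so the comment's ln(i + 1) appears in code as Math.log(i + 2):

    overlaps = [0, 0, 1, 2]
    size = overlaps.size

    # f(x_i) = x_i / ln(i + 1), one-based i; zero-based in code
    transformed = overlaps.each_with_index.map { |x, i| x / Math.log(i + 2) }
    # => [0.0, 0.0, 0.7213475204444817, 1.2426698691192237]

    average = transformed.sum / size
    # => 0.49100434739092635

    # Normalize against the best possible overlap vector [1, 2, 3, 4]
    best_average = (1..size).each_with_index.map { |x, i| x / Math.log(i + 2) }.sum / size
    average / best_average
    # => ~0.248, the similarity of the two keyword lists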
data/lib/text_rank/fingerprint_overlap.rb (new file)

@@ -0,0 +1,55 @@
+module TextRank
+  ##
+  # Determines "overlap" between two fingerprints at each N prefixes
+  #
+  # For example,
+  #
+  #   FingerprintOverlap.new(
+  #     %w[a b c d],
+  #     %w[b e a c],
+  #   ).overlap
+  #
+  #   => [
+  #     0, # [a] & [b] have no overlap
+  #     1, # [a b] & [b e] have one overlap: b
+  #     2, # [a b c] & [b e a] have two overlaps: a & b
+  #     3, # [a b c d] & [b e a c] have three overlaps: a, b, & c
+  #   ]
+  ##
+  class FingerprintOverlap
+
+    attr_reader :overlap
+
+    def initialize(values1, values2)
+      raise ArgumentError, 'Value size mismatch' if values1.size != values2.size
+
+      @encountered1 = Set.new
+      @encountered2 = Set.new
+      @overlap_count = 0
+
+      @overlap = determine_overlap(values1, values2)
+    end
+
+    private
+
+    def determine_overlap(values1, values2)
+      values1.zip(values2).map do |v1, v2|
+        encounter(v1, v2)
+        @overlap_count
+      end
+    end
+
+    # This algorithm is a little more complex than idiomatic Ruby would suggest,
+    # but we want to keep it as performant as possible.
+    def encounter(value1, value2)
+      if value1 == value2
+        @overlap_count += 1
+      else
+        # Delete from the set in case an element appears more than once
+        @encountered1.delete?(value2) ? (@overlap_count += 1) : (@encountered2 << value2)
+        @encountered2.delete?(value1) ? (@overlap_count += 1) : (@encountered1 << value1)
+      end
+    end
+
+  end
+end
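As a quick sanity check, the keyword lists from the Fingerprint doc comment produce exactly the overlap vector used there. This sketch assumes FingerprintOverlap is reachable once the gem is loaded, and that Set is required centrally (this diff removes the per-file require 'set' lines):

    require 'set'
    require 'text_rank'

    TextRank::FingerprintOverlap.new(
      %w[town man empty found],
      %w[general empty found jar],
    ).overlap
    # => [0, 0, 1, 2]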
data/lib/text_rank/graph_strategy/coocurrence.rb

@@ -61,18 +61,27 @@ module TextRank
       # return [nil]
       def build_graph(tokens, graph)
         ngram_window = @ngram_size * 2 + 1
-        tokens.each_with_index do |token_i, i|
+        tokens.size.times do |i|
           ngram_window.times do |j|
-            next if j == @ngram_size || i + j < @ngram_size
-            token_j = tokens[i - @ngram_size + j]
-            if token_j
-              graph.add(token_i, token_j, weight: 1.0 / (j - @ngram_size).abs)
-            end
+            consider_ngram_window(tokens, graph, i, j)
           end
         end
         nil
       end
 
+      private
+
+      def consider_ngram_window(tokens, graph, i, j)
+        return if j == @ngram_size || i + j < @ngram_size
+
+        token_i = tokens[i]
+        token_j = tokens[i - @ngram_size + j]
+
+        if token_j
+          graph.add(token_i, token_j, weight: 1.0 / (j - @ngram_size).abs)
+        end
+      end
+
     end
   end
 end
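The refactor extracts the window arithmetic into consider_ngram_window without changing it: for each token index i, j sweeps a window of @ngram_size * 2 + 1 positions centered on the token, the center position is skipped, and each co-occurring pair is weighted by the inverse of its distance. A standalone sketch of the weights, assuming a hypothetical ngram_size of 3:

    ngram_size = 3
    window = ngram_size * 2 + 1        # 7 positions, the token itself in the middle
    window.times do |j|
      next if j == ngram_size          # skip comparing the token to itself
      offset = j - ngram_size          # -3..3, excluding 0
      puts format('offset %+d -> weight %.3f', offset, 1.0 / offset.abs)
    end
    # offsets +/-1 weigh 1.000, +/-2 weigh 0.500, +/-3 weigh 0.333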
data/lib/text_rank/keyword_extractor.rb

@@ -13,9 +13,9 @@ module TextRank
     # @return [KeywordExtractor]
     def self.basic(**options)
       new(**{
-        char_filters: [:AsciiFolding, :Lowercase],
-        tokenizers: [:Word],
-        token_filters: [:Stopwords, :MinLength],
+        char_filters: %i[AsciiFolding Lowercase],
+        tokenizers: %i[Word],
+        token_filters: %i[Stopwords MinLength],
         graph_strategy: :Coocurrence,
       }.merge(options))
     end
@@ -25,11 +25,11 @@ module TextRank
     # @return [KeywordExtractor]
     def self.advanced(**options)
       new(**{
-        char_filters: [:AsciiFolding, :Lowercase, :StripHtml, :StripEmail, :UndoContractions, :StripPossessive],
-        tokenizers: [:Url, :Money, :Number, :Word, :Punctuation],
-        token_filters: [:PartOfSpeech, :Stopwords, :MinLength],
+        char_filters: %i[AsciiFolding Lowercase StripHtml StripEmail UndoContractions StripPossessive],
+        tokenizers: %i[Url Money Number Word Punctuation],
+        token_filters: %i[PartOfSpeech Stopwords MinLength],
         graph_strategy: :Coocurrence,
-        rank_filters: [:CollapseAdjacent, :NormalizeUnitVector, :SortByValue],
+        rank_filters: %i[CollapseAdjacent NormalizeUnitVector SortByValue],
       }.merge(options))
     end
 
@@ -41,14 +41,14 @@ module TextRank
     # @option options [Array<Class, Symbol, #filter!>] :rank_filters A list of filters to be applied to the keyword ranks after keyword extraction
     def initialize(**options)
       @page_rank_options = {
-        strategy: options[:strategy] || :dense,
-        damping: options[:damping],
+        strategy:  options[:strategy] || :sparse,
+        damping:   options[:damping],
         tolerance: options[:tolerance],
       }
-      @char_filters = options[:char_filters] || []
-      @tokenizers = options[:tokenizers] || [Tokenizer::Word]
-      @token_filters = options[:token_filters] || []
-      @rank_filters = options[:rank_filters] || []
+      @char_filters   = options[:char_filters] || []
+      @tokenizers     = options[:tokenizers] || [Tokenizer::Word]
+      @token_filters  = options[:token_filters] || []
+      @rank_filters   = options[:rank_filters] || []
       @graph_strategy = options[:graph_strategy] || GraphStrategy::Coocurrence
     end
 
@@ -73,9 +73,7 @@ module TextRank
     # Sets the graph strategy for producing a graph from tokens
     # @param strategy [Class, Symbol, #build_graph] Strategy for producing a graph from tokens
     # @return [Class, Symbol, #build_graph]
-    def graph_strategy=(strategy)
-      @graph_strategy = strategy
-    end
+    attr_writer :graph_strategy
 
     # Add a new TokenFilter for processing tokens after tokenization
     # @param filter [Class, Symbol, #filter!] A filter to process tokens after tokenization
@@ -105,14 +103,23 @@ module TextRank
     end
 
     # Filter & tokenize text, and return PageRank
-    # @param text [String] unfiltered text to be processed
+    # @param text [String,Array<String>] unfiltered text to be processed
     # @return [Hash<String, Float>] tokens and page ranks (in descending order)
     def extract(text, **options)
-      tokens = tokenize(text)
+      text = Array(text)
+      tokens_per_text = text.map do |t|
+        tokenize(t)
+      end
       graph = PageRank.new(**@page_rank_options)
-      classify(@graph_strategy, context: GraphStrategy).build_graph(tokens, graph)
+      strategy = classify(@graph_strategy, context: GraphStrategy)
+      tokens_per_text.each do |tokens|
+        strategy.build_graph(tokens, graph)
+      end
       ranks = graph.calculate(**options)
-      apply_rank_filters(ranks, tokens: tokens, original_text: text)
+      tokens_per_text.each_with_index do |tokens, i|
+        ranks = apply_rank_filters(ranks, tokens: tokens, original_text: text[i])
+      end
+      ranks
     end
 
     private
@@ -153,14 +160,14 @@ module TextRank
       array.insert(idx, value)
     end
 
-    def classify(c, context: self)
-      case c
+    def classify(clazz, context: self)
+      case clazz
       when Class
-        c.new
+        clazz.new
       when Symbol
-        context.const_get(c).new
+        context.const_get(clazz).new
       else
-        c
+        clazz
      end
    end
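Two behavioral changes stand out here: the default PageRank strategy flips from :dense to :sparse, and extract now accepts either a single string or an array of strings, tokenizing each text separately while building one shared graph and applying the rank filters once per text. A hedged usage sketch (the input sentences are made up for illustration):

    require 'text_rank'

    extractor = TextRank::KeywordExtractor.basic

    # As before: one string in, Hash<String, Float> out, in descending rank order
    extractor.extract('The town was empty after the man left.')

    # New in 1.2.x: several related texts can contribute to a single ranking
    extractor.extract([
      'The town was empty after the man left.',
      'The general found the empty jar.',
    ])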
 
data/lib/text_rank/rank_filter/collapse_adjacent.rb

@@ -77,6 +77,7 @@ module TextRank
 
     class TokenCollapser
 
+      # rubocop:disable Metrics/ParameterLists
       def initialize(tokens:, text:, ranks_to_collapse: 10, max_tokens_to_combine: 2, ignore_case: true, delimiter: ' ', **_)
         @tokens = tokens
         @text = text
@@ -91,6 +92,7 @@ module TextRank
         @permutations_scanned = Hash.new(0.0) # Track how many occurrences of each permutation we found in the original text
         @combination_significance_threshold = 0.3 # The percent of occurrences of a combo of tokens to the occurrences of single tokens required to force collapsing
       end
+      # rubocop:enable Metrics/ParameterLists
 
       # :nodoc:
       def delimiter_re
@@ -104,18 +106,36 @@ module TextRank
         # single tokens from below the cut to above it. So we'll continue searching
         # until all of the top N final keywords (single or collapsed) have been
         # considered.
-        loop do
-          single_tokens_to_consider = @tokens.keys.first(@ranks_to_collapse + @to_remove.size - @to_collapse.size) - @to_remove.to_a
-          scan_text_for_all_permutations_of(single_tokens_to_consider) or break
-          decide_what_to_collapse_and_what_to_remove
+        while collapse_attempt
+          # keep trying
         end
 
         # We now know what to collapse and what to remove, so we can start safely
         # modifying the tokens hash
+        apply_collapse
+      end
+
+      # :nodoc:
+      def collapse_attempt
+        regexp_safe_tokens = @tokens.keys.select { |s| Regexp.escape(s) == s }
+        single_tokens_to_consider = regexp_safe_tokens.first(@ranks_to_collapse + @to_remove.size - @to_collapse.size) - @to_remove.to_a
+        scan_text_for_all_permutations_of(single_tokens_to_consider) or return false
+        decide_what_to_collapse_and_what_to_remove
+        true
+      end
+
+      # :nodoc:
+      def apply_collapse
         @to_collapse.each do |perm|
-          values = @tokens.values_at(*perm)
+          values = @tokens.values_at(*perm).compact
+          # This might be empty if somehow the scanned permutation doesn't
+          # exactly match one of the tokens (e.g. ASCII-folding gone awry).
+          # The goal is to do the best we can, so if we can't find it, ignore.
+          next if values.empty?
+
           @tokens[perm.join(@delimiter)] = values.reduce(:+) / values.size
         end
+
         @tokens.reject! do |k, _|
           @to_remove.include?(k)
         end || @tokens
@@ -131,16 +151,10 @@ module TextRank
       # tokenization (e.g. ASCII folding). That's okay. We're just making the best effort we can
       # to find what we can.
       def scan_text_for_all_permutations_of(single_tokens)
-        perms = []
         # NOTE that by reversing the order we craft the regex to prefer larger combinations over
         # smaller combinations (or singletons).
-        (1..@max_tokens_to_combine).to_a.reverse.map do |nn|
-          single_tokens.permutation(nn).each do |perm|
-            unless @permutations_scanned.key?(perm)
-              @permutations_scanned[perm] = 0
-              perms << perm
-            end
-          end
+        perms = (1..@max_tokens_to_combine).to_a.reverse.flat_map do |n|
+          scan_text_for_n_permutations_of(single_tokens, n)
         end
         scan_text_for(perms) do |s|
           s = s.downcase if @ignore_case
@@ -148,6 +162,15 @@ module TextRank
         end unless perms.empty?
       end
 
+      def scan_text_for_n_permutations_of(single_tokens, n)
+        single_tokens.permutation(n).map do |perm|
+          unless @permutations_scanned.key?(perm)
+            @permutations_scanned[perm] = 0
+            perm
+          end
+        end.compact
+      end
+
       # Because we're scanning the original text, we've lost all of the character filtering we did
       # prior to tokenization, but that's important because we need the original context to be more
       # choosy. Still, we need to know what delimiter goes between collapsed tokens (since it may
@@ -174,25 +197,30 @@ module TextRank
       # modifications to the original token list yet but just keep track of what we plan
       # to collapse/remove.
       def decide_what_to_collapse_and_what_to_remove
-        non_empty_ordered = @permutations_scanned.select do |k, v|
-          v > 0
-        end.sort_by do |k, v|
-          [-v, -k.size] # reverse order
-        end
-
         tokens_encountered = []
-        non_empty_ordered.each do |perm, perm_count|
+        permutations_to_consider_collapsing.each do |perm, perm_count|
           if perm.size > 1
-            singles_to_remove = perm - tokens_encountered
-            if !singles_to_remove.empty? || combination_significant?(perm, perm_count)
-              @to_collapse << perm if perm.size > 1
-              @to_remove |= singles_to_remove
-            end
+            decide_to_collapse_or_remove(perm, perm_count, singles_to_remove: perm - tokens_encountered)
           end
           tokens_encountered += perm
         end
       end
 
+      def permutations_to_consider_collapsing
+        @permutations_scanned.select do |_k, v|
+          v.positive?
+        end.sort_by do |k, v|
+          [-v, -k.size] # reverse order
+        end
+      end
+
+      def decide_to_collapse_or_remove(perm, perm_count, singles_to_remove:)
+        if !singles_to_remove.empty? || combination_significant?(perm, perm_count)
+          @to_collapse << perm if perm.size > 1
+          @to_remove |= singles_to_remove
+        end
+      end
+
       # Even if we encounter a potential collapsed key which occurs less often than the single tokens that make it up,
       # we still want to add the collapsed key if it shows up "enough" times.
       def combination_significant?(perm, perm_count)
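Worth noting in apply_collapse: a collapsed permutation takes the average of its members' ranks, and the new .compact guard skips permutations whose members can no longer be found in the tokens hash. An illustrative sketch of the averaging with made-up tokens and ranks, assuming the default single-space delimiter:

    tokens = { 'town' => 0.3, 'hall' => 0.2, 'jar' => 0.1 }
    perm = %w[town hall]                      # a permutation found adjacent in the text

    values = tokens.values_at(*perm).compact  # => [0.3, 0.2]
    tokens[perm.join(' ')] = values.reduce(:+) / values.size
    tokens['town hall']                       # => 0.25, the average of the merged ranks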
data/lib/text_rank/rank_filter/normalize_probability.rb

@@ -44,8 +44,9 @@ module TextRank
       # @return [Hash<String, Float>]
       def filter!(ranks, **_)
         return if ranks.empty?
+
         total = ranks.values.reduce(:+)
-        Hash[ranks.map { |k, v| [k, v / total] }]
+        ranks.transform_values { |v| v / total }
       end
 
     end
data/lib/text_rank/rank_filter/normalize_unit_vector.rb

@@ -45,8 +45,9 @@ module TextRank
       # @return [Hash<String, Float>]
       def filter!(ranks, **_)
         return if ranks.empty?
+
         total = Math.sqrt(ranks.values.map { |v| v * v }.reduce(:+))
-        Hash[ranks.map { |k, v| [k, v / total] }]
+        ranks.transform_values { |v| v / total }
       end
 
     end
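Both normalization filters swap the Hash[ranks.map { ... }] construction for Hash#transform_values, which requires Ruby 2.4 or newer (presumably related to the .ruby-version change elsewhere in this diff). A small sketch of the two scalings with made-up ranks:

    ranks = { 'town' => 2.0, 'hall' => 1.0, 'jar' => 1.0 }

    # NormalizeProbability: values sum to 1.0
    total = ranks.values.reduce(:+)                              # => 4.0
    ranks.transform_values { |v| v / total }
    # => {"town"=>0.5, "hall"=>0.25, "jar"=>0.25}

    # NormalizeUnitVector: values form a vector with L2 norm 1.0
    norm = Math.sqrt(ranks.values.map { |v| v * v }.reduce(:+))  # => 2.449... (sqrt of 6.0)
    ranks.transform_values { |v| v / norm }
    # => {"town"=>0.816..., "hall"=>0.408..., "jar"=>0.408...}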
data/lib/text_rank/token_filter/part_of_speech.rb

@@ -1,5 +1,4 @@
 require 'engtagger'
-require 'set'
 
 module TextRank
   module TokenFilter