text_rank 1.1.7 → 1.2.5

Files changed (38)
  1. checksums.yaml +5 -5
  2. data/.codeclimate.yml +1 -6
  3. data/.rubocop.yml +60 -1075
  4. data/.ruby-version +1 -1
  5. data/.travis.yml +13 -5
  6. data/{LICENSE.txt → LICENSE} +0 -0
  7. data/README.md +2 -1
  8. data/bin/console +3 -3
  9. data/lib/page_rank.rb +2 -0
  10. data/lib/page_rank/base.rb +9 -8
  11. data/lib/page_rank/dense.rb +2 -1
  12. data/lib/page_rank/sparse.rb +6 -7
  13. data/lib/text_rank.rb +12 -9
  14. data/lib/text_rank/char_filter.rb +1 -1
  15. data/lib/text_rank/char_filter/ascii_folding.rb +5 -1
  16. data/lib/text_rank/char_filter/strip_possessive.rb +2 -2
  17. data/lib/text_rank/char_filter/undo_contractions.rb +1 -137
  18. data/lib/text_rank/char_filter/undo_contractions.yml +135 -0
  19. data/lib/text_rank/fingerprint.rb +20 -28
  20. data/lib/text_rank/fingerprint_overlap.rb +55 -0
  21. data/lib/text_rank/graph_strategy/coocurrence.rb +15 -6
  22. data/lib/text_rank/keyword_extractor.rb +32 -25
  23. data/lib/text_rank/rank_filter/collapse_adjacent.rb +53 -25
  24. data/lib/text_rank/rank_filter/normalize_probability.rb +2 -1
  25. data/lib/text_rank/rank_filter/normalize_unit_vector.rb +2 -1
  26. data/lib/text_rank/token_filter/part_of_speech.rb +0 -1
  27. data/lib/text_rank/token_filter/stopwords.rb +1 -321
  28. data/lib/text_rank/token_filter/stopwords.yml +317 -0
  29. data/lib/text_rank/tokenizer.rb +1 -1
  30. data/lib/text_rank/tokenizer/money.rb +11 -6
  31. data/lib/text_rank/tokenizer/number.rb +4 -3
  32. data/lib/text_rank/tokenizer/punctuation.rb +4 -1
  33. data/lib/text_rank/tokenizer/url.rb +3 -0
  34. data/lib/text_rank/tokenizer/whitespace.rb +4 -1
  35. data/lib/text_rank/tokenizer/word.rb +5 -2
  36. data/lib/text_rank/version.rb +3 -1
  37. data/text_rank.gemspec +10 -10
  38. metadata +48 -32
data/lib/text_rank/fingerprint.rb
@@ -1,5 +1,3 @@
-require 'set'
-
 module TextRank
   ##
   # Class used to compare documents according to TextRank. A "fingerprint"
@@ -10,35 +8,35 @@ module TextRank
   # significant keywords. But to prevent less significant keywords from being
   # completely ignored we apply an inverse log linear transformation to each of the
   # N prefixes.
-  #
+  #
   # For example, consider the following comparison:
-  #
+  #
   #   town man empty found
   #     vs.
   #   general empty found jar
-  #
+  #
   # The first pass considers just the first keywords: town vs. general. As these
   # are different, they contribute 0.
-  #
+  #
   # The second pass considers the first two keywords: town man vs general empty.
   # Again, no overlap, so they contribute 0.
-  #
+  #
   # The third pass considers the first three keywords: town man empty vs general
   # empty found. Here we have one overlap: empty. This contributes 1.
-  #
+  #
   # The fourth pass considers all, and there are two overlaps: empty & found. This
   # contributes 2.
-  #
+  #
   # We can represent the overlaps as the vector [0, 0, 1, 2]. Then we will apply
   # the inverse log linear transformation defined by:
-  #
+  #
   #   f(x_i) = x_i / ln(i + 1)
   #          = [0, 0, 1 / ln(4), 2 / ln(5)]
   #          = [0, 0, 0.7213475204444817, 1.2426698691192237]
-  #
+  #
   # Finally we take the average of the transformed vector and normalize it (to
   # ensure a final value between 0.0 and 1.0):
-  #
+  #
   #   norm(avg(SUM f(x_i))) = norm( avg(1.9640173895637054) )
   #                         = norm( 0.49100434739092635 )
   #                         = 0.49100434739092635 / avg(SUM f(1, 2, 3, 4))
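The worked example in that comment can be checked with a few lines of standalone Ruby. This is a sketch, not code from the gem; the Math.log(i + 2) weighting mirrors the linear_transform method shown below, and the final division carries the comment's truncated norm(...) arithmetic to its result:

    # Overlap counts at each prefix length:
    # town man empty found  vs.  general empty found jar
    overlaps = [0, 0, 1, 2]
    size     = overlaps.size

    # f(x_i) = x_i / ln(i + 1) with 1-indexed i, i.e. Math.log(i + 2) 0-indexed
    average = overlaps.each_with_index.map { |x, i| x / Math.log(i + 2) }.sum / size
    # => 0.49100434739092635

    # Normalize against the best possible score, where every prefix fully overlaps
    best = (1..size).each_with_index.map { |x, i| x / Math.log(i + 2) }.sum / size
    puts average / best # => 0.2482..., the similarity of these two fingerprints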
@@ -61,28 +59,22 @@ module TextRank
     # Calculates the "similarity" between this fingerprint and another
     # @param {Fingerprint} A second fingerprint to compare
     # @return [Number] A number between 0.0 (different) and 1.0 (same)
-    def similarity(trf2)
-      return 1.0 if values == trf2.values
-
-      sim = 0
-      s1 = Set.new
-      s2 = Set.new
+    def similarity(other)
+      return 1.0 if values == other.values # Short-circuit for efficiency
 
-      [size, trf2.size].max.times.reduce(0) do |sum, i|
-        v1 = values[i]
-        v2 = trf2.values[i]
-        if v1 == v2
-          sim += 1
-        else
-          s1.delete?(v2) ? (sim += 1) : (s2 << v2)
-          s2.delete?(v1) ? (sim += 1) : (s1 << v1)
-        end
-        sum + sim * linear_transform[i]
+      sum = 0
+      overlap(other).each_with_index do |overlap_value, i|
+        sum += overlap_value * linear_transform[i]
       end
+      sum
     end
 
     private
 
+    def overlap(other)
+      FingerprintOverlap.new(values, other.values).overlap
+    end
+
     def linear_transform
       @linear_transform ||= size.times.map do |i|
         1.0 / Math.log(i + 2) / size.to_f / norm_factor
data/lib/text_rank/fingerprint_overlap.rb (new file)
@@ -0,0 +1,55 @@
+module TextRank
+  ##
+  # Determines "overlap" between two fingerprints at each N prefixes
+  #
+  # For example,
+  #
+  #   FingerprintOverlap.new(
+  #     %w[a b c d],
+  #     %w[b e a c],
+  #   ).overlap
+  #
+  #   => [
+  #     0, # [a] & [b] have no overlap
+  #     1, # [a b] & [b e] have one overlap: b
+  #     2, # [a b c] & [b e a] have two overlaps: a & b
+  #     3, # [a b c d] & [b e a c] have three overlaps: a, b, & c
+  #   ]
+  ##
+  class FingerprintOverlap
+
+    attr_reader :overlap
+
+    def initialize(values1, values2)
+      raise ArgumentError, 'Value size mismatch' if values1.size != values2.size
+
+      @encountered1 = Set.new
+      @encountered2 = Set.new
+      @overlap_count = 0
+
+      @overlap = determine_overlap(values1, values2)
+    end
+
+    private
+
+    def determine_overlap(values1, values2)
+      values1.zip(values2).map do |v1, v2|
+        encounter(v1, v2)
+        @overlap_count
+      end
+    end
+
+    # This algorithm is a little more complex than it could be in idiomatic Ruby,
+    # but we want to keep it as performant as possible.
+    def encounter(value1, value2)
+      if value1 == value2
+        @overlap_count += 1
+      else
+        # Delete from the set in case an element appears more than once
+        @encountered1.delete?(value2) ? (@overlap_count += 1) : (@encountered2 << value2)
+        @encountered2.delete?(value1) ? (@overlap_count += 1) : (@encountered1 << value1)
+      end
+    end
+
+  end
+end
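A quick sanity check of the doc-comment example above. This is hypothetical standalone usage: it assumes require 'text_rank' makes TextRank::FingerprintOverlap loadable, and it requires 'set' explicitly since the class uses Set without requiring it itself:

    require 'set'
    require 'text_rank'

    TextRank::FingerprintOverlap.new(%w[a b c d], %w[b e a c]).overlap
    # => [0, 1, 2, 3]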
data/lib/text_rank/graph_strategy/coocurrence.rb
@@ -61,18 +61,27 @@ module TextRank
       # @return [nil]
       def build_graph(tokens, graph)
         ngram_window = @ngram_size * 2 + 1
-        tokens.each_with_index do |token_i, i|
+        tokens.size.times do |i|
           ngram_window.times do |j|
-            next if j == @ngram_size || i + j < @ngram_size
-            token_j = tokens[i - @ngram_size + j]
-            if token_j
-              graph.add(token_i, token_j, weight: 1.0 / (j - @ngram_size).abs)
-            end
+            consider_ngram_window(tokens, graph, i, j)
           end
         end
         nil
       end
 
+      private
+
+      def consider_ngram_window(tokens, graph, i, j)
+        return if j == @ngram_size || i + j < @ngram_size
+
+        token_i = tokens[i]
+        token_j = tokens[i - @ngram_size + j]
+
+        if token_j
+          graph.add(token_i, token_j, weight: 1.0 / (j - @ngram_size).abs)
+        end
+      end
+
     end
   end
 end
data/lib/text_rank/keyword_extractor.rb
@@ -13,9 +13,9 @@ module TextRank
     # @return [KeywordExtractor]
     def self.basic(**options)
       new(**{
-        char_filters: [:AsciiFolding, :Lowercase],
-        tokenizers: [:Word],
-        token_filters: [:Stopwords, :MinLength],
+        char_filters: %i[AsciiFolding Lowercase],
+        tokenizers: %i[Word],
+        token_filters: %i[Stopwords MinLength],
         graph_strategy: :Coocurrence,
       }.merge(options))
     end
@@ -25,11 +25,11 @@ module TextRank
     # @return [KeywordExtractor]
     def self.advanced(**options)
       new(**{
-        char_filters: [:AsciiFolding, :Lowercase, :StripHtml, :StripEmail, :UndoContractions, :StripPossessive],
-        tokenizers: [:Url, :Money, :Number, :Word, :Punctuation],
-        token_filters: [:PartOfSpeech, :Stopwords, :MinLength],
+        char_filters: %i[AsciiFolding Lowercase StripHtml StripEmail UndoContractions StripPossessive],
+        tokenizers: %i[Url Money Number Word Punctuation],
+        token_filters: %i[PartOfSpeech Stopwords MinLength],
         graph_strategy: :Coocurrence,
-        rank_filters: [:CollapseAdjacent, :NormalizeUnitVector, :SortByValue],
+        rank_filters: %i[CollapseAdjacent NormalizeUnitVector SortByValue],
       }.merge(options))
     end
 
@@ -41,14 +41,14 @@ module TextRank
     # @option options [Array<Class, Symbol, #filter!>] :rank_filters A list of filters to be applied to the keyword ranks after keyword extraction
     def initialize(**options)
       @page_rank_options = {
-        strategy: options[:strategy] || :dense,
-        damping: options[:damping],
+        strategy:  options[:strategy] || :sparse,
+        damping:   options[:damping],
         tolerance: options[:tolerance],
       }
-      @char_filters = options[:char_filters] || []
-      @tokenizers = options[:tokenizers] || [Tokenizer::Word]
-      @token_filters = options[:token_filters] || []
-      @rank_filters = options[:rank_filters] || []
+      @char_filters   = options[:char_filters] || []
+      @tokenizers     = options[:tokenizers] || [Tokenizer::Word]
+      @token_filters  = options[:token_filters] || []
+      @rank_filters   = options[:rank_filters] || []
       @graph_strategy = options[:graph_strategy] || GraphStrategy::Coocurrence
     end
 
@@ -73,9 +73,7 @@ module TextRank
     # Sets the graph strategy for producing a graph from tokens
     # @param strategy [Class, Symbol, #build_graph] Strategy for producing a graph from tokens
     # @return [Class, Symbol, #build_graph]
-    def graph_strategy=(strategy)
-      @graph_strategy = strategy
-    end
+    attr_writer :graph_strategy
 
     # Add a new TokenFilter for processing tokens after tokenization
     # @param filter [Class, Symbol, #filter!] A filter to process tokens after tokenization
@@ -105,14 +103,23 @@ module TextRank
     end
 
     # Filter & tokenize text, and return PageRank
-    # @param text [String] unfiltered text to be processed
+    # @param text [String, Array<String>] unfiltered text to be processed
     # @return [Hash<String, Float>] tokens and page ranks (in descending order)
     def extract(text, **options)
-      tokens = tokenize(text)
+      text = Array(text)
+      tokens_per_text = text.map do |t|
+        tokenize(t)
+      end
       graph = PageRank.new(**@page_rank_options)
-      classify(@graph_strategy, context: GraphStrategy).build_graph(tokens, graph)
+      strategy = classify(@graph_strategy, context: GraphStrategy)
+      tokens_per_text.each do |tokens|
+        strategy.build_graph(tokens, graph)
+      end
       ranks = graph.calculate(**options)
-      apply_rank_filters(ranks, tokens: tokens, original_text: text)
+      tokens_per_text.each_with_index do |tokens, i|
+        ranks = apply_rank_filters(ranks, tokens: tokens, original_text: text[i])
+      end
+      ranks
     end
 
     private
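The reworked extract accepts either a single string or an array of strings: each document is tokenized separately, every token list feeds the one shared PageRank graph, and the rank filters then run once per document. A usage sketch (the input strings are invented; actual keywords and ranks depend entirely on the text):

    extractor = TextRank::KeywordExtractor.advanced

    # Single document, as before:
    extractor.extract('The town man found an empty jar.')

    # New in 1.2.x: multiple documents contribute to a single graph
    extractor.extract([
      'The town man found an empty jar.',
      'The general found the town empty.',
    ])
    # => Hash<String, Float> of keyword => rank, in descending rank order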
@@ -153,14 +160,14 @@ module TextRank
       array.insert(idx, value)
     end
 
-    def classify(c, context: self)
-      case c
+    def classify(clazz, context: self)
+      case clazz
       when Class
-        c.new
+        clazz.new
       when Symbol
-        context.const_get(c).new
+        context.const_get(clazz).new
       else
-        c
+        clazz
       end
     end
 
data/lib/text_rank/rank_filter/collapse_adjacent.rb
@@ -77,6 +77,7 @@ module TextRank
 
       class TokenCollapser
 
+        # rubocop:disable Metrics/ParameterLists
         def initialize(tokens:, text:, ranks_to_collapse: 10, max_tokens_to_combine: 2, ignore_case: true, delimiter: ' ', **_)
           @tokens = tokens
           @text = text
@@ -91,6 +92,7 @@ module TextRank
           @permutations_scanned = Hash.new(0.0) # Track how many occurrences of each permutation we found in the original text
           @combination_significance_threshold = 0.3 # The percent of occurrences of a combo of tokens to the occurrences of single tokens required to force collapsing
         end
+        # rubocop:enable Metrics/ParameterLists
 
         # :nodoc:
         def delimiter_re
@@ -104,18 +106,36 @@ module TextRank
           # single tokens from below the cut to above it. So we'll continue searching
           # until all of the top N final keywords (single or collapsed) have been
           # considered.
-          loop do
-            single_tokens_to_consider = @tokens.keys.first(@ranks_to_collapse + @to_remove.size - @to_collapse.size) - @to_remove.to_a
-            scan_text_for_all_permutations_of(single_tokens_to_consider) or break
-            decide_what_to_collapse_and_what_to_remove
+          while collapse_attempt
+            # keep trying
           end
 
           # We now know what to collapse and what to remove, so we can start safely
           # modifying the tokens hash
+          apply_collapse
+        end
+
+        # :nodoc:
+        def collapse_attempt
+          regexp_safe_tokens = @tokens.keys.select { |s| Regexp.escape(s) == s }
+          single_tokens_to_consider = regexp_safe_tokens.first(@ranks_to_collapse + @to_remove.size - @to_collapse.size) - @to_remove.to_a
+          scan_text_for_all_permutations_of(single_tokens_to_consider) or return false
+          decide_what_to_collapse_and_what_to_remove
+          true
+        end
+
+        # :nodoc:
+        def apply_collapse
          @to_collapse.each do |perm|
-            values = @tokens.values_at(*perm)
+            values = @tokens.values_at(*perm).compact
+            # This might be empty if somehow the scanned permutation doesn't
+            # exactly match one of the tokens (e.g. ASCII-folding gone awry).
+            # The goal is to do the best we can, so if we can't find it, ignore.
+            next if values.empty?
+
            @tokens[perm.join(@delimiter)] = values.reduce(:+) / values.size
          end
+
          @tokens.reject! do |k, _|
            @to_remove.include?(k)
          end || @tokens
@@ -131,16 +151,10 @@ module TextRank
        # tokenization (e.g. ASCII folding). That's okay. We're just making the best effort we can
        # to find what we can.
        def scan_text_for_all_permutations_of(single_tokens)
-          perms = []
          # NOTE that by reversing the order we craft the regex to prefer larger combinations over
          # smaller combinations (or singletons).
-          (1..@max_tokens_to_combine).to_a.reverse.map do |nn|
-            single_tokens.permutation(nn).each do |perm|
-              unless @permutations_scanned.key?(perm)
-                @permutations_scanned[perm] = 0
-                perms << perm
-              end
-            end
+          perms = (1..@max_tokens_to_combine).to_a.reverse.flat_map do |n|
+            scan_text_for_n_permutations_of(single_tokens, n)
          end
          scan_text_for(perms) do |s|
            s = s.downcase if @ignore_case
@@ -148,6 +162,15 @@ module TextRank
          end unless perms.empty?
        end
 
+        def scan_text_for_n_permutations_of(single_tokens, n)
+          single_tokens.permutation(n).map do |perm|
+            unless @permutations_scanned.key?(perm)
+              @permutations_scanned[perm] = 0
+              perm
+            end
+          end.compact
+        end
+
        # Because we're scanning the original text, we've lost all of the character filtering we did
        # prior to tokenization, but that's important because we need the original context to be more
        # choosy. Still, we need to know what delimiter goes between collapsed tokens (since it may
@@ -174,25 +197,30 @@ module TextRank
        # modifications to the original token list yet but just keep track of what we plan
        # to collapse/remove.
        def decide_what_to_collapse_and_what_to_remove
-          non_empty_ordered = @permutations_scanned.select do |k, v|
-            v > 0
-          end.sort_by do |k, v|
-            [-v, -k.size] # reverse order
-          end
-
          tokens_encountered = []
-          non_empty_ordered.each do |perm, perm_count|
+          permutations_to_consider_collapsing.each do |perm, perm_count|
            if perm.size > 1
-              singles_to_remove = perm - tokens_encountered
-              if !singles_to_remove.empty? || combination_significant?(perm, perm_count)
-                @to_collapse << perm if perm.size > 1
-                @to_remove |= singles_to_remove
-              end
+              decide_to_collapse_or_remove(perm, perm_count, singles_to_remove: perm - tokens_encountered)
            end
            tokens_encountered += perm
          end
        end
 
+        def permutations_to_consider_collapsing
+          @permutations_scanned.select do |_k, v|
+            v.positive?
+          end.sort_by do |k, v|
+            [-v, -k.size] # reverse order
+          end
+        end
+
+        def decide_to_collapse_or_remove(perm, perm_count, singles_to_remove:)
+          if !singles_to_remove.empty? || combination_significant?(perm, perm_count)
+            @to_collapse << perm if perm.size > 1
+            @to_remove |= singles_to_remove
+          end
+        end
+
        # Even if we encounter a potential collapsed key which occurs less often than the single tokens that make it up,
        # we still want to add the collapsed key if it shows up "enough" times.
        def combination_significant?(perm, perm_count)
data/lib/text_rank/rank_filter/normalize_probability.rb
@@ -44,8 +44,9 @@ module TextRank
       # @return [Hash<String, Float>]
       def filter!(ranks, **_)
         return if ranks.empty?
+
         total = ranks.values.reduce(:+)
-        Hash[ranks.map { |k, v| [k, v / total] }]
+        ranks.transform_values { |v| v / total }
       end
 
     end
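The Hash[...] construction gives way here (and in NormalizeUnitVector just below) to Hash#transform_values, core Ruby since 2.4, which returns a new hash without building an intermediate array of pairs:

    { 'town' => 2.0, 'empty' => 6.0 }.transform_values { |v| v / 8.0 }
    # => {"town"=>0.25, "empty"=>0.75}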
data/lib/text_rank/rank_filter/normalize_unit_vector.rb
@@ -45,8 +45,9 @@ module TextRank
       # @return [Hash<String, Float>]
       def filter!(ranks, **_)
         return if ranks.empty?
+
         total = Math.sqrt(ranks.values.map { |v| v * v }.reduce(:+))
-        Hash[ranks.map { |k, v| [k, v / total] }]
+        ranks.transform_values { |v| v / total }
       end
     end
   end
data/lib/text_rank/token_filter/part_of_speech.rb
@@ -1,5 +1,4 @@
 require 'engtagger'
-require 'set'
 
 module TextRank
   module TokenFilter