text_rank 1.1.0 → 1.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 731bcb5680a6397568803361e08d552a66023ed4
4
- data.tar.gz: b3e66aa06ef115be83509a8702fe112837983f21
3
+ metadata.gz: f76e0559c4da8e9461b2e89139746e52bcc1dd56
4
+ data.tar.gz: 81404e9523ff19eaec3839859f8a83db9690274e
5
5
  SHA512:
6
- metadata.gz: 63f179e7780d659130bb16c6e7366324ecadf5ad4e6affe6497bcac3dd1a32b2cb87793badb6239ba1b453974b46e23d7379a8ae424922e47c8d0b2f53b0a267
7
- data.tar.gz: a09e4c0f7604d2a3c870efe9f8cfb87f59de2b02a2748d7fbb2807e37acf0e491722a75fb53fe227e43d1092bd9cdca6306767702e3cdf06969c4146bc2595ba
6
+ metadata.gz: fd62d702fea2f2ba86fb84bb701201abbf0a5ae647734b87506b40d83f172f95661ebfc1c4d5eb702db2ad7284cd3a4ca5d90db809fa8841e6ee3b141dc8498d
7
+ data.tar.gz: 2568eae71807a231d34a6db0fd6549a22cd4b7e32991048b01c57918b4875facebd63aea665080c487cc492292bc76ede9a9ee82d851d8abd1f72c66a903cd81
@@ -5,6 +5,19 @@ module TextRank
5
5
  # token keywords into a combined keyword when those keywords are adjacent
6
6
  # to each other in the original text.
7
7
  #
8
+ # It tries to do this in as intelligent a manner as possible, keeping the single
9
+ # tokens that comprise a combination when one or more of the single tokens occur
10
+ # more often than the combination.
11
+ #
12
+ # This filter operates on the original (non-filtered) text in order to more
13
+ # intelligently determine true text adjacency versus token adjacency (e.g.
14
+ # two tokens can be adjacent even though they appeared in the original text
15
+ # on separate lines with punctuation in between). However, because it operates
16
+ # on the original text we may fail to find some combinations due to the
17
+ # keyword tokens not exactly matching the original text any more (e.g. if
18
+ # ASCII folding has occurred). The goal is to err on the side of caution:
19
+ # it is better to not suggest a combination than to suggest a bad combination.
20
+ #
8
21
  # = Example
9
22
  #
10
23
  # CollapseAdjacent.new(ranks_to_collapse: 6, max_tokens_to_combine: 2).filter!(
@@ -34,17 +47,24 @@ module TextRank
34
47
  # "peace" => 0.2905352582752693,
35
48
  # "inhabitants" => 0.12715120116732137,
36
49
  # "cares" => 0.0697383057947685,
37
- #
50
+ # "town siege" => 0.2365184450186848,
51
+ # "cities blessings" => 0.21272821337880285,
52
+ # "arts florish" => 0.146247479840506,
53
+ # "devoured envy" => 0.1424776818760168,
54
+ # "anxieties plagues" => 0.12821144722639122,
55
+ # "peace" => 0.07976303576999531,
56
+ # "inhabitants" => 0.03490786580297893,
57
+ # "cares" => 0.019145831086624026,
58
+ # }
38
59
  ##
39
60
  class CollapseAdjacent
40
61
 
41
- # @param ranks_to_collapse [Fixnum] the top N ranks in which to look for collapsable keywords
42
- # @param max_tokens_to_combine [Fixnum] the maximum number of tokens to collapse into a combined keyword
43
- # @param ignore_case [true, false] whether to ignore case when finding adjacent keywords in original text
44
- def initialize(ranks_to_collapse: 10, max_tokens_to_combine: 2, ignore_case: true, **_)
45
- @ranks_to_collapse = ranks_to_collapse
46
- @max_tokens_to_combine = max_tokens_to_combine
47
- @ignore_case = !!ignore_case
62
+ # @option options [Fixnum] ranks_to_collapse the top N ranks in which to look for collapsable keywords
63
+ # @option options [Fixnum] max_tokens_to_combine the maximum number of tokens to collapse into a combined keyword
64
+ # @option options [true, false] ignore_case whether to ignore case when finding adjacent keywords in original text
65
+ # @option options [String] delimiter an optional delimiter between adjacent keywords in original text
66
+ def initialize(**options)
67
+ @options = options
48
68
  end
49
69
 
50
70
  # Perform the filter on the ranks
@@ -52,28 +72,144 @@ module TextRank
52
72
  # @param original_text [String] the original text (pre-tokenization) from which to find collapsable keywords
53
73
  # @return [Hash<String, Float>]
54
74
  def filter!(ranks, original_text:, **_)
55
- collapsed = {}
56
- loop do
57
- permutation = collapse_one(ranks.keys.first(@ranks_to_collapse - collapsed.size), original_text) or break
58
- collapsed[permutation.join(' ')] = ranks.values_at(*permutation).max
59
- permutation.each { |token| ranks.delete(token) }
60
- end
61
- collapsed.merge!(ranks)
62
- Hash[collapsed.sort_by { |_, v| -v }]
75
+ TokenCollapser.new(tokens: ranks, text: original_text, **@options).collapse
63
76
  end
64
77
 
65
78
  private
66
79
 
67
- def collapse_one(tokens, original_text)
68
- (2..@max_tokens_to_combine).to_a.reverse_each do |tokens_to_combine|
69
- tokens.permutation(tokens_to_combine) do |permutation|
70
- re_options = 0
71
- re_options |= Regexp::IGNORECASE if @ignore_case
72
- re = Regexp.new("\\b#{permutation.join(" +")}\\b", re_options)
73
- return permutation if original_text =~ re
80
+ class TokenCollapser
81
+
82
+ def initialize(tokens:, text:, ranks_to_collapse: 10, max_tokens_to_combine: 2, ignore_case: true, delimiter: ' ', **_)
83
+ @tokens = tokens
84
+ @text = text
85
+
86
+ @ranks_to_collapse = ranks_to_collapse
87
+ @max_tokens_to_combine = max_tokens_to_combine
88
+ @ignore_case = !!ignore_case
89
+ @delimiter = delimiter.to_s == '' ? ' ' : delimiter
90
+
91
+ @to_collapse = Set.new # Track the permutations we plan to collapse
92
+ @to_remove = Set.new # Track the single tokens we plan to remove (due to being collapsed)
93
+ @permutations_scanned = {} # Track how many occurrences of each permutation we found in the original text
94
+ @combination_significance_threshold = 0.3 # The percent of occurrences of a combo of tokens to the occurrences of single tokens required to force collapsing
95
+ end
96
+
97
+ def delimiter_re
98
+ @delimiter_re ||= /#{@delimiter}+/
99
+ end
100
+
101
+ def collapse
102
+ # We make multiple passes at collapsing because after the first pass we may have
103
+ # replaced two or more singletons with a collapsed token, bumping up one or more
104
+ # single tokens from below the cut to above it. So we'll continue searching
105
+ # until all of the top N final keywords (single or collapsed) have been
106
+ # considered.
107
+ loop do
108
+ single_tokens_to_consider = @tokens.keys.first(@ranks_to_collapse + @to_remove.size - @to_collapse.size) - @to_remove.to_a
109
+ scan_text_for_all_permutations_of(single_tokens_to_consider) or break
110
+ decide_what_to_collapse_and_what_to_remove
111
+ end
112
+
113
+ # We now know what to collapse and what to remove, so we can start safely
114
+ # modifying the tokens hash
115
+ @to_collapse.each do |perm|
116
+ values = @tokens.values_at(*perm)
117
+ @tokens[perm.join(@delimiter)] = values.reduce(:+) / values.size
118
+ end
119
+ @tokens.reject! do |k, _|
120
+ @to_remove.include?(k)
121
+ end
122
+
123
+ # Because we've made changes to the tokens hash, we need to re-normalize so that
124
+ # the sum of all token ranks is still 1.
125
+ normalize(@tokens)
126
+ end
127
+
128
+ # We need to be efficient about how we search for the large number of possible collapsed keywords.
129
+ # Doing them one at a time is very expensive and performs at least 20 times slower in my tests.
130
+ # And since we do multiple passes we need to be careful about not searching for the same combo
131
+ # more than once. So for every combo (and the single tokens themselves) we've searched for we
132
+ # keep track of the number of times we've found them.
133
+ #
134
+ # Even for single tokens this may be zero due to some modification from the original text before
135
+ # tokenization (e.g. ASCII folding). That's okay. We're just making the best effort we can
136
+ # to find what we can.
137
+ def scan_text_for_all_permutations_of(single_tokens)
138
+ perms = []
139
+ # NOTE that by reversing the order we craft the regex to prefer larger combinations over
140
+ # smaller combinations (or singletons).
141
+ (1..@max_tokens_to_combine).to_a.reverse.map do |nn|
142
+ single_tokens.permutation(nn).each do |perm|
143
+ unless @permutations_scanned.key?(perm)
144
+ @permutations_scanned[perm] = 0
145
+ perms << perm
146
+ end
147
+ end
148
+ end
149
+ scan_text_for(perms) do |s|
150
+ s = s.downcase if @ignore_case
151
+ @permutations_scanned[s.split(delimiter_re)] += 1
152
+ end unless perms.empty?
153
+ end
154
+
155
+ # Because we're scanning the original text, we've lost all of the character filtering we did
156
+ # prior to tokenization, but that's important because we need the original context to be more
157
+ # choosy. Still, we need to know what delimiter goes between collapsed tokens (since it may
158
+ # not always be a space). Likewise, we can't always assume the Lowercase filter has been
159
+ # used, so we allow for customization with the :ignore_case & :delimiter options.
160
+ def scan_text_for(all)
161
+ flags = 0
162
+ flags |= Regexp::IGNORECASE if @ignore_case
163
+ searches = all.map do |a|
164
+ a.is_a?(Array) ? a.join(delimiter_re.to_s) : a
165
+ end
166
+ re = Regexp.new("\\b(#{searches.join('|')})\\b", flags)
167
+
168
+ any_found = false
169
+ @text.scan(re) do |s, _|
170
+ yield s
171
+ any_found = true
172
+ end
173
+ any_found
174
+ end
175
+
176
+ # Once we have the number of occurrences for every permutation (including singletons)
177
+ # we can make choices about what to collapse and what to remove. We won't make any
178
+ # modifications to the original token list yet but just keep track of what we plan
179
+ # to collapse/remove.
180
+ def decide_what_to_collapse_and_what_to_remove
181
+ non_empty_ordered = @permutations_scanned.select do |k, v|
182
+ v > 0
183
+ end.sort_by do |k, v|
184
+ [-v, -k.size] # reverse order
185
+ end
186
+
187
+ tokens_encountered = []
188
+ non_empty_ordered.each do |perm, perm_count|
189
+ if perm.size > 1
190
+ singles_to_remove = perm - tokens_encountered
191
+ if !singles_to_remove.empty? || combination_significant?(perm, perm_count)
192
+ @to_collapse << perm if perm.size > 1
193
+ @to_remove |= singles_to_remove
194
+ end
195
+ end
196
+ tokens_encountered += perm
74
197
  end
75
198
  end
76
- nil
199
+
200
+ # Even if we encounter a potential collapsed key which occurs less often than the single tokens that make it up,
201
+ # we still want to add the collapsed key if it shows up "enough" times.
202
+ def combination_significant?(perm, perm_count)
203
+ total_single_count = perm.reduce(0) { |s, t| s + @permutations_scanned[[t]] } / perm.size.to_f
204
+ total_single_count.zero? || (perm_count / total_single_count) > @combination_significance_threshold
205
+ end
206
+
207
+ # Scale all of the token ranks so they add up to 1.
208
+ def normalize(tokens)
209
+ total = tokens.reduce(0.0) { |s, (_, v)| s + v }
210
+ Hash[tokens.map { |k, v| [k, v / total] }.sort_by { |_, v| -v }]
211
+ end
212
+
77
213
  end
78
214
 
79
215
  end
@@ -1,3 +1,3 @@
1
1
  module TextRank
2
- VERSION = '1.1.0'
2
+ VERSION = '1.1.1'
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: text_rank
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.1.0
4
+ version: 1.1.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - David McCullars
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2016-05-10 00:00:00.000000000 Z
11
+ date: 2016-05-12 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler