text_rank 1.1.0 → 1.1.1

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA1:
- metadata.gz: 731bcb5680a6397568803361e08d552a66023ed4
- data.tar.gz: b3e66aa06ef115be83509a8702fe112837983f21
+ metadata.gz: f76e0559c4da8e9461b2e89139746e52bcc1dd56
+ data.tar.gz: 81404e9523ff19eaec3839859f8a83db9690274e
  SHA512:
- metadata.gz: 63f179e7780d659130bb16c6e7366324ecadf5ad4e6affe6497bcac3dd1a32b2cb87793badb6239ba1b453974b46e23d7379a8ae424922e47c8d0b2f53b0a267
- data.tar.gz: a09e4c0f7604d2a3c870efe9f8cfb87f59de2b02a2748d7fbb2807e37acf0e491722a75fb53fe227e43d1092bd9cdca6306767702e3cdf06969c4146bc2595ba
+ metadata.gz: fd62d702fea2f2ba86fb84bb701201abbf0a5ae647734b87506b40d83f172f95661ebfc1c4d5eb702db2ad7284cd3a4ca5d90db809fa8841e6ee3b141dc8498d
+ data.tar.gz: 2568eae71807a231d34a6db0fd6549a22cd4b7e32991048b01c57918b4875facebd63aea665080c487cc492292bc76ede9a9ee82d851d8abd1f72c66a903cd81
lib/text_rank/rank_filter/collapse_adjacent.rb CHANGED
@@ -5,6 +5,19 @@ module TextRank
  # token keywords into a combined keyword when those keywords are adjacent
  # to each other in the original text.
  #
+ # It tries to do this in as intelligent a manner as possible, keeping the single
+ # tokens that comprise a combination when one or more of the single tokens occur
+ # more often than the combination.
+ #
+ # This filter operates on the original (non-filtered) text in order to more
+ # intelligently determine true text adjacency versus token adjacency (e.g.
+ # two tokens can be adjacent even though they appeared in the original text
+ # on separate lines with punctuation in between). However, because it operates
+ # on the original text, we may fail to find some combinations if the keyword
+ # tokens no longer exactly match the original text (e.g. if ASCII folding has
+ # occurred). The goal is to err on the side of caution: it is better not to
+ # suggest a combination than to suggest a bad combination.
+ #
  # = Example
  #
  # CollapseAdjacent.new(ranks_to_collapse: 6, max_tokens_to_combine: 2).filter!(
@@ -34,17 +47,24 @@ module TextRank
  # "peace" => 0.2905352582752693,
  # "inhabitants" => 0.12715120116732137,
  # "cares" => 0.0697383057947685,
- #
+ # "town siege" => 0.2365184450186848,
+ # "cities blessings" => 0.21272821337880285,
+ # "arts florish" => 0.146247479840506,
+ # "devoured envy" => 0.1424776818760168,
+ # "anxieties plagues" => 0.12821144722639122,
+ # "peace" => 0.07976303576999531,
+ # "inhabitants" => 0.03490786580297893,
+ # "cares" => 0.019145831086624026,
+ # }
  ##
  class CollapseAdjacent

- # @param ranks_to_collapse [Fixnum] the top N ranks in which to look for collapsable keywords
- # @param max_tokens_to_combine [Fixnum] the maximum number of tokens to collapse into a combined keyword
- # @param ignore_case [true, false] whether to ignore case when finding adjacent keywords in original text
- def initialize(ranks_to_collapse: 10, max_tokens_to_combine: 2, ignore_case: true, **_)
- @ranks_to_collapse = ranks_to_collapse
- @max_tokens_to_combine = max_tokens_to_combine
- @ignore_case = !!ignore_case
+ # @option options [Fixnum] ranks_to_collapse the top N ranks in which to look for collapsable keywords
+ # @option options [Fixnum] max_tokens_to_combine the maximum number of tokens to collapse into a combined keyword
+ # @option options [true, false] ignore_case whether to ignore case when finding adjacent keywords in original text
+ # @option options [String] delimiter an optional delimiter between adjacent keywords in original text
+ def initialize(**options)
+ @options = options
  end

  # Perform the filter on the ranks
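
For reference, a minimal usage sketch of the new options-based constructor. It assumes the class is namespaced as TextRank::RankFilter::CollapseAdjacent (consistent with the rest of the gem); the ranks and text are invented for illustration:

    require 'text_rank'

    # Sketch only: the option names mirror the @option tags above.
    filter = TextRank::RankFilter::CollapseAdjacent.new(
      ranks_to_collapse: 6,       # consider only the top 6 keywords
      max_tokens_to_combine: 2,   # combine at most two adjacent tokens
      ignore_case: true,          # "Town" and "town" match alike
      delimiter: ' '              # adjacent tokens are space-separated in the text
    )

    # Invented ranks and text, purely for illustration:
    ranks = { 'town' => 0.30, 'siege' => 0.25, 'peace' => 0.20 }
    filter.filter!(ranks, original_text: 'The town siege ended in peace.')
    # => collapsed keyword hash, e.g. {"town siege"=>..., "peace"=>...},
    #    re-normalized so the ranks sum to 1
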
@@ -52,28 +72,144 @@ module TextRank
  # @param original_text [String] the original text (pre-tokenization) from which to find collapsable keywords
  # @return [Hash<String, Float>]
  def filter!(ranks, original_text:, **_)
- collapsed = {}
- loop do
- permutation = collapse_one(ranks.keys.first(@ranks_to_collapse - collapsed.size), original_text) or break
- collapsed[permutation.join(' ')] = ranks.values_at(*permutation).max
- permutation.each { |token| ranks.delete(token) }
- end
- collapsed.merge!(ranks)
- Hash[collapsed.sort_by { |_, v| -v }]
+ TokenCollapser.new(tokens: ranks, text: original_text, **@options).collapse
  end

  private

- def collapse_one(tokens, original_text)
- (2..@max_tokens_to_combine).to_a.reverse_each do |tokens_to_combine|
- tokens.permutation(tokens_to_combine) do |permutation|
- re_options = 0
- re_options |= Regexp::IGNORECASE if @ignore_case
- re = Regexp.new("\\b#{permutation.join(" +")}\\b", re_options)
- return permutation if original_text =~ re
+ class TokenCollapser
+
+ def initialize(tokens:, text:, ranks_to_collapse: 10, max_tokens_to_combine: 2, ignore_case: true, delimiter: ' ', **_)
+ @tokens = tokens
+ @text = text
+
+ @ranks_to_collapse = ranks_to_collapse
+ @max_tokens_to_combine = max_tokens_to_combine
+ @ignore_case = !!ignore_case
+ @delimiter = delimiter.to_s == '' ? ' ' : delimiter
+
+ @to_collapse = Set.new # Track the permutations we plan to collapse
+ @to_remove = Set.new # Track the single tokens we plan to remove (due to being collapsed)
+ @permutations_scanned = {} # Track how many occurrences of each permutation we found in the original text
+ @combination_significance_threshold = 0.3 # The ratio of occurrences of a combination to occurrences of its single tokens required to force collapsing
+ end
+
+ def delimiter_re
+ @delimiter_re ||= /#{@delimiter}+/
+ end
+
+ def collapse
+ # We make multiple passes at collapsing because after the first pass we may have
+ # replaced two or more singletons with a collapsed token, bumping one or more
+ # single tokens from below the cut to above it. So we'll continue searching
+ # until all of the top N final keywords (single or collapsed) have been
+ # considered.
+ loop do
+ single_tokens_to_consider = @tokens.keys.first(@ranks_to_collapse + @to_remove.size - @to_collapse.size) - @to_remove.to_a
+ scan_text_for_all_permutations_of(single_tokens_to_consider) or break
+ decide_what_to_collapse_and_what_to_remove
+ end
+
+ # We now know what to collapse and what to remove, so we can start safely
+ # modifying the tokens hash
+ @to_collapse.each do |perm|
+ values = @tokens.values_at(*perm)
+ @tokens[perm.join(@delimiter)] = values.reduce(:+) / values.size
+ end
+ @tokens.reject! do |k, _|
+ @to_remove.include?(k)
+ end
+
+ # Because we've made changes to the tokens hash, we need to re-normalize so that
+ # the sum of all token ranks is still 1.
+ normalize(@tokens)
+ end
+
+ # We need to be efficient about how we search for the large number of possible collapsed keywords.
+ # Doing them one at a time is very expensive; in my tests it performs at least 20 times slower.
+ # And since we make multiple passes, we need to be careful not to search for the same combination
+ # more than once. So for every combination (and the single tokens themselves) we've searched for,
+ # we keep track of the number of times we've found it.
+ #
+ # Even for single tokens this may be zero due to some modification of the original text before
+ # tokenization (e.g. ASCII folding). That's okay. We're just making the best effort we can
+ # to find what we can.
+ def scan_text_for_all_permutations_of(single_tokens)
+ perms = []
+ # NOTE that by reversing the order we craft the regex to prefer larger combinations over
+ # smaller combinations (or singletons).
+ (1..@max_tokens_to_combine).to_a.reverse.map do |nn|
+ single_tokens.permutation(nn).each do |perm|
+ unless @permutations_scanned.key?(perm)
+ @permutations_scanned[perm] = 0
+ perms << perm
+ end
+ end
+ end
+ scan_text_for(perms) do |s|
+ s = s.downcase if @ignore_case
+ @permutations_scanned[s.split(delimiter_re)] += 1
+ end unless perms.empty?
+ end
+
+ # Because we're scanning the original text, we've lost all of the character filtering we did
+ # prior to tokenization, but that's important because we need the original context to be more
+ # choosy. Still, we need to know what delimiter goes between collapsed tokens (since it may
+ # not always be a space). Likewise, we can't always assume the Lowercase filter has been
+ # used, so we allow for customization with the :ignore_case & :delimiter options.
+ def scan_text_for(all)
+ flags = 0
+ flags |= Regexp::IGNORECASE if @ignore_case
+ searches = all.map do |a|
+ a.is_a?(Array) ? a.join(delimiter_re.to_s) : a
+ end
+ re = Regexp.new("\\b(#{searches.join('|')})\\b", flags)
+
+ any_found = false
+ @text.scan(re) do |s, _|
+ yield s
+ any_found = true
+ end
+ any_found
+ end
+
+ # Once we have the number of occurrences for every permutation (including singletons),
+ # we can make choices about what to collapse and what to remove. We won't make any
+ # modifications to the original token list yet but just keep track of what we plan
+ # to collapse/remove.
+ def decide_what_to_collapse_and_what_to_remove
+ non_empty_ordered = @permutations_scanned.select do |k, v|
+ v > 0
+ end.sort_by do |k, v|
+ [-v, -k.size] # reverse order
+ end
+
+ tokens_encountered = []
+ non_empty_ordered.each do |perm, perm_count|
+ if perm.size > 1
+ singles_to_remove = perm - tokens_encountered
+ if !singles_to_remove.empty? || combination_significant?(perm, perm_count)
+ @to_collapse << perm if perm.size > 1
+ @to_remove |= singles_to_remove
+ end
+ end
+ tokens_encountered += perm
  end
  end
- nil
+
+ # Even if we encounter a potential collapsed key which occurs less often than the single tokens that make it up,
+ # we still want to add the collapsed key if it shows up "enough" times.
+ def combination_significant?(perm, perm_count)
+ total_single_count = perm.reduce(0) { |s, t| s + @permutations_scanned[[t]] } / perm.size.to_f
+ total_single_count.zero? || (perm_count / total_single_count) > @combination_significance_threshold
+ end
+
+ # Scale all of the token ranks so they add up to 1.
+ def normalize(tokens)
+ total = tokens.reduce(0.0) { |s, (_, v)| s + v }
+ Hash[tokens.map { |k, v| [k, v / total] }.sort_by { |_, v| -v }]
+ end
+
  end

  end
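
The scan_text_for method above is the performance-sensitive piece: it folds every pending permutation into one alternation regex so the text is scanned once per pass rather than once per phrase. A self-contained sketch of the same technique (scan_for_phrases and its arguments are hypothetical names, not the gem's API):

    # Count occurrences of multi-word phrases in one pass over the text.
    # Hypothetical helper illustrating the combined-regex approach above;
    # counts are keyed by the matched string, downcased.
    def scan_for_phrases(text, phrases, ignore_case: true, delimiter: / +/)
      flags = ignore_case ? Regexp::IGNORECASE : 0
      searches = phrases.map { |words| words.join(delimiter.to_s) }
      re = Regexp.new("\\b(#{searches.join('|')})\\b", flags)

      counts = Hash.new(0)
      text.scan(re) { |s, _| counts[s.downcase] += 1 }
      counts
    end

    text = 'The town siege ended. A town siege is grim; the town endured.'
    scan_for_phrases(text, [%w[town siege], %w[town]])
    # => {"town siege"=>2, "town"=>1}

Listing longer phrases first in the alternation mirrors the NOTE in scan_text_for_all_permutations_of: at each position the regex engine prefers the larger combination over its singletons.
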
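The ratio test in combination_significant? is easiest to see with concrete numbers; a hedged trace of the formula above, with invented counts:

    # Invented counts: the pair appeared 2 times; its single tokens appeared
    # 5 and 3 times on their own. Mirrors combination_significant? above.
    threshold  = 0.3
    perm_count = 2
    single_avg = (5 + 3) / 2.0              # => 4.0 average singleton hits

    (perm_count / single_avg) > threshold   # => true, since 2 / 4.0 = 0.5 > 0.3

So the pair is kept even though each of its single tokens occurs more often than the combination itself.
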
lib/text_rank/version.rb CHANGED
@@ -1,3 +1,3 @@
  module TextRank
- VERSION = '1.1.0'
+ VERSION = '1.1.1'
  end
metadata CHANGED
@@ -1,14 +1,14 @@
  --- !ruby/object:Gem::Specification
  name: text_rank
  version: !ruby/object:Gem::Version
- version: 1.1.0
+ version: 1.1.1
  platform: ruby
  authors:
  - David McCullars
  autorequire:
  bindir: exe
  cert_chain: []
- date: 2016-05-10 00:00:00.000000000 Z
+ date: 2016-05-12 00:00:00.000000000 Z
  dependencies:
  - !ruby/object:Gem::Dependency
  name: bundler