text_rank 1.1.0 → 1.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 731bcb5680a6397568803361e08d552a66023ed4
4
- data.tar.gz: b3e66aa06ef115be83509a8702fe112837983f21
3
+ metadata.gz: f76e0559c4da8e9461b2e89139746e52bcc1dd56
4
+ data.tar.gz: 81404e9523ff19eaec3839859f8a83db9690274e
5
5
  SHA512:
6
- metadata.gz: 63f179e7780d659130bb16c6e7366324ecadf5ad4e6affe6497bcac3dd1a32b2cb87793badb6239ba1b453974b46e23d7379a8ae424922e47c8d0b2f53b0a267
7
- data.tar.gz: a09e4c0f7604d2a3c870efe9f8cfb87f59de2b02a2748d7fbb2807e37acf0e491722a75fb53fe227e43d1092bd9cdca6306767702e3cdf06969c4146bc2595ba
6
+ metadata.gz: fd62d702fea2f2ba86fb84bb701201abbf0a5ae647734b87506b40d83f172f95661ebfc1c4d5eb702db2ad7284cd3a4ca5d90db809fa8841e6ee3b141dc8498d
7
+ data.tar.gz: 2568eae71807a231d34a6db0fd6549a22cd4b7e32991048b01c57918b4875facebd63aea665080c487cc492292bc76ede9a9ee82d851d8abd1f72c66a903cd81
@@ -5,6 +5,19 @@ module TextRank
5
5
  # token keywords into a combined keyword when those keywords are adjacent
6
6
  # to each other in the original text.
7
7
  #
8
+ # It tries to do this in as intelligent a manner as possible, keeping the single
9
+ # tokens that comprise a combination when one or more of the single tokens occur
10
+ # more often than the combination.
11
+ #
12
+ # This filter operates on the original (non-filtered) text in order to more
13
+ # intelligently determine true text adjacency versus token adjacency (e.g.
14
+ # two tokens can be adjacent even though they appeared in the original text
15
+ # on separate lines with punctuation in between). However, because it operates
16
+ # on the original text we may fail to find some combinations due to the
17
+ # keyword tokens not exactly matching the original text any more (e.g. if
18
+ # ASCII folding has occurred). The goal is to err on the side of caution:
19
+ # it is better to not suggest a combination than to suggest a bad combination.
20
+ #
8
21
  # = Example
9
22
  #
10
23
  # CollapseAdjacent.new(ranks_to_collapse: 6, max_tokens_to_combine: 2).filter!(
@@ -34,17 +47,24 @@ module TextRank
34
47
  # "peace" => 0.2905352582752693,
35
48
  # "inhabitants" => 0.12715120116732137,
36
49
  # "cares" => 0.0697383057947685,
37
- #
50
+ # "town siege" => 0.2365184450186848,
51
+ # "cities blessings" => 0.21272821337880285,
52
+ # "arts florish" => 0.146247479840506,
53
+ # "devoured envy" => 0.1424776818760168,
54
+ # "anxieties plagues" => 0.12821144722639122,
55
+ # "peace" => 0.07976303576999531,
56
+ # "inhabitants" => 0.03490786580297893,
57
+ # "cares" => 0.019145831086624026,
58
+ # }
38
59
  ##
39
60
  class CollapseAdjacent
40
61
 
41
- # @param ranks_to_collapse [Fixnum] the top N ranks in which to look for collapsable keywords
42
- # @param max_tokens_to_combine [Fixnum] the maximum number of tokens to collapse into a combined keyword
43
- # @param ignore_case [true, false] whether to ignore case when finding adjacent keywords in original text
44
- def initialize(ranks_to_collapse: 10, max_tokens_to_combine: 2, ignore_case: true, **_)
45
- @ranks_to_collapse = ranks_to_collapse
46
- @max_tokens_to_combine = max_tokens_to_combine
47
- @ignore_case = !!ignore_case
62
+ # @option options [Fixnum] ranks_to_collapse the top N ranks in which to look for collapsable keywords
63
+ # @option options [Fixnum] max_tokens_to_combine the maximum number of tokens to collapse into a combined keyword
64
+ # @option options [true, false] ignore_case whether to ignore case when finding adjacent keywords in original text
65
+ # @option options [String] delimiter an optional delimiter between adjacent keywords in original text
66
+ def initialize(**options)
67
+ @options = options
48
68
  end
49
69
 
50
70
  # Perform the filter on the ranks
@@ -52,28 +72,144 @@ module TextRank
52
72
  # @param original_text [String] the original text (pre-tokenization) from which to find collapsable keywords
53
73
  # @return [Hash<String, Float>]
54
74
  def filter!(ranks, original_text:, **_)
55
- collapsed = {}
56
- loop do
57
- permutation = collapse_one(ranks.keys.first(@ranks_to_collapse - collapsed.size), original_text) or break
58
- collapsed[permutation.join(' ')] = ranks.values_at(*permutation).max
59
- permutation.each { |token| ranks.delete(token) }
60
- end
61
- collapsed.merge!(ranks)
62
- Hash[collapsed.sort_by { |_, v| -v }]
75
+ TokenCollapser.new(tokens: ranks, text: original_text, **@options).collapse
63
76
  end
64
77
 
65
78
  private
66
79
 
67
- def collapse_one(tokens, original_text)
68
- (2..@max_tokens_to_combine).to_a.reverse_each do |tokens_to_combine|
69
- tokens.permutation(tokens_to_combine) do |permutation|
70
- re_options = 0
71
- re_options |= Regexp::IGNORECASE if @ignore_case
72
- re = Regexp.new("\\b#{permutation.join(" +")}\\b", re_options)
73
- return permutation if original_text =~ re
80
+ class TokenCollapser
81
+
82
+ def initialize(tokens:, text:, ranks_to_collapse: 10, max_tokens_to_combine: 2, ignore_case: true, delimiter: ' ', **_)
83
+ @tokens = tokens
84
+ @text = text
85
+
86
+ @ranks_to_collapse = ranks_to_collapse
87
+ @max_tokens_to_combine = max_tokens_to_combine
88
+ @ignore_case = !!ignore_case
89
+ @delimiter = delimiter.to_s == '' ? ' ' : delimiter
90
+
91
+ @to_collapse = Set.new # Track the permutations we plan to collapse
92
+ @to_remove = Set.new # Track the single tokens we plan to remove (due to being collapsed)
93
+ @permutations_scanned = {} # Track how many occurrences of each permutation we found in the original text
94
+ @combination_significance_threshold = 0.3 # The percent of occurrences of a combo of tokens to the occurrences of single tokens required to force collapsing
95
+ end
96
+
97
+ def delimiter_re
98
+ @delimiter_re ||= /#{@delimiter}+/
99
+ end
100
+
101
+ def collapse
102
+ # We make multiple passes at collapsing because after the first pass we may have
103
+ # replaced two or more singletons with a collapsed token, bumping up one or more
104
+ # single tokens from below the cut to above it. So we'll continue searching
105
+ # until all of the top N final keywords (single or collapsed) have been
106
+ # considered.
107
+ loop do
108
+ single_tokens_to_consider = @tokens.keys.first(@ranks_to_collapse + @to_remove.size - @to_collapse.size) - @to_remove.to_a
109
+ scan_text_for_all_permutations_of(single_tokens_to_consider) or break
110
+ decide_what_to_collapse_and_what_to_remove
111
+ end
112
+
113
+ # We now know what to collapse and what to remove, so we can start safely
114
+ # modifying the tokens hash
115
+ @to_collapse.each do |perm|
116
+ values = @tokens.values_at(*perm)
117
+ @tokens[perm.join(@delimiter)] = values.reduce(:+) / values.size
118
+ end
119
+ @tokens.reject! do |k, _|
120
+ @to_remove.include?(k)
121
+ end
122
+
123
+ # Because we've made changes to the tokens hash, we need to re-normalize so that
124
+ # the sum of all token ranks is still 1.
125
+ normalize(@tokens)
126
+ end
127
+
128
+ # We need to be efficient about how we search for the large number of possible collapsed keywords.
129
+ # Doing them one at a time is very expensive and performs at least 20 times slower in my tests.
130
+ # And since we do multiple passes we need to be careful about not searching for the same combo
131
+ # more than once. So for every combo (and the single tokens themselves) we've searched for we
132
+ # keep track of the number of times we've found them.
133
+ #
134
+ # Even for single tokens this may be zero due to some modification from the original text before
135
+ # tokenization (e.g. ASCII folding). That's okay. We're just making the best effort we can
136
+ # to find what we can.
137
+ def scan_text_for_all_permutations_of(single_tokens)
138
+ perms = []
139
+ # NOTE that by reversing the order we craft the regex to prefer larger combinations over
140
+ # smaller combinations (or singletons).
141
+ (1..@max_tokens_to_combine).to_a.reverse.map do |nn|
142
+ single_tokens.permutation(nn).each do |perm|
143
+ unless @permutations_scanned.key?(perm)
144
+ @permutations_scanned[perm] = 0
145
+ perms << perm
146
+ end
147
+ end
148
+ end
149
+ scan_text_for(perms) do |s|
150
+ s = s.downcase if @ignore_case
151
+ @permutations_scanned[s.split(delimiter_re)] += 1
152
+ end unless perms.empty?
153
+ end
154
+
155
+ # Because we're scanning the original text, we've lost all of the character filtering we did
156
+ # prior to tokenization, but that's important because we need the original context to be more
157
+ # choosy. Still, we need to know what delimiter goes between collapsed tokens (since it may
158
+ # not always be a space). Likewise, we can't always assume the Lowercase filter has been
159
+ # used, so we allow for customization with the :ignore_case & :delimiter options.
160
+ def scan_text_for(all)
161
+ flags = 0
162
+ flags |= Regexp::IGNORECASE if @ignore_case
163
+ searches = all.map do |a|
164
+ a.is_a?(Array) ? a.join(delimiter_re.to_s) : a
165
+ end
166
+ re = Regexp.new("\\b(#{searches.join('|')})\\b", flags)
167
+
168
+ any_found = false
169
+ @text.scan(re) do |s, _|
170
+ yield s
171
+ any_found = true
172
+ end
173
+ any_found
174
+ end
175
+
176
+ # Once we have the number of occurrences for every permutation (including singletons)
177
+ # we can make choices about what to collapse and what to remove. We won't make any
178
+ # modifications to the original token list yet but just keep track of what we plan
179
+ # to collapse/remove.
180
+ def decide_what_to_collapse_and_what_to_remove
181
+ non_empty_ordered = @permutations_scanned.select do |k, v|
182
+ v > 0
183
+ end.sort_by do |k, v|
184
+ [-v, -k.size] # reverse order
185
+ end
186
+
187
+ tokens_encountered = []
188
+ non_empty_ordered.each do |perm, perm_count|
189
+ if perm.size > 1
190
+ singles_to_remove = perm - tokens_encountered
191
+ if !singles_to_remove.empty? || combination_significant?(perm, perm_count)
192
+ @to_collapse << perm if perm.size > 1
193
+ @to_remove |= singles_to_remove
194
+ end
195
+ end
196
+ tokens_encountered += perm
74
197
  end
75
198
  end
76
- nil
199
+
200
+ # Even if we encounter a potential collapsed key which occurs less often than the single tokens that make it up,
201
+ # we still want to add the collapsed key if it shows up "enough" times.
202
+ def combination_significant?(perm, perm_count)
203
+ total_single_count = perm.reduce(0) { |s, t| s + @permutations_scanned[[t]] } / perm.size.to_f
204
+ total_single_count.zero? || (perm_count / total_single_count) > @combination_significance_threshold
205
+ end
206
+
207
+ # Scale all of the token ranks so they add up to 1.
208
+ def normalize(tokens)
209
+ total = tokens.reduce(0.0) { |s, (_, v)| s + v }
210
+ Hash[tokens.map { |k, v| [k, v / total] }.sort_by { |_, v| -v }]
211
+ end
212
+
77
213
  end
78
214
 
79
215
  end
@@ -1,3 +1,3 @@
1
1
  module TextRank
2
- VERSION = '1.1.0'
2
+ VERSION = '1.1.1'
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: text_rank
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.1.0
4
+ version: 1.1.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - David McCullars
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2016-05-10 00:00:00.000000000 Z
11
+ date: 2016-05-12 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler