RubyGems - text_rank - Versions diffs - 1.2.3 → 1.3.0 - Mend

text_rank 1.2.3 → 1.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (43)

checksums.yaml +4 -4
data/.codeclimate.yml +1 -1
data/.gitignore +4 -0
data/.rubocop.yml +7 -0
data/.ruby-version +1 -1
data/.travis.yml +1 -0
data/Rakefile +5 -0
data/bin/console +3 -3
data/ext/text_rank/extconf.rb +3 -0
data/ext/text_rank/page_rank_sparse_native.c +300 -0
data/ext/text_rank/page_rank_sparse_native.h +93 -0
data/ext/text_rank/text_rank.c +5 -0
data/lib/page_rank/base.rb +12 -9
data/lib/page_rank/dense.rb +3 -2
data/lib/page_rank/sparse.rb +6 -7
data/lib/page_rank/sparse_native.rb +21 -0
data/lib/page_rank.rb +7 -4
data/lib/text_rank/char_filter/ascii_folding.rb +5 -1
data/lib/text_rank/char_filter/strip_possessive.rb +2 -2
data/lib/text_rank/char_filter/undo_contractions.rb +1 -137
data/lib/text_rank/char_filter/undo_contractions.yml +135 -0
data/lib/text_rank/char_filter.rb +1 -1
data/lib/text_rank/fingerprint.rb +10 -18
data/lib/text_rank/fingerprint_overlap.rb +55 -0
data/lib/text_rank/graph_strategy/coocurrence.rb +15 -6
data/lib/text_rank/keyword_extractor.rb +32 -25
data/lib/text_rank/rank_filter/collapse_adjacent.rb +48 -25
data/lib/text_rank/rank_filter/normalize_probability.rb +2 -1
data/lib/text_rank/rank_filter/normalize_unit_vector.rb +2 -1
data/lib/text_rank/token_filter/part_of_speech.rb +0 -1
data/lib/text_rank/token_filter/stopwords.rb +1 -321
data/lib/text_rank/token_filter/stopwords.yml +317 -0
data/lib/text_rank/tokenizer/money.rb +11 -6
data/lib/text_rank/tokenizer/number.rb +4 -3
data/lib/text_rank/tokenizer/punctuation.rb +4 -1
data/lib/text_rank/tokenizer/url.rb +3 -0
data/lib/text_rank/tokenizer/whitespace.rb +4 -1
data/lib/text_rank/tokenizer/word.rb +5 -2
data/lib/text_rank/tokenizer.rb +1 -1
data/lib/text_rank/version.rb +3 -1
data/lib/text_rank.rb +14 -9
data/text_rank.gemspec +4 -1
metadata +48 -12

data/lib/text_rank/rank_filter/collapse_adjacent.rb CHANGED Viewed

@@ -77,6 +77,7 @@ module TextRank
       class TokenCollapser
+        # rubocop:disable Metrics/ParameterLists
         def initialize(tokens:, text:, ranks_to_collapse: 10, max_tokens_to_combine: 2, ignore_case: true, delimiter: ' ', **_)
           @tokens = tokens
           @text = text
@@ -91,6 +92,7 @@ module TextRank
           @permutations_scanned = Hash.new(0.0) # Track how many occurrences of each permutation we found in the original text
           @combination_significance_threshold = 0.3 # The percent of occurrences of a combo of tokens to the occurrences of single tokens required to force collapsing
         end
+        # rubocop:enable Metrics/ParameterLists
         # :nodoc:
         def delimiter_re
@@ -104,23 +106,36 @@ module TextRank
           # single tokens from below the cut to above it.  So we'll continue searching
           # until all of the top N final keywords (single or collapsed) have been
           # considered.
-          loop do
-            regexp_safe_tokens = @tokens.keys.select { |s| Regexp.escape(s) == s }
-            single_tokens_to_consider = regexp_safe_tokens.first(@ranks_to_collapse + @to_remove.size - @to_collapse.size) - @to_remove.to_a
-            scan_text_for_all_permutations_of(single_tokens_to_consider) or break
-            decide_what_to_collapse_and_what_to_remove
+          while collapse_attempt
+            # keep trying
           end
           # We now know what to collapse and what to remove, so we can start safely
           # modifying the tokens hash
+          apply_collapse
+        end
+        # :nodoc:
+        def collapse_attempt
+          regexp_safe_tokens = @tokens.keys.select { |s| Regexp.escape(s) == s }
+          single_tokens_to_consider = regexp_safe_tokens.first(@ranks_to_collapse + @to_remove.size - @to_collapse.size) - @to_remove.to_a
+          scan_text_for_all_permutations_of(single_tokens_to_consider) or return false
+          decide_what_to_collapse_and_what_to_remove
+          true
+        end
+        # :nodoc:
+        def apply_collapse
           @to_collapse.each do |perm|
             values = @tokens.values_at(*perm).compact
             # This might be empty if somehow the scanned permutation doesn't
             # exactly match one of the tokens (e.g. ASCII-folding gone awry).
             # The goal is to do the best we can, so if we can't find it, ignore.
             next if values.empty?
             @tokens[perm.join(@delimiter)] = values.reduce(:+) / values.size
           end
           @tokens.reject! do |k, _|
             @to_remove.include?(k)
           end || @tokens
@@ -136,16 +151,10 @@ module TextRank
         # tokenization (e.g. ASCII folding).  That's okay.  We're just making the best effort we can
         # to find what we can.
         def scan_text_for_all_permutations_of(single_tokens)
-          perms = []
           # NOTE that by reversing the order we craft the regex to prefer larger combinations over
           # smaller combinations (or singletons).
-          (1..@max_tokens_to_combine).to_a.reverse.map do |nn|
-            single_tokens.permutation(nn).each do |perm|
-              unless @permutations_scanned.key?(perm)
-                @permutations_scanned[perm] = 0
-                perms << perm
-              end
-            end
+          perms = (1..@max_tokens_to_combine).to_a.reverse.flat_map do |n|
+            scan_text_for_n_permutations_of(single_tokens, n)
           end
           scan_text_for(perms) do |s|
             s = s.downcase if @ignore_case
@@ -153,6 +162,15 @@ module TextRank
           end unless perms.empty?
         end
+        def scan_text_for_n_permutations_of(single_tokens, n)
+          single_tokens.permutation(n).map do |perm|
+            unless @permutations_scanned.key?(perm)
+              @permutations_scanned[perm] = 0
+              perm
+            end
+          end.compact
+        end
         # Because we're scanning the original text, we've lost all of the character filtering we did
         # prior to tokenization, but that's important because we need the original context to be more
         # choosy.  Still, we need to know what delimiter goes between collapsed tokens (since it may
@@ -179,25 +197,30 @@ module TextRank
         # modifications to the original token list yet but just keep track of what we plan
         # to collapse/remove.
         def decide_what_to_collapse_and_what_to_remove
-          non_empty_ordered = @permutations_scanned.select do |k, v|
-            v > 0
-          end.sort_by do |k, v|
-            [-v, -k.size] # reverse order
-          end
           tokens_encountered = []
-          non_empty_ordered.each do |perm, perm_count|
+          permutations_to_consider_collapsing.each do |perm, perm_count|
             if perm.size > 1
-              singles_to_remove = perm - tokens_encountered
-              if !singles_to_remove.empty? || combination_significant?(perm, perm_count)
-                @to_collapse << perm if perm.size > 1
-                @to_remove |= singles_to_remove
-              end
+              decide_to_collapse_or_remove(perm, perm_count, singles_to_remove: perm - tokens_encountered)
             end
             tokens_encountered += perm
           end
         end
+        def permutations_to_consider_collapsing
+          @permutations_scanned.select do |_k, v|
+            v.positive?
+          end.sort_by do |k, v|
+            [-v, -k.size] # reverse order
+          end
+        end
+        def decide_to_collapse_or_remove(perm, perm_count, singles_to_remove:)
+          if !singles_to_remove.empty? || combination_significant?(perm, perm_count)
+            @to_collapse << perm if perm.size > 1
+            @to_remove |= singles_to_remove
+          end
+        end
         # Even if we encounter a potential collapsed key which occurs less often than the single tokens that make it up,
         # we still want to add the collapsed key if it shows up "enough" times.
         def combination_significant?(perm, perm_count)

data/lib/text_rank/rank_filter/normalize_probability.rb CHANGED Viewed

@@ -44,8 +44,9 @@ module TextRank
       # @return [Hash<String, Float>]
       def filter!(ranks, **_)
         return if ranks.empty?
         total = ranks.values.reduce(:+)
-        Hash[ranks.map { |k, v| [k, v / total] }]
+        ranks.transform_values { |v| v / total }
       end
     end

data/lib/text_rank/rank_filter/normalize_unit_vector.rb CHANGED Viewed

@@ -45,8 +45,9 @@ module TextRank
       # @return [Hash<String, Float>]
       def filter!(ranks, **_)
         return if ranks.empty?
         total = Math.sqrt(ranks.values.map { |v| v * v }.reduce(:+))
-        Hash[ranks.map { |k, v| [k, v / total] }]
+        ranks.transform_values { |v| v / total }
       end
     end

data/lib/text_rank/token_filter/part_of_speech.rb CHANGED Viewed

@@ -1,5 +1,4 @@
 require 'engtagger'
-require 'set'
 module TextRank
   module TokenFilter

data/lib/text_rank/token_filter/stopwords.rb CHANGED Viewed

@@ -1,5 +1,3 @@
-require 'set'
 module TextRank
   module TokenFilter
     ##
@@ -15,325 +13,7 @@ module TextRank
     class Stopwords
       # Default English stop-word list.
-      STOP_WORDS = Set.new(%w[
-        a
-        about
-        above
-        across
-        after
-        afterwards
-        again
-        against
-        all
-        almost
-        alone
-        along
-        already
-        also
-        although
-        always
-        am
-        among
-        amongst
-        amoungst
-        amount
-        an
-        and
-        another
-        any
-        anyhow
-        anyone
-        anything
-        anyway
-        anywhere
-        are
-        around
-        as
-        at
-        back
-        be
-        became
-        because
-        become
-        becomes
-        becoming
-        been
-        before
-        beforehand
-        behind
-        being
-        below
-        beside
-        besides
-        between
-        beyond
-        bill
-        both
-        bottom
-        but
-        by
-        call
-        can
-        cannot
-        cant
-        co
-        con
-        could
-        couldnt
-        cry
-        de
-        describe
-        detail
-        do
-        done
-        down
-        due
-        during
-        each
-        eg
-        eight
-        either
-        eleven
-        else
-        elsewhere
-        empty
-        enough
-        etc
-        even
-        ever
-        every
-        everyone
-        everything
-        everywhere
-        except
-        few
-        fifteen
-        fify
-        fill
-        find
-        fire
-        first
-        five
-        for
-        former
-        formerly
-        forty
-        found
-        four
-        from
-        front
-        full
-        further
-        get
-        give
-        go
-        had
-        has
-        hasnt
-        have
-        he
-        hence
-        her
-        here
-        hereafter
-        hereby
-        herein
-        hereupon
-        hers
-        herself
-        him
-        himself
-        his
-        how
-        however
-        hundred
-        ie
-        if
-        in
-        inc
-        indeed
-        interest
-        into
-        is
-        it
-        its
-        itself
-        keep
-        last
-        latter
-        latterly
-        least
-        less
-        ltd
-        made
-        many
-        may
-        me
-        meanwhile
-        might
-        mill
-        mine
-        more
-        moreover
-        most
-        mostly
-        move
-        much
-        must
-        my
-        myself
-        name
-        namely
-        neither
-        never
-        nevertheless
-        next
-        nine
-        no
-        nobody
-        none
-        noone
-        nor
-        not
-        nothing
-        now
-        nowhere
-        of
-        off
-        often
-        on
-        once
-        one
-        only
-        onto
-        or
-        other
-        others
-        otherwise
-        our
-        ours
-        ourselves
-        out
-        over
-        own
-        part
-        per
-        perhaps
-        please
-        put
-        rather
-        re
-        same
-        see
-        seem
-        seemed
-        seeming
-        seems
-        serious
-        several
-        she
-        should
-        show
-        side
-        since
-        sincere
-        six
-        sixty
-        so
-        some
-        somehow
-        someone
-        something
-        sometime
-        sometimes
-        somewhere
-        still
-        such
-        system
-        take
-        ten
-        than
-        that
-        the
-        their
-        them
-        themselves
-        then
-        thence
-        there
-        thereafter
-        thereby
-        therefore
-        therein
-        thereupon
-        these
-        they
-        thickv
-        thin
-        third
-        this
-        those
-        though
-        three
-        through
-        throughout
-        thru
-        thus
-        to
-        together
-        too
-        top
-        toward
-        towards
-        twelve
-        twenty
-        two
-        un
-        under
-        until
-        up
-        upon
-        us
-        very
-        via
-        was
-        we
-        well
-        were
-        what
-        whatever
-        when
-        whence
-        whenever
-        where
-        whereafter
-        whereas
-        whereby
-        wherein
-        whereupon
-        wherever
-        whether
-        which
-        while
-        whither
-        who
-        whoever
-        whole
-        whom
-        whose
-        why
-        will
-        with
-        within
-        without
-        would
-        yet
-        you
-        your
-        yours
-        yourself
-        yourselves
-      ])
+      STOP_WORDS = Set.new(YAML.load_file(File.expand_path('stopwords.yml', __dir__)))
       # Perform the filter
       # @param tokens [Array<String>]