RubyGems - text_rank - Versions diffs - 1.2.3 → 1.2.4 - Mend

text_rank 1.2.3 → 1.2.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (32)

checksums.yaml +4 -4
data/.rubocop.yml +7 -0
data/bin/console +3 -3
data/lib/page_rank.rb +2 -0
data/lib/page_rank/base.rb +9 -8
data/lib/page_rank/dense.rb +2 -1
data/lib/page_rank/sparse.rb +6 -7
data/lib/text_rank.rb +11 -8
data/lib/text_rank/char_filter.rb +1 -1
data/lib/text_rank/char_filter/ascii_folding.rb +5 -1
data/lib/text_rank/char_filter/strip_possessive.rb +2 -2
data/lib/text_rank/char_filter/undo_contractions.rb +1 -137
data/lib/text_rank/char_filter/undo_contractions.yml +135 -0
data/lib/text_rank/fingerprint.rb +10 -18
data/lib/text_rank/fingerprint_overlap.rb +55 -0
data/lib/text_rank/graph_strategy/coocurrence.rb +15 -6
data/lib/text_rank/keyword_extractor.rb +19 -21
data/lib/text_rank/rank_filter/collapse_adjacent.rb +48 -25
data/lib/text_rank/rank_filter/normalize_probability.rb +2 -1
data/lib/text_rank/rank_filter/normalize_unit_vector.rb +2 -1
data/lib/text_rank/token_filter/part_of_speech.rb +0 -1
data/lib/text_rank/token_filter/stopwords.rb +1 -321
data/lib/text_rank/token_filter/stopwords.yml +317 -0
data/lib/text_rank/tokenizer.rb +1 -1
data/lib/text_rank/tokenizer/money.rb +11 -6
data/lib/text_rank/tokenizer/number.rb +4 -3
data/lib/text_rank/tokenizer/punctuation.rb +4 -1
data/lib/text_rank/tokenizer/url.rb +3 -0
data/lib/text_rank/tokenizer/whitespace.rb +4 -1
data/lib/text_rank/tokenizer/word.rb +5 -2
data/lib/text_rank/version.rb +3 -1
metadata +4 -1

data/lib/text_rank/token_filter/stopwords.yml ADDED

@@ -0,0 +1,317 @@
+- a
+- about
+- above
+- across
+- after
+- afterwards
+- again
+- against
+- all
+- almost
+- alone
+- along
+- already
+- also
+- although
+- always
+- am
+- among
+- amongst
+- amoungst
+- amount
+- an
+- and
+- another
+- any
+- anyhow
+- anyone
+- anything
+- anyway
+- anywhere
+- are
+- around
+- as
+- at
+- back
+- be
+- became
+- because
+- become
+- becomes
+- becoming
+- been
+- before
+- beforehand
+- behind
+- being
+- below
+- beside
+- besides
+- between
+- beyond
+- bill
+- both
+- bottom
+- but
+- by
+- call
+- can
+- cannot
+- cant
+- co
+- con
+- could
+- couldnt
+- cry
+- de
+- describe
+- detail
+- do
+- done
+- down
+- due
+- during
+- each
+- eg
+- eight
+- either
+- eleven
+- else
+- elsewhere
+- empty
+- enough
+- etc
+- even
+- ever
+- every
+- everyone
+- everything
+- everywhere
+- except
+- few
+- fifteen
+- fify
+- fill
+- find
+- fire
+- first
+- five
+- for
+- former
+- formerly
+- forty
+- found
+- four
+- from
+- front
+- full
+- further
+- get
+- give
+- go
+- had
+- has
+- hasnt
+- have
+- he
+- hence
+- her
+- here
+- hereafter
+- hereby
+- herein
+- hereupon
+- hers
+- herself
+- him
+- himself
+- his
+- how
+- however
+- hundred
+- ie
+- if
+- in
+- inc
+- indeed
+- interest
+- into
+- is
+- it
+- its
+- itself
+- keep
+- last
+- latter
+- latterly
+- least
+- less
+- ltd
+- made
+- many
+- may
+- me
+- meanwhile
+- might
+- mill
+- mine
+- more
+- moreover
+- most
+- mostly
+- move
+- much
+- must
+- my
+- myself
+- name
+- namely
+- neither
+- never
+- nevertheless
+- next
+- nine
+- no
+- nobody
+- none
+- noone
+- nor
+- not
+- nothing
+- now
+- nowhere
+- of
+- off
+- often
+- on
+- once
+- one
+- only
+- onto
+- or
+- other
+- others
+- otherwise
+- our
+- ours
+- ourselves
+- out
+- over
+- own
+- part
+- per
+- perhaps
+- please
+- put
+- rather
+- re
+- same
+- see
+- seem
+- seemed
+- seeming
+- seems
+- serious
+- several
+- she
+- should
+- show
+- side
+- since
+- sincere
+- six
+- sixty
+- so
+- some
+- somehow
+- someone
+- something
+- sometime
+- sometimes
+- somewhere
+- still
+- such
+- system
+- take
+- ten
+- than
+- that
+- the
+- their
+- them
+- themselves
+- then
+- thence
+- there
+- thereafter
+- thereby
+- therefore
+- therein
+- thereupon
+- these
+- they
+- thickv
+- thin
+- third
+- this
+- those
+- though
+- three
+- through
+- throughout
+- thru
+- thus
+- to
+- together
+- too
+- top
+- toward
+- towards
+- twelve
+- twenty
+- two
+- un
+- under
+- until
+- up
+- upon
+- us
+- very
+- via
+- was
+- we
+- well
+- were
+- what
+- whatever
+- when
+- whence
+- whenever
+- where
+- whereafter
+- whereas
+- whereby
+- wherein
+- whereupon
+- wherever
+- whether
+- which
+- while
+- whither
+- who
+- whoever
+- whole
+- whom
+- whose
+- why
+- will
+- with
+- within
+- without
+- would
+- yet
+- you
+- your
+- yours
+- yourself
+- yourselves

data/lib/text_rank/tokenizer.rb CHANGED

@@ -31,7 +31,7 @@ module TextRank
       tokens = []
       text.scan(Regexp.new(regular_expressions.flatten.join('|'))) do |matches|
         m = matches.compact.first
-        tokens << m if m && m.size > 0
+        tokens << m if m&.size&.positive?
       end
       tokens
     end

data/lib/text_rank/tokenizer/money.rb CHANGED

@@ -1,4 +1,3 @@
-#encoding: UTF-8
 module TextRank
   module Tokenizer
@@ -12,7 +11,7 @@ module TextRank
       "\u20a4", # Lira Symbol
       "\u20a7", # Peseta Sign
       "\u20ac", # Euro Symbol
-      "\u20B9", # Rupee
+      "\u20B9", # Rupee
       "\u20a9", # Won Sign
       "\u20b4", # Hryvnia Sign
       "\u20af", # Drachma Sign
@@ -34,6 +33,8 @@ module TextRank
     # A tokenizer regex that preserves money or formatted numbers as a single token. This
     # currently supports 24 different currency symbols:
     #
+    # rubocop:disable Style/AsciiComments
+    #
     # * ¤
     # * $
     # * ¢
@@ -58,19 +59,23 @@ module TextRank
     # * ₫
     # * %
     # * ‰
+    # rubocop:enable Style/AsciiComments
     #
     # It also supports two alternative formats for negatives as well as optional three digit comma
     # separation and optional decimals.
     ##
-    Money = %r{
+    # rubocop:disable Naming/ConstantName
+    Money = /
       (
-        #{CURRENCY_SYMBOLS} \-? #{Number}      # $-45,231.21
+        #{CURRENCY_SYMBOLS} -? #{Number}       # $-45,231.21
         |
-        \-? #{CURRENCY_SYMBOLS} #{Number}      # -$45,231.21
+        -? #{CURRENCY_SYMBOLS} #{Number}       # -$45,231.21
         |
         \( #{CURRENCY_SYMBOLS} #{Number} \)    # ($45,231.21)
       )
-    }x
+    /x
+    # rubocop:enable Naming/ConstantName
   end
 end

data/lib/text_rank/tokenizer/number.rb CHANGED

@@ -1,11 +1,11 @@
-#encoding: UTF-8
 module TextRank
   module Tokenizer
     ##
     # A tokenizer regex that preserves (optionally formatted) numbers as a single token.
     ##
-    Number = %r{
+    # rubocop:disable Naming/ConstantName
+    Number = /
       (
         [1-9]\d{3,}       # 453231162
         (?:\.\d+)?        # 453231162.17
@@ -25,7 +25,8 @@ module TextRank
         (?:\.\d+)         # .17
       )
-    }x
+    /x
+    # rubocop:enable Naming/ConstantName
   end
 end

data/lib/text_rank/tokenizer/punctuation.rb CHANGED

@@ -1,11 +1,14 @@
 module TextRank
   module Tokenizer
     ##
     # A tokenizer regex that preserves single punctuation symbols as a token. Use
     # this if one or more of your TokenFilter classes need punctuation in order to
     # make decisions.
     ##
-    Punctuation = %r{([\p{Punct}])}
+    # rubocop:disable Naming/ConstantName
+    Punctuation = /(\p{Punct})/
+    # rubocop:enable Naming/ConstantName
   end
 end

data/lib/text_rank/tokenizer/url.rb CHANGED

@@ -1,8 +1,10 @@
 module TextRank
   module Tokenizer
     ##
     # A tokenizer regex that preserves entire URL's as a token (rather than split them up)
     ##
+    # rubocop:disable Naming/ConstantName
     Url = %r{
       (
         (?:[\w-]+://?|www[.])
@@ -16,6 +18,7 @@ module TextRank
         )
       )
     }xi
+    # rubocop:enable Naming/ConstantName
   end
 end

data/lib/text_rank/tokenizer/whitespace.rb CHANGED

@@ -1,11 +1,14 @@
 module TextRank
   module Tokenizer
     ##
     # A tokenizer regex that preserves single whitespace characters as a token. Use
     # this if one or more of your TokenFilter classes need whitespace in order to
     # make decisions.
     ##
-    Whitespace = %r{\s}
+    # rubocop:disable Naming/ConstantName
+    Whitespace = /\s/
+    # rubocop:enable Naming/ConstantName
   end
 end