text_rank 1.1.1 → 1.1.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: f76e0559c4da8e9461b2e89139746e52bcc1dd56
-  data.tar.gz: 81404e9523ff19eaec3839859f8a83db9690274e
+  metadata.gz: 190ea53d10c7ae49f55f0206c8c7346cea3ba4af
+  data.tar.gz: aac4c6be16b91047508af053ca48ba4fa8594f43
 SHA512:
-  metadata.gz: fd62d702fea2f2ba86fb84bb701201abbf0a5ae647734b87506b40d83f172f95661ebfc1c4d5eb702db2ad7284cd3a4ca5d90db809fa8841e6ee3b141dc8498d
-  data.tar.gz: 2568eae71807a231d34a6db0fd6549a22cd4b7e32991048b01c57918b4875facebd63aea665080c487cc492292bc76ede9a9ee82d851d8abd1f72c66a903cd81
+  metadata.gz: 86c6007b9397e126fcadc57d73f7f6c09def32507ffd2be401f8dc2ca389bafbf9f3384e41453deec7149be5f7862a361ea59d3c362915863571c7e0e81799e8
+  data.tar.gz: 2e678435c0079ab85518f2b4b380dd46744ab5e6d386b6ed82dcbdd9d93943bd4b5e2c54d1ac63077eda6ac2f1f13ccbbb8523bdd50bdd0991fdc11d04c61b49
@@ -33,8 +33,8 @@ module PageRank
     end

     # Adds a directed (and optionally weighted) edge to the graph
-    # @param source [Object] The source node
-    # @param dest [Object] The destination node
+    # @param _source [Object] The source node
+    # @param _dest [Object] The destination node
     # @return [nil]
     def add(_source, _dest, **_options)
       raise NotImplementedError
@@ -11,7 +11,9 @@ module TextRank
     ##
     class AsciiFolding

+      # Non-ASCII characters to replace
       NON_ASCII_CHARS = 'ÀÁÂÃÄÅàáâãäåĀāĂ㥹ÇçĆćĈĉĊċČčÐðĎďĐđÈÉÊËèéêëĒēĔĕĖėĘęĚěĜĝĞğĠġĢģĤĥĦħÌÍÎÏìíîïĨĩĪīĬĭĮįİıĴĵĶķĸĹĺĻļĽľĿŀŁłÑñŃńŅņŇňʼnŊŋÒÓÔÕÖØòóôõöøŌōŎŏŐőŔŕŖŗŘřŚśŜŝŞşŠšſŢţŤťŦŧÙÚÛÜùúûüŨũŪūŬŭŮůŰűŲųŴŵÝýÿŶŷŸŹźŻżŽž'
+      # "Equivalent" ASCII characters
       EQUIVALENT_ASCII_CHARS = 'AAAAAAaaaaaaAaAaAaCcCcCcCcCcDdDdDdEEEEeeeeEeEeEeEeEeGgGgGgGgHhHhIIIIiiiiIiIiIiIiIiJjKkkLlLlLlLlLlNnNnNnNnnNnOOOOOOooooooOoOoOoRrRrRrSsSsSsSssTtTtTtUUUUuuuuUuUuUuUuUuUuWwYyyYyYZzZzZz'

       # Perform the filter
@@ -10,6 +10,7 @@ module TextRank
     ##
     class StripEmail

+      # Simple regex to match most emails
       EMAIL_REGEX = /\b[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,}\b/i

       # Perform the filter
@@ -10,6 +10,7 @@ module TextRank
     ##
     class UndoContractions

+      # List of English contractions to undo
       CONTRACTIONS = {
         "ain't" => "am not",
         "amn't" => "am not",
@@ -14,7 +14,7 @@ module TextRank
     def self.basic(**options)
       new(**{
         char_filters: [:AsciiFolding, :Lowercase],
-        tokenizer: :Whitespace,
+        tokenizers: [:Word],
         token_filters: [:Stopwords, :MinLength],
         graph_strategy: :Coocurrence,
       }.merge(options))
@@ -26,27 +26,27 @@ module TextRank
     def self.advanced(**options)
       new(**{
         char_filters: [:AsciiFolding, :Lowercase, :StripHtml, :StripEmail, :UndoContractions, :StripPossessive],
-        tokenizer: :WordsAndPunctuation,
+        tokenizers: [:Url, :Money, :Number, :Word, :Punctuation],
         token_filters: [:PartOfSpeech, :Stopwords, :MinLength],
         graph_strategy: :Coocurrence,
-        rank_filters: [:CollapseAdjacent],
+        rank_filters: [:CollapseAdjacent, :NormalizeUnitVector, :SortByValue],
       }.merge(options))
     end

     # @option (see PageRank.new)
     # @option options [Array<Class, Symbol, #filter!>] :char_filters A list of filters to be applied prior to tokenization
-    # @option options [Class, Symbol, #tokenize] :tokenizer A class or tokenizer instance to perform tokenization
+    # @option options [Array<Symbol, Regexp, String>] :tokenizers A list of tokenizer regular expressions to perform tokenization
     # @option options [Array<Class, Symbol, #filter!>] :token_filters A list of filters to be applied to each token after tokenization
     # @option options [Class, Symbol, #build_graph] :graph_strategy A class or strategy instance for producing a graph from tokens
     # @option options [Array<Class, Symbol, #filter!>] :rank_filters A list of filters to be applied to the keyword ranks after keyword extraction
     def initialize(**options)
       @page_rank_options = {
-        strategy: options[:strategy] || :sparse,
+        strategy: options[:strategy] || :dense,
         damping: options[:damping],
         tolerance: options[:tolerance],
       }
       @char_filters = options[:char_filters] || []
-      @tokenizer = options[:tokenizer] || Tokenizer::Whitespace
+      @tokenizers = options[:tokenizers] || [Tokenizer::Word]
       @token_filters = options[:token_filters] || []
       @rank_filters = options[:rank_filters] || []
       @graph_strategy = options[:graph_strategy] || GraphStrategy::Coocurrence
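
The preset changes above capture the release's main API shift: the single tokenizer: option becomes a tokenizers: array of regular expressions. A minimal construction sketch, assuming only the option names visible in this diff:

    extractor = TextRank::KeywordExtractor.new(
      char_filters:  [:AsciiFolding, :Lowercase],
      tokenizers:    [:Url, :Money, :Number, :Word], # symbols resolve to Tokenizer constants
      token_filters: [:Stopwords, :MinLength],
    )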
@@ -61,11 +61,13 @@ module TextRank
       nil
     end

-    # Sets the tokenizer for producing tokens from filtered text
-    # @param tokenizer [Class, Symbol, #tokenize] Tokenizer
-    # @return [Class, Symbol, #tokenize]
-    def tokenizer=(tokenizer)
-      @tokenizer = tokenizer
+    # Add a tokenizer regular expression for producing tokens from filtered text
+    # @param tokenizer [Symbol, Regexp, String] Tokenizer regular expression
+    # @param (see #add_into)
+    # @return [nil]
+    def add_tokenizer(tokenizer, **options)
+      add_into(@tokenizers, tokenizer, **options)
+      nil
     end

     # Sets the graph strategy for producing a graph from tokens
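
With the tokenizer= writer gone, additional tokenizers are appended after construction via add_tokenizer. A hedged sketch with a hypothetical custom regex; note the single capture group, which Tokenizer.tokenize (further below) relies on:

    # Hypothetical: keep hashtags as single tokens
    extractor.add_tokenizer(/(\#[a-z0-9_]+)/i)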
@@ -98,7 +100,7 @@ module TextRank
     # @return [Array<String>] tokens
     def tokenize(text)
       filtered_text = apply_char_filters(text)
-      tokens = classify(@tokenizer, context: Tokenizer).tokenize(filtered_text)
+      tokens = Tokenizer.tokenize(filtered_text, *tokenizer_regular_expressions)
       apply_token_filters(tokens)
     end

@@ -121,6 +123,17 @@ module TextRank
       end
     end

+    def tokenizer_regular_expressions
+      @tokenizers.map do |t|
+        case t
+        when Symbol
+          Tokenizer.const_get(t)
+        else
+          t
+        end
+      end
+    end
+
     def apply_token_filters(tokens)
       @token_filters.reduce(tokens) do |t, f|
         classify(f, context: TokenFilter).filter!(t) || t
@@ -12,7 +12,10 @@ module TextRank
   ##
   module RankFilter

-    autoload :CollapseAdjacent, 'text_rank/rank_filter/collapse_adjacent'
+    autoload :CollapseAdjacent, 'text_rank/rank_filter/collapse_adjacent'
+    autoload :NormalizeProbability, 'text_rank/rank_filter/normalize_probability'
+    autoload :NormalizeUnitVector, 'text_rank/rank_filter/normalize_unit_vector'
+    autoload :SortByValue, 'text_rank/rank_filter/sort_by_value'

   end
 end
@@ -62,7 +62,7 @@ module TextRank
       # @option options [Fixnum] ranks_to_collapse the top N ranks in which to look for collapsable keywords
       # @option options [Fixnum] max_tokens_to_combine the maximum number of tokens to collapse into a combined keyword
       # @option options [true, false] ignore_case whether to ignore case when finding adjacent keywords in original text
-      # @options options [String] delimiter an optional delimiter between adjacent keywords in original text
+      # @option options [String] delimiter an optional delimiter between adjacent keywords in original text
      def initialize(**options)
        @options = options
      end
@@ -75,8 +75,6 @@ module TextRank
         TokenCollapser.new(tokens: ranks, text: original_text, **@options).collapse
       end

-      private
-
       class TokenCollapser

         def initialize(tokens:, text:, ranks_to_collapse: 10, max_tokens_to_combine: 2, ignore_case: true, delimiter: ' ', **_)
@@ -90,14 +88,16 @@

           @to_collapse = Set.new # Track the permutations we plan to collapse
           @to_remove = Set.new # Track the single tokens we plan to remove (due to being collapsed)
-          @permutations_scanned = {} # Track how many occurrences of each permutation we found in the original text
+          @permutations_scanned = Hash.new(0.0) # Track how many occurrences of each permutation we found in the original text
           @combination_significance_threshold = 0.3 # The percent of occurrences of a combo of tokens to the occurrences of single tokens required to force collapsing
         end

+        # :nodoc:
         def delimiter_re
           @delimiter_re ||= /#{@delimiter}+/
         end

+        # :nodoc:
         def collapse
           # We make multiple passes at collapsing because after the first pass we may have
           # replaced two or more singletons with a collapsed token, bumping up one or more
@@ -118,11 +118,7 @@
           end
           @tokens.reject! do |k, _|
             @to_remove.include?(k)
-          end
-
-          # Because we've made changes to the tokens hash, we need to re-normalize so that
-          # the sum of all token ranks is still 1.
-          normalize(@tokens)
+          end || @tokens
         end

         # We need to be efficient about how we search for the large number of possible collapsed keywords.
@@ -204,14 +200,10 @@
           total_single_count.zero? || (perm_count / total_single_count) > @combination_significance_threshold
         end

-        # Scale all of the token ranks so they add up to 1.
-        def normalize(tokens)
-          total = tokens.reduce(0.0) { |s, (_, v)| s + v }
-          Hash[tokens.map { |k, v| [k, v / total] }.sort_by { |_, v| -v }]
-        end
-
       end

+      private_constant :TokenCollapser
+
     end
   end
 end
@@ -0,0 +1,53 @@
+module TextRank
+  module RankFilter
+    ##
+    # A rank filter which normalizes the ranked keywords so that the sum of the
+    # rank values is 1.0 (a "probability" normalization).
+    #
+    # = Example
+    #
+    #   NormalizeProbability.new.filter!(
+    #     {
+    #       "town" => 0.6818754334834477,
+    #       "cities" => 0.6055017128817066,
+    #       "siege" => 0.5411519524982207,
+    #       "arts" => 0.4907977453782612,
+    #       "envy" => 0.4692709808107252,
+    #       "blessings" => 0.4442147897516214,
+    #       "plagues" => 0.3972420789430091,
+    #       "florish" => 0.2746092797528525,
+    #       "devoured" => 0.26867321734332237,
+    #       "anxieties" => 0.2367731719604189,
+    #       "peace" => 0.1905352582752693,
+    #       "inhabitants" => 0.02715120116732137,
+    #     }
+    #   )
+    #   => {
+    #     "town" => 0.1473434248897056,
+    #     "cities" => 0.13084016782478722,
+    #     "siege" => 0.11693511476062682,
+    #     "arts" => 0.10605429845557579,
+    #     "envy" => 0.10140267579486278,
+    #     "blessings" => 0.09598839508602595,
+    #     "plagues" => 0.08583827125543537,
+    #     "florish" => 0.0593390959673909,
+    #     "devoured" => 0.058056398684529435,
+    #     "anxieties" => 0.051163259981992296,
+    #     "peace" => 0.041171915188530236,
+    #     "inhabitants" => 0.005866982110537665,
+    #   }
+    ##
+    class NormalizeProbability
+
+      # Perform the filter on the ranks
+      # @param ranks [Hash<String, Float>] the results of the PageRank algorithm
+      # @return [Hash<String, Float>]
+      def filter!(ranks, **_)
+        return if ranks.empty?
+        total = ranks.values.reduce(:+)
+        Hash[ranks.map { |k, v| [k, v / total] }]
+      end
+
+    end
+  end
+end
@@ -0,0 +1,54 @@
+module TextRank
+  module RankFilter
+    ##
+    # A rank filter which normalizes the ranked keywords so that the sum of the
+    # squares of the rank values is 1.0 (and thus the keyword rankings in an
+    # N-vector space is a unit vector).
+    #
+    # = Example
+    #
+    #   NormalizeUnitVector.new.filter!(
+    #     {
+    #       "town" => 0.6818754334834477,
+    #       "cities" => 0.6055017128817066,
+    #       "siege" => 0.5411519524982207,
+    #       "arts" => 0.4907977453782612,
+    #       "envy" => 0.4692709808107252,
+    #       "blessings" => 0.4442147897516214,
+    #       "plagues" => 0.3972420789430091,
+    #       "florish" => 0.2746092797528525,
+    #       "devoured" => 0.26867321734332237,
+    #       "anxieties" => 0.2367731719604189,
+    #       "peace" => 0.1905352582752693,
+    #       "inhabitants" => 0.02715120116732137,
+    #     }
+    #   )
+    #   => {
+    #     "town" => 0.4616807998499129,
+    #     "cities" => 0.40997006401243896,
+    #     "siege" => 0.3664004508761722,
+    #     "arts" => 0.3323068767754191,
+    #     "envy" => 0.317731642948694,
+    #     "blessings" => 0.30076672272820315,
+    #     "plagues" => 0.2689626751964553,
+    #     "florish" => 0.18593107435301526,
+    #     "devoured" => 0.1819119149778339,
+    #     "anxieties" => 0.16031319218415677,
+    #     "peace" => 0.12900665740478157,
+    #     "inhabitants" => 0.01838339916101275,
+    #   }
+    ##
+    class NormalizeUnitVector
+
+      # Perform the filter on the ranks
+      # @param ranks [Hash<String, Float>] the results of the PageRank algorithm
+      # @return [Hash<String, Float>]
+      def filter!(ranks, **_)
+        return if ranks.empty?
+        total = Math.sqrt(ranks.values.map { |v| v * v }.reduce(:+))
+        Hash[ranks.map { |k, v| [k, v / total] }]
+      end
+
+    end
+  end
+end
@@ -0,0 +1,22 @@
+module TextRank
+  module RankFilter
+    ##
+    # A rank filter which sorts the results by value
+    ##
+    class SortByValue
+
+      # @param descending [boolean] whether to sort in descending order
+      def initialize(descending: true)
+        @descending = !!descending
+      end
+
+      # Perform the filter on the ranks
+      # @param ranks [Hash<String, Float>] the results of the PageRank algorithm
+      # @return [Hash<String, Float>]
+      def filter!(ranks, **_)
+        Hash[ranks.sort_by { |_, v| @descending ? -v : v }]
+      end
+
+    end
+  end
+end
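
The three new rank filters compose naturally, since each filter! returns a new Hash. A hedged sketch of chaining two of them by hand (values abbreviated):

    ranks = { "war" => 0.62, "peace" => 0.19 }
    ranks = TextRank::RankFilter::NormalizeProbability.new.filter!(ranks) # divide by sum 0.81
    TextRank::RankFilter::SortByValue.new.filter!(ranks)
    # => { "war" => 0.765..., "peace" => 0.234... }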
@@ -8,12 +8,33 @@ module TextRank
   # help inform its decision on which tokens to keep and which to drop. An example
   # of this is the part of speech token filter which uses punctuation tokens to
   # help guess the part of speech of each non-punctuation token.
+  #
+  # When tokenizing a piece of text, the Tokenizer will combine one or more
+  # regular expressions (in the order given) to scan the text for matches. As such
+  # you need only tell the tokenizer which tokens you want; everything else will
+  # be ignored.
   ##
   module Tokenizer

-    autoload :Regex, 'text_rank/tokenizer/regex'
-    autoload :Whitespace, 'text_rank/tokenizer/whitespace'
-    autoload :WordsAndPunctuation, 'text_rank/tokenizer/words_and_punctuation'
+    autoload :Money, 'text_rank/tokenizer/money'
+    autoload :Number, 'text_rank/tokenizer/number'
+    autoload :Punctuation, 'text_rank/tokenizer/punctuation'
+    autoload :Url, 'text_rank/tokenizer/url'
+    autoload :Whitespace, 'text_rank/tokenizer/whitespace'
+    autoload :Word, 'text_rank/tokenizer/word'
+
+    # Performs tokenization of a piece of text by one or more tokenizer regular expressions.
+    # @param text [String]
+    # @param regular_expressions [Array<Regexp|String>]
+    # @return [Array<String>]
+    def self.tokenize(text, *regular_expressions)
+      tokens = []
+      text.scan(Regexp.new(regular_expressions.flatten.join('|'))) do |matches|
+        m = matches.compact.first
+        tokens << m if m && m.size > 0
+      end
+      tokens
+    end

   end
 end
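
Tokenizer.tokenize joins the given regexes into a single alternation and keeps the first non-nil capture of each scan match, so regex order sets precedence. A sketch of the expected behavior under my reading of that scan logic (sample text illustrative; output is not taken from the gem's docs):

    TextRank::Tokenizer.tokenize(
      "tickets cost $45.50 at https://example.com today",
      TextRank::Tokenizer::Url,
      TextRank::Tokenizer::Money,
      TextRank::Tokenizer::Word
    )
    # => ["tickets", "cost", "$45.50", "at", "https://example.com", "today"]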
@@ -0,0 +1,76 @@
+#encoding: UTF-8
+module TextRank
+  module Tokenizer
+
+    CURRENCY_SYMBOLS = '[' + [
+      "\u00a4", # Generic Currency Symbol
+      "\u0024", # Dollar Sign
+      "\u00a2", # Cent Sign
+      "\u00a3", # Pound Sterling
+      "\u00a5", # Yen Symbol
+      "\u20a3", # Franc Sign
+      "\u20a4", # Lira Symbol
+      "\u20a7", # Peseta Sign
+      "\u20ac", # Euro Symbol
+      "\u20B9", # Rupee
+      "\u20a9", # Won Sign
+      "\u20b4", # Hryvnia Sign
+      "\u20af", # Drachma Sign
+      "\u20ae", # Tugrik Sign
+      "\u20b0", # German Penny Sign
+      "\u20b2", # Guarani Sign
+      "\u20b1", # Peso Sign
+      "\u20b3", # Austral Sign
+      "\u20b5", # Cedi Sign
+      "\u20ad", # Kip Sign
+      "\u20aa", # New Sheqel Sign
+      "\u20ab", # Dong Sign
+      "\u0025", # Percent
+      "\u2030", # Per Million
+    ].join + ']'
+    private_constant :CURRENCY_SYMBOLS # Do not expose this to avoid confusion
+
+    ##
+    # A tokenizer regex that preserves money or formatted numbers as a single token. This
+    # currently supports 24 different currency symbols:
+    #
+    # * ¤
+    # * $
+    # * ¢
+    # * £
+    # * ¥
+    # * ₣
+    # * ₤
+    # * ₧
+    # * €
+    # * ₹
+    # * ₩
+    # * ₴
+    # * ₯
+    # * ₮
+    # * ₰
+    # * ₲
+    # * ₱
+    # * ₳
+    # * ₵
+    # * ₭
+    # * ₪
+    # * ₫
+    # * %
+    # * ‰
+    #
+    # It also supports two alternative formats for negatives as well as optional three digit comma
+    # separation and optional decimals.
+    ##
+    Money = %r{
+      (
+        #{CURRENCY_SYMBOLS} \-? #{Number}    # $-45,231.21
+        |
+        \-? #{CURRENCY_SYMBOLS} #{Number}    # -$45,231.21
+        |
+        \( #{CURRENCY_SYMBOLS} #{Number} \)  # ($45,231.21)
+      )
+    }x
+
+  end
+end
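
A hedged spot-check of two of the three alternatives (sample strings illustrative):

    TextRank::Tokenizer::Money.match("subtotal -$45,231.21")[0] # => "-$45,231.21"
    TextRank::Tokenizer::Money.match("balance ($45,231.21)")[0] # => "($45,231.21)"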
@@ -0,0 +1,31 @@
+#encoding: UTF-8
+module TextRank
+  module Tokenizer
+
+    ##
+    # A tokenizer regex that preserves (optionally formatted) numbers as a single token.
+    ##
+    Number = %r{
+      (
+        [1-9]\d{0,2}    # 453
+        (?:,\d{3})*     # 453,231,162
+        (?:\.\d{0,2})?  # 453,231,162.17
+
+        |
+
+        [1-9]\d*        # 453231162
+        (?:\.\d{0,2})?  # 453231162.17
+
+        |
+
+        0               # 0
+        (?:\.\d{0,2})?  # 0.17
+
+        |
+
+        (?:\.\d{1,2})   # .17
+      )
+    }x
+
+  end
+end
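
For instance, the comma-grouping branch should behave like this (a hedged example, not from the gem's docs):

    "revenue 1,234,567.89 total"[TextRank::Tokenizer::Number] # => "1,234,567.89"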
@@ -0,0 +1,11 @@
+module TextRank
+  module Tokenizer
+    ##
+    # A tokenizer regex that preserves single punctuation symbols as a token. Use
+    # this if one or more of your TokenFilter classes need punctuation in order to
+    # make decisions.
+    ##
+    Punctuation = %r{([\p{Punct}])}
+
+  end
+end
@@ -0,0 +1,21 @@
+module TextRank
+  module Tokenizer
+    ##
+    # A tokenizer regex that preserves entire URLs as a token (rather than splitting them up)
+    ##
+    Url = %r{
+      (
+        (?:[\w-]+://?|www[.])
+        [^\s()<>]+
+        (?:
+          \([\w\d]+\)
+          |
+          (?:[^[:punct:]\s]
+          |
+          /)
+        )
+      )
+    }xi
+
+  end
+end
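
A hedged example, including the parenthesized-path branch (sample URL illustrative):

    "docs at www.example.com/guide(v2), enjoy"[TextRank::Tokenizer::Url]
    # => "www.example.com/guide(v2)"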
@@ -1,19 +1,11 @@
 module TextRank
   module Tokenizer
     ##
-    # Tokenizer to split on any whitespace
-    #
-    # = Example
-    #
-    #   Whitespace.new.tokenize("i should:like to know:which is worse.")
-    #   => ["i", "should:like", "to", "know:which", "is", "worse."]
+    # A tokenizer regex that preserves single whitespace characters as a token. Use
+    # this if one or more of your TokenFilter classes need whitespace in order to
+    # make decisions.
     ##
-    class Whitespace < Regex
+    Whitespace = %r{\s}

-      def initialize
-        super(/\s+/)
-      end
-
-    end
   end
 end
@@ -0,0 +1,14 @@
+module TextRank
+  module Tokenizer
+    ##
+    # A tokenizer regex that preserves a non-space, non-punctuation "word". It does
+    # allow hyphens and numerals, but the first character must be an A-Z character.
+    ##
+    Word = %r{
+      (
+        [a-z][a-z0-9-]*
+      )
+    }xi
+
+  end
+end
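
Scanning with Word alone, given its single capture group (a quick hedged check):

    "well-known B2B tactics".scan(TextRank::Tokenizer::Word).flatten
    # => ["well-known", "B2B", "tactics"]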
@@ -1,3 +1,4 @@
 module TextRank
-  VERSION = '1.1.1'
+  # Current gem version
+  VERSION = '1.1.5'
 end
@@ -10,7 +10,7 @@ Gem::Specification.new do |spec|
   spec.email = ['david.mccullars@gmail.com']

   spec.summary = %q{Implementation of TextRank solution to ranked keyword extraction}
-  spec.description = %q{See https://www.google.com/url?sa=t&rct=j&q=&esrc=s&source=web&cd=1&ved=0ahUKEwjK9tfHxcvMAhVOzGMKHdaQBeEQFggdMAA&url=https%3A%2F%2Fweb.eecs.umich.edu%2F~mihalcea%2Fpapers%2Fmihalcea.emnlp04.pdf&usg=AFQjCNHL5SGlxLy4qmEg1yexaKGZK_Q7IA}
+  spec.description = %q{Implementation of TextRank solution to ranked keyword extraction. See https://web.eecs.umich.edu/~mihalcea/papers/mihalcea.emnlp04.pdf}
   spec.homepage = 'https://github.com/david-mccullars/text_rank'
   spec.license = 'MIT'

metadata CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: text_rank
 version: !ruby/object:Gem::Version
-  version: 1.1.1
+  version: 1.1.5
 platform: ruby
 authors:
 - David McCullars
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2016-05-12 00:00:00.000000000 Z
+date: 2016-05-23 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: bundler
@@ -108,7 +108,8 @@ dependencies:
     - - "~>"
       - !ruby/object:Gem::Version
         version: '1.0'
-description: See https://www.google.com/url?sa=t&rct=j&q=&esrc=s&source=web&cd=1&ved=0ahUKEwjK9tfHxcvMAhVOzGMKHdaQBeEQFggdMAA&url=https%3A%2F%2Fweb.eecs.umich.edu%2F~mihalcea%2Fpapers%2Fmihalcea.emnlp04.pdf&usg=AFQjCNHL5SGlxLy4qmEg1yexaKGZK_Q7IA
+description: Implementation of TextRank solution to ranked keyword extraction. See
+  https://web.eecs.umich.edu/~mihalcea/papers/mihalcea.emnlp04.pdf
 email:
 - david.mccullars@gmail.com
 executables: []
@@ -145,14 +146,20 @@ files:
 - lib/text_rank/keyword_extractor.rb
 - lib/text_rank/rank_filter.rb
 - lib/text_rank/rank_filter/collapse_adjacent.rb
+- lib/text_rank/rank_filter/normalize_probability.rb
+- lib/text_rank/rank_filter/normalize_unit_vector.rb
+- lib/text_rank/rank_filter/sort_by_value.rb
 - lib/text_rank/token_filter.rb
 - lib/text_rank/token_filter/min_length.rb
 - lib/text_rank/token_filter/part_of_speech.rb
 - lib/text_rank/token_filter/stopwords.rb
 - lib/text_rank/tokenizer.rb
-- lib/text_rank/tokenizer/regex.rb
+- lib/text_rank/tokenizer/money.rb
+- lib/text_rank/tokenizer/number.rb
+- lib/text_rank/tokenizer/punctuation.rb
+- lib/text_rank/tokenizer/url.rb
 - lib/text_rank/tokenizer/whitespace.rb
-- lib/text_rank/tokenizer/words_and_punctuation.rb
+- lib/text_rank/tokenizer/word.rb
 - lib/text_rank/version.rb
 - text_rank.gemspec
 homepage: https://github.com/david-mccullars/text_rank
@@ -180,4 +187,3 @@
 specification_version: 4
 summary: Implementation of TextRank solution to ranked keyword extraction
 test_files: []
-has_rdoc:
@@ -1,26 +0,0 @@
-module TextRank
-  module Tokenizer
-    ##
-    # Base tokenizer that tokenizes on any regular expression
-    #
-    # = Example
-    #
-    #   Regex.new(/:/).tokenize("i should:like to know:which is worse.")
-    #   => ["i should", "like to know", "which is worse"]
-    ##
-    class Regex
-
-      # @param regex [Regexp] to use for string splitting
-      def initialize(regex)
-        @regex = regex
-      end
-
-      # @param text [String] string to tokenize
-      # return [Array<String>] non-empty tokens
-      def tokenize(text)
-        text.split(@regex) - ['']
-      end
-
-    end
-  end
-end
@@ -1,26 +0,0 @@
-module TextRank
-  module Tokenizer
-    ##
-    # A tokenizer that preserves punctuation as their own tokens (which can be
-    # used, for example, by the [TokenFilter::PartOfSpeechBase] filter).
-    #
-    # = Example
-    #
-    #   WordsAndPunctuation.new.tokenize("i should:like to know:which is worse.")
-    #   => ["i", "should", ":", "like", "to", "know", ":", "which", "is", "worse", "."]
-    ##
-    class WordsAndPunctuation < Regex
-
-      def initialize
-        super(/
-          ([a-z][a-z0-9-]+)
-          |
-          ([\p{Punct}])
-          |
-          \s+
-        /xi)
-      end
-
-    end
-  end
-end