text_rank 1.1.1 → 1.1.5

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: f76e0559c4da8e9461b2e89139746e52bcc1dd56
-  data.tar.gz: 81404e9523ff19eaec3839859f8a83db9690274e
+  metadata.gz: 190ea53d10c7ae49f55f0206c8c7346cea3ba4af
+  data.tar.gz: aac4c6be16b91047508af053ca48ba4fa8594f43
 SHA512:
-  metadata.gz: fd62d702fea2f2ba86fb84bb701201abbf0a5ae647734b87506b40d83f172f95661ebfc1c4d5eb702db2ad7284cd3a4ca5d90db809fa8841e6ee3b141dc8498d
-  data.tar.gz: 2568eae71807a231d34a6db0fd6549a22cd4b7e32991048b01c57918b4875facebd63aea665080c487cc492292bc76ede9a9ee82d851d8abd1f72c66a903cd81
+  metadata.gz: 86c6007b9397e126fcadc57d73f7f6c09def32507ffd2be401f8dc2ca389bafbf9f3384e41453deec7149be5f7862a361ea59d3c362915863571c7e0e81799e8
+  data.tar.gz: 2e678435c0079ab85518f2b4b380dd46744ab5e6d386b6ed82dcbdd9d93943bd4b5e2c54d1ac63077eda6ac2f1f13ccbbb8523bdd50bdd0991fdc11d04c61b49
@@ -33,8 +33,8 @@ module PageRank
     end

     # Adds a directed (and optionally weighted) edge to the graph
-    # @param source [Object] The source node
-    # @param dest [Object] The destination node
+    # @param _source [Object] The source node
+    # @param _dest [Object] The destination node
     # @return [nil]
     def add(_source, _dest, **_options)
       raise NotImplementedError
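
Note: add is the seam where concrete strategies receive edges. A minimal driving sketch, assuming the gem's top-level PageRank.calculate helper and its block convention (both assumptions here; only the add(source, dest, **options) signature comes from this diff):

    require 'text_rank' # assumed to load the PageRank module as well

    # Hypothetical usage: build a tiny 3-node graph and rank it
    ranks = PageRank.calculate(strategy: :dense, damping: 0.85) do
      add('nodeA', 'nodeB', weight: 2.0) # directed, weighted edge
      add('nodeB', 'nodeC')
      add('nodeC', 'nodeA')
    end
    # => e.g. { "nodeA" => 0.38, "nodeB" => 0.34, "nodeC" => 0.28 }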
@@ -11,7 +11,9 @@ module TextRank
     ##
     class AsciiFolding

+      # Non-ASCII characters to replace
       NON_ASCII_CHARS = 'ÀÁÂÃÄÅàáâãäåĀāĂ㥹ÇçĆćĈĉĊċČčÐðĎďĐđÈÉÊËèéêëĒēĔĕĖėĘęĚěĜĝĞğĠġĢģĤĥĦħÌÍÎÏìíîïĨĩĪīĬĭĮįİıĴĵĶķĸĹĺĻļĽľĿŀŁłÑñŃńŅņŇňʼnŊŋÒÓÔÕÖØòóôõöøŌōŎŏŐőŔŕŖŗŘřŚśŜŝŞşŠšſŢţŤťŦŧÙÚÛÜùúûüŨũŪūŬŭŮůŰűŲųŴŵÝýÿŶŷŸŹźŻżŽž'
+      # "Equivalent" ASCII characters
       EQUIVALENT_ASCII_CHARS = 'AAAAAAaaaaaaAaAaAaCcCcCcCcCcDdDdDdEEEEeeeeEeEeEeEeEeGgGgGgGgHhHhIIIIiiiiIiIiIiIiIiJjKkkLlLlLlLlLlNnNnNnNnnNnOOOOOOooooooOoOoOoRrRrRrSsSsSsSssTtTtTtUUUUuuuuUuUuUuUuUuUuWwYyyYyYZzZzZz'

       # Perform the filter
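
Note: the two constants are positionally paired, character for character. A sketch of the intended folding, assuming the filter maps one string onto the other via something like String#tr (the implementation is not shown in this hunk):

    # Assumes NON_ASCII_CHARS.size == EQUIVALENT_ASCII_CHARS.size,
    # which a 1:1 tr mapping requires
    'Señor Gödel'.tr(NON_ASCII_CHARS, EQUIVALENT_ASCII_CHARS)
    # => "Senor Godel"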
@@ -10,6 +10,7 @@ module TextRank
     ##
     class StripEmail

+      # Simple regex to match most emails
       EMAIL_REGEX = /\b[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,}\b/i

       # Perform the filter
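
Note: the regex can be exercised on its own; how the filter class applies it is outside this hunk:

    regex = /\b[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,}\b/i
    'Reach me at jane.doe@example.com today.'.gsub(regex, '')
    # => "Reach me at  today."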
@@ -10,6 +10,7 @@ module TextRank
     ##
     class UndoContractions

+      # List of English contractions to undo
       CONTRACTIONS = {
         "ain't" => "am not",
         "amn't" => "am not",
@@ -14,7 +14,7 @@ module TextRank
     def self.basic(**options)
       new(**{
         char_filters: [:AsciiFolding, :Lowercase],
-        tokenizer: :Whitespace,
+        tokenizers: [:Word],
         token_filters: [:Stopwords, :MinLength],
         graph_strategy: :Coocurrence,
       }.merge(options))
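
Note: a usage sketch of the reworked option, assuming KeywordExtractor.basic and its extract method otherwise behave as in 1.1.1 (sample text; output values illustrative):

    extractor = TextRank::KeywordExtractor.basic(tokenizers: [:Word, :Number])
    extractor.extract('The siege of the town lasted 40 days and 40 nights.')
    # => e.g. { "siege" => 0.29, "town" => 0.27, "nights" => 0.23, ... }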
@@ -26,27 +26,27 @@ module TextRank
     def self.advanced(**options)
       new(**{
         char_filters: [:AsciiFolding, :Lowercase, :StripHtml, :StripEmail, :UndoContractions, :StripPossessive],
-        tokenizer: :WordsAndPunctuation,
+        tokenizers: [:Url, :Money, :Number, :Word, :Punctuation],
         token_filters: [:PartOfSpeech, :Stopwords, :MinLength],
         graph_strategy: :Coocurrence,
-        rank_filters: [:CollapseAdjacent],
+        rank_filters: [:CollapseAdjacent, :NormalizeUnitVector, :SortByValue],
       }.merge(options))
     end

     # @option (see PageRank.new)
     # @option options [Array<Class, Symbol, #filter!>] :char_filters A list of filters to be applied prior to tokenization
-    # @option options [Class, Symbol, #tokenize] :tokenizer A class or tokenizer instance to perform tokenization
+    # @option options [Array<Symbol, Regexp, String>] :tokenizers A list of tokenizer regular expressions to perform tokenization
     # @option options [Array<Class, Symbol, #filter!>] :token_filters A list of filters to be applied to each token after tokenization
     # @option options [Class, Symbol, #build_graph] :graph_strategy A class or strategy instance for producing a graph from tokens
     # @option options [Array<Class, Symbol, #filter!>] :rank_filters A list of filters to be applied to the keyword ranks after keyword extraction
     def initialize(**options)
       @page_rank_options = {
-        strategy: options[:strategy] || :sparse,
+        strategy: options[:strategy] || :dense,
         damping: options[:damping],
         tolerance: options[:tolerance],
       }
       @char_filters = options[:char_filters] || []
-      @tokenizer = options[:tokenizer] || Tokenizer::Whitespace
+      @tokenizers = options[:tokenizers] || [Tokenizer::Word]
       @token_filters = options[:token_filters] || []
       @rank_filters = options[:rank_filters] || []
       @graph_strategy = options[:graph_strategy] || GraphStrategy::Coocurrence
@@ -61,11 +61,13 @@ module TextRank
       nil
     end

-    # Sets the tokenizer for producing tokens from filtered text
-    # @param tokenizer [Class, Symbol, #tokenize] Tokenizer
-    # @return [Class, Symbol, #tokenize]
-    def tokenizer=(tokenizer)
-      @tokenizer = tokenizer
+    # Add a tokenizer regular expression for producing tokens from filtered text
+    # @param tokenizer [Symbol, Regexp, String] Tokenizer regular expression
+    # @param (see #add_into)
+    # @return [nil]
+    def add_tokenizer(tokenizer, **options)
+      add_into(@tokenizers, tokenizer, **options)
+      nil
     end

     # Sets the graph strategy for producing a graph from tokens
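
Note: because tokenizers are now plain regular expressions, a custom pattern can be added without defining a class. A sketch with a hypothetical hashtag pattern (the capture group matters: Tokenizer.tokenize, shown further down, reads tokens back via matches.compact.first; the **options are whatever #add_into accepts):

    extractor = TextRank::KeywordExtractor.basic
    extractor.add_tokenizer(/(#[a-z0-9_]+)/i) # hypothetical: keep hashtags whole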
@@ -98,7 +100,7 @@ module TextRank
     # @return [Array<String>] tokens
     def tokenize(text)
       filtered_text = apply_char_filters(text)
-      tokens = classify(@tokenizer, context: Tokenizer).tokenize(filtered_text)
+      tokens = Tokenizer.tokenize(filtered_text, *tokenizer_regular_expressions)
       apply_token_filters(tokens)
     end

@@ -121,6 +123,17 @@ module TextRank
       end
     end

+    def tokenizer_regular_expressions
+      @tokenizers.map do |t|
+        case t
+        when Symbol
+          Tokenizer.const_get(t)
+        else
+          t
+        end
+      end
+    end
+
     def apply_token_filters(tokens)
       @token_filters.reduce(tokens) do |t, f|
         classify(f, context: TokenFilter).filter!(t) || t
@@ -12,7 +12,10 @@ module TextRank
   ##
   module RankFilter

-    autoload :CollapseAdjacent, 'text_rank/rank_filter/collapse_adjacent'
+    autoload :CollapseAdjacent,     'text_rank/rank_filter/collapse_adjacent'
+    autoload :NormalizeProbability, 'text_rank/rank_filter/normalize_probability'
+    autoload :NormalizeUnitVector,  'text_rank/rank_filter/normalize_unit_vector'
+    autoload :SortByValue,          'text_rank/rank_filter/sort_by_value'

   end
 end
@@ -62,7 +62,7 @@ module TextRank
       # @option options [Fixnum] ranks_to_collapse the top N ranks in which to look for collapsable keywords
       # @option options [Fixnum] max_tokens_to_combine the maximum number of tokens to collapse into a combined keyword
       # @option options [true, false] ignore_case whether to ignore case when finding adjacent keywords in original text
-      # @options options [String] delimiter an optional delimiter between adjacent keywords in original text
+      # @option options [String] delimiter an optional delimiter between adjacent keywords in original text
       def initialize(**options)
         @options = options
       end
@@ -75,8 +75,6 @@ module TextRank
         TokenCollapser.new(tokens: ranks, text: original_text, **@options).collapse
       end

-      private
-
       class TokenCollapser

         def initialize(tokens:, text:, ranks_to_collapse: 10, max_tokens_to_combine: 2, ignore_case: true, delimiter: ' ', **_)
@@ -90,14 +88,16 @@ module TextRank

           @to_collapse = Set.new # Track the permutations we plan to collapse
           @to_remove = Set.new # Track the single tokens we plan to remove (due to being collapsed)
-          @permutations_scanned = {} # Track how many occurrences of each permutation we found in the original text
+          @permutations_scanned = Hash.new(0.0) # Track how many occurrences of each permutation we found in the original text
           @combination_significance_threshold = 0.3 # The percent of occurrences of a combo of tokens to the occurrences of single tokens required to force collapsing
         end

+        # :nodoc:
         def delimiter_re
           @delimiter_re ||= /#{@delimiter}+/
         end

+        # :nodoc:
         def collapse
           # We make multiple passes at collapsing because after the first pass we may have
           # replaced two or more singletons with a collapsed token, bumping up one or more
@@ -118,11 +118,7 @@ module TextRank
           end
           @tokens.reject! do |k, _|
             @to_remove.include?(k)
-          end
-
-          # Because we've made changes to the tokens hash, we need to re-normalize so that
-          # the sum of all token ranks is still 1.
-          normalize(@tokens)
+          end || @tokens
         end

         # We need to be efficient about how we search for the large number of possible collapsed keywords.
@@ -204,14 +200,10 @@ module TextRank
           total_single_count.zero? || (perm_count / total_single_count) > @combination_significance_threshold
         end

-        # Scale all of the token ranks so they add up to 1.
-        def normalize(tokens)
-          total = tokens.reduce(0.0) { |s, (_, v)| s + v }
-          Hash[tokens.map { |k, v| [k, v / total] }.sort_by { |_, v| -v }]
-        end
-
       end

+      private_constant :TokenCollapser
+
     end
   end
 end
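
Note: normalization (and the sort it performed) has moved out of TokenCollapser, so callers who relied on ranks summing to 1 after collapsing can opt back in by chaining the new rank filters, as the advanced configuration above now does:

    TextRank::KeywordExtractor.new(
      rank_filters: [:CollapseAdjacent, :NormalizeProbability, :SortByValue]
    )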
@@ -0,0 +1,53 @@
+module TextRank
+  module RankFilter
+    ##
+    # A rank filter which normalizes the ranked keywords so that the sum of the
+    # rank values is 1.0 (a "probability" normalization).
+    #
+    # = Example
+    #
+    #   NormalizeProbability.new.filter!(
+    #     {
+    #       "town" => 0.6818754334834477,
+    #       "cities" => 0.6055017128817066,
+    #       "siege" => 0.5411519524982207,
+    #       "arts" => 0.4907977453782612,
+    #       "envy" => 0.4692709808107252,
+    #       "blessings" => 0.4442147897516214,
+    #       "plagues" => 0.3972420789430091,
+    #       "florish" => 0.2746092797528525,
+    #       "devoured" => 0.26867321734332237,
+    #       "anxieties" => 0.2367731719604189,
+    #       "peace" => 0.1905352582752693,
+    #       "inhabitants" => 0.02715120116732137,
+    #     }
+    #   )
+    #   => {
+    #     "town" => 0.1473434248897056,
+    #     "cities" => 0.13084016782478722,
+    #     "siege" => 0.11693511476062682,
+    #     "arts" => 0.10605429845557579,
+    #     "envy" => 0.10140267579486278,
+    #     "blessings" => 0.09598839508602595,
+    #     "plagues" => 0.08583827125543537,
+    #     "florish" => 0.0593390959673909,
+    #     "devoured" => 0.058056398684529435,
+    #     "anxieties" => 0.051163259981992296,
+    #     "peace" => 0.041171915188530236,
+    #     "inhabitants" => 0.005866982110537665,
+    #   }
+    ##
+    class NormalizeProbability
+
+      # Perform the filter on the ranks
+      # @param ranks [Hash<String, Float>] the results of the PageRank algorithm
+      # @return [Hash<String, Float>]
+      def filter!(ranks, **_)
+        return if ranks.empty?
+        total = ranks.values.reduce(:+)
+        Hash[ranks.map { |k, v| [k, v / total] }]
+      end
+
+    end
+  end
+end
@@ -0,0 +1,54 @@
+module TextRank
+  module RankFilter
+    ##
+    # A rank filter which normalizes the ranked keywords so that the sum of the
+    # squares of the rank values is 1.0 (and thus the keyword rankings in an
+    # N-vector space is a unit vector).
+    #
+    # = Example
+    #
+    #   NormalizeUnitVector.new.filter!(
+    #     {
+    #       "town" => 0.6818754334834477,
+    #       "cities" => 0.6055017128817066,
+    #       "siege" => 0.5411519524982207,
+    #       "arts" => 0.4907977453782612,
+    #       "envy" => 0.4692709808107252,
+    #       "blessings" => 0.4442147897516214,
+    #       "plagues" => 0.3972420789430091,
+    #       "florish" => 0.2746092797528525,
+    #       "devoured" => 0.26867321734332237,
+    #       "anxieties" => 0.2367731719604189,
+    #       "peace" => 0.1905352582752693,
+    #       "inhabitants" => 0.02715120116732137,
+    #     }
+    #   )
+    #   => {
+    #     "town" => 0.4616807998499129,
+    #     "cities" => 0.40997006401243896,
+    #     "siege" => 0.3664004508761722,
+    #     "arts" => 0.3323068767754191,
+    #     "envy" => 0.317731642948694,
+    #     "blessings" => 0.30076672272820315,
+    #     "plagues" => 0.2689626751964553,
+    #     "florish" => 0.18593107435301526,
+    #     "devoured" => 0.1819119149778339,
+    #     "anxieties" => 0.16031319218415677,
+    #     "peace" => 0.12900665740478157,
+    #     "inhabitants" => 0.01838339916101275,
+    #   }
+    ##
+    class NormalizeUnitVector
+
+      # Perform the filter on the ranks
+      # @param ranks [Hash<String, Float>] the results of the PageRank algorithm
+      # @return [Hash<String, Float>]
+      def filter!(ranks, **_)
+        return if ranks.empty?
+        total = Math.sqrt(ranks.values.map { |v| v * v }.reduce(:+))
+        Hash[ranks.map { |k, v| [k, v / total] }]
+      end
+
+    end
+  end
+end
@@ -0,0 +1,22 @@
+module TextRank
+  module RankFilter
+    ##
+    # A rank filter which sorts the results by value
+    ##
+    class SortByValue
+
+      # @param descending [boolean] whether to sort in descending order
+      def initialize(descending: true)
+        @descending = !!descending
+      end
+
+      # Perform the filter on the ranks
+      # @param ranks [Hash<String, Float>] the results of the PageRank algorithm
+      # @return [Hash<String, Float>]
+      def filter!(ranks, **_)
+        Hash[ranks.sort_by { |_, v| @descending ? -v : v }]
+      end
+
+    end
+  end
+end
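
Note: a quick illustration against hand-written ranks:

    filter = TextRank::RankFilter::SortByValue.new
    filter.filter!('peace' => 0.19, 'town' => 0.68, 'siege' => 0.54)
    # => { "town" => 0.68, "siege" => 0.54, "peace" => 0.19 }

    TextRank::RankFilter::SortByValue.new(descending: false).filter!('a' => 0.9, 'b' => 0.1)
    # => { "b" => 0.1, "a" => 0.9 }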
@@ -8,12 +8,33 @@ module TextRank
   # help inform its decision on which tokens to keep and which to drop. An example
   # of this is the part of speech token filter which uses punctuation tokens to
   # help guess the part of speech of each non-punctuation token.
+  #
+  # When tokenizing a piece of text, the Tokenizer will combine one or more
+  # regular expressions (in the order given) to scan the text for matches. As such,
+  # you need only tell the tokenizer which tokens you want; everything else will
+  # be ignored.
   ##
   module Tokenizer

-    autoload :Regex,               'text_rank/tokenizer/regex'
-    autoload :Whitespace,          'text_rank/tokenizer/whitespace'
-    autoload :WordsAndPunctuation, 'text_rank/tokenizer/words_and_punctuation'
+    autoload :Money,       'text_rank/tokenizer/money'
+    autoload :Number,      'text_rank/tokenizer/number'
+    autoload :Punctuation, 'text_rank/tokenizer/punctuation'
+    autoload :Url,         'text_rank/tokenizer/url'
+    autoload :Whitespace,  'text_rank/tokenizer/whitespace'
+    autoload :Word,        'text_rank/tokenizer/word'
+
+    # Performs tokenization of a piece of text by one or more tokenizer regular expressions.
+    # @param text [String]
+    # @param regular_expressions [Array<Regexp|String>]
+    # @return [Array<String>]
+    def self.tokenize(text, *regular_expressions)
+      tokens = []
+      text.scan(Regexp.new(regular_expressions.flatten.join('|'))) do |matches|
+        m = matches.compact.first
+        tokens << m if m && m.size > 0
+      end
+      tokens
+    end

   end
 end
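
Note: the expressions are joined with | in the order given, so earlier tokenizers win; listing Url before Word keeps a URL intact instead of shredding it into words. An illustrative call against the code above:

    TextRank::Tokenizer.tokenize(
      'See https://github.com/david-mccullars/text_rank for details',
      TextRank::Tokenizer::Url,
      TextRank::Tokenizer::Word
    )
    # => ["See", "https://github.com/david-mccullars/text_rank", "for", "details"]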
@@ -0,0 +1,76 @@
+#encoding: UTF-8
+module TextRank
+  module Tokenizer
+
+    CURRENCY_SYMBOLS = '[' + [
+      "\u00a4", # Generic Currency Symbol
+      "\u0024", # Dollar Sign
+      "\u00a2", # Cent Sign
+      "\u00a3", # Pound Sterling
+      "\u00a5", # Yen Symbol
+      "\u20a3", # Franc Sign
+      "\u20a4", # Lira Symbol
+      "\u20a7", # Peseta Sign
+      "\u20ac", # Euro Symbol
+      "\u20B9", # Rupee
+      "\u20a9", # Won Sign
+      "\u20b4", # Hryvnia Sign
+      "\u20af", # Drachma Sign
+      "\u20ae", # Tugrik Sign
+      "\u20b0", # German Penny Sign
+      "\u20b2", # Guarani Sign
+      "\u20b1", # Peso Sign
+      "\u20b3", # Austral Sign
+      "\u20b5", # Cedi Sign
+      "\u20ad", # Kip Sign
+      "\u20aa", # New Sheqel Sign
+      "\u20ab", # Dong Sign
+      "\u0025", # Percent
+      "\u2030", # Per Million
+    ].join + ']'
+    private_constant :CURRENCY_SYMBOLS # Do not expose this to avoid confusion
+
+    ##
+    # A tokenizer regex that preserves money or formatted numbers as a single token. This
+    # currently supports 24 different currency symbols:
+    #
+    # * ¤
+    # * $
+    # * ¢
+    # * £
+    # * ¥
+    # * ₣
+    # * ₤
+    # * ₧
+    # * €
+    # * ₹
+    # * ₩
+    # * ₴
+    # * ₯
+    # * ₮
+    # * ₰
+    # * ₲
+    # * ₱
+    # * ₳
+    # * ₵
+    # * ₭
+    # * ₪
+    # * ₫
+    # * %
+    # * ‰
+    #
+    # It also supports two alternative formats for negatives as well as optional three digit comma
+    # separation and optional decimals.
+    ##
+    Money = %r{
+      (
+        #{CURRENCY_SYMBOLS} \-? #{Number}     # $-45,231.21
+        |
+        \-? #{CURRENCY_SYMBOLS} #{Number}     # -$45,231.21
+        |
+        \( #{CURRENCY_SYMBOLS} #{Number} \)   # ($45,231.21)
+      )
+    }x
+
+  end
+end
@@ -0,0 +1,31 @@
+#encoding: UTF-8
+module TextRank
+  module Tokenizer
+
+    ##
+    # A tokenizer regex that preserves (optionally formatted) numbers as a single token.
+    ##
+    Number = %r{
+      (
+        [1-9]\d{0,2}     # 453
+        (?:,\d{3})*      # 453,231,162
+        (?:\.\d{0,2})?   # 453,231,162.17
+
+        |
+
+        [1-9]\d*         # 453231162
+        (?:\.\d{0,2})?   # 453231162.17
+
+        |
+
+        0                # 0
+        (?:\.\d{0,2})?   # 0.17
+
+        |
+
+        (?:\.\d{1,2})    # .17
+      )
+    }x
+
+  end
+end
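
Note: Money interpolates Number, and ordering matters when the two are combined: listed first, Money claims the currency symbol plus the whole formatted number before Number can split it. An illustrative scan:

    TextRank::Tokenizer.tokenize(
      'Revenue grew 3.5 percent to $45,231.21',
      TextRank::Tokenizer::Money,
      TextRank::Tokenizer::Number,
      TextRank::Tokenizer::Word
    )
    # => ["Revenue", "grew", "3.5", "percent", "to", "$45,231.21"]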
@@ -0,0 +1,11 @@
+module TextRank
+  module Tokenizer
+    ##
+    # A tokenizer regex that preserves single punctuation symbols as a token. Use
+    # this if one or more of your TokenFilter classes need punctuation in order to
+    # make decisions.
+    ##
+    Punctuation = %r{([\p{Punct}])}
+
+  end
+end
@@ -0,0 +1,21 @@
+module TextRank
+  module Tokenizer
+    ##
+    # A tokenizer regex that preserves entire URLs as a token (rather than splitting them up)
+    ##
+    Url = %r{
+      (
+        (?:[\w-]+://?|www[.])
+        [^\s()<>]+
+        (?:
+          \([\w\d]+\)
+          |
+          (?:[^[:punct:]\s]
+          |
+          /)
+        )
+      )
+    }xi
+
+  end
+end
@@ -1,19 +1,11 @@
 module TextRank
   module Tokenizer
     ##
-    # Tokenizer to split on any whitespace
-    #
-    # = Example
-    #
-    #   Whitespace.new.tokenize("i should:like to know:which is worse.")
-    #   => ["i", "should:like", "to", "know:which", "is", "worse."]
+    # A tokenizer regex that preserves single whitespace characters as a token. Use
+    # this if one or more of your TokenFilter classes need whitespace in order to
+    # make decisions.
     ##
-    class Whitespace < Regex
+    Whitespace = %r{\s}

-      def initialize
-        super(/\s+/)
-      end
-
-    end
   end
 end
@@ -0,0 +1,14 @@
+module TextRank
+  module Tokenizer
+    ##
+    # A tokenizer regex that preserves a non-space, non-punctuation "word". It does
+    # allow hyphens and numerals, but the first character must be an A-Z character.
+    ##
+    Word = %r{
+      (
+        [a-z][a-z0-9-]*
+      )
+    }xi
+
+  end
+end
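
Note: first character alphabetic, hyphens and digits allowed after, so hyphenated compounds stay whole:

    TextRank::Tokenizer.tokenize(
      'state-of-the-art B2B tooling',
      TextRank::Tokenizer::Word
    )
    # => ["state-of-the-art", "B2B", "tooling"]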
@@ -1,3 +1,4 @@
 module TextRank
-  VERSION = '1.1.1'
+  # Current gem version
+  VERSION = '1.1.5'
 end
@@ -10,7 +10,7 @@ Gem::Specification.new do |spec|
   spec.email = ['david.mccullars@gmail.com']

   spec.summary = %q{Implementation of TextRank solution to ranked keyword extraction}
-  spec.description = %q{See https://www.google.com/url?sa=t&rct=j&q=&esrc=s&source=web&cd=1&ved=0ahUKEwjK9tfHxcvMAhVOzGMKHdaQBeEQFggdMAA&url=https%3A%2F%2Fweb.eecs.umich.edu%2F~mihalcea%2Fpapers%2Fmihalcea.emnlp04.pdf&usg=AFQjCNHL5SGlxLy4qmEg1yexaKGZK_Q7IA}
+  spec.description = %q{Implementation of TextRank solution to ranked keyword extraction. See https://web.eecs.umich.edu/~mihalcea/papers/mihalcea.emnlp04.pdf}
   spec.homepage = 'https://github.com/david-mccullars/text_rank'
   spec.license = 'MIT'
metadata CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: text_rank
 version: !ruby/object:Gem::Version
-  version: 1.1.1
+  version: 1.1.5
 platform: ruby
 authors:
 - David McCullars
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2016-05-12 00:00:00.000000000 Z
+date: 2016-05-23 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: bundler
@@ -108,7 +108,8 @@ dependencies:
   - - "~>"
     - !ruby/object:Gem::Version
       version: '1.0'
-description: See https://www.google.com/url?sa=t&rct=j&q=&esrc=s&source=web&cd=1&ved=0ahUKEwjK9tfHxcvMAhVOzGMKHdaQBeEQFggdMAA&url=https%3A%2F%2Fweb.eecs.umich.edu%2F~mihalcea%2Fpapers%2Fmihalcea.emnlp04.pdf&usg=AFQjCNHL5SGlxLy4qmEg1yexaKGZK_Q7IA
+description: Implementation of TextRank solution to ranked keyword extraction. See
+  https://web.eecs.umich.edu/~mihalcea/papers/mihalcea.emnlp04.pdf
 email:
 - david.mccullars@gmail.com
 executables: []
@@ -145,14 +146,20 @@ files:
 - lib/text_rank/keyword_extractor.rb
 - lib/text_rank/rank_filter.rb
 - lib/text_rank/rank_filter/collapse_adjacent.rb
+- lib/text_rank/rank_filter/normalize_probability.rb
+- lib/text_rank/rank_filter/normalize_unit_vector.rb
+- lib/text_rank/rank_filter/sort_by_value.rb
 - lib/text_rank/token_filter.rb
 - lib/text_rank/token_filter/min_length.rb
 - lib/text_rank/token_filter/part_of_speech.rb
 - lib/text_rank/token_filter/stopwords.rb
 - lib/text_rank/tokenizer.rb
-- lib/text_rank/tokenizer/regex.rb
+- lib/text_rank/tokenizer/money.rb
+- lib/text_rank/tokenizer/number.rb
+- lib/text_rank/tokenizer/punctuation.rb
+- lib/text_rank/tokenizer/url.rb
 - lib/text_rank/tokenizer/whitespace.rb
-- lib/text_rank/tokenizer/words_and_punctuation.rb
+- lib/text_rank/tokenizer/word.rb
 - lib/text_rank/version.rb
 - text_rank.gemspec
 homepage: https://github.com/david-mccullars/text_rank
@@ -180,4 +187,3 @@ signing_key:
 specification_version: 4
 summary: Implementation of TextRank solution to ranked keyword extraction
 test_files: []
-has_rdoc:
@@ -1,26 +0,0 @@
-module TextRank
-  module Tokenizer
-    ##
-    # Base tokenizer that tokenizes on any regular expression
-    #
-    # = Example
-    #
-    #   Regex.new(/:/).tokenize("i should:like to know:which is worse.")
-    #   => ["i should", "like to know", "which is worse"]
-    ##
-    class Regex
-
-      # @param regex [Regexp] to use for string splitting
-      def initialize(regex)
-        @regex = regex
-      end
-
-      # @param text [String] string to tokenize
-      # return [Array<String>] non-empty tokens
-      def tokenize(text)
-        text.split(@regex) - ['']
-      end
-
-    end
-  end
-end
@@ -1,26 +0,0 @@
-module TextRank
-  module Tokenizer
-    ##
-    # A tokenizer that preserves punctuation as their own tokens (which can be
-    # used, for example, by the [TokenFilter::PartOfSpeechBase] filter).
-    #
-    # = Example
-    #
-    #   WordsAndPunctuation.new.tokenize("i should:like to know:which is worse.")
-    #   => ["i", "should", ":", "like", "to", "know", ":", "which", "is", "worse", "."]
-    ##
-    class WordsAndPunctuation < Regex
-
-      def initialize
-        super(/
-          ([a-z][a-z0-9-]+)
-          |
-          ([\p{Punct}])
-          |
-          \s+
-        /xi)
-      end
-
-    end
-  end
-end