text_rank 1.1.1 → 1.1.5
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/page_rank/base.rb +2 -2
- data/lib/text_rank/char_filter/ascii_folding.rb +2 -0
- data/lib/text_rank/char_filter/strip_email.rb +1 -0
- data/lib/text_rank/char_filter/undo_contractions.rb +1 -0
- data/lib/text_rank/keyword_extractor.rb +25 -12
- data/lib/text_rank/rank_filter.rb +4 -1
- data/lib/text_rank/rank_filter/collapse_adjacent.rb +7 -15
- data/lib/text_rank/rank_filter/normalize_probability.rb +53 -0
- data/lib/text_rank/rank_filter/normalize_unit_vector.rb +54 -0
- data/lib/text_rank/rank_filter/sort_by_value.rb +22 -0
- data/lib/text_rank/tokenizer.rb +24 -3
- data/lib/text_rank/tokenizer/money.rb +76 -0
- data/lib/text_rank/tokenizer/number.rb +31 -0
- data/lib/text_rank/tokenizer/punctuation.rb +11 -0
- data/lib/text_rank/tokenizer/url.rb +21 -0
- data/lib/text_rank/tokenizer/whitespace.rb +4 -12
- data/lib/text_rank/tokenizer/word.rb +14 -0
- data/lib/text_rank/version.rb +2 -1
- data/text_rank.gemspec +1 -1
- metadata +12 -6
- data/lib/text_rank/tokenizer/regex.rb +0 -26
- data/lib/text_rank/tokenizer/words_and_punctuation.rb +0 -26
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 190ea53d10c7ae49f55f0206c8c7346cea3ba4af
|
4
|
+
data.tar.gz: aac4c6be16b91047508af053ca48ba4fa8594f43
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 86c6007b9397e126fcadc57d73f7f6c09def32507ffd2be401f8dc2ca389bafbf9f3384e41453deec7149be5f7862a361ea59d3c362915863571c7e0e81799e8
|
7
|
+
data.tar.gz: 2e678435c0079ab85518f2b4b380dd46744ab5e6d386b6ed82dcbdd9d93943bd4b5e2c54d1ac63077eda6ac2f1f13ccbbb8523bdd50bdd0991fdc11d04c61b49
|
data/lib/page_rank/base.rb
CHANGED
@@ -33,8 +33,8 @@ module PageRank
|
|
33
33
|
end
|
34
34
|
|
35
35
|
# Adds a directed (and optionally weighted) edge to the graph
|
36
|
-
# @param
|
37
|
-
# @param
|
36
|
+
# @param _source [Object] The source node
|
37
|
+
# @param _dest [Object] The destination node
|
38
38
|
# @return [nil]
|
39
39
|
def add(_source, _dest, **_options)
|
40
40
|
raise NotImplementedError
|
@@ -11,7 +11,9 @@ module TextRank
|
|
11
11
|
##
|
12
12
|
class AsciiFolding
|
13
13
|
|
14
|
+
# Non-ASCII characters to replace
|
14
15
|
NON_ASCII_CHARS = 'ÀÁÂÃÄÅàáâãäåĀāĂ㥹ÇçĆćĈĉĊċČčÐðĎďĐđÈÉÊËèéêëĒēĔĕĖėĘęĚěĜĝĞğĠġĢģĤĥĦħÌÍÎÏìíîïĨĩĪīĬĭĮįİıĴĵĶķĸĹĺĻļĽľĿŀŁłÑñŃńŅņŇňʼnŊŋÒÓÔÕÖØòóôõöøŌōŎŏŐőŔŕŖŗŘřŚśŜŝŞşŠšſŢţŤťŦŧÙÚÛÜùúûüŨũŪūŬŭŮůŰűŲųŴŵÝýÿŶŷŸŹźŻżŽž'
|
16
|
+
# "Equivalent" ASCII characters
|
15
17
|
EQUIVALENT_ASCII_CHARS = 'AAAAAAaaaaaaAaAaAaCcCcCcCcCcDdDdDdEEEEeeeeEeEeEeEeEeGgGgGgGgHhHhIIIIiiiiIiIiIiIiIiJjKkkLlLlLlLlLlNnNnNnNnnNnOOOOOOooooooOoOoOoRrRrRrSsSsSsSssTtTtTtUUUUuuuuUuUuUuUuUuUuWwYyyYyYZzZzZz'
|
16
18
|
|
17
19
|
# Perform the filter
|
@@ -14,7 +14,7 @@ module TextRank
|
|
14
14
|
def self.basic(**options)
|
15
15
|
new(**{
|
16
16
|
char_filters: [:AsciiFolding, :Lowercase],
|
17
|
-
|
17
|
+
tokenizers: [:Word],
|
18
18
|
token_filters: [:Stopwords, :MinLength],
|
19
19
|
graph_strategy: :Coocurrence,
|
20
20
|
}.merge(options))
|
@@ -26,27 +26,27 @@ module TextRank
|
|
26
26
|
def self.advanced(**options)
|
27
27
|
new(**{
|
28
28
|
char_filters: [:AsciiFolding, :Lowercase, :StripHtml, :StripEmail, :UndoContractions, :StripPossessive],
|
29
|
-
|
29
|
+
tokenizers: [:Url, :Money, :Number, :Word, :Punctuation],
|
30
30
|
token_filters: [:PartOfSpeech, :Stopwords, :MinLength],
|
31
31
|
graph_strategy: :Coocurrence,
|
32
|
-
rank_filters: [:CollapseAdjacent],
|
32
|
+
rank_filters: [:CollapseAdjacent, :NormalizeUnitVector, :SortByValue],
|
33
33
|
}.merge(options))
|
34
34
|
end
|
35
35
|
|
36
36
|
# @option (see PageRank.new)
|
37
37
|
# @option options [Array<Class, Symbol, #filter!>] :char_filters A list of filters to be applied prior to tokenization
|
38
|
-
# @option options [
|
38
|
+
# @option options [Array<Symbol, Regexp, String>] :tokenizers A list of tokenizer regular expressions to perform tokenization
|
39
39
|
# @option options [Array<Class, Symbol, #filter!>] :token_filters A list of filters to be applied to each token after tokenization
|
40
40
|
# @option options [Class, Symbol, #build_graph] :graph_strategy A class or strategy instance for producing a graph from tokens
|
41
41
|
# @option options [Array<Class, Symbol, #filter!>] :rank_filters A list of filters to be applied to the keyword ranks after keyword extraction
|
42
42
|
def initialize(**options)
|
43
43
|
@page_rank_options = {
|
44
|
-
strategy: options[:strategy] || :
|
44
|
+
strategy: options[:strategy] || :dense,
|
45
45
|
damping: options[:damping],
|
46
46
|
tolerance: options[:tolerance],
|
47
47
|
}
|
48
48
|
@char_filters = options[:char_filters] || []
|
49
|
-
@
|
49
|
+
@tokenizers = options[:tokenizers] || [Tokenizer::Word]
|
50
50
|
@token_filters = options[:token_filters] || []
|
51
51
|
@rank_filters = options[:rank_filters] || []
|
52
52
|
@graph_strategy = options[:graph_strategy] || GraphStrategy::Coocurrence
|
@@ -61,11 +61,13 @@ module TextRank
|
|
61
61
|
nil
|
62
62
|
end
|
63
63
|
|
64
|
-
#
|
65
|
-
# @param tokenizer [
|
66
|
-
# @
|
67
|
-
|
68
|
-
|
64
|
+
# Add a tokenizer regular expression for producing tokens from filtered text
|
65
|
+
# @param tokenizer [Symbol, Regexp, String] Tokenizer regular expression
|
66
|
+
# @param (see #add_into)
|
67
|
+
# @return [nil]
|
68
|
+
def add_tokenizer(tokenizer, **options)
|
69
|
+
add_into(@tokenizers, tokenizer, **options)
|
70
|
+
nil
|
69
71
|
end
|
70
72
|
|
71
73
|
# Sets the graph strategy for producing a graph from tokens
|
@@ -98,7 +100,7 @@ module TextRank
|
|
98
100
|
# @return [Array<String>] tokens
|
99
101
|
def tokenize(text)
|
100
102
|
filtered_text = apply_char_filters(text)
|
101
|
-
tokens =
|
103
|
+
tokens = Tokenizer.tokenize(filtered_text, *tokenizer_regular_expressions)
|
102
104
|
apply_token_filters(tokens)
|
103
105
|
end
|
104
106
|
|
@@ -121,6 +123,17 @@ module TextRank
|
|
121
123
|
end
|
122
124
|
end
|
123
125
|
|
126
|
+
def tokenizer_regular_expressions
|
127
|
+
@tokenizers.map do |t|
|
128
|
+
case t
|
129
|
+
when Symbol
|
130
|
+
Tokenizer.const_get(t)
|
131
|
+
else
|
132
|
+
t
|
133
|
+
end
|
134
|
+
end
|
135
|
+
end
|
136
|
+
|
124
137
|
def apply_token_filters(tokens)
|
125
138
|
@token_filters.reduce(tokens) do |t, f|
|
126
139
|
classify(f, context: TokenFilter).filter!(t) || t
|
@@ -12,7 +12,10 @@ module TextRank
|
|
12
12
|
##
|
13
13
|
module RankFilter
|
14
14
|
|
15
|
-
autoload :CollapseAdjacent,
|
15
|
+
autoload :CollapseAdjacent, 'text_rank/rank_filter/collapse_adjacent'
|
16
|
+
autoload :NormalizeProbability, 'text_rank/rank_filter/normalize_probability'
|
17
|
+
autoload :NormalizeUnitVector, 'text_rank/rank_filter/normalize_unit_vector'
|
18
|
+
autoload :SortByValue, 'text_rank/rank_filter/sort_by_value'
|
16
19
|
|
17
20
|
end
|
18
21
|
end
|
@@ -62,7 +62,7 @@ module TextRank
|
|
62
62
|
# @option options [Fixnum] ranks_to_collapse the top N ranks in which to look for collapsable keywords
|
63
63
|
# @option options [Fixnum] max_tokens_to_combine the maximum number of tokens to collapse into a combined keyword
|
64
64
|
# @option options [true, false] ignore_case whether to ignore case when finding adjacent keywords in original text
|
65
|
-
# @
|
65
|
+
# @option options [String] delimiter an optional delimiter between adjacent keywords in original text
|
66
66
|
def initialize(**options)
|
67
67
|
@options = options
|
68
68
|
end
|
@@ -75,8 +75,6 @@ module TextRank
|
|
75
75
|
TokenCollapser.new(tokens: ranks, text: original_text, **@options).collapse
|
76
76
|
end
|
77
77
|
|
78
|
-
private
|
79
|
-
|
80
78
|
class TokenCollapser
|
81
79
|
|
82
80
|
def initialize(tokens:, text:, ranks_to_collapse: 10, max_tokens_to_combine: 2, ignore_case: true, delimiter: ' ', **_)
|
@@ -90,14 +88,16 @@ module TextRank
|
|
90
88
|
|
91
89
|
@to_collapse = Set.new # Track the permutations we plan to collapse
|
92
90
|
@to_remove = Set.new # Track the single tokens we plan to remove (due to being collapsed)
|
93
|
-
@permutations_scanned =
|
91
|
+
@permutations_scanned = Hash.new(0.0) # Track how many occurrences of each permutation we found in the original text
|
94
92
|
@combination_significance_threshold = 0.3 # The percent of occurrences of a combo of tokens to the occurrences of single tokens required to force collapsing
|
95
93
|
end
|
96
94
|
|
95
|
+
# :nodoc:
|
97
96
|
def delimiter_re
|
98
97
|
@delimiter_re ||= /#{@delimiter}+/
|
99
98
|
end
|
100
99
|
|
100
|
+
# :nodoc:
|
101
101
|
def collapse
|
102
102
|
# We make multiple passes at collapsing because after the first pass we may have
|
103
103
|
# replaced two or more singletons with a collapsed token, bumping up one or more
|
@@ -118,11 +118,7 @@ module TextRank
|
|
118
118
|
end
|
119
119
|
@tokens.reject! do |k, _|
|
120
120
|
@to_remove.include?(k)
|
121
|
-
end
|
122
|
-
|
123
|
-
# Because we've made changes to the tokens hash, we need to re-normalize so that
|
124
|
-
# the sum of all token ranks is still 1.
|
125
|
-
normalize(@tokens)
|
121
|
+
end || @tokens
|
126
122
|
end
|
127
123
|
|
128
124
|
# We need to be efficient about how we search for the large number of possible collapsed keywords.
|
@@ -204,14 +200,10 @@ module TextRank
|
|
204
200
|
total_single_count.zero? || (perm_count / total_single_count) > @combination_significance_threshold
|
205
201
|
end
|
206
202
|
|
207
|
-
# Scale all of the token ranks so they add up to 1.
|
208
|
-
def normalize(tokens)
|
209
|
-
total = tokens.reduce(0.0) { |s, (_, v)| s + v }
|
210
|
-
Hash[tokens.map { |k, v| [k, v / total] }.sort_by { |_, v| -v }]
|
211
|
-
end
|
212
|
-
|
213
203
|
end
|
214
204
|
|
205
|
+
private_constant :TokenCollapser
|
206
|
+
|
215
207
|
end
|
216
208
|
end
|
217
209
|
end
|
@@ -0,0 +1,53 @@
|
|
1
|
+
module TextRank
|
2
|
+
module RankFilter
|
3
|
+
##
|
4
|
+
# A rank filter which normalizes the ranked keywords so that the sum of the
|
5
|
+
# rank values is 1.0 (a "probability" normalization).
|
6
|
+
#
|
7
|
+
# = Example
|
8
|
+
#
|
9
|
+
# NormalizeProbability.new.filter!(
|
10
|
+
# {
|
11
|
+
# "town" => 0.6818754334834477,
|
12
|
+
# "cities" => 0.6055017128817066,
|
13
|
+
# "siege" => 0.5411519524982207,
|
14
|
+
# "arts" => 0.4907977453782612,
|
15
|
+
# "envy" => 0.4692709808107252,
|
16
|
+
# "blessings" => 0.4442147897516214,
|
17
|
+
# "plagues" => 0.3972420789430091,
|
18
|
+
# "florish" => 0.2746092797528525,
|
19
|
+
# "devoured" => 0.26867321734332237,
|
20
|
+
# "anxieties" => 0.2367731719604189,
|
21
|
+
# "peace" => 0.1905352582752693,
|
22
|
+
# "inhabitants" => 0.02715120116732137,
|
23
|
+
# }
|
24
|
+
# )
|
25
|
+
# => {
|
26
|
+
# "town" => 0.1473434248897056,
|
27
|
+
# "cities" => 0.13084016782478722,
|
28
|
+
# "siege" => 0.11693511476062682,
|
29
|
+
# "arts" => 0.10605429845557579,
|
30
|
+
# "envy" => 0.10140267579486278,
|
31
|
+
# "blessings" => 0.09598839508602595,
|
32
|
+
# "plagues" => 0.08583827125543537,
|
33
|
+
# "florish" => 0.0593390959673909,
|
34
|
+
# "devoured" => 0.058056398684529435,
|
35
|
+
# "anxieties" => 0.051163259981992296,
|
36
|
+
# "peace" => 0.041171915188530236,
|
37
|
+
# "inhabitants" => 0.005866982110537665,
|
38
|
+
# }
|
39
|
+
##
|
40
|
+
class NormalizeProbability
|
41
|
+
|
42
|
+
# Perform the filter on the ranks
|
43
|
+
# @param ranks [Hash<String, Float>] the results of the PageRank algorithm
|
44
|
+
# @return [Hash<String, Float>]
|
45
|
+
def filter!(ranks, **_)
|
46
|
+
return if ranks.empty?
|
47
|
+
total = ranks.values.reduce(:+)
|
48
|
+
Hash[ranks.map { |k, v| [k, v / total] }]
|
49
|
+
end
|
50
|
+
|
51
|
+
end
|
52
|
+
end
|
53
|
+
end
|
@@ -0,0 +1,54 @@
|
|
1
|
+
module TextRank
|
2
|
+
module RankFilter
|
3
|
+
##
|
4
|
+
# A rank filter which normalizes the ranked keywords so that the sum of the
|
5
|
+
# squares of the rank values is 1.0 (and thus the keyword rankings in an
|
6
|
+
# N-vector space is a unit vector).
|
7
|
+
#
|
8
|
+
# = Example
|
9
|
+
#
|
10
|
+
# NormalizeUnitVector.new.filter!(
|
11
|
+
# {
|
12
|
+
# "town" => 0.6818754334834477,
|
13
|
+
# "cities" => 0.6055017128817066,
|
14
|
+
# "siege" => 0.5411519524982207,
|
15
|
+
# "arts" => 0.4907977453782612,
|
16
|
+
# "envy" => 0.4692709808107252,
|
17
|
+
# "blessings" => 0.4442147897516214,
|
18
|
+
# "plagues" => 0.3972420789430091,
|
19
|
+
# "florish" => 0.2746092797528525,
|
20
|
+
# "devoured" => 0.26867321734332237,
|
21
|
+
# "anxieties" => 0.2367731719604189,
|
22
|
+
# "peace" => 0.1905352582752693,
|
23
|
+
# "inhabitants" => 0.02715120116732137,
|
24
|
+
# }
|
25
|
+
# )
|
26
|
+
# => {
|
27
|
+
# "town" => 0.4616807998499129,
|
28
|
+
# "cities" => 0.40997006401243896,
|
29
|
+
# "siege" => 0.3664004508761722,
|
30
|
+
# "arts" => 0.3323068767754191,
|
31
|
+
# "envy" => 0.317731642948694,
|
32
|
+
# "blessings" => 0.30076672272820315,
|
33
|
+
# "plagues" => 0.2689626751964553,
|
34
|
+
# "florish" => 0.18593107435301526,
|
35
|
+
# "devoured" => 0.1819119149778339,
|
36
|
+
# "anxieties" => 0.16031319218415677,
|
37
|
+
# "peace" => 0.12900665740478157,
|
38
|
+
# "inhabitants" => 0.01838339916101275,
|
39
|
+
# }
|
40
|
+
##
|
41
|
+
class NormalizeUnitVector
|
42
|
+
|
43
|
+
# Perform the filter on the ranks
|
44
|
+
# @param ranks [Hash<String, Float>] the results of the PageRank algorithm
|
45
|
+
# @return [Hash<String, Float>]
|
46
|
+
def filter!(ranks, **_)
|
47
|
+
return if ranks.empty?
|
48
|
+
total = Math.sqrt(ranks.values.map { |v| v * v }.reduce(:+))
|
49
|
+
Hash[ranks.map { |k, v| [k, v / total] }]
|
50
|
+
end
|
51
|
+
|
52
|
+
end
|
53
|
+
end
|
54
|
+
end
|
@@ -0,0 +1,22 @@
|
|
1
|
+
module TextRank
|
2
|
+
module RankFilter
|
3
|
+
##
|
4
|
+
# A rank filter which sorts the results by value
|
5
|
+
##
|
6
|
+
class SortByValue
|
7
|
+
|
8
|
+
# @param descending [boolean] whether to sort in descending order
|
9
|
+
def initialize(descending: true)
|
10
|
+
@descending = !!descending
|
11
|
+
end
|
12
|
+
|
13
|
+
# Perform the filter on the ranks
|
14
|
+
# @param ranks [Hash<String, Float>] the results of the PageRank algorithm
|
15
|
+
# @return [Hash<String, Float>]
|
16
|
+
def filter!(ranks, **_)
|
17
|
+
Hash[ranks.sort_by { |_, v| @descending ? -v : v }]
|
18
|
+
end
|
19
|
+
|
20
|
+
end
|
21
|
+
end
|
22
|
+
end
|
data/lib/text_rank/tokenizer.rb
CHANGED
@@ -8,12 +8,33 @@ module TextRank
|
|
8
8
|
# help inform its decision on which tokens to keep and which to drop. An example
|
9
9
|
# of this is the part of speech token filter which uses punctuation tokens to
|
10
10
|
# help guess the part of speech of each non-punctuation token.
|
11
|
+
#
|
12
|
+
# When tokenizing a piece of text, the Tokenizer will combine one or more
|
13
|
+
# regular expressions (in the order given) to scan the text for matches. As such
|
14
|
+
# you need only tell the tokenizer which tokens you want; everything else will
|
15
|
+
# be ignored.
|
11
16
|
##
|
12
17
|
module Tokenizer
|
13
18
|
|
14
|
-
autoload :
|
15
|
-
autoload :
|
16
|
-
autoload :
|
19
|
+
autoload :Money, 'text_rank/tokenizer/money'
|
20
|
+
autoload :Number, 'text_rank/tokenizer/number'
|
21
|
+
autoload :Punctuation, 'text_rank/tokenizer/punctuation'
|
22
|
+
autoload :Url, 'text_rank/tokenizer/url'
|
23
|
+
autoload :Whitespace, 'text_rank/tokenizer/whitespace'
|
24
|
+
autoload :Word, 'text_rank/tokenizer/word'
|
25
|
+
|
26
|
+
# Performs tokenization of piece of text by one or more tokenizer regular expressions.
|
27
|
+
# @param text [String]
|
28
|
+
# @param regular_expressions [Array<Regexp|String>]
|
29
|
+
# @return [Array<String>]
|
30
|
+
def self.tokenize(text, *regular_expressions)
|
31
|
+
tokens = []
|
32
|
+
text.scan(Regexp.new(regular_expressions.flatten.join('|'))) do |matches|
|
33
|
+
m = matches.compact.first
|
34
|
+
tokens << m if m && m.size > 0
|
35
|
+
end
|
36
|
+
tokens
|
37
|
+
end
|
17
38
|
|
18
39
|
end
|
19
40
|
end
|
@@ -0,0 +1,76 @@
|
|
1
|
+
#encoding: UTF-8
|
2
|
+
module TextRank
|
3
|
+
module Tokenizer
|
4
|
+
|
5
|
+
CURRENCY_SYMBOLS = '[' + [
|
6
|
+
"\u00a4", # Generic Currency Symbol
|
7
|
+
"\u0024", # Dollar Sign
|
8
|
+
"\u00a2", # Cent Sign
|
9
|
+
"\u00a3", # Pound Sterling
|
10
|
+
"\u00a5", # Yen Symbol
|
11
|
+
"\u20a3", # Franc Sign
|
12
|
+
"\u20a4", # Lira Symbol
|
13
|
+
"\u20a7", # Peseta Sign
|
14
|
+
"\u20ac", # Euro Symbol
|
15
|
+
"\u20B9", # Rupee
|
16
|
+
"\u20a9", # Won Sign
|
17
|
+
"\u20b4", # Hryvnia Sign
|
18
|
+
"\u20af", # Drachma Sign
|
19
|
+
"\u20ae", # Tugrik Sign
|
20
|
+
"\u20b0", # German Penny Sign
|
21
|
+
"\u20b2", # Guarani Sign
|
22
|
+
"\u20b1", # Peso Sign
|
23
|
+
"\u20b3", # Austral Sign
|
24
|
+
"\u20b5", # Cedi Sign
|
25
|
+
"\u20ad", # Kip Sign
|
26
|
+
"\u20aa", # New Sheqel Sign
|
27
|
+
"\u20ab", # Dong Sign
|
28
|
+
"\u0025", # Percent
|
29
|
+
"\u2030", # Per Million
|
30
|
+
].join + ']'
|
31
|
+
private_constant :CURRENCY_SYMBOLS # Do not expose this to avoid confusion
|
32
|
+
|
33
|
+
##
|
34
|
+
# A tokenizer regex that preserves money or formatted numbers as a single token. This
|
35
|
+
# currently supports 24 different currency symbols:
|
36
|
+
#
|
37
|
+
# * ¤
|
38
|
+
# * $
|
39
|
+
# * ¢
|
40
|
+
# * £
|
41
|
+
# * ¥
|
42
|
+
# * ₣
|
43
|
+
# * ₤
|
44
|
+
# * ₧
|
45
|
+
# * €
|
46
|
+
# * ₹
|
47
|
+
# * ₩
|
48
|
+
# * ₴
|
49
|
+
# * ₯
|
50
|
+
# * ₮
|
51
|
+
# * ₰
|
52
|
+
# * ₲
|
53
|
+
# * ₱
|
54
|
+
# * ₳
|
55
|
+
# * ₵
|
56
|
+
# * ₭
|
57
|
+
# * ₪
|
58
|
+
# * ₫
|
59
|
+
# * %
|
60
|
+
# * ‰
|
61
|
+
#
|
62
|
+
# It also supports two alternative formats for negatives as well as optional three digit comma
|
63
|
+
# separation and optional decimals.
|
64
|
+
##
|
65
|
+
Money = %r{
|
66
|
+
(
|
67
|
+
#{CURRENCY_SYMBOLS} \-? #{Number} # $-45,231.21
|
68
|
+
|
|
69
|
+
\-? #{CURRENCY_SYMBOLS} #{Number} # -$45,231.21
|
70
|
+
|
|
71
|
+
\( #{CURRENCY_SYMBOLS} #{Number} \) # ($45,231.21)
|
72
|
+
)
|
73
|
+
}x
|
74
|
+
|
75
|
+
end
|
76
|
+
end
|
@@ -0,0 +1,31 @@
|
|
1
|
+
#encoding: UTF-8
|
2
|
+
module TextRank
|
3
|
+
module Tokenizer
|
4
|
+
|
5
|
+
##
|
6
|
+
# A tokenizer regex that preserves (optionally formatted) numbers as a single token.
|
7
|
+
##
|
8
|
+
Number = %r{
|
9
|
+
(
|
10
|
+
[1-9]\d{0,2} # 453
|
11
|
+
(?:,\d{3})* # 453,231,162
|
12
|
+
(?:\.\d{0,2})? # 453,231,162.17
|
13
|
+
|
14
|
+
|
|
15
|
+
|
16
|
+
[1-9]\d* # 453231162
|
17
|
+
(?:\.\d{0,2})? # 453231162.17
|
18
|
+
|
19
|
+
|
|
20
|
+
|
21
|
+
0 # 0
|
22
|
+
(?:\.\d{0,2})? # 0.17
|
23
|
+
|
24
|
+
|
|
25
|
+
|
26
|
+
(?:\.\d{1,2}) # .17
|
27
|
+
)
|
28
|
+
}x
|
29
|
+
|
30
|
+
end
|
31
|
+
end
|
@@ -0,0 +1,11 @@
|
|
1
|
+
module TextRank
|
2
|
+
module Tokenizer
|
3
|
+
##
|
4
|
+
# A tokenizer regex that preserves single punctuation symbols as a token. Use
|
5
|
+
# this if one or more of your TokenFilter classes need punctuation in order to
|
6
|
+
# make decisions.
|
7
|
+
##
|
8
|
+
Punctuation = %r{([\p{Punct}])}
|
9
|
+
|
10
|
+
end
|
11
|
+
end
|
@@ -0,0 +1,21 @@
|
|
1
|
+
module TextRank
|
2
|
+
module Tokenizer
|
3
|
+
##
|
4
|
+
# A tokenizer regex that preserves entire URL's as a token (rather than split them up)
|
5
|
+
##
|
6
|
+
Url = %r{
|
7
|
+
(
|
8
|
+
(?:[\w-]+://?|www[.])
|
9
|
+
[^\s()<>]+
|
10
|
+
(?:
|
11
|
+
\([\w\d]+\)
|
12
|
+
|
|
13
|
+
(?:[^[:punct:]\s]
|
14
|
+
|
|
15
|
+
/)
|
16
|
+
)
|
17
|
+
)
|
18
|
+
}xi
|
19
|
+
|
20
|
+
end
|
21
|
+
end
|
@@ -1,19 +1,11 @@
|
|
1
1
|
module TextRank
|
2
2
|
module Tokenizer
|
3
3
|
##
|
4
|
-
#
|
5
|
-
#
|
6
|
-
#
|
7
|
-
#
|
8
|
-
# Whitespace.new.tokenize("i should:like to know:which is worse.")
|
9
|
-
# => ["i", "should:like", "to", "know:which", "is", "worse."]
|
4
|
+
# A tokenizer regex that preserves single whitespace characters as a token. Use
|
5
|
+
# this if one or more of your TokenFilter classes need whitespace in order to
|
6
|
+
# make decisions.
|
10
7
|
##
|
11
|
-
|
8
|
+
Whitespace = %r{\s}
|
12
9
|
|
13
|
-
def initialize
|
14
|
-
super(/\s+/)
|
15
|
-
end
|
16
|
-
|
17
|
-
end
|
18
10
|
end
|
19
11
|
end
|
@@ -0,0 +1,14 @@
|
|
1
|
+
module TextRank
|
2
|
+
module Tokenizer
|
3
|
+
##
|
4
|
+
# A tokenizer regex that preserves a non-space, non-punctuation "word". It does
|
5
|
+
# allow hyphens and numerals, but the first character must be an A-Z character.
|
6
|
+
##
|
7
|
+
Word = %r{
|
8
|
+
(
|
9
|
+
[a-z][a-z0-9-]*
|
10
|
+
)
|
11
|
+
}xi
|
12
|
+
|
13
|
+
end
|
14
|
+
end
|
data/lib/text_rank/version.rb
CHANGED
data/text_rank.gemspec
CHANGED
@@ -10,7 +10,7 @@ Gem::Specification.new do |spec|
|
|
10
10
|
spec.email = ['david.mccullars@gmail.com']
|
11
11
|
|
12
12
|
spec.summary = %q{Implementation of TextRank solution to ranked keyword extraction}
|
13
|
-
spec.description = %q{See https://
|
13
|
+
spec.description = %q{Implementation of TextRank solution to ranked keyword extraction. See https://web.eecs.umich.edu/~mihalcea/papers/mihalcea.emnlp04.pdf}
|
14
14
|
spec.homepage = 'https://github.com/david-mccullars/text_rank'
|
15
15
|
spec.license = 'MIT'
|
16
16
|
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: text_rank
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.1.1
|
4
|
+
version: 1.1.5
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- David McCullars
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2016-05-
|
11
|
+
date: 2016-05-23 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -108,7 +108,8 @@ dependencies:
|
|
108
108
|
- - "~>"
|
109
109
|
- !ruby/object:Gem::Version
|
110
110
|
version: '1.0'
|
111
|
-
description:
|
111
|
+
description: Implementation of TextRank solution to ranked keyword extraction. See
|
112
|
+
https://web.eecs.umich.edu/~mihalcea/papers/mihalcea.emnlp04.pdf
|
112
113
|
email:
|
113
114
|
- david.mccullars@gmail.com
|
114
115
|
executables: []
|
@@ -145,14 +146,20 @@ files:
|
|
145
146
|
- lib/text_rank/keyword_extractor.rb
|
146
147
|
- lib/text_rank/rank_filter.rb
|
147
148
|
- lib/text_rank/rank_filter/collapse_adjacent.rb
|
149
|
+
- lib/text_rank/rank_filter/normalize_probability.rb
|
150
|
+
- lib/text_rank/rank_filter/normalize_unit_vector.rb
|
151
|
+
- lib/text_rank/rank_filter/sort_by_value.rb
|
148
152
|
- lib/text_rank/token_filter.rb
|
149
153
|
- lib/text_rank/token_filter/min_length.rb
|
150
154
|
- lib/text_rank/token_filter/part_of_speech.rb
|
151
155
|
- lib/text_rank/token_filter/stopwords.rb
|
152
156
|
- lib/text_rank/tokenizer.rb
|
153
|
-
- lib/text_rank/tokenizer/
|
157
|
+
- lib/text_rank/tokenizer/money.rb
|
158
|
+
- lib/text_rank/tokenizer/number.rb
|
159
|
+
- lib/text_rank/tokenizer/punctuation.rb
|
160
|
+
- lib/text_rank/tokenizer/url.rb
|
154
161
|
- lib/text_rank/tokenizer/whitespace.rb
|
155
|
-
- lib/text_rank/tokenizer/
|
162
|
+
- lib/text_rank/tokenizer/word.rb
|
156
163
|
- lib/text_rank/version.rb
|
157
164
|
- text_rank.gemspec
|
158
165
|
homepage: https://github.com/david-mccullars/text_rank
|
@@ -180,4 +187,3 @@ signing_key:
|
|
180
187
|
specification_version: 4
|
181
188
|
summary: Implementation of TextRank solution to ranked keyword extraction
|
182
189
|
test_files: []
|
183
|
-
has_rdoc:
|
@@ -1,26 +0,0 @@
|
|
1
|
-
module TextRank
|
2
|
-
module Tokenizer
|
3
|
-
##
|
4
|
-
# Base tokenizer that tokenizes on any regular expression
|
5
|
-
#
|
6
|
-
# = Example
|
7
|
-
#
|
8
|
-
# Regex.new(/:/).tokenize("i should:like to know:which is worse.")
|
9
|
-
# => ["i should", "like to know", "which is worse"]
|
10
|
-
##
|
11
|
-
class Regex
|
12
|
-
|
13
|
-
# @param regex [Regexp] to use for string splitting
|
14
|
-
def initialize(regex)
|
15
|
-
@regex = regex
|
16
|
-
end
|
17
|
-
|
18
|
-
# @param text [String] string to tokenize
|
19
|
-
# return [Array<String>] non-empty tokens
|
20
|
-
def tokenize(text)
|
21
|
-
text.split(@regex) - ['']
|
22
|
-
end
|
23
|
-
|
24
|
-
end
|
25
|
-
end
|
26
|
-
end
|
@@ -1,26 +0,0 @@
|
|
1
|
-
module TextRank
|
2
|
-
module Tokenizer
|
3
|
-
##
|
4
|
-
# A tokenizer that preserves punctuation as their own tokens (which can be
|
5
|
-
# used, for example, by the [TokenFilter::PartOfSpeechBase] filter).
|
6
|
-
#
|
7
|
-
# = Example
|
8
|
-
#
|
9
|
-
# WordsAndPunctuation.new.tokenize("i should:like to know:which is worse.")
|
10
|
-
# => ["i", "should", ":", "like", "to", "know", ":", "which", "is", "worse", "."]
|
11
|
-
##
|
12
|
-
class WordsAndPunctuation < Regex
|
13
|
-
|
14
|
-
def initialize
|
15
|
-
super(/
|
16
|
-
([a-z][a-z0-9-]+)
|
17
|
-
|
|
18
|
-
([\p{Punct}])
|
19
|
-
|
|
20
|
-
\s+
|
21
|
-
/xi)
|
22
|
-
end
|
23
|
-
|
24
|
-
end
|
25
|
-
end
|
26
|
-
end
|