text_rank 1.1.1 → 1.1.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/page_rank/base.rb +2 -2
- data/lib/text_rank/char_filter/ascii_folding.rb +2 -0
- data/lib/text_rank/char_filter/strip_email.rb +1 -0
- data/lib/text_rank/char_filter/undo_contractions.rb +1 -0
- data/lib/text_rank/keyword_extractor.rb +25 -12
- data/lib/text_rank/rank_filter.rb +4 -1
- data/lib/text_rank/rank_filter/collapse_adjacent.rb +7 -15
- data/lib/text_rank/rank_filter/normalize_probability.rb +53 -0
- data/lib/text_rank/rank_filter/normalize_unit_vector.rb +54 -0
- data/lib/text_rank/rank_filter/sort_by_value.rb +22 -0
- data/lib/text_rank/tokenizer.rb +24 -3
- data/lib/text_rank/tokenizer/money.rb +76 -0
- data/lib/text_rank/tokenizer/number.rb +31 -0
- data/lib/text_rank/tokenizer/punctuation.rb +11 -0
- data/lib/text_rank/tokenizer/url.rb +21 -0
- data/lib/text_rank/tokenizer/whitespace.rb +4 -12
- data/lib/text_rank/tokenizer/word.rb +14 -0
- data/lib/text_rank/version.rb +2 -1
- data/text_rank.gemspec +1 -1
- metadata +12 -6
- data/lib/text_rank/tokenizer/regex.rb +0 -26
- data/lib/text_rank/tokenizer/words_and_punctuation.rb +0 -26
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 190ea53d10c7ae49f55f0206c8c7346cea3ba4af
+  data.tar.gz: aac4c6be16b91047508af053ca48ba4fa8594f43
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 86c6007b9397e126fcadc57d73f7f6c09def32507ffd2be401f8dc2ca389bafbf9f3384e41453deec7149be5f7862a361ea59d3c362915863571c7e0e81799e8
+  data.tar.gz: 2e678435c0079ab85518f2b4b380dd46744ab5e6d386b6ed82dcbdd9d93943bd4b5e2c54d1ac63077eda6ac2f1f13ccbbb8523bdd50bdd0991fdc11d04c61b49
data/lib/page_rank/base.rb
CHANGED
@@ -33,8 +33,8 @@ module PageRank
     end
 
     # Adds a directed (and optionally weighted) edge to the graph
-    # @param
-    # @param
+    # @param _source [Object] The source node
+    # @param _dest [Object] The destination node
     # @return [nil]
     def add(_source, _dest, **_options)
       raise NotImplementedError
data/lib/text_rank/char_filter/ascii_folding.rb
CHANGED
@@ -11,7 +11,9 @@ module TextRank
     ##
     class AsciiFolding
 
+      # Non-ASCII characters to replace
       NON_ASCII_CHARS = 'ÀÁÂÃÄÅàáâãäåĀāĂ㥹ÇçĆćĈĉĊċČčÐðĎďĐđÈÉÊËèéêëĒēĔĕĖėĘęĚěĜĝĞğĠġĢģĤĥĦħÌÍÎÏìíîïĨĩĪīĬĭĮįİıĴĵĶķĸĹĺĻļĽľĿŀŁłÑñŃńŅņŇňʼnŊŋÒÓÔÕÖØòóôõöøŌōŎŏŐőŔŕŖŗŘřŚśŜŝŞşŠšſŢţŤťŦŧÙÚÛÜùúûüŨũŪūŬŭŮůŰűŲųŴŵÝýÿŶŷŸŹźŻżŽž'
+      # "Equivalent" ASCII characters
       EQUIVALENT_ASCII_CHARS = 'AAAAAAaaaaaaAaAaAaCcCcCcCcCcDdDdDdEEEEeeeeEeEeEeEeEeGgGgGgGgHhHhIIIIiiiiIiIiIiIiIiJjKkkLlLlLlLlLlNnNnNnNnnNnOOOOOOooooooOoOoOoRrRrRrSsSsSsSssTtTtTtUUUUuuuuUuUuUuUuUuUuWwYyyYyYZzZzZz'
 
       # Perform the filter
data/lib/text_rank/keyword_extractor.rb
CHANGED
@@ -14,7 +14,7 @@ module TextRank
     def self.basic(**options)
       new(**{
         char_filters: [:AsciiFolding, :Lowercase],
-
+        tokenizers: [:Word],
         token_filters: [:Stopwords, :MinLength],
         graph_strategy: :Coocurrence,
       }.merge(options))
@@ -26,27 +26,27 @@ module TextRank
     def self.advanced(**options)
       new(**{
         char_filters: [:AsciiFolding, :Lowercase, :StripHtml, :StripEmail, :UndoContractions, :StripPossessive],
-
+        tokenizers: [:Url, :Money, :Number, :Word, :Punctuation],
         token_filters: [:PartOfSpeech, :Stopwords, :MinLength],
         graph_strategy: :Coocurrence,
-        rank_filters: [:CollapseAdjacent],
+        rank_filters: [:CollapseAdjacent, :NormalizeUnitVector, :SortByValue],
       }.merge(options))
     end
 
     # @option (see PageRank.new)
     # @option options [Array<Class, Symbol, #filter!>] :char_filters A list of filters to be applied prior to tokenization
-    # @option options [
+    # @option options [Array<Symbol, Regexp, String>] :tokenizers A list of tokenizer regular expressions to perform tokenization
     # @option options [Array<Class, Symbol, #filter!>] :token_filters A list of filters to be applied to each token after tokenization
    # @option options [Class, Symbol, #build_graph] :graph_strategy A class or strategy instance for producing a graph from tokens
     # @option options [Array<Class, Symbol, #filter!>] :rank_filters A list of filters to be applied to the keyword ranks after keyword extraction
     def initialize(**options)
       @page_rank_options = {
-        strategy: options[:strategy] || :
+        strategy: options[:strategy] || :dense,
         damping: options[:damping],
         tolerance: options[:tolerance],
       }
       @char_filters = options[:char_filters] || []
-      @
+      @tokenizers = options[:tokenizers] || [Tokenizer::Word]
       @token_filters = options[:token_filters] || []
       @rank_filters = options[:rank_filters] || []
       @graph_strategy = options[:graph_strategy] || GraphStrategy::Coocurrence
@@ -61,11 +61,13 @@ module TextRank
       nil
     end
 
-    #
-    # @param tokenizer [
-    # @
-
-
+    # Add a tokenizer regular expression for producing tokens from filtered text
+    # @param tokenizer [Symbol, Regexp, String] Tokenizer regular expression
+    # @param (see #add_into)
+    # @return [nil]
+    def add_tokenizer(tokenizer, **options)
+      add_into(@tokenizers, tokenizer, **options)
+      nil
     end
 
     # Sets the graph strategy for producing a graph from tokens
@@ -98,7 +100,7 @@ module TextRank
     # @return [Array<String>] tokens
     def tokenize(text)
       filtered_text = apply_char_filters(text)
-      tokens =
+      tokens = Tokenizer.tokenize(filtered_text, *tokenizer_regular_expressions)
       apply_token_filters(tokens)
     end
 
@@ -121,6 +123,17 @@ module TextRank
       end
     end
 
+    def tokenizer_regular_expressions
+      @tokenizers.map do |t|
+        case t
+        when Symbol
+          Tokenizer.const_get(t)
+        else
+          t
+        end
+      end
+    end
+
     def apply_token_filters(tokens)
       @token_filters.reduce(tokens) do |t, f|
         classify(f, context: TokenFilter).filter!(t) || t
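In 1.1.5 the single-tokenizer configuration becomes a :tokenizers list: each entry is a regular expression (or a Symbol naming one of the built-in Tokenizer constants) that gets combined with the others at tokenization time. A minimal usage sketch under that assumption, assuming KeywordExtractor#extract behaves as in earlier 1.1.x releases; the sample text and the date pattern are invented for illustration. Note the capture group in the custom regex: Tokenizer.tokenize keeps the first non-nil group of each match, so a group-less pattern would produce no tokens.

    require 'text_rank'

    extractor = TextRank::KeywordExtractor.advanced(
      tokenizers: [:Url, :Money, :Number, :Word, :Punctuation]
    )
    extractor.add_tokenizer(/(\d{4}-\d{2}-\d{2})/) # keep ISO-style dates as single tokens
    keywords = extractor.extract('The siege of the town began on 1204-04-12')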
data/lib/text_rank/rank_filter.rb
CHANGED
@@ -12,7 +12,10 @@ module TextRank
   ##
   module RankFilter
 
-    autoload :CollapseAdjacent,
+    autoload :CollapseAdjacent,     'text_rank/rank_filter/collapse_adjacent'
+    autoload :NormalizeProbability, 'text_rank/rank_filter/normalize_probability'
+    autoload :NormalizeUnitVector,  'text_rank/rank_filter/normalize_unit_vector'
+    autoload :SortByValue,          'text_rank/rank_filter/sort_by_value'
 
   end
 end
data/lib/text_rank/rank_filter/collapse_adjacent.rb
CHANGED
@@ -62,7 +62,7 @@ module TextRank
      # @option options [Fixnum] ranks_to_collapse the top N ranks in which to look for collapsable keywords
      # @option options [Fixnum] max_tokens_to_combine the maximum number of tokens to collapse into a combined keyword
      # @option options [true, false] ignore_case whether to ignore case when finding adjacent keywords in original text
-      # @
+      # @option options [String] delimiter an optional delimiter between adjacent keywords in original text
      def initialize(**options)
        @options = options
      end
@@ -75,8 +75,6 @@ module TextRank
        TokenCollapser.new(tokens: ranks, text: original_text, **@options).collapse
      end
 
-      private
-
      class TokenCollapser
 
        def initialize(tokens:, text:, ranks_to_collapse: 10, max_tokens_to_combine: 2, ignore_case: true, delimiter: ' ', **_)
@@ -90,14 +88,16 @@ module TextRank
 
          @to_collapse = Set.new # Track the permutations we plan to collapse
          @to_remove = Set.new # Track the single tokens we plan to remove (due to being collapsed)
-          @permutations_scanned =
+          @permutations_scanned = Hash.new(0.0) # Track how many occurrences of each permutation we found in the original text
          @combination_significance_threshold = 0.3 # The percent of occurrences of a combo of tokens to the occurrences of single tokens required to force collapsing
        end
 
+        # :nodoc:
        def delimiter_re
          @delimiter_re ||= /#{@delimiter}+/
        end
 
+        # :nodoc:
        def collapse
          # We make multiple passes at collapsing because after the first pass we may have
          # replaced two or more singletons with a collapsed token, bumping up one or more
@@ -118,11 +118,7 @@ module TextRank
          end
          @tokens.reject! do |k, _|
            @to_remove.include?(k)
-          end
-
-          # Because we've made changes to the tokens hash, we need to re-normalize so that
-          # the sum of all token ranks is still 1.
-          normalize(@tokens)
+          end || @tokens
        end
 
        # We need to be efficient about how we search for the large number of possible collapsed keywords.
@@ -204,14 +200,10 @@ module TextRank
          total_single_count.zero? || (perm_count / total_single_count) > @combination_significance_threshold
        end
 
-        # Scale all of the token ranks so they add up to 1.
-        def normalize(tokens)
-          total = tokens.reduce(0.0) { |s, (_, v)| s + v }
-          Hash[tokens.map { |k, v| [k, v / total] }.sort_by { |_, v| -v }]
-        end
-
      end
 
+      private_constant :TokenCollapser
+
    end
  end
 end
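CollapseAdjacent no longer re-normalizes the ranks itself, so after collapsing the values no longer automatically sum to 1. If the old behavior is wanted, the new NormalizeProbability filter can be chained after it; a hedged sketch using only the filter names added in this release:

    extractor = TextRank::KeywordExtractor.advanced(
      rank_filters: [:CollapseAdjacent, :NormalizeProbability, :SortByValue]
    )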
data/lib/text_rank/rank_filter/normalize_probability.rb
ADDED
@@ -0,0 +1,53 @@
+module TextRank
+  module RankFilter
+    ##
+    # A rank filter which normalizes the ranked keywords so that the sum of the
+    # rank values is 1.0 (a "probability" normalization).
+    #
+    # = Example
+    #
+    #  NormalizeProbability.new.filter!(
+    #    {
+    #      "town"        => 0.6818754334834477,
+    #      "cities"      => 0.6055017128817066,
+    #      "siege"       => 0.5411519524982207,
+    #      "arts"        => 0.4907977453782612,
+    #      "envy"        => 0.4692709808107252,
+    #      "blessings"   => 0.4442147897516214,
+    #      "plagues"     => 0.3972420789430091,
+    #      "florish"     => 0.2746092797528525,
+    #      "devoured"    => 0.26867321734332237,
+    #      "anxieties"   => 0.2367731719604189,
+    #      "peace"       => 0.1905352582752693,
+    #      "inhabitants" => 0.02715120116732137,
+    #    }
+    #  )
+    #  => {
+    #    "town"        => 0.1473434248897056,
+    #    "cities"      => 0.13084016782478722,
+    #    "siege"       => 0.11693511476062682,
+    #    "arts"        => 0.10605429845557579,
+    #    "envy"        => 0.10140267579486278,
+    #    "blessings"   => 0.09598839508602595,
+    #    "plagues"     => 0.08583827125543537,
+    #    "florish"     => 0.0593390959673909,
+    #    "devoured"    => 0.058056398684529435,
+    #    "anxieties"   => 0.051163259981992296,
+    #    "peace"       => 0.041171915188530236,
+    #    "inhabitants" => 0.005866982110537665,
+    #  }
+    ##
+    class NormalizeProbability
+
+      # Perform the filter on the ranks
+      # @param ranks [Hash<String, Float>] the results of the PageRank algorithm
+      # @return [Hash<String, Float>]
+      def filter!(ranks, **_)
+        return if ranks.empty?
+        total = ranks.values.reduce(:+)
+        Hash[ranks.map { |k, v| [k, v / total] }]
+      end
+
+    end
+  end
+end
data/lib/text_rank/rank_filter/normalize_unit_vector.rb
ADDED
@@ -0,0 +1,54 @@
+module TextRank
+  module RankFilter
+    ##
+    # A rank filter which normalizes the ranked keywords so that the sum of the
+    # squares of the rank values is 1.0 (and thus the keyword rankings in an
+    # N-vector space is a unit vector).
+    #
+    # = Example
+    #
+    #  NormalizeUnitVector.new.filter!(
+    #    {
+    #      "town"        => 0.6818754334834477,
+    #      "cities"      => 0.6055017128817066,
+    #      "siege"       => 0.5411519524982207,
+    #      "arts"        => 0.4907977453782612,
+    #      "envy"        => 0.4692709808107252,
+    #      "blessings"   => 0.4442147897516214,
+    #      "plagues"     => 0.3972420789430091,
+    #      "florish"     => 0.2746092797528525,
+    #      "devoured"    => 0.26867321734332237,
+    #      "anxieties"   => 0.2367731719604189,
+    #      "peace"       => 0.1905352582752693,
+    #      "inhabitants" => 0.02715120116732137,
+    #    }
+    #  )
+    #  => {
+    #    "town"        => 0.4616807998499129,
+    #    "cities"      => 0.40997006401243896,
+    #    "siege"       => 0.3664004508761722,
+    #    "arts"        => 0.3323068767754191,
+    #    "envy"        => 0.317731642948694,
+    #    "blessings"   => 0.30076672272820315,
+    #    "plagues"     => 0.2689626751964553,
+    #    "florish"     => 0.18593107435301526,
+    #    "devoured"    => 0.1819119149778339,
+    #    "anxieties"   => 0.16031319218415677,
+    #    "peace"       => 0.12900665740478157,
+    #    "inhabitants" => 0.01838339916101275,
+    #  }
+    ##
+    class NormalizeUnitVector
+
+      # Perform the filter on the ranks
+      # @param ranks [Hash<String, Float>] the results of the PageRank algorithm
+      # @return [Hash<String, Float>]
+      def filter!(ranks, **_)
+        return if ranks.empty?
+        total = Math.sqrt(ranks.values.map { |v| v * v }.reduce(:+))
+        Hash[ranks.map { |k, v| [k, v / total] }]
+      end
+
+    end
+  end
+end
data/lib/text_rank/rank_filter/sort_by_value.rb
ADDED
@@ -0,0 +1,22 @@
+module TextRank
+  module RankFilter
+    ##
+    # A rank filter which sorts the results by value
+    ##
+    class SortByValue
+
+      # @param descending [boolean] whether to sort in descending order
+      def initialize(descending: true)
+        @descending = !!descending
+      end
+
+      # Perform the filter on the ranks
+      # @param ranks [Hash<String, Float>] the results of the PageRank algorithm
+      # @return [Hash<String, Float>]
+      def filter!(ranks, **_)
+        Hash[ranks.sort_by { |_, v| @descending ? -v : v }]
+      end
+
+    end
+  end
+end
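The three new filters compose: each takes the ranks hash and returns a transformed copy, so they can be applied in sequence. A small sketch applying them directly; the input hash is abbreviated from the doc examples above:

    ranks = {
      'town'   => 0.6818754334834477,
      'cities' => 0.6055017128817066,
    }
    ranks = TextRank::RankFilter::NormalizeUnitVector.new.filter!(ranks)
    ranks = TextRank::RankFilter::SortByValue.new.filter!(ranks)
    # NormalizeProbability would divide by the plain sum instead, so values add to 1.0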
data/lib/text_rank/tokenizer.rb
CHANGED
@@ -8,12 +8,33 @@ module TextRank
   # help inform its decision on which tokens to keep and which to drop. An example
   # of this is the part of speech token filter which uses punctuation tokens to
   # help guess the part of speech of each non-punctuation token.
+  #
+  # When tokenizing a piece of text, the Tokenizer will combine one or more
+  # regular expressions (in the order given) to scan the text for matches. As such
+  # you need only tell the tokenizer which tokens you want; everything else will
+  # be ignored.
   ##
   module Tokenizer
 
-    autoload :
-    autoload :
-    autoload :
+    autoload :Money,       'text_rank/tokenizer/money'
+    autoload :Number,      'text_rank/tokenizer/number'
+    autoload :Punctuation, 'text_rank/tokenizer/punctuation'
+    autoload :Url,         'text_rank/tokenizer/url'
+    autoload :Whitespace,  'text_rank/tokenizer/whitespace'
+    autoload :Word,        'text_rank/tokenizer/word'
+
+    # Performs tokenization of piece of text by one or more tokenizer regular expressions.
+    # @param text [String]
+    # @param regular_expressions [Array<Regexp|String>]
+    # @return [Array<String>]
+    def self.tokenize(text, *regular_expressions)
+      tokens = []
+      text.scan(Regexp.new(regular_expressions.flatten.join('|'))) do |matches|
+        m = matches.compact.first
+        tokens << m if m && m.size > 0
+      end
+      tokens
+    end
 
   end
 end
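Because Tokenizer.tokenize joins the given expressions with '|' and keeps the first non-nil capture group of each match, the order of the expressions decides which tokenizer wins when more than one could match at the same position. A quick sketch; the input string is invented and the output is the expected result, not taken from the gem's tests:

    require 'text_rank'

    TextRank::Tokenizer.tokenize(
      'It costs $5.00, maybe less.',
      TextRank::Tokenizer::Money,
      TextRank::Tokenizer::Word,
      TextRank::Tokenizer::Punctuation
    )
    # => ["It", "costs", "$5.00", ",", "maybe", "less", "."]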
data/lib/text_rank/tokenizer/money.rb
ADDED
@@ -0,0 +1,76 @@
+#encoding: UTF-8
+module TextRank
+  module Tokenizer
+
+    CURRENCY_SYMBOLS = '[' + [
+      "\u00a4", # Generic Currency Symbol
+      "\u0024", # Dollar Sign
+      "\u00a2", # Cent Sign
+      "\u00a3", # Pound Sterling
+      "\u00a5", # Yen Symbol
+      "\u20a3", # Franc Sign
+      "\u20a4", # Lira Symbol
+      "\u20a7", # Peseta Sign
+      "\u20ac", # Euro Symbol
+      "\u20B9", # Rupee
+      "\u20a9", # Won Sign
+      "\u20b4", # Hryvnia Sign
+      "\u20af", # Drachma Sign
+      "\u20ae", # Tugrik Sign
+      "\u20b0", # German Penny Sign
+      "\u20b2", # Guarani Sign
+      "\u20b1", # Peso Sign
+      "\u20b3", # Austral Sign
+      "\u20b5", # Cedi Sign
+      "\u20ad", # Kip Sign
+      "\u20aa", # New Sheqel Sign
+      "\u20ab", # Dong Sign
+      "\u0025", # Percent
+      "\u2030", # Per Million
+    ].join + ']'
+    private_constant :CURRENCY_SYMBOLS # Do not expose this to avoid confusion
+
+    ##
+    # A tokenizer regex that preserves money or formatted numbers as a single token. This
+    # currently supports 24 different currency symbols:
+    #
+    # * ¤
+    # * $
+    # * ¢
+    # * £
+    # * ¥
+    # * ₣
+    # * ₤
+    # * ₧
+    # * €
+    # * ₹
+    # * ₩
+    # * ₴
+    # * ₯
+    # * ₮
+    # * ₰
+    # * ₲
+    # * ₱
+    # * ₳
+    # * ₵
+    # * ₭
+    # * ₪
+    # * ₫
+    # * %
+    # * ‰
+    #
+    # It also supports two alternative formats for negatives as well as optional three digit comma
+    # separation and optional decimals.
+    ##
+    Money = %r{
+      (
+        #{CURRENCY_SYMBOLS} \-? #{Number}    # $-45,231.21
+        |
+        \-? #{CURRENCY_SYMBOLS} #{Number}    # -$45,231.21
+        |
+        \( #{CURRENCY_SYMBOLS} #{Number} \)  # ($45,231.21)
+      )
+    }x
+
+  end
+end
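Since Money interpolates the Number pattern, it accepts the same comma-grouped and decimal forms with a currency symbol attached, in any of the three layouts shown in the comments. Two hedged spot checks (expected matches, not from the gem's test suite):

    '-$45,231.21'[TextRank::Tokenizer::Money]  # => "-$45,231.21"
    '($45,231.21)'[TextRank::Tokenizer::Money] # => "($45,231.21)"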
data/lib/text_rank/tokenizer/number.rb
ADDED
@@ -0,0 +1,31 @@
+#encoding: UTF-8
+module TextRank
+  module Tokenizer
+
+    ##
+    # A tokenizer regex that preserves (optionally formatted) numbers as a single token.
+    ##
+    Number = %r{
+      (
+        [1-9]\d{0,2}    # 453
+        (?:,\d{3})*     # 453,231,162
+        (?:\.\d{0,2})?  # 453,231,162.17
+
+        |
+
+        [1-9]\d*        # 453231162
+        (?:\.\d{0,2})?  # 453231162.17
+
+        |
+
+        0               # 0
+        (?:\.\d{0,2})?  # 0.17
+
+        |
+
+        (?:\.\d{1,2})   # .17
+      )
+    }x
+
+  end
+end
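The four alternatives cover comma-grouped, plain, zero-led, and bare-decimal forms; since alternation is ordered, the comma-grouped branch is tried first. Expected behavior when the regex is used on its own (unverified):

    '453,231,162.17'[TextRank::Tokenizer::Number] # => "453,231,162.17"
    '.17'[TextRank::Tokenizer::Number]            # => ".17"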
data/lib/text_rank/tokenizer/punctuation.rb
ADDED
@@ -0,0 +1,11 @@
+module TextRank
+  module Tokenizer
+    ##
+    # A tokenizer regex that preserves single punctuation symbols as a token. Use
+    # this if one or more of your TokenFilter classes need punctuation in order to
+    # make decisions.
+    ##
+    Punctuation = %r{([\p{Punct}])}
+
+  end
+end
data/lib/text_rank/tokenizer/url.rb
ADDED
@@ -0,0 +1,21 @@
+module TextRank
+  module Tokenizer
+    ##
+    # A tokenizer regex that preserves entire URL's as a token (rather than split them up)
+    ##
+    Url = %r{
+      (
+        (?:[\w-]+://?|www[.])
+        [^\s()<>]+
+        (?:
+          \([\w\d]+\)
+          |
+          (?:[^[:punct:]\s]
+          |
+          /)
+        )
+      )
+    }xi
+
+  end
+end
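The pattern anchors on a scheme or a leading 'www.' and its final group forbids the token from ending in punctuation (a trailing slash or a balanced parenthesis is allowed), so trailing sentence punctuation stays out of the token. A hedged spot check:

    'see www.example.com/foo, then reply'[TextRank::Tokenizer::Url]
    # => "www.example.com/foo" (expected; the trailing comma is excluded)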
data/lib/text_rank/tokenizer/whitespace.rb
CHANGED
@@ -1,19 +1,11 @@
 module TextRank
   module Tokenizer
     ##
-    #
-    #
-    #
-    #
-    #  Whitespace.new.tokenize("i should:like to know:which is worse.")
-    #  => ["i", "should:like", "to", "know:which", "is", "worse."]
+    # A tokenizer regex that preserves single whitespace characters as a token. Use
+    # this if one or more of your TokenFilter classes need whitespace in order to
+    # make decisions.
     ##
-
+    Whitespace = %r{\s}
 
-      def initialize
-        super(/\s+/)
-      end
-
-    end
   end
 end
data/lib/text_rank/tokenizer/word.rb
ADDED
@@ -0,0 +1,14 @@
+module TextRank
+  module Tokenizer
+    ##
+    # A tokenizer regex that preserves a non-space, non-punctuation "word". It does
+    # allow hyphens and numerals, but the first character must be an A-Z character.
+    ##
+    Word = %r{
+      (
+        [a-z][a-z0-9-]*
+      )
+    }xi
+
+  end
+end
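Because the first character must be a letter, leading digits are not absorbed into the token. Two expected (unverified) examples:

    'well-known'[TextRank::Tokenizer::Word] # => "well-known"
    '3rd'[TextRank::Tokenizer::Word]        # => "rd" (the leading digit is dropped)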
data/lib/text_rank/version.rb
CHANGED
data/text_rank.gemspec
CHANGED
@@ -10,7 +10,7 @@ Gem::Specification.new do |spec|
   spec.email = ['david.mccullars@gmail.com']
 
   spec.summary = %q{Implementation of TextRank solution to ranked keyword extraction}
-  spec.description = %q{See https://
+  spec.description = %q{Implementation of TextRank solution to ranked keyword extraction. See https://web.eecs.umich.edu/~mihalcea/papers/mihalcea.emnlp04.pdf}
   spec.homepage = 'https://github.com/david-mccullars/text_rank'
   spec.license = 'MIT'
 
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: text_rank
 version: !ruby/object:Gem::Version
-  version: 1.1.1
+  version: 1.1.5
 platform: ruby
 authors:
 - David McCullars
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2016-05-
+date: 2016-05-23 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: bundler
@@ -108,7 +108,8 @@ dependencies:
   - - "~>"
   - !ruby/object:Gem::Version
     version: '1.0'
-description:
+description: Implementation of TextRank solution to ranked keyword extraction. See
+  https://web.eecs.umich.edu/~mihalcea/papers/mihalcea.emnlp04.pdf
 email:
 - david.mccullars@gmail.com
 executables: []
@@ -145,14 +146,20 @@ files:
 - lib/text_rank/keyword_extractor.rb
 - lib/text_rank/rank_filter.rb
 - lib/text_rank/rank_filter/collapse_adjacent.rb
+- lib/text_rank/rank_filter/normalize_probability.rb
+- lib/text_rank/rank_filter/normalize_unit_vector.rb
+- lib/text_rank/rank_filter/sort_by_value.rb
 - lib/text_rank/token_filter.rb
 - lib/text_rank/token_filter/min_length.rb
 - lib/text_rank/token_filter/part_of_speech.rb
 - lib/text_rank/token_filter/stopwords.rb
 - lib/text_rank/tokenizer.rb
-- lib/text_rank/tokenizer/regex.rb
+- lib/text_rank/tokenizer/money.rb
+- lib/text_rank/tokenizer/number.rb
+- lib/text_rank/tokenizer/punctuation.rb
+- lib/text_rank/tokenizer/url.rb
 - lib/text_rank/tokenizer/whitespace.rb
-- lib/text_rank/tokenizer/words_and_punctuation.rb
+- lib/text_rank/tokenizer/word.rb
 - lib/text_rank/version.rb
 - text_rank.gemspec
 homepage: https://github.com/david-mccullars/text_rank
@@ -180,4 +187,3 @@ signing_key:
 specification_version: 4
 summary: Implementation of TextRank solution to ranked keyword extraction
 test_files: []
-has_rdoc:
data/lib/text_rank/tokenizer/regex.rb
DELETED
@@ -1,26 +0,0 @@
-module TextRank
-  module Tokenizer
-    ##
-    # Base tokenizer that tokenizes on any regular expression
-    #
-    # = Example
-    #
-    #  Regex.new(/:/).tokenize("i should:like to know:which is worse.")
-    #  => ["i should", "like to know", "which is worse"]
-    ##
-    class Regex
-
-      # @param regex [Regexp] to use for string splitting
-      def initialize(regex)
-        @regex = regex
-      end
-
-      # @param text [String] string to tokenize
-      # return [Array<String>] non-empty tokens
-      def tokenize(text)
-        text.split(@regex) - ['']
-      end
-
-    end
-  end
-end
data/lib/text_rank/tokenizer/words_and_punctuation.rb
DELETED
@@ -1,26 +0,0 @@
-module TextRank
-  module Tokenizer
-    ##
-    # A tokenizer that preserves punctuation as their own tokens (which can be
-    # used, for example, by the [TokenFilter::PartOfSpeechBase] filter).
-    #
-    # = Example
-    #
-    #  WordsAndPunctuation.new.tokenize("i should:like to know:which is worse.")
-    #  => ["i", "should", ":", "like", "to", "know", ":", "which", "is", "worse", "."]
-    ##
-    class WordsAndPunctuation < Regex
-
-      def initialize
-        super(/
-          ([a-z][a-z0-9-]+)
-          |
-          ([\p{Punct}])
-          |
-          \s+
-        /xi)
-      end
-
-    end
-  end
-end