text_rank 1.1.7 → 1.2.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +5 -5
- data/.codeclimate.yml +1 -6
- data/.rubocop.yml +60 -1075
- data/.ruby-version +1 -1
- data/.travis.yml +13 -5
- data/{LICENSE.txt → LICENSE} +0 -0
- data/README.md +2 -1
- data/bin/console +3 -3
- data/lib/page_rank.rb +2 -0
- data/lib/page_rank/base.rb +9 -8
- data/lib/page_rank/dense.rb +2 -1
- data/lib/page_rank/sparse.rb +6 -7
- data/lib/text_rank.rb +12 -9
- data/lib/text_rank/char_filter.rb +1 -1
- data/lib/text_rank/char_filter/ascii_folding.rb +5 -1
- data/lib/text_rank/char_filter/strip_possessive.rb +2 -2
- data/lib/text_rank/char_filter/undo_contractions.rb +1 -137
- data/lib/text_rank/char_filter/undo_contractions.yml +135 -0
- data/lib/text_rank/fingerprint.rb +20 -28
- data/lib/text_rank/fingerprint_overlap.rb +55 -0
- data/lib/text_rank/graph_strategy/coocurrence.rb +15 -6
- data/lib/text_rank/keyword_extractor.rb +32 -25
- data/lib/text_rank/rank_filter/collapse_adjacent.rb +53 -25
- data/lib/text_rank/rank_filter/normalize_probability.rb +2 -1
- data/lib/text_rank/rank_filter/normalize_unit_vector.rb +2 -1
- data/lib/text_rank/token_filter/part_of_speech.rb +0 -1
- data/lib/text_rank/token_filter/stopwords.rb +1 -321
- data/lib/text_rank/token_filter/stopwords.yml +317 -0
- data/lib/text_rank/tokenizer.rb +1 -1
- data/lib/text_rank/tokenizer/money.rb +11 -6
- data/lib/text_rank/tokenizer/number.rb +4 -3
- data/lib/text_rank/tokenizer/punctuation.rb +4 -1
- data/lib/text_rank/tokenizer/url.rb +3 -0
- data/lib/text_rank/tokenizer/whitespace.rb +4 -1
- data/lib/text_rank/tokenizer/word.rb +5 -2
- data/lib/text_rank/version.rb +3 -1
- data/text_rank.gemspec +10 -10
- metadata +48 -32
data/lib/text_rank/tokenizer.rb
CHANGED
@@ -1,4 +1,3 @@
|
|
1
|
-
#encoding: UTF-8
|
2
1
|
module TextRank
|
3
2
|
module Tokenizer
|
4
3
|
|
@@ -12,7 +11,7 @@ module TextRank
|
|
12
11
|
"\u20a4", # Lira Symbol
|
13
12
|
"\u20a7", # Peseta Sign
|
14
13
|
"\u20ac", # Euro Symbol
|
15
|
-
"\u20B9", # Rupee
|
14
|
+
"\u20B9", # Rupee
|
16
15
|
"\u20a9", # Won Sign
|
17
16
|
"\u20b4", # Hryvnia Sign
|
18
17
|
"\u20af", # Drachma Sign
|
@@ -34,6 +33,8 @@ module TextRank
|
|
34
33
|
# A tokenizer regex that preserves money or formatted numbers as a single token. This
|
35
34
|
# currently supports 24 different currency symbols:
|
36
35
|
#
|
36
|
+
# rubocop:disable Style/AsciiComments
|
37
|
+
#
|
37
38
|
# * ¤
|
38
39
|
# * $
|
39
40
|
# * ¢
|
@@ -58,19 +59,23 @@ module TextRank
|
|
58
59
|
# * ₫
|
59
60
|
# * %
|
60
61
|
# * ‰
|
62
|
+
|
63
|
+
# rubocop:enable Style/AsciiComments
|
61
64
|
#
|
62
65
|
# It also supports two alternative formats for negatives as well as optional three digit comma
|
63
66
|
# separation and optional decimals.
|
64
67
|
##
|
65
|
-
|
68
|
+
# rubocop:disable Naming/ConstantName
|
69
|
+
Money = /
|
66
70
|
(
|
67
|
-
#{CURRENCY_SYMBOLS}
|
71
|
+
#{CURRENCY_SYMBOLS} -? #{Number} # $-45,231.21
|
68
72
|
|
|
69
|
-
|
73
|
+
-? #{CURRENCY_SYMBOLS} #{Number} # -$45,231.21
|
70
74
|
|
|
71
75
|
\( #{CURRENCY_SYMBOLS} #{Number} \) # ($45,231.21)
|
72
76
|
)
|
73
|
-
|
77
|
+
/x
|
78
|
+
# rubocop:enable Naming/ConstantName
|
74
79
|
|
75
80
|
end
|
76
81
|
end
|
@@ -1,11 +1,11 @@
|
|
1
|
-
#encoding: UTF-8
|
2
1
|
module TextRank
|
3
2
|
module Tokenizer
|
4
3
|
|
5
4
|
##
|
6
5
|
# A tokenizer regex that preserves (optionally formatted) numbers as a single token.
|
7
6
|
##
|
8
|
-
|
7
|
+
# rubocop:disable Naming/ConstantName
|
8
|
+
Number = /
|
9
9
|
(
|
10
10
|
[1-9]\d{3,} # 453231162
|
11
11
|
(?:\.\d+)? # 453231162.17
|
@@ -25,7 +25,8 @@ module TextRank
|
|
25
25
|
|
26
26
|
(?:\.\d+) # .17
|
27
27
|
)
|
28
|
-
|
28
|
+
/x
|
29
|
+
# rubocop:enable Naming/ConstantName
|
29
30
|
|
30
31
|
end
|
31
32
|
end
|
@@ -1,11 +1,14 @@
|
|
1
1
|
module TextRank
|
2
2
|
module Tokenizer
|
3
|
+
|
3
4
|
##
|
4
5
|
# A tokenizer regex that preserves single punctuation symbols as a token. Use
|
5
6
|
# this if one or more of your TokenFilter classes need punctuation in order to
|
6
7
|
# make decisions.
|
7
8
|
##
|
8
|
-
|
9
|
+
# rubocop:disable Naming/ConstantName
|
10
|
+
Punctuation = /(\p{Punct})/
|
11
|
+
# rubocop:enable Naming/ConstantName
|
9
12
|
|
10
13
|
end
|
11
14
|
end
|
@@ -1,8 +1,10 @@
|
|
1
1
|
module TextRank
|
2
2
|
module Tokenizer
|
3
|
+
|
3
4
|
##
|
4
5
|
# A tokenizer regex that preserves entire URL's as a token (rather than split them up)
|
5
6
|
##
|
7
|
+
# rubocop:disable Naming/ConstantName
|
6
8
|
Url = %r{
|
7
9
|
(
|
8
10
|
(?:[\w-]+://?|www[.])
|
@@ -16,6 +18,7 @@ module TextRank
|
|
16
18
|
)
|
17
19
|
)
|
18
20
|
}xi
|
21
|
+
# rubocop:enable Naming/ConstantName
|
19
22
|
|
20
23
|
end
|
21
24
|
end
|
@@ -1,11 +1,14 @@
|
|
1
1
|
module TextRank
|
2
2
|
module Tokenizer
|
3
|
+
|
3
4
|
##
|
4
5
|
# A tokenizer regex that preserves single whitespace characters as a token. Use
|
5
6
|
# this if one or more of your TokenFilter classes need whitespace in order to
|
6
7
|
# make decisions.
|
7
8
|
##
|
8
|
-
|
9
|
+
# rubocop:disable Naming/ConstantName
|
10
|
+
Whitespace = /\s/
|
11
|
+
# rubocop:enable Naming/ConstantName
|
9
12
|
|
10
13
|
end
|
11
14
|
end
|
@@ -1,14 +1,17 @@
|
|
1
1
|
module TextRank
|
2
2
|
module Tokenizer
|
3
|
+
|
3
4
|
##
|
4
5
|
# A tokenizer regex that preserves a non-space, non-punctuation "word". It does
|
5
6
|
# allow hyphens and numerals, but the first character must be an A-Z character.
|
6
7
|
##
|
7
|
-
|
8
|
+
# rubocop:disable Naming/ConstantName
|
9
|
+
Word = /
|
8
10
|
(
|
9
11
|
[a-z][a-z0-9-]*
|
10
12
|
)
|
11
|
-
|
13
|
+
/xi
|
14
|
+
# rubocop:enable Naming/ConstantName
|
12
15
|
|
13
16
|
end
|
14
17
|
end
|
data/lib/text_rank/version.rb
CHANGED
data/text_rank.gemspec
CHANGED
@@ -1,4 +1,3 @@
|
|
1
|
-
# coding: utf-8
|
2
1
|
lib = File.expand_path('../lib', __FILE__)
|
3
2
|
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
|
4
3
|
require 'text_rank/version'
|
@@ -9,8 +8,8 @@ Gem::Specification.new do |spec|
|
|
9
8
|
spec.authors = ['David McCullars']
|
10
9
|
spec.email = ['david.mccullars@gmail.com']
|
11
10
|
|
12
|
-
spec.summary =
|
13
|
-
spec.description =
|
11
|
+
spec.summary = 'Implementation of TextRank solution to ranked keyword extraction'
|
12
|
+
spec.description = 'Implementation of TextRank solution to ranked keyword extraction. See https://web.eecs.umich.edu/~mihalcea/papers/mihalcea.emnlp04.pdf'
|
14
13
|
spec.homepage = 'https://github.com/david-mccullars/text_rank'
|
15
14
|
spec.license = 'MIT'
|
16
15
|
|
@@ -19,12 +18,13 @@ Gem::Specification.new do |spec|
|
|
19
18
|
spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
|
20
19
|
spec.require_paths = ['lib']
|
21
20
|
|
22
|
-
spec.add_development_dependency 'bundler'
|
23
|
-
spec.add_development_dependency 'rake'
|
24
|
-
spec.add_development_dependency 'rspec'
|
25
|
-
spec.add_development_dependency '
|
26
|
-
spec.add_development_dependency '
|
21
|
+
spec.add_development_dependency 'bundler'
|
22
|
+
spec.add_development_dependency 'rake'
|
23
|
+
spec.add_development_dependency 'rspec'
|
24
|
+
spec.add_development_dependency 'rubocop'
|
25
|
+
spec.add_development_dependency 'simplecov', '~> 0.17.0' # 0.18 not supported by code climate
|
26
|
+
spec.add_development_dependency 'yard'
|
27
27
|
|
28
|
-
spec.add_development_dependency 'engtagger'
|
29
|
-
spec.add_development_dependency 'nokogiri'
|
28
|
+
spec.add_development_dependency 'engtagger' # Optional runtime dependency but needed for specs
|
29
|
+
spec.add_development_dependency 'nokogiri' # Optional runtime dependency but needed for specs
|
30
30
|
end
|
metadata
CHANGED
@@ -1,73 +1,87 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: text_rank
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.
|
4
|
+
version: 1.2.5
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- David McCullars
|
8
|
-
autorequire:
|
8
|
+
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2021-01-27 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
15
15
|
requirement: !ruby/object:Gem::Requirement
|
16
16
|
requirements:
|
17
|
-
- - "
|
17
|
+
- - ">="
|
18
18
|
- !ruby/object:Gem::Version
|
19
|
-
version: '
|
19
|
+
version: '0'
|
20
20
|
type: :development
|
21
21
|
prerelease: false
|
22
22
|
version_requirements: !ruby/object:Gem::Requirement
|
23
23
|
requirements:
|
24
|
-
- - "
|
24
|
+
- - ">="
|
25
25
|
- !ruby/object:Gem::Version
|
26
|
-
version: '
|
26
|
+
version: '0'
|
27
27
|
- !ruby/object:Gem::Dependency
|
28
28
|
name: rake
|
29
29
|
requirement: !ruby/object:Gem::Requirement
|
30
30
|
requirements:
|
31
|
-
- - "
|
31
|
+
- - ">="
|
32
32
|
- !ruby/object:Gem::Version
|
33
|
-
version: '
|
33
|
+
version: '0'
|
34
34
|
type: :development
|
35
35
|
prerelease: false
|
36
36
|
version_requirements: !ruby/object:Gem::Requirement
|
37
37
|
requirements:
|
38
|
-
- - "
|
38
|
+
- - ">="
|
39
39
|
- !ruby/object:Gem::Version
|
40
|
-
version: '
|
40
|
+
version: '0'
|
41
41
|
- !ruby/object:Gem::Dependency
|
42
42
|
name: rspec
|
43
43
|
requirement: !ruby/object:Gem::Requirement
|
44
44
|
requirements:
|
45
|
-
- - "
|
45
|
+
- - ">="
|
46
46
|
- !ruby/object:Gem::Version
|
47
|
-
version: '
|
47
|
+
version: '0'
|
48
48
|
type: :development
|
49
49
|
prerelease: false
|
50
50
|
version_requirements: !ruby/object:Gem::Requirement
|
51
51
|
requirements:
|
52
|
-
- - "
|
52
|
+
- - ">="
|
53
53
|
- !ruby/object:Gem::Version
|
54
|
-
version: '
|
54
|
+
version: '0'
|
55
|
+
- !ruby/object:Gem::Dependency
|
56
|
+
name: rubocop
|
57
|
+
requirement: !ruby/object:Gem::Requirement
|
58
|
+
requirements:
|
59
|
+
- - ">="
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: '0'
|
62
|
+
type: :development
|
63
|
+
prerelease: false
|
64
|
+
version_requirements: !ruby/object:Gem::Requirement
|
65
|
+
requirements:
|
66
|
+
- - ">="
|
67
|
+
- !ruby/object:Gem::Version
|
68
|
+
version: '0'
|
55
69
|
- !ruby/object:Gem::Dependency
|
56
70
|
name: simplecov
|
57
71
|
requirement: !ruby/object:Gem::Requirement
|
58
72
|
requirements:
|
59
73
|
- - "~>"
|
60
74
|
- !ruby/object:Gem::Version
|
61
|
-
version:
|
75
|
+
version: 0.17.0
|
62
76
|
type: :development
|
63
77
|
prerelease: false
|
64
78
|
version_requirements: !ruby/object:Gem::Requirement
|
65
79
|
requirements:
|
66
80
|
- - "~>"
|
67
81
|
- !ruby/object:Gem::Version
|
68
|
-
version:
|
82
|
+
version: 0.17.0
|
69
83
|
- !ruby/object:Gem::Dependency
|
70
|
-
name:
|
84
|
+
name: yard
|
71
85
|
requirement: !ruby/object:Gem::Requirement
|
72
86
|
requirements:
|
73
87
|
- - ">="
|
@@ -84,30 +98,30 @@ dependencies:
|
|
84
98
|
name: engtagger
|
85
99
|
requirement: !ruby/object:Gem::Requirement
|
86
100
|
requirements:
|
87
|
-
- - "
|
101
|
+
- - ">="
|
88
102
|
- !ruby/object:Gem::Version
|
89
|
-
version: 0
|
103
|
+
version: '0'
|
90
104
|
type: :development
|
91
105
|
prerelease: false
|
92
106
|
version_requirements: !ruby/object:Gem::Requirement
|
93
107
|
requirements:
|
94
|
-
- - "
|
108
|
+
- - ">="
|
95
109
|
- !ruby/object:Gem::Version
|
96
|
-
version: 0
|
110
|
+
version: '0'
|
97
111
|
- !ruby/object:Gem::Dependency
|
98
112
|
name: nokogiri
|
99
113
|
requirement: !ruby/object:Gem::Requirement
|
100
114
|
requirements:
|
101
|
-
- - "
|
115
|
+
- - ">="
|
102
116
|
- !ruby/object:Gem::Version
|
103
|
-
version: '
|
117
|
+
version: '0'
|
104
118
|
type: :development
|
105
119
|
prerelease: false
|
106
120
|
version_requirements: !ruby/object:Gem::Requirement
|
107
121
|
requirements:
|
108
|
-
- - "
|
122
|
+
- - ">="
|
109
123
|
- !ruby/object:Gem::Version
|
110
|
-
version: '
|
124
|
+
version: '0'
|
111
125
|
description: Implementation of TextRank solution to ranked keyword extraction. See
|
112
126
|
https://web.eecs.umich.edu/~mihalcea/papers/mihalcea.emnlp04.pdf
|
113
127
|
email:
|
@@ -124,7 +138,7 @@ files:
|
|
124
138
|
- ".travis.yml"
|
125
139
|
- CODE_OF_CONDUCT.md
|
126
140
|
- Gemfile
|
127
|
-
- LICENSE
|
141
|
+
- LICENSE
|
128
142
|
- README.md
|
129
143
|
- Rakefile
|
130
144
|
- bin/console
|
@@ -141,7 +155,9 @@ files:
|
|
141
155
|
- lib/text_rank/char_filter/strip_html.rb
|
142
156
|
- lib/text_rank/char_filter/strip_possessive.rb
|
143
157
|
- lib/text_rank/char_filter/undo_contractions.rb
|
158
|
+
- lib/text_rank/char_filter/undo_contractions.yml
|
144
159
|
- lib/text_rank/fingerprint.rb
|
160
|
+
- lib/text_rank/fingerprint_overlap.rb
|
145
161
|
- lib/text_rank/graph_strategy.rb
|
146
162
|
- lib/text_rank/graph_strategy/coocurrence.rb
|
147
163
|
- lib/text_rank/keyword_extractor.rb
|
@@ -154,6 +170,7 @@ files:
|
|
154
170
|
- lib/text_rank/token_filter/min_length.rb
|
155
171
|
- lib/text_rank/token_filter/part_of_speech.rb
|
156
172
|
- lib/text_rank/token_filter/stopwords.rb
|
173
|
+
- lib/text_rank/token_filter/stopwords.yml
|
157
174
|
- lib/text_rank/tokenizer.rb
|
158
175
|
- lib/text_rank/tokenizer/money.rb
|
159
176
|
- lib/text_rank/tokenizer/number.rb
|
@@ -167,7 +184,7 @@ homepage: https://github.com/david-mccullars/text_rank
|
|
167
184
|
licenses:
|
168
185
|
- MIT
|
169
186
|
metadata: {}
|
170
|
-
post_install_message:
|
187
|
+
post_install_message:
|
171
188
|
rdoc_options: []
|
172
189
|
require_paths:
|
173
190
|
- lib
|
@@ -182,10 +199,9 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
182
199
|
- !ruby/object:Gem::Version
|
183
200
|
version: '0'
|
184
201
|
requirements: []
|
185
|
-
rubyforge_project:
|
186
|
-
rubygems_version: 2.
|
187
|
-
signing_key:
|
202
|
+
rubyforge_project:
|
203
|
+
rubygems_version: 2.7.6
|
204
|
+
signing_key:
|
188
205
|
specification_version: 4
|
189
206
|
summary: Implementation of TextRank solution to ranked keyword extraction
|
190
207
|
test_files: []
|
191
|
-
has_rdoc:
|