text_rank 1.1.5 → 1.2.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +5 -5
- data/.codeclimate.yml +1 -6
- data/.rubocop.yml +55 -1077
- data/.ruby-version +1 -1
- data/.travis.yml +13 -5
- data/{LICENSE.txt → LICENSE} +0 -0
- data/README.md +2 -1
- data/lib/text_rank.rb +5 -0
- data/lib/text_rank/fingerprint.rb +99 -0
- data/lib/text_rank/keyword_extractor.rb +1 -1
- data/lib/text_rank/rank_filter/collapse_adjacent.rb +7 -2
- data/lib/text_rank/tokenizer/number.rb +8 -8
- data/lib/text_rank/version.rb +1 -1
- data/text_rank.gemspec +9 -10
- metadata +35 -34
data/.ruby-version
CHANGED
@@ -1 +1 @@
|
|
1
|
-
ruby-2.1
|
1
|
+
ruby-2.5.1
|
data/.travis.yml
CHANGED
@@ -1,7 +1,15 @@
|
|
1
|
+
env:
|
2
|
+
global:
|
3
|
+
- CC_TEST_REPORTER_ID=6ab030bf370ffc2abbf0ba4d70a1c8d9649f6fd1426f48f6d43d5c9eb15f187f
|
1
4
|
language: ruby
|
2
5
|
rvm:
|
3
|
-
- 2.1
|
4
|
-
before_install: gem install bundler -v 1.
|
5
|
-
|
6
|
-
|
7
|
-
|
6
|
+
- 2.5.1
|
7
|
+
before_install: gem install bundler -v 1.17.3
|
8
|
+
before_script:
|
9
|
+
- curl -L https://codeclimate.com/downloads/test-reporter/test-reporter-latest-linux-amd64 > ./cc-test-reporter
|
10
|
+
- chmod +x ./cc-test-reporter
|
11
|
+
- ./cc-test-reporter before-build
|
12
|
+
script:
|
13
|
+
- bundle exec rspec
|
14
|
+
after_script:
|
15
|
+
- ./cc-test-reporter after-build --exit-code $TRAVIS_TEST_RESULT
|
data/{LICENSE.txt → LICENSE}
RENAMED
File without changes
|
data/README.md
CHANGED
@@ -7,6 +7,7 @@
|
|
7
7
|
|
8
8
|
## Status
|
9
9
|
|
10
|
+
[Gem Version](https://badge.fury.io/rb/text_rank)
|
10
11
|
[Build Status](https://travis-ci.org/david-mccullars/text_rank)
|
11
12
|
[Code Climate](https://codeclimate.com/github/david-mccullars/text_rank)
|
12
13
|
[Test Coverage](https://codeclimate.com/github/david-mccullars/text_rank/coverage)
|
@@ -127,7 +128,7 @@ multiplication. Each iteration is O(N^3) where N is the number of graph nodes.
|
|
127
128
|
|
128
129
|
## License
|
129
130
|
|
130
|
-
MIT. See the `LICENSE.txt` file.
|
131
|
+
MIT. See the `LICENSE` file.
|
131
132
|
|
132
133
|
|
133
134
|
## References
|
data/lib/text_rank.rb
CHANGED
@@ -8,6 +8,7 @@ require 'page_rank'
|
|
8
8
|
module TextRank
|
9
9
|
|
10
10
|
autoload :CharFilter, 'text_rank/char_filter'
|
11
|
+
autoload :Fingerprint, 'text_rank/fingerprint'
|
11
12
|
autoload :GraphStrategy, 'text_rank/graph_strategy'
|
12
13
|
autoload :KeywordExtractor, 'text_rank/keyword_extractor'
|
13
14
|
autoload :RankFilter, 'text_rank/rank_filter'
|
@@ -31,4 +32,8 @@ module TextRank
|
|
31
32
|
TextRank::KeywordExtractor.advanced(**options).extract(text, **options)
|
32
33
|
end
|
33
34
|
|
35
|
+
def self.similarity(keywords1, keywords2)
|
36
|
+
TextRank::Fingerprint.new(*keywords1).similarity(TextRank::Fingerprint.new(*keywords2))
|
37
|
+
end
|
38
|
+
|
34
39
|
end
|
@@ -0,0 +1,99 @@
|
|
1
|
+
require 'set'
|
2
|
+
|
3
|
+
module TextRank
|
4
|
+
##
|
5
|
+
# Class used to compare documents according to TextRank. A "fingerprint"
|
6
|
+
# represents the first N keywords (in order from most significant to least) from
|
7
|
+
# applying the TextRank algorithm. To compare two "fingerprints" we apply an
|
8
|
+
# algorithm that looks at each of the N prefixes and counts the overlap. This
|
9
|
+
# rewards matches of significant keywords much higher than matches of less
|
10
|
+
# significant keywords. But to prevent less significant keywords from being
|
11
|
+
# completely ignored we apply an inverse log linear transformation to each of the
|
12
|
+
# N prefixes.
|
13
|
+
#
|
14
|
+
# For example, consider the following comparison:
|
15
|
+
#
|
16
|
+
# town man empty found
|
17
|
+
# vs.
|
18
|
+
# general empty found jar
|
19
|
+
#
|
20
|
+
# The first pass considers just the first keywords: town vs. general. As these
|
21
|
+
# are different, they contribute 0.
|
22
|
+
#
|
23
|
+
# The second pass considers the first two keywords: town man vs general empty.
|
24
|
+
# Again, no overlap, so they contribute 0.
|
25
|
+
#
|
26
|
+
# The third pass considers the first three keywords: town man empty vs general
|
27
|
+
# empty found. Here we have one overlap: empty. This contributes 1.
|
28
|
+
#
|
29
|
+
# The fourth pass considers all, and there are two overlaps: empty & found. This
|
30
|
+
# contributes 2.
|
31
|
+
#
|
32
|
+
# We can represent the overlaps as the vector [0, 0, 1, 2]. Then we will apply
|
33
|
+
# the inverse log linear transformation defined by:
|
34
|
+
#
|
35
|
+
# f(x_i) = x_i / ln(i + 1)
|
36
|
+
# = [0, 0, 1 / ln(4), 2 / ln(5)]
|
37
|
+
# = [0, 0, 0.7213475204444817, 1.2426698691192237]
|
38
|
+
#
|
39
|
+
# Finally we take the average of the transformed vector and normalize it (to
|
40
|
+
# ensure a final value between 0.0 and 1.0):
|
41
|
+
#
|
42
|
+
# norm(avg(SUM f(x_i))) = norm( avg(1.9640173895637054) )
|
43
|
+
# = norm( 0.49100434739092635 )
|
44
|
+
# = 0.49100434739092635 / avg(SUM f(1, 2, 3, 4))
|
45
|
+
# = 0.49100434739092635 / avg(7.912555793714532)
|
46
|
+
# = 0.49100434739092635 / 1.978138948428633
|
47
|
+
# = 0.24821529740414025
|
48
|
+
##
|
49
|
+
class Fingerprint
|
50
|
+
|
51
|
+
attr_reader :values, :size
|
52
|
+
|
53
|
+
# Creates a new fingerprint for comparison with another fingerprint
|
54
|
+
# @param values [Array] An array of fingerprint values of any hashable type.
|
55
|
+
# @return [Fingerprint]
|
56
|
+
def initialize(*values)
|
57
|
+
@size = values.size
|
58
|
+
@values = values
|
59
|
+
end
|
60
|
+
|
61
|
+
# Calculates the "similarity" between this fingerprint and another
|
62
|
+
# @param trf2 [Fingerprint] A second fingerprint to compare
|
63
|
+
# @return [Float] A number between 0.0 (different) and 1.0 (same)
|
64
|
+
def similarity(trf2)
|
65
|
+
return 1.0 if values == trf2.values
|
66
|
+
|
67
|
+
sim = 0
|
68
|
+
s1 = Set.new
|
69
|
+
s2 = Set.new
|
70
|
+
|
71
|
+
[size, trf2.size].max.times.reduce(0) do |sum, i|
|
72
|
+
v1 = values[i]
|
73
|
+
v2 = trf2.values[i]
|
74
|
+
if v1 == v2
|
75
|
+
sim += 1
|
76
|
+
else
|
77
|
+
s1.delete?(v2) ? (sim += 1) : (s2 << v2)
|
78
|
+
s2.delete?(v1) ? (sim += 1) : (s1 << v1)
|
79
|
+
end
|
80
|
+
sum + sim * linear_transform[i]
|
81
|
+
end
|
82
|
+
end
|
83
|
+
|
84
|
+
private
|
85
|
+
|
86
|
+
def linear_transform
|
87
|
+
@linear_transform ||= size.times.map do |i|
|
88
|
+
1.0 / Math.log(i + 2) / size.to_f / norm_factor
|
89
|
+
end
|
90
|
+
end
|
91
|
+
|
92
|
+
def norm_factor
|
93
|
+
@norm_factor ||= size.times.reduce(0.0) do |s, i|
|
94
|
+
s + (i + 1) / Math.log(i + 2) / size.to_f
|
95
|
+
end
|
96
|
+
end
|
97
|
+
|
98
|
+
end
|
99
|
+
end
|
@@ -41,7 +41,7 @@ module TextRank
|
|
41
41
|
# @option options [Array<Class, Symbol, #filter!>] :rank_filters A list of filters to be applied to the keyword ranks after keyword extraction
|
42
42
|
def initialize(**options)
|
43
43
|
@page_rank_options = {
|
44
|
-
strategy: options[:strategy] || :
|
44
|
+
strategy: options[:strategy] || :sparse,
|
45
45
|
damping: options[:damping],
|
46
46
|
tolerance: options[:tolerance],
|
47
47
|
}
|
@@ -105,7 +105,8 @@ module TextRank
|
|
105
105
|
# until all of the top N final keywords (single or collapsed) have been
|
106
106
|
# considered.
|
107
107
|
loop do
|
108
|
-
|
108
|
+
regexp_safe_tokens = @tokens.keys.select { |s| Regexp.escape(s) == s }
|
109
|
+
single_tokens_to_consider = regexp_safe_tokens.first(@ranks_to_collapse + @to_remove.size - @to_collapse.size) - @to_remove.to_a
|
109
110
|
scan_text_for_all_permutations_of(single_tokens_to_consider) or break
|
110
111
|
decide_what_to_collapse_and_what_to_remove
|
111
112
|
end
|
@@ -113,7 +114,11 @@ module TextRank
|
|
113
114
|
# We now know what to collapse and what to remove, so we can start safely
|
114
115
|
# modifying the tokens hash
|
115
116
|
@to_collapse.each do |perm|
|
116
|
-
values = @tokens.values_at(*perm)
|
117
|
+
values = @tokens.values_at(*perm).compact
|
118
|
+
# This might be empty if somehow the scanned permutation doesn't
|
119
|
+
# exactly match one of the tokens (e.g. ASCII-folding gone awry).
|
120
|
+
# The goal is to do the best we can, so if we can't find it, ignore.
|
121
|
+
next if values.empty?
|
117
122
|
@tokens[perm.join(@delimiter)] = values.reduce(:+) / values.size
|
118
123
|
end
|
119
124
|
@tokens.reject! do |k, _|
|
@@ -7,23 +7,23 @@ module TextRank
|
|
7
7
|
##
|
8
8
|
Number = %r{
|
9
9
|
(
|
10
|
-
[1-9]\d{
|
11
|
-
(
|
12
|
-
(?:\.\d{0,2})? # 453,231,162.17
|
10
|
+
[1-9]\d{3,} # 453231162
|
11
|
+
(?:\.\d+)? # 453231162.17
|
13
12
|
|
14
13
|
|
|
15
14
|
|
16
|
-
[1-9]\d
|
17
|
-
(
|
15
|
+
[1-9]\d{0,2} # 453
|
16
|
+
(?:,\d{3})* # 453,231,162
|
17
|
+
(?:\.\d+)? # 453,231,162.17
|
18
18
|
|
19
19
|
|
|
20
20
|
|
21
|
-
0
|
22
|
-
(?:\.\d
|
21
|
+
0 # 0
|
22
|
+
(?:\.\d+)? # 0.17
|
23
23
|
|
24
24
|
|
|
25
25
|
|
26
|
-
(?:\.\d
|
26
|
+
(?:\.\d+) # .17
|
27
27
|
)
|
28
28
|
}x
|
29
29
|
|
data/lib/text_rank/version.rb
CHANGED
data/text_rank.gemspec
CHANGED
@@ -1,4 +1,3 @@
|
|
1
|
-
# coding: utf-8
|
2
1
|
lib = File.expand_path('../lib', __FILE__)
|
3
2
|
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
|
4
3
|
require 'text_rank/version'
|
@@ -9,8 +8,8 @@ Gem::Specification.new do |spec|
|
|
9
8
|
spec.authors = ['David McCullars']
|
10
9
|
spec.email = ['david.mccullars@gmail.com']
|
11
10
|
|
12
|
-
spec.summary =
|
13
|
-
spec.description =
|
11
|
+
spec.summary = 'Implementation of TextRank solution to ranked keyword extraction'
|
12
|
+
spec.description = 'Implementation of TextRank solution to ranked keyword extraction. See https://web.eecs.umich.edu/~mihalcea/papers/mihalcea.emnlp04.pdf'
|
14
13
|
spec.homepage = 'https://github.com/david-mccullars/text_rank'
|
15
14
|
spec.license = 'MIT'
|
16
15
|
|
@@ -19,12 +18,12 @@ Gem::Specification.new do |spec|
|
|
19
18
|
spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
|
20
19
|
spec.require_paths = ['lib']
|
21
20
|
|
22
|
-
spec.add_development_dependency 'bundler'
|
23
|
-
spec.add_development_dependency 'rake'
|
24
|
-
spec.add_development_dependency 'rspec'
|
25
|
-
spec.add_development_dependency '
|
26
|
-
spec.add_development_dependency '
|
21
|
+
spec.add_development_dependency 'bundler'
|
22
|
+
spec.add_development_dependency 'rake'
|
23
|
+
spec.add_development_dependency 'rspec'
|
24
|
+
spec.add_development_dependency 'rubocop'
|
25
|
+
spec.add_development_dependency 'simplecov', '~> 0.17.0' # 0.18 not supported by code climate
|
27
26
|
|
28
|
-
spec.add_development_dependency 'engtagger'
|
29
|
-
spec.add_development_dependency 'nokogiri'
|
27
|
+
spec.add_development_dependency 'engtagger' # Optional runtime dependency but needed for specs
|
28
|
+
spec.add_development_dependency 'nokogiri' # Optional runtime dependency but needed for specs
|
30
29
|
end
|
metadata
CHANGED
@@ -1,113 +1,113 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: text_rank
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.
|
4
|
+
version: 1.2.3
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- David McCullars
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2020-06-10 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
15
15
|
requirement: !ruby/object:Gem::Requirement
|
16
16
|
requirements:
|
17
|
-
- - "
|
17
|
+
- - ">="
|
18
18
|
- !ruby/object:Gem::Version
|
19
|
-
version: '
|
19
|
+
version: '0'
|
20
20
|
type: :development
|
21
21
|
prerelease: false
|
22
22
|
version_requirements: !ruby/object:Gem::Requirement
|
23
23
|
requirements:
|
24
|
-
- - "
|
24
|
+
- - ">="
|
25
25
|
- !ruby/object:Gem::Version
|
26
|
-
version: '
|
26
|
+
version: '0'
|
27
27
|
- !ruby/object:Gem::Dependency
|
28
28
|
name: rake
|
29
29
|
requirement: !ruby/object:Gem::Requirement
|
30
30
|
requirements:
|
31
|
-
- - "
|
31
|
+
- - ">="
|
32
32
|
- !ruby/object:Gem::Version
|
33
|
-
version: '
|
33
|
+
version: '0'
|
34
34
|
type: :development
|
35
35
|
prerelease: false
|
36
36
|
version_requirements: !ruby/object:Gem::Requirement
|
37
37
|
requirements:
|
38
|
-
- - "
|
38
|
+
- - ">="
|
39
39
|
- !ruby/object:Gem::Version
|
40
|
-
version: '
|
40
|
+
version: '0'
|
41
41
|
- !ruby/object:Gem::Dependency
|
42
42
|
name: rspec
|
43
43
|
requirement: !ruby/object:Gem::Requirement
|
44
44
|
requirements:
|
45
|
-
- - "
|
45
|
+
- - ">="
|
46
46
|
- !ruby/object:Gem::Version
|
47
|
-
version: '
|
47
|
+
version: '0'
|
48
48
|
type: :development
|
49
49
|
prerelease: false
|
50
50
|
version_requirements: !ruby/object:Gem::Requirement
|
51
51
|
requirements:
|
52
|
-
- - "
|
52
|
+
- - ">="
|
53
53
|
- !ruby/object:Gem::Version
|
54
|
-
version: '
|
54
|
+
version: '0'
|
55
55
|
- !ruby/object:Gem::Dependency
|
56
|
-
name:
|
56
|
+
name: rubocop
|
57
57
|
requirement: !ruby/object:Gem::Requirement
|
58
58
|
requirements:
|
59
|
-
- - "
|
59
|
+
- - ">="
|
60
60
|
- !ruby/object:Gem::Version
|
61
|
-
version: '0
|
61
|
+
version: '0'
|
62
62
|
type: :development
|
63
63
|
prerelease: false
|
64
64
|
version_requirements: !ruby/object:Gem::Requirement
|
65
65
|
requirements:
|
66
|
-
- - "
|
66
|
+
- - ">="
|
67
67
|
- !ruby/object:Gem::Version
|
68
|
-
version: '0
|
68
|
+
version: '0'
|
69
69
|
- !ruby/object:Gem::Dependency
|
70
|
-
name:
|
70
|
+
name: simplecov
|
71
71
|
requirement: !ruby/object:Gem::Requirement
|
72
72
|
requirements:
|
73
|
-
- - "
|
73
|
+
- - "~>"
|
74
74
|
- !ruby/object:Gem::Version
|
75
|
-
version:
|
75
|
+
version: 0.17.0
|
76
76
|
type: :development
|
77
77
|
prerelease: false
|
78
78
|
version_requirements: !ruby/object:Gem::Requirement
|
79
79
|
requirements:
|
80
|
-
- - "
|
80
|
+
- - "~>"
|
81
81
|
- !ruby/object:Gem::Version
|
82
|
-
version:
|
82
|
+
version: 0.17.0
|
83
83
|
- !ruby/object:Gem::Dependency
|
84
84
|
name: engtagger
|
85
85
|
requirement: !ruby/object:Gem::Requirement
|
86
86
|
requirements:
|
87
|
-
- - "
|
87
|
+
- - ">="
|
88
88
|
- !ruby/object:Gem::Version
|
89
|
-
version: 0
|
89
|
+
version: '0'
|
90
90
|
type: :development
|
91
91
|
prerelease: false
|
92
92
|
version_requirements: !ruby/object:Gem::Requirement
|
93
93
|
requirements:
|
94
|
-
- - "
|
94
|
+
- - ">="
|
95
95
|
- !ruby/object:Gem::Version
|
96
|
-
version: 0
|
96
|
+
version: '0'
|
97
97
|
- !ruby/object:Gem::Dependency
|
98
98
|
name: nokogiri
|
99
99
|
requirement: !ruby/object:Gem::Requirement
|
100
100
|
requirements:
|
101
|
-
- - "
|
101
|
+
- - ">="
|
102
102
|
- !ruby/object:Gem::Version
|
103
|
-
version: '
|
103
|
+
version: '0'
|
104
104
|
type: :development
|
105
105
|
prerelease: false
|
106
106
|
version_requirements: !ruby/object:Gem::Requirement
|
107
107
|
requirements:
|
108
|
-
- - "
|
108
|
+
- - ">="
|
109
109
|
- !ruby/object:Gem::Version
|
110
|
-
version: '
|
110
|
+
version: '0'
|
111
111
|
description: Implementation of TextRank solution to ranked keyword extraction. See
|
112
112
|
https://web.eecs.umich.edu/~mihalcea/papers/mihalcea.emnlp04.pdf
|
113
113
|
email:
|
@@ -124,7 +124,7 @@ files:
|
|
124
124
|
- ".travis.yml"
|
125
125
|
- CODE_OF_CONDUCT.md
|
126
126
|
- Gemfile
|
127
|
-
- LICENSE
|
127
|
+
- LICENSE
|
128
128
|
- README.md
|
129
129
|
- Rakefile
|
130
130
|
- bin/console
|
@@ -141,6 +141,7 @@ files:
|
|
141
141
|
- lib/text_rank/char_filter/strip_html.rb
|
142
142
|
- lib/text_rank/char_filter/strip_possessive.rb
|
143
143
|
- lib/text_rank/char_filter/undo_contractions.rb
|
144
|
+
- lib/text_rank/fingerprint.rb
|
144
145
|
- lib/text_rank/graph_strategy.rb
|
145
146
|
- lib/text_rank/graph_strategy/coocurrence.rb
|
146
147
|
- lib/text_rank/keyword_extractor.rb
|
@@ -182,7 +183,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
182
183
|
version: '0'
|
183
184
|
requirements: []
|
184
185
|
rubyforge_project:
|
185
|
-
rubygems_version: 2.
|
186
|
+
rubygems_version: 2.7.6
|
186
187
|
signing_key:
|
187
188
|
specification_version: 4
|
188
189
|
summary: Implementation of TextRank solution to ranked keyword extraction
|