text_rank 1.1.5 → 1.2.3

Sign up to get free protection for your applications and to get access to all the features.
@@ -1 +1 @@
1
- ruby-2.1.2
1
+ ruby-2.5.1
@@ -1,7 +1,15 @@
1
+ env:
2
+ global:
3
+ - CC_TEST_REPORTER_ID=6ab030bf370ffc2abbf0ba4d70a1c8d9649f6fd1426f48f6d43d5c9eb15f187f
1
4
  language: ruby
2
5
  rvm:
3
- - 2.1.2
4
- before_install: gem install bundler -v 1.11.2
5
- addons:
6
- code_climate:
7
- repo_token: 6ab030bf370ffc2abbf0ba4d70a1c8d9649f6fd1426f48f6d43d5c9eb15f187f
6
+ - 2.5.1
7
+ before_install: gem install bundler -v 1.17.3
8
+ before_script:
9
+ - curl -L https://codeclimate.com/downloads/test-reporter/test-reporter-latest-linux-amd64 > ./cc-test-reporter
10
+ - chmod +x ./cc-test-reporter
11
+ - ./cc-test-reporter before-build
12
+ script:
13
+ - bundle exec rspec
14
+ after_script:
15
+ - ./cc-test-reporter after-build --exit-code $TRAVIS_TEST_RESULT
File without changes
data/README.md CHANGED
@@ -7,6 +7,7 @@
7
7
 
8
8
  ## Status
9
9
 
10
+ [![Gem Version](https://badge.fury.io/rb/text_rank.svg)](https://badge.fury.io/rb/text_rank)
10
11
  [![Travis Build Status](https://travis-ci.org/david-mccullars/text_rank.svg?branch=master)](https://travis-ci.org/david-mccullars/text_rank)
11
12
  [![Code Climate](https://codeclimate.com/github/david-mccullars/text_rank/badges/gpa.svg)](https://codeclimate.com/github/david-mccullars/text_rank)
12
13
  [![Test Coverage](https://codeclimate.com/github/david-mccullars/text_rank/badges/coverage.svg)](https://codeclimate.com/github/david-mccullars/text_rank/coverage)
@@ -127,7 +128,7 @@ multiplication. Each iteration is O(N^3) where N is the number of graph nodes.
127
128
 
128
129
  ## License
129
130
 
130
- MIT. See the `LICENSE.txt` file.
131
+ MIT. See the `LICENSE` file.
131
132
 
132
133
 
133
134
  ## References
@@ -8,6 +8,7 @@ require 'page_rank'
8
8
  module TextRank
9
9
 
10
10
  autoload :CharFilter, 'text_rank/char_filter'
11
+ autoload :Fingerprint, 'text_rank/fingerprint'
11
12
  autoload :GraphStrategy, 'text_rank/graph_strategy'
12
13
  autoload :KeywordExtractor, 'text_rank/keyword_extractor'
13
14
  autoload :RankFilter, 'text_rank/rank_filter'
@@ -31,4 +32,8 @@ module TextRank
31
32
  TextRank::KeywordExtractor.advanced(**options).extract(text, **options)
32
33
  end
33
34
 
35
+ def self.similarity(keywords1, keywords2)
36
+ TextRank::Fingerprint.new(*keywords1).similarity(TextRank::Fingerprint.new(*keywords2))
37
+ end
38
+
34
39
  end
@@ -0,0 +1,99 @@
1
+ require 'set'
2
+
3
+ module TextRank
4
+ ##
5
+ # Class used to compare documents according to TextRank. A "fingerprint"
6
+ # represents the first N keywords (in order from most significant to least) from
7
+ # applying the TextRank algorithm. To compare two "fingerprints" we apply an
8
+ # algorithm that looks at each of the N prefixes and counts the overlap. This
9
+ # rewards matches of significant keywords much higher than matches of less
10
+ # significant keywords. But to prevent less significant keywords from being
11
+ # completely ignored we apply an inverse log linear transformation to each of the
12
+ # N prefixes.
13
+ #
14
+ # For example, consider the following comparison:
15
+ #
16
+ # town man empty found
17
+ # vs.
18
+ # general empty found jar
19
+ #
20
+ # The first pass considers just the first keywords: town vs. general. As these
21
+ # are different, they contribute 0.
22
+ #
23
+ # The second pass considers the first two keywords: town man vs general empty.
24
+ # Again, no overlap, so they contribute 0.
25
+ #
26
+ # The third pass considers the first three keywords: town man empty vs general
27
+ # empty found. Here we have one overlap: empty. This contributes 1.
28
+ #
29
+ # The fourth pass considers all, and there are two overlaps: empty & found. This
30
+ # contributes 2.
31
+ #
32
+ # We can represent the overlaps as the vector [0, 0, 1, 2]. Then we will apply
33
+ # the inverse log linear transformation defined by:
34
+ #
35
+ # f(x_i) = x_i / ln(i + 1)
36
+ # = [0, 0, 1 / ln(4), 2 / ln(5)]
37
+ # = [0, 0, 0.7213475204444817, 1.2426698691192237]
38
+ #
39
+ # Finally we take the average of the transformed vector and normalize it (to
40
+ # ensure a final value between 0.0 and 1.0):
41
+ #
42
+ # norm(avg(SUM f(x_i))) = norm( avg(1.9640173895637054) )
43
+ # = norm( 0.49100434739092635 )
44
+ # = 0.49100434739092635 / avg(SUM f(1, 2, 3, 4))
45
+ # = 0.49100434739092635 / avg(7.912555793714532)
46
+ # = 0.49100434739092635 / 1.978138948428633
47
+ # = 0.24821529740414025
48
+ ##
49
+ class Fingerprint
50
+
51
+ attr_reader :values, :size
52
+
53
+ # Creates a new fingerprint for comparison with another fingerprint
54
+ # @param values [Array] An array of fingerprint values of any hashable type.
55
+ # @return [Fingerprint]
56
+ def initialize(*values)
57
+ @size = values.size
58
+ @values = values
59
+ end
60
+
61
+ # Calculates the "similarity" between this fingerprint and another
62
+ # @param trf2 [Fingerprint] A second fingerprint to compare
63
+ # @return [Number] A number between 0.0 (different) and 1.0 (same)
64
+ def similarity(trf2)
65
+ return 1.0 if values == trf2.values
66
+
67
+ sim = 0
68
+ s1 = Set.new
69
+ s2 = Set.new
70
+
71
+ [size, trf2.size].max.times.reduce(0) do |sum, i|
72
+ v1 = values[i]
73
+ v2 = trf2.values[i]
74
+ if v1 == v2
75
+ sim += 1
76
+ else
77
+ s1.delete?(v2) ? (sim += 1) : (s2 << v2)
78
+ s2.delete?(v1) ? (sim += 1) : (s1 << v1)
79
+ end
80
+ sum + sim * linear_transform[i]
81
+ end
82
+ end
83
+
84
+ private
85
+
86
+ def linear_transform
87
+ @linear_transform ||= size.times.map do |i|
88
+ 1.0 / Math.log(i + 2) / size.to_f / norm_factor
89
+ end
90
+ end
91
+
92
+ def norm_factor
93
+ @norm_factor ||= size.times.reduce(0.0) do |s, i|
94
+ s + (i + 1) / Math.log(i + 2) / size.to_f
95
+ end
96
+ end
97
+
98
+ end
99
+ end
@@ -41,7 +41,7 @@ module TextRank
41
41
  # @option options [Array<Class, Symbol, #filter!>] :rank_filters A list of filters to be applied to the keyword ranks after keyword extraction
42
42
  def initialize(**options)
43
43
  @page_rank_options = {
44
- strategy: options[:strategy] || :dense,
44
+ strategy: options[:strategy] || :sparse,
45
45
  damping: options[:damping],
46
46
  tolerance: options[:tolerance],
47
47
  }
@@ -105,7 +105,8 @@ module TextRank
105
105
  # until all of the top N final keywords (single or collapsed) have been
106
106
  # considered.
107
107
  loop do
108
- single_tokens_to_consider = @tokens.keys.first(@ranks_to_collapse + @to_remove.size - @to_collapse.size) - @to_remove.to_a
108
+ regexp_safe_tokens = @tokens.keys.select { |s| Regexp.escape(s) == s }
109
+ single_tokens_to_consider = regexp_safe_tokens.first(@ranks_to_collapse + @to_remove.size - @to_collapse.size) - @to_remove.to_a
109
110
  scan_text_for_all_permutations_of(single_tokens_to_consider) or break
110
111
  decide_what_to_collapse_and_what_to_remove
111
112
  end
@@ -113,7 +114,11 @@ module TextRank
113
114
  # We now know what to collapse and what to remove, so we can start safely
114
115
  # modifying the tokens hash
115
116
  @to_collapse.each do |perm|
116
- values = @tokens.values_at(*perm)
117
+ values = @tokens.values_at(*perm).compact
118
+ # This might be empty if somehow the scanned permutation doesn't
119
+ # exactly match one of the tokens (e.g. ASCII-folding gone awry).
120
+ # The goal is to do the best we can, so if we can't find it, ignore.
121
+ next if values.empty?
117
122
  @tokens[perm.join(@delimiter)] = values.reduce(:+) / values.size
118
123
  end
119
124
  @tokens.reject! do |k, _|
@@ -7,23 +7,23 @@ module TextRank
7
7
  ##
8
8
  Number = %r{
9
9
  (
10
- [1-9]\d{0,2} # 453
11
- (?:,\d{3})* # 453,231,162
12
- (?:\.\d{0,2})? # 453,231,162.17
10
+ [1-9]\d{3,} # 453231162
11
+ (?:\.\d+)? # 453231162.17
13
12
 
14
13
  |
15
14
 
16
- [1-9]\d* # 453231162
17
- (?:\.\d{0,2})? # 453231162.17
15
+ [1-9]\d{0,2} # 453
16
+ (?:,\d{3})* # 453,231,162
17
+ (?:\.\d+)? # 453,231,162.17
18
18
 
19
19
  |
20
20
 
21
- 0 # 0
22
- (?:\.\d{0,2})? # 0.17
21
+ 0 # 0
22
+ (?:\.\d+)? # 0.17
23
23
 
24
24
  |
25
25
 
26
- (?:\.\d{1,2}) # .17
26
+ (?:\.\d+) # .17
27
27
  )
28
28
  }x
29
29
 
@@ -1,4 +1,4 @@
1
1
  module TextRank
2
2
  # Current gem version
3
- VERSION = '1.1.5'
3
+ VERSION = '1.2.3'
4
4
  end
@@ -1,4 +1,3 @@
1
- # coding: utf-8
2
1
  lib = File.expand_path('../lib', __FILE__)
3
2
  $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
4
3
  require 'text_rank/version'
@@ -9,8 +8,8 @@ Gem::Specification.new do |spec|
9
8
  spec.authors = ['David McCullars']
10
9
  spec.email = ['david.mccullars@gmail.com']
11
10
 
12
- spec.summary = %q{Implementation of TextRank solution to ranked keyword extraction}
13
- spec.description = %q{Implementation of TextRank solution to ranked keyword extraction. See https://web.eecs.umich.edu/~mihalcea/papers/mihalcea.emnlp04.pdf}
11
+ spec.summary = 'Implementation of TextRank solution to ranked keyword extraction'
12
+ spec.description = 'Implementation of TextRank solution to ranked keyword extraction. See https://web.eecs.umich.edu/~mihalcea/papers/mihalcea.emnlp04.pdf'
14
13
  spec.homepage = 'https://github.com/david-mccullars/text_rank'
15
14
  spec.license = 'MIT'
16
15
 
@@ -19,12 +18,12 @@ Gem::Specification.new do |spec|
19
18
  spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
20
19
  spec.require_paths = ['lib']
21
20
 
22
- spec.add_development_dependency 'bundler', '~> 1.11'
23
- spec.add_development_dependency 'rake', '~> 10.0'
24
- spec.add_development_dependency 'rspec', '~> 3.0'
25
- spec.add_development_dependency 'simplecov', '~> 0.11'
26
- spec.add_development_dependency 'codeclimate-test-reporter'
21
+ spec.add_development_dependency 'bundler'
22
+ spec.add_development_dependency 'rake'
23
+ spec.add_development_dependency 'rspec'
24
+ spec.add_development_dependency 'rubocop'
25
+ spec.add_development_dependency 'simplecov', '~> 0.17.0' # 0.18 not supported by code climate
27
26
 
28
- spec.add_development_dependency 'engtagger', '~> 0.2.0' # Optional runtime dependency but needed for specs
29
- spec.add_development_dependency 'nokogiri', '~> 1.0' # Optional runtime dependency but needed for specs
27
+ spec.add_development_dependency 'engtagger' # Optional runtime dependency but needed for specs
28
+ spec.add_development_dependency 'nokogiri' # Optional runtime dependency but needed for specs
30
29
  end
metadata CHANGED
@@ -1,113 +1,113 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: text_rank
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.1.5
4
+ version: 1.2.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - David McCullars
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2016-05-23 00:00:00.000000000 Z
11
+ date: 2020-06-10 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
15
15
  requirement: !ruby/object:Gem::Requirement
16
16
  requirements:
17
- - - "~>"
17
+ - - ">="
18
18
  - !ruby/object:Gem::Version
19
- version: '1.11'
19
+ version: '0'
20
20
  type: :development
21
21
  prerelease: false
22
22
  version_requirements: !ruby/object:Gem::Requirement
23
23
  requirements:
24
- - - "~>"
24
+ - - ">="
25
25
  - !ruby/object:Gem::Version
26
- version: '1.11'
26
+ version: '0'
27
27
  - !ruby/object:Gem::Dependency
28
28
  name: rake
29
29
  requirement: !ruby/object:Gem::Requirement
30
30
  requirements:
31
- - - "~>"
31
+ - - ">="
32
32
  - !ruby/object:Gem::Version
33
- version: '10.0'
33
+ version: '0'
34
34
  type: :development
35
35
  prerelease: false
36
36
  version_requirements: !ruby/object:Gem::Requirement
37
37
  requirements:
38
- - - "~>"
38
+ - - ">="
39
39
  - !ruby/object:Gem::Version
40
- version: '10.0'
40
+ version: '0'
41
41
  - !ruby/object:Gem::Dependency
42
42
  name: rspec
43
43
  requirement: !ruby/object:Gem::Requirement
44
44
  requirements:
45
- - - "~>"
45
+ - - ">="
46
46
  - !ruby/object:Gem::Version
47
- version: '3.0'
47
+ version: '0'
48
48
  type: :development
49
49
  prerelease: false
50
50
  version_requirements: !ruby/object:Gem::Requirement
51
51
  requirements:
52
- - - "~>"
52
+ - - ">="
53
53
  - !ruby/object:Gem::Version
54
- version: '3.0'
54
+ version: '0'
55
55
  - !ruby/object:Gem::Dependency
56
- name: simplecov
56
+ name: rubocop
57
57
  requirement: !ruby/object:Gem::Requirement
58
58
  requirements:
59
- - - "~>"
59
+ - - ">="
60
60
  - !ruby/object:Gem::Version
61
- version: '0.11'
61
+ version: '0'
62
62
  type: :development
63
63
  prerelease: false
64
64
  version_requirements: !ruby/object:Gem::Requirement
65
65
  requirements:
66
- - - "~>"
66
+ - - ">="
67
67
  - !ruby/object:Gem::Version
68
- version: '0.11'
68
+ version: '0'
69
69
  - !ruby/object:Gem::Dependency
70
- name: codeclimate-test-reporter
70
+ name: simplecov
71
71
  requirement: !ruby/object:Gem::Requirement
72
72
  requirements:
73
- - - ">="
73
+ - - "~>"
74
74
  - !ruby/object:Gem::Version
75
- version: '0'
75
+ version: 0.17.0
76
76
  type: :development
77
77
  prerelease: false
78
78
  version_requirements: !ruby/object:Gem::Requirement
79
79
  requirements:
80
- - - ">="
80
+ - - "~>"
81
81
  - !ruby/object:Gem::Version
82
- version: '0'
82
+ version: 0.17.0
83
83
  - !ruby/object:Gem::Dependency
84
84
  name: engtagger
85
85
  requirement: !ruby/object:Gem::Requirement
86
86
  requirements:
87
- - - "~>"
87
+ - - ">="
88
88
  - !ruby/object:Gem::Version
89
- version: 0.2.0
89
+ version: '0'
90
90
  type: :development
91
91
  prerelease: false
92
92
  version_requirements: !ruby/object:Gem::Requirement
93
93
  requirements:
94
- - - "~>"
94
+ - - ">="
95
95
  - !ruby/object:Gem::Version
96
- version: 0.2.0
96
+ version: '0'
97
97
  - !ruby/object:Gem::Dependency
98
98
  name: nokogiri
99
99
  requirement: !ruby/object:Gem::Requirement
100
100
  requirements:
101
- - - "~>"
101
+ - - ">="
102
102
  - !ruby/object:Gem::Version
103
- version: '1.0'
103
+ version: '0'
104
104
  type: :development
105
105
  prerelease: false
106
106
  version_requirements: !ruby/object:Gem::Requirement
107
107
  requirements:
108
- - - "~>"
108
+ - - ">="
109
109
  - !ruby/object:Gem::Version
110
- version: '1.0'
110
+ version: '0'
111
111
  description: Implementation of TextRank solution to ranked keyword extraction. See
112
112
  https://web.eecs.umich.edu/~mihalcea/papers/mihalcea.emnlp04.pdf
113
113
  email:
@@ -124,7 +124,7 @@ files:
124
124
  - ".travis.yml"
125
125
  - CODE_OF_CONDUCT.md
126
126
  - Gemfile
127
- - LICENSE.txt
127
+ - LICENSE
128
128
  - README.md
129
129
  - Rakefile
130
130
  - bin/console
@@ -141,6 +141,7 @@ files:
141
141
  - lib/text_rank/char_filter/strip_html.rb
142
142
  - lib/text_rank/char_filter/strip_possessive.rb
143
143
  - lib/text_rank/char_filter/undo_contractions.rb
144
+ - lib/text_rank/fingerprint.rb
144
145
  - lib/text_rank/graph_strategy.rb
145
146
  - lib/text_rank/graph_strategy/coocurrence.rb
146
147
  - lib/text_rank/keyword_extractor.rb
@@ -182,7 +183,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
182
183
  version: '0'
183
184
  requirements: []
184
185
  rubyforge_project:
185
- rubygems_version: 2.5.1
186
+ rubygems_version: 2.7.6
186
187
  signing_key:
187
188
  specification_version: 4
188
189
  summary: Implementation of TextRank solution to ranked keyword extraction