text_rank 1.1.5 → 1.2.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1 +1 @@
1
- ruby-2.1.2
1
+ ruby-2.5.1
@@ -1,7 +1,15 @@
1
+ env:
2
+ global:
3
+ - CC_TEST_REPORTER_ID=6ab030bf370ffc2abbf0ba4d70a1c8d9649f6fd1426f48f6d43d5c9eb15f187f
1
4
  language: ruby
2
5
  rvm:
3
- - 2.1.2
4
- before_install: gem install bundler -v 1.11.2
5
- addons:
6
- code_climate:
7
- repo_token: 6ab030bf370ffc2abbf0ba4d70a1c8d9649f6fd1426f48f6d43d5c9eb15f187f
6
+ - 2.5.1
7
+ before_install: gem install bundler -v 1.17.3
8
+ before_script:
9
+ - curl -L https://codeclimate.com/downloads/test-reporter/test-reporter-latest-linux-amd64 > ./cc-test-reporter
10
+ - chmod +x ./cc-test-reporter
11
+ - ./cc-test-reporter before-build
12
+ script:
13
+ - bundle exec rspec
14
+ after_script:
15
+ - ./cc-test-reporter after-build --exit-code $TRAVIS_TEST_RESULT
File without changes
data/README.md CHANGED
@@ -7,6 +7,7 @@
7
7
 
8
8
  ## Status
9
9
 
10
+ [![Gem Version](https://badge.fury.io/rb/text_rank.svg)](https://badge.fury.io/rb/text_rank)
10
11
  [![Travis Build Status](https://travis-ci.org/david-mccullars/text_rank.svg?branch=master)](https://travis-ci.org/david-mccullars/text_rank)
11
12
  [![Code Climate](https://codeclimate.com/github/david-mccullars/text_rank/badges/gpa.svg)](https://codeclimate.com/github/david-mccullars/text_rank)
12
13
  [![Test Coverage](https://codeclimate.com/github/david-mccullars/text_rank/badges/coverage.svg)](https://codeclimate.com/github/david-mccullars/text_rank/coverage)
@@ -127,7 +128,7 @@ multiplication. Each iteration is O(N^3) where N is the number of graph nodes.
127
128
 
128
129
  ## License
129
130
 
130
- MIT. See the `LICENSE.txt` file.
131
+ MIT. See the `LICENSE` file.
131
132
 
132
133
 
133
134
  ## References
@@ -8,6 +8,7 @@ require 'page_rank'
8
8
  module TextRank
9
9
 
10
10
  autoload :CharFilter, 'text_rank/char_filter'
11
+ autoload :Fingerprint, 'text_rank/fingerprint'
11
12
  autoload :GraphStrategy, 'text_rank/graph_strategy'
12
13
  autoload :KeywordExtractor, 'text_rank/keyword_extractor'
13
14
  autoload :RankFilter, 'text_rank/rank_filter'
@@ -31,4 +32,8 @@ module TextRank
31
32
  TextRank::KeywordExtractor.advanced(**options).extract(text, **options)
32
33
  end
33
34
 
35
+ def self.similarity(keywords1, keywords2)
36
+ TextRank::Fingerprint.new(*keywords1).similarity(TextRank::Fingerprint.new(*keywords2))
37
+ end
38
+
34
39
  end
@@ -0,0 +1,99 @@
1
+ require 'set'
2
+
3
+ module TextRank
4
+ ##
5
+ # Class used to compare documents according to TextRank. A "fingerprint"
6
+ # represents the first N keywords (in order from most significant to least) from
7
+ # applying the TextRank algorithm. To compare two "fingerprints" we apply an
8
+ # algorithm that looks at each of the N prefixes and counts the overlap. This
9
+ # rewards matches of significant keywords much higher than matches of less
10
+ # significant keywords. But to prevent less significant keywords from being
11
+ # completely ignored we apply an inverse log linear transformation to each of the
12
+ # N prefixes.
13
+ #
14
+ # For example, consider the following comparison:
15
+ #
16
+ # town man empty found
17
+ # vs.
18
+ # general empty found jar
19
+ #
20
+ # The first pass considers just the first keywords: town vs. general. As these
21
+ # are different, they contribute 0.
22
+ #
23
+ # The second pass considers the first two keywords: town man vs general empty.
24
+ # Again, no overlap, so they contribute 0.
25
+ #
26
+ # The third pass considers the first three keywords: town man empty vs general
27
+ # empty found. Here we have one overlap: empty. This contributes 1.
28
+ #
29
+ # The fourth pass considers all, and there are two overlaps: empty & found. This
30
+ # contributes 2.
31
+ #
32
+ # We can represent the overlaps as the vector [0, 0, 1, 2]. Then we will apply
33
+ # the inverse log linear transformation defined by:
34
+ #
35
+ # f(x_i) = x_i / ln(i + 1)
36
+ # = [0, 0, 1 / ln(4), 2 / ln(5)]
37
+ # = [0, 0, 0.7213475204444817, 1.2426698691192237]
38
+ #
39
+ # Finally we take the average of the transformed vector and normalize it (to
40
+ # ensure a final value between 0.0 and 1.0):
41
+ #
42
+ # norm(avg(SUM f(x_i))) = norm( avg(1.9640173895637054) )
43
+ # = norm( 0.49100434739092635 )
44
+ # = 0.49100434739092635 / avg(SUM f(1, 2, 3, 4))
45
+ # = 0.49100434739092635 / avg(7.912555793714532)
46
+ # = 0.49100434739092635 / 1.978138948428633
47
+ # = 0.24821529740414025
48
+ ##
49
class Fingerprint

  attr_reader :values, :size

  # Creates a new fingerprint for comparison with another fingerprint
  # @param values [Array] fingerprint values of any hashable type, ordered
  #   from most significant to least significant
  # @return [Fingerprint]
  def initialize(*values)
    @size = values.size
    @values = values
  end

  # Calculates the "similarity" between this fingerprint and another.
  #
  # The two fingerprints may differ in length. The comparison is carried out
  # over the length of the longer one, and the weights are normalized for that
  # same length, so +a.similarity(b)+ and +b.similarity(a)+ agree.
  #
  # @param trf2 [Fingerprint] A second fingerprint to compare
  # @return [Float] A number between 0.0 (different) and 1.0 (same)
  def similarity(trf2)
    return 1.0 if values == trf2.values

    length = [size, trf2.size].max
    # Weights must cover every position visited below; sizing them by the
    # receiver alone yields nil weights when the other fingerprint is longer.
    weights = linear_transform(length)

    sim = 0           # running count of values seen in both prefixes so far
    s1 = Set.new      # values seen only in this fingerprint's prefix
    s2 = Set.new      # values seen only in the other fingerprint's prefix

    length.times.reduce(0.0) do |sum, i|
      in1 = i < size
      in2 = i < trf2.size
      v1 = values[i]
      v2 = trf2.values[i]

      if in1 && in2 && v1 == v2
        sim += 1
      else
        # A position past the end of a fingerprint contributes nothing; it
        # must not be recorded as a nil value that could later "match".
        (s1.delete?(v2) ? (sim += 1) : (s2 << v2)) if in2
        (s2.delete?(v1) ? (sim += 1) : (s1 << v1)) if in1
      end

      sum + sim * weights[i]
    end
  end

  private

  # Per-position weights 1 / ln(i + 2), averaged over +length+ positions and
  # divided by the normalization factor so a perfect match sums to 1.0.
  # Memoized per length because the length depends on the fingerprint being
  # compared against.
  # @param length [Integer] number of positions to produce weights for
  # @return [Array<Float>]
  def linear_transform(length = size)
    @linear_transform ||= {}
    @linear_transform[length] ||= begin
      norm = norm_factor(length)
      Array.new(length) { |i| 1.0 / Math.log(i + 2) / length.to_f / norm }
    end
  end

  # The averaged, transformed score of a perfect match of +length+ values
  # (overlap vector [1, 2, ..., length]); used to scale results into 0.0..1.0.
  # @param length [Integer] number of positions in the comparison
  # @return [Float]
  def norm_factor(length = size)
    @norm_factor ||= {}
    @norm_factor[length] ||= length.times.reduce(0.0) do |s, i|
      s + (i + 1) / Math.log(i + 2) / length.to_f
    end
  end

end
99
+ end
@@ -41,7 +41,7 @@ module TextRank
41
41
  # @option options [Array<Class, Symbol, #filter!>] :rank_filters A list of filters to be applied to the keyword ranks after keyword extraction
42
42
  def initialize(**options)
43
43
  @page_rank_options = {
44
- strategy: options[:strategy] || :dense,
44
+ strategy: options[:strategy] || :sparse,
45
45
  damping: options[:damping],
46
46
  tolerance: options[:tolerance],
47
47
  }
@@ -105,7 +105,8 @@ module TextRank
105
105
  # until all of the top N final keywords (single or collapsed) have been
106
106
  # considered.
107
107
  loop do
108
- single_tokens_to_consider = @tokens.keys.first(@ranks_to_collapse + @to_remove.size - @to_collapse.size) - @to_remove.to_a
108
+ regexp_safe_tokens = @tokens.keys.select { |s| Regexp.escape(s) == s }
109
+ single_tokens_to_consider = regexp_safe_tokens.first(@ranks_to_collapse + @to_remove.size - @to_collapse.size) - @to_remove.to_a
109
110
  scan_text_for_all_permutations_of(single_tokens_to_consider) or break
110
111
  decide_what_to_collapse_and_what_to_remove
111
112
  end
@@ -113,7 +114,11 @@ module TextRank
113
114
  # We now know what to collapse and what to remove, so we can start safely
114
115
  # modifying the tokens hash
115
116
  @to_collapse.each do |perm|
116
- values = @tokens.values_at(*perm)
117
+ values = @tokens.values_at(*perm).compact
118
+ # This might be empty if somehow the scanned permutation doesn't
119
+ # exactly match one of the tokens (e.g. ASCII-folding gone awry).
120
+ # The goal is to do the best we can, so if we can't find it, ignore.
121
+ next if values.empty?
117
122
  @tokens[perm.join(@delimiter)] = values.reduce(:+) / values.size
118
123
  end
119
124
  @tokens.reject! do |k, _|
@@ -7,23 +7,23 @@ module TextRank
7
7
  ##
8
8
  Number = %r{
9
9
  (
10
- [1-9]\d{0,2} # 453
11
- (?:,\d{3})* # 453,231,162
12
- (?:\.\d{0,2})? # 453,231,162.17
10
+ [1-9]\d{3,} # 453231162
11
+ (?:\.\d+)? # 453231162.17
13
12
 
14
13
  |
15
14
 
16
- [1-9]\d* # 453231162
17
- (?:\.\d{0,2})? # 453231162.17
15
+ [1-9]\d{0,2} # 453
16
+ (?:,\d{3})* # 453,231,162
17
+ (?:\.\d+)? # 453,231,162.17
18
18
 
19
19
  |
20
20
 
21
- 0 # 0
22
- (?:\.\d{0,2})? # 0.17
21
+ 0 # 0
22
+ (?:\.\d+)? # 0.17
23
23
 
24
24
  |
25
25
 
26
- (?:\.\d{1,2}) # .17
26
+ (?:\.\d+) # .17
27
27
  )
28
28
  }x
29
29
 
@@ -1,4 +1,4 @@
1
1
  module TextRank
2
2
  # Current gem version
3
- VERSION = '1.1.5'
3
+ VERSION = '1.2.3'
4
4
  end
@@ -1,4 +1,3 @@
1
- # coding: utf-8
2
1
  lib = File.expand_path('../lib', __FILE__)
3
2
  $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
4
3
  require 'text_rank/version'
@@ -9,8 +8,8 @@ Gem::Specification.new do |spec|
9
8
  spec.authors = ['David McCullars']
10
9
  spec.email = ['david.mccullars@gmail.com']
11
10
 
12
- spec.summary = %q{Implementation of TextRank solution to ranked keyword extraction}
13
- spec.description = %q{Implementation of TextRank solution to ranked keyword extraction. See https://web.eecs.umich.edu/~mihalcea/papers/mihalcea.emnlp04.pdf}
11
+ spec.summary = 'Implementation of TextRank solution to ranked keyword extraction'
12
+ spec.description = 'Implementation of TextRank solution to ranked keyword extraction. See https://web.eecs.umich.edu/~mihalcea/papers/mihalcea.emnlp04.pdf'
14
13
  spec.homepage = 'https://github.com/david-mccullars/text_rank'
15
14
  spec.license = 'MIT'
16
15
 
@@ -19,12 +18,12 @@ Gem::Specification.new do |spec|
19
18
  spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
20
19
  spec.require_paths = ['lib']
21
20
 
22
- spec.add_development_dependency 'bundler', '~> 1.11'
23
- spec.add_development_dependency 'rake', '~> 10.0'
24
- spec.add_development_dependency 'rspec', '~> 3.0'
25
- spec.add_development_dependency 'simplecov', '~> 0.11'
26
- spec.add_development_dependency 'codeclimate-test-reporter'
21
+ spec.add_development_dependency 'bundler'
22
+ spec.add_development_dependency 'rake'
23
+ spec.add_development_dependency 'rspec'
24
+ spec.add_development_dependency 'rubocop'
25
+ spec.add_development_dependency 'simplecov', '~> 0.17.0' # 0.18 not supported by code climate
27
26
 
28
- spec.add_development_dependency 'engtagger', '~> 0.2.0' # Optional runtime dependency but needed for specs
29
- spec.add_development_dependency 'nokogiri', '~> 1.0' # Optional runtime dependency but needed for specs
27
+ spec.add_development_dependency 'engtagger' # Optional runtime dependency but needed for specs
28
+ spec.add_development_dependency 'nokogiri' # Optional runtime dependency but needed for specs
30
29
  end
metadata CHANGED
@@ -1,113 +1,113 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: text_rank
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.1.5
4
+ version: 1.2.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - David McCullars
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2016-05-23 00:00:00.000000000 Z
11
+ date: 2020-06-10 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
15
15
  requirement: !ruby/object:Gem::Requirement
16
16
  requirements:
17
- - - "~>"
17
+ - - ">="
18
18
  - !ruby/object:Gem::Version
19
- version: '1.11'
19
+ version: '0'
20
20
  type: :development
21
21
  prerelease: false
22
22
  version_requirements: !ruby/object:Gem::Requirement
23
23
  requirements:
24
- - - "~>"
24
+ - - ">="
25
25
  - !ruby/object:Gem::Version
26
- version: '1.11'
26
+ version: '0'
27
27
  - !ruby/object:Gem::Dependency
28
28
  name: rake
29
29
  requirement: !ruby/object:Gem::Requirement
30
30
  requirements:
31
- - - "~>"
31
+ - - ">="
32
32
  - !ruby/object:Gem::Version
33
- version: '10.0'
33
+ version: '0'
34
34
  type: :development
35
35
  prerelease: false
36
36
  version_requirements: !ruby/object:Gem::Requirement
37
37
  requirements:
38
- - - "~>"
38
+ - - ">="
39
39
  - !ruby/object:Gem::Version
40
- version: '10.0'
40
+ version: '0'
41
41
  - !ruby/object:Gem::Dependency
42
42
  name: rspec
43
43
  requirement: !ruby/object:Gem::Requirement
44
44
  requirements:
45
- - - "~>"
45
+ - - ">="
46
46
  - !ruby/object:Gem::Version
47
- version: '3.0'
47
+ version: '0'
48
48
  type: :development
49
49
  prerelease: false
50
50
  version_requirements: !ruby/object:Gem::Requirement
51
51
  requirements:
52
- - - "~>"
52
+ - - ">="
53
53
  - !ruby/object:Gem::Version
54
- version: '3.0'
54
+ version: '0'
55
55
  - !ruby/object:Gem::Dependency
56
- name: simplecov
56
+ name: rubocop
57
57
  requirement: !ruby/object:Gem::Requirement
58
58
  requirements:
59
- - - "~>"
59
+ - - ">="
60
60
  - !ruby/object:Gem::Version
61
- version: '0.11'
61
+ version: '0'
62
62
  type: :development
63
63
  prerelease: false
64
64
  version_requirements: !ruby/object:Gem::Requirement
65
65
  requirements:
66
- - - "~>"
66
+ - - ">="
67
67
  - !ruby/object:Gem::Version
68
- version: '0.11'
68
+ version: '0'
69
69
  - !ruby/object:Gem::Dependency
70
- name: codeclimate-test-reporter
70
+ name: simplecov
71
71
  requirement: !ruby/object:Gem::Requirement
72
72
  requirements:
73
- - - ">="
73
+ - - "~>"
74
74
  - !ruby/object:Gem::Version
75
- version: '0'
75
+ version: 0.17.0
76
76
  type: :development
77
77
  prerelease: false
78
78
  version_requirements: !ruby/object:Gem::Requirement
79
79
  requirements:
80
- - - ">="
80
+ - - "~>"
81
81
  - !ruby/object:Gem::Version
82
- version: '0'
82
+ version: 0.17.0
83
83
  - !ruby/object:Gem::Dependency
84
84
  name: engtagger
85
85
  requirement: !ruby/object:Gem::Requirement
86
86
  requirements:
87
- - - "~>"
87
+ - - ">="
88
88
  - !ruby/object:Gem::Version
89
- version: 0.2.0
89
+ version: '0'
90
90
  type: :development
91
91
  prerelease: false
92
92
  version_requirements: !ruby/object:Gem::Requirement
93
93
  requirements:
94
- - - "~>"
94
+ - - ">="
95
95
  - !ruby/object:Gem::Version
96
- version: 0.2.0
96
+ version: '0'
97
97
  - !ruby/object:Gem::Dependency
98
98
  name: nokogiri
99
99
  requirement: !ruby/object:Gem::Requirement
100
100
  requirements:
101
- - - "~>"
101
+ - - ">="
102
102
  - !ruby/object:Gem::Version
103
- version: '1.0'
103
+ version: '0'
104
104
  type: :development
105
105
  prerelease: false
106
106
  version_requirements: !ruby/object:Gem::Requirement
107
107
  requirements:
108
- - - "~>"
108
+ - - ">="
109
109
  - !ruby/object:Gem::Version
110
- version: '1.0'
110
+ version: '0'
111
111
  description: Implementation of TextRank solution to ranked keyword extraction. See
112
112
  https://web.eecs.umich.edu/~mihalcea/papers/mihalcea.emnlp04.pdf
113
113
  email:
@@ -124,7 +124,7 @@ files:
124
124
  - ".travis.yml"
125
125
  - CODE_OF_CONDUCT.md
126
126
  - Gemfile
127
- - LICENSE.txt
127
+ - LICENSE
128
128
  - README.md
129
129
  - Rakefile
130
130
  - bin/console
@@ -141,6 +141,7 @@ files:
141
141
  - lib/text_rank/char_filter/strip_html.rb
142
142
  - lib/text_rank/char_filter/strip_possessive.rb
143
143
  - lib/text_rank/char_filter/undo_contractions.rb
144
+ - lib/text_rank/fingerprint.rb
144
145
  - lib/text_rank/graph_strategy.rb
145
146
  - lib/text_rank/graph_strategy/coocurrence.rb
146
147
  - lib/text_rank/keyword_extractor.rb
@@ -182,7 +183,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
182
183
  version: '0'
183
184
  requirements: []
184
185
  rubyforge_project:
185
- rubygems_version: 2.5.1
186
+ rubygems_version: 2.7.6
186
187
  signing_key:
187
188
  specification_version: 4
188
189
  summary: Implementation of TextRank solution to ranked keyword extraction