text_rank 1.1.6 → 1.1.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 0526b3b997480a7ff013bb2b6eebccb0cd0eb017
4
- data.tar.gz: de2b5c80ba02acb4f2e987f9bd2ab39e6043a6d1
3
+ metadata.gz: 1625933f78441107094a4306a488cf0d68f45e53
4
+ data.tar.gz: 5e605f2ac2210c6e44e21a1c088620c5d491314c
5
5
  SHA512:
6
- metadata.gz: 08eaffefcb143dec760ab093b9328eb2b2dc83b97532ca62d7f22975ae1998075fa336105d7cbdc67a33e53251c268d8353a9cae5be4e6f18ab75eb9af8712d3
7
- data.tar.gz: 13481d213926e4577f061c27e273a230fb11cd7ea19150f5bf26341c355188403598155d8b9c6862b916432d1dda8908606f7e18e346ae59020abb5b9a6f8a02
6
+ metadata.gz: 1703773a7d6d1391ec81f26bac6aa409344d8f036352b24e0acf37511e141da88cc560c0bbc525563f5b2bd277ea528c9d30bb41f9768afb1987ae9acee42af6
7
+ data.tar.gz: fc23bc9f6d63a0361d88d804bc64448613de5aa6c22d1f5b223630bee18c485d3d35b7a12876bb5b4c25dda5efa537aad83ed2e686e12a04a4fed33d88e2147d
@@ -0,0 +1,99 @@
1
+ require 'set'
2
+
3
module TextRank
  ##
  # Class used to compare documents according to TextRank. A "fingerprint"
  # represents the first N keywords (in order from most significant to least) from
  # applying the TextRank algorithm. To compare two "fingerprints" we apply an
  # algorithm that looks at each of the N prefixes and counts the overlap. This
  # rewards matches of significant keywords much higher than matches of less
  # significant keywords. But to prevent less significant keywords from being
  # completely ignored we apply an inverse log linear transformation to each of the
  # N prefixes.
  #
  # For example, consider the following comparison:
  #
  #   town man empty found
  #   vs.
  #   general empty found jar
  #
  # The first pass considers just the first keywords: town vs. general. As these
  # are different, they contribute 0.
  #
  # The second pass considers the first two keywords: town man vs general empty.
  # Again, no overlap, so they contribute 0.
  #
  # The third pass considers the first three keywords: town man empty vs general
  # empty found. Here we have one overlap: empty. This contributes 1.
  #
  # The fourth pass considers all, and there are two overlaps: empty & found. This
  # contributes 2.
  #
  # We can represent the overlaps as the vector [0, 0, 1, 2]. Then we will apply
  # the inverse log linear transformation defined by (with i counted from 1):
  #
  #   f(x_i) = x_i / ln(i + 1)
  #          = [0, 0, 1 / ln(4), 2 / ln(5)]
  #          = [0, 0, 0.7213475204444817, 1.2426698691192237]
  #
  # Finally we take the average of the transformed vector and normalize it (to
  # ensure a final value between 0.0 and 1.0):
  #
  #   norm(avg(SUM f(x_i))) = norm( avg(1.9640173895637054) )
  #                         = norm( 0.49100434739092635 )
  #                         = 0.49100434739092635 / avg(SUM f(1, 2, 3, 4))
  #                         = 0.49100434739092635 / avg(7.912555793714532)
  #                         = 0.49100434739092635 / 1.978138948428633
  #                         = 0.24821529740414025
  #
  # When the two fingerprints have different lengths, the comparison runs over
  # the longer length; positions past the end of the shorter fingerprint simply
  # contribute no new values.
  ##
  class Fingerprint

    attr_reader :values, :size

    # Creates a new fingerprint for comparison with another fingerprint
    # @param values [Array] fingerprint values of any hashable type, ordered
    #   from most to least significant
    # @return [Fingerprint]
    def initialize(*values)
      @size = values.size
      @values = values
    end

    # Calculates the "similarity" between this fingerprint and another
    # @param trf2 [Fingerprint] A second fingerprint to compare
    # @return [Float] A number between 0.0 (different) and 1.0 (same)
    def similarity(trf2)
      # Identical fingerprints (including two empty ones) need no further work.
      return 1.0 if values == trf2.values

      theirs = trf2.values
      their_size = trf2.size
      pending_mine = Set.new   # values of self not yet seen in trf2
      pending_theirs = Set.new # values of trf2 not yet seen in self
      overlap = 0   # running overlap count of the two prefixes of length i + 1
      score = 0.0   # SUM overlap_i / ln(i + 2)
      ceiling = 0.0 # SUM (i + 1) / ln(i + 2), i.e. the score of a perfect match

      # Both sums run over the longer fingerprint so the result does not depend
      # on which of the two fingerprints is the receiver.
      [size, their_size].max.times do |i|
        mine_present = i < size
        theirs_present = i < their_size

        if mine_present && theirs_present && values[i] == theirs[i]
          overlap += 1
        else
          # Positions past the end of a fingerprint are skipped rather than
          # treated as a nil value, which could otherwise match a real nil.
          overlap += 1 if theirs_present && pair_off?(theirs[i], pending_mine, pending_theirs)
          overlap += 1 if mine_present && pair_off?(values[i], pending_theirs, pending_mine)
        end

        damping = Math.log(i + 2)
        score += overlap / damping
        ceiling += (i + 1) / damping
      end

      score / ceiling
    end

    private

    # Tries to match +value+ against the other side's unmatched values.
    # On a match the partner is consumed and true is returned; otherwise
    # +value+ is parked in +parked+ for a later match and false is returned.
    def pair_off?(value, waiting, parked)
      return true if waiting.delete?(value)

      parked << value
      false
    end

  end
end
@@ -1,4 +1,4 @@
1
1
  module TextRank
2
2
  # Current gem version
3
- VERSION = '1.1.6'
3
+ VERSION = '1.1.7'
4
4
  end
data/lib/text_rank.rb CHANGED
@@ -8,6 +8,7 @@ require 'page_rank'
8
8
  module TextRank
9
9
 
10
10
  autoload :CharFilter, 'text_rank/char_filter'
11
+ autoload :Fingerprint, 'text_rank/fingerprint'
11
12
  autoload :GraphStrategy, 'text_rank/graph_strategy'
12
13
  autoload :KeywordExtractor, 'text_rank/keyword_extractor'
13
14
  autoload :RankFilter, 'text_rank/rank_filter'
@@ -31,4 +32,8 @@ module TextRank
31
32
  TextRank::KeywordExtractor.advanced(**options).extract(text, **options)
32
33
  end
33
34
 
35
# Scores how similar two ranked keyword lists are by wrapping each list in a
# TextRank::Fingerprint and comparing the first against the second.
# @param keywords1 [Array] first list of keywords
# @param keywords2 [Array] second list of keywords
# @return the value returned by TextRank::Fingerprint#similarity
def self.similarity(keywords1, keywords2)
  first_print = TextRank::Fingerprint.new(*keywords1)
  second_print = TextRank::Fingerprint.new(*keywords2)
  first_print.similarity(second_print)
end
38
+
34
39
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: text_rank
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.1.6
4
+ version: 1.1.7
5
5
  platform: ruby
6
6
  authors:
7
7
  - David McCullars
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2016-05-24 00:00:00.000000000 Z
11
+ date: 2016-07-05 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -141,6 +141,7 @@ files:
141
141
  - lib/text_rank/char_filter/strip_html.rb
142
142
  - lib/text_rank/char_filter/strip_possessive.rb
143
143
  - lib/text_rank/char_filter/undo_contractions.rb
144
+ - lib/text_rank/fingerprint.rb
144
145
  - lib/text_rank/graph_strategy.rb
145
146
  - lib/text_rank/graph_strategy/coocurrence.rb
146
147
  - lib/text_rank/keyword_extractor.rb
@@ -187,3 +188,4 @@ signing_key:
187
188
  specification_version: 4
188
189
  summary: Implementation of TextRank solution to ranked keyword extraction
189
190
  test_files: []
191
+ has_rdoc: