text_rank 1.1.6 → 1.1.7

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 0526b3b997480a7ff013bb2b6eebccb0cd0eb017
4
- data.tar.gz: de2b5c80ba02acb4f2e987f9bd2ab39e6043a6d1
3
+ metadata.gz: 1625933f78441107094a4306a488cf0d68f45e53
4
+ data.tar.gz: 5e605f2ac2210c6e44e21a1c088620c5d491314c
5
5
  SHA512:
6
- metadata.gz: 08eaffefcb143dec760ab093b9328eb2b2dc83b97532ca62d7f22975ae1998075fa336105d7cbdc67a33e53251c268d8353a9cae5be4e6f18ab75eb9af8712d3
7
- data.tar.gz: 13481d213926e4577f061c27e273a230fb11cd7ea19150f5bf26341c355188403598155d8b9c6862b916432d1dda8908606f7e18e346ae59020abb5b9a6f8a02
6
+ metadata.gz: 1703773a7d6d1391ec81f26bac6aa409344d8f036352b24e0acf37511e141da88cc560c0bbc525563f5b2bd277ea528c9d30bb41f9768afb1987ae9acee42af6
7
+ data.tar.gz: fc23bc9f6d63a0361d88d804bc64448613de5aa6c22d1f5b223630bee18c485d3d35b7a12876bb5b4c25dda5efa537aad83ed2e686e12a04a4fed33d88e2147d
@@ -0,0 +1,99 @@
1
+ require 'set'
2
+
3
+ module TextRank
4
+ ##
5
+ # Class used to compare documents according to TextRank. A "fingerprint"
6
+ # represents the first N keywords (in order from most significant to least) from
7
+ # applying the TextRank algorithm. To compare two "fingerprints" we apply an
8
+ # algorithm that looks at each of the N prefixes and counts the overlap. This
9
+ # rewards matches of significant keywords much higher than matches of less
10
+ # significant keywords. But to prevent less significant keywords from being
11
+ # completely ignored we apply an inverse log linear transformation to each of the
12
+ # N prefixes.
13
+ #
14
+ # For example, consider the following comparison:
15
+ #
16
+ # town man empty found
17
+ # vs.
18
+ # general empty found jar
19
+ #
20
+ # The first pass considers just the first keywords: town vs. general. As these
21
+ # are different, they contribute 0.
22
+ #
23
+ # The second pass considers the first two keywords: town man vs general empty.
24
+ # Again, no overlap, so they contribute 0.
25
+ #
26
+ # The third pass considers the first three keywords: town man empty vs general
27
+ # empty found. Here we have one overlap: empty. This contributes 1.
28
+ #
29
+ # The fourth pass considers all, and there are two overlaps: empty & found. This
30
+ # contributes 2.
31
+ #
32
+ # We can represent the overlaps as the vector [0, 0, 1, 2]. Then we will apply
33
+ # the inverse log linear transformation defined by:
34
+ #
35
+ # f(x_i) = x_i / ln(i + 1)
36
+ # = [0, 0, 1 / ln(4), 2 / ln(5)]
37
+ # = [0, 0, 0.7213475204444817, 1.2426698691192237]
38
+ #
39
+ # Finally we take the average of the transformed vector and normalize it (to
40
+ # ensure a final value between 0.0 and 1.0):
41
+ #
42
+ # norm(avg(SUM f(x_i))) = norm( avg(1.9640173895637054) )
43
+ # = norm( 0.49100434739092635 )
44
+ # = 0.49100434739092635 / avg(SUM f(1, 2, 3, 4))
45
+ # = 0.49100434739092635 / avg(7.912555793714532)
46
+ # = 0.49100434739092635 / 1.978138948428633
47
+ # = 0.24821529740414025
48
+ ##
49
+ class Fingerprint
50
+
51
+ attr_reader :values, :size
52
+
53
+ # Creates a new fingerprint for comparison with another fingerprint
54
+ # @param {Array} values An array of fingerprint values of any hashable type.
55
+ # @return [Fingerprint]
56
+ def initialize(*values)
57
+ @size = values.size
58
+ @values = values
59
+ end
60
+
61
+ # Calculates the "similarity" between this fingerprint and another
62
+ # @param {Fingerprint} trf2 A second fingerprint to compare
63
+ # @return [Number] A number between 0.0 (different) and 1.0 (same)
64
+ def similarity(trf2)
65
+ return 1.0 if values == trf2.values
66
+
67
+ sim = 0
68
+ s1 = Set.new
69
+ s2 = Set.new
70
+
71
+ [size, trf2.size].max.times.reduce(0) do |sum, i|
72
+ v1 = values[i]
73
+ v2 = trf2.values[i]
74
+ if v1 == v2
75
+ sim += 1
76
+ else
77
+ s1.delete?(v2) ? (sim += 1) : (s2 << v2)
78
+ s2.delete?(v1) ? (sim += 1) : (s1 << v1)
79
+ end
80
+ sum + sim * linear_transform[i]
81
+ end
82
+ end
83
+
84
+ private
85
+
86
+ def linear_transform
87
+ @linear_transform ||= size.times.map do |i|
88
+ 1.0 / Math.log(i + 2) / size.to_f / norm_factor
89
+ end
90
+ end
91
+
92
+ def norm_factor
93
+ @norm_factor ||= size.times.reduce(0.0) do |s, i|
94
+ s + (i + 1) / Math.log(i + 2) / size.to_f
95
+ end
96
+ end
97
+
98
+ end
99
+ end
@@ -1,4 +1,4 @@
1
1
  module TextRank
2
2
  # Current gem version
3
- VERSION = '1.1.6'
3
+ VERSION = '1.1.7'
4
4
  end
data/lib/text_rank.rb CHANGED
@@ -8,6 +8,7 @@ require 'page_rank'
8
8
  module TextRank
9
9
 
10
10
  autoload :CharFilter, 'text_rank/char_filter'
11
+ autoload :Fingerprint, 'text_rank/fingerprint'
11
12
  autoload :GraphStrategy, 'text_rank/graph_strategy'
12
13
  autoload :KeywordExtractor, 'text_rank/keyword_extractor'
13
14
  autoload :RankFilter, 'text_rank/rank_filter'
@@ -31,4 +32,8 @@ module TextRank
31
32
  TextRank::KeywordExtractor.advanced(**options).extract(text, **options)
32
33
  end
33
34
 
35
+ def self.similarity(keywords1, keywords2)
36
+ TextRank::Fingerprint.new(*keywords1).similarity(TextRank::Fingerprint.new(*keywords2))
37
+ end
38
+
34
39
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: text_rank
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.1.6
4
+ version: 1.1.7
5
5
  platform: ruby
6
6
  authors:
7
7
  - David McCullars
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2016-05-24 00:00:00.000000000 Z
11
+ date: 2016-07-05 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -141,6 +141,7 @@ files:
141
141
  - lib/text_rank/char_filter/strip_html.rb
142
142
  - lib/text_rank/char_filter/strip_possessive.rb
143
143
  - lib/text_rank/char_filter/undo_contractions.rb
144
+ - lib/text_rank/fingerprint.rb
144
145
  - lib/text_rank/graph_strategy.rb
145
146
  - lib/text_rank/graph_strategy/coocurrence.rb
146
147
  - lib/text_rank/keyword_extractor.rb
@@ -187,3 +188,4 @@ signing_key:
187
188
  specification_version: 4
188
189
  summary: Implementation of TextRank solution to ranked keyword extraction
189
190
  test_files: []
191
+ has_rdoc: