text_rank 1.1.6 → 1.1.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/text_rank/fingerprint.rb +99 -0
- data/lib/text_rank/version.rb +1 -1
- data/lib/text_rank.rb +5 -0
- metadata +4 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 1625933f78441107094a4306a488cf0d68f45e53
|
4
|
+
data.tar.gz: 5e605f2ac2210c6e44e21a1c088620c5d491314c
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 1703773a7d6d1391ec81f26bac6aa409344d8f036352b24e0acf37511e141da88cc560c0bbc525563f5b2bd277ea528c9d30bb41f9768afb1987ae9acee42af6
|
7
|
+
data.tar.gz: fc23bc9f6d63a0361d88d804bc64448613de5aa6c22d1f5b223630bee18c485d3d35b7a12876bb5b4c25dda5efa537aad83ed2e686e12a04a4fed33d88e2147d
|
@@ -0,0 +1,99 @@
|
|
1
|
+
require 'set'
|
2
|
+
|
3
|
+
module TextRank
|
4
|
+
##
|
5
|
+
# Class used to compare documents according to TextRank. A "fingerprint"
|
6
|
+
# represents the first N keywords (in order from most significant to least) from
|
7
|
+
# applying the TextRank algorithm. To compare two "fingerprints" we apply an
|
8
|
+
# algorithm that looks at each of the N prefixes and counts the overlap. This
|
9
|
+
# rewards matches of significant keywords much higher than matches of less
|
10
|
+
# significant keywords. But to prevent less significant keywords from being
|
11
|
+
# completely ignored we apply an inverse log linear transformation to each of the
|
12
|
+
# N prefixes.
|
13
|
+
#
|
14
|
+
# For example, consider the following comparison:
|
15
|
+
#
|
16
|
+
# town man empty found
|
17
|
+
# vs.
|
18
|
+
# general empty found jar
|
19
|
+
#
|
20
|
+
# The first pass considers just the first keywords: town vs. general. As these
|
21
|
+
# are different, they contribute 0.
|
22
|
+
#
|
23
|
+
# The second pass considers the first two keywords: town man vs general empty.
|
24
|
+
# Again, no overlap, so they contribute 0.
|
25
|
+
#
|
26
|
+
# The third pass considers the first three keywords: town man empty vs general
|
27
|
+
# empty found. Here we have one overlap: empty. This contributes 1.
|
28
|
+
#
|
29
|
+
# The fourth pass considers all, and there are two overlaps: empty & found. This
|
30
|
+
# contributes 2.
|
31
|
+
#
|
32
|
+
# We can represent the overlaps as the vector [0, 0, 1, 2]. Then we will apply
|
33
|
+
# the inverse log linear transformation defined by:
|
34
|
+
#
|
35
|
+
# f(x_i) = x_i / ln(i + 1)
|
36
|
+
# = [0, 0, 1 / ln(4), 2 / ln(5)]
|
37
|
+
# = [0, 0, 0.7213475204444817, 1.2426698691192237]
|
38
|
+
#
|
39
|
+
# Finally we take the average of the transformed vector and normalize it (to
|
40
|
+
# ensure a final value between 0.0 and 1.0):
|
41
|
+
#
|
42
|
+
# norm(avg(SUM f(x_i))) = norm( avg(1.9640173895637054) )
|
43
|
+
# = norm( 0.49100434739092635 )
|
44
|
+
# = 0.49100434739092635 / avg(SUM f(1, 2, 3, 4))
|
45
|
+
# = 0.49100434739092635 / avg(7.912555793714532)
|
46
|
+
# = 0.49100434739092635 / 1.978138948428633
|
47
|
+
# = 0.24821529740414025
|
48
|
+
##
|
49
|
+
class Fingerprint

  attr_reader :values, :size

  # Creates a new fingerprint for comparison with another fingerprint
  # @param values [Array] Fingerprint values of any hashable type, given in
  #   order from most significant to least significant.
  # @return [Fingerprint]
  def initialize(*values)
    @size = values.size
    @values = values
  end

  # Calculates the "similarity" between this fingerprint and another
  #
  # The per-position weights are sized by the receiver, so when the other
  # fingerprint is longer the comparison is delegated to it. The pairing logic
  # below is symmetric, which makes the score independent of argument order.
  #
  # @param trf2 [Fingerprint] A second fingerprint to compare
  # @return [Number] A number between 0.0 (different) and 1.0 (same)
  def similarity(trf2)
    return 1.0 if values == trf2.values

    # linear_transform only has `size` entries; let the longer fingerprint
    # drive the comparison so every position has a weight.
    return trf2.similarity(self) if trf2.size > size

    overlap = 0
    # Values seen so far only in this fingerprint / only in the other one.
    only_in_self = Set.new
    only_in_other = Set.new

    size.times.reduce(0) do |sum, i|
      v1 = values[i]
      v2 = trf2.values[i] # nil once past the end of a shorter fingerprint
      if v1 == v2
        overlap += 1
      else
        # A value that was waiting unmatched on the opposite side now counts
        # as an overlap; otherwise remember it for later prefixes.
        only_in_self.delete?(v2) ? (overlap += 1) : (only_in_other << v2)
        only_in_other.delete?(v1) ? (overlap += 1) : (only_in_self << v1)
      end
      # overlap is the running overlap count for the prefix ending at i.
      sum + overlap * linear_transform[i]
    end
  end

  private

  # Weight applied to the overlap count of each prefix: the inverse log of the
  # (1-based) position plus one, averaged over size and normalized so that a
  # perfect match sums to 1.0.
  # @return [Array<Float>] One weight per position of this fingerprint
  def linear_transform
    @linear_transform ||= size.times.map do |i|
      1.0 / Math.log(i + 2) / size.to_f / norm_factor
    end
  end

  # Averaged, transformed score of a perfect match (overlap vector
  # [1, 2, ..., size]); used as the normalizing denominator.
  # @return [Float]
  def norm_factor
    @norm_factor ||= size.times.reduce(0.0) do |s, i|
      s + (i + 1) / Math.log(i + 2) / size.to_f
    end
  end

end
|
99
|
+
end
|
data/lib/text_rank/version.rb
CHANGED
data/lib/text_rank.rb
CHANGED
@@ -8,6 +8,7 @@ require 'page_rank'
|
|
8
8
|
module TextRank
|
9
9
|
|
10
10
|
autoload :CharFilter, 'text_rank/char_filter'
|
11
|
+
autoload :Fingerprint, 'text_rank/fingerprint'
|
11
12
|
autoload :GraphStrategy, 'text_rank/graph_strategy'
|
12
13
|
autoload :KeywordExtractor, 'text_rank/keyword_extractor'
|
13
14
|
autoload :RankFilter, 'text_rank/rank_filter'
|
@@ -31,4 +32,8 @@ module TextRank
|
|
31
32
|
TextRank::KeywordExtractor.advanced(**options).extract(text, **options)
|
32
33
|
end
|
33
34
|
|
35
|
+
# Scores how alike two keyword lists are by wrapping each list in a
# TextRank::Fingerprint and comparing the two fingerprints.
# @param keywords1 [Array] First list of keywords (splatted into a Fingerprint)
# @param keywords2 [Array] Second list of keywords (splatted into a Fingerprint)
# @return [Number] Whatever Fingerprint#similarity returns for the pair
def self.similarity(keywords1, keywords2)
  first_print = TextRank::Fingerprint.new(*keywords1)
  second_print = TextRank::Fingerprint.new(*keywords2)
  first_print.similarity(second_print)
end
|
38
|
+
|
34
39
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: text_rank
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.1.6
|
4
|
+
version: 1.1.7
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- David McCullars
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2016-05
|
11
|
+
date: 2016-07-05 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -141,6 +141,7 @@ files:
|
|
141
141
|
- lib/text_rank/char_filter/strip_html.rb
|
142
142
|
- lib/text_rank/char_filter/strip_possessive.rb
|
143
143
|
- lib/text_rank/char_filter/undo_contractions.rb
|
144
|
+
- lib/text_rank/fingerprint.rb
|
144
145
|
- lib/text_rank/graph_strategy.rb
|
145
146
|
- lib/text_rank/graph_strategy/coocurrence.rb
|
146
147
|
- lib/text_rank/keyword_extractor.rb
|
@@ -187,3 +188,4 @@ signing_key:
|
|
187
188
|
specification_version: 4
|
188
189
|
summary: Implementation of TextRank solution to ranked keyword extraction
|
189
190
|
test_files: []
|
191
|
+
has_rdoc:
|