text_rank 1.1.6 → 1.1.7
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/text_rank/fingerprint.rb +99 -0
- data/lib/text_rank/version.rb +1 -1
- data/lib/text_rank.rb +5 -0
- metadata +4 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 1625933f78441107094a4306a488cf0d68f45e53
|
4
|
+
data.tar.gz: 5e605f2ac2210c6e44e21a1c088620c5d491314c
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 1703773a7d6d1391ec81f26bac6aa409344d8f036352b24e0acf37511e141da88cc560c0bbc525563f5b2bd277ea528c9d30bb41f9768afb1987ae9acee42af6
|
7
|
+
data.tar.gz: fc23bc9f6d63a0361d88d804bc64448613de5aa6c22d1f5b223630bee18c485d3d35b7a12876bb5b4c25dda5efa537aad83ed2e686e12a04a4fed33d88e2147d
|
@@ -0,0 +1,99 @@
|
|
1
|
+
require 'set'
|
2
|
+
|
3
|
+
module TextRank
  ##
  # Class used to compare documents according to TextRank. A "fingerprint"
  # represents the first N keywords (in order from most significant to least) from
  # applying the TextRank algorithm. To compare two "fingerprints" we apply an
  # algorithm that looks at each of the N prefixes and counts the overlap. This
  # rewards matches of significant keywords much higher than matches of less
  # significant keywords. But to prevent less significant keywords from being
  # completely ignored we apply an inverse log linear transformation to each of the
  # N prefixes.
  #
  # For example, consider the following comparison:
  #
  #   town man empty found
  #   vs.
  #   general empty found jar
  #
  # The first pass considers just the first keywords: town vs. general. As these
  # are different, they contribute 0.
  #
  # The second pass considers the first two keywords: town man vs. general empty.
  # Again, no overlap, so they contribute 0.
  #
  # The third pass considers the first three keywords: town man empty vs. general
  # empty found. Here we have one overlap: empty. This contributes 1.
  #
  # The fourth pass considers all, and there are two overlaps: empty & found. This
  # contributes 2.
  #
  # We can represent the overlaps as the vector [0, 0, 1, 2]. Then we will apply
  # the inverse log linear transformation defined by (i is the 1-based prefix
  # length):
  #
  #   f(x_i) = x_i / ln(i + 1)
  #          = [0, 0, 1 / ln(4), 2 / ln(5)]
  #          = [0, 0, 0.7213475204444817, 1.2426698691192237]
  #
  # Finally we take the average of the transformed vector and normalize it (to
  # ensure a final value between 0.0 and 1.0):
  #
  #   norm(avg(SUM f(x_i))) = norm( avg(1.9640173895637054) )
  #                         = norm( 0.49100434739092635 )
  #                         = 0.49100434739092635 / avg(SUM f(1, 2, 3, 4))
  #                         = 0.49100434739092635 / avg(7.912555793714532)
  #                         = 0.49100434739092635 / 1.978138948428633
  #                         = 0.24821529740414025
  #
  # When the two fingerprints differ in length, the comparison runs over the
  # longer length and positions missing from the shorter one simply never match.
  ##
  class Fingerprint
    attr_reader :values, :size

    # Creates a new fingerprint for comparison with another fingerprint
    # @param values [Array] fingerprint values of any hashable type, ordered
    #   from most to least significant
    # @return [Fingerprint]
    def initialize(*values)
      @size = values.size
      @values = values
    end

    # Calculates the "similarity" between this fingerprint and another
    # @param trf2 [Fingerprint] A second fingerprint to compare
    # @return [Float] A number between 0.0 (different) and 1.0 (same)
    def similarity(trf2)
      return 1.0 if values == trf2.values # Short-circuit identical fingerprints

      # Weights must cover every compared position, so they are sized by the
      # longer fingerprint (sizing them by self.size alone yields nil weights
      # whenever trf2 is the longer one).
      span = [size, trf2.size].max
      weights = linear_transform(span)

      overlap = 0           # running overlap count of the two current prefixes
      pending1 = Set.new    # values seen only in this fingerprint so far
      pending2 = Set.new    # values seen only in trf2 so far

      span.times.reduce(0.0) do |sum, i|
        present1 = i < size
        present2 = i < trf2.size
        v1 = values[i]
        v2 = trf2.values[i]

        if present1 && present2 && v1 == v2
          overlap += 1
        else
          # A position past the end of the shorter fingerprint is skipped so
          # that its nil placeholder can never pair up with a genuine nil value.
          if present2
            pending1.delete?(v2) ? (overlap += 1) : (pending2 << v2)
          end
          if present1
            pending2.delete?(v1) ? (overlap += 1) : (pending1 << v1)
          end
        end

        sum + overlap * weights[i]
      end
    end

    private

    # Per-position weights for a comparison over +length+ positions:
    # 1 / ln(i + 2), averaged over +length+ and normalized so that a perfect
    # overlap vector [1, 2, ..., length] sums to 1.0. Cached per length.
    def linear_transform(length = size)
      @linear_transform ||= {}
      @linear_transform[length] ||= begin
        scale = norm_factor(length)
        length.times.map do |i|
          1.0 / Math.log(i + 2) / length.to_f / scale
        end
      end
    end

    # Average transformed value of the perfect overlap vector [1, 2, ..., length];
    # dividing by it bounds the similarity at 1.0. Cached per length.
    def norm_factor(length = size)
      @norm_factor ||= {}
      @norm_factor[length] ||= length.times.reduce(0.0) do |s, i|
        s + (i + 1) / Math.log(i + 2) / length.to_f
      end
    end
  end
end
|
data/lib/text_rank/version.rb
CHANGED
data/lib/text_rank.rb
CHANGED
@@ -8,6 +8,7 @@ require 'page_rank'
|
|
8
8
|
module TextRank
|
9
9
|
|
10
10
|
autoload :CharFilter, 'text_rank/char_filter'
|
11
|
+
autoload :Fingerprint, 'text_rank/fingerprint'
|
11
12
|
autoload :GraphStrategy, 'text_rank/graph_strategy'
|
12
13
|
autoload :KeywordExtractor, 'text_rank/keyword_extractor'
|
13
14
|
autoload :RankFilter, 'text_rank/rank_filter'
|
@@ -31,4 +32,8 @@ module TextRank
|
|
31
32
|
TextRank::KeywordExtractor.advanced(**options).extract(text, **options)
|
32
33
|
end
|
33
34
|
|
35
|
+
# Compares two keyword collections by wrapping each in a
# TextRank::Fingerprint (every element becomes one fingerprint value, in
# order) and scoring the first against the second.
# NOTE(review): presumably both collections are ordered from most to least
# significant keyword -- confirm against callers.
# @param keywords1 [Array] first keyword collection
# @param keywords2 [Array] second keyword collection
# @return [Number] the value returned by Fingerprint#similarity
def self.similarity(keywords1, keywords2)
  first_print = TextRank::Fingerprint.new(*keywords1)
  second_print = TextRank::Fingerprint.new(*keywords2)
  first_print.similarity(second_print)
end
|
38
|
+
|
34
39
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: text_rank
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.1.6
|
4
|
+
version: 1.1.7
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- David McCullars
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2016-05
|
11
|
+
date: 2016-07-05 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -141,6 +141,7 @@ files:
|
|
141
141
|
- lib/text_rank/char_filter/strip_html.rb
|
142
142
|
- lib/text_rank/char_filter/strip_possessive.rb
|
143
143
|
- lib/text_rank/char_filter/undo_contractions.rb
|
144
|
+
- lib/text_rank/fingerprint.rb
|
144
145
|
- lib/text_rank/graph_strategy.rb
|
145
146
|
- lib/text_rank/graph_strategy/coocurrence.rb
|
146
147
|
- lib/text_rank/keyword_extractor.rb
|
@@ -187,3 +188,4 @@ signing_key:
|
|
187
188
|
specification_version: 4
|
188
189
|
summary: Implementation of TextRank solution to ranked keyword extraction
|
189
190
|
test_files: []
|
191
|
+
has_rdoc:
|