text_rank 1.1.7 → 1.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: c7c140fafc459b538cdc9fcc84f639e2155c6fbb
|
4
|
+
data.tar.gz: c2e8b24f80414a113ba9d2d93ca26b6a7e1c38a8
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: ab7b1875f82d42a51243f74b827f90dc0e23b3ad68479b7dca4a78058423bf63e74d50361c952809e1396ddc70423fbafcee090344bbb47cd5095f94cd09e435
|
7
|
+
data.tar.gz: 7fb37752476eb9f0fefab815af32522eb3a748bb39fdeace39af08446c9a6e544c96fe187bcf541c27947493d6c7aafd69351af1c6a6438f987926396189adde
|
@@ -10,35 +10,35 @@ module TextRank
|
|
10
10
|
# significant keywords. But to prevent less significant keywords from being
|
11
11
|
# completely ignored we apply an inverse log linear transformation to each of the
|
12
12
|
# N prefixes.
|
13
|
-
#
|
13
|
+
#
|
14
14
|
# For example, consider the following comparison:
|
15
|
-
#
|
15
|
+
#
|
16
16
|
# town man empty found
|
17
17
|
# vs.
|
18
18
|
# general empty found jar
|
19
|
-
#
|
19
|
+
#
|
20
20
|
# The first pass considers just the first keywords: town vs. general. As these
|
21
21
|
# are different, they contribute 0.
|
22
|
-
#
|
22
|
+
#
|
23
23
|
# The second pass considers the first two keywords: town man vs general empty.
|
24
24
|
# Again, no overlap, so they contribute 0.
|
25
|
-
#
|
25
|
+
#
|
26
26
|
# The third pass considers the first three keywords: town man empty vs general
|
27
27
|
# empty found. Here we have one overlap: empty. This contributes 1.
|
28
|
-
#
|
28
|
+
#
|
29
29
|
# The fourth pass considers all, and there are two overlaps: empty & found. This
|
30
30
|
# contributes 2.
|
31
|
-
#
|
31
|
+
#
|
32
32
|
# We can represent the overlaps as the vector [0, 0, 1, 2]. Then we will apply
|
33
33
|
# the inverse log linear transformation defined by:
|
34
|
-
#
|
34
|
+
#
|
35
35
|
# f(x_i) = x_i / ln(i + 1)
|
36
36
|
# = [0, 0, 1 / ln(4), 2 / ln(5)]
|
37
37
|
# = [0, 0, 0.7213475204444817, 1.2426698691192237]
|
38
|
-
#
|
38
|
+
#
|
39
39
|
# Finally we take the average of the transformed vector and normalize it (to
|
40
40
|
# ensure a final value between 0.0 and 1.0):
|
41
|
-
#
|
41
|
+
#
|
42
42
|
# norm(avg(SUM f(x_i))) = norm( avg(1.9640173895637054) )
|
43
43
|
# = norm( 0.49100434739092635 )
|
44
44
|
# = 0.49100434739092635 / avg(SUM f(1, 2, 3, 4))
|
@@ -41,7 +41,7 @@ module TextRank
|
|
41
41
|
# @option options [Array<Class, Symbol, #filter!>] :rank_filters A list of filters to be applied to the keyword ranks after keyword extraction
|
42
42
|
def initialize(**options)
|
43
43
|
@page_rank_options = {
|
44
|
-
strategy: options[:strategy] || :
|
44
|
+
strategy: options[:strategy] || :sparse,
|
45
45
|
damping: options[:damping],
|
46
46
|
tolerance: options[:tolerance],
|
47
47
|
}
|
@@ -105,7 +105,8 @@ module TextRank
|
|
105
105
|
# until all of the top N final keywords (single or collapsed) have been
|
106
106
|
# considered.
|
107
107
|
loop do
|
108
|
-
|
108
|
+
regexp_safe_tokens = @tokens.keys.select { |s| Regexp.escape(s) == s }
|
109
|
+
single_tokens_to_consider = regexp_safe_tokens.first(@ranks_to_collapse + @to_remove.size - @to_collapse.size) - @to_remove.to_a
|
109
110
|
scan_text_for_all_permutations_of(single_tokens_to_consider) or break
|
110
111
|
decide_what_to_collapse_and_what_to_remove
|
111
112
|
end
|
data/lib/text_rank/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: text_rank
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.1.7
|
4
|
+
version: 1.2.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- David McCullars
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2016-
|
11
|
+
date: 2016-10-19 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -183,7 +183,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
183
183
|
version: '0'
|
184
184
|
requirements: []
|
185
185
|
rubyforge_project:
|
186
|
-
rubygems_version: 2.
|
186
|
+
rubygems_version: 2.6.7
|
187
187
|
signing_key:
|
188
188
|
specification_version: 4
|
189
189
|
summary: Implementation of TextRank solution to ranked keyword extraction
|