text_rank 1.2.3 → 1.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.codeclimate.yml +1 -1
- data/.gitignore +4 -0
- data/.rubocop.yml +7 -0
- data/.ruby-version +1 -1
- data/.travis.yml +1 -0
- data/Rakefile +5 -0
- data/bin/console +3 -3
- data/ext/text_rank/extconf.rb +3 -0
- data/ext/text_rank/page_rank_sparse_native.c +300 -0
- data/ext/text_rank/page_rank_sparse_native.h +93 -0
- data/ext/text_rank/text_rank.c +5 -0
- data/lib/page_rank/base.rb +12 -9
- data/lib/page_rank/dense.rb +3 -2
- data/lib/page_rank/sparse.rb +6 -7
- data/lib/page_rank/sparse_native.rb +21 -0
- data/lib/page_rank.rb +7 -4
- data/lib/text_rank/char_filter/ascii_folding.rb +5 -1
- data/lib/text_rank/char_filter/strip_possessive.rb +2 -2
- data/lib/text_rank/char_filter/undo_contractions.rb +1 -137
- data/lib/text_rank/char_filter/undo_contractions.yml +135 -0
- data/lib/text_rank/char_filter.rb +1 -1
- data/lib/text_rank/fingerprint.rb +10 -18
- data/lib/text_rank/fingerprint_overlap.rb +55 -0
- data/lib/text_rank/graph_strategy/coocurrence.rb +15 -6
- data/lib/text_rank/keyword_extractor.rb +32 -25
- data/lib/text_rank/rank_filter/collapse_adjacent.rb +48 -25
- data/lib/text_rank/rank_filter/normalize_probability.rb +2 -1
- data/lib/text_rank/rank_filter/normalize_unit_vector.rb +2 -1
- data/lib/text_rank/token_filter/part_of_speech.rb +0 -1
- data/lib/text_rank/token_filter/stopwords.rb +1 -321
- data/lib/text_rank/token_filter/stopwords.yml +317 -0
- data/lib/text_rank/tokenizer/money.rb +11 -6
- data/lib/text_rank/tokenizer/number.rb +4 -3
- data/lib/text_rank/tokenizer/punctuation.rb +4 -1
- data/lib/text_rank/tokenizer/url.rb +3 -0
- data/lib/text_rank/tokenizer/whitespace.rb +4 -1
- data/lib/text_rank/tokenizer/word.rb +5 -2
- data/lib/text_rank/tokenizer.rb +1 -1
- data/lib/text_rank/version.rb +3 -1
- data/lib/text_rank.rb +14 -9
- data/text_rank.gemspec +4 -1
- metadata +48 -12
@@ -1,13 +1,17 @@
|
|
1
|
-
# coding: utf-8
|
2
1
|
module TextRank
|
3
2
|
module CharFilter
|
4
3
|
##
|
5
4
|
# Character filter to transform non-ASCII (unicode) characters into ASCII-friendly versions.
|
6
5
|
#
|
6
|
+
# rubocop:disable Style/AsciiComments
|
7
|
+
#
|
7
8
|
# = Example
|
8
9
|
#
|
9
10
|
# AsciiFolding.new.filter!("the Perigordian Abbé then made answer, because a poor beggar of the country of Atrébatie heard some foolish things said")
|
10
11
|
# => "the Perigordian Abbe then made answer, because a poor beggar of the country of Atrebatie heard some foolish things said"
|
12
|
+
#
|
13
|
+
# rubocop:enable Style/AsciiComments
|
14
|
+
#
|
11
15
|
##
|
12
16
|
class AsciiFolding
|
13
17
|
|
@@ -5,7 +5,7 @@ module TextRank
|
|
5
5
|
#
|
6
6
|
# = Example
|
7
7
|
#
|
8
|
-
# StripPosessive.new.filter!("to loathe one
|
8
|
+
# StripPossessive.new.filter!("to loathe one's very being and yet to hold it fast")
|
9
9
|
# => "to loathe one very being and yet to hold it fast"
|
10
10
|
##
|
11
11
|
class StripPossessive
|
@@ -15,7 +15,7 @@ module TextRank
|
|
15
15
|
# @return [String]
|
16
16
|
def filter!(text)
|
17
17
|
text.gsub!(/([a-z]+)'s\b/) do
|
18
|
-
|
18
|
+
Regexp.last_match(1)
|
19
19
|
end
|
20
20
|
end
|
21
21
|
|
@@ -11,143 +11,7 @@ module TextRank
|
|
11
11
|
class UndoContractions
|
12
12
|
|
13
13
|
# List of English contractions to undo
|
14
|
-
CONTRACTIONS =
|
15
|
-
"ain't" => "am not",
|
16
|
-
"amn't" => "am not",
|
17
|
-
"aren't" => "are not",
|
18
|
-
"can't" => "can not",
|
19
|
-
"could've" => "could have",
|
20
|
-
"couldn't" => "could not",
|
21
|
-
"couldn't've" => "could not have",
|
22
|
-
"didn't" => "did not",
|
23
|
-
"doesn't" => "does not",
|
24
|
-
"don't" => "do not",
|
25
|
-
"gonna" => "going to",
|
26
|
-
"hadn't" => "had not",
|
27
|
-
"hadn't've" => "had not have",
|
28
|
-
"hasn't" => "has not",
|
29
|
-
"haven't" => "have not",
|
30
|
-
"he'd" => "he had",
|
31
|
-
"he'd've" => "he would have",
|
32
|
-
"he'll" => "he shall",
|
33
|
-
"he's" => "he has",
|
34
|
-
"he'sn't" => "he has not",
|
35
|
-
"how'd" => "how did",
|
36
|
-
"how'll" => "how will",
|
37
|
-
"how's" => "how has",
|
38
|
-
"i'd" => "i had",
|
39
|
-
"i'd've" => "i would have",
|
40
|
-
"i'll" => "i shall",
|
41
|
-
"i'm" => "i am",
|
42
|
-
"i've" => "i have",
|
43
|
-
"i'ven't" => "i have not",
|
44
|
-
"isn't" => "is not",
|
45
|
-
"it'd" => "it had",
|
46
|
-
"it'd've" => "it would have",
|
47
|
-
"it'll" => "it shall",
|
48
|
-
"it's" => "it has",
|
49
|
-
"it'sn't" => "it has not",
|
50
|
-
"let's" => "let us",
|
51
|
-
"ma'am" => "madam",
|
52
|
-
"mightn't" => "might not",
|
53
|
-
"mightn't've" => "might not have",
|
54
|
-
"might've" => "might have",
|
55
|
-
"mustn't" => "must not",
|
56
|
-
"must've" => "must have",
|
57
|
-
"needn't" => "need not",
|
58
|
-
"not've" => "not have",
|
59
|
-
"o'clock" => "of the clock",
|
60
|
-
"ol'" => "old",
|
61
|
-
"oughtn't" => "ought not",
|
62
|
-
"shan't" => "shall not",
|
63
|
-
"she'd" => "she had",
|
64
|
-
"she'd've" => "she would have",
|
65
|
-
"she'll" => "she shall",
|
66
|
-
"she's" => "she has",
|
67
|
-
"she'sn't" => "she has not",
|
68
|
-
"should've" => "should have",
|
69
|
-
"shouldn't" => "should not",
|
70
|
-
"shouldn't've" => "should not have",
|
71
|
-
"somebody'd" => "somebody had",
|
72
|
-
"somebody'd've" => "somebody would have",
|
73
|
-
"somebody'dn't've" => "somebody would not have",
|
74
|
-
"somebody'll" => "somebody shall",
|
75
|
-
"somebody's" => "somebody has",
|
76
|
-
"someone'd" => "someone had",
|
77
|
-
"someone'd've" => "someone would have",
|
78
|
-
"someone'll" => "someone shall",
|
79
|
-
"someone's" => "someone has",
|
80
|
-
"something'd" => "something had",
|
81
|
-
"something'd've" => "something would have",
|
82
|
-
"something'll" => "something shall",
|
83
|
-
"something's" => "something has",
|
84
|
-
"'sup" => "what's up",
|
85
|
-
"that'll" => "that will",
|
86
|
-
"that's" => "that has",
|
87
|
-
"there'd" => "there had",
|
88
|
-
"there'd've" => "there would have",
|
89
|
-
"there're" => "there are",
|
90
|
-
"there's" => "there has",
|
91
|
-
"they'd" => "they had",
|
92
|
-
"they'dn't" => "they would not",
|
93
|
-
"they'dn't've" => "they would not have",
|
94
|
-
"they'd've" => "they would have",
|
95
|
-
"they'd'ven't" => "they would have not",
|
96
|
-
"they'll" => "they shall",
|
97
|
-
"they'lln't've" => "they will not have",
|
98
|
-
"they'll'ven't" => "they will have not",
|
99
|
-
"they're" => "they are",
|
100
|
-
"they've" => "they have",
|
101
|
-
"they'ven't" => "they have not",
|
102
|
-
"'tis" => "it is",
|
103
|
-
"'twas" => "it was",
|
104
|
-
"wanna" => "want to",
|
105
|
-
"wasn't" => "was not",
|
106
|
-
"we'd" => "we had",
|
107
|
-
"we'd've" => "we would have",
|
108
|
-
"we'dn't've" => "we would not have",
|
109
|
-
"we'll" => "we will",
|
110
|
-
"we'lln't've" => "we will not have",
|
111
|
-
"we're" => "we are",
|
112
|
-
"we've" => "we have",
|
113
|
-
"weren't" => "were not",
|
114
|
-
"what'll" => "what shall",
|
115
|
-
"what're" => "what are",
|
116
|
-
"what's" => "what has",
|
117
|
-
"what've" => "what have",
|
118
|
-
"when's" => "when has",
|
119
|
-
"where'd" => "where did",
|
120
|
-
"where's" => "where has",
|
121
|
-
"where've" => "where have",
|
122
|
-
"who'd" => "who would",
|
123
|
-
"who'd've" => "who would have",
|
124
|
-
"who'll" => "who shall",
|
125
|
-
"who're" => "who are",
|
126
|
-
"who's" => "who has",
|
127
|
-
"who've" => "who have",
|
128
|
-
"why'll" => "why will",
|
129
|
-
"why're" => "why are",
|
130
|
-
"why's" => "why has",
|
131
|
-
"won't" => "will not",
|
132
|
-
"won't've" => "will not have",
|
133
|
-
"would've" => "would have",
|
134
|
-
"wouldn't" => "would not",
|
135
|
-
"wouldn't've" => "would not have",
|
136
|
-
"y'all" => "you all",
|
137
|
-
"y'all'd've" => "you all would have",
|
138
|
-
"y'all'dn't've" => "you all would not have",
|
139
|
-
"y'all'll" => "you all will",
|
140
|
-
"y'all'lln't" => "you all will not",
|
141
|
-
"y'all'll've" => "you all will have",
|
142
|
-
"y'all'll'ven't" => "you all will have not",
|
143
|
-
"you'd" => "you had",
|
144
|
-
"you'd've" => "you would have",
|
145
|
-
"you'll" => "you shall",
|
146
|
-
"you're" => "you are",
|
147
|
-
"you'ren't" => "you are not",
|
148
|
-
"you've" => "you have",
|
149
|
-
"you'ven't" => "you have not",
|
150
|
-
}
|
14
|
+
CONTRACTIONS = YAML.load_file(File.expand_path('undo_contractions.yml', __dir__))
|
151
15
|
|
152
16
|
# Perform the filter
|
153
17
|
# @param text [String]
|
@@ -0,0 +1,135 @@
|
|
1
|
+
ain't: am not
|
2
|
+
amn't: am not
|
3
|
+
aren't: are not
|
4
|
+
can't: can not
|
5
|
+
could've: could have
|
6
|
+
couldn't: could not
|
7
|
+
couldn't've: could not have
|
8
|
+
didn't: did not
|
9
|
+
doesn't: does not
|
10
|
+
don't: do not
|
11
|
+
gonna: going to
|
12
|
+
hadn't: had not
|
13
|
+
hadn't've: had not have
|
14
|
+
hasn't: has not
|
15
|
+
haven't: have not
|
16
|
+
he'd: he had
|
17
|
+
he'd've: he would have
|
18
|
+
he'll: he shall
|
19
|
+
he's: he has
|
20
|
+
he'sn't: he has not
|
21
|
+
how'd: how did
|
22
|
+
how'll: how will
|
23
|
+
how's: how has
|
24
|
+
i'd: i had
|
25
|
+
i'd've: i would have
|
26
|
+
i'll: i shall
|
27
|
+
i'm: i am
|
28
|
+
i've: i have
|
29
|
+
i'ven't: i have not
|
30
|
+
isn't: is not
|
31
|
+
it'd: it had
|
32
|
+
it'd've: it would have
|
33
|
+
it'll: it shall
|
34
|
+
it's: it has
|
35
|
+
it'sn't: it has not
|
36
|
+
let's: let us
|
37
|
+
ma'am: madam
|
38
|
+
mightn't: might not
|
39
|
+
mightn't've: might not have
|
40
|
+
might've: might have
|
41
|
+
mustn't: must not
|
42
|
+
must've: must have
|
43
|
+
needn't: need not
|
44
|
+
not've: not have
|
45
|
+
o'clock: of the clock
|
46
|
+
ol': old
|
47
|
+
oughtn't: ought not
|
48
|
+
shan't: shall not
|
49
|
+
she'd: she had
|
50
|
+
she'd've: she would have
|
51
|
+
she'll: she shall
|
52
|
+
she's: she has
|
53
|
+
she'sn't: she has not
|
54
|
+
should've: should have
|
55
|
+
shouldn't: should not
|
56
|
+
shouldn't've: should not have
|
57
|
+
somebody'd: somebody had
|
58
|
+
somebody'd've: somebody would have
|
59
|
+
somebody'dn't've: somebody would not have
|
60
|
+
somebody'll: somebody shall
|
61
|
+
somebody's: somebody has
|
62
|
+
someone'd: someone had
|
63
|
+
someone'd've: someone would have
|
64
|
+
someone'll: someone shall
|
65
|
+
someone's: someone has
|
66
|
+
something'd: something had
|
67
|
+
something'd've: something would have
|
68
|
+
something'll: something shall
|
69
|
+
something's: something has
|
70
|
+
"'sup": "what's up"
|
71
|
+
that'll: that will
|
72
|
+
that's: that has
|
73
|
+
there'd: there had
|
74
|
+
there'd've: there would have
|
75
|
+
there're: there are
|
76
|
+
there's: there has
|
77
|
+
they'd: they had
|
78
|
+
they'dn't: they would not
|
79
|
+
they'dn't've: they would not have
|
80
|
+
they'd've: they would have
|
81
|
+
they'd'ven't: they would have not
|
82
|
+
they'll: they shall
|
83
|
+
they'lln't've: they will not have
|
84
|
+
they'll'ven't: they will have not
|
85
|
+
they're: they are
|
86
|
+
they've: they have
|
87
|
+
they'ven't: they have not
|
88
|
+
"'tis": it is
|
89
|
+
"'twas": it was
|
90
|
+
wanna: want to
|
91
|
+
wasn't: was not
|
92
|
+
we'd: we had
|
93
|
+
we'd've: we would have
|
94
|
+
we'dn't've: we would not have
|
95
|
+
we'll: we will
|
96
|
+
we'lln't've: we will not have
|
97
|
+
we're: we are
|
98
|
+
we've: we have
|
99
|
+
weren't: were not
|
100
|
+
what'll: what shall
|
101
|
+
what're: what are
|
102
|
+
what's: what has
|
103
|
+
what've: what have
|
104
|
+
when's: when has
|
105
|
+
where'd: where did
|
106
|
+
where's: where has
|
107
|
+
where've: where have
|
108
|
+
who'd: who would
|
109
|
+
who'd've: who would have
|
110
|
+
who'll: who shall
|
111
|
+
who're: who are
|
112
|
+
who's: who has
|
113
|
+
who've: who have
|
114
|
+
why'll: why will
|
115
|
+
why're: why are
|
116
|
+
why's: why has
|
117
|
+
won't: will not
|
118
|
+
won't've: will not have
|
119
|
+
would've: would have
|
120
|
+
wouldn't: would not
|
121
|
+
wouldn't've: would not have
|
122
|
+
y'all: you all
|
123
|
+
y'all'd've: you all would have
|
124
|
+
y'all'dn't've: you all would not have
|
125
|
+
y'all'll: you all will
|
126
|
+
y'all'lln't: you all will not
|
127
|
+
y'all'll've: you all will have
|
128
|
+
y'all'll'ven't: you all will have not
|
129
|
+
you'd: you had
|
130
|
+
you'd've: you would have
|
131
|
+
you'll: you shall
|
132
|
+
you're: you are
|
133
|
+
you'ren't: you are not
|
134
|
+
you've: you have
|
135
|
+
you'ven't: you have not
|
@@ -7,7 +7,7 @@ module TextRank
|
|
7
7
|
# converting non-ascii characters to related ascii characters, forcing text to
|
8
8
|
# lower case, stripping out HTML, converting English contractions (e.g. "won't")
|
9
9
|
# to the non-contracted form ("will not"), and more.
|
10
|
-
#
|
10
|
+
#
|
11
11
|
# Character filters are applied as a chain, so care should be taken to use them
|
12
12
|
# in the desired order.
|
13
13
|
##
|
@@ -1,5 +1,3 @@
|
|
1
|
-
require 'set'
|
2
|
-
|
3
1
|
module TextRank
|
4
2
|
##
|
5
3
|
# Class used to compare documents according to TextRank. A "fingerprint"
|
@@ -61,28 +59,22 @@ module TextRank
|
|
61
59
|
# Calculates the "similarity" between this fingerprint and another
|
62
60
|
# @param other [Fingerprint] A second fingerprint to compare
|
63
61
|
# @return [Number] A number between 0.0 (different) and 1.0 (same)
|
64
|
-
def similarity(
|
65
|
-
return 1.0 if values ==
|
66
|
-
|
67
|
-
sim = 0
|
68
|
-
s1 = Set.new
|
69
|
-
s2 = Set.new
|
62
|
+
def similarity(other)
|
63
|
+
return 1.0 if values == other.values # Short-circuit for efficiency
|
70
64
|
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
if v1 == v2
|
75
|
-
sim += 1
|
76
|
-
else
|
77
|
-
s1.delete?(v2) ? (sim += 1) : (s2 << v2)
|
78
|
-
s2.delete?(v1) ? (sim += 1) : (s1 << v1)
|
79
|
-
end
|
80
|
-
sum + sim * linear_transform[i]
|
65
|
+
sum = 0
|
66
|
+
overlap(other).each_with_index do |overlap_value, i|
|
67
|
+
sum += overlap_value * linear_transform[i]
|
81
68
|
end
|
69
|
+
sum
|
82
70
|
end
|
83
71
|
|
84
72
|
private
|
85
73
|
|
74
|
+
def overlap(other)
|
75
|
+
FingerprintOverlap.new(values, other.values).overlap
|
76
|
+
end
|
77
|
+
|
86
78
|
def linear_transform
|
87
79
|
@linear_transform ||= size.times.map do |i|
|
88
80
|
1.0 / Math.log(i + 2) / size.to_f / norm_factor
|
@@ -0,0 +1,55 @@
|
|
1
|
+
module TextRank
|
2
|
+
##
|
3
|
+
# Determines "overlap" between two fingerprints at each of their N prefixes
|
4
|
+
#
|
5
|
+
# For example,
|
6
|
+
#
|
7
|
+
# FingerprintOverlap.new(
|
8
|
+
# %w[a b c d],
|
9
|
+
# %w[b e a c],
|
10
|
+
# ).overlap
|
11
|
+
#
|
12
|
+
# => [
|
13
|
+
# 0, # [a] & [b] have no overlap
|
14
|
+
# 1, # [a b] & [b e] have one overlap: b
|
15
|
+
# 2, # [a b c] & [b e a] have two overlaps: a & b
|
16
|
+
# 3, # [a b c d] & [b e a c] have three overlaps: a, b, & c
|
17
|
+
# ]
|
18
|
+
##
|
19
|
+
class FingerprintOverlap
|
20
|
+
|
21
|
+
attr_reader :overlap
|
22
|
+
|
23
|
+
def initialize(values1, values2)
|
24
|
+
raise ArgumentError, 'Value size mismatch' if values1.size != values2.size
|
25
|
+
|
26
|
+
@encountered1 = Set.new
|
27
|
+
@encountered2 = Set.new
|
28
|
+
@overlap_count = 0
|
29
|
+
|
30
|
+
@overlap = determine_overlap(values1, values2)
|
31
|
+
end
|
32
|
+
|
33
|
+
private
|
34
|
+
|
35
|
+
def determine_overlap(values1, values2)
|
36
|
+
values1.zip(values2).map do |v1, v2|
|
37
|
+
encounter(v1, v2)
|
38
|
+
@overlap_count
|
39
|
+
end
|
40
|
+
end
|
41
|
+
|
42
|
+
# This algorithm is a little more complex than could be represented in Ruby,
|
43
|
+
# but we want to keep it as performant as possible.
|
44
|
+
def encounter(value1, value2)
|
45
|
+
if value1 == value2
|
46
|
+
@overlap_count += 1
|
47
|
+
else
|
48
|
+
# Delete from the set in case an element appears more than once
|
49
|
+
@encountered1.delete?(value2) ? (@overlap_count += 1) : (@encountered2 << value2)
|
50
|
+
@encountered2.delete?(value1) ? (@overlap_count += 1) : (@encountered1 << value1)
|
51
|
+
end
|
52
|
+
end
|
53
|
+
|
54
|
+
end
|
55
|
+
end
|
@@ -61,18 +61,27 @@ module TextRank
|
|
61
61
|
# @return [nil]
|
62
62
|
def build_graph(tokens, graph)
|
63
63
|
ngram_window = @ngram_size * 2 + 1
|
64
|
-
tokens.
|
64
|
+
tokens.size.times do |i|
|
65
65
|
ngram_window.times do |j|
|
66
|
-
|
67
|
-
token_j = tokens[i - @ngram_size + j]
|
68
|
-
if token_j
|
69
|
-
graph.add(token_i, token_j, weight: 1.0 / (j - @ngram_size).abs)
|
70
|
-
end
|
66
|
+
consider_ngram_window(tokens, graph, i, j)
|
71
67
|
end
|
72
68
|
end
|
73
69
|
nil
|
74
70
|
end
|
75
71
|
|
72
|
+
private
|
73
|
+
|
74
|
+
def consider_ngram_window(tokens, graph, i, j)
|
75
|
+
return if j == @ngram_size || i + j < @ngram_size
|
76
|
+
|
77
|
+
token_i = tokens[i]
|
78
|
+
token_j = tokens[i - @ngram_size + j]
|
79
|
+
|
80
|
+
if token_j
|
81
|
+
graph.add(token_i, token_j, weight: 1.0 / (j - @ngram_size).abs)
|
82
|
+
end
|
83
|
+
end
|
84
|
+
|
76
85
|
end
|
77
86
|
end
|
78
87
|
end
|
@@ -13,9 +13,9 @@ module TextRank
|
|
13
13
|
# @return [KeywordExtractor]
|
14
14
|
def self.basic(**options)
|
15
15
|
new(**{
|
16
|
-
char_filters: [
|
17
|
-
tokenizers: [
|
18
|
-
token_filters: [
|
16
|
+
char_filters: %i[AsciiFolding Lowercase],
|
17
|
+
tokenizers: %i[Word],
|
18
|
+
token_filters: %i[Stopwords MinLength],
|
19
19
|
graph_strategy: :Coocurrence,
|
20
20
|
}.merge(options))
|
21
21
|
end
|
@@ -25,11 +25,11 @@ module TextRank
|
|
25
25
|
# @return [KeywordExtractor]
|
26
26
|
def self.advanced(**options)
|
27
27
|
new(**{
|
28
|
-
char_filters: [
|
29
|
-
tokenizers: [
|
30
|
-
token_filters: [
|
28
|
+
char_filters: %i[AsciiFolding Lowercase StripHtml StripEmail UndoContractions StripPossessive],
|
29
|
+
tokenizers: %i[Url Money Number Word Punctuation],
|
30
|
+
token_filters: %i[PartOfSpeech Stopwords MinLength],
|
31
31
|
graph_strategy: :Coocurrence,
|
32
|
-
rank_filters: [
|
32
|
+
rank_filters: %i[CollapseAdjacent NormalizeUnitVector SortByValue],
|
33
33
|
}.merge(options))
|
34
34
|
end
|
35
35
|
|
@@ -41,14 +41,14 @@ module TextRank
|
|
41
41
|
# @option options [Array<Class, Symbol, #filter!>] :rank_filters A list of filters to be applied to the keyword ranks after keyword extraction
|
42
42
|
def initialize(**options)
|
43
43
|
@page_rank_options = {
|
44
|
-
strategy:
|
45
|
-
damping:
|
44
|
+
strategy: options[:strategy] || :sparse,
|
45
|
+
damping: options[:damping],
|
46
46
|
tolerance: options[:tolerance],
|
47
47
|
}
|
48
|
-
@char_filters
|
49
|
-
@tokenizers
|
50
|
-
@token_filters
|
51
|
-
@rank_filters
|
48
|
+
@char_filters = options[:char_filters] || []
|
49
|
+
@tokenizers = options[:tokenizers] || [Tokenizer::Word]
|
50
|
+
@token_filters = options[:token_filters] || []
|
51
|
+
@rank_filters = options[:rank_filters] || []
|
52
52
|
@graph_strategy = options[:graph_strategy] || GraphStrategy::Coocurrence
|
53
53
|
end
|
54
54
|
|
@@ -73,9 +73,7 @@ module TextRank
|
|
73
73
|
# Sets the graph strategy for producing a graph from tokens
|
74
74
|
# @param strategy [Class, Symbol, #build_graph] Strategy for producing a graph from tokens
|
75
75
|
# @return [Class, Symbol, #build_graph]
|
76
|
-
|
77
|
-
@graph_strategy = strategy
|
78
|
-
end
|
76
|
+
attr_writer :graph_strategy
|
79
77
|
|
80
78
|
# Add a new TokenFilter for processing tokens after tokenization
|
81
79
|
# @param filter [Class, Symbol, #filter!] A filter to process tokens after tokenization
|
@@ -105,14 +103,23 @@ module TextRank
|
|
105
103
|
end
|
106
104
|
|
107
105
|
# Filter & tokenize text, and return PageRank
|
108
|
-
# @param text [String] unfiltered text to be processed
|
106
|
+
# @param text [String,Array<String>] unfiltered text to be processed
|
109
107
|
# @return [Hash<String, Float>] tokens and page ranks (in descending order)
|
110
108
|
def extract(text, **options)
|
111
|
-
|
109
|
+
text = Array(text)
|
110
|
+
tokens_per_text = text.map do |t|
|
111
|
+
tokenize(t)
|
112
|
+
end
|
112
113
|
graph = PageRank.new(**@page_rank_options)
|
113
|
-
classify(@graph_strategy, context: GraphStrategy)
|
114
|
+
strategy = classify(@graph_strategy, context: GraphStrategy)
|
115
|
+
tokens_per_text.each do |tokens|
|
116
|
+
strategy.build_graph(tokens, graph)
|
117
|
+
end
|
114
118
|
ranks = graph.calculate(**options)
|
115
|
-
|
119
|
+
tokens_per_text.each_with_index do |tokens, i|
|
120
|
+
ranks = apply_rank_filters(ranks, tokens: tokens, original_text: text[i])
|
121
|
+
end
|
122
|
+
ranks
|
116
123
|
end
|
117
124
|
|
118
125
|
private
|
@@ -153,14 +160,14 @@ module TextRank
|
|
153
160
|
array.insert(idx, value)
|
154
161
|
end
|
155
162
|
|
156
|
-
def classify(
|
157
|
-
case
|
163
|
+
def classify(clazz, context: self)
|
164
|
+
case clazz
|
158
165
|
when Class
|
159
|
-
|
166
|
+
clazz.new
|
160
167
|
when Symbol
|
161
|
-
context.const_get(
|
168
|
+
context.const_get(clazz).new
|
162
169
|
else
|
163
|
-
|
170
|
+
clazz
|
164
171
|
end
|
165
172
|
end
|
166
173
|
|