text_rank 1.2.3 → 1.3.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.codeclimate.yml +1 -1
- data/.gitignore +4 -0
- data/.rubocop.yml +7 -0
- data/.ruby-version +1 -1
- data/.travis.yml +1 -0
- data/Rakefile +5 -0
- data/bin/console +3 -3
- data/ext/text_rank/extconf.rb +3 -0
- data/ext/text_rank/page_rank_sparse_native.c +300 -0
- data/ext/text_rank/page_rank_sparse_native.h +93 -0
- data/ext/text_rank/text_rank.c +5 -0
- data/lib/page_rank/base.rb +12 -9
- data/lib/page_rank/dense.rb +3 -2
- data/lib/page_rank/sparse.rb +6 -7
- data/lib/page_rank/sparse_native.rb +21 -0
- data/lib/page_rank.rb +7 -4
- data/lib/text_rank/char_filter/ascii_folding.rb +5 -1
- data/lib/text_rank/char_filter/strip_possessive.rb +2 -2
- data/lib/text_rank/char_filter/undo_contractions.rb +1 -137
- data/lib/text_rank/char_filter/undo_contractions.yml +135 -0
- data/lib/text_rank/char_filter.rb +1 -1
- data/lib/text_rank/fingerprint.rb +10 -18
- data/lib/text_rank/fingerprint_overlap.rb +55 -0
- data/lib/text_rank/graph_strategy/coocurrence.rb +15 -6
- data/lib/text_rank/keyword_extractor.rb +32 -25
- data/lib/text_rank/rank_filter/collapse_adjacent.rb +48 -25
- data/lib/text_rank/rank_filter/normalize_probability.rb +2 -1
- data/lib/text_rank/rank_filter/normalize_unit_vector.rb +2 -1
- data/lib/text_rank/token_filter/part_of_speech.rb +0 -1
- data/lib/text_rank/token_filter/stopwords.rb +1 -321
- data/lib/text_rank/token_filter/stopwords.yml +317 -0
- data/lib/text_rank/tokenizer/money.rb +11 -6
- data/lib/text_rank/tokenizer/number.rb +4 -3
- data/lib/text_rank/tokenizer/punctuation.rb +4 -1
- data/lib/text_rank/tokenizer/url.rb +3 -0
- data/lib/text_rank/tokenizer/whitespace.rb +4 -1
- data/lib/text_rank/tokenizer/word.rb +5 -2
- data/lib/text_rank/tokenizer.rb +1 -1
- data/lib/text_rank/version.rb +3 -1
- data/lib/text_rank.rb +14 -9
- data/text_rank.gemspec +4 -1
- metadata +48 -12
@@ -1,13 +1,17 @@
|
|
1
|
-
# coding: utf-8
|
2
1
|
module TextRank
|
3
2
|
module CharFilter
|
4
3
|
##
|
5
4
|
# Character filter to transform non-ASCII (unicode) characters into ASCII-friendly versions.
|
6
5
|
#
|
6
|
+
# rubocop:disable Style/AsciiComments
|
7
|
+
#
|
7
8
|
# = Example
|
8
9
|
#
|
9
10
|
# AsciiFolding.new.filter!("the Perigordian Abbé then made answer, because a poor beggar of the country of Atrébatie heard some foolish things said")
|
10
11
|
# => "the Perigordian Abbe then made answer, because a poor beggar of the country of Atrebatie heard some foolish things said"
|
12
|
+
#
|
13
|
+
# rubocop:enable Style/AsciiComments
|
14
|
+
#
|
11
15
|
##
|
12
16
|
class AsciiFolding
|
13
17
|
|
@@ -5,7 +5,7 @@ module TextRank
|
|
5
5
|
#
|
6
6
|
# = Example
|
7
7
|
#
|
8
|
-
# StripPosessive.new.filter!("to loathe one
|
8
|
+
# StripPossessive.new.filter!("to loathe one's very being and yet to hold it fast")
|
9
9
|
# => "to loathe one very being and yet to hold it fast"
|
10
10
|
##
|
11
11
|
class StripPossessive
|
@@ -15,7 +15,7 @@ module TextRank
|
|
15
15
|
# @return [String]
|
16
16
|
def filter!(text)
|
17
17
|
text.gsub!(/([a-z]+)'s\b/) do
|
18
|
-
|
18
|
+
Regexp.last_match(1)
|
19
19
|
end
|
20
20
|
end
|
21
21
|
|
@@ -11,143 +11,7 @@ module TextRank
|
|
11
11
|
class UndoContractions
|
12
12
|
|
13
13
|
# List of English contractions to undo
|
14
|
-
CONTRACTIONS =
|
15
|
-
"ain't" => "am not",
|
16
|
-
"amn't" => "am not",
|
17
|
-
"aren't" => "are not",
|
18
|
-
"can't" => "can not",
|
19
|
-
"could've" => "could have",
|
20
|
-
"couldn't" => "could not",
|
21
|
-
"couldn't've" => "could not have",
|
22
|
-
"didn't" => "did not",
|
23
|
-
"doesn't" => "does not",
|
24
|
-
"don't" => "do not",
|
25
|
-
"gonna" => "going to",
|
26
|
-
"hadn't" => "had not",
|
27
|
-
"hadn't've" => "had not have",
|
28
|
-
"hasn't" => "has not",
|
29
|
-
"haven't" => "have not",
|
30
|
-
"he'd" => "he had",
|
31
|
-
"he'd've" => "he would have",
|
32
|
-
"he'll" => "he shall",
|
33
|
-
"he's" => "he has",
|
34
|
-
"he'sn't" => "he has not",
|
35
|
-
"how'd" => "how did",
|
36
|
-
"how'll" => "how will",
|
37
|
-
"how's" => "how has",
|
38
|
-
"i'd" => "i had",
|
39
|
-
"i'd've" => "i would have",
|
40
|
-
"i'll" => "i shall",
|
41
|
-
"i'm" => "i am",
|
42
|
-
"i've" => "i have",
|
43
|
-
"i'ven't" => "i have not",
|
44
|
-
"isn't" => "is not",
|
45
|
-
"it'd" => "it had",
|
46
|
-
"it'd've" => "it would have",
|
47
|
-
"it'll" => "it shall",
|
48
|
-
"it's" => "it has",
|
49
|
-
"it'sn't" => "it has not",
|
50
|
-
"let's" => "let us",
|
51
|
-
"ma'am" => "madam",
|
52
|
-
"mightn't" => "might not",
|
53
|
-
"mightn't've" => "might not have",
|
54
|
-
"might've" => "might have",
|
55
|
-
"mustn't" => "must not",
|
56
|
-
"must've" => "must have",
|
57
|
-
"needn't" => "need not",
|
58
|
-
"not've" => "not have",
|
59
|
-
"o'clock" => "of the clock",
|
60
|
-
"ol'" => "old",
|
61
|
-
"oughtn't" => "ought not",
|
62
|
-
"shan't" => "shall not",
|
63
|
-
"she'd" => "she had",
|
64
|
-
"she'd've" => "she would have",
|
65
|
-
"she'll" => "she shall",
|
66
|
-
"she's" => "she has",
|
67
|
-
"she'sn't" => "she has not",
|
68
|
-
"should've" => "should have",
|
69
|
-
"shouldn't" => "should not",
|
70
|
-
"shouldn't've" => "should not have",
|
71
|
-
"somebody'd" => "somebody had",
|
72
|
-
"somebody'd've" => "somebody would have",
|
73
|
-
"somebody'dn't've" => "somebody would not have",
|
74
|
-
"somebody'll" => "somebody shall",
|
75
|
-
"somebody's" => "somebody has",
|
76
|
-
"someone'd" => "someone had",
|
77
|
-
"someone'd've" => "someone would have",
|
78
|
-
"someone'll" => "someone shall",
|
79
|
-
"someone's" => "someone has",
|
80
|
-
"something'd" => "something had",
|
81
|
-
"something'd've" => "something would have",
|
82
|
-
"something'll" => "something shall",
|
83
|
-
"something's" => "something has",
|
84
|
-
"'sup" => "what's up",
|
85
|
-
"that'll" => "that will",
|
86
|
-
"that's" => "that has",
|
87
|
-
"there'd" => "there had",
|
88
|
-
"there'd've" => "there would have",
|
89
|
-
"there're" => "there are",
|
90
|
-
"there's" => "there has",
|
91
|
-
"they'd" => "they had",
|
92
|
-
"they'dn't" => "they would not",
|
93
|
-
"they'dn't've" => "they would not have",
|
94
|
-
"they'd've" => "they would have",
|
95
|
-
"they'd'ven't" => "they would have not",
|
96
|
-
"they'll" => "they shall",
|
97
|
-
"they'lln't've" => "they will not have",
|
98
|
-
"they'll'ven't" => "they will have not",
|
99
|
-
"they're" => "they are",
|
100
|
-
"they've" => "they have",
|
101
|
-
"they'ven't" => "they have not",
|
102
|
-
"'tis" => "it is",
|
103
|
-
"'twas" => "it was",
|
104
|
-
"wanna" => "want to",
|
105
|
-
"wasn't" => "was not",
|
106
|
-
"we'd" => "we had",
|
107
|
-
"we'd've" => "we would have",
|
108
|
-
"we'dn't've" => "we would not have",
|
109
|
-
"we'll" => "we will",
|
110
|
-
"we'lln't've" => "we will not have",
|
111
|
-
"we're" => "we are",
|
112
|
-
"we've" => "we have",
|
113
|
-
"weren't" => "were not",
|
114
|
-
"what'll" => "what shall",
|
115
|
-
"what're" => "what are",
|
116
|
-
"what's" => "what has",
|
117
|
-
"what've" => "what have",
|
118
|
-
"when's" => "when has",
|
119
|
-
"where'd" => "where did",
|
120
|
-
"where's" => "where has",
|
121
|
-
"where've" => "where have",
|
122
|
-
"who'd" => "who would",
|
123
|
-
"who'd've" => "who would have",
|
124
|
-
"who'll" => "who shall",
|
125
|
-
"who're" => "who are",
|
126
|
-
"who's" => "who has",
|
127
|
-
"who've" => "who have",
|
128
|
-
"why'll" => "why will",
|
129
|
-
"why're" => "why are",
|
130
|
-
"why's" => "why has",
|
131
|
-
"won't" => "will not",
|
132
|
-
"won't've" => "will not have",
|
133
|
-
"would've" => "would have",
|
134
|
-
"wouldn't" => "would not",
|
135
|
-
"wouldn't've" => "would not have",
|
136
|
-
"y'all" => "you all",
|
137
|
-
"y'all'd've" => "you all would have",
|
138
|
-
"y'all'dn't've" => "you all would not have",
|
139
|
-
"y'all'll" => "you all will",
|
140
|
-
"y'all'lln't" => "you all will not",
|
141
|
-
"y'all'll've" => "you all will have",
|
142
|
-
"y'all'll'ven't" => "you all will have not",
|
143
|
-
"you'd" => "you had",
|
144
|
-
"you'd've" => "you would have",
|
145
|
-
"you'll" => "you shall",
|
146
|
-
"you're" => "you are",
|
147
|
-
"you'ren't" => "you are not",
|
148
|
-
"you've" => "you have",
|
149
|
-
"you'ven't" => "you have not",
|
150
|
-
}
|
14
|
+
CONTRACTIONS = YAML.load_file(File.expand_path('undo_contractions.yml', __dir__))
|
151
15
|
|
152
16
|
# Perform the filter
|
153
17
|
# @param text [String]
|
@@ -0,0 +1,135 @@
|
|
1
|
+
ain't: am not
|
2
|
+
amn't: am not
|
3
|
+
aren't: are not
|
4
|
+
can't: can not
|
5
|
+
could've: could have
|
6
|
+
couldn't: could not
|
7
|
+
couldn't've: could not have
|
8
|
+
didn't: did not
|
9
|
+
doesn't: does not
|
10
|
+
don't: do not
|
11
|
+
gonna: going to
|
12
|
+
hadn't: had not
|
13
|
+
hadn't've: had not have
|
14
|
+
hasn't: has not
|
15
|
+
haven't: have not
|
16
|
+
he'd: he had
|
17
|
+
he'd've: he would have
|
18
|
+
he'll: he shall
|
19
|
+
he's: he has
|
20
|
+
he'sn't: he has not
|
21
|
+
how'd: how did
|
22
|
+
how'll: how will
|
23
|
+
how's: how has
|
24
|
+
i'd: i had
|
25
|
+
i'd've: i would have
|
26
|
+
i'll: i shall
|
27
|
+
i'm: i am
|
28
|
+
i've: i have
|
29
|
+
i'ven't: i have not
|
30
|
+
isn't: is not
|
31
|
+
it'd: it had
|
32
|
+
it'd've: it would have
|
33
|
+
it'll: it shall
|
34
|
+
it's: it has
|
35
|
+
it'sn't: it has not
|
36
|
+
let's: let us
|
37
|
+
ma'am: madam
|
38
|
+
mightn't: might not
|
39
|
+
mightn't've: might not have
|
40
|
+
might've: might have
|
41
|
+
mustn't: must not
|
42
|
+
must've: must have
|
43
|
+
needn't: need not
|
44
|
+
not've: not have
|
45
|
+
o'clock: of the clock
|
46
|
+
ol': old
|
47
|
+
oughtn't: ought not
|
48
|
+
shan't: shall not
|
49
|
+
she'd: she had
|
50
|
+
she'd've: she would have
|
51
|
+
she'll: she shall
|
52
|
+
she's: she has
|
53
|
+
she'sn't: she has not
|
54
|
+
should've: should have
|
55
|
+
shouldn't: should not
|
56
|
+
shouldn't've: should not have
|
57
|
+
somebody'd: somebody had
|
58
|
+
somebody'd've: somebody would have
|
59
|
+
somebody'dn't've: somebody would not have
|
60
|
+
somebody'll: somebody shall
|
61
|
+
somebody's: somebody has
|
62
|
+
someone'd: someone had
|
63
|
+
someone'd've: someone would have
|
64
|
+
someone'll: someone shall
|
65
|
+
someone's: someone has
|
66
|
+
something'd: something had
|
67
|
+
something'd've: something would have
|
68
|
+
something'll: something shall
|
69
|
+
something's: something has
|
70
|
+
"'sup": "what's up"
|
71
|
+
that'll: that will
|
72
|
+
that's: that has
|
73
|
+
there'd: there had
|
74
|
+
there'd've: there would have
|
75
|
+
there're: there are
|
76
|
+
there's: there has
|
77
|
+
they'd: they had
|
78
|
+
they'dn't: they would not
|
79
|
+
they'dn't've: they would not have
|
80
|
+
they'd've: they would have
|
81
|
+
they'd'ven't: they would have not
|
82
|
+
they'll: they shall
|
83
|
+
they'lln't've: they will not have
|
84
|
+
they'll'ven't: they will have not
|
85
|
+
they're: they are
|
86
|
+
they've: they have
|
87
|
+
they'ven't: they have not
|
88
|
+
"'tis": it is
|
89
|
+
"'twas": it was
|
90
|
+
wanna: want to
|
91
|
+
wasn't: was not
|
92
|
+
we'd: we had
|
93
|
+
we'd've: we would have
|
94
|
+
we'dn't've: we would not have
|
95
|
+
we'll: we will
|
96
|
+
we'lln't've: we will not have
|
97
|
+
we're: we are
|
98
|
+
we've: we have
|
99
|
+
weren't: were not
|
100
|
+
what'll: what shall
|
101
|
+
what're: what are
|
102
|
+
what's: what has
|
103
|
+
what've: what have
|
104
|
+
when's: when has
|
105
|
+
where'd: where did
|
106
|
+
where's: where has
|
107
|
+
where've: where have
|
108
|
+
who'd: who would
|
109
|
+
who'd've: who would have
|
110
|
+
who'll: who shall
|
111
|
+
who're: who are
|
112
|
+
who's: who has
|
113
|
+
who've: who have
|
114
|
+
why'll: why will
|
115
|
+
why're: why are
|
116
|
+
why's: why has
|
117
|
+
won't: will not
|
118
|
+
won't've: will not have
|
119
|
+
would've: would have
|
120
|
+
wouldn't: would not
|
121
|
+
wouldn't've: would not have
|
122
|
+
y'all: you all
|
123
|
+
y'all'd've: you all would have
|
124
|
+
y'all'dn't've: you all would not have
|
125
|
+
y'all'll: you all will
|
126
|
+
y'all'lln't: you all will not
|
127
|
+
y'all'll've: you all will have
|
128
|
+
y'all'll'ven't: you all will have not
|
129
|
+
you'd: you had
|
130
|
+
you'd've: you would have
|
131
|
+
you'll: you shall
|
132
|
+
you're: you are
|
133
|
+
you'ren't: you are not
|
134
|
+
you've: you have
|
135
|
+
you'ven't: you have not
|
@@ -7,7 +7,7 @@ module TextRank
|
|
7
7
|
# converting non-ascii characters to related ascii characters, forcing text to
|
8
8
|
# lower case, stripping out HTML, converting English contractions (e.g. "won't")
|
9
9
|
# to the non-contracted form ("will not"), and more.
|
10
|
-
#
|
10
|
+
#
|
11
11
|
# Character filters are applied as a chain, so care should be taken to use them
|
12
12
|
# in the desired order.
|
13
13
|
##
|
@@ -1,5 +1,3 @@
|
|
1
|
-
require 'set'
|
2
|
-
|
3
1
|
module TextRank
|
4
2
|
##
|
5
3
|
# Class used to compare documents according to TextRank. A "fingerprint"
|
@@ -61,28 +59,22 @@ module TextRank
|
|
61
59
|
# Calculates the "similarity" between this fingerprint and another
|
62
60
|
# @param other [Fingerprint] A second fingerprint to compare
|
63
61
|
# @return [Number] A number between 0.0 (different) and 1.0 (same)
|
64
|
-
def similarity(
|
65
|
-
return 1.0 if values ==
|
66
|
-
|
67
|
-
sim = 0
|
68
|
-
s1 = Set.new
|
69
|
-
s2 = Set.new
|
62
|
+
def similarity(other)
|
63
|
+
return 1.0 if values == other.values # Short-circuit for efficiency
|
70
64
|
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
if v1 == v2
|
75
|
-
sim += 1
|
76
|
-
else
|
77
|
-
s1.delete?(v2) ? (sim += 1) : (s2 << v2)
|
78
|
-
s2.delete?(v1) ? (sim += 1) : (s1 << v1)
|
79
|
-
end
|
80
|
-
sum + sim * linear_transform[i]
|
65
|
+
sum = 0
|
66
|
+
overlap(other).each_with_index do |overlap_value, i|
|
67
|
+
sum += overlap_value * linear_transform[i]
|
81
68
|
end
|
69
|
+
sum
|
82
70
|
end
|
83
71
|
|
84
72
|
private
|
85
73
|
|
74
|
+
def overlap(other)
|
75
|
+
FingerprintOverlap.new(values, other.values).overlap
|
76
|
+
end
|
77
|
+
|
86
78
|
def linear_transform
|
87
79
|
@linear_transform ||= size.times.map do |i|
|
88
80
|
1.0 / Math.log(i + 2) / size.to_f / norm_factor
|
@@ -0,0 +1,55 @@
|
|
1
|
+
module TextRank
|
2
|
+
##
|
3
|
+
# Determines "overlap" between two fingerprints at each N prefixes
|
4
|
+
#
|
5
|
+
# For example,
|
6
|
+
#
|
7
|
+
# FingerprintOverlap.new(
|
8
|
+
# %w[a b c d],
|
9
|
+
# %w[b e a c],
|
10
|
+
# ).overlap
|
11
|
+
#
|
12
|
+
# => [
|
13
|
+
# 0, # [a] & [b] have no overlap
|
14
|
+
# 1, # [a b] & [b e] have one overlap: b
|
15
|
+
# 2, # [a b c] & [b e a] have two overlap: a & b
|
16
|
+
# 3, # [a b c d] & [b e a c] have three overlap: a, b, & c
|
17
|
+
# ]
|
18
|
+
##
|
19
|
+
class FingerprintOverlap
|
20
|
+
|
21
|
+
attr_reader :overlap
|
22
|
+
|
23
|
+
def initialize(values1, values2)
|
24
|
+
raise ArgumentError, 'Value size mismatch' if values1.size != values2.size
|
25
|
+
|
26
|
+
@encountered1 = Set.new
|
27
|
+
@encountered2 = Set.new
|
28
|
+
@overlap_count = 0
|
29
|
+
|
30
|
+
@overlap = determine_overlap(values1, values2)
|
31
|
+
end
|
32
|
+
|
33
|
+
private
|
34
|
+
|
35
|
+
def determine_overlap(values1, values2)
|
36
|
+
values1.zip(values2).map do |v1, v2|
|
37
|
+
encounter(v1, v2)
|
38
|
+
@overlap_count
|
39
|
+
end
|
40
|
+
end
|
41
|
+
|
42
|
+
# This algorithm is a little more complex than the simplest way it could be written in Ruby,
|
43
|
+
# but we want to keep it as performant as possible.
|
44
|
+
def encounter(value1, value2)
|
45
|
+
if value1 == value2
|
46
|
+
@overlap_count += 1
|
47
|
+
else
|
48
|
+
# Delete from the set in case an element appears more than once
|
49
|
+
@encountered1.delete?(value2) ? (@overlap_count += 1) : (@encountered2 << value2)
|
50
|
+
@encountered2.delete?(value1) ? (@overlap_count += 1) : (@encountered1 << value1)
|
51
|
+
end
|
52
|
+
end
|
53
|
+
|
54
|
+
end
|
55
|
+
end
|
@@ -61,18 +61,27 @@ module TextRank
|
|
61
61
|
# @return [nil]
|
62
62
|
def build_graph(tokens, graph)
|
63
63
|
ngram_window = @ngram_size * 2 + 1
|
64
|
-
tokens.
|
64
|
+
tokens.size.times do |i|
|
65
65
|
ngram_window.times do |j|
|
66
|
-
|
67
|
-
token_j = tokens[i - @ngram_size + j]
|
68
|
-
if token_j
|
69
|
-
graph.add(token_i, token_j, weight: 1.0 / (j - @ngram_size).abs)
|
70
|
-
end
|
66
|
+
consider_ngram_window(tokens, graph, i, j)
|
71
67
|
end
|
72
68
|
end
|
73
69
|
nil
|
74
70
|
end
|
75
71
|
|
72
|
+
private
|
73
|
+
|
74
|
+
def consider_ngram_window(tokens, graph, i, j)
|
75
|
+
return if j == @ngram_size || i + j < @ngram_size
|
76
|
+
|
77
|
+
token_i = tokens[i]
|
78
|
+
token_j = tokens[i - @ngram_size + j]
|
79
|
+
|
80
|
+
if token_j
|
81
|
+
graph.add(token_i, token_j, weight: 1.0 / (j - @ngram_size).abs)
|
82
|
+
end
|
83
|
+
end
|
84
|
+
|
76
85
|
end
|
77
86
|
end
|
78
87
|
end
|
@@ -13,9 +13,9 @@ module TextRank
|
|
13
13
|
# @return [KeywordExtractor]
|
14
14
|
def self.basic(**options)
|
15
15
|
new(**{
|
16
|
-
char_filters: [
|
17
|
-
tokenizers: [
|
18
|
-
token_filters: [
|
16
|
+
char_filters: %i[AsciiFolding Lowercase],
|
17
|
+
tokenizers: %i[Word],
|
18
|
+
token_filters: %i[Stopwords MinLength],
|
19
19
|
graph_strategy: :Coocurrence,
|
20
20
|
}.merge(options))
|
21
21
|
end
|
@@ -25,11 +25,11 @@ module TextRank
|
|
25
25
|
# @return [KeywordExtractor]
|
26
26
|
def self.advanced(**options)
|
27
27
|
new(**{
|
28
|
-
char_filters: [
|
29
|
-
tokenizers: [
|
30
|
-
token_filters: [
|
28
|
+
char_filters: %i[AsciiFolding Lowercase StripHtml StripEmail UndoContractions StripPossessive],
|
29
|
+
tokenizers: %i[Url Money Number Word Punctuation],
|
30
|
+
token_filters: %i[PartOfSpeech Stopwords MinLength],
|
31
31
|
graph_strategy: :Coocurrence,
|
32
|
-
rank_filters: [
|
32
|
+
rank_filters: %i[CollapseAdjacent NormalizeUnitVector SortByValue],
|
33
33
|
}.merge(options))
|
34
34
|
end
|
35
35
|
|
@@ -41,14 +41,14 @@ module TextRank
|
|
41
41
|
# @option options [Array<Class, Symbol, #filter!>] :rank_filters A list of filters to be applied to the keyword ranks after keyword extraction
|
42
42
|
def initialize(**options)
|
43
43
|
@page_rank_options = {
|
44
|
-
strategy:
|
45
|
-
damping:
|
44
|
+
strategy: options[:strategy] || :sparse,
|
45
|
+
damping: options[:damping],
|
46
46
|
tolerance: options[:tolerance],
|
47
47
|
}
|
48
|
-
@char_filters
|
49
|
-
@tokenizers
|
50
|
-
@token_filters
|
51
|
-
@rank_filters
|
48
|
+
@char_filters = options[:char_filters] || []
|
49
|
+
@tokenizers = options[:tokenizers] || [Tokenizer::Word]
|
50
|
+
@token_filters = options[:token_filters] || []
|
51
|
+
@rank_filters = options[:rank_filters] || []
|
52
52
|
@graph_strategy = options[:graph_strategy] || GraphStrategy::Coocurrence
|
53
53
|
end
|
54
54
|
|
@@ -73,9 +73,7 @@ module TextRank
|
|
73
73
|
# Sets the graph strategy for producing a graph from tokens
|
74
74
|
# @param strategy [Class, Symbol, #build_graph] Strategy for producing a graph from tokens
|
75
75
|
# @return [Class, Symbol, #build_graph]
|
76
|
-
|
77
|
-
@graph_strategy = strategy
|
78
|
-
end
|
76
|
+
attr_writer :graph_strategy
|
79
77
|
|
80
78
|
# Add a new TokenFilter for processing tokens after tokenization
|
81
79
|
# @param filter [Class, Symbol, #filter!] A filter to process tokens after tokenization
|
@@ -105,14 +103,23 @@ module TextRank
|
|
105
103
|
end
|
106
104
|
|
107
105
|
# Filter & tokenize text, and return PageRank
|
108
|
-
# @param text [String] unfiltered text to be processed
|
106
|
+
# @param text [String,Array<String>] unfiltered text to be processed
|
109
107
|
# @return [Hash<String, Float>] tokens and page ranks (in descending order)
|
110
108
|
def extract(text, **options)
|
111
|
-
|
109
|
+
text = Array(text)
|
110
|
+
tokens_per_text = text.map do |t|
|
111
|
+
tokenize(t)
|
112
|
+
end
|
112
113
|
graph = PageRank.new(**@page_rank_options)
|
113
|
-
classify(@graph_strategy, context: GraphStrategy)
|
114
|
+
strategy = classify(@graph_strategy, context: GraphStrategy)
|
115
|
+
tokens_per_text.each do |tokens|
|
116
|
+
strategy.build_graph(tokens, graph)
|
117
|
+
end
|
114
118
|
ranks = graph.calculate(**options)
|
115
|
-
|
119
|
+
tokens_per_text.each_with_index do |tokens, i|
|
120
|
+
ranks = apply_rank_filters(ranks, tokens: tokens, original_text: text[i])
|
121
|
+
end
|
122
|
+
ranks
|
116
123
|
end
|
117
124
|
|
118
125
|
private
|
@@ -153,14 +160,14 @@ module TextRank
|
|
153
160
|
array.insert(idx, value)
|
154
161
|
end
|
155
162
|
|
156
|
-
def classify(
|
157
|
-
case
|
163
|
+
def classify(clazz, context: self)
|
164
|
+
case clazz
|
158
165
|
when Class
|
159
|
-
|
166
|
+
clazz.new
|
160
167
|
when Symbol
|
161
|
-
context.const_get(
|
168
|
+
context.const_get(clazz).new
|
162
169
|
else
|
163
|
-
|
170
|
+
clazz
|
164
171
|
end
|
165
172
|
end
|
166
173
|
|