text_rank 1.2.3 → 1.2.4
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.rubocop.yml +7 -0
- data/bin/console +3 -3
- data/lib/page_rank.rb +2 -0
- data/lib/page_rank/base.rb +9 -8
- data/lib/page_rank/dense.rb +2 -1
- data/lib/page_rank/sparse.rb +6 -7
- data/lib/text_rank.rb +11 -8
- data/lib/text_rank/char_filter.rb +1 -1
- data/lib/text_rank/char_filter/ascii_folding.rb +5 -1
- data/lib/text_rank/char_filter/strip_possessive.rb +2 -2
- data/lib/text_rank/char_filter/undo_contractions.rb +1 -137
- data/lib/text_rank/char_filter/undo_contractions.yml +135 -0
- data/lib/text_rank/fingerprint.rb +10 -18
- data/lib/text_rank/fingerprint_overlap.rb +55 -0
- data/lib/text_rank/graph_strategy/coocurrence.rb +15 -6
- data/lib/text_rank/keyword_extractor.rb +19 -21
- data/lib/text_rank/rank_filter/collapse_adjacent.rb +48 -25
- data/lib/text_rank/rank_filter/normalize_probability.rb +2 -1
- data/lib/text_rank/rank_filter/normalize_unit_vector.rb +2 -1
- data/lib/text_rank/token_filter/part_of_speech.rb +0 -1
- data/lib/text_rank/token_filter/stopwords.rb +1 -321
- data/lib/text_rank/token_filter/stopwords.yml +317 -0
- data/lib/text_rank/tokenizer.rb +1 -1
- data/lib/text_rank/tokenizer/money.rb +11 -6
- data/lib/text_rank/tokenizer/number.rb +4 -3
- data/lib/text_rank/tokenizer/punctuation.rb +4 -1
- data/lib/text_rank/tokenizer/url.rb +3 -0
- data/lib/text_rank/tokenizer/whitespace.rb +4 -1
- data/lib/text_rank/tokenizer/word.rb +5 -2
- data/lib/text_rank/version.rb +3 -1
- metadata +4 -1
@@ -0,0 +1,317 @@
|
|
1
|
+
- a
|
2
|
+
- about
|
3
|
+
- above
|
4
|
+
- across
|
5
|
+
- after
|
6
|
+
- afterwards
|
7
|
+
- again
|
8
|
+
- against
|
9
|
+
- all
|
10
|
+
- almost
|
11
|
+
- alone
|
12
|
+
- along
|
13
|
+
- already
|
14
|
+
- also
|
15
|
+
- although
|
16
|
+
- always
|
17
|
+
- am
|
18
|
+
- among
|
19
|
+
- amongst
|
20
|
+
- amoungst
|
21
|
+
- amount
|
22
|
+
- an
|
23
|
+
- and
|
24
|
+
- another
|
25
|
+
- any
|
26
|
+
- anyhow
|
27
|
+
- anyone
|
28
|
+
- anything
|
29
|
+
- anyway
|
30
|
+
- anywhere
|
31
|
+
- are
|
32
|
+
- around
|
33
|
+
- as
|
34
|
+
- at
|
35
|
+
- back
|
36
|
+
- be
|
37
|
+
- became
|
38
|
+
- because
|
39
|
+
- become
|
40
|
+
- becomes
|
41
|
+
- becoming
|
42
|
+
- been
|
43
|
+
- before
|
44
|
+
- beforehand
|
45
|
+
- behind
|
46
|
+
- being
|
47
|
+
- below
|
48
|
+
- beside
|
49
|
+
- besides
|
50
|
+
- between
|
51
|
+
- beyond
|
52
|
+
- bill
|
53
|
+
- both
|
54
|
+
- bottom
|
55
|
+
- but
|
56
|
+
- by
|
57
|
+
- call
|
58
|
+
- can
|
59
|
+
- cannot
|
60
|
+
- cant
|
61
|
+
- co
|
62
|
+
- con
|
63
|
+
- could
|
64
|
+
- couldnt
|
65
|
+
- cry
|
66
|
+
- de
|
67
|
+
- describe
|
68
|
+
- detail
|
69
|
+
- do
|
70
|
+
- done
|
71
|
+
- down
|
72
|
+
- due
|
73
|
+
- during
|
74
|
+
- each
|
75
|
+
- eg
|
76
|
+
- eight
|
77
|
+
- either
|
78
|
+
- eleven
|
79
|
+
- else
|
80
|
+
- elsewhere
|
81
|
+
- empty
|
82
|
+
- enough
|
83
|
+
- etc
|
84
|
+
- even
|
85
|
+
- ever
|
86
|
+
- every
|
87
|
+
- everyone
|
88
|
+
- everything
|
89
|
+
- everywhere
|
90
|
+
- except
|
91
|
+
- few
|
92
|
+
- fifteen
|
93
|
+
- fify
|
94
|
+
- fill
|
95
|
+
- find
|
96
|
+
- fire
|
97
|
+
- first
|
98
|
+
- five
|
99
|
+
- for
|
100
|
+
- former
|
101
|
+
- formerly
|
102
|
+
- forty
|
103
|
+
- found
|
104
|
+
- four
|
105
|
+
- from
|
106
|
+
- front
|
107
|
+
- full
|
108
|
+
- further
|
109
|
+
- get
|
110
|
+
- give
|
111
|
+
- go
|
112
|
+
- had
|
113
|
+
- has
|
114
|
+
- hasnt
|
115
|
+
- have
|
116
|
+
- he
|
117
|
+
- hence
|
118
|
+
- her
|
119
|
+
- here
|
120
|
+
- hereafter
|
121
|
+
- hereby
|
122
|
+
- herein
|
123
|
+
- hereupon
|
124
|
+
- hers
|
125
|
+
- herself
|
126
|
+
- him
|
127
|
+
- himself
|
128
|
+
- his
|
129
|
+
- how
|
130
|
+
- however
|
131
|
+
- hundred
|
132
|
+
- ie
|
133
|
+
- if
|
134
|
+
- in
|
135
|
+
- inc
|
136
|
+
- indeed
|
137
|
+
- interest
|
138
|
+
- into
|
139
|
+
- is
|
140
|
+
- it
|
141
|
+
- its
|
142
|
+
- itself
|
143
|
+
- keep
|
144
|
+
- last
|
145
|
+
- latter
|
146
|
+
- latterly
|
147
|
+
- least
|
148
|
+
- less
|
149
|
+
- ltd
|
150
|
+
- made
|
151
|
+
- many
|
152
|
+
- may
|
153
|
+
- me
|
154
|
+
- meanwhile
|
155
|
+
- might
|
156
|
+
- mill
|
157
|
+
- mine
|
158
|
+
- more
|
159
|
+
- moreover
|
160
|
+
- most
|
161
|
+
- mostly
|
162
|
+
- move
|
163
|
+
- much
|
164
|
+
- must
|
165
|
+
- my
|
166
|
+
- myself
|
167
|
+
- name
|
168
|
+
- namely
|
169
|
+
- neither
|
170
|
+
- never
|
171
|
+
- nevertheless
|
172
|
+
- next
|
173
|
+
- nine
|
174
|
+
- no
|
175
|
+
- nobody
|
176
|
+
- none
|
177
|
+
- noone
|
178
|
+
- nor
|
179
|
+
- not
|
180
|
+
- nothing
|
181
|
+
- now
|
182
|
+
- nowhere
|
183
|
+
- of
|
184
|
+
- off
|
185
|
+
- often
|
186
|
+
- on
|
187
|
+
- once
|
188
|
+
- one
|
189
|
+
- only
|
190
|
+
- onto
|
191
|
+
- or
|
192
|
+
- other
|
193
|
+
- others
|
194
|
+
- otherwise
|
195
|
+
- our
|
196
|
+
- ours
|
197
|
+
- ourselves
|
198
|
+
- out
|
199
|
+
- over
|
200
|
+
- own
|
201
|
+
- part
|
202
|
+
- per
|
203
|
+
- perhaps
|
204
|
+
- please
|
205
|
+
- put
|
206
|
+
- rather
|
207
|
+
- re
|
208
|
+
- same
|
209
|
+
- see
|
210
|
+
- seem
|
211
|
+
- seemed
|
212
|
+
- seeming
|
213
|
+
- seems
|
214
|
+
- serious
|
215
|
+
- several
|
216
|
+
- she
|
217
|
+
- should
|
218
|
+
- show
|
219
|
+
- side
|
220
|
+
- since
|
221
|
+
- sincere
|
222
|
+
- six
|
223
|
+
- sixty
|
224
|
+
- so
|
225
|
+
- some
|
226
|
+
- somehow
|
227
|
+
- someone
|
228
|
+
- something
|
229
|
+
- sometime
|
230
|
+
- sometimes
|
231
|
+
- somewhere
|
232
|
+
- still
|
233
|
+
- such
|
234
|
+
- system
|
235
|
+
- take
|
236
|
+
- ten
|
237
|
+
- than
|
238
|
+
- that
|
239
|
+
- the
|
240
|
+
- their
|
241
|
+
- them
|
242
|
+
- themselves
|
243
|
+
- then
|
244
|
+
- thence
|
245
|
+
- there
|
246
|
+
- thereafter
|
247
|
+
- thereby
|
248
|
+
- therefore
|
249
|
+
- therein
|
250
|
+
- thereupon
|
251
|
+
- these
|
252
|
+
- they
|
253
|
+
- thickv
|
254
|
+
- thin
|
255
|
+
- third
|
256
|
+
- this
|
257
|
+
- those
|
258
|
+
- though
|
259
|
+
- three
|
260
|
+
- through
|
261
|
+
- throughout
|
262
|
+
- thru
|
263
|
+
- thus
|
264
|
+
- to
|
265
|
+
- together
|
266
|
+
- too
|
267
|
+
- top
|
268
|
+
- toward
|
269
|
+
- towards
|
270
|
+
- twelve
|
271
|
+
- twenty
|
272
|
+
- two
|
273
|
+
- un
|
274
|
+
- under
|
275
|
+
- until
|
276
|
+
- up
|
277
|
+
- upon
|
278
|
+
- us
|
279
|
+
- very
|
280
|
+
- via
|
281
|
+
- was
|
282
|
+
- we
|
283
|
+
- well
|
284
|
+
- were
|
285
|
+
- what
|
286
|
+
- whatever
|
287
|
+
- when
|
288
|
+
- whence
|
289
|
+
- whenever
|
290
|
+
- where
|
291
|
+
- whereafter
|
292
|
+
- whereas
|
293
|
+
- whereby
|
294
|
+
- wherein
|
295
|
+
- whereupon
|
296
|
+
- wherever
|
297
|
+
- whether
|
298
|
+
- which
|
299
|
+
- while
|
300
|
+
- whither
|
301
|
+
- who
|
302
|
+
- whoever
|
303
|
+
- whole
|
304
|
+
- whom
|
305
|
+
- whose
|
306
|
+
- why
|
307
|
+
- will
|
308
|
+
- with
|
309
|
+
- within
|
310
|
+
- without
|
311
|
+
- would
|
312
|
+
- yet
|
313
|
+
- you
|
314
|
+
- your
|
315
|
+
- yours
|
316
|
+
- yourself
|
317
|
+
- yourselves
|
data/lib/text_rank/tokenizer.rb
CHANGED
@@ -1,4 +1,3 @@
|
|
1
|
-
#encoding: UTF-8
|
2
1
|
module TextRank
|
3
2
|
module Tokenizer
|
4
3
|
|
@@ -12,7 +11,7 @@ module TextRank
|
|
12
11
|
"\u20a4", # Lira Symbol
|
13
12
|
"\u20a7", # Peseta Sign
|
14
13
|
"\u20ac", # Euro Symbol
|
15
|
-
"\u20B9", # Rupee
|
14
|
+
"\u20B9", # Rupee
|
16
15
|
"\u20a9", # Won Sign
|
17
16
|
"\u20b4", # Hryvnia Sign
|
18
17
|
"\u20af", # Drachma Sign
|
@@ -34,6 +33,8 @@ module TextRank
|
|
34
33
|
# A tokenizer regex that preserves money or formatted numbers as a single token. This
|
35
34
|
# currently supports 24 different currency symbols:
|
36
35
|
#
|
36
|
+
# rubocop:disable Style/AsciiComments
|
37
|
+
#
|
37
38
|
# * ¤
|
38
39
|
# * $
|
39
40
|
# * ¢
|
@@ -58,19 +59,23 @@ module TextRank
|
|
58
59
|
# * ₫
|
59
60
|
# * %
|
60
61
|
# * ‰
|
62
|
+
|
63
|
+
# rubocop:enable Style/AsciiComments
|
61
64
|
#
|
62
65
|
# It also supports two alternative formats for negatives as well as optional three-digit comma
|
63
66
|
# separation and optional decimals.
|
64
67
|
##
|
65
|
-
|
68
|
+
# rubocop:disable Naming/ConstantName
|
69
|
+
Money = /
|
66
70
|
(
|
67
|
-
#{CURRENCY_SYMBOLS}
|
71
|
+
#{CURRENCY_SYMBOLS} -? #{Number} # $-45,231.21
|
68
72
|
|
|
69
|
-
|
73
|
+
-? #{CURRENCY_SYMBOLS} #{Number} # -$45,231.21
|
70
74
|
|
|
71
75
|
\( #{CURRENCY_SYMBOLS} #{Number} \) # ($45,231.21)
|
72
76
|
)
|
73
|
-
|
77
|
+
/x
|
78
|
+
# rubocop:enable Naming/ConstantName
|
74
79
|
|
75
80
|
end
|
76
81
|
end
|
@@ -1,11 +1,11 @@
|
|
1
|
-
#encoding: UTF-8
|
2
1
|
module TextRank
|
3
2
|
module Tokenizer
|
4
3
|
|
5
4
|
##
|
6
5
|
# A tokenizer regex that preserves (optionally formatted) numbers as a single token.
|
7
6
|
##
|
8
|
-
|
7
|
+
# rubocop:disable Naming/ConstantName
|
8
|
+
Number = /
|
9
9
|
(
|
10
10
|
[1-9]\d{3,} # 453231162
|
11
11
|
(?:\.\d+)? # 453231162.17
|
@@ -25,7 +25,8 @@ module TextRank
|
|
25
25
|
|
26
26
|
(?:\.\d+) # .17
|
27
27
|
)
|
28
|
-
|
28
|
+
/x
|
29
|
+
# rubocop:enable Naming/ConstantName
|
29
30
|
|
30
31
|
end
|
31
32
|
end
|
@@ -1,11 +1,14 @@
|
|
1
1
|
module TextRank
|
2
2
|
module Tokenizer
|
3
|
+
|
3
4
|
##
|
4
5
|
# A tokenizer regex that preserves single punctuation symbols as a token. Use
|
5
6
|
# this if one or more of your TokenFilter classes need punctuation in order to
|
6
7
|
# make decisions.
|
7
8
|
##
|
8
|
-
|
9
|
+
# rubocop:disable Naming/ConstantName
|
10
|
+
Punctuation = /(\p{Punct})/
|
11
|
+
# rubocop:enable Naming/ConstantName
|
9
12
|
|
10
13
|
end
|
11
14
|
end
|
@@ -1,8 +1,10 @@
|
|
1
1
|
module TextRank
|
2
2
|
module Tokenizer
|
3
|
+
|
3
4
|
##
|
4
5
|
# A tokenizer regex that preserves entire URLs as a single token (rather than splitting them up)
|
5
6
|
##
|
7
|
+
# rubocop:disable Naming/ConstantName
|
6
8
|
Url = %r{
|
7
9
|
(
|
8
10
|
(?:[\w-]+://?|www[.])
|
@@ -16,6 +18,7 @@ module TextRank
|
|
16
18
|
)
|
17
19
|
)
|
18
20
|
}xi
|
21
|
+
# rubocop:enable Naming/ConstantName
|
19
22
|
|
20
23
|
end
|
21
24
|
end
|
@@ -1,11 +1,14 @@
|
|
1
1
|
module TextRank
|
2
2
|
module Tokenizer
|
3
|
+
|
3
4
|
##
|
4
5
|
# A tokenizer regex that preserves single whitespace characters as a token. Use
|
5
6
|
# this if one or more of your TokenFilter classes need whitespace in order to
|
6
7
|
# make decisions.
|
7
8
|
##
|
8
|
-
|
9
|
+
# rubocop:disable Naming/ConstantName
|
10
|
+
Whitespace = /\s/
|
11
|
+
# rubocop:enable Naming/ConstantName
|
9
12
|
|
10
13
|
end
|
11
14
|
end
|