text_rank 1.2.0 → 1.2.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +5 -5
- data/.codeclimate.yml +1 -6
- data/.gitignore +4 -0
- data/.rubocop.yml +60 -1075
- data/.ruby-version +1 -1
- data/.travis.yml +14 -5
- data/{LICENSE.txt → LICENSE} +0 -0
- data/README.md +2 -1
- data/Rakefile +5 -0
- data/bin/console +3 -3
- data/ext/text_rank/extconf.rb +3 -0
- data/ext/text_rank/page_rank_sparse_native.c +296 -0
- data/ext/text_rank/page_rank_sparse_native.h +93 -0
- data/ext/text_rank/text_rank.c +5 -0
- data/lib/page_rank.rb +7 -4
- data/lib/page_rank/base.rb +12 -9
- data/lib/page_rank/dense.rb +3 -2
- data/lib/page_rank/sparse.rb +6 -7
- data/lib/page_rank/sparse_native.rb +21 -0
- data/lib/text_rank.rb +14 -9
- data/lib/text_rank/char_filter.rb +1 -1
- data/lib/text_rank/char_filter/ascii_folding.rb +5 -1
- data/lib/text_rank/char_filter/strip_possessive.rb +2 -2
- data/lib/text_rank/char_filter/undo_contractions.rb +1 -137
- data/lib/text_rank/char_filter/undo_contractions.yml +135 -0
- data/lib/text_rank/fingerprint.rb +10 -18
- data/lib/text_rank/fingerprint_overlap.rb +55 -0
- data/lib/text_rank/graph_strategy/coocurrence.rb +15 -6
- data/lib/text_rank/keyword_extractor.rb +32 -25
- data/lib/text_rank/rank_filter/collapse_adjacent.rb +53 -26
- data/lib/text_rank/rank_filter/normalize_probability.rb +2 -1
- data/lib/text_rank/rank_filter/normalize_unit_vector.rb +2 -1
- data/lib/text_rank/token_filter/part_of_speech.rb +0 -1
- data/lib/text_rank/token_filter/stopwords.rb +1 -321
- data/lib/text_rank/token_filter/stopwords.yml +317 -0
- data/lib/text_rank/tokenizer.rb +1 -1
- data/lib/text_rank/tokenizer/money.rb +11 -6
- data/lib/text_rank/tokenizer/number.rb +4 -3
- data/lib/text_rank/tokenizer/punctuation.rb +4 -1
- data/lib/text_rank/tokenizer/url.rb +3 -0
- data/lib/text_rank/tokenizer/whitespace.rb +4 -1
- data/lib/text_rank/tokenizer/word.rb +5 -2
- data/lib/text_rank/version.rb +3 -1
- data/text_rank.gemspec +12 -10
- metadata +69 -33
@@ -0,0 +1,317 @@
|
|
1
|
+
- a
|
2
|
+
- about
|
3
|
+
- above
|
4
|
+
- across
|
5
|
+
- after
|
6
|
+
- afterwards
|
7
|
+
- again
|
8
|
+
- against
|
9
|
+
- all
|
10
|
+
- almost
|
11
|
+
- alone
|
12
|
+
- along
|
13
|
+
- already
|
14
|
+
- also
|
15
|
+
- although
|
16
|
+
- always
|
17
|
+
- am
|
18
|
+
- among
|
19
|
+
- amongst
|
20
|
+
- amoungst
|
21
|
+
- amount
|
22
|
+
- an
|
23
|
+
- and
|
24
|
+
- another
|
25
|
+
- any
|
26
|
+
- anyhow
|
27
|
+
- anyone
|
28
|
+
- anything
|
29
|
+
- anyway
|
30
|
+
- anywhere
|
31
|
+
- are
|
32
|
+
- around
|
33
|
+
- as
|
34
|
+
- at
|
35
|
+
- back
|
36
|
+
- be
|
37
|
+
- became
|
38
|
+
- because
|
39
|
+
- become
|
40
|
+
- becomes
|
41
|
+
- becoming
|
42
|
+
- been
|
43
|
+
- before
|
44
|
+
- beforehand
|
45
|
+
- behind
|
46
|
+
- being
|
47
|
+
- below
|
48
|
+
- beside
|
49
|
+
- besides
|
50
|
+
- between
|
51
|
+
- beyond
|
52
|
+
- bill
|
53
|
+
- both
|
54
|
+
- bottom
|
55
|
+
- but
|
56
|
+
- by
|
57
|
+
- call
|
58
|
+
- can
|
59
|
+
- cannot
|
60
|
+
- cant
|
61
|
+
- co
|
62
|
+
- con
|
63
|
+
- could
|
64
|
+
- couldnt
|
65
|
+
- cry
|
66
|
+
- de
|
67
|
+
- describe
|
68
|
+
- detail
|
69
|
+
- do
|
70
|
+
- done
|
71
|
+
- down
|
72
|
+
- due
|
73
|
+
- during
|
74
|
+
- each
|
75
|
+
- eg
|
76
|
+
- eight
|
77
|
+
- either
|
78
|
+
- eleven
|
79
|
+
- else
|
80
|
+
- elsewhere
|
81
|
+
- empty
|
82
|
+
- enough
|
83
|
+
- etc
|
84
|
+
- even
|
85
|
+
- ever
|
86
|
+
- every
|
87
|
+
- everyone
|
88
|
+
- everything
|
89
|
+
- everywhere
|
90
|
+
- except
|
91
|
+
- few
|
92
|
+
- fifteen
|
93
|
+
- fify
|
94
|
+
- fill
|
95
|
+
- find
|
96
|
+
- fire
|
97
|
+
- first
|
98
|
+
- five
|
99
|
+
- for
|
100
|
+
- former
|
101
|
+
- formerly
|
102
|
+
- forty
|
103
|
+
- found
|
104
|
+
- four
|
105
|
+
- from
|
106
|
+
- front
|
107
|
+
- full
|
108
|
+
- further
|
109
|
+
- get
|
110
|
+
- give
|
111
|
+
- go
|
112
|
+
- had
|
113
|
+
- has
|
114
|
+
- hasnt
|
115
|
+
- have
|
116
|
+
- he
|
117
|
+
- hence
|
118
|
+
- her
|
119
|
+
- here
|
120
|
+
- hereafter
|
121
|
+
- hereby
|
122
|
+
- herein
|
123
|
+
- hereupon
|
124
|
+
- hers
|
125
|
+
- herself
|
126
|
+
- him
|
127
|
+
- himself
|
128
|
+
- his
|
129
|
+
- how
|
130
|
+
- however
|
131
|
+
- hundred
|
132
|
+
- ie
|
133
|
+
- if
|
134
|
+
- in
|
135
|
+
- inc
|
136
|
+
- indeed
|
137
|
+
- interest
|
138
|
+
- into
|
139
|
+
- is
|
140
|
+
- it
|
141
|
+
- its
|
142
|
+
- itself
|
143
|
+
- keep
|
144
|
+
- last
|
145
|
+
- latter
|
146
|
+
- latterly
|
147
|
+
- least
|
148
|
+
- less
|
149
|
+
- ltd
|
150
|
+
- made
|
151
|
+
- many
|
152
|
+
- may
|
153
|
+
- me
|
154
|
+
- meanwhile
|
155
|
+
- might
|
156
|
+
- mill
|
157
|
+
- mine
|
158
|
+
- more
|
159
|
+
- moreover
|
160
|
+
- most
|
161
|
+
- mostly
|
162
|
+
- move
|
163
|
+
- much
|
164
|
+
- must
|
165
|
+
- my
|
166
|
+
- myself
|
167
|
+
- name
|
168
|
+
- namely
|
169
|
+
- neither
|
170
|
+
- never
|
171
|
+
- nevertheless
|
172
|
+
- next
|
173
|
+
- nine
|
174
|
+
- no
|
175
|
+
- nobody
|
176
|
+
- none
|
177
|
+
- noone
|
178
|
+
- nor
|
179
|
+
- not
|
180
|
+
- nothing
|
181
|
+
- now
|
182
|
+
- nowhere
|
183
|
+
- of
|
184
|
+
- off
|
185
|
+
- often
|
186
|
+
- on
|
187
|
+
- once
|
188
|
+
- one
|
189
|
+
- only
|
190
|
+
- onto
|
191
|
+
- or
|
192
|
+
- other
|
193
|
+
- others
|
194
|
+
- otherwise
|
195
|
+
- our
|
196
|
+
- ours
|
197
|
+
- ourselves
|
198
|
+
- out
|
199
|
+
- over
|
200
|
+
- own
|
201
|
+
- part
|
202
|
+
- per
|
203
|
+
- perhaps
|
204
|
+
- please
|
205
|
+
- put
|
206
|
+
- rather
|
207
|
+
- re
|
208
|
+
- same
|
209
|
+
- see
|
210
|
+
- seem
|
211
|
+
- seemed
|
212
|
+
- seeming
|
213
|
+
- seems
|
214
|
+
- serious
|
215
|
+
- several
|
216
|
+
- she
|
217
|
+
- should
|
218
|
+
- show
|
219
|
+
- side
|
220
|
+
- since
|
221
|
+
- sincere
|
222
|
+
- six
|
223
|
+
- sixty
|
224
|
+
- so
|
225
|
+
- some
|
226
|
+
- somehow
|
227
|
+
- someone
|
228
|
+
- something
|
229
|
+
- sometime
|
230
|
+
- sometimes
|
231
|
+
- somewhere
|
232
|
+
- still
|
233
|
+
- such
|
234
|
+
- system
|
235
|
+
- take
|
236
|
+
- ten
|
237
|
+
- than
|
238
|
+
- that
|
239
|
+
- the
|
240
|
+
- their
|
241
|
+
- them
|
242
|
+
- themselves
|
243
|
+
- then
|
244
|
+
- thence
|
245
|
+
- there
|
246
|
+
- thereafter
|
247
|
+
- thereby
|
248
|
+
- therefore
|
249
|
+
- therein
|
250
|
+
- thereupon
|
251
|
+
- these
|
252
|
+
- they
|
253
|
+
- thickv
|
254
|
+
- thin
|
255
|
+
- third
|
256
|
+
- this
|
257
|
+
- those
|
258
|
+
- though
|
259
|
+
- three
|
260
|
+
- through
|
261
|
+
- throughout
|
262
|
+
- thru
|
263
|
+
- thus
|
264
|
+
- to
|
265
|
+
- together
|
266
|
+
- too
|
267
|
+
- top
|
268
|
+
- toward
|
269
|
+
- towards
|
270
|
+
- twelve
|
271
|
+
- twenty
|
272
|
+
- two
|
273
|
+
- un
|
274
|
+
- under
|
275
|
+
- until
|
276
|
+
- up
|
277
|
+
- upon
|
278
|
+
- us
|
279
|
+
- very
|
280
|
+
- via
|
281
|
+
- was
|
282
|
+
- we
|
283
|
+
- well
|
284
|
+
- were
|
285
|
+
- what
|
286
|
+
- whatever
|
287
|
+
- when
|
288
|
+
- whence
|
289
|
+
- whenever
|
290
|
+
- where
|
291
|
+
- whereafter
|
292
|
+
- whereas
|
293
|
+
- whereby
|
294
|
+
- wherein
|
295
|
+
- whereupon
|
296
|
+
- wherever
|
297
|
+
- whether
|
298
|
+
- which
|
299
|
+
- while
|
300
|
+
- whither
|
301
|
+
- who
|
302
|
+
- whoever
|
303
|
+
- whole
|
304
|
+
- whom
|
305
|
+
- whose
|
306
|
+
- why
|
307
|
+
- will
|
308
|
+
- with
|
309
|
+
- within
|
310
|
+
- without
|
311
|
+
- would
|
312
|
+
- yet
|
313
|
+
- you
|
314
|
+
- your
|
315
|
+
- yours
|
316
|
+
- yourself
|
317
|
+
- yourselves
|
data/lib/text_rank/tokenizer.rb
CHANGED
@@ -1,4 +1,3 @@
|
|
1
|
-
#encoding: UTF-8
|
2
1
|
module TextRank
|
3
2
|
module Tokenizer
|
4
3
|
|
@@ -12,7 +11,7 @@ module TextRank
|
|
12
11
|
"\u20a4", # Lira Symbol
|
13
12
|
"\u20a7", # Peseta Sign
|
14
13
|
"\u20ac", # Euro Symbol
|
15
|
-
"\u20B9", # Rupee
|
14
|
+
"\u20B9", # Rupee
|
16
15
|
"\u20a9", # Won Sign
|
17
16
|
"\u20b4", # Hryvnia Sign
|
18
17
|
"\u20af", # Drachma Sign
|
@@ -34,6 +33,8 @@ module TextRank
|
|
34
33
|
# A tokenizer regex that preserves money or formatted numbers as a single token. This
|
35
34
|
# currently supports 24 different currency symbols:
|
36
35
|
#
|
36
|
+
# rubocop:disable Style/AsciiComments
|
37
|
+
#
|
37
38
|
# * ¤
|
38
39
|
# * $
|
39
40
|
# * ¢
|
@@ -58,19 +59,23 @@ module TextRank
|
|
58
59
|
# * ₫
|
59
60
|
# * %
|
60
61
|
# * ‰
|
62
|
+
|
63
|
+
# rubocop:enable Style/AsciiComments
|
61
64
|
#
|
62
65
|
# It also supports two alternative formats for negatives as well as optional three digit comma
|
63
66
|
# separation and optional decimals.
|
64
67
|
##
|
65
|
-
|
68
|
+
# rubocop:disable Naming/ConstantName
|
69
|
+
Money = /
|
66
70
|
(
|
67
|
-
#{CURRENCY_SYMBOLS}
|
71
|
+
#{CURRENCY_SYMBOLS} -? #{Number} # $-45,231.21
|
68
72
|
|
|
69
|
-
|
73
|
+
-? #{CURRENCY_SYMBOLS} #{Number} # -$45,231.21
|
70
74
|
|
|
71
75
|
\( #{CURRENCY_SYMBOLS} #{Number} \) # ($45,231.21)
|
72
76
|
)
|
73
|
-
|
77
|
+
/x
|
78
|
+
# rubocop:enable Naming/ConstantName
|
74
79
|
|
75
80
|
end
|
76
81
|
end
|
@@ -1,11 +1,11 @@
|
|
1
|
-
#encoding: UTF-8
|
2
1
|
module TextRank
|
3
2
|
module Tokenizer
|
4
3
|
|
5
4
|
##
|
6
5
|
# A tokenizer regex that preserves (optionally formatted) numbers as a single token.
|
7
6
|
##
|
8
|
-
|
7
|
+
# rubocop:disable Naming/ConstantName
|
8
|
+
Number = /
|
9
9
|
(
|
10
10
|
[1-9]\d{3,} # 453231162
|
11
11
|
(?:\.\d+)? # 453231162.17
|
@@ -25,7 +25,8 @@ module TextRank
|
|
25
25
|
|
26
26
|
(?:\.\d+) # .17
|
27
27
|
)
|
28
|
-
|
28
|
+
/x
|
29
|
+
# rubocop:enable Naming/ConstantName
|
29
30
|
|
30
31
|
end
|
31
32
|
end
|
@@ -1,11 +1,14 @@
|
|
1
1
|
module TextRank
|
2
2
|
module Tokenizer
|
3
|
+
|
3
4
|
##
|
4
5
|
# A tokenizer regex that preserves single punctuation symbols as a token. Use
|
5
6
|
# this if one or more of your TokenFilter classes need punctuation in order to
|
6
7
|
# make decisions.
|
7
8
|
##
|
8
|
-
|
9
|
+
# rubocop:disable Naming/ConstantName
|
10
|
+
Punctuation = /(\p{Punct})/
|
11
|
+
# rubocop:enable Naming/ConstantName
|
9
12
|
|
10
13
|
end
|
11
14
|
end
|
@@ -1,8 +1,10 @@
|
|
1
1
|
module TextRank
|
2
2
|
module Tokenizer
|
3
|
+
|
3
4
|
##
|
4
5
|
# A tokenizer regex that preserves entire URL's as a token (rather than split them up)
|
5
6
|
##
|
7
|
+
# rubocop:disable Naming/ConstantName
|
6
8
|
Url = %r{
|
7
9
|
(
|
8
10
|
(?:[\w-]+://?|www[.])
|
@@ -16,6 +18,7 @@ module TextRank
|
|
16
18
|
)
|
17
19
|
)
|
18
20
|
}xi
|
21
|
+
# rubocop:enable Naming/ConstantName
|
19
22
|
|
20
23
|
end
|
21
24
|
end
|
@@ -1,11 +1,14 @@
|
|
1
1
|
module TextRank
|
2
2
|
module Tokenizer
|
3
|
+
|
3
4
|
##
|
4
5
|
# A tokenizer regex that preserves single whitespace characters as a token. Use
|
5
6
|
# this if one or more of your TokenFilter classes need whitespace in order to
|
6
7
|
# make decisions.
|
7
8
|
##
|
8
|
-
|
9
|
+
# rubocop:disable Naming/ConstantName
|
10
|
+
Whitespace = /\s/
|
11
|
+
# rubocop:enable Naming/ConstantName
|
9
12
|
|
10
13
|
end
|
11
14
|
end
|