text_rank 1.2.0 → 1.2.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45)
  1. checksums.yaml +5 -5
  2. data/.codeclimate.yml +1 -6
  3. data/.gitignore +4 -0
  4. data/.rubocop.yml +60 -1075
  5. data/.ruby-version +1 -1
  6. data/.travis.yml +14 -5
  7. data/{LICENSE.txt → LICENSE} +0 -0
  8. data/README.md +2 -1
  9. data/Rakefile +5 -0
  10. data/bin/console +3 -3
  11. data/ext/text_rank/extconf.rb +3 -0
  12. data/ext/text_rank/page_rank_sparse_native.c +296 -0
  13. data/ext/text_rank/page_rank_sparse_native.h +93 -0
  14. data/ext/text_rank/text_rank.c +5 -0
  15. data/lib/page_rank.rb +7 -4
  16. data/lib/page_rank/base.rb +12 -9
  17. data/lib/page_rank/dense.rb +3 -2
  18. data/lib/page_rank/sparse.rb +6 -7
  19. data/lib/page_rank/sparse_native.rb +21 -0
  20. data/lib/text_rank.rb +14 -9
  21. data/lib/text_rank/char_filter.rb +1 -1
  22. data/lib/text_rank/char_filter/ascii_folding.rb +5 -1
  23. data/lib/text_rank/char_filter/strip_possessive.rb +2 -2
  24. data/lib/text_rank/char_filter/undo_contractions.rb +1 -137
  25. data/lib/text_rank/char_filter/undo_contractions.yml +135 -0
  26. data/lib/text_rank/fingerprint.rb +10 -18
  27. data/lib/text_rank/fingerprint_overlap.rb +55 -0
  28. data/lib/text_rank/graph_strategy/coocurrence.rb +15 -6
  29. data/lib/text_rank/keyword_extractor.rb +32 -25
  30. data/lib/text_rank/rank_filter/collapse_adjacent.rb +53 -26
  31. data/lib/text_rank/rank_filter/normalize_probability.rb +2 -1
  32. data/lib/text_rank/rank_filter/normalize_unit_vector.rb +2 -1
  33. data/lib/text_rank/token_filter/part_of_speech.rb +0 -1
  34. data/lib/text_rank/token_filter/stopwords.rb +1 -321
  35. data/lib/text_rank/token_filter/stopwords.yml +317 -0
  36. data/lib/text_rank/tokenizer.rb +1 -1
  37. data/lib/text_rank/tokenizer/money.rb +11 -6
  38. data/lib/text_rank/tokenizer/number.rb +4 -3
  39. data/lib/text_rank/tokenizer/punctuation.rb +4 -1
  40. data/lib/text_rank/tokenizer/url.rb +3 -0
  41. data/lib/text_rank/tokenizer/whitespace.rb +4 -1
  42. data/lib/text_rank/tokenizer/word.rb +5 -2
  43. data/lib/text_rank/version.rb +3 -1
  44. data/text_rank.gemspec +12 -10
  45. metadata +69 -33
@@ -0,0 +1,317 @@
1
+ - a
2
+ - about
3
+ - above
4
+ - across
5
+ - after
6
+ - afterwards
7
+ - again
8
+ - against
9
+ - all
10
+ - almost
11
+ - alone
12
+ - along
13
+ - already
14
+ - also
15
+ - although
16
+ - always
17
+ - am
18
+ - among
19
+ - amongst
20
+ - amoungst
21
+ - amount
22
+ - an
23
+ - and
24
+ - another
25
+ - any
26
+ - anyhow
27
+ - anyone
28
+ - anything
29
+ - anyway
30
+ - anywhere
31
+ - are
32
+ - around
33
+ - as
34
+ - at
35
+ - back
36
+ - be
37
+ - became
38
+ - because
39
+ - become
40
+ - becomes
41
+ - becoming
42
+ - been
43
+ - before
44
+ - beforehand
45
+ - behind
46
+ - being
47
+ - below
48
+ - beside
49
+ - besides
50
+ - between
51
+ - beyond
52
+ - bill
53
+ - both
54
+ - bottom
55
+ - but
56
+ - by
57
+ - call
58
+ - can
59
+ - cannot
60
+ - cant
61
+ - co
62
+ - con
63
+ - could
64
+ - couldnt
65
+ - cry
66
+ - de
67
+ - describe
68
+ - detail
69
+ - do
70
+ - done
71
+ - down
72
+ - due
73
+ - during
74
+ - each
75
+ - eg
76
+ - eight
77
+ - either
78
+ - eleven
79
+ - else
80
+ - elsewhere
81
+ - empty
82
+ - enough
83
+ - etc
84
+ - even
85
+ - ever
86
+ - every
87
+ - everyone
88
+ - everything
89
+ - everywhere
90
+ - except
91
+ - few
92
+ - fifteen
93
+ - fify
94
+ - fill
95
+ - find
96
+ - fire
97
+ - first
98
+ - five
99
+ - for
100
+ - former
101
+ - formerly
102
+ - forty
103
+ - found
104
+ - four
105
+ - from
106
+ - front
107
+ - full
108
+ - further
109
+ - get
110
+ - give
111
+ - go
112
+ - had
113
+ - has
114
+ - hasnt
115
+ - have
116
+ - he
117
+ - hence
118
+ - her
119
+ - here
120
+ - hereafter
121
+ - hereby
122
+ - herein
123
+ - hereupon
124
+ - hers
125
+ - herself
126
+ - him
127
+ - himself
128
+ - his
129
+ - how
130
+ - however
131
+ - hundred
132
+ - ie
133
+ - if
134
+ - in
135
+ - inc
136
+ - indeed
137
+ - interest
138
+ - into
139
+ - is
140
+ - it
141
+ - its
142
+ - itself
143
+ - keep
144
+ - last
145
+ - latter
146
+ - latterly
147
+ - least
148
+ - less
149
+ - ltd
150
+ - made
151
+ - many
152
+ - may
153
+ - me
154
+ - meanwhile
155
+ - might
156
+ - mill
157
+ - mine
158
+ - more
159
+ - moreover
160
+ - most
161
+ - mostly
162
+ - move
163
+ - much
164
+ - must
165
+ - my
166
+ - myself
167
+ - name
168
+ - namely
169
+ - neither
170
+ - never
171
+ - nevertheless
172
+ - next
173
+ - nine
174
+ - no
175
+ - nobody
176
+ - none
177
+ - noone
178
+ - nor
179
+ - not
180
+ - nothing
181
+ - now
182
+ - nowhere
183
+ - of
184
+ - off
185
+ - often
186
+ - on
187
+ - once
188
+ - one
189
+ - only
190
+ - onto
191
+ - or
192
+ - other
193
+ - others
194
+ - otherwise
195
+ - our
196
+ - ours
197
+ - ourselves
198
+ - out
199
+ - over
200
+ - own
201
+ - part
202
+ - per
203
+ - perhaps
204
+ - please
205
+ - put
206
+ - rather
207
+ - re
208
+ - same
209
+ - see
210
+ - seem
211
+ - seemed
212
+ - seeming
213
+ - seems
214
+ - serious
215
+ - several
216
+ - she
217
+ - should
218
+ - show
219
+ - side
220
+ - since
221
+ - sincere
222
+ - six
223
+ - sixty
224
+ - so
225
+ - some
226
+ - somehow
227
+ - someone
228
+ - something
229
+ - sometime
230
+ - sometimes
231
+ - somewhere
232
+ - still
233
+ - such
234
+ - system
235
+ - take
236
+ - ten
237
+ - than
238
+ - that
239
+ - the
240
+ - their
241
+ - them
242
+ - themselves
243
+ - then
244
+ - thence
245
+ - there
246
+ - thereafter
247
+ - thereby
248
+ - therefore
249
+ - therein
250
+ - thereupon
251
+ - these
252
+ - they
253
+ - thickv
254
+ - thin
255
+ - third
256
+ - this
257
+ - those
258
+ - though
259
+ - three
260
+ - through
261
+ - throughout
262
+ - thru
263
+ - thus
264
+ - to
265
+ - together
266
+ - too
267
+ - top
268
+ - toward
269
+ - towards
270
+ - twelve
271
+ - twenty
272
+ - two
273
+ - un
274
+ - under
275
+ - until
276
+ - up
277
+ - upon
278
+ - us
279
+ - very
280
+ - via
281
+ - was
282
+ - we
283
+ - well
284
+ - were
285
+ - what
286
+ - whatever
287
+ - when
288
+ - whence
289
+ - whenever
290
+ - where
291
+ - whereafter
292
+ - whereas
293
+ - whereby
294
+ - wherein
295
+ - whereupon
296
+ - wherever
297
+ - whether
298
+ - which
299
+ - while
300
+ - whither
301
+ - who
302
+ - whoever
303
+ - whole
304
+ - whom
305
+ - whose
306
+ - why
307
+ - will
308
+ - with
309
+ - within
310
+ - without
311
+ - would
312
+ - yet
313
+ - you
314
+ - your
315
+ - yours
316
+ - yourself
317
+ - yourselves
@@ -31,7 +31,7 @@ module TextRank
31
31
  tokens = []
32
32
  text.scan(Regexp.new(regular_expressions.flatten.join('|'))) do |matches|
33
33
  m = matches.compact.first
34
- tokens << m if m && m.size > 0
34
+ tokens << m if m&.size&.positive?
35
35
  end
36
36
  tokens
37
37
  end
@@ -1,4 +1,3 @@
1
- #encoding: UTF-8
2
1
  module TextRank
3
2
  module Tokenizer
4
3
 
@@ -12,7 +11,7 @@ module TextRank
12
11
  "\u20a4", # Lira Symbol
13
12
  "\u20a7", # Peseta Sign
14
13
  "\u20ac", # Euro Symbol
15
- "\u20B9", # Rupee
14
+ "\u20B9", # Rupee
16
15
  "\u20a9", # Won Sign
17
16
  "\u20b4", # Hryvnia Sign
18
17
  "\u20af", # Drachma Sign
@@ -34,6 +33,8 @@ module TextRank
34
33
  # A tokenizer regex that preserves money or formatted numbers as a single token. This
35
34
  # currently supports 24 different currency symbols:
36
35
  #
36
+ # rubocop:disable Style/AsciiComments
37
+ #
37
38
  # * ¤
38
39
  # * $
39
40
  # * ¢
@@ -58,19 +59,23 @@ module TextRank
58
59
  # * ₫
59
60
  # * %
60
61
  # * ‰
62
+
63
+ # rubocop:enable Style/AsciiComments
61
64
  #
62
65
  # It also supports two alternative formats for negatives as well as optional three digit comma
63
66
  # separation and optional decimals.
64
67
  ##
65
- Money = %r{
68
+ # rubocop:disable Naming/ConstantName
69
+ Money = /
66
70
  (
67
- #{CURRENCY_SYMBOLS} \-? #{Number} # $-45,231.21
71
+ #{CURRENCY_SYMBOLS} -? #{Number} # $-45,231.21
68
72
  |
69
- \-? #{CURRENCY_SYMBOLS} #{Number} # -$45,231.21
73
+ -? #{CURRENCY_SYMBOLS} #{Number} # -$45,231.21
70
74
  |
71
75
  \( #{CURRENCY_SYMBOLS} #{Number} \) # ($45,231.21)
72
76
  )
73
- }x
77
+ /x
78
+ # rubocop:enable Naming/ConstantName
74
79
 
75
80
  end
76
81
  end
@@ -1,11 +1,11 @@
1
- #encoding: UTF-8
2
1
  module TextRank
3
2
  module Tokenizer
4
3
 
5
4
  ##
6
5
  # A tokenizer regex that preserves (optionally formatted) numbers as a single token.
7
6
  ##
8
- Number = %r{
7
+ # rubocop:disable Naming/ConstantName
8
+ Number = /
9
9
  (
10
10
  [1-9]\d{3,} # 453231162
11
11
  (?:\.\d+)? # 453231162.17
@@ -25,7 +25,8 @@ module TextRank
25
25
 
26
26
  (?:\.\d+) # .17
27
27
  )
28
- }x
28
+ /x
29
+ # rubocop:enable Naming/ConstantName
29
30
 
30
31
  end
31
32
  end
@@ -1,11 +1,14 @@
1
1
  module TextRank
2
2
  module Tokenizer
3
+
3
4
  ##
4
5
  # A tokenizer regex that preserves single punctuation symbols as a token. Use
5
6
  # this if one or more of your TokenFilter classes need punctuation in order to
6
7
  # make decisions.
7
8
  ##
8
- Punctuation = %r{([\p{Punct}])}
9
+ # rubocop:disable Naming/ConstantName
10
+ Punctuation = /(\p{Punct})/
11
+ # rubocop:enable Naming/ConstantName
9
12
 
10
13
  end
11
14
  end
@@ -1,8 +1,10 @@
1
1
  module TextRank
2
2
  module Tokenizer
3
+
3
4
  ##
4
5
  # A tokenizer regex that preserves entire URL's as a token (rather than split them up)
5
6
  ##
7
+ # rubocop:disable Naming/ConstantName
6
8
  Url = %r{
7
9
  (
8
10
  (?:[\w-]+://?|www[.])
@@ -16,6 +18,7 @@ module TextRank
16
18
  )
17
19
  )
18
20
  }xi
21
+ # rubocop:enable Naming/ConstantName
19
22
 
20
23
  end
21
24
  end
@@ -1,11 +1,14 @@
1
1
  module TextRank
2
2
  module Tokenizer
3
+
3
4
  ##
4
5
  # A tokenizer regex that preserves single whitespace characters as a token. Use
5
6
  # this if one or more of your TokenFilter classes need whitespace in order to
6
7
  # make decisions.
7
8
  ##
8
- Whitespace = %r{\s}
9
+ # rubocop:disable Naming/ConstantName
10
+ Whitespace = /\s/
11
+ # rubocop:enable Naming/ConstantName
9
12
 
10
13
  end
11
14
  end