text_rank 1.2.3 → 1.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43) hide show
  1. checksums.yaml +4 -4
  2. data/.codeclimate.yml +1 -1
  3. data/.gitignore +4 -0
  4. data/.rubocop.yml +7 -0
  5. data/.ruby-version +1 -1
  6. data/.travis.yml +1 -0
  7. data/Rakefile +5 -0
  8. data/bin/console +3 -3
  9. data/ext/text_rank/extconf.rb +3 -0
  10. data/ext/text_rank/page_rank_sparse_native.c +300 -0
  11. data/ext/text_rank/page_rank_sparse_native.h +93 -0
  12. data/ext/text_rank/text_rank.c +5 -0
  13. data/lib/page_rank/base.rb +12 -9
  14. data/lib/page_rank/dense.rb +3 -2
  15. data/lib/page_rank/sparse.rb +6 -7
  16. data/lib/page_rank/sparse_native.rb +21 -0
  17. data/lib/page_rank.rb +7 -4
  18. data/lib/text_rank/char_filter/ascii_folding.rb +5 -1
  19. data/lib/text_rank/char_filter/strip_possessive.rb +2 -2
  20. data/lib/text_rank/char_filter/undo_contractions.rb +1 -137
  21. data/lib/text_rank/char_filter/undo_contractions.yml +135 -0
  22. data/lib/text_rank/char_filter.rb +1 -1
  23. data/lib/text_rank/fingerprint.rb +10 -18
  24. data/lib/text_rank/fingerprint_overlap.rb +55 -0
  25. data/lib/text_rank/graph_strategy/coocurrence.rb +15 -6
  26. data/lib/text_rank/keyword_extractor.rb +32 -25
  27. data/lib/text_rank/rank_filter/collapse_adjacent.rb +48 -25
  28. data/lib/text_rank/rank_filter/normalize_probability.rb +2 -1
  29. data/lib/text_rank/rank_filter/normalize_unit_vector.rb +2 -1
  30. data/lib/text_rank/token_filter/part_of_speech.rb +0 -1
  31. data/lib/text_rank/token_filter/stopwords.rb +1 -321
  32. data/lib/text_rank/token_filter/stopwords.yml +317 -0
  33. data/lib/text_rank/tokenizer/money.rb +11 -6
  34. data/lib/text_rank/tokenizer/number.rb +4 -3
  35. data/lib/text_rank/tokenizer/punctuation.rb +4 -1
  36. data/lib/text_rank/tokenizer/url.rb +3 -0
  37. data/lib/text_rank/tokenizer/whitespace.rb +4 -1
  38. data/lib/text_rank/tokenizer/word.rb +5 -2
  39. data/lib/text_rank/tokenizer.rb +1 -1
  40. data/lib/text_rank/version.rb +3 -1
  41. data/lib/text_rank.rb +14 -9
  42. data/text_rank.gemspec +4 -1
  43. metadata +48 -12
@@ -0,0 +1,317 @@
1
+ - a
2
+ - about
3
+ - above
4
+ - across
5
+ - after
6
+ - afterwards
7
+ - again
8
+ - against
9
+ - all
10
+ - almost
11
+ - alone
12
+ - along
13
+ - already
14
+ - also
15
+ - although
16
+ - always
17
+ - am
18
+ - among
19
+ - amongst
20
+ - amoungst
21
+ - amount
22
+ - an
23
+ - and
24
+ - another
25
+ - any
26
+ - anyhow
27
+ - anyone
28
+ - anything
29
+ - anyway
30
+ - anywhere
31
+ - are
32
+ - around
33
+ - as
34
+ - at
35
+ - back
36
+ - be
37
+ - became
38
+ - because
39
+ - become
40
+ - becomes
41
+ - becoming
42
+ - been
43
+ - before
44
+ - beforehand
45
+ - behind
46
+ - being
47
+ - below
48
+ - beside
49
+ - besides
50
+ - between
51
+ - beyond
52
+ - bill
53
+ - both
54
+ - bottom
55
+ - but
56
+ - by
57
+ - call
58
+ - can
59
+ - cannot
60
+ - cant
61
+ - co
62
+ - con
63
+ - could
64
+ - couldnt
65
+ - cry
66
+ - de
67
+ - describe
68
+ - detail
69
+ - do
70
+ - done
71
+ - down
72
+ - due
73
+ - during
74
+ - each
75
+ - eg
76
+ - eight
77
+ - either
78
+ - eleven
79
+ - else
80
+ - elsewhere
81
+ - empty
82
+ - enough
83
+ - etc
84
+ - even
85
+ - ever
86
+ - every
87
+ - everyone
88
+ - everything
89
+ - everywhere
90
+ - except
91
+ - few
92
+ - fifteen
93
+ - fify
94
+ - fill
95
+ - find
96
+ - fire
97
+ - first
98
+ - five
99
+ - for
100
+ - former
101
+ - formerly
102
+ - forty
103
+ - found
104
+ - four
105
+ - from
106
+ - front
107
+ - full
108
+ - further
109
+ - get
110
+ - give
111
+ - go
112
+ - had
113
+ - has
114
+ - hasnt
115
+ - have
116
+ - he
117
+ - hence
118
+ - her
119
+ - here
120
+ - hereafter
121
+ - hereby
122
+ - herein
123
+ - hereupon
124
+ - hers
125
+ - herself
126
+ - him
127
+ - himself
128
+ - his
129
+ - how
130
+ - however
131
+ - hundred
132
+ - ie
133
+ - if
134
+ - in
135
+ - inc
136
+ - indeed
137
+ - interest
138
+ - into
139
+ - is
140
+ - it
141
+ - its
142
+ - itself
143
+ - keep
144
+ - last
145
+ - latter
146
+ - latterly
147
+ - least
148
+ - less
149
+ - ltd
150
+ - made
151
+ - many
152
+ - may
153
+ - me
154
+ - meanwhile
155
+ - might
156
+ - mill
157
+ - mine
158
+ - more
159
+ - moreover
160
+ - most
161
+ - mostly
162
+ - move
163
+ - much
164
+ - must
165
+ - my
166
+ - myself
167
+ - name
168
+ - namely
169
+ - neither
170
+ - never
171
+ - nevertheless
172
+ - next
173
+ - nine
174
+ - no
175
+ - nobody
176
+ - none
177
+ - noone
178
+ - nor
179
+ - not
180
+ - nothing
181
+ - now
182
+ - nowhere
183
+ - of
184
+ - off
185
+ - often
186
+ - on
187
+ - once
188
+ - one
189
+ - only
190
+ - onto
191
+ - or
192
+ - other
193
+ - others
194
+ - otherwise
195
+ - our
196
+ - ours
197
+ - ourselves
198
+ - out
199
+ - over
200
+ - own
201
+ - part
202
+ - per
203
+ - perhaps
204
+ - please
205
+ - put
206
+ - rather
207
+ - re
208
+ - same
209
+ - see
210
+ - seem
211
+ - seemed
212
+ - seeming
213
+ - seems
214
+ - serious
215
+ - several
216
+ - she
217
+ - should
218
+ - show
219
+ - side
220
+ - since
221
+ - sincere
222
+ - six
223
+ - sixty
224
+ - so
225
+ - some
226
+ - somehow
227
+ - someone
228
+ - something
229
+ - sometime
230
+ - sometimes
231
+ - somewhere
232
+ - still
233
+ - such
234
+ - system
235
+ - take
236
+ - ten
237
+ - than
238
+ - that
239
+ - the
240
+ - their
241
+ - them
242
+ - themselves
243
+ - then
244
+ - thence
245
+ - there
246
+ - thereafter
247
+ - thereby
248
+ - therefore
249
+ - therein
250
+ - thereupon
251
+ - these
252
+ - they
253
+ - thickv
254
+ - thin
255
+ - third
256
+ - this
257
+ - those
258
+ - though
259
+ - three
260
+ - through
261
+ - throughout
262
+ - thru
263
+ - thus
264
+ - to
265
+ - together
266
+ - too
267
+ - top
268
+ - toward
269
+ - towards
270
+ - twelve
271
+ - twenty
272
+ - two
273
+ - un
274
+ - under
275
+ - until
276
+ - up
277
+ - upon
278
+ - us
279
+ - very
280
+ - via
281
+ - was
282
+ - we
283
+ - well
284
+ - were
285
+ - what
286
+ - whatever
287
+ - when
288
+ - whence
289
+ - whenever
290
+ - where
291
+ - whereafter
292
+ - whereas
293
+ - whereby
294
+ - wherein
295
+ - whereupon
296
+ - wherever
297
+ - whether
298
+ - which
299
+ - while
300
+ - whither
301
+ - who
302
+ - whoever
303
+ - whole
304
+ - whom
305
+ - whose
306
+ - why
307
+ - will
308
+ - with
309
+ - within
310
+ - without
311
+ - would
312
+ - yet
313
+ - you
314
+ - your
315
+ - yours
316
+ - yourself
317
+ - yourselves
@@ -1,4 +1,3 @@
1
- #encoding: UTF-8
2
1
  module TextRank
3
2
  module Tokenizer
4
3
 
@@ -12,7 +11,7 @@ module TextRank
12
11
  "\u20a4", # Lira Symbol
13
12
  "\u20a7", # Peseta Sign
14
13
  "\u20ac", # Euro Symbol
15
- "\u20B9", # Rupee
14
+ "\u20B9", # Rupee
16
15
  "\u20a9", # Won Sign
17
16
  "\u20b4", # Hryvnia Sign
18
17
  "\u20af", # Drachma Sign
@@ -34,6 +33,8 @@ module TextRank
34
33
  # A tokenizer regex that preserves money or formatted numbers as a single token. This
35
34
  # currently supports 24 different currency symbols:
36
35
  #
36
+ # rubocop:disable Style/AsciiComments
37
+ #
37
38
  # * ¤
38
39
  # * $
39
40
  # * ¢
@@ -58,19 +59,23 @@ module TextRank
58
59
  # * ₫
59
60
  # * %
60
61
  # * ‰
62
+
63
+ # rubocop:enable Style/AsciiComments
61
64
  #
62
65
  # It also supports two alternative formats for negatives as well as optional three digit comma
63
66
  # separation and optional decimals.
64
67
  ##
65
- Money = %r{
68
+ # rubocop:disable Naming/ConstantName
69
+ Money = /
66
70
  (
67
- #{CURRENCY_SYMBOLS} \-? #{Number} # $-45,231.21
71
+ #{CURRENCY_SYMBOLS} -? #{Number} # $-45,231.21
68
72
  |
69
- \-? #{CURRENCY_SYMBOLS} #{Number} # -$45,231.21
73
+ -? #{CURRENCY_SYMBOLS} #{Number} # -$45,231.21
70
74
  |
71
75
  \( #{CURRENCY_SYMBOLS} #{Number} \) # ($45,231.21)
72
76
  )
73
- }x
77
+ /x
78
+ # rubocop:enable Naming/ConstantName
74
79
 
75
80
  end
76
81
  end
@@ -1,11 +1,11 @@
1
- #encoding: UTF-8
2
1
  module TextRank
3
2
  module Tokenizer
4
3
 
5
4
  ##
6
5
  # A tokenizer regex that preserves (optionally formatted) numbers as a single token.
7
6
  ##
8
- Number = %r{
7
+ # rubocop:disable Naming/ConstantName
8
+ Number = /
9
9
  (
10
10
  [1-9]\d{3,} # 453231162
11
11
  (?:\.\d+)? # 453231162.17
@@ -25,7 +25,8 @@ module TextRank
25
25
 
26
26
  (?:\.\d+) # .17
27
27
  )
28
- }x
28
+ /x
29
+ # rubocop:enable Naming/ConstantName
29
30
 
30
31
  end
31
32
  end
@@ -1,11 +1,14 @@
1
1
  module TextRank
2
2
  module Tokenizer
3
+
3
4
  ##
4
5
  # A tokenizer regex that preserves single punctuation symbols as a token. Use
5
6
  # this if one or more of your TokenFilter classes need punctuation in order to
6
7
  # make decisions.
7
8
  ##
8
- Punctuation = %r{([\p{Punct}])}
9
+ # rubocop:disable Naming/ConstantName
10
+ Punctuation = /(\p{Punct})/
11
+ # rubocop:enable Naming/ConstantName
9
12
 
10
13
  end
11
14
  end
@@ -1,8 +1,10 @@
1
1
  module TextRank
2
2
  module Tokenizer
3
+
3
4
  ##
4
5
  # A tokenizer regex that preserves entire URLs as a single token (rather than splitting them up)
5
6
  ##
7
+ # rubocop:disable Naming/ConstantName
6
8
  Url = %r{
7
9
  (
8
10
  (?:[\w-]+://?|www[.])
@@ -16,6 +18,7 @@ module TextRank
16
18
  )
17
19
  )
18
20
  }xi
21
+ # rubocop:enable Naming/ConstantName
19
22
 
20
23
  end
21
24
  end
@@ -1,11 +1,14 @@
1
1
  module TextRank
2
2
  module Tokenizer
3
+
3
4
  ##
4
5
  # A tokenizer regex that preserves single whitespace characters as a token. Use
5
6
  # this if one or more of your TokenFilter classes need whitespace in order to
6
7
  # make decisions.
7
8
  ##
8
- Whitespace = %r{\s}
9
+ # rubocop:disable Naming/ConstantName
10
+ Whitespace = /\s/
11
+ # rubocop:enable Naming/ConstantName
9
12
 
10
13
  end
11
14
  end
@@ -1,14 +1,17 @@
1
1
  module TextRank
2
2
  module Tokenizer
3
+
3
4
  ##
4
5
  # A tokenizer regex that preserves a non-space, non-punctuation "word". It does
5
6
  # allow hyphens and numerals, but the first character must be a letter (A-Z, case-insensitive).
6
7
  ##
7
- Word = %r{
8
+ # rubocop:disable Naming/ConstantName
9
+ Word = /
8
10
  (
9
11
  [a-z][a-z0-9-]*
10
12
  )
11
- }xi
13
+ /xi
14
+ # rubocop:enable Naming/ConstantName
12
15
 
13
16
  end
14
17
  end
@@ -31,7 +31,7 @@ module TextRank
31
31
  tokens = []
32
32
  text.scan(Regexp.new(regular_expressions.flatten.join('|'))) do |matches|
33
33
  m = matches.compact.first
34
- tokens << m if m && m.size > 0
34
+ tokens << m if m&.size&.positive?
35
35
  end
36
36
  tokens
37
37
  end
@@ -1,4 +1,6 @@
1
1
  module TextRank
2
+
2
3
  # Current gem version
3
- VERSION = '1.2.3'
4
+ VERSION = '1.3.0'
5
+
4
6
  end
data/lib/text_rank.rb CHANGED
@@ -1,4 +1,6 @@
1
1
  require 'page_rank'
2
+ require 'set'
3
+ require 'yaml'
2
4
 
3
5
  ##
4
6
  # Provides convenience methods for quickly extracting keywords.
@@ -7,17 +9,18 @@ require 'page_rank'
7
9
  ##
8
10
  module TextRank
9
11
 
10
- autoload :CharFilter, 'text_rank/char_filter'
11
- autoload :Fingerprint, 'text_rank/fingerprint'
12
- autoload :GraphStrategy, 'text_rank/graph_strategy'
13
- autoload :KeywordExtractor, 'text_rank/keyword_extractor'
14
- autoload :RankFilter, 'text_rank/rank_filter'
15
- autoload :TokenFilter, 'text_rank/token_filter'
16
- autoload :Tokenizer, 'text_rank/tokenizer'
17
- autoload :VERSION, 'text_rank/version'
12
+ autoload :CharFilter, 'text_rank/char_filter'
13
+ autoload :Fingerprint, 'text_rank/fingerprint'
14
+ autoload :FingerprintOverlap, 'text_rank/fingerprint_overlap'
15
+ autoload :GraphStrategy, 'text_rank/graph_strategy'
16
+ autoload :KeywordExtractor, 'text_rank/keyword_extractor'
17
+ autoload :RankFilter, 'text_rank/rank_filter'
18
+ autoload :TokenFilter, 'text_rank/token_filter'
19
+ autoload :Tokenizer, 'text_rank/tokenizer'
20
+ autoload :VERSION, 'text_rank/version'
18
21
 
19
22
  # A convenience method for quickly extracting keywords from text with default options
20
- # @param text [String] text from which to extract keywords
23
+ # @param text [String,Array<String>] text from which to extract keywords
21
24
  # @option (see KeywordExtractor.basic)
22
25
  # @return [Hash<String, Float>] of tokens and text rank (in descending order)
23
26
  def self.extract_keywords(text, **options)
@@ -37,3 +40,5 @@ module TextRank
37
40
  end
38
41
 
39
42
  end
43
+
44
+ require 'text_rank/text_rank'
data/text_rank.gemspec CHANGED
@@ -16,13 +16,16 @@ Gem::Specification.new do |spec|
16
16
  spec.files = `git ls-files -z`.split("\x0").reject { |f| f.match(%r{^(test|spec|features)/}) }
17
17
  spec.bindir = 'exe'
18
18
  spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
19
+ spec.extensions = ['ext/text_rank/extconf.rb']
19
20
  spec.require_paths = ['lib']
20
21
 
21
22
  spec.add_development_dependency 'bundler'
22
23
  spec.add_development_dependency 'rake'
24
+ spec.add_development_dependency 'rake-compiler'
23
25
  spec.add_development_dependency 'rspec'
24
26
  spec.add_development_dependency 'rubocop'
25
- spec.add_development_dependency 'simplecov', '~> 0.17.0' # 0.18 not supported by code climate
27
+ spec.add_development_dependency 'simplecov'
28
+ spec.add_development_dependency 'yard'
26
29
 
27
30
  spec.add_development_dependency 'engtagger' # Optional runtime dependency but needed for specs
28
31
  spec.add_development_dependency 'nokogiri' # Optional runtime dependency but needed for specs
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: text_rank
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.2.3
4
+ version: 1.3.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - David McCullars
8
- autorequire:
8
+ autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2020-06-10 00:00:00.000000000 Z
11
+ date: 2021-12-17 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -38,6 +38,20 @@ dependencies:
38
38
  - - ">="
39
39
  - !ruby/object:Gem::Version
40
40
  version: '0'
41
+ - !ruby/object:Gem::Dependency
42
+ name: rake-compiler
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - ">="
46
+ - !ruby/object:Gem::Version
47
+ version: '0'
48
+ type: :development
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - ">="
53
+ - !ruby/object:Gem::Version
54
+ version: '0'
41
55
  - !ruby/object:Gem::Dependency
42
56
  name: rspec
43
57
  requirement: !ruby/object:Gem::Requirement
@@ -70,16 +84,30 @@ dependencies:
70
84
  name: simplecov
71
85
  requirement: !ruby/object:Gem::Requirement
72
86
  requirements:
73
- - - "~>"
87
+ - - ">="
88
+ - !ruby/object:Gem::Version
89
+ version: '0'
90
+ type: :development
91
+ prerelease: false
92
+ version_requirements: !ruby/object:Gem::Requirement
93
+ requirements:
94
+ - - ">="
95
+ - !ruby/object:Gem::Version
96
+ version: '0'
97
+ - !ruby/object:Gem::Dependency
98
+ name: yard
99
+ requirement: !ruby/object:Gem::Requirement
100
+ requirements:
101
+ - - ">="
74
102
  - !ruby/object:Gem::Version
75
- version: 0.17.0
103
+ version: '0'
76
104
  type: :development
77
105
  prerelease: false
78
106
  version_requirements: !ruby/object:Gem::Requirement
79
107
  requirements:
80
- - - "~>"
108
+ - - ">="
81
109
  - !ruby/object:Gem::Version
82
- version: 0.17.0
110
+ version: '0'
83
111
  - !ruby/object:Gem::Dependency
84
112
  name: engtagger
85
113
  requirement: !ruby/object:Gem::Requirement
@@ -113,7 +141,8 @@ description: Implementation of TextRank solution to ranked keyword extraction.
113
141
  email:
114
142
  - david.mccullars@gmail.com
115
143
  executables: []
116
- extensions: []
144
+ extensions:
145
+ - ext/text_rank/extconf.rb
117
146
  extra_rdoc_files: []
118
147
  files:
119
148
  - ".codeclimate.yml"
@@ -129,10 +158,15 @@ files:
129
158
  - Rakefile
130
159
  - bin/console
131
160
  - bin/setup
161
+ - ext/text_rank/extconf.rb
162
+ - ext/text_rank/page_rank_sparse_native.c
163
+ - ext/text_rank/page_rank_sparse_native.h
164
+ - ext/text_rank/text_rank.c
132
165
  - lib/page_rank.rb
133
166
  - lib/page_rank/base.rb
134
167
  - lib/page_rank/dense.rb
135
168
  - lib/page_rank/sparse.rb
169
+ - lib/page_rank/sparse_native.rb
136
170
  - lib/text_rank.rb
137
171
  - lib/text_rank/char_filter.rb
138
172
  - lib/text_rank/char_filter/ascii_folding.rb
@@ -141,7 +175,9 @@ files:
141
175
  - lib/text_rank/char_filter/strip_html.rb
142
176
  - lib/text_rank/char_filter/strip_possessive.rb
143
177
  - lib/text_rank/char_filter/undo_contractions.rb
178
+ - lib/text_rank/char_filter/undo_contractions.yml
144
179
  - lib/text_rank/fingerprint.rb
180
+ - lib/text_rank/fingerprint_overlap.rb
145
181
  - lib/text_rank/graph_strategy.rb
146
182
  - lib/text_rank/graph_strategy/coocurrence.rb
147
183
  - lib/text_rank/keyword_extractor.rb
@@ -154,6 +190,7 @@ files:
154
190
  - lib/text_rank/token_filter/min_length.rb
155
191
  - lib/text_rank/token_filter/part_of_speech.rb
156
192
  - lib/text_rank/token_filter/stopwords.rb
193
+ - lib/text_rank/token_filter/stopwords.yml
157
194
  - lib/text_rank/tokenizer.rb
158
195
  - lib/text_rank/tokenizer/money.rb
159
196
  - lib/text_rank/tokenizer/number.rb
@@ -167,7 +204,7 @@ homepage: https://github.com/david-mccullars/text_rank
167
204
  licenses:
168
205
  - MIT
169
206
  metadata: {}
170
- post_install_message:
207
+ post_install_message:
171
208
  rdoc_options: []
172
209
  require_paths:
173
210
  - lib
@@ -182,9 +219,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
182
219
  - !ruby/object:Gem::Version
183
220
  version: '0'
184
221
  requirements: []
185
- rubyforge_project:
186
- rubygems_version: 2.7.6
187
- signing_key:
222
+ rubygems_version: 3.2.32
223
+ signing_key:
188
224
  specification_version: 4
189
225
  summary: Implementation of TextRank solution to ranked keyword extraction
190
226
  test_files: []