text_rank 1.2.3 → 1.3.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (43)
  1. checksums.yaml +4 -4
  2. data/.codeclimate.yml +1 -1
  3. data/.gitignore +4 -0
  4. data/.rubocop.yml +7 -0
  5. data/.ruby-version +1 -1
  6. data/.travis.yml +1 -0
  7. data/Rakefile +5 -0
  8. data/bin/console +3 -3
  9. data/ext/text_rank/extconf.rb +3 -0
  10. data/ext/text_rank/page_rank_sparse_native.c +300 -0
  11. data/ext/text_rank/page_rank_sparse_native.h +93 -0
  12. data/ext/text_rank/text_rank.c +5 -0
  13. data/lib/page_rank/base.rb +12 -9
  14. data/lib/page_rank/dense.rb +3 -2
  15. data/lib/page_rank/sparse.rb +6 -7
  16. data/lib/page_rank/sparse_native.rb +21 -0
  17. data/lib/page_rank.rb +7 -4
  18. data/lib/text_rank/char_filter/ascii_folding.rb +5 -1
  19. data/lib/text_rank/char_filter/strip_possessive.rb +2 -2
  20. data/lib/text_rank/char_filter/undo_contractions.rb +1 -137
  21. data/lib/text_rank/char_filter/undo_contractions.yml +135 -0
  22. data/lib/text_rank/char_filter.rb +1 -1
  23. data/lib/text_rank/fingerprint.rb +10 -18
  24. data/lib/text_rank/fingerprint_overlap.rb +55 -0
  25. data/lib/text_rank/graph_strategy/coocurrence.rb +15 -6
  26. data/lib/text_rank/keyword_extractor.rb +32 -25
  27. data/lib/text_rank/rank_filter/collapse_adjacent.rb +48 -25
  28. data/lib/text_rank/rank_filter/normalize_probability.rb +2 -1
  29. data/lib/text_rank/rank_filter/normalize_unit_vector.rb +2 -1
  30. data/lib/text_rank/token_filter/part_of_speech.rb +0 -1
  31. data/lib/text_rank/token_filter/stopwords.rb +1 -321
  32. data/lib/text_rank/token_filter/stopwords.yml +317 -0
  33. data/lib/text_rank/tokenizer/money.rb +11 -6
  34. data/lib/text_rank/tokenizer/number.rb +4 -3
  35. data/lib/text_rank/tokenizer/punctuation.rb +4 -1
  36. data/lib/text_rank/tokenizer/url.rb +3 -0
  37. data/lib/text_rank/tokenizer/whitespace.rb +4 -1
  38. data/lib/text_rank/tokenizer/word.rb +5 -2
  39. data/lib/text_rank/tokenizer.rb +1 -1
  40. data/lib/text_rank/version.rb +3 -1
  41. data/lib/text_rank.rb +14 -9
  42. data/text_rank.gemspec +4 -1
  43. metadata +48 -12
@@ -0,0 +1,317 @@
1
+ - a
2
+ - about
3
+ - above
4
+ - across
5
+ - after
6
+ - afterwards
7
+ - again
8
+ - against
9
+ - all
10
+ - almost
11
+ - alone
12
+ - along
13
+ - already
14
+ - also
15
+ - although
16
+ - always
17
+ - am
18
+ - among
19
+ - amongst
20
+ - amoungst
21
+ - amount
22
+ - an
23
+ - and
24
+ - another
25
+ - any
26
+ - anyhow
27
+ - anyone
28
+ - anything
29
+ - anyway
30
+ - anywhere
31
+ - are
32
+ - around
33
+ - as
34
+ - at
35
+ - back
36
+ - be
37
+ - became
38
+ - because
39
+ - become
40
+ - becomes
41
+ - becoming
42
+ - been
43
+ - before
44
+ - beforehand
45
+ - behind
46
+ - being
47
+ - below
48
+ - beside
49
+ - besides
50
+ - between
51
+ - beyond
52
+ - bill
53
+ - both
54
+ - bottom
55
+ - but
56
+ - by
57
+ - call
58
+ - can
59
+ - cannot
60
+ - cant
61
+ - co
62
+ - con
63
+ - could
64
+ - couldnt
65
+ - cry
66
+ - de
67
+ - describe
68
+ - detail
69
+ - do
70
+ - done
71
+ - down
72
+ - due
73
+ - during
74
+ - each
75
+ - eg
76
+ - eight
77
+ - either
78
+ - eleven
79
+ - else
80
+ - elsewhere
81
+ - empty
82
+ - enough
83
+ - etc
84
+ - even
85
+ - ever
86
+ - every
87
+ - everyone
88
+ - everything
89
+ - everywhere
90
+ - except
91
+ - few
92
+ - fifteen
93
+ - fify
94
+ - fill
95
+ - find
96
+ - fire
97
+ - first
98
+ - five
99
+ - for
100
+ - former
101
+ - formerly
102
+ - forty
103
+ - found
104
+ - four
105
+ - from
106
+ - front
107
+ - full
108
+ - further
109
+ - get
110
+ - give
111
+ - go
112
+ - had
113
+ - has
114
+ - hasnt
115
+ - have
116
+ - he
117
+ - hence
118
+ - her
119
+ - here
120
+ - hereafter
121
+ - hereby
122
+ - herein
123
+ - hereupon
124
+ - hers
125
+ - herself
126
+ - him
127
+ - himself
128
+ - his
129
+ - how
130
+ - however
131
+ - hundred
132
+ - ie
133
+ - if
134
+ - in
135
+ - inc
136
+ - indeed
137
+ - interest
138
+ - into
139
+ - is
140
+ - it
141
+ - its
142
+ - itself
143
+ - keep
144
+ - last
145
+ - latter
146
+ - latterly
147
+ - least
148
+ - less
149
+ - ltd
150
+ - made
151
+ - many
152
+ - may
153
+ - me
154
+ - meanwhile
155
+ - might
156
+ - mill
157
+ - mine
158
+ - more
159
+ - moreover
160
+ - most
161
+ - mostly
162
+ - move
163
+ - much
164
+ - must
165
+ - my
166
+ - myself
167
+ - name
168
+ - namely
169
+ - neither
170
+ - never
171
+ - nevertheless
172
+ - next
173
+ - nine
174
+ - no
175
+ - nobody
176
+ - none
177
+ - noone
178
+ - nor
179
+ - not
180
+ - nothing
181
+ - now
182
+ - nowhere
183
+ - of
184
+ - off
185
+ - often
186
+ - on
187
+ - once
188
+ - one
189
+ - only
190
+ - onto
191
+ - or
192
+ - other
193
+ - others
194
+ - otherwise
195
+ - our
196
+ - ours
197
+ - ourselves
198
+ - out
199
+ - over
200
+ - own
201
+ - part
202
+ - per
203
+ - perhaps
204
+ - please
205
+ - put
206
+ - rather
207
+ - re
208
+ - same
209
+ - see
210
+ - seem
211
+ - seemed
212
+ - seeming
213
+ - seems
214
+ - serious
215
+ - several
216
+ - she
217
+ - should
218
+ - show
219
+ - side
220
+ - since
221
+ - sincere
222
+ - six
223
+ - sixty
224
+ - so
225
+ - some
226
+ - somehow
227
+ - someone
228
+ - something
229
+ - sometime
230
+ - sometimes
231
+ - somewhere
232
+ - still
233
+ - such
234
+ - system
235
+ - take
236
+ - ten
237
+ - than
238
+ - that
239
+ - the
240
+ - their
241
+ - them
242
+ - themselves
243
+ - then
244
+ - thence
245
+ - there
246
+ - thereafter
247
+ - thereby
248
+ - therefore
249
+ - therein
250
+ - thereupon
251
+ - these
252
+ - they
253
+ - thickv
254
+ - thin
255
+ - third
256
+ - this
257
+ - those
258
+ - though
259
+ - three
260
+ - through
261
+ - throughout
262
+ - thru
263
+ - thus
264
+ - to
265
+ - together
266
+ - too
267
+ - top
268
+ - toward
269
+ - towards
270
+ - twelve
271
+ - twenty
272
+ - two
273
+ - un
274
+ - under
275
+ - until
276
+ - up
277
+ - upon
278
+ - us
279
+ - very
280
+ - via
281
+ - was
282
+ - we
283
+ - well
284
+ - were
285
+ - what
286
+ - whatever
287
+ - when
288
+ - whence
289
+ - whenever
290
+ - where
291
+ - whereafter
292
+ - whereas
293
+ - whereby
294
+ - wherein
295
+ - whereupon
296
+ - wherever
297
+ - whether
298
+ - which
299
+ - while
300
+ - whither
301
+ - who
302
+ - whoever
303
+ - whole
304
+ - whom
305
+ - whose
306
+ - why
307
+ - will
308
+ - with
309
+ - within
310
+ - without
311
+ - would
312
+ - yet
313
+ - you
314
+ - your
315
+ - yours
316
+ - yourself
317
+ - yourselves
@@ -1,4 +1,3 @@
1
- #encoding: UTF-8
2
1
  module TextRank
3
2
  module Tokenizer
4
3
 
@@ -12,7 +11,7 @@ module TextRank
12
11
  "\u20a4", # Lira Symbol
13
12
  "\u20a7", # Peseta Sign
14
13
  "\u20ac", # Euro Symbol
15
- "\u20B9", # Rupee
14
+ "\u20B9", # Rupee
16
15
  "\u20a9", # Won Sign
17
16
  "\u20b4", # Hryvnia Sign
18
17
  "\u20af", # Drachma Sign
@@ -34,6 +33,8 @@ module TextRank
34
33
  # A tokenizer regex that preserves money or formatted numbers as a single token. This
35
34
  # currently supports 24 different currency symbols:
36
35
  #
36
+ # rubocop:disable Style/AsciiComments
37
+ #
37
38
  # * ¤
38
39
  # * $
39
40
  # * ¢
@@ -58,19 +59,23 @@ module TextRank
58
59
  # * ₫
59
60
  # * %
60
61
  # * ‰
62
+
63
+ # rubocop:enable Style/AsciiComments
61
64
  #
62
65
  # It also supports two alternative formats for negatives as well as optional three digit comma
63
66
  # separation and optional decimals.
64
67
  ##
65
- Money = %r{
68
+ # rubocop:disable Naming/ConstantName
69
+ Money = /
66
70
  (
67
- #{CURRENCY_SYMBOLS} \-? #{Number} # $-45,231.21
71
+ #{CURRENCY_SYMBOLS} -? #{Number} # $-45,231.21
68
72
  |
69
- \-? #{CURRENCY_SYMBOLS} #{Number} # -$45,231.21
73
+ -? #{CURRENCY_SYMBOLS} #{Number} # -$45,231.21
70
74
  |
71
75
  \( #{CURRENCY_SYMBOLS} #{Number} \) # ($45,231.21)
72
76
  )
73
- }x
77
+ /x
78
+ # rubocop:enable Naming/ConstantName
74
79
 
75
80
  end
76
81
  end
@@ -1,11 +1,11 @@
1
- #encoding: UTF-8
2
1
  module TextRank
3
2
  module Tokenizer
4
3
 
5
4
  ##
6
5
  # A tokenizer regex that preserves (optionally formatted) numbers as a single token.
7
6
  ##
8
- Number = %r{
7
+ # rubocop:disable Naming/ConstantName
8
+ Number = /
9
9
  (
10
10
  [1-9]\d{3,} # 453231162
11
11
  (?:\.\d+)? # 453231162.17
@@ -25,7 +25,8 @@ module TextRank
25
25
 
26
26
  (?:\.\d+) # .17
27
27
  )
28
- }x
28
+ /x
29
+ # rubocop:enable Naming/ConstantName
29
30
 
30
31
  end
31
32
  end
@@ -1,11 +1,14 @@
1
1
  module TextRank
2
2
  module Tokenizer
3
+
3
4
  ##
4
5
  # A tokenizer regex that preserves single punctuation symbols as a token. Use
5
6
  # this if one or more of your TokenFilter classes need punctuation in order to
6
7
  # make decisions.
7
8
  ##
8
- Punctuation = %r{([\p{Punct}])}
9
+ # rubocop:disable Naming/ConstantName
10
+ Punctuation = /(\p{Punct})/
11
+ # rubocop:enable Naming/ConstantName
9
12
 
10
13
  end
11
14
  end
@@ -1,8 +1,10 @@
1
1
  module TextRank
2
2
  module Tokenizer
3
+
3
4
  ##
4
5
  # A tokenizer regex that preserves entire URLs as a token (rather than splitting them up)
5
6
  ##
7
+ # rubocop:disable Naming/ConstantName
6
8
  Url = %r{
7
9
  (
8
10
  (?:[\w-]+://?|www[.])
@@ -16,6 +18,7 @@ module TextRank
16
18
  )
17
19
  )
18
20
  }xi
21
+ # rubocop:enable Naming/ConstantName
19
22
 
20
23
  end
21
24
  end
@@ -1,11 +1,14 @@
1
1
  module TextRank
2
2
  module Tokenizer
3
+
3
4
  ##
4
5
  # A tokenizer regex that preserves single whitespace characters as a token. Use
5
6
  # this if one or more of your TokenFilter classes need whitespace in order to
6
7
  # make decisions.
7
8
  ##
8
- Whitespace = %r{\s}
9
+ # rubocop:disable Naming/ConstantName
10
+ Whitespace = /\s/
11
+ # rubocop:enable Naming/ConstantName
9
12
 
10
13
  end
11
14
  end
@@ -1,14 +1,17 @@
1
1
  module TextRank
2
2
  module Tokenizer
3
+
3
4
  ##
4
5
  # A tokenizer regex that preserves a non-space, non-punctuation "word". It does
5
6
  # allow hyphens and numerals, but the first character must be an A-Z character.
6
7
  ##
7
- Word = %r{
8
+ # rubocop:disable Naming/ConstantName
9
+ Word = /
8
10
  (
9
11
  [a-z][a-z0-9-]*
10
12
  )
11
- }xi
13
+ /xi
14
+ # rubocop:enable Naming/ConstantName
12
15
 
13
16
  end
14
17
  end
@@ -31,7 +31,7 @@ module TextRank
31
31
  tokens = []
32
32
  text.scan(Regexp.new(regular_expressions.flatten.join('|'))) do |matches|
33
33
  m = matches.compact.first
34
- tokens << m if m && m.size > 0
34
+ tokens << m if m&.size&.positive?
35
35
  end
36
36
  tokens
37
37
  end
@@ -1,4 +1,6 @@
1
1
  module TextRank
2
+
2
3
  # Current gem version
3
- VERSION = '1.2.3'
4
+ VERSION = '1.3.0'
5
+
4
6
  end
data/lib/text_rank.rb CHANGED
@@ -1,4 +1,6 @@
1
1
  require 'page_rank'
2
+ require 'set'
3
+ require 'yaml'
2
4
 
3
5
  ##
4
6
  # Provides convenience methods for quickly extracting keywords.
@@ -7,17 +9,18 @@ require 'page_rank'
7
9
  ##
8
10
  module TextRank
9
11
 
10
- autoload :CharFilter, 'text_rank/char_filter'
11
- autoload :Fingerprint, 'text_rank/fingerprint'
12
- autoload :GraphStrategy, 'text_rank/graph_strategy'
13
- autoload :KeywordExtractor, 'text_rank/keyword_extractor'
14
- autoload :RankFilter, 'text_rank/rank_filter'
15
- autoload :TokenFilter, 'text_rank/token_filter'
16
- autoload :Tokenizer, 'text_rank/tokenizer'
17
- autoload :VERSION, 'text_rank/version'
12
+ autoload :CharFilter, 'text_rank/char_filter'
13
+ autoload :Fingerprint, 'text_rank/fingerprint'
14
+ autoload :FingerprintOverlap, 'text_rank/fingerprint_overlap'
15
+ autoload :GraphStrategy, 'text_rank/graph_strategy'
16
+ autoload :KeywordExtractor, 'text_rank/keyword_extractor'
17
+ autoload :RankFilter, 'text_rank/rank_filter'
18
+ autoload :TokenFilter, 'text_rank/token_filter'
19
+ autoload :Tokenizer, 'text_rank/tokenizer'
20
+ autoload :VERSION, 'text_rank/version'
18
21
 
19
22
  # A convenience method for quickly extracting keywords from text with default options
20
- # @param text [String] text from which to extract keywords
23
+ # @param text [String,Array<String>] text from which to extract keywords
21
24
  # @option (see KeywordExtractor.basic)
22
25
  # @return [Hash<String, Float>] of tokens and text rank (in descending order)
23
26
  def self.extract_keywords(text, **options)
@@ -37,3 +40,5 @@ module TextRank
37
40
  end
38
41
 
39
42
  end
43
+
44
+ require 'text_rank/text_rank'
data/text_rank.gemspec CHANGED
@@ -16,13 +16,16 @@ Gem::Specification.new do |spec|
16
16
  spec.files = `git ls-files -z`.split("\x0").reject { |f| f.match(%r{^(test|spec|features)/}) }
17
17
  spec.bindir = 'exe'
18
18
  spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
19
+ spec.extensions = ['ext/text_rank/extconf.rb']
19
20
  spec.require_paths = ['lib']
20
21
 
21
22
  spec.add_development_dependency 'bundler'
22
23
  spec.add_development_dependency 'rake'
24
+ spec.add_development_dependency 'rake-compiler'
23
25
  spec.add_development_dependency 'rspec'
24
26
  spec.add_development_dependency 'rubocop'
25
- spec.add_development_dependency 'simplecov', '~> 0.17.0' # 0.18 not supported by code climate
27
+ spec.add_development_dependency 'simplecov'
28
+ spec.add_development_dependency 'yard'
26
29
 
27
30
  spec.add_development_dependency 'engtagger' # Optional runtime dependency but needed for specs
28
31
  spec.add_development_dependency 'nokogiri' # Optional runtime dependency but needed for specs
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: text_rank
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.2.3
4
+ version: 1.3.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - David McCullars
8
- autorequire:
8
+ autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2020-06-10 00:00:00.000000000 Z
11
+ date: 2021-12-17 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -38,6 +38,20 @@ dependencies:
38
38
  - - ">="
39
39
  - !ruby/object:Gem::Version
40
40
  version: '0'
41
+ - !ruby/object:Gem::Dependency
42
+ name: rake-compiler
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - ">="
46
+ - !ruby/object:Gem::Version
47
+ version: '0'
48
+ type: :development
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - ">="
53
+ - !ruby/object:Gem::Version
54
+ version: '0'
41
55
  - !ruby/object:Gem::Dependency
42
56
  name: rspec
43
57
  requirement: !ruby/object:Gem::Requirement
@@ -70,16 +84,30 @@ dependencies:
70
84
  name: simplecov
71
85
  requirement: !ruby/object:Gem::Requirement
72
86
  requirements:
73
- - - "~>"
87
+ - - ">="
88
+ - !ruby/object:Gem::Version
89
+ version: '0'
90
+ type: :development
91
+ prerelease: false
92
+ version_requirements: !ruby/object:Gem::Requirement
93
+ requirements:
94
+ - - ">="
95
+ - !ruby/object:Gem::Version
96
+ version: '0'
97
+ - !ruby/object:Gem::Dependency
98
+ name: yard
99
+ requirement: !ruby/object:Gem::Requirement
100
+ requirements:
101
+ - - ">="
74
102
  - !ruby/object:Gem::Version
75
- version: 0.17.0
103
+ version: '0'
76
104
  type: :development
77
105
  prerelease: false
78
106
  version_requirements: !ruby/object:Gem::Requirement
79
107
  requirements:
80
- - - "~>"
108
+ - - ">="
81
109
  - !ruby/object:Gem::Version
82
- version: 0.17.0
110
+ version: '0'
83
111
  - !ruby/object:Gem::Dependency
84
112
  name: engtagger
85
113
  requirement: !ruby/object:Gem::Requirement
@@ -113,7 +141,8 @@ description: Implementation of TextRank solution to ranked keyword extraction.
113
141
  email:
114
142
  - david.mccullars@gmail.com
115
143
  executables: []
116
- extensions: []
144
+ extensions:
145
+ - ext/text_rank/extconf.rb
117
146
  extra_rdoc_files: []
118
147
  files:
119
148
  - ".codeclimate.yml"
@@ -129,10 +158,15 @@ files:
129
158
  - Rakefile
130
159
  - bin/console
131
160
  - bin/setup
161
+ - ext/text_rank/extconf.rb
162
+ - ext/text_rank/page_rank_sparse_native.c
163
+ - ext/text_rank/page_rank_sparse_native.h
164
+ - ext/text_rank/text_rank.c
132
165
  - lib/page_rank.rb
133
166
  - lib/page_rank/base.rb
134
167
  - lib/page_rank/dense.rb
135
168
  - lib/page_rank/sparse.rb
169
+ - lib/page_rank/sparse_native.rb
136
170
  - lib/text_rank.rb
137
171
  - lib/text_rank/char_filter.rb
138
172
  - lib/text_rank/char_filter/ascii_folding.rb
@@ -141,7 +175,9 @@ files:
141
175
  - lib/text_rank/char_filter/strip_html.rb
142
176
  - lib/text_rank/char_filter/strip_possessive.rb
143
177
  - lib/text_rank/char_filter/undo_contractions.rb
178
+ - lib/text_rank/char_filter/undo_contractions.yml
144
179
  - lib/text_rank/fingerprint.rb
180
+ - lib/text_rank/fingerprint_overlap.rb
145
181
  - lib/text_rank/graph_strategy.rb
146
182
  - lib/text_rank/graph_strategy/coocurrence.rb
147
183
  - lib/text_rank/keyword_extractor.rb
@@ -154,6 +190,7 @@ files:
154
190
  - lib/text_rank/token_filter/min_length.rb
155
191
  - lib/text_rank/token_filter/part_of_speech.rb
156
192
  - lib/text_rank/token_filter/stopwords.rb
193
+ - lib/text_rank/token_filter/stopwords.yml
157
194
  - lib/text_rank/tokenizer.rb
158
195
  - lib/text_rank/tokenizer/money.rb
159
196
  - lib/text_rank/tokenizer/number.rb
@@ -167,7 +204,7 @@ homepage: https://github.com/david-mccullars/text_rank
167
204
  licenses:
168
205
  - MIT
169
206
  metadata: {}
170
- post_install_message:
207
+ post_install_message:
171
208
  rdoc_options: []
172
209
  require_paths:
173
210
  - lib
@@ -182,9 +219,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
182
219
  - !ruby/object:Gem::Version
183
220
  version: '0'
184
221
  requirements: []
185
- rubyforge_project:
186
- rubygems_version: 2.7.6
187
- signing_key:
222
+ rubygems_version: 3.2.32
223
+ signing_key:
188
224
  specification_version: 4
189
225
  summary: Implementation of TextRank solution to ranked keyword extraction
190
226
  test_files: []