text_rank 1.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (42)
  1. checksums.yaml +7 -0
  2. data/.codeclimate.yml +29 -0
  3. data/.gitignore +10 -0
  4. data/.rspec +2 -0
  5. data/.rubocop.yml +1157 -0
  6. data/.ruby-version +1 -0
  7. data/.travis.yml +7 -0
  8. data/CODE_OF_CONDUCT.md +49 -0
  9. data/Gemfile +3 -0
  10. data/LICENSE.txt +21 -0
  11. data/README.md +137 -0
  12. data/Rakefile +12 -0
  13. data/bin/console +14 -0
  14. data/bin/setup +8 -0
  15. data/lib/page_rank/base.rb +89 -0
  16. data/lib/page_rank/dense.rb +89 -0
  17. data/lib/page_rank/sparse.rb +87 -0
  18. data/lib/page_rank.rb +39 -0
  19. data/lib/text_rank/char_filter/ascii_folding.rb +26 -0
  20. data/lib/text_rank/char_filter/lowercase.rb +22 -0
  21. data/lib/text_rank/char_filter/strip_email.rb +24 -0
  22. data/lib/text_rank/char_filter/strip_html.rb +41 -0
  23. data/lib/text_rank/char_filter/strip_possessive.rb +24 -0
  24. data/lib/text_rank/char_filter/undo_contractions.rb +162 -0
  25. data/lib/text_rank/char_filter.rb +24 -0
  26. data/lib/text_rank/graph_strategy/coocurrence.rb +78 -0
  27. data/lib/text_rank/graph_strategy.rb +23 -0
  28. data/lib/text_rank/keyword_extractor.rb +155 -0
  29. data/lib/text_rank/rank_filter/collapse_adjacent.rb +81 -0
  30. data/lib/text_rank/rank_filter.rb +18 -0
  31. data/lib/text_rank/token_filter/min_length.rb +33 -0
  32. data/lib/text_rank/token_filter/part_of_speech.rb +45 -0
  33. data/lib/text_rank/token_filter/stopwords.rb +349 -0
  34. data/lib/text_rank/token_filter.rb +18 -0
  35. data/lib/text_rank/tokenizer/regex.rb +26 -0
  36. data/lib/text_rank/tokenizer/whitespace.rb +19 -0
  37. data/lib/text_rank/tokenizer/words_and_punctuation.rb +26 -0
  38. data/lib/text_rank/tokenizer.rb +19 -0
  39. data/lib/text_rank/version.rb +3 -0
  40. data/lib/text_rank.rb +34 -0
  41. data/text_rank.gemspec +30 -0
  42. metadata +183 -0
@@ -0,0 +1,349 @@
1
require 'set'

module TextRank
  module TokenFilter
    ##
    # Token filter to remove common stop word tokens.
    #
    # Matching is case-insensitive: each token is downcased before being
    # looked up in the stop-word set, so "The" and "the" are both removed.
    #
    # = Example
    #
    #   Stopwords.new.filter!(%w[
    #     but for what purpose was the earth formed to drive us mad
    #   ])
    #   => ["purpose", "earth", "formed", "drive", "mad"]
    ##
    class Stopwords

      # Default English stop-word list. Frozen to guard against accidental
      # mutation of the shared constant.
      #
      # NOTE: the entries "fify" and "thickv" are typos inherited from the
      # classic Glasgow stop list; they are kept for backward compatibility,
      # and the corrected words "fifty" and "thick" are included alongside.
      STOP_WORDS = Set.new(%w[
        a about above across after afterwards again against all almost
        alone along already also although always am among amongst amoungst
        amount an and another any anyhow anyone anything anyway anywhere
        are around as at back be became because become becomes becoming
        been before beforehand behind being below beside besides between
        beyond bill both bottom but by call can cannot cant co con could
        couldnt cry de describe detail do done down due during each eg
        eight either eleven else elsewhere empty enough etc even ever
        every everyone everything everywhere except few fifteen fify fifty
        fill find fire first five for former formerly forty found four
        from front full further get give go had has hasnt have he hence
        her here hereafter hereby herein hereupon hers herself him himself
        his how however hundred ie if in inc indeed interest into is it
        its itself keep last latter latterly least less ltd made many may
        me meanwhile might mill mine more moreover most mostly move much
        must my myself name namely neither never nevertheless next nine no
        nobody none noone nor not nothing now nowhere of off often on once
        one only onto or other others otherwise our ours ourselves out
        over own part per perhaps please put rather re same see seem
        seemed seeming seems serious several she should show side since
        sincere six sixty so some somehow someone something sometime
        sometimes somewhere still such system take ten than that the their
        them themselves then thence there thereafter thereby therefore
        therein thereupon these they thick thickv thin third this those
        though three through throughout thru thus to together too top
        toward towards twelve twenty two un under until up upon us very
        via was we well were what whatever when whence whenever where
        whereafter whereas whereby wherein whereupon wherever whether
        which while whither who whoever whole whom whose why will with
        within without would yet you your yours yourself yourselves
      ]).freeze

      # Perform the filter, removing stop words from the array in place.
      # @param tokens [Array<String>] tokens to filter (mutated)
      # @return [Array<String>] the same array with stop words removed
      def filter!(tokens)
        tokens.delete_if do |token|
          STOP_WORDS.include?(token.downcase)
        end
      end

    end
  end
end
@@ -0,0 +1,18 @@
1
module TextRank
  ##
  # Token filters can be used to pre-process potential tokens prior to creating
  # a graph or executing PageRank. Filters are typically used to throw out tokens
  # which are not good candidates for keywords. However, it is possible for a
  # filter to add new tokens or to modify existing ones.
  #
  # Token filters are applied as a chain, so care should be taken to use them
  # in the desired order.
  ##
  module TokenFilter

    # Register each filter for lazy loading; the file is only required the
    # first time the constant is referenced.
    {
      MinLength:    'min_length',
      PartOfSpeech: 'part_of_speech',
      Stopwords:    'stopwords',
    }.each do |const_name, basename|
      autoload const_name, "text_rank/token_filter/#{basename}"
    end

  end
end
@@ -0,0 +1,26 @@
1
module TextRank
  module Tokenizer
    ##
    # Base tokenizer that tokenizes on any regular expression
    #
    # = Example
    #
    #   Regex.new(/:/).tokenize("i should:like to know:which is worse.")
    #   => ["i should", "like to know", "which is worse."]
    ##
    class Regex

      # @param regex [Regexp] to use for string splitting
      def initialize(regex)
        @regex = regex
      end

      # Split text on the regex, dropping any empty tokens produced by
      # leading or adjacent delimiters.
      # @param text [String] string to tokenize
      # @return [Array<String>] non-empty tokens
      def tokenize(text)
        text.split(@regex) - ['']
      end

    end
  end
end
@@ -0,0 +1,19 @@
1
module TextRank
  module Tokenizer
    ##
    # Tokenizer that splits on runs of whitespace of any length
    #
    # = Example
    #
    #   Whitespace.new.tokenize("i should:like to know:which is worse.")
    #   => ["i", "should:like", "to", "know:which", "is", "worse."]
    ##
    class Whitespace < Regex

      # Configure the base tokenizer with a pattern matching one or more
      # consecutive whitespace characters.
      def initialize
        super(/\s+/)
      end

    end
  end
end
@@ -0,0 +1,26 @@
1
module TextRank
  module Tokenizer
    ##
    # A tokenizer that preserves punctuation as their own tokens (which can be
    # used, for example, by the [TokenFilter::PartOfSpeechBase] filter).
    #
    # = Example
    #
    #   WordsAndPunctuation.new.tokenize("i should:like to know:which is worse.")
    #   => ["i", "should", ":", "like", "to", "know", ":", "which", "is", "worse", "."]
    ##
    class WordsAndPunctuation < Regex

      # Captured alternatives become tokens via String#split; the
      # whitespace branch is uncaptured, so it acts purely as a separator.
      PATTERN = /
        ([a-z][a-z0-9-]+)   # a word: a letter followed by letters, digits, or hyphens
        |
        ([\p{Punct}])       # a single punctuation mark, kept as its own token
        |
        \s+                 # whitespace: separator only, never a token
      /xi

      def initialize
        super(PATTERN)
      end

    end
  end
end
@@ -0,0 +1,19 @@
1
module TextRank
  ##
  # Tokenizers are responsible for transforming a single String of text into an
  # array of potential keywords ("tokens"). There are no requirements of tokens
  # other than to be non-empty. When used in combination with token filters, it
  # may make sense for a tokenizer to temporarily create tokens which might seem
  # like ill-suited keywords. The token filter may use these "bad" keywords to
  # help inform its decision on which tokens to keep and which to drop. An example
  # of this is the part of speech token filter which uses punctuation tokens to
  # help guess the part of speech of each non-punctuation token.
  ##
  module Tokenizer

    # Register each tokenizer for lazy loading; the file is only required
    # the first time the constant is referenced.
    {
      Regex:               'regex',
      Whitespace:          'whitespace',
      WordsAndPunctuation: 'words_and_punctuation',
    }.each do |const_name, basename|
      autoload const_name, "text_rank/tokenizer/#{basename}"
    end

  end
end
@@ -0,0 +1,3 @@
1
module TextRank
  # Gem version, following Semantic Versioning. Frozen so the shared
  # constant cannot be mutated in place by callers.
  VERSION = '1.1.0'.freeze
end
data/lib/text_rank.rb ADDED
@@ -0,0 +1,34 @@
1
require 'page_rank'

##
# Provides convenience methods for quickly extracting keywords.
#
# @see README
##
module TextRank

  # Each component is lazily loaded on first constant reference.
  autoload :CharFilter,       'text_rank/char_filter'
  autoload :GraphStrategy,    'text_rank/graph_strategy'
  autoload :KeywordExtractor, 'text_rank/keyword_extractor'
  autoload :RankFilter,       'text_rank/rank_filter'
  autoload :TokenFilter,      'text_rank/token_filter'
  autoload :Tokenizer,        'text_rank/tokenizer'
  autoload :VERSION,          'text_rank/version'

  # A convenience method for quickly extracting keywords from text with default options
  # @param text [String] text from which to extract keywords
  # @option (see KeywordExtractor.basic)
  # @return [Hash<String, Float>] of tokens and text rank (in descending order)
  def self.extract_keywords(text, **options)
    KeywordExtractor.basic(**options).extract(text, **options)
  end

  # A convenience method for quickly extracting keywords from text with default advanced options
  # @param (see extract_keywords)
  # @option (see KeywordExtractor.advanced)
  # @return (see extract_keywords)
  def self.extract_keywords_advanced(text, **options)
    KeywordExtractor.advanced(**options).extract(text, **options)
  end

end
data/text_rank.gemspec ADDED
@@ -0,0 +1,30 @@
1
# coding: utf-8

# Make lib/ requirable so the version constant can be loaded below.
lib = File.expand_path('../lib', __FILE__)
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
require 'text_rank/version'

Gem::Specification.new do |spec|
  spec.name          = 'text_rank'
  spec.version       = TextRank::VERSION
  spec.authors       = ['David McCullars']
  spec.email         = ['david.mccullars@gmail.com']

  spec.summary       = 'Implementation of TextRank solution to ranked keyword extraction'
  spec.description   = 'See https://www.google.com/url?sa=t&rct=j&q=&esrc=s&source=web&cd=1&ved=0ahUKEwjK9tfHxcvMAhVOzGMKHdaQBeEQFggdMAA&url=https%3A%2F%2Fweb.eecs.umich.edu%2F~mihalcea%2Fpapers%2Fmihalcea.emnlp04.pdf&usg=AFQjCNHL5SGlxLy4qmEg1yexaKGZK_Q7IA'
  spec.homepage      = 'https://github.com/david-mccullars/text_rank'
  spec.license       = 'MIT'

  # Ship every tracked file except tests/specs/features.
  spec.files         = `git ls-files -z`.split("\x0").reject do |f|
    f.match(%r{^(test|spec|features)/})
  end
  spec.bindir        = 'exe'
  spec.executables   = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
  spec.require_paths = ['lib']

  spec.add_development_dependency 'bundler', '~> 1.11'
  spec.add_development_dependency 'rake', '~> 10.0'
  spec.add_development_dependency 'rspec', '~> 3.0'
  spec.add_development_dependency 'simplecov', '~> 0.11'
  spec.add_development_dependency 'codeclimate-test-reporter'

  spec.add_development_dependency 'engtagger', '~> 0.2.0' # Optional runtime dependency but needed for specs
  spec.add_development_dependency 'nokogiri', '~> 1.0'    # Optional runtime dependency but needed for specs
end
metadata ADDED
@@ -0,0 +1,183 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: text_rank
3
+ version: !ruby/object:Gem::Version
4
+ version: 1.1.0
5
+ platform: ruby
6
+ authors:
7
+ - David McCullars
8
+ autorequire:
9
+ bindir: exe
10
+ cert_chain: []
11
+ date: 2016-05-10 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: bundler
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '1.11'
20
+ type: :development
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: '1.11'
27
+ - !ruby/object:Gem::Dependency
28
+ name: rake
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - "~>"
32
+ - !ruby/object:Gem::Version
33
+ version: '10.0'
34
+ type: :development
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - "~>"
39
+ - !ruby/object:Gem::Version
40
+ version: '10.0'
41
+ - !ruby/object:Gem::Dependency
42
+ name: rspec
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - "~>"
46
+ - !ruby/object:Gem::Version
47
+ version: '3.0'
48
+ type: :development
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - "~>"
53
+ - !ruby/object:Gem::Version
54
+ version: '3.0'
55
+ - !ruby/object:Gem::Dependency
56
+ name: simplecov
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - "~>"
60
+ - !ruby/object:Gem::Version
61
+ version: '0.11'
62
+ type: :development
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - "~>"
67
+ - !ruby/object:Gem::Version
68
+ version: '0.11'
69
+ - !ruby/object:Gem::Dependency
70
+ name: codeclimate-test-reporter
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - ">="
74
+ - !ruby/object:Gem::Version
75
+ version: '0'
76
+ type: :development
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - ">="
81
+ - !ruby/object:Gem::Version
82
+ version: '0'
83
+ - !ruby/object:Gem::Dependency
84
+ name: engtagger
85
+ requirement: !ruby/object:Gem::Requirement
86
+ requirements:
87
+ - - "~>"
88
+ - !ruby/object:Gem::Version
89
+ version: 0.2.0
90
+ type: :development
91
+ prerelease: false
92
+ version_requirements: !ruby/object:Gem::Requirement
93
+ requirements:
94
+ - - "~>"
95
+ - !ruby/object:Gem::Version
96
+ version: 0.2.0
97
+ - !ruby/object:Gem::Dependency
98
+ name: nokogiri
99
+ requirement: !ruby/object:Gem::Requirement
100
+ requirements:
101
+ - - "~>"
102
+ - !ruby/object:Gem::Version
103
+ version: '1.0'
104
+ type: :development
105
+ prerelease: false
106
+ version_requirements: !ruby/object:Gem::Requirement
107
+ requirements:
108
+ - - "~>"
109
+ - !ruby/object:Gem::Version
110
+ version: '1.0'
111
+ description: See https://www.google.com/url?sa=t&rct=j&q=&esrc=s&source=web&cd=1&ved=0ahUKEwjK9tfHxcvMAhVOzGMKHdaQBeEQFggdMAA&url=https%3A%2F%2Fweb.eecs.umich.edu%2F~mihalcea%2Fpapers%2Fmihalcea.emnlp04.pdf&usg=AFQjCNHL5SGlxLy4qmEg1yexaKGZK_Q7IA
112
+ email:
113
+ - david.mccullars@gmail.com
114
+ executables: []
115
+ extensions: []
116
+ extra_rdoc_files: []
117
+ files:
118
+ - ".codeclimate.yml"
119
+ - ".gitignore"
120
+ - ".rspec"
121
+ - ".rubocop.yml"
122
+ - ".ruby-version"
123
+ - ".travis.yml"
124
+ - CODE_OF_CONDUCT.md
125
+ - Gemfile
126
+ - LICENSE.txt
127
+ - README.md
128
+ - Rakefile
129
+ - bin/console
130
+ - bin/setup
131
+ - lib/page_rank.rb
132
+ - lib/page_rank/base.rb
133
+ - lib/page_rank/dense.rb
134
+ - lib/page_rank/sparse.rb
135
+ - lib/text_rank.rb
136
+ - lib/text_rank/char_filter.rb
137
+ - lib/text_rank/char_filter/ascii_folding.rb
138
+ - lib/text_rank/char_filter/lowercase.rb
139
+ - lib/text_rank/char_filter/strip_email.rb
140
+ - lib/text_rank/char_filter/strip_html.rb
141
+ - lib/text_rank/char_filter/strip_possessive.rb
142
+ - lib/text_rank/char_filter/undo_contractions.rb
143
+ - lib/text_rank/graph_strategy.rb
144
+ - lib/text_rank/graph_strategy/coocurrence.rb
145
+ - lib/text_rank/keyword_extractor.rb
146
+ - lib/text_rank/rank_filter.rb
147
+ - lib/text_rank/rank_filter/collapse_adjacent.rb
148
+ - lib/text_rank/token_filter.rb
149
+ - lib/text_rank/token_filter/min_length.rb
150
+ - lib/text_rank/token_filter/part_of_speech.rb
151
+ - lib/text_rank/token_filter/stopwords.rb
152
+ - lib/text_rank/tokenizer.rb
153
+ - lib/text_rank/tokenizer/regex.rb
154
+ - lib/text_rank/tokenizer/whitespace.rb
155
+ - lib/text_rank/tokenizer/words_and_punctuation.rb
156
+ - lib/text_rank/version.rb
157
+ - text_rank.gemspec
158
+ homepage: https://github.com/david-mccullars/text_rank
159
+ licenses:
160
+ - MIT
161
+ metadata: {}
162
+ post_install_message:
163
+ rdoc_options: []
164
+ require_paths:
165
+ - lib
166
+ required_ruby_version: !ruby/object:Gem::Requirement
167
+ requirements:
168
+ - - ">="
169
+ - !ruby/object:Gem::Version
170
+ version: '0'
171
+ required_rubygems_version: !ruby/object:Gem::Requirement
172
+ requirements:
173
+ - - ">="
174
+ - !ruby/object:Gem::Version
175
+ version: '0'
176
+ requirements: []
177
+ rubyforge_project:
178
+ rubygems_version: 2.5.1
179
+ signing_key:
180
+ specification_version: 4
181
+ summary: Implementation of TextRank solution to ranked keyword extraction
182
+ test_files: []
183
+ has_rdoc: