text_rank 1.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (42) hide show
  1. checksums.yaml +7 -0
  2. data/.codeclimate.yml +29 -0
  3. data/.gitignore +10 -0
  4. data/.rspec +2 -0
  5. data/.rubocop.yml +1157 -0
  6. data/.ruby-version +1 -0
  7. data/.travis.yml +7 -0
  8. data/CODE_OF_CONDUCT.md +49 -0
  9. data/Gemfile +3 -0
  10. data/LICENSE.txt +21 -0
  11. data/README.md +137 -0
  12. data/Rakefile +12 -0
  13. data/bin/console +14 -0
  14. data/bin/setup +8 -0
  15. data/lib/page_rank/base.rb +89 -0
  16. data/lib/page_rank/dense.rb +89 -0
  17. data/lib/page_rank/sparse.rb +87 -0
  18. data/lib/page_rank.rb +39 -0
  19. data/lib/text_rank/char_filter/ascii_folding.rb +26 -0
  20. data/lib/text_rank/char_filter/lowercase.rb +22 -0
  21. data/lib/text_rank/char_filter/strip_email.rb +24 -0
  22. data/lib/text_rank/char_filter/strip_html.rb +41 -0
  23. data/lib/text_rank/char_filter/strip_possessive.rb +24 -0
  24. data/lib/text_rank/char_filter/undo_contractions.rb +162 -0
  25. data/lib/text_rank/char_filter.rb +24 -0
  26. data/lib/text_rank/graph_strategy/coocurrence.rb +78 -0
  27. data/lib/text_rank/graph_strategy.rb +23 -0
  28. data/lib/text_rank/keyword_extractor.rb +155 -0
  29. data/lib/text_rank/rank_filter/collapse_adjacent.rb +81 -0
  30. data/lib/text_rank/rank_filter.rb +18 -0
  31. data/lib/text_rank/token_filter/min_length.rb +33 -0
  32. data/lib/text_rank/token_filter/part_of_speech.rb +45 -0
  33. data/lib/text_rank/token_filter/stopwords.rb +349 -0
  34. data/lib/text_rank/token_filter.rb +18 -0
  35. data/lib/text_rank/tokenizer/regex.rb +26 -0
  36. data/lib/text_rank/tokenizer/whitespace.rb +19 -0
  37. data/lib/text_rank/tokenizer/words_and_punctuation.rb +26 -0
  38. data/lib/text_rank/tokenizer.rb +19 -0
  39. data/lib/text_rank/version.rb +3 -0
  40. data/lib/text_rank.rb +34 -0
  41. data/text_rank.gemspec +30 -0
  42. metadata +183 -0
@@ -0,0 +1,349 @@
1
+ require 'set'
2
+
3
+ module TextRank
4
+ module TokenFilter
5
+ ##
6
+ # Token filter to remove common stop word tokens
7
+ #
8
+ # = Example
9
+ #
10
+ # Stopwords.new.filter!(%w[
11
+ # but for what purpose was the earth formed to drive us mad
12
+ # ])
13
+ # => ["purpose", "earth", "formed", "drive", "mad"]
14
+ ##
15
+ class Stopwords
16
+
17
+ # Default English stop-word list.
18
+ STOP_WORDS = Set.new(%w[
19
+ a
20
+ about
21
+ above
22
+ across
23
+ after
24
+ afterwards
25
+ again
26
+ against
27
+ all
28
+ almost
29
+ alone
30
+ along
31
+ already
32
+ also
33
+ although
34
+ always
35
+ am
36
+ among
37
+ amongst
38
+ amoungst
39
+ amount
40
+ an
41
+ and
42
+ another
43
+ any
44
+ anyhow
45
+ anyone
46
+ anything
47
+ anyway
48
+ anywhere
49
+ are
50
+ around
51
+ as
52
+ at
53
+ back
54
+ be
55
+ became
56
+ because
57
+ become
58
+ becomes
59
+ becoming
60
+ been
61
+ before
62
+ beforehand
63
+ behind
64
+ being
65
+ below
66
+ beside
67
+ besides
68
+ between
69
+ beyond
70
+ bill
71
+ both
72
+ bottom
73
+ but
74
+ by
75
+ call
76
+ can
77
+ cannot
78
+ cant
79
+ co
80
+ con
81
+ could
82
+ couldnt
83
+ cry
84
+ de
85
+ describe
86
+ detail
87
+ do
88
+ done
89
+ down
90
+ due
91
+ during
92
+ each
93
+ eg
94
+ eight
95
+ either
96
+ eleven
97
+ else
98
+ elsewhere
99
+ empty
100
+ enough
101
+ etc
102
+ even
103
+ ever
104
+ every
105
+ everyone
106
+ everything
107
+ everywhere
108
+ except
109
+ few
110
+ fifteen
111
+ fify
112
+ fill
113
+ find
114
+ fire
115
+ first
116
+ five
117
+ for
118
+ former
119
+ formerly
120
+ forty
121
+ found
122
+ four
123
+ from
124
+ front
125
+ full
126
+ further
127
+ get
128
+ give
129
+ go
130
+ had
131
+ has
132
+ hasnt
133
+ have
134
+ he
135
+ hence
136
+ her
137
+ here
138
+ hereafter
139
+ hereby
140
+ herein
141
+ hereupon
142
+ hers
143
+ herself
144
+ him
145
+ himself
146
+ his
147
+ how
148
+ however
149
+ hundred
150
+ ie
151
+ if
152
+ in
153
+ inc
154
+ indeed
155
+ interest
156
+ into
157
+ is
158
+ it
159
+ its
160
+ itself
161
+ keep
162
+ last
163
+ latter
164
+ latterly
165
+ least
166
+ less
167
+ ltd
168
+ made
169
+ many
170
+ may
171
+ me
172
+ meanwhile
173
+ might
174
+ mill
175
+ mine
176
+ more
177
+ moreover
178
+ most
179
+ mostly
180
+ move
181
+ much
182
+ must
183
+ my
184
+ myself
185
+ name
186
+ namely
187
+ neither
188
+ never
189
+ nevertheless
190
+ next
191
+ nine
192
+ no
193
+ nobody
194
+ none
195
+ noone
196
+ nor
197
+ not
198
+ nothing
199
+ now
200
+ nowhere
201
+ of
202
+ off
203
+ often
204
+ on
205
+ once
206
+ one
207
+ only
208
+ onto
209
+ or
210
+ other
211
+ others
212
+ otherwise
213
+ our
214
+ ours
215
+ ourselves
216
+ out
217
+ over
218
+ own
219
+ part
220
+ per
221
+ perhaps
222
+ please
223
+ put
224
+ rather
225
+ re
226
+ same
227
+ see
228
+ seem
229
+ seemed
230
+ seeming
231
+ seems
232
+ serious
233
+ several
234
+ she
235
+ should
236
+ show
237
+ side
238
+ since
239
+ sincere
240
+ six
241
+ sixty
242
+ so
243
+ some
244
+ somehow
245
+ someone
246
+ something
247
+ sometime
248
+ sometimes
249
+ somewhere
250
+ still
251
+ such
252
+ system
253
+ take
254
+ ten
255
+ than
256
+ that
257
+ the
258
+ their
259
+ them
260
+ themselves
261
+ then
262
+ thence
263
+ there
264
+ thereafter
265
+ thereby
266
+ therefore
267
+ therein
268
+ thereupon
269
+ these
270
+ they
271
+ thickv
272
+ thin
273
+ third
274
+ this
275
+ those
276
+ though
277
+ three
278
+ through
279
+ throughout
280
+ thru
281
+ thus
282
+ to
283
+ together
284
+ too
285
+ top
286
+ toward
287
+ towards
288
+ twelve
289
+ twenty
290
+ two
291
+ un
292
+ under
293
+ until
294
+ up
295
+ upon
296
+ us
297
+ very
298
+ via
299
+ was
300
+ we
301
+ well
302
+ were
303
+ what
304
+ whatever
305
+ when
306
+ whence
307
+ whenever
308
+ where
309
+ whereafter
310
+ whereas
311
+ whereby
312
+ wherein
313
+ whereupon
314
+ wherever
315
+ whether
316
+ which
317
+ while
318
+ whither
319
+ who
320
+ whoever
321
+ whole
322
+ whom
323
+ whose
324
+ why
325
+ will
326
+ with
327
+ within
328
+ without
329
+ would
330
+ yet
331
+ you
332
+ your
333
+ yours
334
+ yourself
335
+ yourselves
336
+ ])
337
+
338
+ # Perform the filter
339
+ # @param tokens [Array<String>]
340
+ # @return [Array<String>]
341
+ def filter!(tokens)
342
+ tokens.delete_if do |token|
343
+ STOP_WORDS.include?(token.downcase)
344
+ end
345
+ end
346
+
347
+ end
348
+ end
349
+ end
@@ -0,0 +1,18 @@
1
+ module TextRank
2
+ ##
3
+ # Token filters can be used to pre-process potential tokens prior to creating
4
+ # a graph or executing PageRank. Filters are typically used to throw out tokens
5
+ # which are not good candidates for keywords. However, it is possible for a
6
+ # filter to add new tokens or to modify existing ones.
7
+ #
8
+ # Token filters are applied as a chain, so care should be taken to use them
9
+ # in the desired order.
10
+ ##
11
+ module TokenFilter
12
+
13
+ autoload :MinLength, 'text_rank/token_filter/min_length'
14
+ autoload :PartOfSpeech, 'text_rank/token_filter/part_of_speech'
15
+ autoload :Stopwords, 'text_rank/token_filter/stopwords'
16
+
17
+ end
18
+ end
@@ -0,0 +1,26 @@
1
+ module TextRank
2
+ module Tokenizer
3
+ ##
4
+ # Base tokenizer that tokenizes on any regular expression
5
+ #
6
+ # = Example
7
+ #
8
+ # Regex.new(/:/).tokenize("i should:like to know:which is worse.")
9
+ # => ["i should", "like to know", "which is worse."]
10
+ ##
11
+ class Regex
12
+
13
+ # @param regex [Regexp] to use for string splitting
14
+ def initialize(regex)
15
+ @regex = regex
16
+ end
17
+
18
+ # @param text [String] string to tokenize
19
+ # @return [Array<String>] non-empty tokens
20
+ def tokenize(text)
21
+ text.split(@regex) - ['']
22
+ end
23
+
24
+ end
25
+ end
26
+ end
@@ -0,0 +1,19 @@
1
+ module TextRank
2
+ module Tokenizer
3
+ ##
4
+ # Tokenizer to split on any whitespace
5
+ #
6
+ # = Example
7
+ #
8
+ # Whitespace.new.tokenize("i should:like to know:which is worse.")
9
+ # => ["i", "should:like", "to", "know:which", "is", "worse."]
10
+ ##
11
+ class Whitespace < Regex
12
+
13
+ def initialize
14
+ super(/\s+/)
15
+ end
16
+
17
+ end
18
+ end
19
+ end
@@ -0,0 +1,26 @@
1
+ module TextRank
2
+ module Tokenizer
3
+ ##
4
+ # A tokenizer that preserves punctuation marks as their own tokens (which can be
5
+ # used, for example, by the [TokenFilter::PartOfSpeech] filter).
6
+ #
7
+ # = Example
8
+ #
9
+ # WordsAndPunctuation.new.tokenize("i should:like to know:which is worse.")
10
+ # => ["i", "should", ":", "like", "to", "know", ":", "which", "is", "worse", "."]
11
+ ##
12
+ class WordsAndPunctuation < Regex
13
+
14
+ def initialize
15
+ super(/
16
+ ([a-z][a-z0-9-]+)
17
+ |
18
+ ([\p{Punct}])
19
+ |
20
+ \s+
21
+ /xi)
22
+ end
23
+
24
+ end
25
+ end
26
+ end
@@ -0,0 +1,19 @@
1
+ module TextRank
2
+ ##
3
+ # Tokenizers are responsible for transforming a single String of text into an
4
+ # array of potential keywords ("tokens"). There are no requirements of tokens
5
+ # other than to be non-empty. When used in combination with token filters, it
6
+ # may make sense for a tokenizer to temporarily create tokens which might seem
7
+ # like ill-suited keywords. The token filter may use these "bad" keywords to
8
+ # help inform its decision on which tokens to keep and which to drop. An example
9
+ # of this is the part of speech token filter which uses punctuation tokens to
10
+ # help guess the part of speech of each non-punctuation token.
11
+ ##
12
+ module Tokenizer
13
+
14
+ autoload :Regex, 'text_rank/tokenizer/regex'
15
+ autoload :Whitespace, 'text_rank/tokenizer/whitespace'
16
+ autoload :WordsAndPunctuation, 'text_rank/tokenizer/words_and_punctuation'
17
+
18
+ end
19
+ end
@@ -0,0 +1,3 @@
1
+ module TextRank
2
+ VERSION = '1.1.0'
3
+ end
data/lib/text_rank.rb ADDED
@@ -0,0 +1,34 @@
1
+ require 'page_rank'
2
+
3
+ ##
4
+ # Provides convenience methods for quickly extracting keywords.
5
+ #
6
+ # @see README
7
+ ##
8
+ module TextRank
9
+
10
+ autoload :CharFilter, 'text_rank/char_filter'
11
+ autoload :GraphStrategy, 'text_rank/graph_strategy'
12
+ autoload :KeywordExtractor, 'text_rank/keyword_extractor'
13
+ autoload :RankFilter, 'text_rank/rank_filter'
14
+ autoload :TokenFilter, 'text_rank/token_filter'
15
+ autoload :Tokenizer, 'text_rank/tokenizer'
16
+ autoload :VERSION, 'text_rank/version'
17
+
18
+ # A convenience method for quickly extracting keywords from text with default options
19
+ # @param text [String] text from which to extract keywords
20
+ # @option (see KeywordExtractor.basic)
21
+ # @return [Hash<String, Float>] of tokens and text rank (in descending order)
22
+ def self.extract_keywords(text, **options)
23
+ TextRank::KeywordExtractor.basic(**options).extract(text, **options)
24
+ end
25
+
26
+ # A convenience method for quickly extracting keywords from text with default advanced options
27
+ # @param (see extract_keywords)
28
+ # @option (see KeywordExtractor.advanced)
29
+ # @return (see extract_keywords)
30
+ def self.extract_keywords_advanced(text, **options)
31
+ TextRank::KeywordExtractor.advanced(**options).extract(text, **options)
32
+ end
33
+
34
+ end
data/text_rank.gemspec ADDED
@@ -0,0 +1,30 @@
1
+ # coding: utf-8
2
+ lib = File.expand_path('../lib', __FILE__)
3
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
4
+ require 'text_rank/version'
5
+
6
+ Gem::Specification.new do |spec|
7
+ spec.name = 'text_rank'
8
+ spec.version = TextRank::VERSION
9
+ spec.authors = ['David McCullars']
10
+ spec.email = ['david.mccullars@gmail.com']
11
+
12
+ spec.summary = %q{Implementation of TextRank solution to ranked keyword extraction}
13
+ spec.description = %q{See https://www.google.com/url?sa=t&rct=j&q=&esrc=s&source=web&cd=1&ved=0ahUKEwjK9tfHxcvMAhVOzGMKHdaQBeEQFggdMAA&url=https%3A%2F%2Fweb.eecs.umich.edu%2F~mihalcea%2Fpapers%2Fmihalcea.emnlp04.pdf&usg=AFQjCNHL5SGlxLy4qmEg1yexaKGZK_Q7IA}
14
+ spec.homepage = 'https://github.com/david-mccullars/text_rank'
15
+ spec.license = 'MIT'
16
+
17
+ spec.files = `git ls-files -z`.split("\x0").reject { |f| f.match(%r{^(test|spec|features)/}) }
18
+ spec.bindir = 'exe'
19
+ spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
20
+ spec.require_paths = ['lib']
21
+
22
+ spec.add_development_dependency 'bundler', '~> 1.11'
23
+ spec.add_development_dependency 'rake', '~> 10.0'
24
+ spec.add_development_dependency 'rspec', '~> 3.0'
25
+ spec.add_development_dependency 'simplecov', '~> 0.11'
26
+ spec.add_development_dependency 'codeclimate-test-reporter'
27
+
28
+ spec.add_development_dependency 'engtagger', '~> 0.2.0' # Optional runtime dependency but needed for specs
29
+ spec.add_development_dependency 'nokogiri', '~> 1.0' # Optional runtime dependency but needed for specs
30
+ end
metadata ADDED
@@ -0,0 +1,183 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: text_rank
3
+ version: !ruby/object:Gem::Version
4
+ version: 1.1.0
5
+ platform: ruby
6
+ authors:
7
+ - David McCullars
8
+ autorequire:
9
+ bindir: exe
10
+ cert_chain: []
11
+ date: 2016-05-10 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: bundler
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '1.11'
20
+ type: :development
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: '1.11'
27
+ - !ruby/object:Gem::Dependency
28
+ name: rake
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - "~>"
32
+ - !ruby/object:Gem::Version
33
+ version: '10.0'
34
+ type: :development
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - "~>"
39
+ - !ruby/object:Gem::Version
40
+ version: '10.0'
41
+ - !ruby/object:Gem::Dependency
42
+ name: rspec
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - "~>"
46
+ - !ruby/object:Gem::Version
47
+ version: '3.0'
48
+ type: :development
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - "~>"
53
+ - !ruby/object:Gem::Version
54
+ version: '3.0'
55
+ - !ruby/object:Gem::Dependency
56
+ name: simplecov
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - "~>"
60
+ - !ruby/object:Gem::Version
61
+ version: '0.11'
62
+ type: :development
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - "~>"
67
+ - !ruby/object:Gem::Version
68
+ version: '0.11'
69
+ - !ruby/object:Gem::Dependency
70
+ name: codeclimate-test-reporter
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - ">="
74
+ - !ruby/object:Gem::Version
75
+ version: '0'
76
+ type: :development
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - ">="
81
+ - !ruby/object:Gem::Version
82
+ version: '0'
83
+ - !ruby/object:Gem::Dependency
84
+ name: engtagger
85
+ requirement: !ruby/object:Gem::Requirement
86
+ requirements:
87
+ - - "~>"
88
+ - !ruby/object:Gem::Version
89
+ version: 0.2.0
90
+ type: :development
91
+ prerelease: false
92
+ version_requirements: !ruby/object:Gem::Requirement
93
+ requirements:
94
+ - - "~>"
95
+ - !ruby/object:Gem::Version
96
+ version: 0.2.0
97
+ - !ruby/object:Gem::Dependency
98
+ name: nokogiri
99
+ requirement: !ruby/object:Gem::Requirement
100
+ requirements:
101
+ - - "~>"
102
+ - !ruby/object:Gem::Version
103
+ version: '1.0'
104
+ type: :development
105
+ prerelease: false
106
+ version_requirements: !ruby/object:Gem::Requirement
107
+ requirements:
108
+ - - "~>"
109
+ - !ruby/object:Gem::Version
110
+ version: '1.0'
111
+ description: See https://www.google.com/url?sa=t&rct=j&q=&esrc=s&source=web&cd=1&ved=0ahUKEwjK9tfHxcvMAhVOzGMKHdaQBeEQFggdMAA&url=https%3A%2F%2Fweb.eecs.umich.edu%2F~mihalcea%2Fpapers%2Fmihalcea.emnlp04.pdf&usg=AFQjCNHL5SGlxLy4qmEg1yexaKGZK_Q7IA
112
+ email:
113
+ - david.mccullars@gmail.com
114
+ executables: []
115
+ extensions: []
116
+ extra_rdoc_files: []
117
+ files:
118
+ - ".codeclimate.yml"
119
+ - ".gitignore"
120
+ - ".rspec"
121
+ - ".rubocop.yml"
122
+ - ".ruby-version"
123
+ - ".travis.yml"
124
+ - CODE_OF_CONDUCT.md
125
+ - Gemfile
126
+ - LICENSE.txt
127
+ - README.md
128
+ - Rakefile
129
+ - bin/console
130
+ - bin/setup
131
+ - lib/page_rank.rb
132
+ - lib/page_rank/base.rb
133
+ - lib/page_rank/dense.rb
134
+ - lib/page_rank/sparse.rb
135
+ - lib/text_rank.rb
136
+ - lib/text_rank/char_filter.rb
137
+ - lib/text_rank/char_filter/ascii_folding.rb
138
+ - lib/text_rank/char_filter/lowercase.rb
139
+ - lib/text_rank/char_filter/strip_email.rb
140
+ - lib/text_rank/char_filter/strip_html.rb
141
+ - lib/text_rank/char_filter/strip_possessive.rb
142
+ - lib/text_rank/char_filter/undo_contractions.rb
143
+ - lib/text_rank/graph_strategy.rb
144
+ - lib/text_rank/graph_strategy/coocurrence.rb
145
+ - lib/text_rank/keyword_extractor.rb
146
+ - lib/text_rank/rank_filter.rb
147
+ - lib/text_rank/rank_filter/collapse_adjacent.rb
148
+ - lib/text_rank/token_filter.rb
149
+ - lib/text_rank/token_filter/min_length.rb
150
+ - lib/text_rank/token_filter/part_of_speech.rb
151
+ - lib/text_rank/token_filter/stopwords.rb
152
+ - lib/text_rank/tokenizer.rb
153
+ - lib/text_rank/tokenizer/regex.rb
154
+ - lib/text_rank/tokenizer/whitespace.rb
155
+ - lib/text_rank/tokenizer/words_and_punctuation.rb
156
+ - lib/text_rank/version.rb
157
+ - text_rank.gemspec
158
+ homepage: https://github.com/david-mccullars/text_rank
159
+ licenses:
160
+ - MIT
161
+ metadata: {}
162
+ post_install_message:
163
+ rdoc_options: []
164
+ require_paths:
165
+ - lib
166
+ required_ruby_version: !ruby/object:Gem::Requirement
167
+ requirements:
168
+ - - ">="
169
+ - !ruby/object:Gem::Version
170
+ version: '0'
171
+ required_rubygems_version: !ruby/object:Gem::Requirement
172
+ requirements:
173
+ - - ">="
174
+ - !ruby/object:Gem::Version
175
+ version: '0'
176
+ requirements: []
177
+ rubyforge_project:
178
+ rubygems_version: 2.5.1
179
+ signing_key:
180
+ specification_version: 4
181
+ summary: Implementation of TextRank solution to ranked keyword extraction
182
+ test_files: []
183
+ has_rdoc: