text_rank 1.1.7 → 1.2.5

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the contents of those package versions as they appear in their respective public registries.
Files changed (38)
  1. checksums.yaml +5 -5
  2. data/.codeclimate.yml +1 -6
  3. data/.rubocop.yml +60 -1075
  4. data/.ruby-version +1 -1
  5. data/.travis.yml +13 -5
  6. data/{LICENSE.txt → LICENSE} +0 -0
  7. data/README.md +2 -1
  8. data/bin/console +3 -3
  9. data/lib/page_rank.rb +2 -0
  10. data/lib/page_rank/base.rb +9 -8
  11. data/lib/page_rank/dense.rb +2 -1
  12. data/lib/page_rank/sparse.rb +6 -7
  13. data/lib/text_rank.rb +12 -9
  14. data/lib/text_rank/char_filter.rb +1 -1
  15. data/lib/text_rank/char_filter/ascii_folding.rb +5 -1
  16. data/lib/text_rank/char_filter/strip_possessive.rb +2 -2
  17. data/lib/text_rank/char_filter/undo_contractions.rb +1 -137
  18. data/lib/text_rank/char_filter/undo_contractions.yml +135 -0
  19. data/lib/text_rank/fingerprint.rb +20 -28
  20. data/lib/text_rank/fingerprint_overlap.rb +55 -0
  21. data/lib/text_rank/graph_strategy/coocurrence.rb +15 -6
  22. data/lib/text_rank/keyword_extractor.rb +32 -25
  23. data/lib/text_rank/rank_filter/collapse_adjacent.rb +53 -25
  24. data/lib/text_rank/rank_filter/normalize_probability.rb +2 -1
  25. data/lib/text_rank/rank_filter/normalize_unit_vector.rb +2 -1
  26. data/lib/text_rank/token_filter/part_of_speech.rb +0 -1
  27. data/lib/text_rank/token_filter/stopwords.rb +1 -321
  28. data/lib/text_rank/token_filter/stopwords.yml +317 -0
  29. data/lib/text_rank/tokenizer.rb +1 -1
  30. data/lib/text_rank/tokenizer/money.rb +11 -6
  31. data/lib/text_rank/tokenizer/number.rb +4 -3
  32. data/lib/text_rank/tokenizer/punctuation.rb +4 -1
  33. data/lib/text_rank/tokenizer/url.rb +3 -0
  34. data/lib/text_rank/tokenizer/whitespace.rb +4 -1
  35. data/lib/text_rank/tokenizer/word.rb +5 -2
  36. data/lib/text_rank/version.rb +3 -1
  37. data/text_rank.gemspec +10 -10
  38. metadata +48 -32
@@ -31,7 +31,7 @@ module TextRank
31
31
  tokens = []
32
32
  text.scan(Regexp.new(regular_expressions.flatten.join('|'))) do |matches|
33
33
  m = matches.compact.first
34
- tokens << m if m && m.size > 0
34
+ tokens << m if m&.size&.positive?
35
35
  end
36
36
  tokens
37
37
  end
@@ -1,4 +1,3 @@
1
- #encoding: UTF-8
2
1
  module TextRank
3
2
  module Tokenizer
4
3
 
@@ -12,7 +11,7 @@ module TextRank
12
11
  "\u20a4", # Lira Symbol
13
12
  "\u20a7", # Peseta Sign
14
13
  "\u20ac", # Euro Symbol
15
- "\u20B9", # Rupee
14
+ "\u20B9", # Rupee
16
15
  "\u20a9", # Won Sign
17
16
  "\u20b4", # Hryvnia Sign
18
17
  "\u20af", # Drachma Sign
@@ -34,6 +33,8 @@ module TextRank
34
33
  # A tokenizer regex that preserves money or formatted numbers as a single token. This
35
34
  # currently supports 24 different currency symbols:
36
35
  #
36
+ # rubocop:disable Style/AsciiComments
37
+ #
37
38
  # * ¤
38
39
  # * $
39
40
  # * ¢
@@ -58,19 +59,23 @@ module TextRank
58
59
  # * ₫
59
60
  # * %
60
61
  # * ‰
62
+
63
+ # rubocop:enable Style/AsciiComments
61
64
  #
62
65
  # It also supports two alternative formats for negatives as well as optional three digit comma
63
66
  # separation and optional decimals.
64
67
  ##
65
- Money = %r{
68
+ # rubocop:disable Naming/ConstantName
69
+ Money = /
66
70
  (
67
- #{CURRENCY_SYMBOLS} \-? #{Number} # $-45,231.21
71
+ #{CURRENCY_SYMBOLS} -? #{Number} # $-45,231.21
68
72
  |
69
- \-? #{CURRENCY_SYMBOLS} #{Number} # -$45,231.21
73
+ -? #{CURRENCY_SYMBOLS} #{Number} # -$45,231.21
70
74
  |
71
75
  \( #{CURRENCY_SYMBOLS} #{Number} \) # ($45,231.21)
72
76
  )
73
- }x
77
+ /x
78
+ # rubocop:enable Naming/ConstantName
74
79
 
75
80
  end
76
81
  end
@@ -1,11 +1,11 @@
1
- #encoding: UTF-8
2
1
  module TextRank
3
2
  module Tokenizer
4
3
 
5
4
  ##
6
5
  # A tokenizer regex that preserves (optionally formatted) numbers as a single token.
7
6
  ##
8
- Number = %r{
7
+ # rubocop:disable Naming/ConstantName
8
+ Number = /
9
9
  (
10
10
  [1-9]\d{3,} # 453231162
11
11
  (?:\.\d+)? # 453231162.17
@@ -25,7 +25,8 @@ module TextRank
25
25
 
26
26
  (?:\.\d+) # .17
27
27
  )
28
- }x
28
+ /x
29
+ # rubocop:enable Naming/ConstantName
29
30
 
30
31
  end
31
32
  end
@@ -1,11 +1,14 @@
1
1
  module TextRank
2
2
  module Tokenizer
3
+
3
4
  ##
4
5
  # A tokenizer regex that preserves single punctuation symbols as a token. Use
5
6
  # this if one or more of your TokenFilter classes need punctuation in order to
6
7
  # make decisions.
7
8
  ##
8
- Punctuation = %r{([\p{Punct}])}
9
+ # rubocop:disable Naming/ConstantName
10
+ Punctuation = /(\p{Punct})/
11
+ # rubocop:enable Naming/ConstantName
9
12
 
10
13
  end
11
14
  end
@@ -1,8 +1,10 @@
1
1
  module TextRank
2
2
  module Tokenizer
3
+
3
4
  ##
4
5
  # A tokenizer regex that preserves entire URL's as a token (rather than split them up)
5
6
  ##
7
+ # rubocop:disable Naming/ConstantName
6
8
  Url = %r{
7
9
  (
8
10
  (?:[\w-]+://?|www[.])
@@ -16,6 +18,7 @@ module TextRank
16
18
  )
17
19
  )
18
20
  }xi
21
+ # rubocop:enable Naming/ConstantName
19
22
 
20
23
  end
21
24
  end
@@ -1,11 +1,14 @@
1
1
  module TextRank
2
2
  module Tokenizer
3
+
3
4
  ##
4
5
  # A tokenizer regex that preserves single whitespace characters as a token. Use
5
6
  # this if one or more of your TokenFilter classes need whitespace in order to
6
7
  # make decisions.
7
8
  ##
8
- Whitespace = %r{\s}
9
+ # rubocop:disable Naming/ConstantName
10
+ Whitespace = /\s/
11
+ # rubocop:enable Naming/ConstantName
9
12
 
10
13
  end
11
14
  end
@@ -1,14 +1,17 @@
1
1
  module TextRank
2
2
  module Tokenizer
3
+
3
4
  ##
4
5
  # A tokenizer regex that preserves a non-space, non-punctuation "word". It does
5
6
  # allow hyphens and numerals, but the first character must be an A-Z character.
6
7
  ##
7
- Word = %r{
8
+ # rubocop:disable Naming/ConstantName
9
+ Word = /
8
10
  (
9
11
  [a-z][a-z0-9-]*
10
12
  )
11
- }xi
13
+ /xi
14
+ # rubocop:enable Naming/ConstantName
12
15
 
13
16
  end
14
17
  end
@@ -1,4 +1,6 @@
1
1
  module TextRank
2
+
2
3
  # Current gem version
3
- VERSION = '1.1.7'
4
+ VERSION = '1.2.5'
5
+
4
6
  end
@@ -1,4 +1,3 @@
1
- # coding: utf-8
2
1
  lib = File.expand_path('../lib', __FILE__)
3
2
  $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
4
3
  require 'text_rank/version'
@@ -9,8 +8,8 @@ Gem::Specification.new do |spec|
9
8
  spec.authors = ['David McCullars']
10
9
  spec.email = ['david.mccullars@gmail.com']
11
10
 
12
- spec.summary = %q{Implementation of TextRank solution to ranked keyword extraction}
13
- spec.description = %q{Implementation of TextRank solution to ranked keyword extraction. See https://web.eecs.umich.edu/~mihalcea/papers/mihalcea.emnlp04.pdf}
11
+ spec.summary = 'Implementation of TextRank solution to ranked keyword extraction'
12
+ spec.description = 'Implementation of TextRank solution to ranked keyword extraction. See https://web.eecs.umich.edu/~mihalcea/papers/mihalcea.emnlp04.pdf'
14
13
  spec.homepage = 'https://github.com/david-mccullars/text_rank'
15
14
  spec.license = 'MIT'
16
15
 
@@ -19,12 +18,13 @@ Gem::Specification.new do |spec|
19
18
  spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
20
19
  spec.require_paths = ['lib']
21
20
 
22
- spec.add_development_dependency 'bundler', '~> 1.11'
23
- spec.add_development_dependency 'rake', '~> 10.0'
24
- spec.add_development_dependency 'rspec', '~> 3.0'
25
- spec.add_development_dependency 'simplecov', '~> 0.11'
26
- spec.add_development_dependency 'codeclimate-test-reporter'
21
+ spec.add_development_dependency 'bundler'
22
+ spec.add_development_dependency 'rake'
23
+ spec.add_development_dependency 'rspec'
24
+ spec.add_development_dependency 'rubocop'
25
+ spec.add_development_dependency 'simplecov', '~> 0.17.0' # 0.18 not supported by code climate
26
+ spec.add_development_dependency 'yard'
27
27
 
28
- spec.add_development_dependency 'engtagger', '~> 0.2.0' # Optional runtime dependency but needed for specs
29
- spec.add_development_dependency 'nokogiri', '~> 1.0' # Optional runtime dependency but needed for specs
28
+ spec.add_development_dependency 'engtagger' # Optional runtime dependency but needed for specs
29
+ spec.add_development_dependency 'nokogiri' # Optional runtime dependency but needed for specs
30
30
  end
metadata CHANGED
@@ -1,73 +1,87 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: text_rank
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.1.7
4
+ version: 1.2.5
5
5
  platform: ruby
6
6
  authors:
7
7
  - David McCullars
8
- autorequire:
8
+ autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2016-07-05 00:00:00.000000000 Z
11
+ date: 2021-01-27 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
15
15
  requirement: !ruby/object:Gem::Requirement
16
16
  requirements:
17
- - - "~>"
17
+ - - ">="
18
18
  - !ruby/object:Gem::Version
19
- version: '1.11'
19
+ version: '0'
20
20
  type: :development
21
21
  prerelease: false
22
22
  version_requirements: !ruby/object:Gem::Requirement
23
23
  requirements:
24
- - - "~>"
24
+ - - ">="
25
25
  - !ruby/object:Gem::Version
26
- version: '1.11'
26
+ version: '0'
27
27
  - !ruby/object:Gem::Dependency
28
28
  name: rake
29
29
  requirement: !ruby/object:Gem::Requirement
30
30
  requirements:
31
- - - "~>"
31
+ - - ">="
32
32
  - !ruby/object:Gem::Version
33
- version: '10.0'
33
+ version: '0'
34
34
  type: :development
35
35
  prerelease: false
36
36
  version_requirements: !ruby/object:Gem::Requirement
37
37
  requirements:
38
- - - "~>"
38
+ - - ">="
39
39
  - !ruby/object:Gem::Version
40
- version: '10.0'
40
+ version: '0'
41
41
  - !ruby/object:Gem::Dependency
42
42
  name: rspec
43
43
  requirement: !ruby/object:Gem::Requirement
44
44
  requirements:
45
- - - "~>"
45
+ - - ">="
46
46
  - !ruby/object:Gem::Version
47
- version: '3.0'
47
+ version: '0'
48
48
  type: :development
49
49
  prerelease: false
50
50
  version_requirements: !ruby/object:Gem::Requirement
51
51
  requirements:
52
- - - "~>"
52
+ - - ">="
53
53
  - !ruby/object:Gem::Version
54
- version: '3.0'
54
+ version: '0'
55
+ - !ruby/object:Gem::Dependency
56
+ name: rubocop
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - ">="
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
62
+ type: :development
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - ">="
67
+ - !ruby/object:Gem::Version
68
+ version: '0'
55
69
  - !ruby/object:Gem::Dependency
56
70
  name: simplecov
57
71
  requirement: !ruby/object:Gem::Requirement
58
72
  requirements:
59
73
  - - "~>"
60
74
  - !ruby/object:Gem::Version
61
- version: '0.11'
75
+ version: 0.17.0
62
76
  type: :development
63
77
  prerelease: false
64
78
  version_requirements: !ruby/object:Gem::Requirement
65
79
  requirements:
66
80
  - - "~>"
67
81
  - !ruby/object:Gem::Version
68
- version: '0.11'
82
+ version: 0.17.0
69
83
  - !ruby/object:Gem::Dependency
70
- name: codeclimate-test-reporter
84
+ name: yard
71
85
  requirement: !ruby/object:Gem::Requirement
72
86
  requirements:
73
87
  - - ">="
@@ -84,30 +98,30 @@ dependencies:
84
98
  name: engtagger
85
99
  requirement: !ruby/object:Gem::Requirement
86
100
  requirements:
87
- - - "~>"
101
+ - - ">="
88
102
  - !ruby/object:Gem::Version
89
- version: 0.2.0
103
+ version: '0'
90
104
  type: :development
91
105
  prerelease: false
92
106
  version_requirements: !ruby/object:Gem::Requirement
93
107
  requirements:
94
- - - "~>"
108
+ - - ">="
95
109
  - !ruby/object:Gem::Version
96
- version: 0.2.0
110
+ version: '0'
97
111
  - !ruby/object:Gem::Dependency
98
112
  name: nokogiri
99
113
  requirement: !ruby/object:Gem::Requirement
100
114
  requirements:
101
- - - "~>"
115
+ - - ">="
102
116
  - !ruby/object:Gem::Version
103
- version: '1.0'
117
+ version: '0'
104
118
  type: :development
105
119
  prerelease: false
106
120
  version_requirements: !ruby/object:Gem::Requirement
107
121
  requirements:
108
- - - "~>"
122
+ - - ">="
109
123
  - !ruby/object:Gem::Version
110
- version: '1.0'
124
+ version: '0'
111
125
  description: Implementation of TextRank solution to ranked keyword extraction. See
112
126
  https://web.eecs.umich.edu/~mihalcea/papers/mihalcea.emnlp04.pdf
113
127
  email:
@@ -124,7 +138,7 @@ files:
124
138
  - ".travis.yml"
125
139
  - CODE_OF_CONDUCT.md
126
140
  - Gemfile
127
- - LICENSE.txt
141
+ - LICENSE
128
142
  - README.md
129
143
  - Rakefile
130
144
  - bin/console
@@ -141,7 +155,9 @@ files:
141
155
  - lib/text_rank/char_filter/strip_html.rb
142
156
  - lib/text_rank/char_filter/strip_possessive.rb
143
157
  - lib/text_rank/char_filter/undo_contractions.rb
158
+ - lib/text_rank/char_filter/undo_contractions.yml
144
159
  - lib/text_rank/fingerprint.rb
160
+ - lib/text_rank/fingerprint_overlap.rb
145
161
  - lib/text_rank/graph_strategy.rb
146
162
  - lib/text_rank/graph_strategy/coocurrence.rb
147
163
  - lib/text_rank/keyword_extractor.rb
@@ -154,6 +170,7 @@ files:
154
170
  - lib/text_rank/token_filter/min_length.rb
155
171
  - lib/text_rank/token_filter/part_of_speech.rb
156
172
  - lib/text_rank/token_filter/stopwords.rb
173
+ - lib/text_rank/token_filter/stopwords.yml
157
174
  - lib/text_rank/tokenizer.rb
158
175
  - lib/text_rank/tokenizer/money.rb
159
176
  - lib/text_rank/tokenizer/number.rb
@@ -167,7 +184,7 @@ homepage: https://github.com/david-mccullars/text_rank
167
184
  licenses:
168
185
  - MIT
169
186
  metadata: {}
170
- post_install_message:
187
+ post_install_message:
171
188
  rdoc_options: []
172
189
  require_paths:
173
190
  - lib
@@ -182,10 +199,9 @@ required_rubygems_version: !ruby/object:Gem::Requirement
182
199
  - !ruby/object:Gem::Version
183
200
  version: '0'
184
201
  requirements: []
185
- rubyforge_project:
186
- rubygems_version: 2.5.1
187
- signing_key:
202
+ rubyforge_project:
203
+ rubygems_version: 2.7.6
204
+ signing_key:
188
205
  specification_version: 4
189
206
  summary: Implementation of TextRank solution to ranked keyword extraction
190
207
  test_files: []
191
- has_rdoc: