text_rank 1.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.codeclimate.yml +29 -0
- data/.gitignore +10 -0
- data/.rspec +2 -0
- data/.rubocop.yml +1157 -0
- data/.ruby-version +1 -0
- data/.travis.yml +7 -0
- data/CODE_OF_CONDUCT.md +49 -0
- data/Gemfile +3 -0
- data/LICENSE.txt +21 -0
- data/README.md +137 -0
- data/Rakefile +12 -0
- data/bin/console +14 -0
- data/bin/setup +8 -0
- data/lib/page_rank/base.rb +89 -0
- data/lib/page_rank/dense.rb +89 -0
- data/lib/page_rank/sparse.rb +87 -0
- data/lib/page_rank.rb +39 -0
- data/lib/text_rank/char_filter/ascii_folding.rb +26 -0
- data/lib/text_rank/char_filter/lowercase.rb +22 -0
- data/lib/text_rank/char_filter/strip_email.rb +24 -0
- data/lib/text_rank/char_filter/strip_html.rb +41 -0
- data/lib/text_rank/char_filter/strip_possessive.rb +24 -0
- data/lib/text_rank/char_filter/undo_contractions.rb +162 -0
- data/lib/text_rank/char_filter.rb +24 -0
- data/lib/text_rank/graph_strategy/coocurrence.rb +78 -0
- data/lib/text_rank/graph_strategy.rb +23 -0
- data/lib/text_rank/keyword_extractor.rb +155 -0
- data/lib/text_rank/rank_filter/collapse_adjacent.rb +81 -0
- data/lib/text_rank/rank_filter.rb +18 -0
- data/lib/text_rank/token_filter/min_length.rb +33 -0
- data/lib/text_rank/token_filter/part_of_speech.rb +45 -0
- data/lib/text_rank/token_filter/stopwords.rb +349 -0
- data/lib/text_rank/token_filter.rb +18 -0
- data/lib/text_rank/tokenizer/regex.rb +26 -0
- data/lib/text_rank/tokenizer/whitespace.rb +19 -0
- data/lib/text_rank/tokenizer/words_and_punctuation.rb +26 -0
- data/lib/text_rank/tokenizer.rb +19 -0
- data/lib/text_rank/version.rb +3 -0
- data/lib/text_rank.rb +34 -0
- data/text_rank.gemspec +30 -0
- metadata +183 -0
data/.ruby-version
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
ruby-2.1.2
|
data/.travis.yml
ADDED
data/CODE_OF_CONDUCT.md
ADDED
@@ -0,0 +1,49 @@
|
|
1
|
+
# Contributor Code of Conduct
|
2
|
+
|
3
|
+
As contributors and maintainers of this project, and in the interest of
|
4
|
+
fostering an open and welcoming community, we pledge to respect all people who
|
5
|
+
contribute through reporting issues, posting feature requests, updating
|
6
|
+
documentation, submitting pull requests or patches, and other activities.
|
7
|
+
|
8
|
+
We are committed to making participation in this project a harassment-free
|
9
|
+
experience for everyone, regardless of level of experience, gender, gender
|
10
|
+
identity and expression, sexual orientation, disability, personal appearance,
|
11
|
+
body size, race, ethnicity, age, religion, or nationality.
|
12
|
+
|
13
|
+
Examples of unacceptable behavior by participants include:
|
14
|
+
|
15
|
+
* The use of sexualized language or imagery
|
16
|
+
* Personal attacks
|
17
|
+
* Trolling or insulting/derogatory comments
|
18
|
+
* Public or private harassment
|
19
|
+
* Publishing other's private information, such as physical or electronic
|
20
|
+
addresses, without explicit permission
|
21
|
+
* Other unethical or unprofessional conduct
|
22
|
+
|
23
|
+
Project maintainers have the right and responsibility to remove, edit, or
|
24
|
+
reject comments, commits, code, wiki edits, issues, and other contributions
|
25
|
+
that are not aligned to this Code of Conduct, or to ban temporarily or
|
26
|
+
permanently any contributor for other behaviors that they deem inappropriate,
|
27
|
+
threatening, offensive, or harmful.
|
28
|
+
|
29
|
+
By adopting this Code of Conduct, project maintainers commit themselves to
|
30
|
+
fairly and consistently applying these principles to every aspect of managing
|
31
|
+
this project. Project maintainers who do not follow or enforce the Code of
|
32
|
+
Conduct may be permanently removed from the project team.
|
33
|
+
|
34
|
+
This code of conduct applies both within project spaces and in public spaces
|
35
|
+
when an individual is representing the project or its community.
|
36
|
+
|
37
|
+
Instances of abusive, harassing, or otherwise unacceptable behavior may be
|
38
|
+
reported by contacting a project maintainer at david@bloomfire.com. All
|
39
|
+
complaints will be reviewed and investigated and will result in a response that
|
40
|
+
is deemed necessary and appropriate to the circumstances. Maintainers are
|
41
|
+
obligated to maintain confidentiality with regard to the reporter of an
|
42
|
+
incident.
|
43
|
+
|
44
|
+
This Code of Conduct is adapted from the [Contributor Covenant][homepage],
|
45
|
+
version 1.3.0, available at
|
46
|
+
[http://contributor-covenant.org/version/1/3/0/][version]
|
47
|
+
|
48
|
+
[homepage]: http://contributor-covenant.org
|
49
|
+
[version]: http://contributor-covenant.org/version/1/3/0/
|
data/Gemfile
ADDED
data/LICENSE.txt
ADDED
@@ -0,0 +1,21 @@
|
|
1
|
+
The MIT License (MIT)
|
2
|
+
|
3
|
+
Copyright (c) 2016 David McCullars
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
7
|
+
in the Software without restriction, including without limitation the rights
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
10
|
+
furnished to do so, subject to the following conditions:
|
11
|
+
|
12
|
+
The above copyright notice and this permission notice shall be included in
|
13
|
+
all copies or substantial portions of the Software.
|
14
|
+
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
21
|
+
THE SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,137 @@
|
|
1
|
+
# TextRank
|
2
|
+
|
3
|
+
* README: https://github.com/david-mccullars/text_rank
|
4
|
+
* Documentation: http://www.rubydoc.info/github/david-mccullars/text_rank
|
5
|
+
* Bug Reports: https://github.com/david-mccullars/text_rank/issues
|
6
|
+
|
7
|
+
|
8
|
+
## Status
|
9
|
+
|
10
|
+
[Build Status](https://travis-ci.org/david-mccullars/text_rank)
|
11
|
+
[Code Climate](https://codeclimate.com/github/david-mccullars/text_rank)
|
12
|
+
[Test Coverage](https://codeclimate.com/github/david-mccullars/text_rank/coverage)
|
13
|
+
|
14
|
+
|
15
|
+
## Description
|
16
|
+
|
17
|
+
[TextRank](https://web.eecs.umich.edu/~mihalcea/papers/mihalcea.emnlp04.pdf) is
|
18
|
+
an unsupervised keyword extraction algorithm based on
|
19
|
+
[PageRank](http://ilpubs.stanford.edu:8090/422/1/1999-66.pdf). Other
|
20
|
+
strategies for keyword extraction generally rely on either statistics (like
|
21
|
+
inverse document frequency and term frequency) which ignore context, or they
|
22
|
+
rely on machine learning, requiring a corpus of training data which likely will
|
23
|
+
not be suitable for all applications. TextRank is found to produce superior
|
24
|
+
results in many situations with minimal computational cost.
|
25
|
+
|
26
|
+
|
27
|
+
## Features
|
28
|
+
|
29
|
+
* Multiple PageRank implementations to choose one best suited for the performance
|
30
|
+
needs of your application
|
31
|
+
* Framework for adding additional PageRank implementations (e.g. a native
|
32
|
+
implementation)
|
33
|
+
* Extensible architecture to customize how text is filtered
|
34
|
+
* Extensible architecture to customize how text is tokenized
|
35
|
+
* Extensible architecture to customize how tokens are filtered
|
36
|
+
* Extensible architecture to customize how keyword ranks are filtered/processed
|
37
|
+
|
38
|
+
|
39
|
+
## Installation
|
40
|
+
|
41
|
+
```
|
42
|
+
gem install text_rank
|
43
|
+
```
|
44
|
+
|
45
|
+
## Requirements
|
46
|
+
|
47
|
+
* Ruby 2.1.2 or higher
|
48
|
+
* [engtagger](https://github.com/yohasebe/engtagger) gem is optional but
|
49
|
+
required for `TextRank::TokenFilter::PartOfSpeech`
|
50
|
+
* [nokogiri](https://github.com/sparklemotion/nokogiri) gem is optional but
|
51
|
+
required for `TextRank::CharFilter::StripHtml`
|
52
|
+
|
53
|
+
## Usage
|
54
|
+
|
55
|
+
**TextRank**
|
56
|
+
|
57
|
+
```ruby
|
58
|
+
require 'text_rank'
|
59
|
+
|
60
|
+
text = <<-END
|
61
|
+
In a castle of Westphalia, belonging to the Baron of Thunder-ten-Tronckh, lived
|
62
|
+
a youth, whom nature had endowed with the most gentle manners. His countenance
|
63
|
+
was a true picture of his soul. He combined a true judgment with simplicity of
|
64
|
+
spirit, which was the reason, I apprehend, of his being called Candide. The old
|
65
|
+
servants of the family suspected him to have been the son of the Baron's
|
66
|
+
sister, by a good, honest gentleman of the neighborhood, whom that young lady
|
67
|
+
would never marry because he had been able to prove only seventy-one
|
68
|
+
quarterings, the rest of his genealogical tree having been lost through the
|
69
|
+
injuries of time.
|
70
|
+
END
|
71
|
+
|
72
|
+
# Default, basic keyword extraction. Try this first:
|
73
|
+
keywords = TextRank.extract_keywords(text)
|
74
|
+
|
75
|
+
# Keyword extraction with all of the bells and whistles:
|
76
|
+
keywords = TextRank.extract_keywords_advanced(text)
|
77
|
+
|
78
|
+
# Fully customized extraction:
|
79
|
+
extractor = TextRank::KeywordExtractor.new(
|
80
|
+
strategy: :sparse, # Specify PageRank strategy (dense or sparse)
|
81
|
+
damping: 0.85, # The probability of following the graph vs. randomly choosing a new node
|
82
|
+
tolerance: 0.0001, # The desired accuracy of the results
|
83
|
+
char_filters: [...], # A list of filters to be applied prior to tokenization
|
84
|
+
tokenizer: ..., # A class or tokenizer instance to perform tokenization
|
85
|
+
token_filters: [...], # A list of filters to be applied to each token after tokenization
|
86
|
+
graph_strategy: ..., # A class or strategy instance for producing a graph from tokens
|
87
|
+
rank_filters: [...], # A list of filters to be applied to the keyword ranks after keyword extraction
|
88
|
+
)
|
89
|
+
|
90
|
+
# Add another filter to the end of the char_filter chain
|
91
|
+
extractor.add_char_filter(:AsciiFolding)
|
92
|
+
|
93
|
+
# Add a part of speech filter to the token_filter chain BEFORE the Stopwords filter
|
94
|
+
pos_filter = TextRank::TokenFilter::PartOfSpeech.new(parts_to_keep: %w[nn])
|
95
|
+
extractor.add_token_filter(pos_filter, before: :Stopwords)
|
96
|
+
|
97
|
+
# Perform the extraction with at most 100 iterations
|
98
|
+
extractor.extract(text, max_iterations: 100)
|
99
|
+
```
|
100
|
+
|
101
|
+
**PageRank**
|
102
|
+
|
103
|
+
It is also possible to use this gem for PageRank only.
|
104
|
+
|
105
|
+
```ruby
|
106
|
+
require 'page_rank'
|
107
|
+
|
108
|
+
PageRank.calculate(strategy: :sparse, damping: 0.8, tolerance: 0.00001) do
|
109
|
+
add('node_a', 'node_b', weight: 3.2)
|
110
|
+
add('node_b', 'node_d', weight: 2.1)
|
111
|
+
add('node_b', 'node_e', weight: 4.7)
|
112
|
+
add('node_e', 'node_a', weight: 1.3)
|
113
|
+
end
|
114
|
+
```
|
115
|
+
|
116
|
+
There are currently two pure Ruby implementations of PageRank:
|
117
|
+
|
118
|
+
1. **sparse**: A sparsely-stored strategy which performs multiplication proportional
|
119
|
+
to the number of edges in the graph. For graphs with a very low edge-to-node
|
120
|
+
ratio, this will perform better in a pure Ruby setting. It is recommended to
|
121
|
+
use this strategy until such a time as there are native implementations.
|
122
|
+
2. **dense**: A densely-stored matrix strategy which performs up to `max_iterations`
|
123
|
+
matrix multiplications or until the tolerance is reached. This is more of a
|
124
|
+
canonical implementation and is fine for small or dense graphs, but it is not
|
125
|
+
advised for large, sparse graphs as Ruby is not fast when it comes to matrix
|
126
|
+
multiplication. Each iteration is O(N^2) where N is the number of graph nodes.
|
127
|
+
|
128
|
+
## License
|
129
|
+
|
130
|
+
MIT. See the `LICENSE.txt` file.
|
131
|
+
|
132
|
+
|
133
|
+
## References
|
134
|
+
|
135
|
+
> R. Mihalcea and P. Tarau, “TextRank: Bringing Order into Texts,” in Proceedings of EMNLP 2004. Association for Computational Linguistics, 2004, pp. 404–411.
|
136
|
+
|
137
|
+
> Brin, S.; Page, L. (1998). "The anatomy of a large-scale hypertextual Web search engine". Computer Networks and ISDN Systems 30: 107–117.
|
data/Rakefile
ADDED
@@ -0,0 +1,12 @@
|
|
1
|
+
# Task libraries: Bundler's gem tasks, the RSpec rake task, and the RDoc task.
require 'bundler/gem_tasks'
require 'rspec/core/rake_task'
require 'rdoc/task'

# Define the :spec task and make it the default.
RSpec::Core::RakeTask.new(:spec)
task default: :spec

# Documentation task covering the README and everything under lib/,
# with the README as the main page.
RDoc::Task.new do |doc|
  doc.main = 'README.md'
  doc.rdoc_files.include('README.md', 'lib/**/*.rb')
end
|
data/bin/console
ADDED
@@ -0,0 +1,14 @@
|
|
1
|
+
#!/usr/bin/env ruby

# Interactive console with the gem preloaded.
require 'bundler/setup'
require 'text_rank'

# Fixtures and/or initialization code can be added here to make experimenting
# with the gem easier. A different console may be used as well, e.g. Pry
# (if you use it, don't forget to add pry to your Gemfile!):
#
#   require 'pry'
#   Pry.start

require 'irb'
IRB.start
|
data/bin/setup
ADDED
@@ -0,0 +1,89 @@
|
|
1
|
+
module PageRank
  ##
  # A base class for PageRank implementations. This class provides the basic
  # framework for adding (optionally weighted) edges to the graph and then
  # performing iterations of PageRank to within the desired tolerance (or maximum
  # allowed number of iterations).
  #
  # Subclasses must implement #add, #node_count, #initial_ranks, #calculate_step
  # and #sort_ranks; the versions defined here raise NotImplementedError.
  ##
  class Base

    # @param (see #damping=)
    # @param (see #tolerance=)
    def initialize(damping: nil, tolerance: nil, **_)
      self.damping = damping
      self.tolerance = tolerance
    end

    # Set the damping probability
    # @param damping [Float] The probability of following the graph vs. randomly choosing a new node
    #   (defaults to 0.85 when nil; must satisfy 0 < damping <= 1)
    # @raise [ArgumentError] if the value is outside (0, 1]
    # @return [Float]
    def damping=(damping)
      @damping = damping || 0.85
      raise ArgumentError, 'Invalid damping factor' if @damping <= 0 || @damping > 1
      @damping
    end

    # Set the tolerance value
    # @param tolerance [Float] The desired accuracy of the results
    #   (defaults to 0.0001 when nil; must satisfy 0 <= tolerance <= 1)
    # @raise [ArgumentError] if the value is outside [0, 1]
    # @return [Float]
    def tolerance=(tolerance)
      @tolerance = tolerance || 0.0001
      raise ArgumentError, 'Invalid tolerance factor' if @tolerance < 0 || @tolerance > 1
      @tolerance
    end

    # Adds a directed (and optionally weighted) edge to the graph
    # @param source [Object] The source node
    # @param dest [Object] The destination node
    # @return [nil]
    def add(_source, _dest, **_options)
      raise NotImplementedError
    end

    # Perform the PageRank calculation
    #
    # Iteration stops as soon as the distance between two successive rankings
    # is at or below the tolerance, or when max_iterations steps have run.
    # The comparison is inclusive so that a tolerance of 0 (which the setter
    # accepts) still terminates once the ranks stop changing altogether.
    #
    # @param max_iterations [Integer] Maximum number of PageRank iterations to perform (or -1 for no max)
    # @return [Hash<Object, Float>] of nodes with rank
    def calculate(max_iterations: -1, **_)
      ranks = initial_ranks
      loop do
        break if max_iterations == 0
        ranks, prev_ranks = calculate_step(ranks), ranks
        break if distance(ranks, prev_ranks) <= @tolerance
        # A negative starting value never reaches 0, i.e. "no maximum".
        max_iterations -= 1
      end
      sort_ranks(ranks)
    end

    protected

    # Should return the number of nodes in the graph
    def node_count
      raise NotImplementedError
    end

    # Should produce the initial ranks from which to start the first PageRank iteration
    def initial_ranks
      raise NotImplementedError
    end

    # Should apply any sort of sorting logic to the result rankings after PageRank has finished
    def sort_ranks(_ranks)
      raise NotImplementedError
    end

    # Performs a single step of the PageRank iteration
    def calculate_step(_ranks)
      raise NotImplementedError
    end

    # Calculate the Euclidean distance from one ranking to the next iteration.
    # Both arguments are read by integer index 0...node_count.
    def distance(v1, v2)
      sum_squares = node_count.times.reduce(0.0) do |sum, i|
        d = v1[i] - v2[i]
        sum + d * d
      end
      Math.sqrt(sum_squares)
    end

  end
end
|
@@ -0,0 +1,89 @@
|
|
1
|
+
require 'matrix'
|
2
|
+
|
3
|
+
module PageRank
  ##
  # Implementation of PageRank using matrix multiplication
  #
  # Ruby is not known for its speed, especially for math computations. As such this
  # implementation is not well suited for large graphs, and it is especially not
  # well suited for graphs that have a small edge-to-vertex ratio. The primary
  # purpose of this implementation is to provide a checkpoint against other
  # implementations to verify their validity.
  #
  # If speed is desired, it would be best to implement a NativeDense class (and
  # optionally NativeSparse) which would perform the algorithm in C.
  ##
  class Dense < Base

    # Sets up an empty graph; all options are forwarded to Base#initialize.
    # @param (see Base#initialize)
    def initialize(**options)
      super(**options)

      @key_to_idx = {}
      @idx_to_key = {}
      @out_links = []
    end

    # Accumulates weight on the edge source -> dest. Self-loops are ignored.
    # @param (see Base#add)
    # @param weight [Float] Optional weight for the graph edge
    # @return (see Base#add)
    def add(source, dest, weight: 1.0)
      return if source == dest
      # Source is indexed before dest so node numbering follows insertion order.
      source_idx = index(source)
      dest_idx = index(dest)
      row = (@out_links[source_idx] ||= [])
      row[dest_idx] = (row[dest_idx] || 0.0) + weight
      nil
    end

    protected

    # Number of distinct nodes seen so far.
    def node_count
      @key_to_idx.length
    end

    # Builds the transition matrix and returns a uniform starting vector.
    def initial_ranks
      @matrix = to_matrix
      uniform = 1 / node_count.to_f
      Vector.elements(Array.new(node_count, uniform), false)
    end

    # One iteration: multiply the transition matrix by the current rank vector.
    def calculate_step(ranks)
      @matrix * ranks
    end

    # Maps vector positions back to node keys, highest rank first.
    def sort_ranks(ranks)
      ordered = ranks.each_with_index.sort_by { |rank, _idx| -rank }
      ordered.each_with_object({}) do |(rank, idx), by_key|
        by_key[@idx_to_key[idx]] = rank
      end
    end

    private

    # Returns the integer index for a node key, assigning the next free
    # index the first time the key is seen.
    def index(key)
      return @key_to_idx[key] if @key_to_idx[key]
      idx = node_count
      @idx_to_key[idx] = key
      @key_to_idx[key] = idx
    end

    # Builds the node_count x node_count matrix whose (dest, source) entry is the
    # damped, weight-normalized probability of moving from source to dest.
    # Sources with no outgoing edges spread their rank evenly over all nodes.
    def to_matrix
      total_out_weights = @out_links.map do |links|
        links && links.compact.reduce(:+)
      end
      Matrix.build(node_count, node_count) do |dest_idx, source_idx|
        total = total_out_weights[source_idx]
        next 1.0 / node_count.to_f unless total
        w = @out_links[source_idx][dest_idx] || 0.0
        @damping * w / total + (1 - @damping) / node_count.to_f
      end
    end

  end
end
|
@@ -0,0 +1,87 @@
|
|
1
|
+
require 'set'
|
2
|
+
|
3
|
+
module PageRank
  ##
  # Implementation of PageRank using a sparse matrix representation of the graph
  #
  # Ruby is not known for its speed, especially for math computations. However,
  # if the number of edges is relatively small in relation to the number of nodes,
  # this pure Ruby implementation should perform well enough for many applications.
  # It uses a sparse matrix representation and thus avoids an order of magnitude
  # of calculations that are not necessary.
  #
  # If speed is desired, it would be best to implement a NativeSparse class (and
  # optionally NativeDense) which would perform the algorithm in C.
  ##
  class Sparse < Base

    # Sets up an empty graph; all options are forwarded to Base#initialize.
    # @param (see Base#initialize)
    def initialize(**options)
      super(**options)

      @graph = {}                      # dest => Set of sources linking to it
      @weight_totals = Hash.new(0.0)   # source => sum of its outgoing weights
      @weights = {}                    # source => { dest => accumulated weight }
      @nodes = Set.new
    end

    # Accumulates weight on the edge source -> dest. Self-loops are ignored.
    # @param (see Base#add)
    # @param weight [Float] Optional weight for the graph edge
    # @return (see Base#add)
    def add(source, dest, weight: 1.0)
      # Return nil (not false) so the result matches the documented contract.
      return if source == dest
      @graph[dest] ||= Set.new
      @graph[dest] << source
      @weights[source] ||= Hash.new(0.0)
      @weights[source][dest] += weight
      @weight_totals[source] += weight
      @nodes << source
      @nodes << dest
      nil
    end

    protected

    # Number of distinct nodes seen so far.
    def node_count
      @nodes.size
    end

    # Precomputes the dangling nodes (no outgoing edges) and the per-source
    # normalized edge weights, then returns a uniform starting rank per node.
    def initial_ranks
      @dangling_nodes = @nodes - @weight_totals.keys
      @normalized_weights = @weights.each_with_object({}) do |(source, values), h|
        total = @weight_totals[source]
        h[source] = values.each_with_object({}) do |(dest, w), h2|
          h2[dest] = w / total
        end
      end
      uniform = 1.0 / node_count.to_f
      Hash[@nodes.map { |k| [k, uniform] }]
    end

    # One iteration: each node receives the weighted rank of its inbound
    # neighbours plus an equal share of the rank held by dangling nodes.
    def calculate_step(ranks)
      n = node_count.to_f
      # The dangling contribution is the same for every destination, so it is
      # summed once per step instead of once per node.
      dangling_share = @dangling_nodes.reduce(0.0) do |acc, source|
        acc + ranks[source] / n
      end
      ranks.keys.each_with_object({}) do |dest, new_ranks|
        inbound = Array(@graph[dest]).reduce(0.0) do |acc, source|
          acc + ranks[source] * @normalized_weights[source][dest]
        end
        new_ranks[dest] = @damping * (inbound + dangling_share) + (1 - @damping) / n
      end
    end

    # Normalizes ranks so they sum to 1 and orders them highest first.
    def sort_ranks(ranks)
      sum = 0.0
      ranks.each { |_, v| sum += v }
      Hash[ranks.map { |k, v| [k, v / sum] }.sort_by { |_, v| -v }]
    end

    # Base#distance indexes by integer position, so compare the value lists.
    # calculate_step builds the new hash by iterating the old hash's keys,
    # so both lists are in the same node order.
    def distance(v1, v2)
      super(v1.values.to_a, v2.values.to_a)
    end

  end
end
|
data/lib/page_rank.rb
ADDED
@@ -0,0 +1,39 @@
|
|
1
|
+
##
|
2
|
+
# A module for supporting Ruby implementations of PageRank. Rather than rely on
|
3
|
+
# one single implementation, this module allows for multiple implementations that
|
4
|
+
# may be beneficial in different scenarios.
|
5
|
+
#
|
6
|
+
# = Example
|
7
|
+
#
|
8
|
+
# PageRank.calculate(strategy: :dense, damping: 0.8, max_iterations: 100) do
|
9
|
+
# add('nodeA', 'nodeC', weight: 4.3)
|
10
|
+
# add('nodeA', 'nodeE', weight: 2.1)
|
11
|
+
# add('nodeB', 'nodeC', weight: 3.6)
|
12
|
+
# add('nodeE', 'nodeD', weight: 1.9)
|
13
|
+
# add('nodeA', 'nodeC', weight: 5.3)
|
14
|
+
# end
|
15
|
+
##
|
16
|
+
module PageRank

  autoload :Base,   'page_rank/base'
  autoload :Dense,  'page_rank/dense'
  autoload :Sparse, 'page_rank/sparse'

  # Instantiates the implementation class named by +strategy+.
  #
  # The strategy name is camelized, so +:sparse+ resolves to +Sparse+ and a
  # multi-word name such as +:native_sparse+ resolves to +NativeSparse+.
  #
  # @option options [Symbol] :strategy PageRank strategy to use (either :sparse or :dense)
  # @option options [Float] :damping The probability of following the graph vs. randomly choosing a new node
  # @option options [Float] :tolerance The desired accuracy of the results
  # @raise [NameError] if no constant matches the strategy name
  # @return [PageRank::Base]
  def self.new(strategy: :sparse, **options)
    class_name = strategy.to_s.split('_').map(&:capitalize).join
    const_get(class_name).new(**options)
  end

  # Convenience method to quickly calculate PageRank. In the calling block, graph edges can be added.
  # The block is evaluated in the context of the new instance, so +add+ can be
  # called directly. The block is optional; without it the graph stays empty.
  # The full option set (including :strategy) is passed on to #calculate as well,
  # which is how :max_iterations reaches it.
  # @option (see new)
  # @return (see Base#calculate)
  def self.calculate(**options, &block)
    pr = new(**options)
    pr.instance_exec(&block) if block
    pr.calculate(**options)
  end

end
|
@@ -0,0 +1,26 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
module TextRank
  module CharFilter
    ##
    # Character filter to transform non-ASCII (unicode) characters into ASCII-friendly versions.
    #
    # = Example
    #
    #  AsciiFolding.new.filter!("the Perigordian Abbé then made answer, because a poor beggar of the country of Atrébatie heard some foolish things said")
    #  => "the Perigordian Abbe then made answer, because a poor beggar of the country of Atrebatie heard some foolish things said"
    ##
    class AsciiFolding

      # The two strings are positional: the character at index i of
      # NON_ASCII_CHARS is replaced by the character at index i of
      # EQUIVALENT_ASCII_CHARS, so they must stay the same length.
      NON_ASCII_CHARS        = 'ÀÁÂÃÄÅàáâãäåĀāĂăĄąÇçĆćĈĉĊċČčÐðĎďĐđÈÉÊËèéêëĒēĔĕĖėĘęĚěĜĝĞğĠġĢģĤĥĦħÌÍÎÏìíîïĨĩĪīĬĭĮįİıĴĵĶķĸĹĺĻļĽľĿŀŁłÑñŃńŅņŇňŉŊŋÒÓÔÕÖØòóôõöøŌōŎŏŐőŔŕŖŗŘřŚśŜŝŞşŠšſŢţŤťŦŧÙÚÛÜùúûüŨũŪūŬŭŮůŰűŲųŴŵÝýÿŶŷŸŹźŻżŽž'
      EQUIVALENT_ASCII_CHARS = 'AAAAAAaaaaaaAaAaAaCcCcCcCcCcDdDdDdEEEEeeeeEeEeEeEeEeGgGgGgGgHhHhIIIIiiiiIiIiIiIiIiJjKkkLlLlLlLlLlNnNnNnNnnNnOOOOOOooooooOoOoOoRrRrRrSsSsSsSssTtTtTtUUUUuuuuUuUuUuUuUuUuWwYyyYyYZzZzZz'

      # Perform the filter (modifies +text+ in place)
      # @param text [String]
      # @return [String] the same string object, folded to ASCII
      def filter!(text)
        # String#tr! returns nil when nothing was replaced, so return the
        # receiver explicitly to honour the documented String return.
        text.tr!(NON_ASCII_CHARS, EQUIVALENT_ASCII_CHARS)
        text
      end

    end
  end
end
|
@@ -0,0 +1,22 @@
|
|
1
|
+
module TextRank
  module CharFilter
    ##
    # Character filter to force text to lowercase
    #
    # = Example
    #
    #  Lowercase.new.filter!("What a pessimist you are! - Candide")
    #  => "what a pessimist you are! - candide"
    ##
    class Lowercase

      # Perform the filter (modifies +text+ in place)
      # @param text [String]
      # @return [String] the same string object, lowercased
      def filter!(text)
        # String#downcase! returns nil when the text is already lowercase,
        # so return the receiver explicitly.
        text.downcase!
        text
      end

    end
  end
end
|
@@ -0,0 +1,24 @@
|
|
1
|
+
module TextRank
  module CharFilter
    ##
    # Character filter to remove email addresses from text.
    #
    # = Example
    #
    #  StripEmail.new.filter!("That is a hard question said candide@gmail.com")
    #  => "That is a hard question said "
    ##
    class StripEmail

      # Case-insensitive match for local-part@domain.tld on word boundaries.
      EMAIL_REGEX = /\b[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,}\b/i

      # Perform the filter (modifies +text+ in place)
      # @param text [String]
      # @return [String] the same string object with email addresses removed
      def filter!(text)
        # String#gsub! returns nil when no address was found, so return the
        # receiver explicitly.
        text.gsub!(EMAIL_REGEX, '')
        text
      end

    end
  end
end
|
@@ -0,0 +1,41 @@
|
|
1
|
+
require 'nokogiri'
|
2
|
+
require 'stringio'
|
3
|
+
|
4
|
+
module TextRank
  module CharFilter
    ##
    # Character filter to remove HTML tags and convert HTML entities to text.
    #
    # = Example
    #
    #  StripHtml.new.filter!("&quot;Optimism&quot;, said Cacambo, &quot;What is that?&quot;")
    #  => "\"Optimism\", said Cacambo, \"What is that?\""
    #
    #  StripHtml.new.filter!("<b>Alas! It is the <u>obstinacy</u> of maintaining that everything is best when it is worst.</b>")
    #  => "Alas! It is the obstinacy of maintaining that everything is best when it is worst."
    ##
    class StripHtml < Nokogiri::XML::SAX::Document

      def initialize
        @text = StringIO.new
      end

      # Perform the filter. Unlike the other character filters, the input is
      # not modified; the stripped text is returned as a new string.
      # @param text [String]
      # @return [String]
      def filter!(text)
        # Start from a fresh buffer on every call. Rewinding a reused StringIO
        # does not truncate it, so a shorter input would leave the tail of the
        # previous result in place, and the string returned earlier would be
        # overwritten.
        @text = StringIO.new
        Nokogiri::HTML::SAX::Parser.new(self).parse(text)
        @text.string
      end

      protected

      # SAX callback for each run of character data: appends it to the buffer,
      # preceded by a space so text from adjacent elements does not run together.
      def characters(string)
        @text << ' '
        @text << string
      end

    end
  end
end
|