text_rank 1.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.codeclimate.yml +29 -0
- data/.gitignore +10 -0
- data/.rspec +2 -0
- data/.rubocop.yml +1157 -0
- data/.ruby-version +1 -0
- data/.travis.yml +7 -0
- data/CODE_OF_CONDUCT.md +49 -0
- data/Gemfile +3 -0
- data/LICENSE.txt +21 -0
- data/README.md +137 -0
- data/Rakefile +12 -0
- data/bin/console +14 -0
- data/bin/setup +8 -0
- data/lib/page_rank/base.rb +89 -0
- data/lib/page_rank/dense.rb +89 -0
- data/lib/page_rank/sparse.rb +87 -0
- data/lib/page_rank.rb +39 -0
- data/lib/text_rank/char_filter/ascii_folding.rb +26 -0
- data/lib/text_rank/char_filter/lowercase.rb +22 -0
- data/lib/text_rank/char_filter/strip_email.rb +24 -0
- data/lib/text_rank/char_filter/strip_html.rb +41 -0
- data/lib/text_rank/char_filter/strip_possessive.rb +24 -0
- data/lib/text_rank/char_filter/undo_contractions.rb +162 -0
- data/lib/text_rank/char_filter.rb +24 -0
- data/lib/text_rank/graph_strategy/coocurrence.rb +78 -0
- data/lib/text_rank/graph_strategy.rb +23 -0
- data/lib/text_rank/keyword_extractor.rb +155 -0
- data/lib/text_rank/rank_filter/collapse_adjacent.rb +81 -0
- data/lib/text_rank/rank_filter.rb +18 -0
- data/lib/text_rank/token_filter/min_length.rb +33 -0
- data/lib/text_rank/token_filter/part_of_speech.rb +45 -0
- data/lib/text_rank/token_filter/stopwords.rb +349 -0
- data/lib/text_rank/token_filter.rb +18 -0
- data/lib/text_rank/tokenizer/regex.rb +26 -0
- data/lib/text_rank/tokenizer/whitespace.rb +19 -0
- data/lib/text_rank/tokenizer/words_and_punctuation.rb +26 -0
- data/lib/text_rank/tokenizer.rb +19 -0
- data/lib/text_rank/version.rb +3 -0
- data/lib/text_rank.rb +34 -0
- data/text_rank.gemspec +30 -0
- metadata +183 -0
data/.ruby-version
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
ruby-2.1.2
|
data/.travis.yml
ADDED
data/CODE_OF_CONDUCT.md
ADDED
@@ -0,0 +1,49 @@
|
|
1
|
+
# Contributor Code of Conduct
|
2
|
+
|
3
|
+
As contributors and maintainers of this project, and in the interest of
|
4
|
+
fostering an open and welcoming community, we pledge to respect all people who
|
5
|
+
contribute through reporting issues, posting feature requests, updating
|
6
|
+
documentation, submitting pull requests or patches, and other activities.
|
7
|
+
|
8
|
+
We are committed to making participation in this project a harassment-free
|
9
|
+
experience for everyone, regardless of level of experience, gender, gender
|
10
|
+
identity and expression, sexual orientation, disability, personal appearance,
|
11
|
+
body size, race, ethnicity, age, religion, or nationality.
|
12
|
+
|
13
|
+
Examples of unacceptable behavior by participants include:
|
14
|
+
|
15
|
+
* The use of sexualized language or imagery
|
16
|
+
* Personal attacks
|
17
|
+
* Trolling or insulting/derogatory comments
|
18
|
+
* Public or private harassment
|
19
|
+
* Publishing other's private information, such as physical or electronic
|
20
|
+
addresses, without explicit permission
|
21
|
+
* Other unethical or unprofessional conduct
|
22
|
+
|
23
|
+
Project maintainers have the right and responsibility to remove, edit, or
|
24
|
+
reject comments, commits, code, wiki edits, issues, and other contributions
|
25
|
+
that are not aligned to this Code of Conduct, or to ban temporarily or
|
26
|
+
permanently any contributor for other behaviors that they deem inappropriate,
|
27
|
+
threatening, offensive, or harmful.
|
28
|
+
|
29
|
+
By adopting this Code of Conduct, project maintainers commit themselves to
|
30
|
+
fairly and consistently applying these principles to every aspect of managing
|
31
|
+
this project. Project maintainers who do not follow or enforce the Code of
|
32
|
+
Conduct may be permanently removed from the project team.
|
33
|
+
|
34
|
+
This code of conduct applies both within project spaces and in public spaces
|
35
|
+
when an individual is representing the project or its community.
|
36
|
+
|
37
|
+
Instances of abusive, harassing, or otherwise unacceptable behavior may be
|
38
|
+
reported by contacting a project maintainer at david@bloomfire.com. All
|
39
|
+
complaints will be reviewed and investigated and will result in a response that
|
40
|
+
is deemed necessary and appropriate to the circumstances. Maintainers are
|
41
|
+
obligated to maintain confidentiality with regard to the reporter of an
|
42
|
+
incident.
|
43
|
+
|
44
|
+
This Code of Conduct is adapted from the [Contributor Covenant][homepage],
|
45
|
+
version 1.3.0, available at
|
46
|
+
[http://contributor-covenant.org/version/1/3/0/][version]
|
47
|
+
|
48
|
+
[homepage]: http://contributor-covenant.org
|
49
|
+
[version]: http://contributor-covenant.org/version/1/3/0/
|
data/Gemfile
ADDED
data/LICENSE.txt
ADDED
@@ -0,0 +1,21 @@
|
|
1
|
+
The MIT License (MIT)
|
2
|
+
|
3
|
+
Copyright (c) 2016 David McCullars
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
7
|
+
in the Software without restriction, including without limitation the rights
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
10
|
+
furnished to do so, subject to the following conditions:
|
11
|
+
|
12
|
+
The above copyright notice and this permission notice shall be included in
|
13
|
+
all copies or substantial portions of the Software.
|
14
|
+
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
21
|
+
THE SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,137 @@
|
|
1
|
+
# TextRank
|
2
|
+
|
3
|
+
* README: https://github.com/david-mccullars/text_rank
|
4
|
+
* Documentation: http://www.rubydoc.info/github/david-mccullars/text_rank
|
5
|
+
* Bug Reports: https://github.com/david-mccullars/text_rank/issues
|
6
|
+
|
7
|
+
|
8
|
+
## Status
|
9
|
+
|
10
|
+
[![Travis Build Status](https://travis-ci.org/david-mccullars/text_rank.svg?branch=master)](https://travis-ci.org/david-mccullars/text_rank)
|
11
|
+
[![Code Climate](https://codeclimate.com/github/david-mccullars/text_rank/badges/gpa.svg)](https://codeclimate.com/github/david-mccullars/text_rank)
|
12
|
+
[![Test Coverage](https://codeclimate.com/github/david-mccullars/text_rank/badges/coverage.svg)](https://codeclimate.com/github/david-mccullars/text_rank/coverage)
|
13
|
+
|
14
|
+
|
15
|
+
## Description
|
16
|
+
|
17
|
+
[TextRank](https://web.eecs.umich.edu/~mihalcea/papers/mihalcea.emnlp04.pdf) is
|
18
|
+
an unsupervised keyword extraction algorithm based on
|
19
|
+
[PageRank](http://ilpubs.stanford.edu:8090/422/1/1999-66.pdf). Other
|
20
|
+
strategies for keyword extraction generally rely on either statistics (like
|
21
|
+
inverse document frequency and term frequency) which ignore context, or they
|
22
|
+
rely on machine learning, requiring a corpus of training data which likely will
|
23
|
+
not be suitable for all applications. TextRank is found to produce superior
|
24
|
+
results in many situations with minimal computational cost.
|
25
|
+
|
26
|
+
|
27
|
+
## Features
|
28
|
+
|
29
|
+
* Multiple PageRank implementations to choose one best suited for the performance
|
30
|
+
needs of your application
|
31
|
+
* Framework for adding additional PageRank implementations (e.g. a native
|
32
|
+
implemenation)
|
33
|
+
* Extensible architecture to customize how text is filtered
|
34
|
+
* Extensible architecture to customize how text is tokenized
|
35
|
+
* Extensible architecture to customize how tokens are filtered
|
36
|
+
* Extensible architecture to customize how keywords ranks are filtered/processed
|
37
|
+
|
38
|
+
|
39
|
+
## Installation
|
40
|
+
|
41
|
+
```
|
42
|
+
gem install text_rank
|
43
|
+
```
|
44
|
+
|
45
|
+
## Requirements
|
46
|
+
|
47
|
+
* Ruby 2.1.2 or higher
|
48
|
+
* [engtagger](https://github.com/yohasebe/engtagger) gem is optional but
|
49
|
+
required for `TextRank::TokenFilter::PartOfSpeech`
|
50
|
+
* [nokogiri](https://github.com/sparklemotion/nokogiri) gem is optional but
|
51
|
+
required for `TextRank::CharFilter::StripHtml`
|
52
|
+
|
53
|
+
## Usage
|
54
|
+
|
55
|
+
**TextRank**
|
56
|
+
|
57
|
+
```ruby
|
58
|
+
require 'text_rank'
|
59
|
+
|
60
|
+
text = <<-END
|
61
|
+
In a castle of Westphalia, belonging to the Baron of Thunder-ten-Tronckh, lived
|
62
|
+
a youth, whom nature had endowed with the most gentle manners. His countenance
|
63
|
+
was a true picture of his soul. He combined a true judgment with simplicity of
|
64
|
+
spirit, which was the reason, I apprehend, of his being called Candide. The old
|
65
|
+
servants of the family suspected him to have been the son of the Baron's
|
66
|
+
sister, by a good, honest gentleman of the neighborhood, whom that young lady
|
67
|
+
would never marry because he had been able to prove only seventy-one
|
68
|
+
quarterings, the rest of his genealogical tree having been lost through the
|
69
|
+
injuries of time.
|
70
|
+
END
|
71
|
+
|
72
|
+
# Default, basic keyword extraction. Try this first:
|
73
|
+
keywords = TextRank.extract_keywords(text)
|
74
|
+
|
75
|
+
# Keyword extraction with all of the bells and whistles:
|
76
|
+
keywords = TextRank.extract_keywords_advanced(text)
|
77
|
+
|
78
|
+
# Fully customized extraction:
|
79
|
+
extractor = TextRank::KeywordExtractor.new(
|
80
|
+
strategy: :sparse, # Specify PageRank strategy (dense or sparse)
|
81
|
+
damping: 0.85, # The probability of following the graph vs. randomly choosing a new node
|
82
|
+
tolerance: 0.0001, # The desired accuracy of the results
|
83
|
+
char_filters: [...], # A list of filters to be applied prior to tokenization
|
84
|
+
tokenizer: ..., # A class or tokenizer instance to perform tokenization
|
85
|
+
token_filters: [...], # A list of filters to be applied to each token after tokenization
|
86
|
+
graph_strategy: ..., # A class or strategy instance for producing a graph from tokens
|
87
|
+
rank_filters: [...], # A list of filters to be applied to the keyword ranks after keyword extraction
|
88
|
+
)
|
89
|
+
|
90
|
+
# Add another filter to the end of the char_filter chain
|
91
|
+
extractor.add_char_filter(:AsciiFolding)
|
92
|
+
|
93
|
+
# Add a part of speech filter to the token_filter chain BEFORE the Stopwords filter
|
94
|
+
pos_filter = TextRank::TokenFilter::PartOfSpeech.new(parts_to_keep: %w[nn])
|
95
|
+
extractor.add_token_filter(pos_filter, before: :Stopwords)
|
96
|
+
|
97
|
+
# Perform the extraction with at most 100 iterations
|
98
|
+
extractor.extract(text, max_iterations: 100)
|
99
|
+
```
|
100
|
+
|
101
|
+
**PageRank**
|
102
|
+
|
103
|
+
It is also possible to use this gem for PageRank only.
|
104
|
+
|
105
|
+
```ruby
|
106
|
+
require 'page_rank'
|
107
|
+
|
108
|
+
PageRank.calculate(strategy: :sparse, damping: 0.8, tolerance: 0.00001) do
|
109
|
+
add('node_a', 'node_b', weight: 3.2)
|
110
|
+
add('node_b', 'node_d', weight: 2.1)
|
111
|
+
add('node_b', 'node_e', weight: 4.7)
|
112
|
+
add('node_e', 'node_a', weight: 1.3)
|
113
|
+
end
|
114
|
+
```
|
115
|
+
|
116
|
+
There are currently two pure Ruby implementations of PageRank:
|
117
|
+
|
118
|
+
1. **sparse**: A sparsely-stored strategy which performs multiplication proportional
|
119
|
+
to the number of edges in the graph. For graphs with a very low node-to-edge
|
120
|
+
ratio, this will perform better in a pure Ruby setting. It is recommended to
|
121
|
+
use this strategy until such a time as there are native implementations.
|
122
|
+
2. **dense**: A densely-stored matrix strategy which performs up to `max_iterations`
|
123
|
+
matrix multiplications or until the tolerance is reached. This is more of a
|
124
|
+
canonical implementation and is fine for small or dense graphs, but it is not
|
125
|
+
advised for large, sparse graphs as Ruby is not fast when it comes to matrix
|
126
|
+
multiplication. Each iteration is O(N^3) where N is the number of graph nodes.
|
127
|
+
|
128
|
+
## License
|
129
|
+
|
130
|
+
MIT. See the `LICENSE.txt` file.
|
131
|
+
|
132
|
+
|
133
|
+
## References
|
134
|
+
|
135
|
+
> R. Mihalcea and P. Tarau, “TextRank: Bringing Order into Texts,” in Proceedings of EMNLP 2004. Association for Computational Linguistics, 2004, pp. 404–411.
|
136
|
+
|
137
|
+
> Brin, S.; Page, L. (1998). "The anatomy of a large-scale hypertextual Web search engine". Computer Networks and ISDN Systems 30: 107–117.
|
data/Rakefile
ADDED
@@ -0,0 +1,12 @@
|
|
1
|
+
require "bundler/gem_tasks"
|
2
|
+
require "rspec/core/rake_task"
|
3
|
+
|
4
|
+
RSpec::Core::RakeTask.new(:spec)
|
5
|
+
|
6
|
+
task :default => :spec
|
7
|
+
|
8
|
+
require 'rdoc/task'
|
9
|
+
RDoc::Task.new do |rdoc|
|
10
|
+
rdoc.main = "README.md"
|
11
|
+
rdoc.rdoc_files.include("README.md", "lib/**/*.rb")
|
12
|
+
end
|
data/bin/console
ADDED
@@ -0,0 +1,14 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
require "bundler/setup"
|
4
|
+
require "text_rank"
|
5
|
+
|
6
|
+
# You can add fixtures and/or initialization code here to make experimenting
|
7
|
+
# with your gem easier. You can also use a different console, if you like.
|
8
|
+
|
9
|
+
# (If you use this, don't forget to add pry to your Gemfile!)
|
10
|
+
# require "pry"
|
11
|
+
# Pry.start
|
12
|
+
|
13
|
+
require "irb"
|
14
|
+
IRB.start
|
data/bin/setup
ADDED
@@ -0,0 +1,89 @@
|
|
1
|
+
module PageRank
|
2
|
+
##
|
3
|
+
# A base class for PageRank implementations. This class provides the basic
|
4
|
+
# framework for adding (optionall weighted) nodes to the graph and then
|
5
|
+
# performing iterations of PageRank to within the desired tolerance (or maximum
|
6
|
+
# allowed number of iterations).
|
7
|
+
##
|
8
|
+
class Base
|
9
|
+
|
10
|
+
# @param (see #damping=)
|
11
|
+
# @param (see #tolerance=)
|
12
|
+
def initialize(damping: nil, tolerance: nil, **_)
|
13
|
+
self.damping = damping
|
14
|
+
self.tolerance = tolerance
|
15
|
+
end
|
16
|
+
|
17
|
+
# Set the damping probability
|
18
|
+
# @param damping [Float] The probability of following the graph vs. randomly choosing a new node
|
19
|
+
# @return [Float]
|
20
|
+
def damping=(damping)
|
21
|
+
@damping = damping || 0.85
|
22
|
+
raise ArgumentError.new('Invalid damping factor') if @damping <= 0 || @damping > 1
|
23
|
+
@damping
|
24
|
+
end
|
25
|
+
|
26
|
+
# Set the tolerance value
|
27
|
+
# @param tolerance [Float] The desired accuracy of the results
|
28
|
+
# @return [Float]
|
29
|
+
def tolerance=(tolerance)
|
30
|
+
@tolerance = tolerance || 0.0001
|
31
|
+
raise ArgumentError.new('Invalid tolerance factor') if @tolerance < 0 || @tolerance > 1
|
32
|
+
@tolerance
|
33
|
+
end
|
34
|
+
|
35
|
+
# Adds a directed (and optionally weighted) edge to the graph
|
36
|
+
# @param source [Object] The source node
|
37
|
+
# @param dest [Object] The destination node
|
38
|
+
# @return [nil]
|
39
|
+
def add(_source, _dest, **_options)
|
40
|
+
raise NotImplementedError
|
41
|
+
end
|
42
|
+
|
43
|
+
# Perform the PageRank calculation
|
44
|
+
# @param max_iterations [Fixnum] Maximum number of PageRank iterations to perform (or -1 for no max)
|
45
|
+
# @return [Hash<Object, Float>] of nodes with rank
|
46
|
+
def calculate(max_iterations: -1, **_)
|
47
|
+
ranks = initial_ranks
|
48
|
+
loop do
|
49
|
+
break if max_iterations == 0
|
50
|
+
ranks, prev_ranks = calculate_step(ranks), ranks
|
51
|
+
break if distance(ranks, prev_ranks) < @tolerance
|
52
|
+
max_iterations -= 1
|
53
|
+
end
|
54
|
+
sort_ranks(ranks)
|
55
|
+
end
|
56
|
+
|
57
|
+
protected
|
58
|
+
|
59
|
+
# Should return the number of nodes in the graph
|
60
|
+
def node_count
|
61
|
+
raise NotImplementedError
|
62
|
+
end
|
63
|
+
|
64
|
+
# Should produce the initial ranks from which to start the first PageRank iteration
|
65
|
+
def initial_ranks
|
66
|
+
raise NotImplementedError
|
67
|
+
end
|
68
|
+
|
69
|
+
# Should apply any sort of sorting logic to the result rankings after PageRank has finished
|
70
|
+
def sort_ranks(_ranks)
|
71
|
+
raise NotImplementedError
|
72
|
+
end
|
73
|
+
|
74
|
+
# Performs a single step of the PageRank iteration
|
75
|
+
def calculate_step(_ranks)
|
76
|
+
raise NotImplementedError
|
77
|
+
end
|
78
|
+
|
79
|
+
# Calculate the Euclidean distance from one ranking to the next iteration
|
80
|
+
def distance(v1, v2)
|
81
|
+
sum_squares = node_count.times.reduce(0.0) do |sum, i|
|
82
|
+
d = v1[i] - v2[i]
|
83
|
+
sum + d * d
|
84
|
+
end
|
85
|
+
Math.sqrt(sum_squares)
|
86
|
+
end
|
87
|
+
|
88
|
+
end
|
89
|
+
end
|
@@ -0,0 +1,89 @@
|
|
1
|
+
require 'matrix'
|
2
|
+
|
3
|
+
module PageRank
|
4
|
+
##
|
5
|
+
# Implementation of PageRank using matrix multiplication
|
6
|
+
#
|
7
|
+
# Ruby is not known for its speed, especial for math computations. As such this
|
8
|
+
# implementation is not well suited for large graphs, and it is especially not
|
9
|
+
# well suited for graphs that have a small edge-to-vertex ratio. The primary
|
10
|
+
# purpose of this implementation is to provide a checkpoint against other
|
11
|
+
# implementations to verify their validity.
|
12
|
+
#
|
13
|
+
# If speed is desired, it would be best to implement a NativeDense class (and
|
14
|
+
# optionally NativeSparse) which would perform the algorithm in C.
|
15
|
+
##
|
16
|
+
class Dense < Base
|
17
|
+
|
18
|
+
# Initialize with default damping and tolerance.
|
19
|
+
# A maximum number of iterations can also be supplied
|
20
|
+
# (default is no maximum, i.e. iterate until tolerance).
|
21
|
+
# @param (see Base#initialize)
|
22
|
+
def initialize(**options)
|
23
|
+
super(**options)
|
24
|
+
|
25
|
+
@out_links = []
|
26
|
+
@key_to_idx = {}
|
27
|
+
@idx_to_key = {}
|
28
|
+
end
|
29
|
+
|
30
|
+
# @param (see Base#add)
|
31
|
+
# @param weight [Float] Optional weight for the graph edge
|
32
|
+
# @return (see Base#add)
|
33
|
+
def add(source, dest, weight: 1.0)
|
34
|
+
return if source == dest
|
35
|
+
source_idx = index(source)
|
36
|
+
dest_idx = index(dest)
|
37
|
+
@out_links[source_idx] ||= []
|
38
|
+
@out_links[source_idx][dest_idx] ||= 0.0
|
39
|
+
@out_links[source_idx][dest_idx] += weight
|
40
|
+
nil
|
41
|
+
end
|
42
|
+
|
43
|
+
protected
|
44
|
+
|
45
|
+
def node_count
|
46
|
+
@key_to_idx.size
|
47
|
+
end
|
48
|
+
|
49
|
+
def initial_ranks
|
50
|
+
@matrix = to_matrix
|
51
|
+
Vector[*[1 / node_count.to_f] * node_count]
|
52
|
+
end
|
53
|
+
|
54
|
+
def calculate_step(ranks)
|
55
|
+
@matrix * ranks
|
56
|
+
end
|
57
|
+
|
58
|
+
def sort_ranks(ranks)
|
59
|
+
ranks.each_with_index.sort_by { |r, _| -r }.each_with_object({}) do |(r, i), all|
|
60
|
+
all[@idx_to_key[i]] = r
|
61
|
+
end
|
62
|
+
end
|
63
|
+
|
64
|
+
private
|
65
|
+
|
66
|
+
def index(key)
|
67
|
+
@key_to_idx[key] ||= begin
|
68
|
+
@idx_to_key[node_count] = key
|
69
|
+
node_count
|
70
|
+
end
|
71
|
+
end
|
72
|
+
|
73
|
+
def to_matrix
|
74
|
+
total_out_weights = @out_links.map do |links|
|
75
|
+
links.compact.reduce(:+) if links
|
76
|
+
end
|
77
|
+
Matrix.build(node_count, node_count) do |dest_idx, source_idx|
|
78
|
+
total = total_out_weights[source_idx]
|
79
|
+
if total
|
80
|
+
w = @out_links[source_idx][dest_idx] || 0.0
|
81
|
+
@damping * w / total + (1 - @damping) / node_count.to_f
|
82
|
+
else
|
83
|
+
1.0 / node_count.to_f
|
84
|
+
end
|
85
|
+
end
|
86
|
+
end
|
87
|
+
|
88
|
+
end
|
89
|
+
end
|
@@ -0,0 +1,87 @@
|
|
1
|
+
require 'set'
|
2
|
+
|
3
|
+
module PageRank
|
4
|
+
##
|
5
|
+
# Implementation of PageRank using a sparse matrix representation of the graph
|
6
|
+
#
|
7
|
+
# Ruby is not known for its speed, especial for math computations. However,
|
8
|
+
# if the number of edges is relatively small in relation to the number of nodes,
|
9
|
+
# this pure Ruby implementation should perform well enough for many applications.
|
10
|
+
# It uses a sparse matrix representation and thus avoids an order of mangitude
|
11
|
+
# of calculations that are not necessary.
|
12
|
+
#
|
13
|
+
# If speed is desired, it would be best to implement a NativeSparse class (and
|
14
|
+
# optionally NativeDense) which would perform the algorithm in C.
|
15
|
+
##
|
16
|
+
class Sparse < Base
|
17
|
+
|
18
|
+
# Initialize with default damping and tolerance.
|
19
|
+
# A maximum number of iterations can also be supplied
|
20
|
+
# (default is no maximum, i.e. iterate until tolerance).
|
21
|
+
# @param (see Base#initialize)
|
22
|
+
def initialize(**options)
|
23
|
+
super(**options)
|
24
|
+
|
25
|
+
@graph = {}
|
26
|
+
@weight_totals = Hash.new(0.0)
|
27
|
+
@weights = {}
|
28
|
+
@nodes = Set.new
|
29
|
+
end
|
30
|
+
|
31
|
+
# @param (see Base#add)
|
32
|
+
# @param weight [Float] Optional weight for the graph edge
|
33
|
+
# @return (see Base#add)
|
34
|
+
def add(source, dest, weight: 1.0)
|
35
|
+
return false if source == dest
|
36
|
+
@graph[dest] ||= Set.new
|
37
|
+
@graph[dest] << source
|
38
|
+
@weights[source] ||= Hash.new(0.0)
|
39
|
+
@weights[source][dest] += weight
|
40
|
+
@weight_totals[source] ||= 0.0
|
41
|
+
@weight_totals[source] += weight
|
42
|
+
@nodes << source
|
43
|
+
@nodes << dest
|
44
|
+
nil
|
45
|
+
end
|
46
|
+
|
47
|
+
protected
|
48
|
+
|
49
|
+
def node_count
|
50
|
+
@nodes.size
|
51
|
+
end
|
52
|
+
|
53
|
+
def initial_ranks
|
54
|
+
@dangling_nodes = @nodes - @weight_totals.keys
|
55
|
+
@normalized_weights = @weights.each_with_object({}) do |(source, values), h|
|
56
|
+
h[source] = values.each_with_object({}) do |(dest, w), h2|
|
57
|
+
h2[dest] = w / @weight_totals[source]
|
58
|
+
end
|
59
|
+
end
|
60
|
+
Hash[@nodes.map { |k| [k, 1.0 / node_count.to_f] }]
|
61
|
+
end
|
62
|
+
|
63
|
+
def calculate_step(ranks)
|
64
|
+
ranks.keys.each_with_object({}) do |dest, new_ranks|
|
65
|
+
sum = 0.0
|
66
|
+
Array(@graph[dest]).each do |source|
|
67
|
+
sum += ranks[source] * @normalized_weights[source][dest]
|
68
|
+
end
|
69
|
+
@dangling_nodes.each do |source|
|
70
|
+
sum += ranks[source] / node_count.to_f
|
71
|
+
end
|
72
|
+
new_ranks[dest] = @damping * sum + (1 - @damping)/node_count
|
73
|
+
end
|
74
|
+
end
|
75
|
+
|
76
|
+
def sort_ranks(ranks)
|
77
|
+
sum = 0.0
|
78
|
+
ranks.each { |_, v| sum += v }
|
79
|
+
Hash[ranks.map { |k, v| [k, v / sum] }.sort_by { |_, v| -v }]
|
80
|
+
end
|
81
|
+
|
82
|
+
def distance(v1, v2)
|
83
|
+
super(v1.values.to_a, v2.values.to_a)
|
84
|
+
end
|
85
|
+
|
86
|
+
end
|
87
|
+
end
|
data/lib/page_rank.rb
ADDED
@@ -0,0 +1,39 @@
|
|
1
|
+
##
|
2
|
+
# A module for supporting Ruby implementations of PageRank. Rather than rely on
|
3
|
+
# one single implementation, this module allows for multiple implementations that
|
4
|
+
# may be beneficial in different scenarios.
|
5
|
+
#
|
6
|
+
# = Example
|
7
|
+
#
|
8
|
+
# PageRank.calculate(strategy: :dense, damping: 0.8, max_iterations: 100) do
|
9
|
+
# add('nodeA', 'nodeC', weight: 4.3)
|
10
|
+
# add('nodeA', 'nodeE', weight: 2.1)
|
11
|
+
# add('nodeB', 'nodeC', weight: 3.6)
|
12
|
+
# add('nodeE', 'nodeD', weight: 1.9)
|
13
|
+
# add('nodeA', 'nodeC', weight: 5.3)
|
14
|
+
# end
|
15
|
+
##
|
16
|
+
module PageRank
|
17
|
+
|
18
|
+
autoload :Base, 'page_rank/base'
|
19
|
+
autoload :Dense, 'page_rank/dense'
|
20
|
+
autoload :Sparse, 'page_rank/sparse'
|
21
|
+
|
22
|
+
# @option options [Symbol] :strategy PageRank strategy to use (either :sparse or :dense)
|
23
|
+
# @option options [Float] :damping The probability of following the graph vs. randomly choosing a new node
|
24
|
+
# @option options [Float] :tolerance The desired accuracy of the results
|
25
|
+
# @return [PageRank::Base]
|
26
|
+
def self.new(strategy: :sparse, **options)
|
27
|
+
const_get(strategy.to_s.capitalize).new(**options)
|
28
|
+
end
|
29
|
+
|
30
|
+
# Convenience method to quickly calculate PageRank. In the calling block, graph edges can be added.
|
31
|
+
# @option (see new)
|
32
|
+
# @return (see Base#calculate)
|
33
|
+
def self.calculate(**options, &block)
|
34
|
+
pr = new(**options)
|
35
|
+
pr.instance_exec(&block)
|
36
|
+
pr.calculate(**options)
|
37
|
+
end
|
38
|
+
|
39
|
+
end
|
@@ -0,0 +1,26 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
module TextRank
|
3
|
+
module CharFilter
|
4
|
+
##
|
5
|
+
# Characater filter to transform non-ASCII (unicode) characters into ASCII-friendly versions.
|
6
|
+
#
|
7
|
+
# = Example
|
8
|
+
#
|
9
|
+
# AsciiFolding.new.filter!("the Perigordian Abbé then made answer, because a poor beggar of the country of Atrébatie heard some foolish things said")
|
10
|
+
# => "the Perigordian Abbe then made answer, because a poor beggar of the country of Atrebatie heard some foolish things said"
|
11
|
+
##
|
12
|
+
class AsciiFolding
|
13
|
+
|
14
|
+
NON_ASCII_CHARS = 'ÀÁÂÃÄÅàáâãäåĀāĂ㥹ÇçĆćĈĉĊċČčÐðĎďĐđÈÉÊËèéêëĒēĔĕĖėĘęĚěĜĝĞğĠġĢģĤĥĦħÌÍÎÏìíîïĨĩĪīĬĭĮįİıĴĵĶķĸĹĺĻļĽľĿŀŁłÑñŃńŅņŇňʼnŊŋÒÓÔÕÖØòóôõöøŌōŎŏŐőŔŕŖŗŘřŚśŜŝŞşŠšſŢţŤťŦŧÙÚÛÜùúûüŨũŪūŬŭŮůŰűŲųŴŵÝýÿŶŷŸŹźŻżŽž'
|
15
|
+
EQUIVALENT_ASCII_CHARS = 'AAAAAAaaaaaaAaAaAaCcCcCcCcCcDdDdDdEEEEeeeeEeEeEeEeEeGgGgGgGgHhHhIIIIiiiiIiIiIiIiIiJjKkkLlLlLlLlLlNnNnNnNnnNnOOOOOOooooooOoOoOoRrRrRrSsSsSsSssTtTtTtUUUUuuuuUuUuUuUuUuUuWwYyyYyYZzZzZz'
|
16
|
+
|
17
|
+
# Perform the filter
|
18
|
+
# @param text [String]
|
19
|
+
# @return [String]
|
20
|
+
def filter!(text)
|
21
|
+
text.tr!(NON_ASCII_CHARS, EQUIVALENT_ASCII_CHARS)
|
22
|
+
end
|
23
|
+
|
24
|
+
end
|
25
|
+
end
|
26
|
+
end
|
@@ -0,0 +1,22 @@
|
|
1
|
+
module TextRank
|
2
|
+
module CharFilter
|
3
|
+
##
|
4
|
+
# Character filter to force text to lowercase
|
5
|
+
#
|
6
|
+
# = Example
|
7
|
+
#
|
8
|
+
# Lowercase.new.filter!("What a pessimist you are! - Candide")
|
9
|
+
# => "what a pessimist you are! - candide"
|
10
|
+
##
|
11
|
+
class Lowercase
|
12
|
+
|
13
|
+
# Perform the filter
|
14
|
+
# @param text [String]
|
15
|
+
# @return [String]
|
16
|
+
def filter!(text)
|
17
|
+
text.downcase!
|
18
|
+
end
|
19
|
+
|
20
|
+
end
|
21
|
+
end
|
22
|
+
end
|
@@ -0,0 +1,24 @@
|
|
1
|
+
module TextRank
|
2
|
+
module CharFilter
|
3
|
+
##
|
4
|
+
# Character filter to remove email addresses from text.
|
5
|
+
#
|
6
|
+
# = Example
|
7
|
+
#
|
8
|
+
# StripEmail.new.filter!("That is a hard question said candide@gmail.com")
|
9
|
+
# => "That is a hard question said "
|
10
|
+
##
|
11
|
+
class StripEmail
|
12
|
+
|
13
|
+
EMAIL_REGEX = /\b[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,}\b/i
|
14
|
+
|
15
|
+
# Perform the filter
|
16
|
+
# @param text [String]
|
17
|
+
# @return [String]
|
18
|
+
def filter!(text)
|
19
|
+
text.gsub!(EMAIL_REGEX, '')
|
20
|
+
end
|
21
|
+
|
22
|
+
end
|
23
|
+
end
|
24
|
+
end
|
@@ -0,0 +1,41 @@
|
|
1
|
+
require 'nokogiri'
|
2
|
+
require 'stringio'
|
3
|
+
|
4
|
+
module TextRank
|
5
|
+
module CharFilter
|
6
|
+
##
|
7
|
+
# Character filter to remove HTML tags and convert HTML entities to text.
|
8
|
+
#
|
9
|
+
# = Example
|
10
|
+
#
|
11
|
+
# StripHtml.new.filter!(""Optimism", said Cacambo, "What is that?"")
|
12
|
+
# => "\"Optimism\", said Cacambo, \"What is that?\""
|
13
|
+
#
|
14
|
+
# StringHtml.new.filter!("<b>Alas! It is the <u>obstinacy</u> of maintaining that everything is best when it is worst.</b>")
|
15
|
+
# => "Alas! It is the obstinacy of maintaining that everything is best when it is worst."
|
16
|
+
##
|
17
|
+
class StripHtml < Nokogiri::XML::SAX::Document
|
18
|
+
|
19
|
+
def initialize
|
20
|
+
@text = StringIO.new
|
21
|
+
end
|
22
|
+
|
23
|
+
# Perform the filter
|
24
|
+
# @param text [String]
|
25
|
+
# @return [String]
|
26
|
+
def filter!(text)
|
27
|
+
@text.rewind
|
28
|
+
Nokogiri::HTML::SAX::Parser.new(self).parse(text)
|
29
|
+
@text.string
|
30
|
+
end
|
31
|
+
|
32
|
+
protected
|
33
|
+
|
34
|
+
def characters(string)
|
35
|
+
@text << ' '
|
36
|
+
@text << string
|
37
|
+
end
|
38
|
+
|
39
|
+
end
|
40
|
+
end
|
41
|
+
end
|