graph-rank 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/LICENSE +16 -0
- data/README.md +71 -0
- data/lib/graph-rank.rb +13 -0
- data/lib/graph-rank/keywords.rb +39 -0
- data/lib/graph-rank/page_rank.rb +68 -0
- data/lib/graph-rank/text_rank.rb +56 -0
- metadata +51 -0
data/LICENSE
ADDED
@@ -0,0 +1,16 @@
|
|
1
|
+
A Ruby implementation of the TextRank and PageRank algorithms.
|
2
|
+
|
3
|
+
This program is free software: you can redistribute it and/or modify
|
4
|
+
it under the terms of the GNU General Public License as published by
|
5
|
+
the Free Software Foundation, either version 3 of the License, or
|
6
|
+
(at your option) any later version.
|
7
|
+
|
8
|
+
This program is distributed in the hope that it will be useful,
|
9
|
+
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
10
|
+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
11
|
+
GNU General Public License for more details.
|
12
|
+
|
13
|
+
You should have received a copy of the GNU General Public License
|
14
|
+
along with this program. If not, see <http://www.gnu.org/licenses/>.
|
15
|
+
|
16
|
+
Author: Louis-Antoine Mullie (louis.mullie@gmail.com). Copyright 2012.
|
data/README.md
ADDED
@@ -0,0 +1,71 @@
|
|
1
|
+
### About
|
2
|
+
|
3
|
+
This gem implements a PageRank class and a class that performs keyword ranking using the TextRank algorithm. Both were ported from the [PHP Implementation](https://github.com/crodas/textrank) by @crodas.
|
4
|
+
|
5
|
+
### Install
|
6
|
+
|
7
|
+
```
|
8
|
+
gem install graph-rank
|
9
|
+
```
|
10
|
+
|
11
|
+
### Usage
|
12
|
+
|
13
|
+
**TextRank**
|
14
|
+
|
15
|
+
```ruby
|
16
|
+
text = 'PageRank is a link analysis algorithm, named after Larry ' +
|
17
|
+
'Page and used by the Google Internet search engine, that assigns ' +
|
18
|
+
'a numerical weighting to each element of a hyperlinked set of ' +
|
19
|
+
'documents, such as the World Wide Web, with the purpose of "measuring"' +
|
20
|
+
'its relative importance within the set.'
|
21
|
+
|
22
|
+
tr = GraphRank::Keywords.new
|
23
|
+
|
24
|
+
tr.run(text)
|
25
|
+
|
26
|
+
```
|
27
|
+
|
28
|
+
Optionally, you can pass the n-gram size (default = 3), as well as the damping and convergence (see PageRank) to the constructor. Finally, you can set stop words as follows:
|
29
|
+
|
30
|
+
```ruby
|
31
|
+
tr.stop_words = ["word", "another", "etc"]
|
32
|
+
```
|
33
|
+
|
34
|
+
The default stop word list is as follows:
|
35
|
+
|
36
|
+
"about","also","are","away","because",
|
37
|
+
"been","beside","besides","between","but","cannot",
|
38
|
+
"could","did","etc","even","ever","every","for","had",
|
39
|
+
"have","how","into","isn","maybe","non","nor","now",
|
40
|
+
"should","such","than","that","then","these","this",
|
41
|
+
"those","though","too","was","wasn","were","what","when",
|
42
|
+
"where","which","while","who","whom","whose","will",
|
43
|
+
"with","would","wouldn","yes"
|
44
|
+
|
45
|
+
> Reference: R. Mihalcea and P. Tarau, “TextRank: Bringing Order into Texts,” in Proceedings of EMNLP 2004. Association for Computational Linguistics, 2004, pp. 404–411.
|
46
|
+
|
47
|
+
**PageRank**
|
48
|
+
|
49
|
+
```ruby
|
50
|
+
|
51
|
+
pr = GraphRank::PageRank.new
|
52
|
+
|
53
|
+
pr.add(1,2)
|
54
|
+
pr.add(1,4)
|
55
|
+
pr.add(1,5)
|
56
|
+
pr.add(4,5)
|
57
|
+
pr.add(4,1)
|
58
|
+
pr.add(4,3)
|
59
|
+
pr.add(1,3)
|
60
|
+
pr.add(3,1)
|
61
|
+
pr.add(5,1)
|
62
|
+
|
63
|
+
pr.calculate
|
64
|
+
# => [[1, 5.99497754810465], [3, 2.694723988738302],
|
65
|
+
# [5, 2.694723988738302], [4, 2.100731029131304],
|
66
|
+
# [2, 2.100731029131304]]
|
67
|
+
```
|
68
|
+
|
69
|
+
Optionally, you can pass the damping factor (default = 0.85) and the convergence criterion (default = 0.01) as parameters to the PageRank constructor.
|
70
|
+
|
71
|
+
> Reference: Brin, S.; Page, L. (1998). "The anatomy of a large-scale hypertextual Web search engine". Computer Networks and ISDN Systems 30: 107–117.
|
data/lib/graph-rank.rb
ADDED
@@ -0,0 +1,39 @@
|
|
1
|
+
# Implements TextRank for unsupervised keyword extraction:
# every remaining word of the text becomes a graph node, and
# words that occur close to each other are linked before the
# graph is ranked.
class GraphRank::Keywords < GraphRank::TextRank

  # Split the text on single spaces, one feature per word.
  def get_features
    @text.split(' ')
  end

  # Remove short words first, then stop words, from @features.
  def filter_features
    remove_short_words
    remove_stop_words
  end

  # Remove every feature that appears in the stop-word list.
  def remove_stop_words
    @features.delete_if { |word| @stop_words.include?(word) }
  end

  # Remove 1 and 2 character words.
  def remove_short_words
    @features.delete_if { |word| word.length < 3 }
  end

  # Build the co-occurrence graph. The word at position i is
  # linked to every other word found from @ngram_size positions
  # before it up to (but excluding) @ngram_size positions after it.
  def build_graph
    @features.each_with_index do |feature, i|
      # Clamp the window at 0: a negative index would make
      # Array#[] count from the end of the list and wrongly link
      # the first words of the text to the last ones.
      first = [i - @ngram_size, 0].max
      (first...(i + @ngram_size)).each do |j|
        # Positions past the end of the list yield nil and are skipped.
        neighbour = @features[j]
        @ranking.add(feature, neighbour) if neighbour && j != i
      end
    end
  end

end
|
@@ -0,0 +1,68 @@
|
|
1
|
+
# Brin, S.; Page, L. (1998). "The anatomy of
# a large-scale hypertextual Web search engine".
# Computer Networks and ISDN Systems 30: 107–117.
#
# Stores a directed graph as inbound-link lists and iterates
# node scores until they stop changing.
class GraphRank::PageRank

  # Initialize with default damping and convergence.
  #
  # damping     - damping factor; must satisfy 0 < damping <= 1
  #               (0.85 when nil).
  # convergence - threshold on the root-mean-square score change
  #               between two iterations; must satisfy
  #               0 <= convergence <= 1 (0.01 when nil).
  #
  # Raises RuntimeError when either value is out of range.
  def initialize(damping=nil, convergence=nil)
    damping ||= 0.85
    convergence ||= 0.01
    raise 'Invalid damping factor.' if damping <= 0 || damping > 1
    if convergence < 0 || convergence > 1
      raise 'Invalid convergence factor.'
    end
    @damping, @convergence = damping, convergence
    # @graph:    node => list of nodes linking to it.
    # @outlinks: node => number of outgoing links.
    # @nodes:    node => current score.
    @graph, @outlinks, @nodes = {}, {}, {}
  end

  # Add a directed link from source to dest. Both end points
  # get (or are reset to) the initial score of 0.15.
  # Self-links are ignored and return false.
  def add(source, dest)
    return false if source == dest
    @outlinks[source] ||= 0
    @graph[dest] ||= []
    @graph[dest] << source
    @outlinks[source] += 1
    @nodes[source] = 0.15
    @nodes[dest] = 0.15
  end

  # Iterates the PageRank algorithm until convergence is
  # reached. Returns an array of [node, score] pairs sorted
  # by descending score (empty for an empty graph).
  def calculate
    loop do
      new_nodes = iteration
      done = convergence(new_nodes)
      @nodes = new_nodes
      break if done
    end
    @nodes.sort_by { |_, score| score }.reverse
  end

  private

  # Performs one iteration to calculate the PageRank
  # ranking for all nodes and returns the new scores.
  def iteration
    # NOTE(review): operator precedence makes this
    # 1 - (damping / N), not (1 - damping) / N. Left unchanged
    # so that existing scores stay the same - confirm intent.
    base = 1 - @damping / @nodes.size
    new_nodes = {}
    @graph.each do |node, links|
      score = links.map do |id|
        @nodes[id] / @outlinks[id]
      end.inject(:+)
      new_nodes[node] = base + @damping * score
    end
    # Nodes without inbound links are not keys of @graph. Give
    # them the base score so they are not dropped, which used
    # to make the convergence check fail on a nil score.
    @nodes.each_key { |node| new_nodes[node] ||= base }
    new_nodes
  end

  # Check for convergence: true when the root-mean-square
  # difference between the current and the previous scores
  # is below the convergence threshold.
  def convergence(current)
    # An empty graph has nothing left to converge; this also
    # avoids dividing by zero below.
    return true if current.empty?
    total = @nodes.keys.inject(0) do |sum, key|
      delta = current[key] - @nodes[key]
      sum + delta * delta
    end
    Math.sqrt(total / current.size) < @convergence
  end

end
|
@@ -0,0 +1,56 @@
|
|
1
|
+
# R. Mihalcea and P. Tarau, “TextRank: Bringing Order into Texts,”
# in Proceedings of EMNLP 2004. Association for Computational
# Linguistics, 2004, pp. 404–411.
#
# Base class of the text ranking pipeline. Subclasses provide
# get_features and build_graph.
class GraphRank::TextRank

  # Default English stop-word list.
  StopWords = [
    "a", "about", "above", "above", "across", "after", "afterwards",
    "again", "against", "all", "almost", "alone", "along", "already",
    "also", "although", "always", "am", "among", "amongst", "amoungst",
    "amount", "an", "and", "another", "any", "anyhow", "anyone",
    "anything", "anyway", "anywhere", "are", "around", "as", "at",
    "back", "be", "became", "because", "become", "becomes", "becoming",
    "been", "before", "beforehand", "behind", "being", "below",
    "beside", "besides", "between", "beyond", "bill", "both", "bottom",
    "but", "by", "call", "can", "cannot", "cant", "co", "con", "could",
    "couldnt", "cry", "de", "describe", "detail", "do", "done", "down",
    "due", "during", "each", "eg", "eight", "either", "eleven", "else",
    "elsewhere", "empty", "enough", "etc", "even", "ever", "every",
    "everyone", "everything", "everywhere", "except", "few", "fifteen",
    "fify", "fill", "find", "fire", "first", "five", "for", "former",
    "formerly", "forty", "found", "four", "from", "front", "full",
    "further", "get", "give", "go", "had", "has", "hasnt", "have", "he",
    "hence", "her", "here", "hereafter", "hereby", "herein", "hereupon",
    "hers", "herself", "him", "himself", "his", "how", "however",
    "hundred", "ie", "if", "in", "inc", "indeed", "interest", "into",
    "is", "it", "its", "itself", "keep", "last", "latter", "latterly",
    "least", "less", "ltd", "made", "many", "may", "me", "meanwhile",
    "might", "mill", "mine", "more", "moreover", "most", "mostly",
    "move", "much", "must", "my", "myself", "name", "namely", "neither",
    "never", "nevertheless", "next", "nine", "no", "nobody", "none",
    "noone", "nor", "not", "nothing", "now", "nowhere", "of", "off",
    "often", "on", "once", "one", "only", "onto", "or", "other",
    "others", "otherwise", "our", "ours", "ourselves", "out", "over",
    "own", "part", "per", "perhaps", "please", "put", "rather", "re",
    "same", "see", "seem", "seemed", "seeming", "seems", "serious",
    "several", "she", "should", "show", "side", "since", "sincere",
    "six", "sixty", "so", "some", "somehow", "someone", "something",
    "sometime", "sometimes", "somewhere", "still", "such", "system",
    "take", "ten", "than", "that", "the", "their", "them", "themselves",
    "then", "thence", "there", "thereafter", "thereby", "therefore",
    "therein", "thereupon", "these", "they", "thickv", "thin", "third",
    "this", "those", "though", "three", "through", "throughout", "thru",
    "thus", "to", "together", "too", "top", "toward", "towards",
    "twelve", "twenty", "two", "un", "under", "until", "up", "upon",
    "us", "very", "via", "was", "we", "well", "were", "what",
    "whatever", "when", "whence", "whenever", "where", "whereafter",
    "whereas", "whereby", "wherein", "whereupon", "wherever", "whether",
    "which", "while", "whither", "who", "whoever", "whole", "whom",
    "whose", "why", "will", "with", "within", "without", "would", "yet",
    "you", "your", "yours", "yourself", "yourselves", "the"
  ]

  # Allow configuration of stop words.
  attr_accessor :stop_words

  # Initialize with ngram size, damping and convergence.
  # Damping and convergence are handed to the PageRank
  # constructor unchanged.
  def initialize(ngram_size=3, damping=nil, convergence=nil)
    @ranking = GraphRank::PageRank.new(damping, convergence)
    # Work on a copy so that changing one instance's list in
    # place does not alter the shared default constant.
    @stop_words = StopWords.dup
    @ngram_size = ngram_size
  end

  # Add text and return the result of the PageRank calculation.
  # The text passed in is left unmodified.
  #
  # NOTE(review): @ranking is created once in initialize, so
  # links added by build_graph during an earlier run appear to
  # remain in the graph on later runs - confirm this is wanted.
  def run(text)
    @text = text
    clean_text
    @features = get_features
    filter_features
    build_graph
    calculate_ranking
  end

  # Clean text leaving just letters from a-z, with runs of
  # whitespace collapsed to one space. Non-destructive calls
  # are used so the caller's string is never mutated and
  # frozen strings are accepted.
  def clean_text
    @text = @text.downcase.gsub(/[^a-z ]/, ' ').gsub(/\s+/, " ")
  end

  # Return the features (keyword, sentence, etc.)
  def get_features
    raise 'Must be implemented in subclass.'
  end

  # Return only features of interest. The default keeps
  # every feature.
  def filter_features
    @features
  end

  # Build the graph from the features.
  def build_graph
    raise 'Must be implemented in subclass.'
  end

  # Calculate the PageRank ranking.
  def calculate_ranking
    @ranking.calculate
  end

end
|
metadata
ADDED
@@ -0,0 +1,51 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: graph-rank
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.1
|
5
|
+
prerelease:
|
6
|
+
platform: ruby
|
7
|
+
authors:
|
8
|
+
- Louis Mullie
|
9
|
+
autorequire:
|
10
|
+
bindir: bin
|
11
|
+
cert_chain: []
|
12
|
+
date: 2012-12-16 00:00:00.000000000 Z
|
13
|
+
dependencies: []
|
14
|
+
description: ! ' GraphRank is an implementation of TextRank and PageRank in Ruby. '
|
15
|
+
email:
|
16
|
+
- louis.mullie@gmail.com
|
17
|
+
executables: []
|
18
|
+
extensions: []
|
19
|
+
extra_rdoc_files: []
|
20
|
+
files:
|
21
|
+
- lib/graph-rank/keywords.rb
|
22
|
+
- lib/graph-rank/page_rank.rb
|
23
|
+
- lib/graph-rank/text_rank.rb
|
24
|
+
- lib/graph-rank.rb
|
25
|
+
- README.md
|
26
|
+
- LICENSE
|
27
|
+
homepage: https://github.com/louismullie/graphrank
|
28
|
+
licenses: []
|
29
|
+
post_install_message:
|
30
|
+
rdoc_options: []
|
31
|
+
require_paths:
|
32
|
+
- lib
|
33
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
34
|
+
none: false
|
35
|
+
requirements:
|
36
|
+
- - ! '>='
|
37
|
+
- !ruby/object:Gem::Version
|
38
|
+
version: '0'
|
39
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
40
|
+
none: false
|
41
|
+
requirements:
|
42
|
+
- - ! '>='
|
43
|
+
- !ruby/object:Gem::Version
|
44
|
+
version: '0'
|
45
|
+
requirements: []
|
46
|
+
rubyforge_project:
|
47
|
+
rubygems_version: 1.8.24
|
48
|
+
signing_key:
|
49
|
+
specification_version: 3
|
50
|
+
summary: ! 'GraphRank: bringing TextRank and PageRank to Ruby.'
|
51
|
+
test_files: []
|