graph-rank 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- data/LICENSE +16 -0
- data/README.md +71 -0
- data/lib/graph-rank.rb +13 -0
- data/lib/graph-rank/keywords.rb +39 -0
- data/lib/graph-rank/page_rank.rb +68 -0
- data/lib/graph-rank/text_rank.rb +56 -0
- metadata +51 -0
data/LICENSE
ADDED
@@ -0,0 +1,16 @@
|
|
1
|
+
A Ruby implementation of the TextRank and PageRank algorithms.
|
2
|
+
|
3
|
+
This program is free software: you can redistribute it and/or modify
|
4
|
+
it under the terms of the GNU General Public License as published by
|
5
|
+
the Free Software Foundation, either version 3 of the License, or
|
6
|
+
(at your option) any later version.
|
7
|
+
|
8
|
+
This program is distributed in the hope that it will be useful,
|
9
|
+
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
10
|
+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
11
|
+
GNU General Public License for more details.
|
12
|
+
|
13
|
+
You should have received a copy of the GNU General Public License
|
14
|
+
along with this program. If not, see <http://www.gnu.org/licenses/>.
|
15
|
+
|
16
|
+
Author: Louis-Antoine Mullie (louis.mullie@gmail.com). Copyright 2012.
|
data/README.md
ADDED
@@ -0,0 +1,71 @@
|
|
1
|
+
### About
|
2
|
+
|
3
|
+
This gem implements a PageRank class and a class that performs keyword ranking using the TextRank algorithm. Both were ported from the [PHP implementation](https://github.com/crodas/textrank) by @crodas.
|
4
|
+
|
5
|
+
### Install
|
6
|
+
|
7
|
+
```
|
8
|
+
gem install graph-rank
|
9
|
+
```
|
10
|
+
|
11
|
+
### Usage
|
12
|
+
|
13
|
+
**TextRank**
|
14
|
+
|
15
|
+
```ruby
|
16
|
+
text = 'PageRank is a link analysis algorithm, named after Larry ' +
|
17
|
+
'Page and used by the Google Internet search engine, that assigns ' +
|
18
|
+
'a numerical weighting to each element of a hyperlinked set of ' +
|
19
|
+
'documents, such as the World Wide Web, with the purpose of "measuring"' +
|
20
|
+
'its relative importance within the set.'
|
21
|
+
|
22
|
+
tr = GraphRank::Keywords.new
|
23
|
+
|
24
|
+
tr.run(text)
|
25
|
+
|
26
|
+
```
|
27
|
+
|
28
|
+
Optionally, you can pass the n-gram size (default = 3), as well as the damping and convergence (see PageRank) to the constructor. Finally, you can set stop words as follows:
|
29
|
+
|
30
|
+
```ruby
|
31
|
+
tr.stop_words = ["word", "another", "etc"]
|
32
|
+
```
|
33
|
+
|
34
|
+
The default stop word list is as follows:
|
35
|
+
|
36
|
+
"about","also","are","away","because",
|
37
|
+
"been","beside","besides","between","but","cannot",
|
38
|
+
"could","did","etc","even","ever","every","for","had",
|
39
|
+
"have","how","into","isn","maybe","non","nor","now",
|
40
|
+
"should","such","than","that","then","these","this",
|
41
|
+
"those","though","too","was","wasn","were","what","when",
|
42
|
+
"where","which","while","who","whom","whose","will",
|
43
|
+
"with","would","wouldn","yes"
|
44
|
+
|
45
|
+
> Reference: R. Mihalcea and P. Tarau, “TextRank: Bringing Order into Texts,” in Proceedings of EMNLP 2004. Association for Computational Linguistics, 2004, pp. 404–411.
|
46
|
+
|
47
|
+
**PageRank**
|
48
|
+
|
49
|
+
```ruby
|
50
|
+
|
51
|
+
pr = GraphRank::PageRank.new
|
52
|
+
|
53
|
+
pr.add(1,2)
|
54
|
+
pr.add(1,4)
|
55
|
+
pr.add(1,5)
|
56
|
+
pr.add(4,5)
|
57
|
+
pr.add(4,1)
|
58
|
+
pr.add(4,3)
|
59
|
+
pr.add(1,3)
|
60
|
+
pr.add(3,1)
|
61
|
+
pr.add(5,1)
|
62
|
+
|
63
|
+
pr.calculate
|
64
|
+
# => [[1, 5.99497754810465], [3, 2.694723988738302],
|
65
|
+
# [5, 2.694723988738302], [4, 2.100731029131304],
|
66
|
+
# [2, 2.100731029131304]]
|
67
|
+
```
|
68
|
+
|
69
|
+
Optionally, you can pass the damping factor (default = 0.85) and the convergence criterion (default = 0.01) as parameters to the PageRank constructor.
|
70
|
+
|
71
|
+
> Reference: Brin, S.; Page, L. (1998). "The anatomy of a large-scale hypertextual Web search engine". Computer Networks and ISDN Systems 30: 107–117.
|
data/lib/graph-rank.rb
ADDED
@@ -0,0 +1,39 @@
|
|
1
|
+
# Implement the TextRank algorithm
# for unsupervised keyword extraction.
#
# Features are the individual words of the cleaned text; two words
# are linked when they occur within the same n-gram window.
class GraphRank::Keywords < GraphRank::TextRank

  # Split the text on words.
  def get_features
    @text.split(' ')
  end

  # Remove short and stop words.
  def filter_features
    remove_short_words
    remove_stop_words
  end

  # Remove all stop words.
  def remove_stop_words
    @features.delete_if { |word| @stop_words.include?(word) }
  end

  # Remove 1 and 2 char words.
  def remove_short_words
    @features.delete_if { |word| word.length < 3 }
  end

  # Build the co-occurrence graph for an n-gram.
  #
  # Each feature at position i is linked to the features at positions
  # i - ngram_size up to (but excluding) i + ngram_size, i.e. up to
  # ngram_size predecessors and ngram_size - 1 successors.
  def build_graph
    @features.each_with_index do |feature, i|
      # Clamp the window to the array. A negative start index would
      # otherwise wrap around and link the first words of the text
      # to the last ones.
      first = [i - @ngram_size, 0].max
      last = [i + @ngram_size, @features.size].min
      (first...last).each do |j|
        @ranking.add(feature, @features[j]) unless j == i
      end
    end
  end

end
|
@@ -0,0 +1,68 @@
|
|
1
|
+
# Brin, S.; Page, L. (1998). "The anatomy of
# a large-scale hypertextual Web search engine".
# Computer Networks and ISDN Systems 30: 107–117.
#
# Iterative PageRank over a directed graph. Edges are registered
# one at a time with #add; #calculate returns the ranked nodes.
class GraphRank::PageRank

  # Score given to a node when it is added to the graph.
  INITIAL_SCORE = 0.15

  # Initialize with default damping and convergence.
  #
  # damping     - factor in (0, 1], defaults to 0.85.
  # convergence - threshold in (0, 1], defaults to 0.01.
  #
  # Raises RuntimeError when either value is out of range. A
  # convergence of 0 is rejected because the strict "less than"
  # test in #convergence could never be satisfied.
  def initialize(damping=nil, convergence=nil)
    damping ||= 0.85
    convergence ||= 0.01
    if damping <= 0 || damping > 1
      raise 'Invalid damping factor.'
    elsif convergence <= 0 || convergence > 1
      raise 'Invalid convergence factor.'
    end
    @damping, @convergence = damping, convergence
    # @graph:    node => list of nodes linking to it.
    # @outlinks: node => number of outgoing links.
    # @nodes:    node => current score.
    @graph, @outlinks, @nodes = {}, {}, {}
  end

  # Add a directed edge from source to dest to the graph.
  # Self-links are ignored and return false.
  def add(source, dest)
    return false if source == dest
    (@graph[dest] ||= []) << source
    @outlinks[source] = @outlinks.fetch(source, 0) + 1
    @nodes[source] = INITIAL_SCORE
    @nodes[dest] = INITIAL_SCORE
  end

  # Iterates the PageRank algorithm
  # until convergence is reached.
  #
  # Returns an array of [node, score] pairs sorted by descending
  # score, or an empty array when no edge has been added.
  def calculate
    return [] if @nodes.empty?
    loop do
      new_nodes = iteration
      done = convergence(new_nodes)
      @nodes = new_nodes
      break if done
    end
    @nodes.sort_by { |_, score| score }.reverse
  end

  private

  # Performs one iteration to calculate
  # the PageRank ranking for all nodes.
  def iteration
    # NOTE(review): this evaluates as 1 - (damping / N), not the
    # usual (1 - damping) / N. It is kept as written because the
    # scores documented in the README appear to come from this form.
    base = 1 - @damping / @nodes.size
    # Visit nodes with inbound links first (the historical order),
    # then the remaining nodes. Nodes without inbound links must
    # still receive a score, otherwise the convergence check would
    # find no value for them.
    (@graph.keys | @nodes.keys).each_with_object({}) do |node, ranks|
      inbound = @graph.fetch(node, []).inject(0.0) do |sum, id|
        sum + @nodes[id] / @outlinks[id]
      end
      ranks[node] = base + @damping * inbound
    end
  end

  # Check for convergence: true when the root mean square of the
  # score changes is below the convergence threshold.
  def convergence(current)
    total = @nodes.inject(0) do |sum, (node, score)|
      delta = current[node] - score
      sum + delta * delta
    end
    Math.sqrt(total / current.size) < @convergence
  end

end
|
@@ -0,0 +1,56 @@
|
|
1
|
+
# R. Mihalcea and P. Tarau, “TextRank: Bringing Order into Texts,”
# in Proceedings of EMNLP 2004. Association for Computational
# Linguistics, 2004, pp. 404–411.
#
# Base class for TextRank-style rankers. Subclasses supply
# #get_features and #build_graph; #run drives the pipeline.
class GraphRank::TextRank

  # Default English stop-word list.
  StopWords = %w[
    a about above across after afterwards again against all almost
    alone along already also although always am among amongst amoungst
    amount an and another any anyhow anyone anything anyway anywhere
    are around as at back be became because become becomes becoming
    been before beforehand behind being below beside besides between
    beyond bill both bottom but by call can cannot cant co con could
    couldnt cry de describe detail do done down due during each eg
    eight either eleven else elsewhere empty enough etc even ever every
    everyone everything everywhere except few fifteen fifty fill find
    fire first five for former formerly forty found four from front
    full further get give go had has hasnt have he hence her here
    hereafter hereby herein hereupon hers herself him himself his how
    however hundred ie if in inc indeed interest into is it its itself
    keep last latter latterly least less ltd made many may me meanwhile
    might mill mine more moreover most mostly move much must my myself
    name namely neither never nevertheless next nine no nobody none
    noone nor not nothing now nowhere of off often on once one only
    onto or other others otherwise our ours ourselves out over own part
    per perhaps please put rather re same see seem seemed seeming seems
    serious several she should show side since sincere six sixty so
    some somehow someone something sometime sometimes somewhere still
    such system take ten than that the their them themselves then
    thence there thereafter thereby therefore therein thereupon these
    they thick thin third this those though three through throughout
    thru thus to together too top toward towards twelve twenty two un
    under until up upon us very via was we well were what whatever when
    whence whenever where whereafter whereas whereby wherein whereupon
    wherever whether which while whither who whoever whole whom whose
    why will with within without would yet you your yours yourself
    yourselves
  ]

  # Allow configuration of stop words.
  attr_accessor :stop_words

  # Initialize with ngram size, damping and convergence.
  # Damping and convergence are handed to GraphRank::PageRank as is.
  def initialize(ngram_size=3, damping=nil, convergence=nil)
    @ranking = GraphRank::PageRank.new(damping, convergence)
    # Work on a copy so that editing one instance's list in place
    # does not alter the shared default.
    @stop_words = StopWords.dup
    @ngram_size = ngram_size
  end

  # Add text and return PageRank.
  # The text passed in is left untouched.
  def run(text)
    @text = text
    clean_text
    @features = get_features
    filter_features
    build_graph
    calculate_ranking
  end

  # Clean text leaving just letters from a-z, separated by
  # single spaces. Builds a new string rather than editing the
  # caller's string in place, so frozen input is accepted.
  def clean_text
    @text = @text.downcase.gsub(/[^a-z ]/, ' ').gsub(/\s+/, ' ')
  end

  # Return the features (keyword, sentence, etc.)
  def get_features
    raise 'Must be implemented in subclass.'
  end

  # Return only features of interest.
  def filter_features
    @features
  end

  # Build the graph from the features.
  def build_graph
    raise 'Must be implemented in subclass.'
  end

  # Calculate the PageRank ranking.
  def calculate_ranking
    @ranking.calculate
  end

end
|
metadata
ADDED
@@ -0,0 +1,51 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: graph-rank
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.1
|
5
|
+
prerelease:
|
6
|
+
platform: ruby
|
7
|
+
authors:
|
8
|
+
- Louis Mullie
|
9
|
+
autorequire:
|
10
|
+
bindir: bin
|
11
|
+
cert_chain: []
|
12
|
+
date: 2012-12-16 00:00:00.000000000 Z
|
13
|
+
dependencies: []
|
14
|
+
description: ! ' GraphRank is an implementation of TextRank and PageRank in Ruby. '
|
15
|
+
email:
|
16
|
+
- louis.mullie@gmail.com
|
17
|
+
executables: []
|
18
|
+
extensions: []
|
19
|
+
extra_rdoc_files: []
|
20
|
+
files:
|
21
|
+
- lib/graph-rank/keywords.rb
|
22
|
+
- lib/graph-rank/page_rank.rb
|
23
|
+
- lib/graph-rank/text_rank.rb
|
24
|
+
- lib/graph-rank.rb
|
25
|
+
- README.md
|
26
|
+
- LICENSE
|
27
|
+
homepage: https://github.com/louismullie/graphrank
|
28
|
+
licenses: []
|
29
|
+
post_install_message:
|
30
|
+
rdoc_options: []
|
31
|
+
require_paths:
|
32
|
+
- lib
|
33
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
34
|
+
none: false
|
35
|
+
requirements:
|
36
|
+
- - ! '>='
|
37
|
+
- !ruby/object:Gem::Version
|
38
|
+
version: '0'
|
39
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
40
|
+
none: false
|
41
|
+
requirements:
|
42
|
+
- - ! '>='
|
43
|
+
- !ruby/object:Gem::Version
|
44
|
+
version: '0'
|
45
|
+
requirements: []
|
46
|
+
rubyforge_project:
|
47
|
+
rubygems_version: 1.8.24
|
48
|
+
signing_key:
|
49
|
+
specification_version: 3
|
50
|
+
summary: ! 'GraphRank: bringing TextRank and PageRank to Ruby.'
|
51
|
+
test_files: []
|