graph-rank 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
data/LICENSE ADDED
@@ -0,0 +1,16 @@
1
+ A Ruby implementation of the TextRank and PageRank algorithms.
2
+
3
+ This program is free software: you can redistribute it and/or modify
4
+ it under the terms of the GNU General Public License as published by
5
+ the Free Software Foundation, either version 3 of the License, or
6
+ (at your option) any later version.
7
+
8
+ This program is distributed in the hope that it will be useful,
9
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
10
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11
+ GNU General Public License for more details.
12
+
13
+ You should have received a copy of the GNU General Public License
14
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
15
+
16
+ Author: Louis-Antoine Mullie (louis.mullie@gmail.com). Copyright 2012.
@@ -0,0 +1,71 @@
1
+ ###About
2
+
3
+ This gem implements a PageRank class and a class that performs keyword ranking using the TextRank algorithm. Both were ported from the [PHP Implementation](https://github.com/crodas/textrank) by @crodas.
4
+
5
+ ###Install
6
+
7
+ ```
8
+ gem install graph-rank
9
+ ```
10
+
11
+ ###Usage
12
+
13
+ **TextRank**
14
+
15
+ ```ruby
16
+ text = 'PageRank is a link analysis algorithm, named after Larry ' +
17
+ 'Page and used by the Google Internet search engine, that assigns ' +
18
+ 'a numerical weighting to each element of a hyperlinked set of ' +
19
+ 'documents, such as the World Wide Web, with the purpose of "measuring" ' +
20
+ 'its relative importance within the set.'
21
+
22
+ tr = GraphRank::Keywords.new
23
+
24
+ tr.run(text)
25
+
26
+ ```
27
+
28
+ Optionally, you can pass the n-gram size (default = 3), as well as the damping and convergence (see PageRank) to the constructor. Finally, you can set stop words as follows:
29
+
30
+ ```ruby
31
+ tr.stop_words = ["word", "another", "etc"]
32
+ ```
33
+
34
+ The default stop word list is as follows:
35
+
36
+ "about","also","are","away","because",
37
+ "been","beside","besides","between","but","cannot",
38
+ "could","did","etc","even","ever","every","for","had",
39
+ "have","how","into","isn","maybe","non","nor","now",
40
+ "should","such","than","that","then","these","this",
41
+ "those","though","too","was","wasn","were","what","when",
42
+ "where","which","while","who","whom","whose","will",
43
+ "with","would","wouldn","yes"
44
+
45
+ > Reference: R. Mihalcea and P. Tarau, “TextRank: Bringing Order into Texts,” in Proceedings of EMNLP 2004. Association for Computational Linguistics, 2004, pp. 404–411.
46
+
47
+ **PageRank**
48
+
49
+ ```ruby
50
+
51
+ pr = GraphRank::PageRank.new
52
+
53
+ pr.add(1,2)
54
+ pr.add(1,4)
55
+ pr.add(1,5)
56
+ pr.add(4,5)
57
+ pr.add(4,1)
58
+ pr.add(4,3)
59
+ pr.add(1,3)
60
+ pr.add(3,1)
61
+ pr.add(5,1)
62
+
63
+ pr.calculate
64
+ # => [[1, 5.99497754810465], [3, 2.694723988738302],
65
+ # [5, 2.694723988738302], [4, 2.100731029131304],
66
+ # [2, 2.100731029131304]]
67
+ ```
68
+
69
+ Optionally, you can pass the damping factor (default = 0.85) and the convergence criterion (default = 0.01) as parameters to the PageRank constructor.
70
+
71
+ > Reference: Brin, S.; Page, L. (1998). "The anatomy of a large-scale hypertextual Web search engine". Computer Networks and ISDN Systems 30: 107–117.
@@ -0,0 +1,13 @@
1
# Top-level namespace of the graph-rank gem.
#
# The component files are required from inside the module body on purpose:
# they define their classes with the compact `GraphRank::Name` form, which
# needs the GraphRank constant to exist before they are loaded.
module GraphRank

  # Version number (frozen so the constant cannot be mutated in place).
  VERSION = '0.0.1'.freeze

  # Core classes. text_rank must be loaded before keywords because
  # GraphRank::Keywords subclasses GraphRank::TextRank.
  require 'graph-rank/page_rank'
  require 'graph-rank/text_rank'

  # Implementations.
  require 'graph-rank/keywords'

end
@@ -0,0 +1,39 @@
1
# Implements the TextRank algorithm for unsupervised keyword
# extraction: every remaining word of the text is a graph node and
# words that occur close to each other are linked.
module GraphRank
  class Keywords < TextRank

    # Split the cleaned text on single spaces.
    #
    # @return [Array<String>] the words of @text, in order.
    def get_features
      @text.split(' ')
    end

    # Remove short words, then stop words, from @features (in place).
    #
    # @return [Array<String>] the filtered @features array.
    def filter_features
      remove_short_words
      remove_stop_words
    end

    # Remove every word that appears in the configured stop-word list.
    def remove_stop_words
      @features.delete_if { |word| @stop_words.include?(word) }
    end

    # Remove 1 and 2 char words.
    def remove_short_words
      @features.delete_if { |word| word.length < 3 }
    end

    # Build the co-occurrence graph: each word is linked to the words in
    # the window [i - ngram_size, i + ngram_size - 1] around its position.
    #
    # The window is clamped to valid indices. A negative index must never
    # reach Array#[], because Ruby would count it from the END of the array
    # and silently link the first words of the text to the last ones.
    #
    # @return [Array<String>] the @features array.
    def build_graph
      last_index = @features.size - 1
      @features.each_with_index do |feature, i|
        window_start = [i - @ngram_size, 0].max
        window_end = [i + @ngram_size - 1, last_index].min
        (window_start..window_end).each do |j|
          @ranking.add(feature, @features[j]) unless j == i
        end
      end
    end

  end
end
@@ -0,0 +1,68 @@
1
# Brin, S.; Page, L. (1998). "The anatomy of
# a large-scale hypertextual Web search engine".
# Computer Networks and ISDN Systems 30: 107-117.
module GraphRank
  # Iterative PageRank over a directed graph that is built edge by edge
  # with #add and ranked with #calculate.
  class PageRank

    # Score given to a node whenever an edge touching it is added.
    INITIAL_SCORE = 0.15

    # Initialize with default damping and convergence.
    #
    # @param damping [Numeric, nil] damping factor in (0, 1]; 0.85 when nil.
    # @param convergence [Numeric, nil] convergence threshold in (0, 1];
    #   0.01 when nil. Zero is rejected because the check in #convergence
    #   is a strict "<", so a threshold of 0 could never be met and
    #   #calculate would loop forever.
    # @raise [RuntimeError] when either value is out of range.
    def initialize(damping=nil, convergence=nil)
      damping ||= 0.85
      convergence ||= 0.01
      if damping <= 0 || damping > 1
        raise 'Invalid damping factor.'
      elsif convergence <= 0 || convergence > 1
        raise 'Invalid convergence factor.'
      end
      # Store floats so that an Integer damping of 1 does not turn
      # `@damping / @nodes.size` into a truncating integer division.
      @damping = damping.to_f
      @convergence = convergence.to_f
      # @graph:    node => list of nodes linking to it (inbound links)
      # @outlinks: node => number of outbound links
      # @nodes:    node => current score
      @graph, @outlinks, @nodes = {}, {}, {}
    end

    # Add a directed edge source -> dest to the graph. Self-links are
    # ignored; adding the same edge twice counts it twice.
    #
    # @return [false, Float] false for a self-link, otherwise the initial
    #   score assigned to both endpoints.
    def add(source, dest)
      return false if source == dest
      (@graph[dest] ||= []) << source
      @outlinks[source] = @outlinks.fetch(source, 0) + 1
      @nodes[source] = INITIAL_SCORE
      @nodes[dest] = INITIAL_SCORE
    end

    # Iterates the PageRank algorithm until convergence is reached.
    #
    # @return [Array<Array>] [node, score] pairs sorted by descending
    #   score; an empty array when no edge has been added.
    def calculate
      # Without this guard the convergence check divides 0 by 0.
      return [] if @nodes.empty?
      loop do
        new_nodes = iteration
        done = convergence(new_nodes)
        @nodes = new_nodes
        break if done
      end
      @nodes.sort_by { |_node, score| score }.reverse
    end

    private

    # Performs one iteration to calculate the PageRank ranking for
    # all nodes.
    #
    # NOTE(review): the constant term is 1 - damping / N, not the
    # (1 - damping) / N of the normalized PageRank formulation. It is
    # kept unchanged because the scores documented in the README are
    # produced by this expression.
    #
    # @return [Hash] node => new score, containing every known node.
    def iteration
      base = 1 - @damping / @nodes.size
      new_nodes = {}
      @graph.each do |node, links|
        score = links.inject(0.0) do |sum, id|
          sum + @nodes[id] / @outlinks[id]
        end
        new_nodes[node] = base + @damping * score
      end
      # Nodes without inbound links are absent from @graph. They still
      # need a score, otherwise they vanish from the result and the next
      # lookup of their score returns nil.
      @nodes.each_key do |node|
        new_nodes[node] = base unless new_nodes.key?(node)
      end
      new_nodes
    end

    # Check for convergence: the root mean square of the score changes
    # between @nodes and +current+ must be below the threshold.
    #
    # @param current [Hash] node => score produced by the last iteration.
    # @return [Boolean]
    def convergence(current)
      total = @nodes.inject(0.0) do |sum, (node, score)|
        delta = current[node] - score
        sum + delta * delta
      end
      Math.sqrt(total / current.size) < @convergence
    end

  end
end
@@ -0,0 +1,56 @@
1
# R. Mihalcea and P. Tarau, "TextRank: Bringing Order into Texts,"
# in Proceedings of EMNLP 2004. Association for Computational
# Linguistics, 2004, pp. 404-411.
module GraphRank
  # Template for TextRank-style rankers. Subclasses supply
  # #get_features and #build_graph; #run drives the whole pipeline.
  class TextRank

    # Default English stop-word list. Frozen; each instance works on
    # its own copy (see #initialize). "fifty" and "thick" replace the
    # misspelled entries "fify" and "thickv" of the original list, and
    # the duplicated "above" / "the" entries are listed only once.
    StopWords = %w[
      a about above across after afterwards again against all almost
      alone along already also although always am among amongst amoungst
      amount an and another any anyhow anyone anything anyway anywhere
      are around as at back be became because become becomes becoming
      been before beforehand behind being below beside besides between
      beyond bill both bottom but by call can cannot cant co con could
      couldnt cry de describe detail do done down due during each eg
      eight either eleven else elsewhere empty enough etc even ever every
      everyone everything everywhere except few fifteen fifty fill find
      fire first five for former formerly forty found four from front
      full further get give go had has hasnt have he hence her here
      hereafter hereby herein hereupon hers herself him himself his how
      however hundred ie if in inc indeed interest into is it its itself
      keep last latter latterly least less ltd made many may me meanwhile
      might mill mine more moreover most mostly move much must my myself
      name namely neither never nevertheless next nine no nobody none
      noone nor not nothing now nowhere of off often on once one only
      onto or other others otherwise our ours ourselves out over own part
      per perhaps please put rather re same see seem seemed seeming seems
      serious several she should show side since sincere six sixty so
      some somehow someone something sometime sometimes somewhere still
      such system take ten than that the their them themselves then
      thence there thereafter thereby therefore therein thereupon these
      they thick thin third this those though three through throughout
      thru thus to together too top toward towards twelve twenty two un
      under until up upon us very via was we well were what whatever when
      whence whenever where whereafter whereas whereby wherein whereupon
      wherever whether which while whither who whoever whole whom whose
      why will with within without would yet you your yours yourself
      yourselves
    ].freeze

    # Allow configuration of stop words.
    attr_accessor :stop_words

    # Initialize with ngram size, damping and convergence.
    #
    # @param ngram_size [Integer] window size stored for the subclass's
    #   graph construction.
    # @param damping [Numeric, nil] forwarded to GraphRank::PageRank.
    # @param convergence [Numeric, nil] forwarded to GraphRank::PageRank.
    def initialize(ngram_size=3, damping=nil, convergence=nil)
      # The ranking object is created once here and reused by every
      # call to #run.
      @ranking = GraphRank::PageRank.new(damping, convergence)
      # Copy the defaults so that mutating one instance's list
      # (e.g. `stop_words << "foo"`) cannot leak into other instances.
      @stop_words = StopWords.dup
      @ngram_size = ngram_size
    end

    # Clean the text, extract and filter its features, build the graph
    # and return the PageRank ranking. The caller's string is left
    # untouched.
    #
    # @param text [String] the text to rank.
    # @return the value returned by #calculate_ranking.
    def run(text)
      @text = text
      clean_text
      @features = get_features
      filter_features
      build_graph
      calculate_ranking
    end

    # Clean text leaving just letters from a-z separated by single
    # spaces. Builds a new string instead of using the bang methods, so
    # the string passed to #run is not modified and may be frozen.
    def clean_text
      @text = @text.downcase.gsub(/[^a-z ]/, ' ').gsub(/\s+/, ' ')
    end

    # Return the features (keyword, sentence, etc.)
    #
    # @raise [RuntimeError] always; subclasses must override.
    def get_features
      raise 'Must be implemented in subclass.'
    end

    # Return only features of interest. The default keeps them all.
    def filter_features
      @features
    end

    # Build the graph from the features.
    #
    # @raise [RuntimeError] always; subclasses must override.
    def build_graph
      raise 'Must be implemented in subclass.'
    end

    # Calculate the PageRank ranking.
    def calculate_ranking
      @ranking.calculate
    end

  end
end
metadata ADDED
@@ -0,0 +1,51 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: graph-rank
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - Louis Mullie
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2012-12-16 00:00:00.000000000 Z
13
+ dependencies: []
14
+ description: ! ' GraphRank is an implementation of TextRank and PageRank in Ruby. '
15
+ email:
16
+ - louis.mullie@gmail.com
17
+ executables: []
18
+ extensions: []
19
+ extra_rdoc_files: []
20
+ files:
21
+ - lib/graph-rank/keywords.rb
22
+ - lib/graph-rank/page_rank.rb
23
+ - lib/graph-rank/text_rank.rb
24
+ - lib/graph-rank.rb
25
+ - README.md
26
+ - LICENSE
27
+ homepage: https://github.com/louismullie/graphrank
28
+ licenses: []
29
+ post_install_message:
30
+ rdoc_options: []
31
+ require_paths:
32
+ - lib
33
+ required_ruby_version: !ruby/object:Gem::Requirement
34
+ none: false
35
+ requirements:
36
+ - - ! '>='
37
+ - !ruby/object:Gem::Version
38
+ version: '0'
39
+ required_rubygems_version: !ruby/object:Gem::Requirement
40
+ none: false
41
+ requirements:
42
+ - - ! '>='
43
+ - !ruby/object:Gem::Version
44
+ version: '0'
45
+ requirements: []
46
+ rubyforge_project:
47
+ rubygems_version: 1.8.24
48
+ signing_key:
49
+ specification_version: 3
50
+ summary: ! 'GraphRank: bringing TextRank and PageRank to Ruby.'
51
+ test_files: []