graph-rank 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/LICENSE ADDED
@@ -0,0 +1,16 @@
1
+ A Ruby implementation of the TextRank and PageRank algorithms.
2
+
3
+ This program is free software: you can redistribute it and/or modify
4
+ it under the terms of the GNU General Public License as published by
5
+ the Free Software Foundation, either version 3 of the License, or
6
+ (at your option) any later version.
7
+
8
+ This program is distributed in the hope that it will be useful,
9
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
10
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11
+ GNU General Public License for more details.
12
+
13
+ You should have received a copy of the GNU General Public License
14
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
15
+
16
+ Author: Louis-Antoine Mullie (louis.mullie@gmail.com). Copyright 2012.
@@ -0,0 +1,71 @@
1
+ ### About
2
+
3
+ This gem implements a PageRank class and a class that performs keyword ranking using the TextRank algorithm. Both were ported from the [PHP Implementation](https://github.com/crodas/textrank) by @crodas.
4
+
5
+ ### Install
6
+
7
+ ```
8
+ gem install graph-rank
9
+ ```
10
+
11
+ ### Usage
12
+
13
+ **TextRank**
14
+
15
+ ```ruby
16
+ text = 'PageRank is a link analysis algorithm, named after Larry ' +
17
+ 'Page and used by the Google Internet search engine, that assigns ' +
18
+ 'a numerical weighting to each element of a hyperlinked set of ' +
19
+ 'documents, such as the World Wide Web, with the purpose of "measuring" ' +
20
+ 'its relative importance within the set.'
21
+
22
+ tr = GraphRank::Keywords.new
23
+
24
+ tr.run(text)
25
+
26
+ ```
27
+
28
+ Optionally, you can pass the n-gram size (default = 3), as well as the damping and convergence (see PageRank) to the constructor. Finally, you can set stop words as follows:
29
+
30
+ ```ruby
31
+ tr.stop_words = ["word", "another", "etc"]
32
+ ```
33
+
34
+ The default stop word list is as follows:
35
+
36
+ "about","also","are","away","because",
37
+ "been","beside","besides","between","but","cannot",
38
+ "could","did","etc","even","ever","every","for","had",
39
+ "have","how","into","isn","maybe","non","nor","now",
40
+ "should","such","than","that","then","these","this",
41
+ "those","though","too","was","wasn","were","what","when",
42
+ "where","which","while","who","whom","whose","will",
43
+ "with","would","wouldn","yes"
44
+
45
+ > Reference: R. Mihalcea and P. Tarau, “TextRank: Bringing Order into Texts,” in Proceedings of EMNLP 2004. Association for Computational Linguistics, 2004, pp. 404–411.
46
+
47
+ **PageRank**
48
+
49
+ ```ruby
50
+
51
+ pr = GraphRank::PageRank.new
52
+
53
+ pr.add(1,2)
54
+ pr.add(1,4)
55
+ pr.add(1,5)
56
+ pr.add(4,5)
57
+ pr.add(4,1)
58
+ pr.add(4,3)
59
+ pr.add(1,3)
60
+ pr.add(3,1)
61
+ pr.add(5,1)
62
+
63
+ pr.calculate
64
+ # => [[1, 5.99497754810465], [3, 2.694723988738302],
65
+ # [5, 2.694723988738302], [4, 2.100731029131304],
66
+ # [2, 2.100731029131304]]
67
+ ```
68
+
69
+ Optionally, you can pass the damping factor (default = 0.85) and the convergence criterion (default = 0.01) as parameters to the PageRank constructor.
70
+
71
+ > Reference: Brin, S.; Page, L. (1998). "The anatomy of a large-scale hypertextual Web search engine". Computer Networks and ISDN Systems 30: 107–117.
@@ -0,0 +1,13 @@
# Top-level namespace of the graph-rank gem. Requiring this file
# loads the PageRank and TextRank classes and the keyword extractor.
module GraphRank

  # Version number. Frozen so the shared string cannot be mutated.
  VERSION = '0.0.1'.freeze

  # Core classes. They are required from inside the module body, so the
  # GraphRank namespace already exists when those files reference it.
  require 'graph-rank/page_rank'
  require 'graph-rank/text_rank'

  # Implementations (built on top of the core classes above).
  require 'graph-rank/keywords'

end
@@ -0,0 +1,39 @@
# Implement the TextRank algorithm for unsupervised keyword
# extraction: words are the features, and words that occur close
# to each other in the text are linked in the ranking graph.
module GraphRank
  class Keywords < TextRank

    # Split the (already cleaned) text on whitespace.
    # Returns an Array of words.
    def get_features
      @text.split(' ')
    end

    # Remove short words and stop words from @features, in place.
    def filter_features
      remove_short_words
      remove_stop_words
    end

    # Remove every feature that appears in the stop-word list.
    def remove_stop_words
      @features.delete_if { |word| @stop_words.include?(word) }
    end

    # Remove 1 and 2 char words.
    def remove_short_words
      @features.delete_if { |word| word.length < 3 }
    end

    # Build the co-occurrence graph: every word is linked to the words
    # found in a window around it. The window reaches @ngram_size
    # positions back and @ngram_size - 1 positions ahead.
    def build_graph
      @features.each_with_index do |feature, i|
        # Clamp to the array bounds. A negative index would otherwise
        # wrap around and link the first words to the last words of
        # the text.
        first = [i - @ngram_size, 0].max
        last  = [i + @ngram_size, @features.size].min - 1
        (first..last).each do |j|
          next if j == i
          @ranking.add(feature, @features[j])
        end
      end
    end

  end
end
@@ -0,0 +1,68 @@
# Brin, S.; Page, L. (1998). "The anatomy of
# a large-scale hypertextual Web search engine".
# Computer Networks and ISDN Systems 30: 107–117.
module GraphRank
  # Iterative PageRank over a directed graph that is built one edge
  # at a time with #add. Nodes can be any objects usable as Hash keys.
  class PageRank

    # Initialize with default damping and convergence.
    #
    # damping     - damping factor, 0 < damping <= 1 (default 0.85).
    # convergence - stop threshold, 0 <= convergence <= 1 (default 0.01).
    #
    # Raises RuntimeError if either value is out of range.
    def initialize(damping = nil, convergence = nil)
      damping ||= 0.85
      convergence ||= 0.01
      raise 'Invalid damping factor.' if damping <= 0 || damping > 1
      raise 'Invalid convergence factor.' if convergence < 0 || convergence > 1

      @damping = damping
      @convergence = convergence
      @graph = {}     # dest node   => Array of source nodes linking to it
      @outlinks = {}  # source node => number of outgoing links
      @nodes = {}     # node        => current score
    end

    # Add a directed edge from source to dest. Both nodes are (re)set
    # to the initial score of 0.15. Self-links are ignored.
    #
    # Returns false if source == dest.
    def add(source, dest)
      return false if source == dest

      @outlinks[source] ||= 0
      @graph[dest] ||= []
      @graph[dest] << source
      @outlinks[source] += 1
      @nodes[source] = 0.15
      @nodes[dest] = 0.15
    end

    # Iterates the PageRank algorithm
    # until convergence is reached.
    #
    # Returns an Array of [node, score] pairs sorted by descending
    # score, or an empty Array if no edge has been added.
    def calculate
      # Without nodes there is nothing to rank, and the convergence
      # check would divide by zero.
      return [] if @nodes.empty?

      done = false
      until done
        new_nodes = iteration
        done = convergence(new_nodes)
        @nodes = new_nodes
      end
      @nodes.sort_by { |_node, score| score }.reverse
    end

    private

    # Performs one iteration to calculate
    # the PageRank ranking for all nodes.
    #
    # Returns a new Hash of node => score covering every known node.
    def iteration
      # NOTE(review): this evaluates as 1 - (damping / N); the usual
      # normalized formula is (1 - damping) / N. Left unchanged because
      # it affects every returned score -- confirm the intent.
      base = 1 - @damping / @nodes.size
      new_nodes = {}
      @graph.each do |node, links|
        score = links.map do |id|
          @nodes[id] / @outlinks[id]
        end.inject(:+)
        new_nodes[node] = base + @damping * score
      end
      # Nodes without any inbound link are absent from @graph. Keep
      # them with the base score so they are neither dropped from the
      # ranking nor missing when their outgoing links are scored.
      @nodes.each_key do |node|
        new_nodes[node] = base unless new_nodes.key?(node)
      end
      new_nodes
    end

    # Check for convergence: true when the root mean square of the
    # score changes between @nodes and current is below the threshold.
    def convergence(current)
      total = 0
      @nodes.each do |node, score|
        delta = current[node] - score
        total += delta * delta
      end
      Math.sqrt(total / current.size) < @convergence
    end

  end
end
@@ -0,0 +1,56 @@
# R. Mihalcea and P. Tarau, “TextRank: Bringing Order into Texts,”
# in Proceedings of EMNLP 2004. Association for Computational
# Linguistics, 2004, pp. 404–411.
module GraphRank
  # Base class for TextRank rankers. Subclasses supply the features
  # (#get_features) and the graph linking them (#build_graph); the
  # ranking itself is delegated to GraphRank::PageRank.
  class TextRank

    # Default English stop-word list.
    StopWords = %w[
      a about above across after afterwards again against all almost
      alone along already also although always am among amongst amoungst
      amount an and another any anyhow anyone anything anyway anywhere
      are around as at back be became because become becomes becoming
      been before beforehand behind being below beside besides between
      beyond bill both bottom but by call can cannot cant co con could
      couldnt cry de describe detail do done down due during each eg
      eight either eleven else elsewhere empty enough etc even ever
      every everyone everything everywhere except few fifteen fifty fill
      find fire first five for former formerly forty found four from
      front full further get give go had has hasnt have he hence her
      here hereafter hereby herein hereupon hers herself him himself his
      how however hundred ie if in inc indeed interest into is it its
      itself keep last latter latterly least less ltd made many may me
      meanwhile might mill mine more moreover most mostly move much must
      my myself name namely neither never nevertheless next nine no
      nobody none noone nor not nothing now nowhere of off often on once
      one only onto or other others otherwise our ours ourselves out
      over own part per perhaps please put rather re same see seem
      seemed seeming seems serious several she should show side since
      sincere six sixty so some somehow someone something sometime
      sometimes somewhere still such system take ten than that the their
      them themselves then thence there thereafter thereby therefore
      therein thereupon these they thick thin third this those though
      three through throughout thru thus to together too top toward
      towards twelve twenty two un under until up upon us very via was
      we well were what whatever when whence whenever where whereafter
      whereas whereby wherein whereupon wherever whether which while
      whither who whoever whole whom whose why will with within without
      would yet you your yours yourself yourselves
    ]

    # Allow configuration of stop words.
    attr_accessor :stop_words

    # Initialize with ngram size, damping and convergence.
    #
    # ngram_size  - stored for subclasses to use when building the graph.
    # damping     - passed through to GraphRank::PageRank.
    # convergence - passed through to GraphRank::PageRank.
    def initialize(ngram_size = 3, damping = nil, convergence = nil)
      @ranking = GraphRank::PageRank.new(damping, convergence)
      # Per-instance copy, so changing one ranker's list in place does
      # not alter the shared default.
      @stop_words = StopWords.dup
      @ngram_size = ngram_size
    end

    # Add text and return PageRank.
    #
    # text - the String to analyze; it is not modified.
    #
    # Returns whatever #calculate_ranking returns.
    def run(text)
      @text = text
      clean_text
      @features = get_features
      filter_features
      build_graph
      calculate_ranking
    end

    # Clean text leaving just letters from a-z, separated by single
    # spaces. Builds a new string, so the object handed to #run is left
    # untouched and frozen strings are accepted.
    def clean_text
      @text = @text.downcase.gsub(/[^a-z ]/, ' ').gsub(/\s+/, ' ')
    end

    # Return the features (keyword, sentence, etc.)
    def get_features
      raise 'Must be implemented in subclass.'
    end

    # Return only features of interest. The base class keeps them all.
    def filter_features
      @features
    end

    # Build the graph from the features.
    def build_graph
      raise 'Must be implemented in subclass.'
    end

    # Calculate the PageRank ranking.
    def calculate_ranking
      @ranking.calculate
    end

  end
end
metadata ADDED
@@ -0,0 +1,51 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: graph-rank
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - Louis Mullie
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2012-12-16 00:00:00.000000000 Z
13
+ dependencies: []
14
+ description: ! ' GraphRank is an implementation of TextRank and PageRank in Ruby. '
15
+ email:
16
+ - louis.mullie@gmail.com
17
+ executables: []
18
+ extensions: []
19
+ extra_rdoc_files: []
20
+ files:
21
+ - lib/graph-rank/keywords.rb
22
+ - lib/graph-rank/page_rank.rb
23
+ - lib/graph-rank/text_rank.rb
24
+ - lib/graph-rank.rb
25
+ - README.md
26
+ - LICENSE
27
+ homepage: https://github.com/louismullie/graphrank
28
+ licenses: []
29
+ post_install_message:
30
+ rdoc_options: []
31
+ require_paths:
32
+ - lib
33
+ required_ruby_version: !ruby/object:Gem::Requirement
34
+ none: false
35
+ requirements:
36
+ - - ! '>='
37
+ - !ruby/object:Gem::Version
38
+ version: '0'
39
+ required_rubygems_version: !ruby/object:Gem::Requirement
40
+ none: false
41
+ requirements:
42
+ - - ! '>='
43
+ - !ruby/object:Gem::Version
44
+ version: '0'
45
+ requirements: []
46
+ rubyforge_project:
47
+ rubygems_version: 1.8.24
48
+ signing_key:
49
+ specification_version: 3
50
+ summary: ! 'GraphRank: bringing TextRank and PageRank to Ruby.'
51
+ test_files: []