graph-rank 0.0.1 → 0.0.2
Sign up to get free protection for your applications and to get access to all the features.
- data/README.md +9 -19
- data/lib/graph-rank.rb +1 -1
- data/lib/graph-rank/keywords.rb +9 -1
- data/lib/graph-rank/page_rank.rb +8 -6
- data/lib/graph-rank/sentences.rb +49 -0
- data/lib/graph-rank/text_rank.rb +1 -9
- metadata +2 -1
data/README.md
CHANGED
@@ -12,6 +12,8 @@ gem install graph-rank
|
|
12
12
|
|
13
13
|
**TextRank**
|
14
14
|
|
15
|
+
> Reference: R. Mihalcea and P. Tarau, “TextRank: Bringing Order into Texts,” in Proceedings of EMNLP 2004. Association for Computational Linguistics, 2004, pp. 404–411.
|
16
|
+
|
15
17
|
```ruby
|
16
18
|
text = 'PageRank is a link analysis algorithm, named after Larry ' +
|
17
19
|
'Page and used by the Google Internet search engine, that assigns ' +
|
@@ -21,31 +23,20 @@ text = 'PageRank is a link analysis algorithm, named after Larry ' +
|
|
21
23
|
|
22
24
|
tr = GraphRank::Keywords.new
|
23
25
|
|
24
|
-
tr.run(text)
|
26
|
+
tr.run(text).inspect
|
25
27
|
|
26
28
|
```
|
27
29
|
|
28
30
|
Optionally, you can pass the n-gram size (default = 3), as well as the damping and convergence (see PageRank) to the constructor. Finally, you can set stop words as follows:
|
29
31
|
|
30
32
|
```ruby
|
31
|
-
|
33
|
+
tr.stop_words = ["word", "another", "etc"]
|
32
34
|
```
|
33
35
|
|
34
|
-
The default stop word list is as follows:
|
35
|
-
|
36
|
-
"about","also","are","away","because",
|
37
|
-
"been","beside","besides","between","but","cannot",
|
38
|
-
"could","did","etc","even","ever","every","for","had",
|
39
|
-
"have","how","into","isn","maybe","non","nor","now",
|
40
|
-
"should","such","than","that","then","these","this",
|
41
|
-
"those","though","too","was","wasn","were","what","when",
|
42
|
-
"where","which","while","who","whom","whose","will",
|
43
|
-
"with","would","wouldn","yes"
|
44
|
-
|
45
|
-
> Reference: R. Mihalcea and P. Tarau, “TextRank: Bringing Order into Texts,” in Proceedings of EMNLP 2004. Association for Computational Linguistics, 2004, pp. 404–411.
|
46
|
-
|
47
36
|
**PageRank**
|
48
37
|
|
38
|
+
> Reference: Brin, S.; Page, L. (1998). "The anatomy of a large-scale hypertextual Web search engine". Computer Networks and ISDN Systems 30: 107–117.
|
39
|
+
|
49
40
|
```ruby
|
50
41
|
|
51
42
|
pr = GraphRank::PageRank.new
|
@@ -60,12 +51,11 @@ pr.add(1,3)
|
|
60
51
|
pr.add(3,1)
|
61
52
|
pr.add(5,1)
|
62
53
|
|
63
|
-
pr.calculate
|
54
|
+
puts pr.calculate.inspect
|
55
|
+
|
64
56
|
# => [[1, 5.99497754810465], [3, 2.694723988738302],
|
65
57
|
# [5, 2.694723988738302], [4, 2.100731029131304],
|
66
58
|
# [2, 2.100731029131304]]
|
67
59
|
```
|
68
60
|
|
69
|
-
Optionally, you can pass the damping factor (default = 0.85) and the convergence criterion (default = 0.01) as parameters to the PageRank constructor.
|
70
|
-
|
71
|
-
> Reference: Brin, S.; Page, L. (1998). "The anatomy of a large-scale hypertextual Web search engine". Computer Networks and ISDN Systems 30: 107–117.
|
61
|
+
Optionally, you can pass the damping factor (default = 0.85) and the convergence criterion (default = 0.01) as parameters to the PageRank constructor. Additionally, you can pass in an edge weight parameter to `#add` and it will be used in the PageRank calculation.
|
data/lib/graph-rank.rb
CHANGED
data/lib/graph-rank/keywords.rb
CHANGED
@@ -4,7 +4,8 @@ class GraphRank::Keywords < GraphRank::TextRank
|
|
4
4
|
|
5
5
|
# Split the text on words.
|
6
6
|
def get_features
|
7
|
-
|
7
|
+
clean_text
|
8
|
+
@features = @text.split(' ')
|
8
9
|
end
|
9
10
|
|
10
11
|
# Remove short and stop words.
|
@@ -13,6 +14,13 @@ class GraphRank::Keywords < GraphRank::TextRank
|
|
13
14
|
remove_stop_words
|
14
15
|
end
|
15
16
|
|
17
|
+
# Clean text leaving just letters from a-z.
|
18
|
+
def clean_text
|
19
|
+
@text.downcase!
|
20
|
+
@text.gsub!(/[^a-z ]/, ' ')
|
21
|
+
@text.gsub!(/\s+/, " ")
|
22
|
+
end
|
23
|
+
|
16
24
|
# Remove all stop words.
|
17
25
|
def remove_stop_words
|
18
26
|
@features.delete_if { |word| @stop_words.include?(word) }
|
data/lib/graph-rank/page_rank.rb
CHANGED
@@ -12,18 +12,20 @@ class GraphRank::PageRank
|
|
12
12
|
raise 'Invalid convergence factor.'
|
13
13
|
end
|
14
14
|
@damping, @convergence = damping, convergence
|
15
|
-
@graph, @outlinks, @nodes = {}, {}, {}
|
15
|
+
@graph, @outlinks, @nodes, @weights = {}, {}, {}, {}
|
16
16
|
end
|
17
17
|
|
18
18
|
# Add a node to the graph.
|
19
|
-
def add(source, dest)
|
19
|
+
def add(source, dest, weight=1.0)
|
20
20
|
return false if source == dest
|
21
|
-
@outlinks[source] ||= 0
|
21
|
+
@outlinks[source] ||= 0.0
|
22
22
|
@graph[dest] ||= []
|
23
23
|
@graph[dest] << source
|
24
|
-
@outlinks[source] += 1
|
24
|
+
@outlinks[source] += 1.0
|
25
25
|
@nodes[source] = 0.15
|
26
26
|
@nodes[dest] = 0.15
|
27
|
+
@weights[source] ||= {}
|
28
|
+
@weights[source][dest] = weight
|
27
29
|
end
|
28
30
|
|
29
31
|
# Iterates the PageRank algorithm
|
@@ -46,7 +48,7 @@ class GraphRank::PageRank
|
|
46
48
|
new_nodes = {}
|
47
49
|
@graph.each do |node,links|
|
48
50
|
score = links.map do |id|
|
49
|
-
@nodes[id] / @outlinks[id]
|
51
|
+
@nodes[id] / @outlinks[id] * @weights[id][node]
|
50
52
|
end.inject(:+)
|
51
53
|
new_nodes[node] = (1-@damping/
|
52
54
|
@nodes.size) + @damping * score
|
@@ -60,7 +62,7 @@ class GraphRank::PageRank
|
|
60
62
|
@nodes.each do |k,v|
|
61
63
|
diff[k] = current[k] - @nodes[k]
|
62
64
|
end
|
63
|
-
total = 0
|
65
|
+
total = 0.0
|
64
66
|
diff.each { |k,v| total += diff[k] * v }
|
65
67
|
Math.sqrt(total/current.size) < @convergence
|
66
68
|
end
|
@@ -0,0 +1,49 @@
|
|
1
|
+
require 'graph-rank'
|
2
|
+
# Implement the PageRank algorithm
|
3
|
+
# for unsupervised sentence extraction.
|
4
|
+
class GraphRank::Sentences < GraphRank::TextRank
|
5
|
+
|
6
|
+
begin
|
7
|
+
require 'treat'
|
8
|
+
include Treat::Core::DSL
|
9
|
+
rescue
|
10
|
+
puts "GraphRank::Sentences requires the treat " +
|
11
|
+
"gem to work. Please run `gem install treat`."
|
12
|
+
end
|
13
|
+
|
14
|
+
# Stem stop words!
|
15
|
+
def get_features
|
16
|
+
@section = section(@text)
|
17
|
+
.apply(:chunk,:segment,:tokenize)
|
18
|
+
@features = @section.groups
|
19
|
+
end
|
20
|
+
|
21
|
+
# Build the co-occurrence graph for an n-gram.
|
22
|
+
def build_graph
|
23
|
+
@features.each do |grp|
|
24
|
+
wc = grp.word_count
|
25
|
+
@features.each do |grp2|
|
26
|
+
wc2 = grp2.word_count
|
27
|
+
score = 0.0
|
28
|
+
grp.each_word do |wrd|
|
29
|
+
next if @stop_words.include?(wrd.to_s)
|
30
|
+
grp2.each_word do |wrd2|
|
31
|
+
score += 1 if wrd.stem == wrd2.stem
|
32
|
+
end
|
33
|
+
end
|
34
|
+
score /= (Math.log(wc) + Math.log(wc2))
|
35
|
+
@ranking.add(grp.id, grp2.id, score)
|
36
|
+
end
|
37
|
+
end
|
38
|
+
end
|
39
|
+
|
40
|
+
def run(text,n=4)
|
41
|
+
rankings = super(text)
|
42
|
+
rankings = rankings[0..n].map { |x|x[0] }
|
43
|
+
@section.groups.select do |grp|
|
44
|
+
rankings.include?(grp.id)
|
45
|
+
end.map(&:to_s)
|
46
|
+
end
|
47
|
+
|
48
|
+
|
49
|
+
end
|
data/lib/graph-rank/text_rank.rb
CHANGED
@@ -19,20 +19,12 @@ class GraphRank::TextRank
|
|
19
19
|
# Add text and return PageRank.
|
20
20
|
def run(text)
|
21
21
|
@text = text
|
22
|
-
|
23
|
-
@features = get_features
|
22
|
+
get_features
|
24
23
|
filter_features
|
25
24
|
build_graph
|
26
25
|
calculate_ranking
|
27
26
|
end
|
28
27
|
|
29
|
-
# Clean text leaving just letters from a-z.
|
30
|
-
def clean_text
|
31
|
-
@text.downcase!
|
32
|
-
@text.gsub!(/[^a-z ]/, ' ')
|
33
|
-
@text.gsub!(/\s+/, " ")
|
34
|
-
end
|
35
|
-
|
36
28
|
# Return the features (keyword, sentence, etc.)
|
37
29
|
def get_features
|
38
30
|
raise 'Must be implemented in subclass.'
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: graph-rank
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.1
|
4
|
+
version: 0.0.2
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -20,6 +20,7 @@ extra_rdoc_files: []
|
|
20
20
|
files:
|
21
21
|
- lib/graph-rank/keywords.rb
|
22
22
|
- lib/graph-rank/page_rank.rb
|
23
|
+
- lib/graph-rank/sentences.rb
|
23
24
|
- lib/graph-rank/text_rank.rb
|
24
25
|
- lib/graph-rank.rb
|
25
26
|
- README.md
|