graph-rank 0.0.1 → 0.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.md +9 -19
- data/lib/graph-rank.rb +1 -1
- data/lib/graph-rank/keywords.rb +9 -1
- data/lib/graph-rank/page_rank.rb +8 -6
- data/lib/graph-rank/sentences.rb +49 -0
- data/lib/graph-rank/text_rank.rb +1 -9
- metadata +2 -1
data/README.md
CHANGED
@@ -12,6 +12,8 @@ gem install graph-rank
|
|
12
12
|
|
13
13
|
**TextRank**
|
14
14
|
|
15
|
+
> Reference: R. Mihalcea and P. Tarau, “TextRank: Bringing Order into Texts,” in Proceedings of EMNLP 2004. Association for Computational Linguistics, 2004, pp. 404–411.
|
16
|
+
|
15
17
|
```ruby
|
16
18
|
text = 'PageRank is a link analysis algorithm, named after Larry ' +
|
17
19
|
'Page and used by the Google Internet search engine, that assigns ' +
|
@@ -21,31 +23,20 @@ text = 'PageRank is a link analysis algorithm, named after Larry ' +
|
|
21
23
|
|
22
24
|
tr = GraphRank::Keywords.new
|
23
25
|
|
24
|
-
tr.run(text)
|
26
|
+
tr.run(text).inspect
|
25
27
|
|
26
28
|
```
|
27
29
|
|
28
30
|
Optionally, you can pass the n-gram size (default = 3), as well as the damping and convergence (see PageRank) to the constructor. Finally, you can set stop words as follows:
|
29
31
|
|
30
32
|
```ruby
|
31
|
-
|
33
|
+
tr.stop_words = ["word", "another", "etc"]
|
32
34
|
```
|
33
35
|
|
34
|
-
The default stop word list is as follows:
|
35
|
-
|
36
|
-
"about","also","are","away","because",
|
37
|
-
"been","beside","besides","between","but","cannot",
|
38
|
-
"could","did","etc","even","ever","every","for","had",
|
39
|
-
"have","how","into","isn","maybe","non","nor","now",
|
40
|
-
"should","such","than","that","then","these","this",
|
41
|
-
"those","though","too","was","wasn","were","what","when",
|
42
|
-
"where","which","while","who","whom","whose","will",
|
43
|
-
"with","would","wouldn","yes"
|
44
|
-
|
45
|
-
> Reference: R. Mihalcea and P. Tarau, “TextRank: Bringing Order into Texts,” in Proceedings of EMNLP 2004. Association for Computational Linguistics, 2004, pp. 404–411.
|
46
|
-
|
47
36
|
**PageRank**
|
48
37
|
|
38
|
+
> Reference: Brin, S.; Page, L. (1998). "The anatomy of a large-scale hypertextual Web search engine". Computer Networks and ISDN Systems 30: 107–117.
|
39
|
+
|
49
40
|
```ruby
|
50
41
|
|
51
42
|
pr = GraphRank::PageRank.new
|
@@ -60,12 +51,11 @@ pr.add(1,3)
|
|
60
51
|
pr.add(3,1)
|
61
52
|
pr.add(5,1)
|
62
53
|
|
63
|
-
pr.calculate
|
54
|
+
puts pr.calculate.inspect
|
55
|
+
|
64
56
|
# => [[1, 5.99497754810465], [3, 2.694723988738302],
|
65
57
|
# [5, 2.694723988738302], [4, 2.100731029131304],
|
66
58
|
# [2, 2.100731029131304]]
|
67
59
|
```
|
68
60
|
|
69
|
-
Optionally, you can pass the damping factor (default = 0.85) and the convergence criterion (default = 0.01) as parameters to the PageRank constructor.
|
70
|
-
|
71
|
-
> Reference: Brin, S.; Page, L. (1998). "The anatomy of a large-scale hypertextual Web search engine". Computer Networks and ISDN Systems 30: 107–117.
|
61
|
+
Optionally, you can pass the damping factor (default = 0.85) and the convergence criterion (default = 0.01) as parameters to the PageRank constructor. Additionally, you can pass in an edge weight parameter to `#add` and it will be used in the PageRank calculation.
|
data/lib/graph-rank.rb
CHANGED
data/lib/graph-rank/keywords.rb
CHANGED
@@ -4,7 +4,8 @@ class GraphRank::Keywords < GraphRank::TextRank
|
|
4
4
|
|
5
5
|
# Split the text on words.
|
6
6
|
def get_features
|
7
|
-
|
7
|
+
clean_text
|
8
|
+
@features = @text.split(' ')
|
8
9
|
end
|
9
10
|
|
10
11
|
# Remove short and stop words.
|
@@ -13,6 +14,13 @@ class GraphRank::Keywords < GraphRank::TextRank
|
|
13
14
|
remove_stop_words
|
14
15
|
end
|
15
16
|
|
17
|
+
# Clean text leaving just letters from a-z.
|
18
|
+
def clean_text
|
19
|
+
@text.downcase!
|
20
|
+
@text.gsub!(/[^a-z ]/, ' ')
|
21
|
+
@text.gsub!(/\s+/, " ")
|
22
|
+
end
|
23
|
+
|
16
24
|
# Remove all stop words.
|
17
25
|
def remove_stop_words
|
18
26
|
@features.delete_if { |word| @stop_words.include?(word) }
|
data/lib/graph-rank/page_rank.rb
CHANGED
@@ -12,18 +12,20 @@ class GraphRank::PageRank
|
|
12
12
|
raise 'Invalid convergence factor.'
|
13
13
|
end
|
14
14
|
@damping, @convergence = damping, convergence
|
15
|
-
@graph, @outlinks, @nodes = {}, {}, {}
|
15
|
+
@graph, @outlinks, @nodes, @weights = {}, {}, {}, {}
|
16
16
|
end
|
17
17
|
|
18
18
|
# Add a node to the graph.
|
19
|
-
def add(source, dest)
|
19
|
+
def add(source, dest, weight=1.0)
|
20
20
|
return false if source == dest
|
21
|
-
@outlinks[source] ||= 0
|
21
|
+
@outlinks[source] ||= 0.0
|
22
22
|
@graph[dest] ||= []
|
23
23
|
@graph[dest] << source
|
24
|
-
@outlinks[source] += 1
|
24
|
+
@outlinks[source] += 1.0
|
25
25
|
@nodes[source] = 0.15
|
26
26
|
@nodes[dest] = 0.15
|
27
|
+
@weights[source] ||= {}
|
28
|
+
@weights[source][dest] = weight
|
27
29
|
end
|
28
30
|
|
29
31
|
# Iterates the PageRank algorithm
|
@@ -46,7 +48,7 @@ class GraphRank::PageRank
|
|
46
48
|
new_nodes = {}
|
47
49
|
@graph.each do |node,links|
|
48
50
|
score = links.map do |id|
|
49
|
-
@nodes[id] / @outlinks[id]
|
51
|
+
@nodes[id] / @outlinks[id] * @weights[id][node]
|
50
52
|
end.inject(:+)
|
51
53
|
new_nodes[node] = (1-@damping/
|
52
54
|
@nodes.size) + @damping * score
|
@@ -60,7 +62,7 @@ class GraphRank::PageRank
|
|
60
62
|
@nodes.each do |k,v|
|
61
63
|
diff[k] = current[k] - @nodes[k]
|
62
64
|
end
|
63
|
-
total = 0
|
65
|
+
total = 0.0
|
64
66
|
diff.each { |k,v| total += diff[k] * v }
|
65
67
|
Math.sqrt(total/current.size) < @convergence
|
66
68
|
end
|
@@ -0,0 +1,49 @@
|
|
1
|
+
require 'graph-rank'
|
2
|
+
# Implement the PageRank algorithm
|
3
|
+
# for unsupervised sentence extraction.
|
4
|
+
class GraphRank::Sentences < GraphRank::TextRank
|
5
|
+
|
6
|
+
begin
|
7
|
+
require 'treat'
|
8
|
+
include Treat::Core::DSL
|
9
|
+
rescue
|
10
|
+
puts "GraphRank::Sentences requires the treat " +
|
11
|
+
"gem to work. Please run `gem install treat`."
|
12
|
+
end
|
13
|
+
|
14
|
+
# Stem stop words!
|
15
|
+
def get_features
|
16
|
+
@section = section(@text)
|
17
|
+
.apply(:chunk,:segment,:tokenize)
|
18
|
+
@features = @section.groups
|
19
|
+
end
|
20
|
+
|
21
|
+
# Build the co-occurence graph for an n-gram.
|
22
|
+
def build_graph
|
23
|
+
@features.each do |grp|
|
24
|
+
wc = grp.word_count
|
25
|
+
@features.each do |grp2|
|
26
|
+
wc2 = grp2.word_count
|
27
|
+
score = 0.0
|
28
|
+
grp.each_word do |wrd|
|
29
|
+
next if @stop_words.include?(wrd.to_s)
|
30
|
+
grp2.each_word do |wrd2|
|
31
|
+
score += 1 if wrd.stem == wrd2.stem
|
32
|
+
end
|
33
|
+
end
|
34
|
+
score /= (Math.log(wc) + Math.log(wc2))
|
35
|
+
@ranking.add(grp.id, grp2.id, score)
|
36
|
+
end
|
37
|
+
end
|
38
|
+
end
|
39
|
+
|
40
|
+
def run(text,n=4)
|
41
|
+
rankings = super(text)
|
42
|
+
rankings = rankings[0..n].map { |x|x[0] }
|
43
|
+
@section.groups.select do |grp|
|
44
|
+
rankings.include?(grp.id)
|
45
|
+
end.map(&:to_s)
|
46
|
+
end
|
47
|
+
|
48
|
+
|
49
|
+
end
|
data/lib/graph-rank/text_rank.rb
CHANGED
@@ -19,20 +19,12 @@ class GraphRank::TextRank
|
|
19
19
|
# Add text and return PageRank.
|
20
20
|
def run(text)
|
21
21
|
@text = text
|
22
|
-
|
23
|
-
@features = get_features
|
22
|
+
get_features
|
24
23
|
filter_features
|
25
24
|
build_graph
|
26
25
|
calculate_ranking
|
27
26
|
end
|
28
27
|
|
29
|
-
# Clean text leaving just letters from a-z.
|
30
|
-
def clean_text
|
31
|
-
@text.downcase!
|
32
|
-
@text.gsub!(/[^a-z ]/, ' ')
|
33
|
-
@text.gsub!(/\s+/, " ")
|
34
|
-
end
|
35
|
-
|
36
28
|
# Return the features (keyword, sentence, etc.)
|
37
29
|
def get_features
|
38
30
|
raise 'Must be implemented in subclass.'
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: graph-rank
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.1
|
4
|
+
version: 0.0.2
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -20,6 +20,7 @@ extra_rdoc_files: []
|
|
20
20
|
files:
|
21
21
|
- lib/graph-rank/keywords.rb
|
22
22
|
- lib/graph-rank/page_rank.rb
|
23
|
+
- lib/graph-rank/sentences.rb
|
23
24
|
- lib/graph-rank/text_rank.rb
|
24
25
|
- lib/graph-rank.rb
|
25
26
|
- README.md
|