graph-rank 0.0.1 → 0.0.2

Sign up to get free protection for your applications and to get access to all the features.
data/README.md CHANGED
@@ -12,6 +12,8 @@ gem install graph-rank
12
12
 
13
13
  **TextRank**
14
14
 
15
+ > Reference: R. Mihalcea and P. Tarau, “TextRank: Bringing Order into Texts,” in Proceedings of EMNLP 2004. Association for Computational Linguistics, 2004, pp. 404–411.
16
+
15
17
  ```ruby
16
18
  text = 'PageRank is a link analysis algorithm, named after Larry ' +
17
19
  'Page and used by the Google Internet search engine, that assigns ' +
@@ -21,31 +23,20 @@ text = 'PageRank is a link analysis algorithm, named after Larry ' +
21
23
 
22
24
  tr = GraphRank::Keywords.new
23
25
 
24
- tr.run(text)
26
+ tr.run(text).inspect
25
27
 
26
28
  ```
27
29
 
28
30
  Optionally, you can pass the n-gram size (default = 3), as well as the damping and convergence (see PageRank) to the constructor. Finally, you can set stop words as follows:
29
31
 
30
32
  ```ruby
31
- t.stop_words = ["word", "another", "etc"]
33
+ tr.stop_words = ["word", "another", "etc"]
32
34
  ```
33
35
 
34
- The default stop word list is as follows:
35
-
36
- "about","also","are","away","because",
37
- "been","beside","besides","between","but","cannot",
38
- "could","did","etc","even","ever","every","for","had",
39
- "have","how","into","isn","maybe","non","nor","now",
40
- "should","such","than","that","then","these","this",
41
- "those","though","too","was","wasn","were","what","when",
42
- "where","which","while","who","whom","whose","will",
43
- "with","would","wouldn","yes"
44
-
45
- > Reference: R. Mihalcea and P. Tarau, “TextRank: Bringing Order into Texts,” in Proceedings of EMNLP 2004. Association for Computational Linguistics, 2004, pp. 404–411.
46
-
47
36
  **PageRank**
48
37
 
38
+ > Reference: Brin, S.; Page, L. (1998). "The anatomy of a large-scale hypertextual Web search engine". Computer Networks and ISDN Systems 30: 107–117.
39
+
49
40
  ```ruby
50
41
 
51
42
  pr = GraphRank::PageRank.new
@@ -60,12 +51,11 @@ pr.add(1,3)
60
51
  pr.add(3,1)
61
52
  pr.add(5,1)
62
53
 
63
- pr.calculate
54
+ puts pr.calculate.inspect
55
+
64
56
  # => [[1, 5.99497754810465], [3, 2.694723988738302],
65
57
  # [5, 2.694723988738302], [4, 2.100731029131304],
66
58
  # [2, 2.100731029131304]]
67
59
  ```
68
60
 
69
- Optionally, you can pass the damping factor (default = 0.85) and the convergence criterion (default = 0.01) as parameters to the PageRank constructor.
70
-
71
- > Reference: Brin, S.; Page, L. (1998). "The anatomy of a large-scale hypertextual Web search engine". Computer Networks and ISDN Systems 30: 107–117.
61
+ Optionally, you can pass the damping factor (default = 0.85) and the convergence criterion (default = 0.01) as parameters to the PageRank constructor. Additionally, you can pass an edge weight (default = 1.0) as an optional third argument to `#add`; it will be used in the PageRank calculation.
@@ -1,7 +1,7 @@
1
1
  module GraphRank
2
2
 
3
3
  # Version number.
4
- VERSION = '0.0.1'
4
+ VERSION = '0.0.2'
5
5
 
6
6
  # Core classes.
7
7
  require 'graph-rank/page_rank'
@@ -4,7 +4,8 @@ class GraphRank::Keywords < GraphRank::TextRank
4
4
 
5
5
  # Split the text on words.
6
6
  def get_features
7
- @text.split(' ')
7
+ clean_text
8
+ @features = @text.split(' ')
8
9
  end
9
10
 
10
11
  # Remove short and stop words.
@@ -13,6 +14,13 @@ class GraphRank::Keywords < GraphRank::TextRank
13
14
  remove_stop_words
14
15
  end
15
16
 
17
# Normalise @text for keyword extraction: lowercase it, turn every
# character other than a-z or a space into a space, then collapse each
# run of whitespace into a single space.
#
# A new string is assigned to @text instead of editing it in place, so
# the string object handed in by the caller is left untouched and frozen
# strings (e.g. literals under `frozen_string_literal: true`) no longer
# raise.
#
# Returns the cleaned text.
def clean_text
  @text = @text.downcase.gsub(/[^a-z ]/, ' ').gsub(/\s+/, " ")
end
23
+
16
24
  # Remove all stop words.
17
25
  def remove_stop_words
18
26
  @features.delete_if { |word| @stop_words.include?(word) }
@@ -12,18 +12,20 @@ class GraphRank::PageRank
12
12
  raise 'Invalid convergence factor.'
13
13
  end
14
14
  @damping, @convergence = damping, convergence
15
- @graph, @outlinks, @nodes = {}, {}, {} ####
15
+ @graph, @outlinks, @nodes, @weights = {}, {}, {}, {}
16
16
  end
17
17
 
18
18
  # Add a directed edge (source -> dest) to the graph.
19
- def add(source, dest)
19
+ def add(source, dest, weight=1.0)
20
20
  return false if source == dest
21
- @outlinks[source] ||= 0
21
+ @outlinks[source] ||= 0.0
22
22
  @graph[dest] ||= []
23
23
  @graph[dest] << source
24
- @outlinks[source] += 1
24
+ @outlinks[source] += 1.0
25
25
  @nodes[source] = 0.15
26
26
  @nodes[dest] = 0.15
27
+ @weights[source] ||= {}
28
+ @weights[source][dest] = weight
27
29
  end
28
30
 
29
31
  # Iterates the PageRank algorithm
@@ -46,7 +48,7 @@ class GraphRank::PageRank
46
48
  new_nodes = {}
47
49
  @graph.each do |node,links|
48
50
  score = links.map do |id|
49
- @nodes[id] / @outlinks[id]
51
+ @nodes[id] / @outlinks[id] * @weights[id][node]
50
52
  end.inject(:+)
51
53
  new_nodes[node] = (1-@damping/
52
54
  @nodes.size) + @damping * score
@@ -60,7 +62,7 @@ class GraphRank::PageRank
60
62
  @nodes.each do |k,v|
61
63
  diff[k] = current[k] - @nodes[k]
62
64
  end
63
- total = 0
65
+ total = 0.0
64
66
  diff.each { |k,v| total += diff[k] * v }
65
67
  Math.sqrt(total/current.size) < @convergence
66
68
  end
@@ -0,0 +1,49 @@
1
require 'graph-rank'

# Ranks the sentence-level groups of a text with the TextRank machinery
# inherited from GraphRank::TextRank, for unsupervised sentence
# extraction.
class GraphRank::Sentences < GraphRank::TextRank

  begin
    require 'treat'
    include Treat::Core::DSL
  rescue LoadError, StandardError
    # A failed `require` raises LoadError, which descends from ScriptError
    # and is therefore not caught by a bare `rescue`; it has to be named
    # explicitly for the hint below to be printed. StandardError is kept
    # so that a failure of the `include` line is still reported as before.
    puts "GraphRank::Sentences requires the treat " +
    "gem to work. Please run `gem install treat`."
  end

  # Parse @text through the `section` helper, applying the :chunk,
  # :segment and :tokenize steps, and use the resulting groups as the
  # features to rank.
  # NOTE(review): `section` is presumably supplied by Treat::Core::DSL and
  # the groups presumably correspond to sentences - confirm against Treat.
  def get_features
    @section = section(@text)
    .apply(:chunk,:segment,:tokenize)
    @features = @section.groups
  end

  # Build the similarity graph between groups.
  #
  # For every ordered pair of distinct groups, the edge weight is the
  # number of word pairs sharing the same stem (words of the first group
  # that are stop words are ignored), divided by the sum of the logarithms
  # of the two word counts.
  def build_graph
    @features.each do |grp|
      log_wc = Math.log(grp.word_count)
      @features.each do |grp2|
        # A group is never linked to itself, so skip the word comparison.
        # NOTE(review): assumes @ranking is a GraphRank::PageRank, whose
        # #add returns false when source == dest - confirm in TextRank.
        next if grp.id == grp2.id
        normalizer = log_wc + Math.log(grp2.word_count)
        score = 0.0
        # The normaliser is zero for two one-word groups and negative
        # infinity for an empty group; dividing by it would produce NaN or
        # Infinity weights, so such pairs keep a weight of 0.0.
        if normalizer > 0
          grp.each_word do |wrd|
            next if @stop_words.include?(wrd.to_s)
            grp2.each_word do |wrd2|
              score += 1 if wrd.stem == wrd2.stem
            end
          end
          score /= normalizer
        end
        @ranking.add(grp.id, grp2.id, score)
      end
    end
  end

  # Rank the groups of +text+ and return the text of the +n+ best ones
  # (default 4) as an array of strings, in the order in which
  # @section.groups yields them.
  #
  # `first(n)` is used because the inclusive range `[0..n]` selected n + 1
  # entries.
  # NOTE(review): assumes super returns [id, score] pairs ordered
  # best-first - confirm against TextRank#calculate_ranking.
  def run(text,n=4)
    top_ids = super(text).first(n).map { |entry| entry[0] }
    @section.groups.select do |grp|
      top_ids.include?(grp.id)
    end.map(&:to_s)
  end

end
@@ -19,20 +19,12 @@ class GraphRank::TextRank
19
19
  # Add text and return PageRank.
20
20
  def run(text)
21
21
  @text = text
22
- clean_text
23
- @features = get_features
22
+ get_features
24
23
  filter_features
25
24
  build_graph
26
25
  calculate_ranking
27
26
  end
28
27
 
29
- # Clean text leaving just letters from a-z.
30
- def clean_text
31
- @text.downcase!
32
- @text.gsub!(/[^a-z ]/, ' ')
33
- @text.gsub!(/\s+/, " ")
34
- end
35
-
36
28
  # Return the features (keyword, sentence, etc.)
37
29
  def get_features
38
30
  raise 'Must be implemented in subclass.'
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: graph-rank
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.1
4
+ version: 0.0.2
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -20,6 +20,7 @@ extra_rdoc_files: []
20
20
  files:
21
21
  - lib/graph-rank/keywords.rb
22
22
  - lib/graph-rank/page_rank.rb
23
+ - lib/graph-rank/sentences.rb
23
24
  - lib/graph-rank/text_rank.rb
24
25
  - lib/graph-rank.rb
25
26
  - README.md