graph-rank 0.0.1 → 0.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/README.md CHANGED
@@ -12,6 +12,8 @@ gem install graph-rank
12
12
 
13
13
  **TextRank**
14
14
 
15
+ > Reference: R. Mihalcea and P. Tarau, “TextRank: Bringing Order into Texts,” in Proceedings of EMNLP 2004. Association for Computational Linguistics, 2004, pp. 404–411.
16
+
15
17
  ```ruby
16
18
  text = 'PageRank is a link analysis algorithm, named after Larry ' +
17
19
  'Page and used by the Google Internet search engine, that assigns ' +
@@ -21,31 +23,20 @@ text = 'PageRank is a link analysis algorithm, named after Larry ' +
21
23
 
22
24
  tr = GraphRank::Keywords.new
23
25
 
24
- tr.run(text)
26
+ tr.run(text).inspect
25
27
 
26
28
  ```
27
29
 
28
30
  Optionally, you can pass the n-gram size (default = 3), as well as the damping and convergence (see PageRank) to the constructor. Finally, you can set stop words as follows:
29
31
 
30
32
  ```ruby
31
- t.stop_words = ["word", "another", "etc"]
33
+ tr.stop_words = ["word", "another", "etc"]
32
34
  ```
33
35
 
34
- The default stop word list is as follows:
35
-
36
- "about","also","are","away","because",
37
- "been","beside","besides","between","but","cannot",
38
- "could","did","etc","even","ever","every","for","had",
39
- "have","how","into","isn","maybe","non","nor","now",
40
- "should","such","than","that","then","these","this",
41
- "those","though","too","was","wasn","were","what","when",
42
- "where","which","while","who","whom","whose","will",
43
- "with","would","wouldn","yes"
44
-
45
- > Reference: R. Mihalcea and P. Tarau, “TextRank: Bringing Order into Texts,” in Proceedings of EMNLP 2004. Association for Computational Linguistics, 2004, pp. 404–411.
46
-
47
36
  **PageRank**
48
37
 
38
+ > Reference: Brin, S.; Page, L. (1998). "The anatomy of a large-scale hypertextual Web search engine". Computer Networks and ISDN Systems 30: 107–117.
39
+
49
40
  ```ruby
50
41
 
51
42
  pr = GraphRank::PageRank.new
@@ -60,12 +51,11 @@ pr.add(1,3)
60
51
  pr.add(3,1)
61
52
  pr.add(5,1)
62
53
 
63
- pr.calculate
54
+ puts pr.calculate.inspect
55
+
64
56
  # => [[1, 5.99497754810465], [3, 2.694723988738302],
65
57
  # [5, 2.694723988738302], [4, 2.100731029131304],
66
58
  # [2, 2.100731029131304]]
67
59
  ```
68
60
 
69
- Optionally, you can pass the damping factor (default = 0.85) and the convergence criterion (default = 0.01) as parameters to the PageRank constructor.
70
-
71
- > Reference: Brin, S.; Page, L. (1998). "The anatomy of a large-scale hypertextual Web search engine". Computer Networks and ISDN Systems 30: 107–117.
61
+ Optionally, you can pass the damping factor (default = 0.85) and the convergence criterion (default = 0.01) as parameters to the PageRank constructor. Additionally, you can pass an optional edge weight (default = 1.0) as the third argument to `#add`, and it will be used in the PageRank calculation.
@@ -1,7 +1,7 @@
1
1
  module GraphRank
2
2
 
3
3
  # Version number.
4
- VERSION = '0.0.1'
4
+ VERSION = '0.0.2'
5
5
 
6
6
  # Core classes.
7
7
  require 'graph-rank/page_rank'
@@ -4,7 +4,8 @@ class GraphRank::Keywords < GraphRank::TextRank
4
4
 
5
5
  # Split the text on words.
6
6
  def get_features
7
- @text.split(' ')
7
+ clean_text
8
+ @features = @text.split(' ')
8
9
  end
9
10
 
10
11
  # Remove short and stop words.
@@ -13,6 +14,13 @@ class GraphRank::Keywords < GraphRank::TextRank
13
14
  remove_stop_words
14
15
  end
15
16
 
17
+ # Clean text leaving just letters from a-z.
18
+ def clean_text
19
+ @text.downcase!
20
+ @text.gsub!(/[^a-z ]/, ' ')
21
+ @text.gsub!(/\s+/, " ")
22
+ end
23
+
16
24
  # Remove all stop words.
17
25
  def remove_stop_words
18
26
  @features.delete_if { |word| @stop_words.include?(word) }
@@ -12,18 +12,20 @@ class GraphRank::PageRank
12
12
  raise 'Invalid convergence factor.'
13
13
  end
14
14
  @damping, @convergence = damping, convergence
15
- @graph, @outlinks, @nodes = {}, {}, {} ####
15
+ @graph, @outlinks, @nodes, @weights = {}, {}, {}, {}
16
16
  end
17
17
 
18
18
  # Add a directed edge (source -> dest) to the graph.
19
- def add(source, dest)
19
+ def add(source, dest, weight=1.0)
20
20
  return false if source == dest
21
- @outlinks[source] ||= 0
21
+ @outlinks[source] ||= 0.0
22
22
  @graph[dest] ||= []
23
23
  @graph[dest] << source
24
- @outlinks[source] += 1
24
+ @outlinks[source] += 1.0
25
25
  @nodes[source] = 0.15
26
26
  @nodes[dest] = 0.15
27
+ @weights[source] ||= {}
28
+ @weights[source][dest] = weight
27
29
  end
28
30
 
29
31
  # Iterates the PageRank algorithm
@@ -46,7 +48,7 @@ class GraphRank::PageRank
46
48
  new_nodes = {}
47
49
  @graph.each do |node,links|
48
50
  score = links.map do |id|
49
- @nodes[id] / @outlinks[id]
51
+ @nodes[id] / @outlinks[id] * @weights[id][node]
50
52
  end.inject(:+)
51
53
  new_nodes[node] = (1-@damping/
52
54
  @nodes.size) + @damping * score
@@ -60,7 +62,7 @@ class GraphRank::PageRank
60
62
  @nodes.each do |k,v|
61
63
  diff[k] = current[k] - @nodes[k]
62
64
  end
63
- total = 0
65
+ total = 0.0
64
66
  diff.each { |k,v| total += diff[k] * v }
65
67
  Math.sqrt(total/current.size) < @convergence
66
68
  end
@@ -0,0 +1,49 @@
1
+ require 'graph-rank'
2
+ # Implement the PageRank algorithm
3
+ # for unsupervised sentence extraction.
4
+ class GraphRank::Sentences < GraphRank::TextRank
5
+
6
+ begin
7
+ require 'treat'
8
+ include Treat::Core::DSL
9
+ rescue
10
+ puts "GraphRank::Sentences requires the treat " +
11
+ "gem to work. Please run `gem install treat`."
12
+ end
13
+
14
+ # Chunk, segment and tokenize the text, then use its groups as features.
15
+ def get_features
16
+ @section = section(@text)
17
+ .apply(:chunk,:segment,:tokenize)
18
+ @features = @section.groups
19
+ end
20
+
21
+ # Build the co-occurrence graph for an n-gram.
22
+ def build_graph
23
+ @features.each do |grp|
24
+ wc = grp.word_count
25
+ @features.each do |grp2|
26
+ wc2 = grp2.word_count
27
+ score = 0.0
28
+ grp.each_word do |wrd|
29
+ next if @stop_words.include?(wrd.to_s)
30
+ grp2.each_word do |wrd2|
31
+ score += 1 if wrd.stem == wrd2.stem
32
+ end
33
+ end
34
+ score /= (Math.log(wc) + Math.log(wc2))
35
+ @ranking.add(grp.id, grp2.id, score)
36
+ end
37
+ end
38
+ end
39
+
40
+ def run(text,n=4)
41
+ rankings = super(text)
42
+ rankings = rankings[0..n].map { |x|x[0] }
43
+ @section.groups.select do |grp|
44
+ rankings.include?(grp.id)
45
+ end.map(&:to_s)
46
+ end
47
+
48
+
49
+ end
@@ -19,20 +19,12 @@ class GraphRank::TextRank
19
19
  # Add text and return PageRank.
20
20
  def run(text)
21
21
  @text = text
22
- clean_text
23
- @features = get_features
22
+ get_features
24
23
  filter_features
25
24
  build_graph
26
25
  calculate_ranking
27
26
  end
28
27
 
29
- # Clean text leaving just letters from a-z.
30
- def clean_text
31
- @text.downcase!
32
- @text.gsub!(/[^a-z ]/, ' ')
33
- @text.gsub!(/\s+/, " ")
34
- end
35
-
36
28
  # Return the features (keyword, sentence, etc.)
37
29
  def get_features
38
30
  raise 'Must be implemented in subclass.'
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: graph-rank
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.1
4
+ version: 0.0.2
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -20,6 +20,7 @@ extra_rdoc_files: []
20
20
  files:
21
21
  - lib/graph-rank/keywords.rb
22
22
  - lib/graph-rank/page_rank.rb
23
+ - lib/graph-rank/sentences.rb
23
24
  - lib/graph-rank/text_rank.rb
24
25
  - lib/graph-rank.rb
25
26
  - README.md