jeremyf-anemone 0.1.3

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,116 @@
1
module Anemone
  #
  # A Hash of URI => Page for every page encountered during a crawl,
  # with graph-style queries (shortest paths, reverse-link lookups)
  # layered on top.
  #
  class PageHash < Hash

    #
    # Use a breadth-first search to calculate the single-source
    # shortest paths from *root* to all pages in the PageHash.
    #
    # Sets Page#depth on every page reachable from *root* (and on its
    # redirect-aliases) and returns self. Raises if *root* is not a
    # key of this hash.
    #
    def shortest_paths!(root)
      root = URI(root) if root.is_a?(String)
      raise "Root node not found" if !has_key?(root)

      # clear the visited flag left over from any previous traversal
      each_value { |page| page.visited = false if page }

      q = Queue.new

      q.enq(root)
      self[root].depth = 0
      self[root].visited = true
      while !q.empty?
        url = q.deq

        next if !has_key?(url)

        page = self[url]

        page.links.each do |u|
          next if !has_key?(u) or self[u].nil?
          link = self[u]
          # a page and all of its redirect-aliases share the same depth
          aliases = [link].concat(link.aliases.map { |a| self[a] })

          aliases.each do |node|
            # only overwrite when the new path is strictly shorter
            # (or the node has not been assigned a depth yet)
            if node.depth.nil? or page.depth + 1 < node.depth
              node.depth = page.depth + 1
            end
          end

          q.enq(self[u].url) if !self[u].visited
          self[u].visited = true
        end
      end

      self
    end

    #
    # Returns a new PageHash by removing redirect-aliases for each
    # non-redirect Page. Redirect pages are dropped entirely; kept
    # pages are cloned with an emptied alias list.
    #
    def uniq
      results = PageHash.new
      each do |url, page|
        # if none of the aliases of this page have been added, and this
        # isn't a redirect page, add this page
        page_added = page.aliases.any? { |a| results.has_key?(a) }
        if !page.redirect? and !page_added
          results[url] = page.clone
          results[url].aliases = []
        end
      end

      results
    end

    #
    # If given a single URL (as a String or URI), returns the first
    # [URI, [Page, ...]] entry for Pages which link to that URL.
    # If given an Array of URLs, returns a Hash (URI => [Page, Page...])
    # of Pages linking to those URLs.
    #
    # NOTE(review): in the single-URL case this returns Hash#first,
    # i.e. a [key, value] pair rather than just the Page array; this
    # long-standing behavior is preserved for callers that rely on it.
    #
    def pages_linking_to(urls)
      single = false
      unless urls.is_a?(Array)
        urls = [urls]
        single = true
      end

      # normalize Strings to URIs, mapping unparseable ones to nil
      urls.map! do |url|
        if url.is_a?(String)
          URI(url) rescue nil
        else
          url
        end
      end
      # FIX: previously used non-destructive +compact+ and discarded
      # the result, so nil entries from bad URL strings survived
      urls.compact!

      links = {}
      urls.each { |url| links[url] = [] }
      values.each do |page|
        urls.each { |url| links[url] << page if page.links.include?(url) }
      end

      if single and !links.empty?
        return links.first
      else
        return links
      end
    end

    #
    # If given a single URL (as a String or URI), returns the first
    # [URI, [URI, ...]] entry for URLs which link to that URL.
    # If given an Array of URLs, returns a Hash (URI => [URI, URI...])
    # of URLs linking to those URLs.
    #
    def urls_linking_to(urls)
      single = false
      unless urls.is_a?(Array)
        urls = [urls]
        single = true
      end

      links = pages_linking_to(urls)
      # collapse each Page list down to just the page URLs
      links.each { |url, pages| links[url] = pages.map { |p| p.url } }

      if single and !links.empty?
        return links.first
      else
        return links
      end
    end

  end
end
@@ -0,0 +1,31 @@
1
+ require 'anemone/page'
2
+
3
+ module Anemone
4
+ class Tentacle
5
+
6
+ #
7
+ # Create a new Tentacle
8
+ #
9
+ def initialize(link_queue, page_queue)
10
+ @link_queue = link_queue
11
+ @page_queue = page_queue
12
+ end
13
+
14
+ #
15
+ # Gets links from @link_queue, and returns the fetched
16
+ # Page objects into @page_queue
17
+ #
18
+ def run
19
+ while true do
20
+ link = @link_queue.deq
21
+
22
+ break if link == :END
23
+
24
+ page = Page.fetch(link)
25
+
26
+ @page_queue.enq(page)
27
+ end
28
+ end
29
+
30
+ end
31
+ end
@@ -0,0 +1,27 @@
1
require File.dirname(__FILE__) + '/spec_helper'

# Specs for the top-level Anemone module API: version/user-agent
# constants, the options accessor, and the return value of
# Anemone.crawl. Pages are served by FakeWeb (see fakeweb_helper).
describe Anemone do

  it "should have a version and user agent" do
    Anemone.const_defined?('VERSION').should == true
    Anemone.const_defined?('USER_AGENT').should == true
  end

  it "should have options" do
    Anemone.should respond_to(:options)
  end

  it "should accept options for the crawl" do
    # options passed to crawl should be reflected in Anemone.options
    Anemone.crawl(SPEC_DOMAIN, :verbose => false, :threads => 2, :discard_page_bodies => true)
    Anemone.options.verbose.should == false
    Anemone.options.threads.should == 2
    Anemone.options.discard_page_bodies.should == true
  end

  it "should return a Anemone::Core from the crawl, which has a PageHash" do
    result = Anemone.crawl(SPEC_DOMAIN)
    result.should be_an_instance_of(Anemone::Core)
    result.pages.should be_an_instance_of(Anemone::PageHash)
  end

end
data/spec/core_spec.rb ADDED
@@ -0,0 +1,114 @@
1
require File.dirname(__FILE__) + '/spec_helper'

module Anemone
  # Specs for Anemone::Core: crawl traversal, domain restriction,
  # redirect handling, multiple start URLs, link filtering, and
  # per-page callbacks. All pages are fakes registered with FakeWeb,
  # so no real network access occurs.
  describe Core do

    before(:each) do
      # drop any pages registered by a previous example
      FakeWeb.clean_registry
    end

    it "should crawl all the html pages in a domain by following <a> href's" do
      # link graph: 0 -> {1, 2}, 1 -> 3 — all four reachable from 0
      pages = []
      pages << FakePage.new('0', :links => ['1', '2'])
      pages << FakePage.new('1', :links => ['3'])
      pages << FakePage.new('2')
      pages << FakePage.new('3')

      Anemone.crawl(pages[0].url).should have(4).pages
    end

    it "should not leave the original domain" do
      # page 0 links off-domain via a raw href; that URL must be skipped
      pages = []
      pages << FakePage.new('0', :links => ['1'], :hrefs => 'http://www.other.com/')
      pages << FakePage.new('1')

      core = Anemone.crawl(pages[0].url)

      core.should have(2).pages
      core.pages.keys.map{|k| k.to_s}.should_not include('http://www.other.com/')
    end

    it "should follow http redirects" do
      # page 1 is a 301 redirect to page 2; both end up in the result
      pages = []
      pages << FakePage.new('0', :links => ['1'])
      pages << FakePage.new('1', :redirect => '2')
      pages << FakePage.new('2')

      Anemone.crawl(pages[0].url).should have(3).pages
    end

    it "should accept multiple starting URLs" do
      # two disconnected graphs: 0 -> 1 and 2 -> 3
      pages = []
      pages << FakePage.new('0', :links => ['1'])
      pages << FakePage.new('1')
      pages << FakePage.new('2', :links => ['3'])
      pages << FakePage.new('3')

      Anemone.crawl([pages[0].url, pages[2].url]).should have(4).pages
    end

    it "should include the query string when following links" do
      # only '1?foo=1' is linked from 0; bare '1' must not be crawled
      pages = []
      pages << FakePage.new('0', :links => ['1?foo=1'])
      pages << FakePage.new('1?foo=1')
      pages << FakePage.new('1')

      core = Anemone.crawl(pages[0].url)

      core.should have(2).pages
      core.pages.keys.map{|k| k.to_s}.should_not include(pages[2].url)
    end

    it "should be able to skip links based on a RegEx" do
      pages = []
      pages << FakePage.new('0', :links => ['1', '2'])
      pages << FakePage.new('1')
      pages << FakePage.new('2')

      core = Anemone.crawl(pages[0].url) do |a|
        a.skip_links_like /1/
      end

      core.should have(2).pages
      core.pages.keys.map{|k| k.to_s}.should_not include(pages[1].url)
    end

    it "should be able to call a block on every page" do
      pages = []
      pages << FakePage.new('0', :links => ['1', '2'])
      pages << FakePage.new('1')
      pages << FakePage.new('2')

      # the on_every_page hook should fire once per crawled page
      count = 0
      Anemone.crawl(pages[0].url) do |a|
        a.on_every_page { count += 1 }
      end

      count.should == 3
    end

    it "should not discard page bodies by default" do
      Anemone.crawl(FakePage.new('0').url).pages.values.first.doc.should_not be_nil
    end

    it "should optionally discard page bodies to conserve memory" do
      core = Anemone.crawl(FakePage.new('0').url, :discard_page_bodies => true)
      core.pages.values.first.doc.should be_nil
    end

    it "should provide a focus_crawl method to select the links on each page to follow" do
      pages = []
      pages << FakePage.new('0', :links => ['1', '2'])
      pages << FakePage.new('1')
      pages << FakePage.new('2')

      core = Anemone.crawl(pages[0].url) do |a|
        # follow only links whose URL does not contain '1'
        a.focus_crawl {|p| p.links.reject{|l| l.to_s =~ /1/}}
      end

      core.should have(2).pages
      core.pages.keys.map{|k| k.to_s}.should_not include(pages[1].url)
    end

  end
end
@@ -0,0 +1,55 @@
1
begin
  require 'fakeweb'
rescue LoadError
  warn "You need the 'fakeweb' gem installed to test Anemone"
  exit
end

FakeWeb.allow_net_connect = false

module Anemone
  SPEC_DOMAIN = "http://www.example.com/"

  #
  # Builds an HTML page and registers it with FakeWeb so that specs
  # can crawl SPEC_DOMAIN without any real network access.
  #
  class FakePage
    attr_accessor :links
    attr_accessor :hrefs

    def initialize(name = '', options = {})
      @name = name
      @links = [options[:links]].flatten if options.has_key?(:links)
      @hrefs = [options[:hrefs]].flatten if options.has_key?(:hrefs)
      @redirect = options[:redirect] if options.has_key?(:redirect)

      create_body
      add_to_fakeweb
    end

    # Absolute URL of this fake page.
    def url
      SPEC_DOMAIN + @name
    end

    private

    # Assemble the HTML body: one <a> tag per in-domain link name and
    # one per raw href.
    def create_body
      @body = "<html><body>"
      (@links || []).each { |link| @body << "<a href=\"#{SPEC_DOMAIN}#{link}\"></a>" }
      (@hrefs || []).each { |href| @body << "<a href=\"#{href}\"></a>" }
      @body << "</body></html>"
    end

    # Register this page with FakeWeb — as a 301 redirect when a
    # :redirect target was given, otherwise as a plain 200 response.
    def add_to_fakeweb
      response = {:body => @body, :content_type => "text/html", :status => [200, "OK"]}

      if @redirect
        response[:status] = [301, "Permanently Moved"]
        response[:location] = SPEC_DOMAIN + @redirect
      end

      FakeWeb.register_uri(:get, SPEC_DOMAIN + @name, response)
    end
  end
end

# default root page so SPEC_DOMAIN itself always resolves
Anemone::FakePage.new
55
+
data/spec/page_spec.rb ADDED
@@ -0,0 +1,49 @@
1
require File.dirname(__FILE__) + '/spec_helper'

module Anemone
  # Specs for Anemone::Page: fetching, response headers, per-page data
  # storage, the parsed document, redirect detection, and domain checks.
  describe Page do

    before(:each) do
      @page = Page.fetch(FakePage.new('home').url)
    end

    it "should be able to fetch a page" do
      @page.should_not be_nil
      @page.url.to_s.should include('home')
    end

    it "should store the response headers when fetching a page" do
      @page.headers.should_not be_nil
      @page.headers.should have_key('content-type')
    end

    it "should have an OpenStruct attribute for the developer to store data in" do
      @page.data.should_not be_nil
      @page.data.should be_an_instance_of(OpenStruct)

      @page.data.test = 'test'
      @page.data.test.should == 'test'
    end

    # FIX: example description previously misspelled "Nokogori"
    it "should have a Nokogiri::HTML::Document attribute for the page body" do
      @page.doc.should_not be_nil
      @page.doc.should be_an_instance_of(Nokogiri::HTML::Document)
    end

    it "should indicate whether it was fetched after an HTTP redirect" do
      @page.should respond_to(:redirect?)

      @page.redirect?.should == false

      Page.fetch(FakePage.new('redir', :redirect => 'home').url).redirect?.should == true
    end

    it "should have a method to tell if a URI is in the same domain as the page" do
      @page.should respond_to(:in_domain?)

      @page.in_domain?(URI(FakePage.new('test').url)).should == true
      @page.in_domain?(URI('http://www.other.com/')).should == false
    end

  end
end
@@ -0,0 +1,5 @@
1
# FIX: rubygems must be loaded before any gem-backed requires (anemone
# pulls in nokogiri) on pre-1.9 rubies, so it now comes first.
require 'rubygems'
require File.dirname(__FILE__) + '/../lib/anemone'
require File.dirname(__FILE__) + '/fakeweb_helper'

# Root URL used by the fake pages throughout the specs.
SPEC_DOMAIN = 'http://www.example.com/'
metadata ADDED
@@ -0,0 +1,85 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: jeremyf-anemone
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.3
5
+ platform: ruby
6
+ authors:
7
+ - Chris Kite
8
+ - Jeremy Friesen
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+
13
+ date: 2009-08-05 00:00:00 -07:00
14
+ default_executable:
15
+ dependencies: []
16
+
17
+ description:
18
+ email: jeremy.n.friesen@gmail.com
19
+ executables:
20
+ - anemone_count.rb
21
+ - anemone_cron.rb
22
+ - anemone_pagedepth.rb
23
+ - anemone_serialize.rb
24
+ - anemone_url_list.rb
25
+ extensions: []
26
+
27
+ extra_rdoc_files:
28
+ - LICENSE.txt
29
+ - README.rdoc
30
+ files:
31
+ - LICENSE.txt
32
+ - README.rdoc
33
+ - Rakefile
34
+ - VERSION.yml
35
+ - anemone.gemspec
36
+ - bin/anemone_count.rb
37
+ - bin/anemone_cron.rb
38
+ - bin/anemone_pagedepth.rb
39
+ - bin/anemone_serialize.rb
40
+ - bin/anemone_url_list.rb
41
+ - lib/anemone.rb
42
+ - lib/anemone/anemone.rb
43
+ - lib/anemone/core.rb
44
+ - lib/anemone/http.rb
45
+ - lib/anemone/page.rb
46
+ - lib/anemone/page_hash.rb
47
+ - lib/anemone/tentacle.rb
48
+ - spec/anemone_spec.rb
49
+ - spec/core_spec.rb
50
+ - spec/fakeweb_helper.rb
51
+ - spec/page_spec.rb
52
+ - spec/spec_helper.rb
53
+ has_rdoc: false
54
+ homepage: http://github.com/jeremyf/anemone
55
+ licenses:
56
+ post_install_message:
57
+ rdoc_options:
58
+ - --charset=UTF-8
59
+ require_paths:
60
+ - lib
61
+ required_ruby_version: !ruby/object:Gem::Requirement
62
+ requirements:
63
+ - - ">="
64
+ - !ruby/object:Gem::Version
65
+ version: "0"
66
+ version:
67
+ required_rubygems_version: !ruby/object:Gem::Requirement
68
+ requirements:
69
+ - - ">="
70
+ - !ruby/object:Gem::Version
71
+ version: "0"
72
+ version:
73
+ requirements: []
74
+
75
+ rubyforge_project:
76
+ rubygems_version: 1.3.5
77
+ signing_key:
78
+ specification_version: 3
79
+ summary: Anemone is a web spider framework that can spider a domain.
80
+ test_files:
81
+ - spec/anemone_spec.rb
82
+ - spec/core_spec.rb
83
+ - spec/fakeweb_helper.rb
84
+ - spec/page_spec.rb
85
+ - spec/spec_helper.rb