jeremyf-anemone 0.1.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,116 @@
1
module Anemone
  # A Hash of URI => Page collected during a crawl, with graph-style
  # queries (shortest paths, reverse-link lookups) layered on top.
  class PageHash < Hash

    #
    # Use a breadth-first search to calculate the single-source
    # shortest paths from *root* to all pages in the PageHash.
    # Mutates the +depth+ attribute of every reachable Page and
    # returns self.
    #
    # Raises a RuntimeError if *root* is not a key of this hash.
    #
    def shortest_paths!(root)
      root = URI(root) if root.is_a?(String)
      raise "Root node not found" if !has_key?(root)

      # reset traversal state left over from any previous run
      each_value { |page| page.visited = false if page }

      q = Queue.new

      q.enq(root)
      self[root].depth = 0
      self[root].visited = true
      while !q.empty?
        url = q.deq

        next if !has_key?(url)

        page = self[url]

        page.links.each do |u|
          next if !has_key?(u) or self[u].nil?
          link = self[u]
          # a page and all of its redirect-aliases sit at the same depth
          aliases = [link].concat(link.aliases.map { |a| self[a] })

          aliases.each do |node|
            if node.depth.nil? or page.depth + 1 < node.depth
              node.depth = page.depth + 1
            end
          end

          q.enq(self[u].url) if !self[u].visited
          self[u].visited = true
        end
      end

      self
    end

    #
    # Returns a new PageHash by removing redirect-aliases for each
    # non-redirect Page
    #
    def uniq
      results = PageHash.new
      each do |url, page|
        # add this page only if it isn't a redirect and none of its
        # aliases have already been added
        alias_added = page.aliases.any? { |a| results.has_key?(a) }
        if !page.redirect? and !alias_added
          results[url] = page.clone
          results[url].aliases = []
        end
      end

      results
    end

    #
    # If given a single URL (as a String or URI), returns an Array of Pages which link to that URL
    # If given an Array of URLs, returns a Hash (URI => [Page, Page...]) of Pages linking to those URLs
    #
    def pages_linking_to(urls)
      unless urls.is_a?(Array)
        urls = [urls]
        single = true
      end

      # Normalize: parse Strings into URIs, dropping any that fail to
      # parse. (The original called Array#compact and discarded the
      # result, so unparseable entries leaked through as nil keys.)
      # Non-destructive on purpose: the caller's array is left alone.
      urls = urls.map do |url|
        if url.is_a?(String)
          URI(url) rescue nil
        else
          url
        end
      end.compact

      links = {}
      urls.each { |url| links[url] = [] }
      values.each do |page|
        urls.each { |url| links[url] << page if page.links.include?(url) }
      end

      if single and !links.empty?
        # return just the Array of Pages, as documented above
        # (links.first would return the whole [url, pages] pair)
        return links[urls.first]
      else
        return links
      end
    end

    #
    # If given a single URL (as a String or URI), returns an Array of URLs which link to that URL
    # If given an Array of URLs, returns a Hash (URI => [URI, URI...]) of URLs linking to those URLs
    #
    def urls_linking_to(urls)
      unless urls.is_a?(Array)
        urls = [urls]
        single = true
      end

      links = pages_linking_to(urls)
      links.each { |url, pages| links[url] = pages.map { |p| p.url } }

      if single and !links.empty?
        # the hash is keyed by normalized URIs (not the caller's String),
        # so take the sole value rather than indexing by the raw input
        return links.values.first
      else
        return links
      end
    end

  end
end
@@ -0,0 +1,31 @@
1
+ require 'anemone/page'
2
+
3
module Anemone
  # A single crawl worker: pulls links off a shared queue, fetches the
  # corresponding Page, and pushes the result onto the page queue.
  class Tentacle

    #
    # Create a new Tentacle that will consume from +link_queue+ and
    # produce into +page_queue+.
    #
    def initialize(link_queue, page_queue)
      @link_queue = link_queue
      @page_queue = page_queue
    end

    #
    # Gets links from @link_queue, and returns the fetched
    # Page objects into @page_queue. Runs until the :END
    # sentinel is dequeued.
    #
    def run
      loop do
        link = @link_queue.deq
        break if link == :END

        @page_queue.enq(Page.fetch(link))
      end
    end

  end
end
@@ -0,0 +1,27 @@
1
+ require File.dirname(__FILE__) + '/spec_helper'
2
+
3
# Specs for the top-level Anemone module API: version constants, the
# global options object, and the Anemone.crawl entry point.
# NOTE(review): relies on SPEC_DOMAIN and the FakeWeb stubs set up by
# spec_helper.rb / fakeweb_helper.rb.
describe Anemone do

  it "should have a version and user agent" do
    Anemone.const_defined?('VERSION').should == true
    Anemone.const_defined?('USER_AGENT').should == true
  end

  it "should have options" do
    Anemone.should respond_to(:options)
  end

  it "should accept options for the crawl" do
    Anemone.crawl(SPEC_DOMAIN, :verbose => false, :threads => 2, :discard_page_bodies => true)
    # options handed to crawl should be reflected on the module afterwards
    Anemone.options.verbose.should == false
    Anemone.options.threads.should == 2
    Anemone.options.discard_page_bodies.should == true
  end

  it "should return a Anemone::Core from the crawl, which has a PageHash" do
    result = Anemone.crawl(SPEC_DOMAIN)
    result.should be_an_instance_of(Anemone::Core)
    result.pages.should be_an_instance_of(Anemone::PageHash)
  end

end
data/spec/core_spec.rb ADDED
@@ -0,0 +1,114 @@
1
+ require File.dirname(__FILE__) + '/spec_helper'
2
+
3
module Anemone
  # Specs for Anemone::Core, the crawl loop itself. Pages are served by
  # FakeWeb via the FakePage helper (see fakeweb_helper.rb), so no real
  # network access happens.
  describe Core do

    before(:each) do
      # start every example with a clean slate of registered fake URLs
      FakeWeb.clean_registry
    end

    it "should crawl all the html pages in a domain by following <a> href's" do
      pages = []
      pages << FakePage.new('0', :links => ['1', '2'])
      pages << FakePage.new('1', :links => ['3'])
      pages << FakePage.new('2')
      pages << FakePage.new('3')

      Anemone.crawl(pages[0].url).should have(4).pages
    end

    it "should not leave the original domain" do
      # page 0 links to an off-domain URL via :hrefs
      pages = []
      pages << FakePage.new('0', :links => ['1'], :hrefs => 'http://www.other.com/')
      pages << FakePage.new('1')

      core = Anemone.crawl(pages[0].url)

      core.should have(2).pages
      core.pages.keys.map{|k| k.to_s}.should_not include('http://www.other.com/')
    end

    it "should follow http redirects" do
      # page 1 is registered as a 301 redirect to page 2
      pages = []
      pages << FakePage.new('0', :links => ['1'])
      pages << FakePage.new('1', :redirect => '2')
      pages << FakePage.new('2')

      Anemone.crawl(pages[0].url).should have(3).pages
    end

    it "should accept multiple starting URLs" do
      # two disconnected link chains, both reachable only via the roots
      pages = []
      pages << FakePage.new('0', :links => ['1'])
      pages << FakePage.new('1')
      pages << FakePage.new('2', :links => ['3'])
      pages << FakePage.new('3')

      Anemone.crawl([pages[0].url, pages[2].url]).should have(4).pages
    end

    it "should include the query string when following links" do
      pages = []
      pages << FakePage.new('0', :links => ['1?foo=1'])
      pages << FakePage.new('1?foo=1')
      pages << FakePage.new('1')

      core = Anemone.crawl(pages[0].url)

      # only the query-string variant was linked, so the bare '1' page
      # must not have been crawled
      core.should have(2).pages
      core.pages.keys.map{|k| k.to_s}.should_not include(pages[2].url)
    end

    it "should be able to skip links based on a RegEx" do
      pages = []
      pages << FakePage.new('0', :links => ['1', '2'])
      pages << FakePage.new('1')
      pages << FakePage.new('2')

      core = Anemone.crawl(pages[0].url) do |a|
        a.skip_links_like /1/
      end

      core.should have(2).pages
      core.pages.keys.map{|k| k.to_s}.should_not include(pages[1].url)
    end

    it "should be able to call a block on every page" do
      pages = []
      pages << FakePage.new('0', :links => ['1', '2'])
      pages << FakePage.new('1')
      pages << FakePage.new('2')

      count = 0
      Anemone.crawl(pages[0].url) do |a|
        a.on_every_page { count += 1 }
      end

      count.should == 3
    end

    it "should not discard page bodies by default" do
      Anemone.crawl(FakePage.new('0').url).pages.values.first.doc.should_not be_nil
    end

    it "should optionally discard page bodies to conserve memory" do
      core = Anemone.crawl(FakePage.new('0').url, :discard_page_bodies => true)
      core.pages.values.first.doc.should be_nil
    end

    it "should provide a focus_crawl method to select the links on each page to follow" do
      pages = []
      pages << FakePage.new('0', :links => ['1', '2'])
      pages << FakePage.new('1')
      pages << FakePage.new('2')

      # follow every link except those matching /1/
      core = Anemone.crawl(pages[0].url) do |a|
        a.focus_crawl {|p| p.links.reject{|l| l.to_s =~ /1/}}
      end

      core.should have(2).pages
      core.pages.keys.map{|k| k.to_s}.should_not include(pages[1].url)
    end

  end
end
@@ -0,0 +1,55 @@
1
# Abort the spec run early, with a helpful message, if the fakeweb gem
# is not installed.
begin
  require 'fakeweb'
rescue LoadError
  warn "You need the 'fakeweb' gem installed to test Anemone"
  exit
end

# All HTTP in the specs must hit FakeWeb stubs, never the real network.
FakeWeb.allow_net_connect = false

module Anemone
  SPEC_DOMAIN = "http://www.example.com/"

  # Builds a minimal fake HTML page and registers it with FakeWeb so
  # the crawler can "fetch" it during specs.
  class FakePage
    attr_accessor :links
    attr_accessor :hrefs

    def initialize(name = '', options = {})
      @name = name
      @links = [options[:links]].flatten if options.has_key?(:links)
      @hrefs = [options[:hrefs]].flatten if options.has_key?(:hrefs)
      @redirect = options[:redirect] if options.has_key?(:redirect)

      create_body
      add_to_fakeweb
    end

    # Full URL this fake page is registered under.
    def url
      SPEC_DOMAIN + @name
    end

    private

    # Assemble the HTML body: one anchor per :links entry (same-domain)
    # and one per :hrefs entry (verbatim, possibly off-domain).
    def create_body
      anchors = []
      anchors.concat(@links.map { |l| "<a href=\"#{SPEC_DOMAIN}#{l}\"></a>" }) if @links
      anchors.concat(@hrefs.map { |h| "<a href=\"#{h}\"></a>" }) if @hrefs
      @body = "<html><body>" + anchors.join + "</body></html>"
    end

    # Register this page with FakeWeb: a 301 redirect when :redirect
    # was given, otherwise a plain 200 HTML response.
    def add_to_fakeweb
      response = {:body => @body, :content_type => "text/html", :status => [200, "OK"]}

      if @redirect
        response[:status] = [301, "Permanently Moved"]
        response[:location] = SPEC_DOMAIN + @redirect
      end

      FakeWeb.register_uri(:get, SPEC_DOMAIN + @name, response)
    end
  end
end

# Register a default root page for the spec domain.
Anemone::FakePage.new
data/spec/page_spec.rb ADDED
@@ -0,0 +1,49 @@
1
+ require File.dirname(__FILE__) + '/spec_helper'
2
+
3
module Anemone
  # Specs for Anemone::Page: fetching, response metadata, the user-data
  # OpenStruct, redirect detection, and same-domain checks. Pages are
  # served by the FakeWeb stubs from fakeweb_helper.rb.
  describe Page do

    before(:each) do
      @page = Page.fetch(FakePage.new('home').url)
    end

    it "should be able to fetch a page" do
      @page.should_not be_nil
      @page.url.to_s.should include('home')
    end

    it "should store the response headers when fetching a page" do
      @page.headers.should_not be_nil
      # header names are expected to be lower-cased by the HTTP layer
      @page.headers.should have_key('content-type')
    end

    it "should have an OpenStruct attribute for the developer to store data in" do
      @page.data.should_not be_nil
      @page.data.should be_an_instance_of(OpenStruct)

      @page.data.test = 'test'
      @page.data.test.should == 'test'
    end

    it "should have a Nokogori::HTML::Document attribute for the page body" do
      @page.doc.should_not be_nil
      @page.doc.should be_an_instance_of(Nokogiri::HTML::Document)
    end

    it "should indicate whether it was fetched after an HTTP redirect" do
      @page.should respond_to(:redirect?)

      @page.redirect?.should == false

      # 'redir' is registered as a 301 pointing at 'home'
      Page.fetch(FakePage.new('redir', :redirect => 'home').url).redirect?.should == true
    end

    it "should have a method to tell if a URI is in the same domain as the page" do
      @page.should respond_to(:in_domain?)

      @page.in_domain?(URI(FakePage.new('test').url)).should == true
      @page.in_domain?(URI('http://www.other.com/')).should == false
    end

  end
end
@@ -0,0 +1,5 @@
1
# Common setup for all specs.
#
# RubyGems must be loaded *before* the project files: lib/anemone pulls
# in gem dependencies (e.g. nokogiri) and fakeweb_helper requires the
# fakeweb gem, so requiring 'rubygems' last (as this file originally
# did) made it useless on pre-1.9 Rubies.
require 'rubygems'
require File.dirname(__FILE__) + '/../lib/anemone'
require File.dirname(__FILE__) + '/fakeweb_helper'

# Root URL served by the FakeWeb stubs (see fakeweb_helper.rb).
SPEC_DOMAIN = 'http://www.example.com/'
metadata ADDED
@@ -0,0 +1,85 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: jeremyf-anemone
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.3
5
+ platform: ruby
6
+ authors:
7
+ - Chris Kite
8
+ - Jeremy Friesen
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+
13
+ date: 2009-08-05 00:00:00 -07:00
14
+ default_executable:
15
+ dependencies: []
16
+
17
+ description:
18
+ email: jeremy.n.friesen@gmail.com
19
+ executables:
20
+ - anemone_count.rb
21
+ - anemone_cron.rb
22
+ - anemone_pagedepth.rb
23
+ - anemone_serialize.rb
24
+ - anemone_url_list.rb
25
+ extensions: []
26
+
27
+ extra_rdoc_files:
28
+ - LICENSE.txt
29
+ - README.rdoc
30
+ files:
31
+ - LICENSE.txt
32
+ - README.rdoc
33
+ - Rakefile
34
+ - VERSION.yml
35
+ - anemone.gemspec
36
+ - bin/anemone_count.rb
37
+ - bin/anemone_cron.rb
38
+ - bin/anemone_pagedepth.rb
39
+ - bin/anemone_serialize.rb
40
+ - bin/anemone_url_list.rb
41
+ - lib/anemone.rb
42
+ - lib/anemone/anemone.rb
43
+ - lib/anemone/core.rb
44
+ - lib/anemone/http.rb
45
+ - lib/anemone/page.rb
46
+ - lib/anemone/page_hash.rb
47
+ - lib/anemone/tentacle.rb
48
+ - spec/anemone_spec.rb
49
+ - spec/core_spec.rb
50
+ - spec/fakeweb_helper.rb
51
+ - spec/page_spec.rb
52
+ - spec/spec_helper.rb
53
+ has_rdoc: false
54
+ homepage: http://github.com/jeremyf/anemone
55
+ licenses:
56
+ post_install_message:
57
+ rdoc_options:
58
+ - --charset=UTF-8
59
+ require_paths:
60
+ - lib
61
+ required_ruby_version: !ruby/object:Gem::Requirement
62
+ requirements:
63
+ - - ">="
64
+ - !ruby/object:Gem::Version
65
+ version: "0"
66
+ version:
67
+ required_rubygems_version: !ruby/object:Gem::Requirement
68
+ requirements:
69
+ - - ">="
70
+ - !ruby/object:Gem::Version
71
+ version: "0"
72
+ version:
73
+ requirements: []
74
+
75
+ rubyforge_project:
76
+ rubygems_version: 1.3.5
77
+ signing_key:
78
+ specification_version: 3
79
+ summary: Anemone is a web spider framework that can spider a domain.
80
+ test_files:
81
+ - spec/anemone_spec.rb
82
+ - spec/core_spec.rb
83
+ - spec/fakeweb_helper.rb
84
+ - spec/page_spec.rb
85
+ - spec/spec_helper.rb