anemone 0.1.1 → 0.1.2

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,19 @@
1
+ Copyright (c) 2009 Vertive, Inc.
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining a copy
4
+ of this software and associated documentation files (the "Software"), to deal
5
+ in the Software without restriction, including without limitation the rights
6
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7
+ copies of the Software, and to permit persons to whom the Software is
8
+ furnished to do so, subject to the following conditions:
9
+
10
+ The above copyright notice and this permission notice shall be included in
11
+ all copies or substantial portions of the Software.
12
+
13
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19
+ THE SOFTWARE.
@@ -3,11 +3,8 @@ require 'anemone/core'
3
3
 
4
4
  module Anemone
5
5
  # Version number
6
- VERSION = '0.1.1'
7
-
8
- # User-Agent string used for HTTP requests
9
- USER_AGENT = "Anemone/#{self::VERSION}"
10
-
6
+ VERSION = '0.1.2'
7
+
11
8
  #module-wide options
12
9
  def Anemone.options=(options)
13
10
  @options = options
@@ -31,7 +28,18 @@ module Anemone
31
28
 
32
29
  #by default, don't throw away the page response body after scanning it for links
33
30
  Anemone.options.discard_page_bodies ||= false
34
-
31
+
32
+ #by default, identify self as Anemone/VERSION
33
+ Anemone.options.user_agent ||= "Anemone/#{self::VERSION}"
34
+
35
+ #no delay between requests by default
36
+ Anemone.options.delay ||= 0
37
+
38
+ #use a single thread if a delay was requested
39
+ if(Anemone.options.delay != 0)
40
+ Anemone.options.threads = 1
41
+ end
42
+
35
43
  Core.crawl(urls, &block)
36
44
  end
37
45
  end
@@ -31,7 +31,7 @@ module Anemone
31
31
  def self.get_response(url)
32
32
  full_path = url.query.nil? ? url.path : "#{url.path}?#{url.query}"
33
33
  Net::HTTP.start(url.host, url.port) do |http|
34
- return http.get(full_path, {'User-Agent' => Anemone::USER_AGENT })
34
+ return http.get(full_path, {'User-Agent' => Anemone.options.user_agent })
35
35
  end
36
36
  end
37
37
  end
@@ -24,6 +24,8 @@ module Anemone
24
24
  page = Page.fetch(link)
25
25
 
26
26
  @page_queue.enq(page)
27
+
28
+ sleep Anemone.options.delay
27
29
  end
28
30
  end
29
31
 
@@ -0,0 +1,36 @@
1
+ require File.dirname(__FILE__) + '/spec_helper'
2
+
3
+ describe Anemone do
4
+
5
+ it "should have a version" do
6
+ Anemone.const_defined?('VERSION').should == true
7
+ end
8
+
9
+ it "should have options" do
10
+ Anemone.should respond_to(:options)
11
+ end
12
+
13
+ it "should accept options for the crawl" do
14
+ Anemone.crawl(SPEC_DOMAIN, :verbose => false,
15
+ :threads => 2,
16
+ :discard_page_bodies => true,
17
+ :user_agent => 'test')
18
+ Anemone.options.verbose.should == false
19
+ Anemone.options.threads.should == 2
20
+ Anemone.options.discard_page_bodies.should == true
21
+ Anemone.options.delay.should == 0
22
+ Anemone.options.user_agent.should == 'test'
23
+ end
24
+
25
+ it "should use 1 thread if a delay is requested" do
26
+ Anemone.crawl(SPEC_DOMAIN, :delay => 0.01, :threads => 2)
27
+ Anemone.options.threads.should == 1
28
+ end
29
+
30
+ it "should return a Anemone::Core from the crawl, which has a PageHash" do
31
+ result = Anemone.crawl(SPEC_DOMAIN)
32
+ result.should be_an_instance_of(Anemone::Core)
33
+ result.pages.should be_an_instance_of(Anemone::PageHash)
34
+ end
35
+
36
+ end
@@ -0,0 +1,128 @@
1
+ require File.dirname(__FILE__) + '/spec_helper'
2
+
3
+ module Anemone
4
+ describe Core do
5
+
6
+ before(:each) do
7
+ FakeWeb.clean_registry
8
+ end
9
+
10
+ it "should crawl all the html pages in a domain by following <a> href's" do
11
+ pages = []
12
+ pages << FakePage.new('0', :links => ['1', '2'])
13
+ pages << FakePage.new('1', :links => ['3'])
14
+ pages << FakePage.new('2')
15
+ pages << FakePage.new('3')
16
+
17
+ Anemone.crawl(pages[0].url).should have(4).pages
18
+ end
19
+
20
+ it "should not leave the original domain" do
21
+ pages = []
22
+ pages << FakePage.new('0', :links => ['1'], :hrefs => 'http://www.other.com/')
23
+ pages << FakePage.new('1')
24
+
25
+ core = Anemone.crawl(pages[0].url)
26
+
27
+ core.should have(2).pages
28
+ core.pages.keys.map{|k| k.to_s}.should_not include('http://www.other.com/')
29
+ end
30
+
31
+ it "should follow http redirects" do
32
+ pages = []
33
+ pages << FakePage.new('0', :links => ['1'])
34
+ pages << FakePage.new('1', :redirect => '2')
35
+ pages << FakePage.new('2')
36
+
37
+ Anemone.crawl(pages[0].url).should have(3).pages
38
+ end
39
+
40
+ it "should accept multiple starting URLs" do
41
+ pages = []
42
+ pages << FakePage.new('0', :links => ['1'])
43
+ pages << FakePage.new('1')
44
+ pages << FakePage.new('2', :links => ['3'])
45
+ pages << FakePage.new('3')
46
+
47
+ Anemone.crawl([pages[0].url, pages[2].url]).should have(4).pages
48
+ end
49
+
50
+ it "should include the query string when following links" do
51
+ pages = []
52
+ pages << FakePage.new('0', :links => ['1?foo=1'])
53
+ pages << FakePage.new('1?foo=1')
54
+ pages << FakePage.new('1')
55
+
56
+ core = Anemone.crawl(pages[0].url)
57
+
58
+ core.should have(2).pages
59
+ core.pages.keys.map{|k| k.to_s}.should_not include(pages[2].url)
60
+ end
61
+
62
+ it "should be able to skip links based on a RegEx" do
63
+ pages = []
64
+ pages << FakePage.new('0', :links => ['1', '2'])
65
+ pages << FakePage.new('1')
66
+ pages << FakePage.new('2')
67
+
68
+ core = Anemone.crawl(pages[0].url) do |a|
69
+ a.skip_links_like /1/
70
+ end
71
+
72
+ core.should have(2).pages
73
+ core.pages.keys.map{|k| k.to_s}.should_not include(pages[1].url)
74
+ end
75
+
76
+ it "should be able to call a block on every page" do
77
+ pages = []
78
+ pages << FakePage.new('0', :links => ['1', '2'])
79
+ pages << FakePage.new('1')
80
+ pages << FakePage.new('2')
81
+
82
+ count = 0
83
+ Anemone.crawl(pages[0].url) do |a|
84
+ a.on_every_page { count += 1 }
85
+ end
86
+
87
+ count.should == 3
88
+ end
89
+
90
+ it "should not discard page bodies by default" do
91
+ Anemone.crawl(FakePage.new('0').url).pages.values.first.doc.should_not be_nil
92
+ end
93
+
94
+ it "should optionally discard page bodies to conserve memory" do
95
+ core = Anemone.crawl(FakePage.new('0').url, :discard_page_bodies => true)
96
+ core.pages.values.first.doc.should be_nil
97
+ end
98
+
99
+ it "should provide a focus_crawl method to select the links on each page to follow" do
100
+ pages = []
101
+ pages << FakePage.new('0', :links => ['1', '2'])
102
+ pages << FakePage.new('1')
103
+ pages << FakePage.new('2')
104
+
105
+ core = Anemone.crawl(pages[0].url) do |a|
106
+ a.focus_crawl {|p| p.links.reject{|l| l.to_s =~ /1/}}
107
+ end
108
+
109
+ core.should have(2).pages
110
+ core.pages.keys.map{|k| k.to_s}.should_not include(pages[1].url)
111
+ end
112
+
113
+ it "should optionally delay between page requests" do
114
+ delay = 0.25
115
+
116
+ pages = []
117
+ pages << FakePage.new('0', :links => '1')
118
+ pages << FakePage.new('1')
119
+
120
+ start = Time.now
121
+ Anemone.crawl(pages[0].url, :delay => delay)
122
+ finish = Time.now
123
+
124
+ (finish - start).should satisfy {|t| t > delay * 2}
125
+ end
126
+
127
+ end
128
+ end
@@ -0,0 +1,55 @@
1
+ begin
2
+ require 'fakeweb'
3
+ rescue LoadError
4
+ warn "You need the 'fakeweb' gem installed to test Anemone"
5
+ exit
6
+ end
7
+
8
+ FakeWeb.allow_net_connect = false
9
+
10
+ module Anemone
11
+ SPEC_DOMAIN = "http://www.example.com/"
12
+
13
+ class FakePage
14
+ attr_accessor :links
15
+ attr_accessor :hrefs
16
+
17
+ def initialize(name = '', options = {})
18
+ @name = name
19
+ @links = [options[:links]].flatten if options.has_key?(:links)
20
+ @hrefs = [options[:hrefs]].flatten if options.has_key?(:hrefs)
21
+ @redirect = options[:redirect] if options.has_key?(:redirect)
22
+
23
+ create_body
24
+ add_to_fakeweb
25
+ end
26
+
27
+ def url
28
+ SPEC_DOMAIN + @name
29
+ end
30
+
31
+ private
32
+
33
+ def create_body
34
+ @body = "<html><body>"
35
+ @links.each{|l| @body += "<a href=\"#{SPEC_DOMAIN}#{l}\"></a>"} if @links
36
+ @hrefs.each{|h| @body += "<a href=\"#{h}\"></a>"} if @hrefs
37
+ @body += "</body></html>"
38
+ end
39
+
40
+ def add_to_fakeweb
41
+ options = {:body => @body, :content_type => "text/html", :status => [200, "OK"]}
42
+
43
+ if @redirect
44
+ options[:status] = [301, "Permanently Moved"]
45
+ options[:location] = SPEC_DOMAIN + @redirect
46
+ end
47
+
48
+ FakeWeb.register_uri(:get, SPEC_DOMAIN + @name, options)
49
+ end
50
+ end
51
+ end
52
+
53
+ #default root
54
+ Anemone::FakePage.new
55
+
@@ -0,0 +1,49 @@
1
+ require File.dirname(__FILE__) + '/spec_helper'
2
+
3
+ module Anemone
4
+ describe Page do
5
+
6
+ before(:each) do
7
+ @page = Page.fetch(FakePage.new('home').url)
8
+ end
9
+
10
+ it "should be able to fetch a page" do
11
+ @page.should_not be_nil
12
+ @page.url.to_s.should include('home')
13
+ end
14
+
15
+ it "should store the response headers when fetching a page" do
16
+ @page.headers.should_not be_nil
17
+ @page.headers.should have_key('content-type')
18
+ end
19
+
20
+ it "should have an OpenStruct attribute for the developer to store data in" do
21
+ @page.data.should_not be_nil
22
+ @page.data.should be_an_instance_of(OpenStruct)
23
+
24
+ @page.data.test = 'test'
25
+ @page.data.test.should == 'test'
26
+ end
27
+
28
+ it "should have a Nokogori::HTML::Document attribute for the page body" do
29
+ @page.doc.should_not be_nil
30
+ @page.doc.should be_an_instance_of(Nokogiri::HTML::Document)
31
+ end
32
+
33
+ it "should indicate whether it was fetched after an HTTP redirect" do
34
+ @page.should respond_to(:redirect?)
35
+
36
+ @page.redirect?.should == false
37
+
38
+ Page.fetch(FakePage.new('redir', :redirect => 'home').url).redirect?.should == true
39
+ end
40
+
41
+ it "should have a method to tell if a URI is in the same domain as the page" do
42
+ @page.should respond_to(:in_domain?)
43
+
44
+ @page.in_domain?(URI(FakePage.new('test').url)).should == true
45
+ @page.in_domain?(URI('http://www.other.com/')).should == false
46
+ end
47
+
48
+ end
49
+ end
@@ -0,0 +1,7 @@
1
+ require File.dirname(__FILE__) + '/fakeweb_helper'
2
+ require 'rubygems'
3
+
4
+ $:.unshift(File.dirname(__FILE__) + '/../lib/')
5
+ require 'anemone'
6
+
7
+ SPEC_DOMAIN = 'http://www.example.com/'
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: anemone
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.1
4
+ version: 0.1.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Chris Kite
@@ -9,7 +9,7 @@ autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
11
 
12
- date: 2009-07-22 00:00:00 -05:00
12
+ date: 2009-08-10 00:00:00 -05:00
13
13
  default_executable:
14
14
  dependencies:
15
15
  - !ruby/object:Gem::Dependency
@@ -35,23 +35,22 @@ extensions: []
35
35
  extra_rdoc_files:
36
36
  - README.rdoc
37
37
  files:
38
- - bin/anemone_pagedepth.rb
39
- - bin/anemone_url_list.rb
40
- - bin/anemone_cron.rb
38
+ - LICENSE.txt
39
+ - README.rdoc
41
40
  - bin/anemone_count.rb
41
+ - bin/anemone_cron.rb
42
+ - bin/anemone_pagedepth.rb
42
43
  - bin/anemone_serialize.rb
43
- - lib/anemone/tentacle.rb
44
- - lib/anemone/page.rb
45
- - lib/anemone/page_hash.rb
44
+ - bin/anemone_url_list.rb
45
+ - lib/anemone.rb
46
+ - lib/anemone/anemone.rb
46
47
  - lib/anemone/core.rb
47
48
  - lib/anemone/http.rb
48
- - lib/anemone/anemone.rb
49
- - lib/anemone.rb
50
- - README.rdoc
49
+ - lib/anemone/page.rb
50
+ - lib/anemone/page_hash.rb
51
+ - lib/anemone/tentacle.rb
51
52
  has_rdoc: true
52
53
  homepage: http://anemone.rubyforge.org
53
- licenses: []
54
-
55
54
  post_install_message:
56
55
  rdoc_options:
57
56
  - -m
@@ -75,9 +74,13 @@ required_rubygems_version: !ruby/object:Gem::Requirement
75
74
  requirements: []
76
75
 
77
76
  rubyforge_project: anemone
78
- rubygems_version: 1.3.4
77
+ rubygems_version: 1.3.1
79
78
  signing_key:
80
- specification_version: 3
79
+ specification_version: 2
81
80
  summary: Anemone web-spider framework
82
- test_files: []
83
-
81
+ test_files:
82
+ - spec/anemone_spec.rb
83
+ - spec/core_spec.rb
84
+ - spec/page_spec.rb
85
+ - spec/fakeweb_helper.rb
86
+ - spec/spec_helper.rb