anemone 0.1.1 → 0.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,19 @@
1
+ Copyright (c) 2009 Vertive, Inc.
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining a copy
4
+ of this software and associated documentation files (the "Software"), to deal
5
+ in the Software without restriction, including without limitation the rights
6
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7
+ copies of the Software, and to permit persons to whom the Software is
8
+ furnished to do so, subject to the following conditions:
9
+
10
+ The above copyright notice and this permission notice shall be included in
11
+ all copies or substantial portions of the Software.
12
+
13
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19
+ THE SOFTWARE.
@@ -3,11 +3,8 @@ require 'anemone/core'
3
3
 
4
4
  module Anemone
5
5
  # Version number
6
- VERSION = '0.1.1'
7
-
8
- # User-Agent string used for HTTP requests
9
- USER_AGENT = "Anemone/#{self::VERSION}"
10
-
6
+ VERSION = '0.1.2'
7
+
11
8
  #module-wide options
12
9
  def Anemone.options=(options)
13
10
  @options = options
@@ -31,7 +28,18 @@ module Anemone
31
28
 
32
29
  #by default, don't throw away the page response body after scanning it for links
33
30
  Anemone.options.discard_page_bodies ||= false
34
-
31
+
32
+ #by default, identify self as Anemone/VERSION
33
+ Anemone.options.user_agent ||= "Anemone/#{self::VERSION}"
34
+
35
+ #no delay between requests by default
36
+ Anemone.options.delay ||= 0
37
+
38
+ #use a single thread if a delay was requested
39
+ if(Anemone.options.delay != 0)
40
+ Anemone.options.threads = 1
41
+ end
42
+
35
43
  Core.crawl(urls, &block)
36
44
  end
37
45
  end
@@ -31,7 +31,7 @@ module Anemone
31
31
  def self.get_response(url)
32
32
  full_path = url.query.nil? ? url.path : "#{url.path}?#{url.query}"
33
33
  Net::HTTP.start(url.host, url.port) do |http|
34
- return http.get(full_path, {'User-Agent' => Anemone::USER_AGENT })
34
+ return http.get(full_path, {'User-Agent' => Anemone.options.user_agent })
35
35
  end
36
36
  end
37
37
  end
@@ -24,6 +24,8 @@ module Anemone
24
24
  page = Page.fetch(link)
25
25
 
26
26
  @page_queue.enq(page)
27
+
28
+ sleep Anemone.options.delay
27
29
  end
28
30
  end
29
31
 
@@ -0,0 +1,36 @@
1
+ require File.dirname(__FILE__) + '/spec_helper'
2
+
3
+ describe Anemone do
4
+
5
+ it "should have a version" do
6
+ Anemone.const_defined?('VERSION').should == true
7
+ end
8
+
9
+ it "should have options" do
10
+ Anemone.should respond_to(:options)
11
+ end
12
+
13
+ it "should accept options for the crawl" do
14
+ Anemone.crawl(SPEC_DOMAIN, :verbose => false,
15
+ :threads => 2,
16
+ :discard_page_bodies => true,
17
+ :user_agent => 'test')
18
+ Anemone.options.verbose.should == false
19
+ Anemone.options.threads.should == 2
20
+ Anemone.options.discard_page_bodies.should == true
21
+ Anemone.options.delay.should == 0
22
+ Anemone.options.user_agent.should == 'test'
23
+ end
24
+
25
+ it "should use 1 thread if a delay is requested" do
26
+ Anemone.crawl(SPEC_DOMAIN, :delay => 0.01, :threads => 2)
27
+ Anemone.options.threads.should == 1
28
+ end
29
+
30
+ it "should return an Anemone::Core from the crawl, which has a PageHash" do
31
+ result = Anemone.crawl(SPEC_DOMAIN)
32
+ result.should be_an_instance_of(Anemone::Core)
33
+ result.pages.should be_an_instance_of(Anemone::PageHash)
34
+ end
35
+
36
+ end
@@ -0,0 +1,128 @@
1
+ require File.dirname(__FILE__) + '/spec_helper'
2
+
3
+ module Anemone
4
+ describe Core do
5
+
6
+ before(:each) do
7
+ FakeWeb.clean_registry
8
+ end
9
+
10
+ it "should crawl all the html pages in a domain by following <a> href's" do
11
+ pages = []
12
+ pages << FakePage.new('0', :links => ['1', '2'])
13
+ pages << FakePage.new('1', :links => ['3'])
14
+ pages << FakePage.new('2')
15
+ pages << FakePage.new('3')
16
+
17
+ Anemone.crawl(pages[0].url).should have(4).pages
18
+ end
19
+
20
+ it "should not leave the original domain" do
21
+ pages = []
22
+ pages << FakePage.new('0', :links => ['1'], :hrefs => 'http://www.other.com/')
23
+ pages << FakePage.new('1')
24
+
25
+ core = Anemone.crawl(pages[0].url)
26
+
27
+ core.should have(2).pages
28
+ core.pages.keys.map{|k| k.to_s}.should_not include('http://www.other.com/')
29
+ end
30
+
31
+ it "should follow http redirects" do
32
+ pages = []
33
+ pages << FakePage.new('0', :links => ['1'])
34
+ pages << FakePage.new('1', :redirect => '2')
35
+ pages << FakePage.new('2')
36
+
37
+ Anemone.crawl(pages[0].url).should have(3).pages
38
+ end
39
+
40
+ it "should accept multiple starting URLs" do
41
+ pages = []
42
+ pages << FakePage.new('0', :links => ['1'])
43
+ pages << FakePage.new('1')
44
+ pages << FakePage.new('2', :links => ['3'])
45
+ pages << FakePage.new('3')
46
+
47
+ Anemone.crawl([pages[0].url, pages[2].url]).should have(4).pages
48
+ end
49
+
50
+ it "should include the query string when following links" do
51
+ pages = []
52
+ pages << FakePage.new('0', :links => ['1?foo=1'])
53
+ pages << FakePage.new('1?foo=1')
54
+ pages << FakePage.new('1')
55
+
56
+ core = Anemone.crawl(pages[0].url)
57
+
58
+ core.should have(2).pages
59
+ core.pages.keys.map{|k| k.to_s}.should_not include(pages[2].url)
60
+ end
61
+
62
+ it "should be able to skip links based on a RegEx" do
63
+ pages = []
64
+ pages << FakePage.new('0', :links => ['1', '2'])
65
+ pages << FakePage.new('1')
66
+ pages << FakePage.new('2')
67
+
68
+ core = Anemone.crawl(pages[0].url) do |a|
69
+ a.skip_links_like /1/
70
+ end
71
+
72
+ core.should have(2).pages
73
+ core.pages.keys.map{|k| k.to_s}.should_not include(pages[1].url)
74
+ end
75
+
76
+ it "should be able to call a block on every page" do
77
+ pages = []
78
+ pages << FakePage.new('0', :links => ['1', '2'])
79
+ pages << FakePage.new('1')
80
+ pages << FakePage.new('2')
81
+
82
+ count = 0
83
+ Anemone.crawl(pages[0].url) do |a|
84
+ a.on_every_page { count += 1 }
85
+ end
86
+
87
+ count.should == 3
88
+ end
89
+
90
+ it "should not discard page bodies by default" do
91
+ Anemone.crawl(FakePage.new('0').url).pages.values.first.doc.should_not be_nil
92
+ end
93
+
94
+ it "should optionally discard page bodies to conserve memory" do
95
+ core = Anemone.crawl(FakePage.new('0').url, :discard_page_bodies => true)
96
+ core.pages.values.first.doc.should be_nil
97
+ end
98
+
99
+ it "should provide a focus_crawl method to select the links on each page to follow" do
100
+ pages = []
101
+ pages << FakePage.new('0', :links => ['1', '2'])
102
+ pages << FakePage.new('1')
103
+ pages << FakePage.new('2')
104
+
105
+ core = Anemone.crawl(pages[0].url) do |a|
106
+ a.focus_crawl {|p| p.links.reject{|l| l.to_s =~ /1/}}
107
+ end
108
+
109
+ core.should have(2).pages
110
+ core.pages.keys.map{|k| k.to_s}.should_not include(pages[1].url)
111
+ end
112
+
113
+ it "should optionally delay between page requests" do
114
+ delay = 0.25
115
+
116
+ pages = []
117
+ pages << FakePage.new('0', :links => '1')
118
+ pages << FakePage.new('1')
119
+
120
+ start = Time.now
121
+ Anemone.crawl(pages[0].url, :delay => delay)
122
+ finish = Time.now
123
+
124
+ (finish - start).should satisfy {|t| t > delay * 2}
125
+ end
126
+
127
+ end
128
+ end
@@ -0,0 +1,55 @@
1
+ begin
2
+ require 'fakeweb'
3
+ rescue LoadError
4
+ warn "You need the 'fakeweb' gem installed to test Anemone"
5
+ exit
6
+ end
7
+
8
+ FakeWeb.allow_net_connect = false
9
+
10
+ module Anemone
11
+ SPEC_DOMAIN = "http://www.example.com/"
12
+
13
+ class FakePage
14
+ attr_accessor :links
15
+ attr_accessor :hrefs
16
+
17
+ def initialize(name = '', options = {})
18
+ @name = name
19
+ @links = [options[:links]].flatten if options.has_key?(:links)
20
+ @hrefs = [options[:hrefs]].flatten if options.has_key?(:hrefs)
21
+ @redirect = options[:redirect] if options.has_key?(:redirect)
22
+
23
+ create_body
24
+ add_to_fakeweb
25
+ end
26
+
27
+ def url
28
+ SPEC_DOMAIN + @name
29
+ end
30
+
31
+ private
32
+
33
+ def create_body
34
+ @body = "<html><body>"
35
+ @links.each{|l| @body += "<a href=\"#{SPEC_DOMAIN}#{l}\"></a>"} if @links
36
+ @hrefs.each{|h| @body += "<a href=\"#{h}\"></a>"} if @hrefs
37
+ @body += "</body></html>"
38
+ end
39
+
40
+ def add_to_fakeweb
41
+ options = {:body => @body, :content_type => "text/html", :status => [200, "OK"]}
42
+
43
+ if @redirect
44
+ options[:status] = [301, "Permanently Moved"]
45
+ options[:location] = SPEC_DOMAIN + @redirect
46
+ end
47
+
48
+ FakeWeb.register_uri(:get, SPEC_DOMAIN + @name, options)
49
+ end
50
+ end
51
+ end
52
+
53
+ #default root
54
+ Anemone::FakePage.new
55
+
@@ -0,0 +1,49 @@
1
+ require File.dirname(__FILE__) + '/spec_helper'
2
+
3
+ module Anemone
4
+ describe Page do
5
+
6
+ before(:each) do
7
+ @page = Page.fetch(FakePage.new('home').url)
8
+ end
9
+
10
+ it "should be able to fetch a page" do
11
+ @page.should_not be_nil
12
+ @page.url.to_s.should include('home')
13
+ end
14
+
15
+ it "should store the response headers when fetching a page" do
16
+ @page.headers.should_not be_nil
17
+ @page.headers.should have_key('content-type')
18
+ end
19
+
20
+ it "should have an OpenStruct attribute for the developer to store data in" do
21
+ @page.data.should_not be_nil
22
+ @page.data.should be_an_instance_of(OpenStruct)
23
+
24
+ @page.data.test = 'test'
25
+ @page.data.test.should == 'test'
26
+ end
27
+
28
+ it "should have a Nokogiri::HTML::Document attribute for the page body" do
29
+ @page.doc.should_not be_nil
30
+ @page.doc.should be_an_instance_of(Nokogiri::HTML::Document)
31
+ end
32
+
33
+ it "should indicate whether it was fetched after an HTTP redirect" do
34
+ @page.should respond_to(:redirect?)
35
+
36
+ @page.redirect?.should == false
37
+
38
+ Page.fetch(FakePage.new('redir', :redirect => 'home').url).redirect?.should == true
39
+ end
40
+
41
+ it "should have a method to tell if a URI is in the same domain as the page" do
42
+ @page.should respond_to(:in_domain?)
43
+
44
+ @page.in_domain?(URI(FakePage.new('test').url)).should == true
45
+ @page.in_domain?(URI('http://www.other.com/')).should == false
46
+ end
47
+
48
+ end
49
+ end
@@ -0,0 +1,7 @@
1
+ require File.dirname(__FILE__) + '/fakeweb_helper'
2
+ require 'rubygems'
3
+
4
+ $:.unshift(File.dirname(__FILE__) + '/../lib/')
5
+ require 'anemone'
6
+
7
+ SPEC_DOMAIN = 'http://www.example.com/'
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: anemone
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.1
4
+ version: 0.1.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Chris Kite
@@ -9,7 +9,7 @@ autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
11
 
12
- date: 2009-07-22 00:00:00 -05:00
12
+ date: 2009-08-10 00:00:00 -05:00
13
13
  default_executable:
14
14
  dependencies:
15
15
  - !ruby/object:Gem::Dependency
@@ -35,23 +35,22 @@ extensions: []
35
35
  extra_rdoc_files:
36
36
  - README.rdoc
37
37
  files:
38
- - bin/anemone_pagedepth.rb
39
- - bin/anemone_url_list.rb
40
- - bin/anemone_cron.rb
38
+ - LICENSE.txt
39
+ - README.rdoc
41
40
  - bin/anemone_count.rb
41
+ - bin/anemone_cron.rb
42
+ - bin/anemone_pagedepth.rb
42
43
  - bin/anemone_serialize.rb
43
- - lib/anemone/tentacle.rb
44
- - lib/anemone/page.rb
45
- - lib/anemone/page_hash.rb
44
+ - bin/anemone_url_list.rb
45
+ - lib/anemone.rb
46
+ - lib/anemone/anemone.rb
46
47
  - lib/anemone/core.rb
47
48
  - lib/anemone/http.rb
48
- - lib/anemone/anemone.rb
49
- - lib/anemone.rb
50
- - README.rdoc
49
+ - lib/anemone/page.rb
50
+ - lib/anemone/page_hash.rb
51
+ - lib/anemone/tentacle.rb
51
52
  has_rdoc: true
52
53
  homepage: http://anemone.rubyforge.org
53
- licenses: []
54
-
55
54
  post_install_message:
56
55
  rdoc_options:
57
56
  - -m
@@ -75,9 +74,13 @@ required_rubygems_version: !ruby/object:Gem::Requirement
75
74
  requirements: []
76
75
 
77
76
  rubyforge_project: anemone
78
- rubygems_version: 1.3.4
77
+ rubygems_version: 1.3.1
79
78
  signing_key:
80
- specification_version: 3
79
+ specification_version: 2
81
80
  summary: Anemone web-spider framework
82
- test_files: []
83
-
81
+ test_files:
82
+ - spec/anemone_spec.rb
83
+ - spec/core_spec.rb
84
+ - spec/page_spec.rb
85
+ - spec/fakeweb_helper.rb
86
+ - spec/spec_helper.rb