spk-anemone 0.2.4
data/spec/anemone_spec.rb ADDED
@@ -0,0 +1,15 @@
+ require File.dirname(__FILE__) + '/spec_helper'
+
+ describe Anemone do
+
+   it "should have a version" do
+     Anemone.const_defined?('VERSION').should == true
+   end
+
+   it "should return an Anemone::Core from the crawl, which has a PageHash" do
+     result = Anemone.crawl(SPEC_DOMAIN)
+     result.should be_an_instance_of(Anemone::Core)
+     result.pages.should be_an_instance_of(Anemone::PageHash)
+   end
+
+ end
data/spec/core_spec.rb ADDED
@@ -0,0 +1,203 @@
+ require File.dirname(__FILE__) + '/spec_helper'
+
+ module Anemone
+   describe Core do
+
+     before(:each) do
+       FakeWeb.clean_registry
+     end
+
+     it "should crawl all the html pages in a domain by following <a> hrefs" do
+       pages = []
+       pages << FakePage.new('0', :links => ['1', '2'])
+       pages << FakePage.new('1', :links => ['3'])
+       pages << FakePage.new('2')
+       pages << FakePage.new('3')
+
+       Anemone.crawl(pages[0].url).should have(4).pages
+     end
+
+     it "should not leave the original domain" do
+       pages = []
+       pages << FakePage.new('0', :links => ['1'], :hrefs => 'http://www.other.com/')
+       pages << FakePage.new('1')
+
+       core = Anemone.crawl(pages[0].url)
+
+       core.should have(2).pages
+       core.pages.keys.should_not include('http://www.other.com/')
+     end
+
+     it "should follow http redirects" do
+       pages = []
+       pages << FakePage.new('0', :links => ['1'])
+       pages << FakePage.new('1', :redirect => '2')
+       pages << FakePage.new('2')
+
+       Anemone.crawl(pages[0].url).should have(3).pages
+     end
+
+     it "should accept multiple starting URLs" do
+       pages = []
+       pages << FakePage.new('0', :links => ['1'])
+       pages << FakePage.new('1')
+       pages << FakePage.new('2', :links => ['3'])
+       pages << FakePage.new('3')
+
+       Anemone.crawl([pages[0].url, pages[2].url]).should have(4).pages
+     end
+
+     it "should include the query string when following links" do
+       pages = []
+       pages << FakePage.new('0', :links => ['1?foo=1'])
+       pages << FakePage.new('1?foo=1')
+       pages << FakePage.new('1')
+
+       core = Anemone.crawl(pages[0].url)
+
+       core.should have(2).pages
+       core.pages.keys.should_not include(pages[2].url)
+     end
+
+     it "should be able to skip links based on a RegEx" do
+       pages = []
+       pages << FakePage.new('0', :links => ['1', '2'])
+       pages << FakePage.new('1')
+       pages << FakePage.new('2')
+       pages << FakePage.new('3')
+
+       core = Anemone.crawl(pages[0].url) do |a|
+         a.skip_links_like(/1/, /3/)
+       end
+
+       core.should have(2).pages
+       core.pages.keys.should_not include(pages[1].url)
+       core.pages.keys.should_not include(pages[3].url)
+     end
+
+     it "should be able to call a block on every page" do
+       pages = []
+       pages << FakePage.new('0', :links => ['1', '2'])
+       pages << FakePage.new('1')
+       pages << FakePage.new('2')
+
+       count = 0
+       Anemone.crawl(pages[0].url) do |a|
+         a.on_every_page { count += 1 }
+       end
+
+       count.should == 3
+     end
+
+     it "should not discard page bodies by default" do
+       Anemone.crawl(FakePage.new('0').url).pages.values.first.doc.should_not be_nil
+     end
+
+     it "should optionally discard page bodies to conserve memory" do
+       core = Anemone.crawl(FakePage.new('0').url, :discard_page_bodies => true)
+       core.pages.values.first.doc.should be_nil
+     end
+
+     it "should provide a focus_crawl method to select the links on each page to follow" do
+       pages = []
+       pages << FakePage.new('0', :links => ['1', '2'])
+       pages << FakePage.new('1')
+       pages << FakePage.new('2')
+
+       core = Anemone.crawl(pages[0].url) do |a|
+         a.focus_crawl { |p| p.links.reject { |l| l.to_s =~ /1/ } }
+       end
+
+       core.should have(2).pages
+       core.pages.keys.should_not include(pages[1].url)
+     end
+
+     it "should optionally delay between page requests" do
+       delay = 0.25
+
+       pages = []
+       pages << FakePage.new('0', :links => '1')
+       pages << FakePage.new('1')
+
+       start = Time.now
+       Anemone.crawl(pages[0].url, :delay => delay)
+       finish = Time.now
+
+       (finish - start).should satisfy { |t| t > delay * 2 }
+     end
+
+     it "should optionally obey the robots exclusion protocol" do
+       pages = []
+       pages << FakePage.new('0', :links => '1')
+       pages << FakePage.new('1')
+       pages << FakePage.new('robots.txt',
+                             :body => "User-agent: *\nDisallow: /1",
+                             :content_type => 'text/plain')
+
+       core = Anemone.crawl(pages[0].url, :obey_robots_txt => true)
+       urls = core.pages.keys
+
+       urls.should include(pages[0].url)
+       urls.should_not include(pages[1].url)
+     end
+
+     describe "many pages" do
+       before(:each) do
+         @pages, size = [], 5
+
+         size.times do |n|
+           # register this page with a link to the next page
+           link = (n + 1).to_s if n + 1 < size
+           @pages << FakePage.new(n.to_s, :links => Array(link))
+         end
+       end
+
+       it "should track the page depth and referer" do
+         core = Anemone.crawl(@pages[0].url)
+         previous_page = nil
+
+         @pages.each_with_index do |page, i|
+           page = core.pages[page.url]
+           page.should be
+           page.depth.should == i
+
+           if previous_page
+             page.referer.should == previous_page.url
+           else
+             page.referer.should be_nil
+           end
+           previous_page = page
+         end
+       end
+
+       it "should optionally limit the depth of the crawl" do
+         core = Anemone.crawl(@pages[0].url, :depth_limit => 3)
+         core.should have(4).pages
+       end
+     end
+
+     describe "options" do
+       it "should accept options for the crawl" do
+         core = Anemone.crawl(SPEC_DOMAIN, :verbose => false,
+                                           :threads => 2,
+                                           :discard_page_bodies => true,
+                                           :user_agent => 'test',
+                                           :obey_robots_txt => true,
+                                           :depth_limit => 3)
+
+         core.opts[:verbose].should == false
+         core.opts[:threads].should == 2
+         core.opts[:discard_page_bodies].should == true
+         core.opts[:delay].should == 0
+         core.opts[:user_agent].should == 'test'
+         core.opts[:obey_robots_txt].should == true
+         core.opts[:depth_limit].should == 3
+       end
+
+       it "should use 1 thread if a delay is requested" do
+         Anemone.crawl(SPEC_DOMAIN, :delay => 0.01, :threads => 2).opts[:threads].should == 1
+       end
+     end
+
+   end
+ end
data/spec/fakeweb_helper.rb ADDED
@@ -0,0 +1,57 @@
+ begin
+   require 'fakeweb'
+ rescue LoadError
+   warn "You need the 'fakeweb' gem installed to test Anemone"
+   exit
+ end
+
+ FakeWeb.allow_net_connect = false
+
+ module Anemone
+   SPEC_DOMAIN = "http://www.example.com/"
+
+   class FakePage
+     attr_accessor :links
+     attr_accessor :hrefs
+     attr_accessor :body
+
+     def initialize(name = '', options = {})
+       @name = name
+       @links = [options[:links]].flatten if options.has_key?(:links)
+       @hrefs = [options[:hrefs]].flatten if options.has_key?(:hrefs)
+       @redirect = options[:redirect] if options.has_key?(:redirect)
+       @content_type = options[:content_type] || "text/html"
+       @body = options[:body]
+
+       create_body unless @body
+       add_to_fakeweb
+     end
+
+     def url
+       SPEC_DOMAIN + @name
+     end
+
+     private
+
+     def create_body
+       @body = "<html><body>"
+       @links.each { |l| @body += "<a href=\"#{SPEC_DOMAIN}#{l}\"></a>" } if @links
+       @hrefs.each { |h| @body += "<a href=\"#{h}\"></a>" } if @hrefs
+       @body += "</body></html>"
+     end
+
+     def add_to_fakeweb
+       options = {:body => @body, :content_type => @content_type, :status => [200, "OK"]}
+
+       if @redirect
+         options[:status] = [301, "Permanently Moved"]
+         options[:location] = SPEC_DOMAIN + @redirect
+       end
+
+       FakeWeb.register_uri(:get, SPEC_DOMAIN + @name, options)
+     end
+   end
+ end
+
+ # default root
+ Anemone::FakePage.new
data/spec/page_spec.rb ADDED
@@ -0,0 +1,52 @@
+ require File.dirname(__FILE__) + '/spec_helper'
+
+ module Anemone
+   describe Page do
+
+     before(:all) do
+       @http = Anemone::HTTP.new
+     end
+
+     before(:each) do
+       @page = @http.fetch_page(FakePage.new('home').url)
+     end
+
+     it "should store the response headers when fetching a page" do
+       @page.headers.should_not be_nil
+       @page.headers.should have_key('content-type')
+     end
+
+     it "should have an OpenStruct attribute for the developer to store data in" do
+       @page.data.should_not be_nil
+       @page.data.should be_an_instance_of(OpenStruct)
+
+       @page.data.test = 'test'
+       @page.data.test.should == 'test'
+     end
+
+     it "should have a Nokogiri::HTML::Document attribute for the page body" do
+       @page.doc.should_not be_nil
+       @page.doc.should be_an_instance_of(Nokogiri::HTML::Document)
+     end
+
+     it "should indicate whether it was fetched after an HTTP redirect" do
+       @page.should respond_to(:redirect?)
+
+       @page.redirect?.should == false
+
+       @http.fetch_page(FakePage.new('redir', :redirect => 'home').url).redirect?.should == true
+     end
+
+     it "should have a method to tell if a URI is in the same domain as the page" do
+       @page.should respond_to(:in_domain?)
+
+       @page.in_domain?(URI(FakePage.new('test').url)).should == true
+       @page.in_domain?(URI('http://www.other.com/')).should == false
+     end
+
+     it "should include the response time for the HTTP request" do
+       @page.should respond_to(:response_time)
+     end
+
+   end
+ end
data/spec/spec_helper.rb ADDED
@@ -0,0 +1,7 @@
+ require 'rubygems'
+ require File.dirname(__FILE__) + '/fakeweb_helper'
+
+ $:.unshift(File.dirname(__FILE__) + '/../lib/')
+ require 'anemone'
+
+ SPEC_DOMAIN = 'http://www.example.com/'
metadata ADDED
@@ -0,0 +1,97 @@
+ --- !ruby/object:Gem::Specification
+ name: spk-anemone
+ version: !ruby/object:Gem::Version
+   version: 0.2.4
+ platform: ruby
+ authors:
+ - Chris Kite
+ - spk
+ autorequire:
+ bindir: bin
+ cert_chain: []
+
+ date: 2009-11-26 00:00:00 +01:00
+ default_executable:
+ dependencies:
+ - !ruby/object:Gem::Dependency
+   name: nokogiri
+   type: :runtime
+   version_requirement:
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - ">="
+       - !ruby/object:Gem::Version
+         version: 1.4.0
+     version:
+ - !ruby/object:Gem::Dependency
+   name: robots
+   type: :runtime
+   version_requirement:
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - ">="
+       - !ruby/object:Gem::Version
+         version: 0.7.2
+     version:
+ description:
+ email:
+ executables:
+ - anemone
+ extensions: []
+
+ extra_rdoc_files:
+ - README.rdoc
+ files:
+ - LICENSE.txt
+ - CHANGELOG.rdoc
+ - README.rdoc
+ - bin/anemone
+ - lib/anemone.rb
+ - lib/anemone/core.rb
+ - lib/anemone/http.rb
+ - lib/anemone/page.rb
+ - lib/anemone/page_hash.rb
+ - lib/anemone/tentacle.rb
+ - lib/anemone/cli.rb
+ - lib/anemone/cli/url_list.rb
+ - lib/anemone/cli/cron.rb
+ - lib/anemone/cli/count.rb
+ - lib/anemone/cli/pagedepth.rb
+ - lib/anemone/cli/serialize.rb
+ has_rdoc: true
+ homepage: http://anemone.rubyforge.org
+ licenses: []
+
+ post_install_message:
+ rdoc_options:
+ - -m
+ - README.rdoc
+ - -t
+ - Anemone
+ require_paths:
+ - lib
+ required_ruby_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - ">="
+     - !ruby/object:Gem::Version
+       version: "0"
+   version:
+ required_rubygems_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - ">="
+     - !ruby/object:Gem::Version
+       version: "0"
+   version:
+ requirements: []
+
+ rubyforge_project: anemone
+ rubygems_version: 1.3.5
+ signing_key:
+ specification_version: 3
+ summary: Anemone web-spider framework
+ test_files:
+ - spec/anemone_spec.rb
+ - spec/core_spec.rb
+ - spec/page_spec.rb
+ - spec/fakeweb_helper.rb
+ - spec/spec_helper.rb