shingara-anemone 0.2.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
spec/anemone_spec.rb ADDED
@@ -0,0 +1,15 @@
+ require File.dirname(__FILE__) + '/spec_helper'
+
+ describe Anemone do
+
+   it "should have a version" do
+     Anemone.const_defined?('VERSION').should == true
+   end
+
+   it "should return an Anemone::Core from the crawl, which has a PageHash" do
+     result = Anemone.crawl(SPEC_DOMAIN)
+     result.should be_an_instance_of(Anemone::Core)
+     result.pages.should be_an_instance_of(Anemone::PageHash)
+   end
+
+ end
spec/core_spec.rb ADDED
@@ -0,0 +1,203 @@
+ require File.dirname(__FILE__) + '/spec_helper'
+
+ module Anemone
+   describe Core do
+
+     before(:each) do
+       FakeWeb.clean_registry
+     end
+
+     it "should crawl all the html pages in a domain by following <a> href's" do
+       pages = []
+       pages << FakePage.new('0', :links => ['1', '2'])
+       pages << FakePage.new('1', :links => ['3'])
+       pages << FakePage.new('2')
+       pages << FakePage.new('3')
+
+       Anemone.crawl(pages[0].url).should have(4).pages
+     end
+
+     it "should not leave the original domain" do
+       pages = []
+       pages << FakePage.new('0', :links => ['1'], :hrefs => 'http://www.other.com/')
+       pages << FakePage.new('1')
+
+       core = Anemone.crawl(pages[0].url)
+
+       core.should have(2).pages
+       core.pages.keys.should_not include('http://www.other.com/')
+     end
+
+     it "should follow http redirects" do
+       pages = []
+       pages << FakePage.new('0', :links => ['1'])
+       pages << FakePage.new('1', :redirect => '2')
+       pages << FakePage.new('2')
+
+       Anemone.crawl(pages[0].url).should have(3).pages
+     end
+
+     it "should accept multiple starting URLs" do
+       pages = []
+       pages << FakePage.new('0', :links => ['1'])
+       pages << FakePage.new('1')
+       pages << FakePage.new('2', :links => ['3'])
+       pages << FakePage.new('3')
+
+       Anemone.crawl([pages[0].url, pages[2].url]).should have(4).pages
+     end
+
+     it "should include the query string when following links" do
+       pages = []
+       pages << FakePage.new('0', :links => ['1?foo=1'])
+       pages << FakePage.new('1?foo=1')
+       pages << FakePage.new('1')
+
+       core = Anemone.crawl(pages[0].url)
+
+       core.should have(2).pages
+       core.pages.keys.should_not include(pages[2].url)
+     end
+
+     it "should be able to skip links based on a RegEx" do
+       pages = []
+       pages << FakePage.new('0', :links => ['1', '2'])
+       pages << FakePage.new('1')
+       pages << FakePage.new('2')
+       pages << FakePage.new('3')
+
+       core = Anemone.crawl(pages[0].url) do |a|
+         a.skip_links_like /1/, /3/
+       end
+
+       core.should have(2).pages
+       core.pages.keys.should_not include(pages[1].url)
+       core.pages.keys.should_not include(pages[3].url)
+     end
+
+     it "should be able to call a block on every page" do
+       pages = []
+       pages << FakePage.new('0', :links => ['1', '2'])
+       pages << FakePage.new('1')
+       pages << FakePage.new('2')
+
+       count = 0
+       Anemone.crawl(pages[0].url) do |a|
+         a.on_every_page { count += 1 }
+       end
+
+       count.should == 3
+     end
+
+     it "should not discard page bodies by default" do
+       Anemone.crawl(FakePage.new('0').url).pages.values.first.doc.should_not be_nil
+     end
+
+     it "should optionally discard page bodies to conserve memory" do
+       core = Anemone.crawl(FakePage.new('0').url, :discard_page_bodies => true)
+       core.pages.values.first.doc.should be_nil
+     end
+
+     it "should provide a focus_crawl method to select the links on each page to follow" do
+       pages = []
+       pages << FakePage.new('0', :links => ['1', '2'])
+       pages << FakePage.new('1')
+       pages << FakePage.new('2')
+
+       core = Anemone.crawl(pages[0].url) do |a|
+         a.focus_crawl {|p| p.links.reject{|l| l.to_s =~ /1/}}
+       end
+
+       core.should have(2).pages
+       core.pages.keys.should_not include(pages[1].url)
+     end
+
+     it "should optionally delay between page requests" do
+       delay = 0.25
+
+       pages = []
+       pages << FakePage.new('0', :links => '1')
+       pages << FakePage.new('1')
+
+       start = Time.now
+       Anemone.crawl(pages[0].url, :delay => delay)
+       finish = Time.now
+
+       (finish - start).should satisfy {|t| t > delay * 2}
+     end
+
+     it "should optionally obey the robots exclusion protocol" do
+       pages = []
+       pages << FakePage.new('0', :links => '1')
+       pages << FakePage.new('1')
+       pages << FakePage.new('robots.txt',
+                             :body => "User-agent: *\nDisallow: /1",
+                             :content_type => 'text/plain')
+
+       core = Anemone.crawl(pages[0].url, :obey_robots_txt => true)
+       urls = core.pages.keys
+
+       urls.should include(pages[0].url)
+       urls.should_not include(pages[1].url)
+     end
+
+     describe "many pages" do
+       before(:each) do
+         @pages, size = [], 5
+
+         size.times do |n|
+           # register this page with a link to the next page
+           link = (n + 1).to_s if n + 1 < size
+           @pages << FakePage.new(n.to_s, :links => Array(link))
+         end
+       end
+
+       it "should track the page depth and referer" do
+         core = Anemone.crawl(@pages[0].url)
+         previous_page = nil
+
+         @pages.each_with_index do |page, i|
+           page = core.pages[page.url]
+           page.should be
+           page.depth.should == i
+
+           if previous_page
+             page.referer.should == previous_page.url
+           else
+             page.referer.should be_nil
+           end
+           previous_page = page
+         end
+       end
+
+       it "should optionally limit the depth of the crawl" do
+         core = Anemone.crawl(@pages[0].url, :depth_limit => 3)
+         core.should have(4).pages
+       end
+     end
+
+     describe "options" do
+       it "should accept options for the crawl" do
+         core = Anemone.crawl(SPEC_DOMAIN, :verbose => false,
+                                           :threads => 2,
+                                           :discard_page_bodies => true,
+                                           :user_agent => 'test',
+                                           :obey_robots_txt => true,
+                                           :depth_limit => 3)
+
+         core.opts[:verbose].should == false
+         core.opts[:threads].should == 2
+         core.opts[:discard_page_bodies].should == true
+         core.opts[:delay].should == 0
+         core.opts[:user_agent].should == 'test'
+         core.opts[:obey_robots_txt].should == true
+         core.opts[:depth_limit].should == 3
+       end
+
+       it "should use 1 thread if a delay is requested" do
+         Anemone.crawl(SPEC_DOMAIN, :delay => 0.01, :threads => 2).opts[:threads].should == 1
+       end
+     end
+
+   end
+ end
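
Taken together, core_spec.rb exercises the whole public crawl API. The sketch below condenses those assertions into one hedged usage example: the start URL and the /logout/ skip pattern are placeholders, while every method, option key, and attribute is named exactly as in the spec above.

    require 'rubygems'
    require 'anemone'

    core = Anemone.crawl('http://www.example.com/', :threads => 2,
                                                    :obey_robots_txt => true,
                                                    :depth_limit => 3) do |a|
      a.skip_links_like(/logout/)                       # skip links matching any given pattern
      a.focus_crawl { |page| page.links }               # choose which of a page's links to follow
      a.on_every_page { |page| page.data.seen = true }  # page.data is an OpenStruct scratch space
    end

    # core is an Anemone::Core; core.pages is an Anemone::PageHash keyed by URL
    core.pages.each { |url, page| puts "depth #{page.depth}: #{url}" }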
spec/fakeweb_helper.rb ADDED
@@ -0,0 +1,57 @@
+ begin
+   require 'fakeweb'
+ rescue LoadError
+   warn "You need the 'fakeweb' gem installed to test Anemone"
+   exit
+ end
+
+ FakeWeb.allow_net_connect = false
+
+ module Anemone
+   SPEC_DOMAIN = "http://www.example.com/"
+
+   class FakePage
+     attr_accessor :links
+     attr_accessor :hrefs
+     attr_accessor :body
+
+     def initialize(name = '', options = {})
+       @name = name
+       @links = [options[:links]].flatten if options.has_key?(:links)
+       @hrefs = [options[:hrefs]].flatten if options.has_key?(:hrefs)
+       @redirect = options[:redirect] if options.has_key?(:redirect)
+       @content_type = options[:content_type] || "text/html"
+       @body = options[:body]
+
+       create_body unless @body
+       add_to_fakeweb
+     end
+
+     def url
+       SPEC_DOMAIN + @name
+     end
+
+     private
+
+     def create_body
+       @body = "<html><body>"
+       @links.each{|l| @body += "<a href=\"#{SPEC_DOMAIN}#{l}\"></a>"} if @links
+       @hrefs.each{|h| @body += "<a href=\"#{h}\"></a>"} if @hrefs
+       @body += "</body></html>"
+     end
+
+     def add_to_fakeweb
+       options = {:body => @body, :content_type => @content_type, :status => [200, "OK"]}
+
+       if @redirect
+         options[:status] = [301, "Permanently Moved"]
+         options[:location] = SPEC_DOMAIN + @redirect
+       end
+
+       FakeWeb.register_uri(:get, SPEC_DOMAIN + @name, options)
+     end
+   end
+ end
+
+ #default root
+ Anemone::FakePage.new
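
fakeweb_helper.rb is what keeps the suite offline: every FakePage registers its URL with FakeWeb from its constructor. A hedged sketch of that pattern in isolation, assuming the anemone gem is installed and the script sits next to the helper above:

    require 'rubygems'
    require 'anemone'
    require File.dirname(__FILE__) + '/fakeweb_helper'

    # FakePage#initialize built a fake HTML body linking to 'contact' and registered
    # http://www.example.com/about with FakeWeb, so this fetch never touches the network.
    about = Anemone::FakePage.new('about', :links => ['contact'])
    page  = Anemone::HTTP.new.fetch_page(about.url)
    puts page.doc.css('a').size   # => 1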
spec/page_spec.rb ADDED
@@ -0,0 +1,52 @@
+ require File.dirname(__FILE__) + '/spec_helper'
+
+ module Anemone
+   describe Page do
+
+     before(:all) do
+       @http = Anemone::HTTP.new
+     end
+
+     before(:each) do
+       @page = @http.fetch_page(FakePage.new('home').url)
+     end
+
+     it "should store the response headers when fetching a page" do
+       @page.headers.should_not be_nil
+       @page.headers.should have_key('content-type')
+     end
+
+     it "should have an OpenStruct attribute for the developer to store data in" do
+       @page.data.should_not be_nil
+       @page.data.should be_an_instance_of(OpenStruct)
+
+       @page.data.test = 'test'
+       @page.data.test.should == 'test'
+     end
+
+     it "should have a Nokogiri::HTML::Document attribute for the page body" do
+       @page.doc.should_not be_nil
+       @page.doc.should be_an_instance_of(Nokogiri::HTML::Document)
+     end
+
+     it "should indicate whether it was fetched after an HTTP redirect" do
+       @page.should respond_to(:redirect?)
+
+       @page.redirect?.should == false
+
+       @http.fetch_page(FakePage.new('redir', :redirect => 'home').url).redirect?.should == true
+     end
+
+     it "should have a method to tell if a URI is in the same domain as the page" do
+       @page.should respond_to(:in_domain?)
+
+       @page.in_domain?(URI(FakePage.new('test').url)).should == true
+       @page.in_domain?(URI('http://www.other.com/')).should == false
+     end
+
+     it "should include the response time for the HTTP request" do
+       @page.should respond_to(:response_time)
+     end
+
+   end
+ end
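
page_spec.rb pins down the attributes every fetched page carries. Below is a condensed, hedged read-through of those accessors; the URL is a placeholder and, outside the FakeWeb setup above, would be fetched over the network:

    require 'rubygems'
    require 'anemone'

    page = Anemone::HTTP.new.fetch_page('http://www.example.com/')

    page.headers['content-type']                       # stored response headers
    page.doc.class                                     # => Nokogiri::HTML::Document
    page.redirect?                                     # fetched via an HTTP redirect?
    page.in_domain?(URI('http://www.example.com/a'))   # => true
    page.response_time                                 # timing of the HTTP request
    page.data.note = 'scratch space'                   # OpenStruct for user data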
spec/spec_helper.rb ADDED
@@ -0,0 +1,7 @@
+ require 'rubygems'
+ require File.dirname(__FILE__) + '/fakeweb_helper'
+
+ $:.unshift(File.dirname(__FILE__) + '/../lib/')
+ require 'anemone'
+
+ SPEC_DOMAIN = 'http://www.example.com/'
metadata ADDED
@@ -0,0 +1,96 @@
+ --- !ruby/object:Gem::Specification
+ name: shingara-anemone
+ version: !ruby/object:Gem::Version
+   version: 0.2.4
+ platform: ruby
+ authors:
+ - Chris Kite
+ autorequire:
+ bindir: bin
+ cert_chain: []
+
+ date: 2009-11-20 00:00:00 +01:00
+ default_executable:
+ dependencies:
+ - !ruby/object:Gem::Dependency
+   name: nokogiri
+   type: :runtime
+   version_requirement:
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - ">="
+       - !ruby/object:Gem::Version
+         version: 1.3.0
+     version:
+ - !ruby/object:Gem::Dependency
+   name: robots
+   type: :runtime
+   version_requirement:
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - ">="
+       - !ruby/object:Gem::Version
+         version: 0.7.2
+     version:
+ description:
+ email:
+ executables:
+ - anemone
+ extensions: []
+
+ extra_rdoc_files:
+ - README.rdoc
+ files:
+ - LICENSE.txt
+ - CHANGELOG.rdoc
+ - README.rdoc
+ - bin/anemone
+ - lib/anemone.rb
+ - lib/anemone/core.rb
+ - lib/anemone/http.rb
+ - lib/anemone/page.rb
+ - lib/anemone/page_hash.rb
+ - lib/anemone/tentacle.rb
+ - lib/anemone/cli.rb
+ - lib/anemone/cli/url_list.rb
+ - lib/anemone/cli/cron.rb
+ - lib/anemone/cli/count.rb
+ - lib/anemone/cli/pagedepth.rb
+ - lib/anemone/cli/serialize.rb
+ has_rdoc: true
+ homepage: http://anemone.rubyforge.org
+ licenses: []
+
+ post_install_message:
+ rdoc_options:
+ - -m
+ - README.rdoc
+ - -t
+ - Anemone
+ require_paths:
+ - lib
+ required_ruby_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - ">="
+     - !ruby/object:Gem::Version
+       version: "0"
+   version:
+ required_rubygems_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - ">="
+     - !ruby/object:Gem::Version
+       version: "0"
+   version:
+ requirements: []
+
+ rubyforge_project: anemone
+ rubygems_version: 1.3.5
+ signing_key:
+ specification_version: 3
+ summary: Anemone web-spider framework
+ test_files:
+ - spec/anemone_spec.rb
+ - spec/core_spec.rb
+ - spec/page_spec.rb
+ - spec/fakeweb_helper.rb
+ - spec/spec_helper.rb
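
The metadata declares lib as the require path and nokogiri (>= 1.3.0) and robots (>= 0.7.2) as runtime dependencies. A hedged sketch of activating this exact release from a plain Ruby script, in the rubygems style spec_helper.rb itself uses:

    require 'rubygems'
    gem 'shingara-anemone', '0.2.4'   # the release shown in this diff
    require 'anemone'                 # lib/anemone.rb, per the declared require_paths

    puts Anemone::VERSION             # the constant anemone_spec.rb asserts is defined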