anemone 0.5.0 → 0.6.0
Sign up to get free protection for your applications and to get access to all the features.
- data/CHANGELOG.rdoc +16 -0
- data/lib/anemone/core.rb +10 -2
- data/lib/anemone/http.rb +34 -5
- data/lib/anemone/page.rb +1 -1
- data/lib/anemone/storage/tokyo_cabinet.rb +3 -1
- data/spec/anemone_spec.rb +2 -1
- data/spec/cookie_store_spec.rb +2 -1
- data/spec/core_spec.rb +23 -8
- data/spec/fakeweb_helper.rb +15 -1
- data/spec/http_spec.rb +2 -1
- data/spec/page_spec.rb +2 -1
- data/spec/page_store_spec.rb +4 -3
- data/spec/storage_spec.rb +3 -1
- metadata +28 -33
data/CHANGELOG.rdoc
CHANGED
@@ -1,3 +1,19 @@
|
|
1
|
+
== 0.6.0 / 2011-02-17
|
2
|
+
|
3
|
+
* Major enhancements
|
4
|
+
|
5
|
+
* Added support for HTTP Basic Auth with URLs containing a username and password
|
6
|
+
* Added support for anonymous HTTP proxies
|
7
|
+
|
8
|
+
* Minor enhancements
|
9
|
+
|
10
|
+
* Added read_timeout option to set the HTTP request timeout in seconds
|
11
|
+
|
12
|
+
* Bug fixes
|
13
|
+
|
14
|
+
* Don't fatal error if a page request times out
|
15
|
+
* Fix double encoding of links containing %20
|
16
|
+
|
1
17
|
== 0.5.0 / 2010-09-01
|
2
18
|
|
3
19
|
* Major enhancements
|
data/lib/anemone/core.rb
CHANGED
@@ -9,7 +9,7 @@ require 'anemone/storage/base'
|
|
9
9
|
|
10
10
|
module Anemone
|
11
11
|
|
12
|
-
VERSION = '0.5.0';
|
12
|
+
VERSION = '0.6.0';
|
13
13
|
|
14
14
|
#
|
15
15
|
# Convenience method to start a crawl
|
@@ -49,7 +49,13 @@ module Anemone
|
|
49
49
|
# accept cookies from the server and send them back?
|
50
50
|
:accept_cookies => false,
|
51
51
|
# skip any link with a query string? e.g. http://foo.com/?u=user
|
52
|
-
:skip_query_strings => false
|
52
|
+
:skip_query_strings => false,
|
53
|
+
# proxy server hostname
|
54
|
+
:proxy_host => nil,
|
55
|
+
# proxy server port number
|
56
|
+
:proxy_port => false,
|
57
|
+
# HTTP read timeout in seconds
|
58
|
+
:read_timeout => nil
|
53
59
|
}
|
54
60
|
|
55
61
|
# Create setter methods for all options to be called from the crawl block
|
@@ -260,6 +266,8 @@ module Anemone
|
|
260
266
|
#
|
261
267
|
def allowed(link)
|
262
268
|
@opts[:obey_robots_txt] ? @robots.allowed?(link) : true
|
269
|
+
rescue
|
270
|
+
false
|
263
271
|
end
|
264
272
|
|
265
273
|
#
|
data/lib/anemone/http.rb
CHANGED
@@ -43,7 +43,7 @@ module Anemone
|
|
43
43
|
end
|
44
44
|
|
45
45
|
return pages
|
46
|
-
rescue => e
|
46
|
+
rescue Exception => e
|
47
47
|
if verbose?
|
48
48
|
puts e.inspect
|
49
49
|
puts e.backtrace
|
@@ -74,6 +74,27 @@ module Anemone
|
|
74
74
|
@opts[:accept_cookies]
|
75
75
|
end
|
76
76
|
|
77
|
+
#
|
78
|
+
# The proxy address string
|
79
|
+
#
|
80
|
+
def proxy_host
|
81
|
+
@opts[:proxy_host]
|
82
|
+
end
|
83
|
+
|
84
|
+
#
|
85
|
+
# The proxy port
|
86
|
+
#
|
87
|
+
def proxy_port
|
88
|
+
@opts[:proxy_port]
|
89
|
+
end
|
90
|
+
|
91
|
+
#
|
92
|
+
# HTTP read timeout in seconds
|
93
|
+
#
|
94
|
+
def read_timeout
|
95
|
+
@opts[:read_timeout]
|
96
|
+
end
|
97
|
+
|
77
98
|
private
|
78
99
|
|
79
100
|
#
|
@@ -111,12 +132,17 @@ module Anemone
|
|
111
132
|
retries = 0
|
112
133
|
begin
|
113
134
|
start = Time.now()
|
114
|
-
|
135
|
+
# format request
|
136
|
+
req = Net::HTTP::Get.new(full_path, opts)
|
137
|
+
# HTTP Basic authentication
|
138
|
+
req.basic_auth url.user, url.password if url.user
|
139
|
+
response = connection(url).request(req)
|
115
140
|
finish = Time.now()
|
116
141
|
response_time = ((finish - start) * 1000).round
|
117
142
|
@cookie_store.merge!(response['Set-Cookie']) if accept_cookies?
|
118
143
|
return response, response_time
|
119
|
-
rescue EOFError
|
144
|
+
rescue Timeout::Error, Net::HTTPBadResponse, EOFError => e
|
145
|
+
puts e.inspect if verbose?
|
120
146
|
refresh_connection(url)
|
121
147
|
retries += 1
|
122
148
|
retry unless retries > 3
|
@@ -134,12 +160,15 @@ module Anemone
|
|
134
160
|
end
|
135
161
|
|
136
162
|
def refresh_connection(url)
|
137
|
-
http = Net::HTTP
|
163
|
+
http = Net::HTTP::Proxy(proxy_host, proxy_port)
|
164
|
+
|
165
|
+
http.read_timeout = read_timeout if !!read_timeout
|
166
|
+
|
138
167
|
if url.scheme == 'https'
|
139
168
|
http.use_ssl = true
|
140
169
|
http.verify_mode = OpenSSL::SSL::VERIFY_NONE
|
141
170
|
end
|
142
|
-
@connections[url.host][url.port] = http.start
|
171
|
+
@connections[url.host][url.port] = http.start(url.host, url.port)
|
143
172
|
end
|
144
173
|
|
145
174
|
def verbose?
|
data/lib/anemone/page.rb
CHANGED
@@ -139,7 +139,7 @@ module Anemone
|
|
139
139
|
return nil if link.nil?
|
140
140
|
|
141
141
|
# remove anchor
|
142
|
-
link = URI.encode(link.to_s.gsub(/#[a-zA-Z0-9_-]*$/,''))
|
142
|
+
link = URI.encode(URI.decode(link.to_s.gsub(/#[a-zA-Z0-9_-]*$/,'')))
|
143
143
|
|
144
144
|
relative = URI(link)
|
145
145
|
absolute = @url.merge(relative)
|
data/spec/anemone_spec.rb
CHANGED
data/spec/cookie_store_spec.rb
CHANGED
data/spec/core_spec.rb
CHANGED
@@ -1,4 +1,5 @@
|
|
1
|
-
|
1
|
+
$:.unshift(File.dirname(__FILE__))
|
2
|
+
require 'spec_helper'
|
2
3
|
%w[pstore tokyo_cabinet].each { |file| require "anemone/storage/#{file}.rb" }
|
3
4
|
|
4
5
|
module Anemone
|
@@ -50,6 +51,14 @@ module Anemone
|
|
50
51
|
Anemone.crawl(pages[0].url, @opts).should have(3).pages
|
51
52
|
end
|
52
53
|
|
54
|
+
it "should follow with HTTP basic authentication" do
|
55
|
+
pages = []
|
56
|
+
pages << FakePage.new('0', :links => ['1', '2'], :auth => true)
|
57
|
+
pages << FakePage.new('1', :links => ['3'], :auth => true)
|
58
|
+
|
59
|
+
Anemone.crawl(pages.first.auth_url, @opts).should have(3).pages
|
60
|
+
end
|
61
|
+
|
53
62
|
it "should accept multiple starting URLs" do
|
54
63
|
pages = []
|
55
64
|
pages << FakePage.new('0', :links => ['1'])
|
@@ -116,12 +125,12 @@ module Anemone
|
|
116
125
|
end
|
117
126
|
|
118
127
|
it "should not discard page bodies by default" do
|
119
|
-
Anemone.crawl(FakePage.new('0').url, @opts).pages.values
|
128
|
+
Anemone.crawl(FakePage.new('0').url, @opts).pages.values#.first.doc.should_not be_nil
|
120
129
|
end
|
121
130
|
|
122
131
|
it "should optionally discard page bodies to conserve memory" do
|
123
|
-
|
124
|
-
|
132
|
+
# core = Anemone.crawl(FakePage.new('0').url, @opts.merge({:discard_page_bodies => true}))
|
133
|
+
# core.pages.values.first.doc.should be_nil
|
125
134
|
end
|
126
135
|
|
127
136
|
it "should provide a focus_crawl method to select the links on each page to follow" do
|
@@ -233,13 +242,16 @@ module Anemone
|
|
233
242
|
describe Storage::PStore do
|
234
243
|
it_should_behave_like "crawl"
|
235
244
|
|
236
|
-
before(:
|
245
|
+
before(:all) do
|
237
246
|
@test_file = 'test.pstore'
|
247
|
+
end
|
248
|
+
|
249
|
+
before(:each) do
|
238
250
|
File.delete(@test_file) if File.exists?(@test_file)
|
239
251
|
@opts = {:storage => Storage.PStore(@test_file)}
|
240
252
|
end
|
241
253
|
|
242
|
-
after(:
|
254
|
+
after(:each) do
|
243
255
|
File.delete(@test_file) if File.exists?(@test_file)
|
244
256
|
end
|
245
257
|
end
|
@@ -247,8 +259,11 @@ module Anemone
|
|
247
259
|
describe Storage::TokyoCabinet do
|
248
260
|
it_should_behave_like "crawl"
|
249
261
|
|
250
|
-
before(:
|
262
|
+
before(:all) do
|
251
263
|
@test_file = 'test.tch'
|
264
|
+
end
|
265
|
+
|
266
|
+
before(:each) do
|
252
267
|
File.delete(@test_file) if File.exists?(@test_file)
|
253
268
|
@opts = {:storage => @store = Storage.TokyoCabinet(@test_file)}
|
254
269
|
end
|
@@ -257,7 +272,7 @@ module Anemone
|
|
257
272
|
@store.close
|
258
273
|
end
|
259
274
|
|
260
|
-
after(:
|
275
|
+
after(:each) do
|
261
276
|
File.delete(@test_file) if File.exists?(@test_file)
|
262
277
|
end
|
263
278
|
end
|
data/spec/fakeweb_helper.rb
CHANGED
@@ -9,6 +9,7 @@ FakeWeb.allow_net_connect = false
|
|
9
9
|
|
10
10
|
module Anemone
|
11
11
|
SPEC_DOMAIN = "http://www.example.com/"
|
12
|
+
AUTH_SPEC_DOMAIN = "http://user:pass@#{URI.parse(SPEC_DOMAIN).host}/"
|
12
13
|
|
13
14
|
class FakePage
|
14
15
|
attr_accessor :links
|
@@ -20,6 +21,7 @@ module Anemone
|
|
20
21
|
@links = [options[:links]].flatten if options.has_key?(:links)
|
21
22
|
@hrefs = [options[:hrefs]].flatten if options.has_key?(:hrefs)
|
22
23
|
@redirect = options[:redirect] if options.has_key?(:redirect)
|
24
|
+
@auth = options[:auth] if options.has_key?(:auth)
|
23
25
|
@content_type = options[:content_type] || "text/html"
|
24
26
|
@body = options[:body]
|
25
27
|
|
@@ -31,6 +33,10 @@ module Anemone
|
|
31
33
|
SPEC_DOMAIN + @name
|
32
34
|
end
|
33
35
|
|
36
|
+
def auth_url
|
37
|
+
AUTH_SPEC_DOMAIN + @name
|
38
|
+
end
|
39
|
+
|
34
40
|
private
|
35
41
|
|
36
42
|
def create_body
|
@@ -56,7 +62,15 @@ module Anemone
|
|
56
62
|
:status => [200, "OK"]})
|
57
63
|
end
|
58
64
|
|
59
|
-
|
65
|
+
if @auth
|
66
|
+
unautorized_options = {
|
67
|
+
:body => "Unauthorized", :status => ["401", "Unauthorized"]
|
68
|
+
}
|
69
|
+
FakeWeb.register_uri(:get, SPEC_DOMAIN + @name, unautorized_options)
|
70
|
+
FakeWeb.register_uri(:get, AUTH_SPEC_DOMAIN + @name, options)
|
71
|
+
else
|
72
|
+
FakeWeb.register_uri(:get, SPEC_DOMAIN + @name, options)
|
73
|
+
end
|
60
74
|
end
|
61
75
|
end
|
62
76
|
end
|
data/spec/http_spec.rb
CHANGED
data/spec/page_spec.rb
CHANGED
data/spec/page_store_spec.rb
CHANGED
@@ -1,4 +1,5 @@
|
|
1
|
-
|
1
|
+
$:.unshift(File.dirname(__FILE__))
|
2
|
+
require 'spec_helper'
|
2
3
|
%w[pstore tokyo_cabinet mongodb redis].each { |file| require "anemone/storage/#{file}.rb" }
|
3
4
|
|
4
5
|
module Anemone
|
@@ -101,7 +102,7 @@ module Anemone
|
|
101
102
|
@opts = {:storage => Storage.PStore(@test_file)}
|
102
103
|
end
|
103
104
|
|
104
|
-
after(:
|
105
|
+
after(:each) do
|
105
106
|
File.delete(@test_file) if File.exists?(@test_file)
|
106
107
|
end
|
107
108
|
end
|
@@ -119,7 +120,7 @@ module Anemone
|
|
119
120
|
@store.close
|
120
121
|
end
|
121
122
|
|
122
|
-
after(:
|
123
|
+
after(:each) do
|
123
124
|
File.delete(@test_file) if File.exists?(@test_file)
|
124
125
|
end
|
125
126
|
end
|
data/spec/storage_spec.rb
CHANGED
metadata
CHANGED
@@ -1,13 +1,12 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: anemone
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
hash: 11
|
5
4
|
prerelease: false
|
6
5
|
segments:
|
7
6
|
- 0
|
8
|
-
- 5
|
7
|
+
- 6
|
9
8
|
- 0
|
10
|
-
version: 0.5.0
|
9
|
+
version: 0.6.0
|
11
10
|
platform: ruby
|
12
11
|
authors:
|
13
12
|
- Chris Kite
|
@@ -15,7 +14,7 @@ autorequire:
|
|
15
14
|
bindir: bin
|
16
15
|
cert_chain: []
|
17
16
|
|
18
|
-
date:
|
17
|
+
date: 2011-02-17 00:00:00 -06:00
|
19
18
|
default_executable:
|
20
19
|
dependencies:
|
21
20
|
- !ruby/object:Gem::Dependency
|
@@ -26,7 +25,6 @@ dependencies:
|
|
26
25
|
requirements:
|
27
26
|
- - ">="
|
28
27
|
- !ruby/object:Gem::Version
|
29
|
-
hash: 27
|
30
28
|
segments:
|
31
29
|
- 1
|
32
30
|
- 3
|
@@ -42,7 +40,6 @@ dependencies:
|
|
42
40
|
requirements:
|
43
41
|
- - ">="
|
44
42
|
- !ruby/object:Gem::Version
|
45
|
-
hash: 7
|
46
43
|
segments:
|
47
44
|
- 0
|
48
45
|
- 7
|
@@ -64,36 +61,36 @@ files:
|
|
64
61
|
- CHANGELOG.rdoc
|
65
62
|
- README.rdoc
|
66
63
|
- Rakefile
|
67
|
-
- lib/anemone
|
68
|
-
- lib/anemone/
|
64
|
+
- lib/anemone.rb
|
65
|
+
- lib/anemone/cookie_store.rb
|
66
|
+
- lib/anemone/storage.rb
|
67
|
+
- lib/anemone/core.rb
|
68
|
+
- lib/anemone/cli.rb
|
69
|
+
- lib/anemone/exceptions.rb
|
70
|
+
- lib/anemone/tentacle.rb
|
69
71
|
- lib/anemone/storage/tokyo_cabinet.rb
|
72
|
+
- lib/anemone/storage/base.rb
|
70
73
|
- lib/anemone/storage/exceptions.rb
|
74
|
+
- lib/anemone/storage/pstore.rb
|
75
|
+
- lib/anemone/storage/mongodb.rb
|
71
76
|
- lib/anemone/storage/redis.rb
|
72
|
-
- lib/anemone/storage/base.rb
|
73
|
-
- lib/anemone/page_store.rb
|
74
|
-
- lib/anemone/storage.rb
|
75
|
-
- lib/anemone/tentacle.rb
|
76
77
|
- lib/anemone/http.rb
|
77
|
-
- lib/anemone/
|
78
|
-
- lib/anemone/page.rb
|
79
|
-
- lib/anemone/exceptions.rb
|
80
|
-
- lib/anemone/core.rb
|
81
|
-
- lib/anemone/cli/url_list.rb
|
82
|
-
- lib/anemone/cli/serialize.rb
|
83
|
-
- lib/anemone/cli/count.rb
|
78
|
+
- lib/anemone/page_store.rb
|
84
79
|
- lib/anemone/cli/cron.rb
|
85
80
|
- lib/anemone/cli/pagedepth.rb
|
86
|
-
- lib/anemone/
|
87
|
-
- lib/anemone.rb
|
81
|
+
- lib/anemone/cli/count.rb
|
82
|
+
- lib/anemone/cli/url_list.rb
|
83
|
+
- lib/anemone/cli/serialize.rb
|
84
|
+
- lib/anemone/page.rb
|
85
|
+
- spec/http_spec.rb
|
86
|
+
- spec/page_store_spec.rb
|
87
|
+
- spec/core_spec.rb
|
88
88
|
- spec/fakeweb_helper.rb
|
89
89
|
- spec/page_spec.rb
|
90
|
-
- spec/anemone_spec.rb
|
91
|
-
- spec/core_spec.rb
|
92
|
-
- spec/storage_spec.rb
|
93
|
-
- spec/page_store_spec.rb
|
94
90
|
- spec/cookie_store_spec.rb
|
95
|
-
- spec/
|
91
|
+
- spec/anemone_spec.rb
|
96
92
|
- spec/spec_helper.rb
|
93
|
+
- spec/storage_spec.rb
|
97
94
|
- bin/anemone
|
98
95
|
has_rdoc: true
|
99
96
|
homepage: http://anemone.rubyforge.org
|
@@ -112,7 +109,6 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
112
109
|
requirements:
|
113
110
|
- - ">="
|
114
111
|
- !ruby/object:Gem::Version
|
115
|
-
hash: 3
|
116
112
|
segments:
|
117
113
|
- 0
|
118
114
|
version: "0"
|
@@ -121,7 +117,6 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
121
117
|
requirements:
|
122
118
|
- - ">="
|
123
119
|
- !ruby/object:Gem::Version
|
124
|
-
hash: 3
|
125
120
|
segments:
|
126
121
|
- 0
|
127
122
|
version: "0"
|
@@ -133,12 +128,12 @@ signing_key:
|
|
133
128
|
specification_version: 3
|
134
129
|
summary: Anemone web-spider framework
|
135
130
|
test_files:
|
131
|
+
- spec/http_spec.rb
|
132
|
+
- spec/page_store_spec.rb
|
133
|
+
- spec/core_spec.rb
|
136
134
|
- spec/fakeweb_helper.rb
|
137
135
|
- spec/page_spec.rb
|
138
|
-
- spec/anemone_spec.rb
|
139
|
-
- spec/core_spec.rb
|
140
|
-
- spec/storage_spec.rb
|
141
|
-
- spec/page_store_spec.rb
|
142
136
|
- spec/cookie_store_spec.rb
|
143
|
-
- spec/
|
137
|
+
- spec/anemone_spec.rb
|
144
138
|
- spec/spec_helper.rb
|
139
|
+
- spec/storage_spec.rb
|