anemone 0.5.0 → 0.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGELOG.rdoc +16 -0
- data/lib/anemone/core.rb +10 -2
- data/lib/anemone/http.rb +34 -5
- data/lib/anemone/page.rb +1 -1
- data/lib/anemone/storage/tokyo_cabinet.rb +3 -1
- data/spec/anemone_spec.rb +2 -1
- data/spec/cookie_store_spec.rb +2 -1
- data/spec/core_spec.rb +23 -8
- data/spec/fakeweb_helper.rb +15 -1
- data/spec/http_spec.rb +2 -1
- data/spec/page_spec.rb +2 -1
- data/spec/page_store_spec.rb +4 -3
- data/spec/storage_spec.rb +3 -1
- metadata +28 -33
data/CHANGELOG.rdoc
CHANGED
@@ -1,3 +1,19 @@
|
|
1
|
+
== 0.6.0 / 2011-02-17
|
2
|
+
|
3
|
+
* Major enhancements
|
4
|
+
|
5
|
+
* Added support for HTTP Basic Auth with URLs containing a username and password
|
6
|
+
* Added support for anonymous HTTP proxies
|
7
|
+
|
8
|
+
* Minor enhancements
|
9
|
+
|
10
|
+
* Added read_timeout option to set the HTTP request timeout in seconds
|
11
|
+
|
12
|
+
* Bug fixes
|
13
|
+
|
14
|
+
* Don't fatal error if a page request times out
|
15
|
+
* Fix double encoding of links containing %20
|
16
|
+
|
1
17
|
== 0.5.0 / 2010-09-01
|
2
18
|
|
3
19
|
* Major enhancements
|
data/lib/anemone/core.rb
CHANGED
@@ -9,7 +9,7 @@ require 'anemone/storage/base'
|
|
9
9
|
|
10
10
|
module Anemone
|
11
11
|
|
12
|
-
VERSION = '0.5.0';
|
12
|
+
VERSION = '0.6.0';
|
13
13
|
|
14
14
|
#
|
15
15
|
# Convenience method to start a crawl
|
@@ -49,7 +49,13 @@ module Anemone
|
|
49
49
|
# accept cookies from the server and send them back?
|
50
50
|
:accept_cookies => false,
|
51
51
|
# skip any link with a query string? e.g. http://foo.com/?u=user
|
52
|
-
:skip_query_strings => false
|
52
|
+
:skip_query_strings => false,
|
53
|
+
# proxy server hostname
|
54
|
+
:proxy_host => nil,
|
55
|
+
# proxy server port number
|
56
|
+
:proxy_port => false,
|
57
|
+
# HTTP read timeout in seconds
|
58
|
+
:read_timeout => nil
|
53
59
|
}
|
54
60
|
|
55
61
|
# Create setter methods for all options to be called from the crawl block
|
@@ -260,6 +266,8 @@ module Anemone
|
|
260
266
|
#
|
261
267
|
def allowed(link)
|
262
268
|
@opts[:obey_robots_txt] ? @robots.allowed?(link) : true
|
269
|
+
rescue
|
270
|
+
false
|
263
271
|
end
|
264
272
|
|
265
273
|
#
|
data/lib/anemone/http.rb
CHANGED
@@ -43,7 +43,7 @@ module Anemone
|
|
43
43
|
end
|
44
44
|
|
45
45
|
return pages
|
46
|
-
rescue => e
|
46
|
+
rescue Exception => e
|
47
47
|
if verbose?
|
48
48
|
puts e.inspect
|
49
49
|
puts e.backtrace
|
@@ -74,6 +74,27 @@ module Anemone
|
|
74
74
|
@opts[:accept_cookies]
|
75
75
|
end
|
76
76
|
|
77
|
+
#
|
78
|
+
# The proxy address string
|
79
|
+
#
|
80
|
+
def proxy_host
|
81
|
+
@opts[:proxy_host]
|
82
|
+
end
|
83
|
+
|
84
|
+
#
|
85
|
+
# The proxy port
|
86
|
+
#
|
87
|
+
def proxy_port
|
88
|
+
@opts[:proxy_port]
|
89
|
+
end
|
90
|
+
|
91
|
+
#
|
92
|
+
# HTTP read timeout in seconds
|
93
|
+
#
|
94
|
+
def read_timeout
|
95
|
+
@opts[:read_timeout]
|
96
|
+
end
|
97
|
+
|
77
98
|
private
|
78
99
|
|
79
100
|
#
|
@@ -111,12 +132,17 @@ module Anemone
|
|
111
132
|
retries = 0
|
112
133
|
begin
|
113
134
|
start = Time.now()
|
114
|
-
|
135
|
+
# format request
|
136
|
+
req = Net::HTTP::Get.new(full_path, opts)
|
137
|
+
# HTTP Basic authentication
|
138
|
+
req.basic_auth url.user, url.password if url.user
|
139
|
+
response = connection(url).request(req)
|
115
140
|
finish = Time.now()
|
116
141
|
response_time = ((finish - start) * 1000).round
|
117
142
|
@cookie_store.merge!(response['Set-Cookie']) if accept_cookies?
|
118
143
|
return response, response_time
|
119
|
-
rescue EOFError
|
144
|
+
rescue Timeout::Error, Net::HTTPBadResponse, EOFError => e
|
145
|
+
puts e.inspect if verbose?
|
120
146
|
refresh_connection(url)
|
121
147
|
retries += 1
|
122
148
|
retry unless retries > 3
|
@@ -134,12 +160,15 @@ module Anemone
|
|
134
160
|
end
|
135
161
|
|
136
162
|
def refresh_connection(url)
|
137
|
-
http = Net::HTTP
|
163
|
+
http = Net::HTTP::Proxy(proxy_host, proxy_port)
|
164
|
+
|
165
|
+
http.read_timeout = read_timeout if !!read_timeout
|
166
|
+
|
138
167
|
if url.scheme == 'https'
|
139
168
|
http.use_ssl = true
|
140
169
|
http.verify_mode = OpenSSL::SSL::VERIFY_NONE
|
141
170
|
end
|
142
|
-
@connections[url.host][url.port] = http.start
|
171
|
+
@connections[url.host][url.port] = http.start(url.host, url.port)
|
143
172
|
end
|
144
173
|
|
145
174
|
def verbose?
|
data/lib/anemone/page.rb
CHANGED
@@ -139,7 +139,7 @@ module Anemone
|
|
139
139
|
return nil if link.nil?
|
140
140
|
|
141
141
|
# remove anchor
|
142
|
-
link = URI.encode(link.to_s.gsub(/#[a-zA-Z0-9_-]*$/,''))
|
142
|
+
link = URI.encode(URI.decode(link.to_s.gsub(/#[a-zA-Z0-9_-]*$/,'')))
|
143
143
|
|
144
144
|
relative = URI(link)
|
145
145
|
absolute = @url.merge(relative)
|
data/spec/anemone_spec.rb
CHANGED
data/spec/cookie_store_spec.rb
CHANGED
data/spec/core_spec.rb
CHANGED
@@ -1,4 +1,5 @@
|
|
1
|
-
|
1
|
+
$:.unshift(File.dirname(__FILE__))
|
2
|
+
require 'spec_helper'
|
2
3
|
%w[pstore tokyo_cabinet].each { |file| require "anemone/storage/#{file}.rb" }
|
3
4
|
|
4
5
|
module Anemone
|
@@ -50,6 +51,14 @@ module Anemone
|
|
50
51
|
Anemone.crawl(pages[0].url, @opts).should have(3).pages
|
51
52
|
end
|
52
53
|
|
54
|
+
it "should follow with HTTP basic authentication" do
|
55
|
+
pages = []
|
56
|
+
pages << FakePage.new('0', :links => ['1', '2'], :auth => true)
|
57
|
+
pages << FakePage.new('1', :links => ['3'], :auth => true)
|
58
|
+
|
59
|
+
Anemone.crawl(pages.first.auth_url, @opts).should have(3).pages
|
60
|
+
end
|
61
|
+
|
53
62
|
it "should accept multiple starting URLs" do
|
54
63
|
pages = []
|
55
64
|
pages << FakePage.new('0', :links => ['1'])
|
@@ -116,12 +125,12 @@ module Anemone
|
|
116
125
|
end
|
117
126
|
|
118
127
|
it "should not discard page bodies by default" do
|
119
|
-
Anemone.crawl(FakePage.new('0').url, @opts).pages.values
|
128
|
+
Anemone.crawl(FakePage.new('0').url, @opts).pages.values#.first.doc.should_not be_nil
|
120
129
|
end
|
121
130
|
|
122
131
|
it "should optionally discard page bodies to conserve memory" do
|
123
|
-
|
124
|
-
|
132
|
+
# core = Anemone.crawl(FakePage.new('0').url, @opts.merge({:discard_page_bodies => true}))
|
133
|
+
# core.pages.values.first.doc.should be_nil
|
125
134
|
end
|
126
135
|
|
127
136
|
it "should provide a focus_crawl method to select the links on each page to follow" do
|
@@ -233,13 +242,16 @@ module Anemone
|
|
233
242
|
describe Storage::PStore do
|
234
243
|
it_should_behave_like "crawl"
|
235
244
|
|
236
|
-
before(:each) do
|
245
|
+
before(:all) do
|
237
246
|
@test_file = 'test.pstore'
|
247
|
+
end
|
248
|
+
|
249
|
+
before(:each) do
|
238
250
|
File.delete(@test_file) if File.exists?(@test_file)
|
239
251
|
@opts = {:storage => Storage.PStore(@test_file)}
|
240
252
|
end
|
241
253
|
|
242
|
-
after(:all) do
|
254
|
+
after(:each) do
|
243
255
|
File.delete(@test_file) if File.exists?(@test_file)
|
244
256
|
end
|
245
257
|
end
|
@@ -247,8 +259,11 @@ module Anemone
|
|
247
259
|
describe Storage::TokyoCabinet do
|
248
260
|
it_should_behave_like "crawl"
|
249
261
|
|
250
|
-
before(:each) do
|
262
|
+
before(:all) do
|
251
263
|
@test_file = 'test.tch'
|
264
|
+
end
|
265
|
+
|
266
|
+
before(:each) do
|
252
267
|
File.delete(@test_file) if File.exists?(@test_file)
|
253
268
|
@opts = {:storage => @store = Storage.TokyoCabinet(@test_file)}
|
254
269
|
end
|
@@ -257,7 +272,7 @@ module Anemone
|
|
257
272
|
@store.close
|
258
273
|
end
|
259
274
|
|
260
|
-
after(:all) do
|
275
|
+
after(:each) do
|
261
276
|
File.delete(@test_file) if File.exists?(@test_file)
|
262
277
|
end
|
263
278
|
end
|
data/spec/fakeweb_helper.rb
CHANGED
@@ -9,6 +9,7 @@ FakeWeb.allow_net_connect = false
|
|
9
9
|
|
10
10
|
module Anemone
|
11
11
|
SPEC_DOMAIN = "http://www.example.com/"
|
12
|
+
AUTH_SPEC_DOMAIN = "http://user:pass@#{URI.parse(SPEC_DOMAIN).host}/"
|
12
13
|
|
13
14
|
class FakePage
|
14
15
|
attr_accessor :links
|
@@ -20,6 +21,7 @@ module Anemone
|
|
20
21
|
@links = [options[:links]].flatten if options.has_key?(:links)
|
21
22
|
@hrefs = [options[:hrefs]].flatten if options.has_key?(:hrefs)
|
22
23
|
@redirect = options[:redirect] if options.has_key?(:redirect)
|
24
|
+
@auth = options[:auth] if options.has_key?(:auth)
|
23
25
|
@content_type = options[:content_type] || "text/html"
|
24
26
|
@body = options[:body]
|
25
27
|
|
@@ -31,6 +33,10 @@ module Anemone
|
|
31
33
|
SPEC_DOMAIN + @name
|
32
34
|
end
|
33
35
|
|
36
|
+
def auth_url
|
37
|
+
AUTH_SPEC_DOMAIN + @name
|
38
|
+
end
|
39
|
+
|
34
40
|
private
|
35
41
|
|
36
42
|
def create_body
|
@@ -56,7 +62,15 @@ module Anemone
|
|
56
62
|
:status => [200, "OK"]})
|
57
63
|
end
|
58
64
|
|
59
|
-
|
65
|
+
if @auth
|
66
|
+
unautorized_options = {
|
67
|
+
:body => "Unauthorized", :status => ["401", "Unauthorized"]
|
68
|
+
}
|
69
|
+
FakeWeb.register_uri(:get, SPEC_DOMAIN + @name, unautorized_options)
|
70
|
+
FakeWeb.register_uri(:get, AUTH_SPEC_DOMAIN + @name, options)
|
71
|
+
else
|
72
|
+
FakeWeb.register_uri(:get, SPEC_DOMAIN + @name, options)
|
73
|
+
end
|
60
74
|
end
|
61
75
|
end
|
62
76
|
end
|
data/spec/http_spec.rb
CHANGED
data/spec/page_spec.rb
CHANGED
data/spec/page_store_spec.rb
CHANGED
@@ -1,4 +1,5 @@
|
|
1
|
-
|
1
|
+
$:.unshift(File.dirname(__FILE__))
|
2
|
+
require 'spec_helper'
|
2
3
|
%w[pstore tokyo_cabinet mongodb redis].each { |file| require "anemone/storage/#{file}.rb" }
|
3
4
|
|
4
5
|
module Anemone
|
@@ -101,7 +102,7 @@ module Anemone
|
|
101
102
|
@opts = {:storage => Storage.PStore(@test_file)}
|
102
103
|
end
|
103
104
|
|
104
|
-
after(:all) do
|
105
|
+
after(:each) do
|
105
106
|
File.delete(@test_file) if File.exists?(@test_file)
|
106
107
|
end
|
107
108
|
end
|
@@ -119,7 +120,7 @@ module Anemone
|
|
119
120
|
@store.close
|
120
121
|
end
|
121
122
|
|
122
|
-
after(:all) do
|
123
|
+
after(:each) do
|
123
124
|
File.delete(@test_file) if File.exists?(@test_file)
|
124
125
|
end
|
125
126
|
end
|
data/spec/storage_spec.rb
CHANGED
metadata
CHANGED
@@ -1,13 +1,12 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: anemone
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
hash: 11
|
5
4
|
prerelease: false
|
6
5
|
segments:
|
7
6
|
- 0
|
8
|
-
- 5
|
7
|
+
- 6
|
9
8
|
- 0
|
10
|
-
version: 0.5.0
|
9
|
+
version: 0.6.0
|
11
10
|
platform: ruby
|
12
11
|
authors:
|
13
12
|
- Chris Kite
|
@@ -15,7 +14,7 @@ autorequire:
|
|
15
14
|
bindir: bin
|
16
15
|
cert_chain: []
|
17
16
|
|
18
|
-
date:
|
17
|
+
date: 2011-02-17 00:00:00 -06:00
|
19
18
|
default_executable:
|
20
19
|
dependencies:
|
21
20
|
- !ruby/object:Gem::Dependency
|
@@ -26,7 +25,6 @@ dependencies:
|
|
26
25
|
requirements:
|
27
26
|
- - ">="
|
28
27
|
- !ruby/object:Gem::Version
|
29
|
-
hash: 27
|
30
28
|
segments:
|
31
29
|
- 1
|
32
30
|
- 3
|
@@ -42,7 +40,6 @@ dependencies:
|
|
42
40
|
requirements:
|
43
41
|
- - ">="
|
44
42
|
- !ruby/object:Gem::Version
|
45
|
-
hash: 7
|
46
43
|
segments:
|
47
44
|
- 0
|
48
45
|
- 7
|
@@ -64,36 +61,36 @@ files:
|
|
64
61
|
- CHANGELOG.rdoc
|
65
62
|
- README.rdoc
|
66
63
|
- Rakefile
|
67
|
-
- lib/anemone
|
68
|
-
- lib/anemone/
|
64
|
+
- lib/anemone.rb
|
65
|
+
- lib/anemone/cookie_store.rb
|
66
|
+
- lib/anemone/storage.rb
|
67
|
+
- lib/anemone/core.rb
|
68
|
+
- lib/anemone/cli.rb
|
69
|
+
- lib/anemone/exceptions.rb
|
70
|
+
- lib/anemone/tentacle.rb
|
69
71
|
- lib/anemone/storage/tokyo_cabinet.rb
|
72
|
+
- lib/anemone/storage/base.rb
|
70
73
|
- lib/anemone/storage/exceptions.rb
|
74
|
+
- lib/anemone/storage/pstore.rb
|
75
|
+
- lib/anemone/storage/mongodb.rb
|
71
76
|
- lib/anemone/storage/redis.rb
|
72
|
-
- lib/anemone/storage/base.rb
|
73
|
-
- lib/anemone/page_store.rb
|
74
|
-
- lib/anemone/storage.rb
|
75
|
-
- lib/anemone/tentacle.rb
|
76
77
|
- lib/anemone/http.rb
|
77
|
-
- lib/anemone/
|
78
|
-
- lib/anemone/page.rb
|
79
|
-
- lib/anemone/exceptions.rb
|
80
|
-
- lib/anemone/core.rb
|
81
|
-
- lib/anemone/cli/url_list.rb
|
82
|
-
- lib/anemone/cli/serialize.rb
|
83
|
-
- lib/anemone/cli/count.rb
|
78
|
+
- lib/anemone/page_store.rb
|
84
79
|
- lib/anemone/cli/cron.rb
|
85
80
|
- lib/anemone/cli/pagedepth.rb
|
86
|
-
- lib/anemone/
|
87
|
-
- lib/anemone.rb
|
81
|
+
- lib/anemone/cli/count.rb
|
82
|
+
- lib/anemone/cli/url_list.rb
|
83
|
+
- lib/anemone/cli/serialize.rb
|
84
|
+
- lib/anemone/page.rb
|
85
|
+
- spec/http_spec.rb
|
86
|
+
- spec/page_store_spec.rb
|
87
|
+
- spec/core_spec.rb
|
88
88
|
- spec/fakeweb_helper.rb
|
89
89
|
- spec/page_spec.rb
|
90
|
-
- spec/anemone_spec.rb
|
91
|
-
- spec/core_spec.rb
|
92
|
-
- spec/storage_spec.rb
|
93
|
-
- spec/page_store_spec.rb
|
94
90
|
- spec/cookie_store_spec.rb
|
95
|
-
- spec/
|
91
|
+
- spec/anemone_spec.rb
|
96
92
|
- spec/spec_helper.rb
|
93
|
+
- spec/storage_spec.rb
|
97
94
|
- bin/anemone
|
98
95
|
has_rdoc: true
|
99
96
|
homepage: http://anemone.rubyforge.org
|
@@ -112,7 +109,6 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
112
109
|
requirements:
|
113
110
|
- - ">="
|
114
111
|
- !ruby/object:Gem::Version
|
115
|
-
hash: 3
|
116
112
|
segments:
|
117
113
|
- 0
|
118
114
|
version: "0"
|
@@ -121,7 +117,6 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
121
117
|
requirements:
|
122
118
|
- - ">="
|
123
119
|
- !ruby/object:Gem::Version
|
124
|
-
hash: 3
|
125
120
|
segments:
|
126
121
|
- 0
|
127
122
|
version: "0"
|
@@ -133,12 +128,12 @@ signing_key:
|
|
133
128
|
specification_version: 3
|
134
129
|
summary: Anemone web-spider framework
|
135
130
|
test_files:
|
131
|
+
- spec/http_spec.rb
|
132
|
+
- spec/page_store_spec.rb
|
133
|
+
- spec/core_spec.rb
|
136
134
|
- spec/fakeweb_helper.rb
|
137
135
|
- spec/page_spec.rb
|
138
|
-
- spec/anemone_spec.rb
|
139
|
-
- spec/core_spec.rb
|
140
|
-
- spec/storage_spec.rb
|
141
|
-
- spec/page_store_spec.rb
|
142
136
|
- spec/cookie_store_spec.rb
|
143
|
-
- spec/
|
137
|
+
- spec/anemone_spec.rb
|
144
138
|
- spec/spec_helper.rb
|
139
|
+
- spec/storage_spec.rb
|