anemone 0.3.2 → 0.4.0
Sign up to get free protection for your applications and to get access to all the features.
- data/CHANGELOG.rdoc +6 -0
- data/lib/anemone/cookie_store.rb +35 -0
- data/lib/anemone/core.rb +27 -11
- data/lib/anemone/http.rb +35 -17
- data/lib/anemone/page.rb +8 -0
- data/spec/cookie_store_spec.rb +27 -0
- data/spec/core_spec.rb +18 -0
- data/spec/http_spec.rb +3 -2
- data/spec/page_spec.rb +5 -0
- metadata +4 -2
data/CHANGELOG.rdoc
CHANGED
data/lib/anemone/cookie_store.rb
ADDED
@@ -0,0 +1,35 @@
|
|
1
|
+
require 'delegate'
|
2
|
+
require 'webrick/cookie'
|
3
|
+
|
4
|
+
class WEBrick::Cookie
|
5
|
+
def expired?
|
6
|
+
!!expires && expires < Time.now
|
7
|
+
end
|
8
|
+
end
|
9
|
+
|
10
|
+
module Anemone
|
11
|
+
class CookieStore < DelegateClass(Hash)
|
12
|
+
|
13
|
+
def initialize(cookies = nil)
|
14
|
+
@cookies = {}
|
15
|
+
cookies.each { |name, value| @cookies[name] = WEBrick::Cookie.new(name, value) } if cookies
|
16
|
+
super(@cookies)
|
17
|
+
end
|
18
|
+
|
19
|
+
def merge!(set_cookie_str)
|
20
|
+
begin
|
21
|
+
cookie_hash = WEBrick::Cookie.parse_set_cookies(set_cookie_str).inject({}) do |hash, cookie|
|
22
|
+
hash[cookie.name] = cookie if !!cookie
|
23
|
+
hash
|
24
|
+
end
|
25
|
+
@cookies.merge! cookie_hash
|
26
|
+
rescue
|
27
|
+
end
|
28
|
+
end
|
29
|
+
|
30
|
+
def to_s
|
31
|
+
@cookies.values.reject { |cookie| cookie.expired? }.map { |cookie| "#{cookie.name}=#{cookie.value}" }.join(';')
|
32
|
+
end
|
33
|
+
|
34
|
+
end
|
35
|
+
end
|
data/lib/anemone/core.rb
CHANGED
@@ -7,7 +7,7 @@ require 'anemone/storage'
|
|
7
7
|
|
8
8
|
module Anemone
|
9
9
|
|
10
|
-
VERSION = '0.3.2';
|
10
|
+
VERSION = '0.4.0';
|
11
11
|
|
12
12
|
#
|
13
13
|
# Convenience method to start a crawl
|
@@ -41,13 +41,17 @@ module Anemone
|
|
41
41
|
# number of times HTTP redirects will be followed
|
42
42
|
:redirect_limit => 5,
|
43
43
|
# storage engine defaults to Hash in +process_options+ if none specified
|
44
|
-
:storage => nil
|
44
|
+
:storage => nil,
|
45
|
+
# Hash of cookie name => value to send with HTTP requests
|
46
|
+
:cookies => nil,
|
47
|
+
# accept cookies from the server and send them back?
|
48
|
+
:accept_cookies => false
|
45
49
|
}
|
46
50
|
|
47
51
|
# Create setter methods for all options to be called from the crawl block
|
48
52
|
DEFAULT_OPTS.keys.each do |key|
|
49
|
-
define_method "#{key}=" do
|
50
|
-
@opts[key.to_sym] =
|
53
|
+
define_method "#{key}=" do |value|
|
54
|
+
@opts[key.to_sym] = value
|
51
55
|
end
|
52
56
|
end
|
53
57
|
|
@@ -173,7 +177,7 @@ module Anemone
|
|
173
177
|
end
|
174
178
|
end
|
175
179
|
|
176
|
-
@tentacles.each { |
|
180
|
+
@tentacles.each { |thread| thread.join }
|
177
181
|
do_after_crawl_blocks
|
178
182
|
self
|
179
183
|
end
|
@@ -185,25 +189,37 @@ module Anemone
|
|
185
189
|
@opts[:threads] = 1 if @opts[:delay] > 0
|
186
190
|
@pages = PageStore.new(@opts[:storage] || Anemone::Storage.Hash)
|
187
191
|
@robots = Robots.new(@opts[:user_agent]) if @opts[:obey_robots_txt]
|
192
|
+
|
193
|
+
freeze_options
|
194
|
+
end
|
195
|
+
|
196
|
+
#
|
197
|
+
# Freeze the opts Hash so that no options can be modified
|
198
|
+
# once the crawl begins
|
199
|
+
#
|
200
|
+
def freeze_options
|
201
|
+
@opts.freeze
|
202
|
+
@opts.each_key { |key| @opts[key].freeze }
|
203
|
+
@opts[:cookies].each_key { |key| @opts[:cookies][key].freeze } rescue nil
|
188
204
|
end
|
189
205
|
|
190
206
|
#
|
191
207
|
# Execute the after_crawl blocks
|
192
208
|
#
|
193
209
|
def do_after_crawl_blocks
|
194
|
-
@after_crawl_blocks.each { |
|
210
|
+
@after_crawl_blocks.each { |block| block.call(@pages) }
|
195
211
|
end
|
196
212
|
|
197
213
|
#
|
198
214
|
# Execute the on_every_page blocks for *page*
|
199
215
|
#
|
200
216
|
def do_page_blocks(page)
|
201
|
-
@on_every_page_blocks.each do |
|
202
|
-
|
217
|
+
@on_every_page_blocks.each do |block|
|
218
|
+
block.call(page)
|
203
219
|
end
|
204
220
|
|
205
|
-
@on_pages_like_blocks.each do |pattern,
|
206
|
-
|
221
|
+
@on_pages_like_blocks.each do |pattern, blocks|
|
222
|
+
blocks.each { |block| block.call(page) } if page.url.to_s =~ pattern
|
207
223
|
end
|
208
224
|
end
|
209
225
|
|
@@ -241,7 +257,7 @@ module Anemone
|
|
241
257
|
# its URL matches a skip_link pattern.
|
242
258
|
#
|
243
259
|
def skip_link?(link)
|
244
|
-
@skip_link_patterns.any? { |
|
260
|
+
@skip_link_patterns.any? { |pattern| link.path =~ pattern }
|
245
261
|
end
|
246
262
|
|
247
263
|
end
|
data/lib/anemone/http.rb
CHANGED
@@ -1,14 +1,19 @@
|
|
1
1
|
require 'net/https'
|
2
2
|
require 'anemone/page'
|
3
|
+
require 'anemone/cookie_store'
|
3
4
|
|
4
5
|
module Anemone
|
5
6
|
class HTTP
|
6
7
|
# Maximum number of redirects to follow on each get_response
|
7
8
|
REDIRECT_LIMIT = 5
|
8
9
|
|
10
|
+
# CookieStore for this HTTP client
|
11
|
+
attr_reader :cookie_store
|
12
|
+
|
9
13
|
def initialize(opts = {})
|
10
14
|
@connections = {}
|
11
15
|
@opts = opts
|
16
|
+
@cookie_store = CookieStore.new(@opts[:cookies])
|
12
17
|
end
|
13
18
|
|
14
19
|
#
|
@@ -47,6 +52,28 @@ module Anemone
|
|
47
52
|
end
|
48
53
|
end
|
49
54
|
|
55
|
+
#
|
56
|
+
# The maximum number of redirects to follow
|
57
|
+
#
|
58
|
+
def redirect_limit
|
59
|
+
@opts[:redirect_limit] || REDIRECT_LIMIT
|
60
|
+
end
|
61
|
+
|
62
|
+
#
|
63
|
+
# The user-agent string which will be sent with each request,
|
64
|
+
# or nil if no such option is set
|
65
|
+
#
|
66
|
+
def user_agent
|
67
|
+
@opts[:user_agent]
|
68
|
+
end
|
69
|
+
|
70
|
+
#
|
71
|
+
# Does this HTTP client accept cookies from the server?
|
72
|
+
#
|
73
|
+
def accept_cookies?
|
74
|
+
@opts[:accept_cookies]
|
75
|
+
end
|
76
|
+
|
50
77
|
private
|
51
78
|
|
52
79
|
#
|
@@ -55,22 +82,19 @@ module Anemone
|
|
55
82
|
# for each response.
|
56
83
|
#
|
57
84
|
def get(url, referer = nil)
|
58
|
-
response, response_time = get_response(url, referer)
|
59
|
-
code = Integer(response.code)
|
60
|
-
loc = url
|
61
|
-
redirect_to = response.is_a?(Net::HTTPRedirection) ? URI(response['location']) : nil
|
62
|
-
yield response, code, loc, redirect_to, response_time
|
63
|
-
|
64
85
|
limit = redirect_limit
|
65
|
-
|
66
|
-
|
86
|
+
loc = url
|
87
|
+
begin
|
88
|
+
# if redirected to a relative url, merge it with the host of the original
|
89
|
+
# request url
|
67
90
|
loc = url.merge(loc) if loc.relative?
|
91
|
+
|
68
92
|
response, response_time = get_response(loc, referer)
|
69
93
|
code = Integer(response.code)
|
70
94
|
redirect_to = response.is_a?(Net::HTTPRedirection) ? URI(response['location']) : nil
|
71
95
|
yield response, code, loc, redirect_to, response_time
|
72
96
|
limit -= 1
|
73
|
-
end
|
97
|
+
end while (loc = redirect_to) && allowed?(redirect_to, url) && limit > 0
|
74
98
|
end
|
75
99
|
|
76
100
|
#
|
@@ -82,6 +106,7 @@ module Anemone
|
|
82
106
|
opts = {}
|
83
107
|
opts['User-Agent'] = user_agent if user_agent
|
84
108
|
opts['Referer'] = referer.to_s if referer
|
109
|
+
opts['Cookie'] = @cookie_store.to_s unless @cookie_store.empty? || (!accept_cookies? && @opts[:cookies].nil?)
|
85
110
|
|
86
111
|
retries = 0
|
87
112
|
begin
|
@@ -89,6 +114,7 @@ module Anemone
|
|
89
114
|
response = connection(url).get(full_path, opts)
|
90
115
|
finish = Time.now()
|
91
116
|
response_time = ((finish - start) * 1000).round
|
117
|
+
@cookie_store.merge!(response['Set-Cookie']) if accept_cookies?
|
92
118
|
return response, response_time
|
93
119
|
rescue EOFError
|
94
120
|
refresh_connection(url)
|
@@ -116,14 +142,6 @@ module Anemone
|
|
116
142
|
@connections[url.host][url.port] = http.start
|
117
143
|
end
|
118
144
|
|
119
|
-
def redirect_limit
|
120
|
-
@opts[:redirect_limit] || REDIRECT_LIMIT
|
121
|
-
end
|
122
|
-
|
123
|
-
def user_agent
|
124
|
-
@opts[:user_agent]
|
125
|
-
end
|
126
|
-
|
127
145
|
def verbose?
|
128
146
|
@opts[:verbose]
|
129
147
|
end
|
data/lib/anemone/page.rb
CHANGED
@@ -1,5 +1,6 @@
|
|
1
1
|
require 'nokogiri'
|
2
2
|
require 'ostruct'
|
3
|
+
require 'webrick/cookie'
|
3
4
|
|
4
5
|
module Anemone
|
5
6
|
class Page
|
@@ -92,6 +93,13 @@ module Anemone
|
|
92
93
|
@fetched
|
93
94
|
end
|
94
95
|
|
96
|
+
#
|
97
|
+
# Array of cookies received with this page as WEBrick::Cookie objects.
|
98
|
+
#
|
99
|
+
def cookies
|
100
|
+
WEBrick::Cookie.parse_set_cookies(@headers['Set-Cookie']) rescue []
|
101
|
+
end
|
102
|
+
|
95
103
|
#
|
96
104
|
# The content-type returned by the HTTP request for this page
|
97
105
|
#
|
data/spec/cookie_store_spec.rb
ADDED
@@ -0,0 +1,27 @@
|
|
1
|
+
require File.dirname(__FILE__) + '/spec_helper'
|
2
|
+
|
3
|
+
module Anemone
|
4
|
+
describe CookieStore do
|
5
|
+
|
6
|
+
it "should start out empty if no cookies are specified" do
|
7
|
+
CookieStore.new.empty?.should be true
|
8
|
+
end
|
9
|
+
|
10
|
+
it "should accept a Hash of cookies in the constructor" do
|
11
|
+
CookieStore.new({'test' => 'cookie'})['test'].value.should == 'cookie'
|
12
|
+
end
|
13
|
+
|
14
|
+
it "should be able to merge an HTTP cookie string" do
|
15
|
+
cs = CookieStore.new({'a' => 'a', 'b' => 'b'})
|
16
|
+
cs.merge! "a=A; path=/, c=C; path=/"
|
17
|
+
cs['a'].value.should == 'A'
|
18
|
+
cs['b'].value.should == 'b'
|
19
|
+
cs['c'].value.should == 'C'
|
20
|
+
end
|
21
|
+
|
22
|
+
it "should have a to_s method to turn the cookies into a string for the HTTP Cookie header" do
|
23
|
+
CookieStore.new({'a' => 'a', 'b' => 'b'}).to_s.should == 'a=a;b=b'
|
24
|
+
end
|
25
|
+
|
26
|
+
end
|
27
|
+
end
|
data/spec/core_spec.rb
CHANGED
@@ -154,6 +154,24 @@ module Anemone
|
|
154
154
|
urls.should_not include(pages[1].url)
|
155
155
|
end
|
156
156
|
|
157
|
+
it "should be able to set cookies to send with HTTP requests" do
|
158
|
+
cookies = {:a => '1', :b => '2'}
|
159
|
+
core = Anemone.crawl(FakePage.new('0').url) do |anemone|
|
160
|
+
anemone.cookies = cookies
|
161
|
+
end
|
162
|
+
core.opts[:cookies].should == cookies
|
163
|
+
end
|
164
|
+
|
165
|
+
it "should freeze the options once the crawl begins" do
|
166
|
+
core = Anemone.crawl(FakePage.new('0').url) do |anemone|
|
167
|
+
anemone.threads = 4
|
168
|
+
anemone.on_every_page do
|
169
|
+
lambda {anemone.threads = 2}.should raise_error
|
170
|
+
end
|
171
|
+
end
|
172
|
+
core.opts[:threads].should == 4
|
173
|
+
end
|
174
|
+
|
157
175
|
describe "many pages" do
|
158
176
|
before(:each) do
|
159
177
|
@pages, size = [], 5
|
data/spec/http_spec.rb
CHANGED
data/spec/page_spec.rb
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: anemone
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.3.2
|
4
|
+
version: 0.4.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Chris Kite
|
@@ -9,7 +9,7 @@ autorequire:
|
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
11
|
|
12
|
-
date: 2010-
|
12
|
+
date: 2010-04-08 00:00:00 -05:00
|
13
13
|
default_executable:
|
14
14
|
dependencies:
|
15
15
|
- !ruby/object:Gem::Dependency
|
@@ -46,6 +46,7 @@ files:
|
|
46
46
|
- README.rdoc
|
47
47
|
- bin/anemone
|
48
48
|
- lib/anemone.rb
|
49
|
+
- lib/anemone/cookie_store.rb
|
49
50
|
- lib/anemone/core.rb
|
50
51
|
- lib/anemone/http.rb
|
51
52
|
- lib/anemone/page.rb
|
@@ -93,6 +94,7 @@ specification_version: 3
|
|
93
94
|
summary: Anemone web-spider framework
|
94
95
|
test_files:
|
95
96
|
- spec/anemone_spec.rb
|
97
|
+
- spec/cookie_store_spec.rb
|
96
98
|
- spec/core_spec.rb
|
97
99
|
- spec/page_spec.rb
|
98
100
|
- spec/page_store_spec.rb
|