anemone 0.3.2 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGELOG.rdoc +6 -0
- data/lib/anemone/cookie_store.rb +35 -0
- data/lib/anemone/core.rb +27 -11
- data/lib/anemone/http.rb +35 -17
- data/lib/anemone/page.rb +8 -0
- data/spec/cookie_store_spec.rb +27 -0
- data/spec/core_spec.rb +18 -0
- data/spec/http_spec.rb +3 -2
- data/spec/page_spec.rb +5 -0
- metadata +4 -2
data/CHANGELOG.rdoc
CHANGED
data/lib/anemone/cookie_store.rb
ADDED
@@ -0,0 +1,35 @@
|
|
1
|
+
require 'delegate'
|
2
|
+
require 'webrick/cookie'
|
3
|
+
|
4
|
+
class WEBrick::Cookie
|
5
|
+
def expired?
|
6
|
+
!!expires && expires < Time.now
|
7
|
+
end
|
8
|
+
end
|
9
|
+
|
10
|
+
module Anemone
|
11
|
+
class CookieStore < DelegateClass(Hash)
|
12
|
+
|
13
|
+
def initialize(cookies = nil)
|
14
|
+
@cookies = {}
|
15
|
+
cookies.each { |name, value| @cookies[name] = WEBrick::Cookie.new(name, value) } if cookies
|
16
|
+
super(@cookies)
|
17
|
+
end
|
18
|
+
|
19
|
+
def merge!(set_cookie_str)
|
20
|
+
begin
|
21
|
+
cookie_hash = WEBrick::Cookie.parse_set_cookies(set_cookie_str).inject({}) do |hash, cookie|
|
22
|
+
hash[cookie.name] = cookie if !!cookie
|
23
|
+
hash
|
24
|
+
end
|
25
|
+
@cookies.merge! cookie_hash
|
26
|
+
rescue
|
27
|
+
end
|
28
|
+
end
|
29
|
+
|
30
|
+
def to_s
|
31
|
+
@cookies.values.reject { |cookie| cookie.expired? }.map { |cookie| "#{cookie.name}=#{cookie.value}" }.join(';')
|
32
|
+
end
|
33
|
+
|
34
|
+
end
|
35
|
+
end
|
data/lib/anemone/core.rb
CHANGED
@@ -7,7 +7,7 @@ require 'anemone/storage'
|
|
7
7
|
|
8
8
|
module Anemone
|
9
9
|
|
10
|
-
VERSION = '0.3.2';
|
10
|
+
VERSION = '0.4.0';
|
11
11
|
|
12
12
|
#
|
13
13
|
# Convenience method to start a crawl
|
@@ -41,13 +41,17 @@ module Anemone
|
|
41
41
|
# number of times HTTP redirects will be followed
|
42
42
|
:redirect_limit => 5,
|
43
43
|
# storage engine defaults to Hash in +process_options+ if none specified
|
44
|
-
:storage => nil
|
44
|
+
:storage => nil,
|
45
|
+
# Hash of cookie name => value to send with HTTP requests
|
46
|
+
:cookies => nil,
|
47
|
+
# accept cookies from the server and send them back?
|
48
|
+
:accept_cookies => false
|
45
49
|
}
|
46
50
|
|
47
51
|
# Create setter methods for all options to be called from the crawl block
|
48
52
|
DEFAULT_OPTS.keys.each do |key|
|
49
|
-
define_method "#{key}=" do
|
50
|
-
@opts[key.to_sym] =
|
53
|
+
define_method "#{key}=" do |value|
|
54
|
+
@opts[key.to_sym] = value
|
51
55
|
end
|
52
56
|
end
|
53
57
|
|
@@ -173,7 +177,7 @@ module Anemone
|
|
173
177
|
end
|
174
178
|
end
|
175
179
|
|
176
|
-
@tentacles.each { |
|
180
|
+
@tentacles.each { |thread| thread.join }
|
177
181
|
do_after_crawl_blocks
|
178
182
|
self
|
179
183
|
end
|
@@ -185,25 +189,37 @@ module Anemone
|
|
185
189
|
@opts[:threads] = 1 if @opts[:delay] > 0
|
186
190
|
@pages = PageStore.new(@opts[:storage] || Anemone::Storage.Hash)
|
187
191
|
@robots = Robots.new(@opts[:user_agent]) if @opts[:obey_robots_txt]
|
192
|
+
|
193
|
+
freeze_options
|
194
|
+
end
|
195
|
+
|
196
|
+
#
|
197
|
+
# Freeze the opts Hash so that no options can be modified
|
198
|
+
# once the crawl begins
|
199
|
+
#
|
200
|
+
def freeze_options
|
201
|
+
@opts.freeze
|
202
|
+
@opts.each_key { |key| @opts[key].freeze }
|
203
|
+
@opts[:cookies].each_key { |key| @opts[:cookies][key].freeze } rescue nil
|
188
204
|
end
|
189
205
|
|
190
206
|
#
|
191
207
|
# Execute the after_crawl blocks
|
192
208
|
#
|
193
209
|
def do_after_crawl_blocks
|
194
|
-
@after_crawl_blocks.each { |
|
210
|
+
@after_crawl_blocks.each { |block| block.call(@pages) }
|
195
211
|
end
|
196
212
|
|
197
213
|
#
|
198
214
|
# Execute the on_every_page blocks for *page*
|
199
215
|
#
|
200
216
|
def do_page_blocks(page)
|
201
|
-
@on_every_page_blocks.each do |
|
202
|
-
|
217
|
+
@on_every_page_blocks.each do |block|
|
218
|
+
block.call(page)
|
203
219
|
end
|
204
220
|
|
205
|
-
@on_pages_like_blocks.each do |pattern,
|
206
|
-
|
221
|
+
@on_pages_like_blocks.each do |pattern, blocks|
|
222
|
+
blocks.each { |block| block.call(page) } if page.url.to_s =~ pattern
|
207
223
|
end
|
208
224
|
end
|
209
225
|
|
@@ -241,7 +257,7 @@ module Anemone
|
|
241
257
|
# its URL matches a skip_link pattern.
|
242
258
|
#
|
243
259
|
def skip_link?(link)
|
244
|
-
@skip_link_patterns.any? { |
|
260
|
+
@skip_link_patterns.any? { |pattern| link.path =~ pattern }
|
245
261
|
end
|
246
262
|
|
247
263
|
end
|
data/lib/anemone/http.rb
CHANGED
@@ -1,14 +1,19 @@
|
|
1
1
|
require 'net/https'
|
2
2
|
require 'anemone/page'
|
3
|
+
require 'anemone/cookie_store'
|
3
4
|
|
4
5
|
module Anemone
|
5
6
|
class HTTP
|
6
7
|
# Maximum number of redirects to follow on each get_response
|
7
8
|
REDIRECT_LIMIT = 5
|
8
9
|
|
10
|
+
# CookieStore for this HTTP client
|
11
|
+
attr_reader :cookie_store
|
12
|
+
|
9
13
|
def initialize(opts = {})
|
10
14
|
@connections = {}
|
11
15
|
@opts = opts
|
16
|
+
@cookie_store = CookieStore.new(@opts[:cookies])
|
12
17
|
end
|
13
18
|
|
14
19
|
#
|
@@ -47,6 +52,28 @@ module Anemone
|
|
47
52
|
end
|
48
53
|
end
|
49
54
|
|
55
|
+
#
|
56
|
+
# The maximum number of redirects to follow
|
57
|
+
#
|
58
|
+
def redirect_limit
|
59
|
+
@opts[:redirect_limit] || REDIRECT_LIMIT
|
60
|
+
end
|
61
|
+
|
62
|
+
#
|
63
|
+
# The user-agent string which will be sent with each request,
|
64
|
+
# or nil if no such option is set
|
65
|
+
#
|
66
|
+
def user_agent
|
67
|
+
@opts[:user_agent]
|
68
|
+
end
|
69
|
+
|
70
|
+
#
|
71
|
+
# Does this HTTP client accept cookies from the server?
|
72
|
+
#
|
73
|
+
def accept_cookies?
|
74
|
+
@opts[:accept_cookies]
|
75
|
+
end
|
76
|
+
|
50
77
|
private
|
51
78
|
|
52
79
|
#
|
@@ -55,22 +82,19 @@ module Anemone
|
|
55
82
|
# for each response.
|
56
83
|
#
|
57
84
|
def get(url, referer = nil)
|
58
|
-
response, response_time = get_response(url, referer)
|
59
|
-
code = Integer(response.code)
|
60
|
-
loc = url
|
61
|
-
redirect_to = response.is_a?(Net::HTTPRedirection) ? URI(response['location']) : nil
|
62
|
-
yield response, code, loc, redirect_to, response_time
|
63
|
-
|
64
85
|
limit = redirect_limit
|
65
|
-
|
66
|
-
|
86
|
+
loc = url
|
87
|
+
begin
|
88
|
+
# if redirected to a relative url, merge it with the host of the original
|
89
|
+
# request url
|
67
90
|
loc = url.merge(loc) if loc.relative?
|
91
|
+
|
68
92
|
response, response_time = get_response(loc, referer)
|
69
93
|
code = Integer(response.code)
|
70
94
|
redirect_to = response.is_a?(Net::HTTPRedirection) ? URI(response['location']) : nil
|
71
95
|
yield response, code, loc, redirect_to, response_time
|
72
96
|
limit -= 1
|
73
|
-
end
|
97
|
+
end while (loc = redirect_to) && allowed?(redirect_to, url) && limit > 0
|
74
98
|
end
|
75
99
|
|
76
100
|
#
|
@@ -82,6 +106,7 @@ module Anemone
|
|
82
106
|
opts = {}
|
83
107
|
opts['User-Agent'] = user_agent if user_agent
|
84
108
|
opts['Referer'] = referer.to_s if referer
|
109
|
+
opts['Cookie'] = @cookie_store.to_s unless @cookie_store.empty? || (!accept_cookies? && @opts[:cookies].nil?)
|
85
110
|
|
86
111
|
retries = 0
|
87
112
|
begin
|
@@ -89,6 +114,7 @@ module Anemone
|
|
89
114
|
response = connection(url).get(full_path, opts)
|
90
115
|
finish = Time.now()
|
91
116
|
response_time = ((finish - start) * 1000).round
|
117
|
+
@cookie_store.merge!(response['Set-Cookie']) if accept_cookies?
|
92
118
|
return response, response_time
|
93
119
|
rescue EOFError
|
94
120
|
refresh_connection(url)
|
@@ -116,14 +142,6 @@ module Anemone
|
|
116
142
|
@connections[url.host][url.port] = http.start
|
117
143
|
end
|
118
144
|
|
119
|
-
def redirect_limit
|
120
|
-
@opts[:redirect_limit] || REDIRECT_LIMIT
|
121
|
-
end
|
122
|
-
|
123
|
-
def user_agent
|
124
|
-
@opts[:user_agent]
|
125
|
-
end
|
126
|
-
|
127
145
|
def verbose?
|
128
146
|
@opts[:verbose]
|
129
147
|
end
|
data/lib/anemone/page.rb
CHANGED
@@ -1,5 +1,6 @@
|
|
1
1
|
require 'nokogiri'
|
2
2
|
require 'ostruct'
|
3
|
+
require 'webrick/cookie'
|
3
4
|
|
4
5
|
module Anemone
|
5
6
|
class Page
|
@@ -92,6 +93,13 @@ module Anemone
|
|
92
93
|
@fetched
|
93
94
|
end
|
94
95
|
|
96
|
+
#
|
97
|
+
# Array of cookies received with this page as WEBrick::Cookie objects.
|
98
|
+
#
|
99
|
+
def cookies
|
100
|
+
WEBrick::Cookie.parse_set_cookies(@headers['Set-Cookie']) rescue []
|
101
|
+
end
|
102
|
+
|
95
103
|
#
|
96
104
|
# The content-type returned by the HTTP request for this page
|
97
105
|
#
|
data/spec/cookie_store_spec.rb
ADDED
@@ -0,0 +1,27 @@
|
|
1
|
+
require File.dirname(__FILE__) + '/spec_helper'
|
2
|
+
|
3
|
+
module Anemone
|
4
|
+
describe CookieStore do
|
5
|
+
|
6
|
+
it "should start out empty if no cookies are specified" do
|
7
|
+
CookieStore.new.empty?.should be true
|
8
|
+
end
|
9
|
+
|
10
|
+
it "should accept a Hash of cookies in the constructor" do
|
11
|
+
CookieStore.new({'test' => 'cookie'})['test'].value.should == 'cookie'
|
12
|
+
end
|
13
|
+
|
14
|
+
it "should be able to merge an HTTP cookie string" do
|
15
|
+
cs = CookieStore.new({'a' => 'a', 'b' => 'b'})
|
16
|
+
cs.merge! "a=A; path=/, c=C; path=/"
|
17
|
+
cs['a'].value.should == 'A'
|
18
|
+
cs['b'].value.should == 'b'
|
19
|
+
cs['c'].value.should == 'C'
|
20
|
+
end
|
21
|
+
|
22
|
+
it "should have a to_s method to turn the cookies into a string for the HTTP Cookie header" do
|
23
|
+
CookieStore.new({'a' => 'a', 'b' => 'b'}).to_s.should == 'a=a;b=b'
|
24
|
+
end
|
25
|
+
|
26
|
+
end
|
27
|
+
end
|
data/spec/core_spec.rb
CHANGED
@@ -154,6 +154,24 @@ module Anemone
|
|
154
154
|
urls.should_not include(pages[1].url)
|
155
155
|
end
|
156
156
|
|
157
|
+
it "should be able to set cookies to send with HTTP requests" do
|
158
|
+
cookies = {:a => '1', :b => '2'}
|
159
|
+
core = Anemone.crawl(FakePage.new('0').url) do |anemone|
|
160
|
+
anemone.cookies = cookies
|
161
|
+
end
|
162
|
+
core.opts[:cookies].should == cookies
|
163
|
+
end
|
164
|
+
|
165
|
+
it "should freeze the options once the crawl begins" do
|
166
|
+
core = Anemone.crawl(FakePage.new('0').url) do |anemone|
|
167
|
+
anemone.threads = 4
|
168
|
+
anemone.on_every_page do
|
169
|
+
lambda {anemone.threads = 2}.should raise_error
|
170
|
+
end
|
171
|
+
end
|
172
|
+
core.opts[:threads].should == 4
|
173
|
+
end
|
174
|
+
|
157
175
|
describe "many pages" do
|
158
176
|
before(:each) do
|
159
177
|
@pages, size = [], 5
|
data/spec/http_spec.rb
CHANGED
data/spec/page_spec.rb
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: anemone
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.3.2
|
4
|
+
version: 0.4.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Chris Kite
|
@@ -9,7 +9,7 @@ autorequire:
|
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
11
|
|
12
|
-
date: 2010-
|
12
|
+
date: 2010-04-08 00:00:00 -05:00
|
13
13
|
default_executable:
|
14
14
|
dependencies:
|
15
15
|
- !ruby/object:Gem::Dependency
|
@@ -46,6 +46,7 @@ files:
|
|
46
46
|
- README.rdoc
|
47
47
|
- bin/anemone
|
48
48
|
- lib/anemone.rb
|
49
|
+
- lib/anemone/cookie_store.rb
|
49
50
|
- lib/anemone/core.rb
|
50
51
|
- lib/anemone/http.rb
|
51
52
|
- lib/anemone/page.rb
|
@@ -93,6 +94,7 @@ specification_version: 3
|
|
93
94
|
summary: Anemone web-spider framework
|
94
95
|
test_files:
|
95
96
|
- spec/anemone_spec.rb
|
97
|
+
- spec/cookie_store_spec.rb
|
96
98
|
- spec/core_spec.rb
|
97
99
|
- spec/page_spec.rb
|
98
100
|
- spec/page_store_spec.rb
|