spk-anemone 0.3.1 → 0.4.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,3 +1,15 @@
1
+ == 0.4.0 / 2010-04-08
2
+
3
+ * Major enhancements
4
+
5
+ * Cookies can be accepted and sent with each HTTP request.
6
+
7
+ == 0.3.2 / 2010-02-04
8
+
9
+ * Bug fixes
10
+
11
+ * Fixed issue that allowed following redirects off the original domain
12
+
1
13
  == 0.3.1 / 2010-01-22
2
14
 
3
15
  * Minor enhancements
@@ -0,0 +1,35 @@
1
+ require 'delegate'
2
+ require 'webrick/cookie'
3
+
4
+ class WEBrick::Cookie
5
+ def expired?
6
+ !!expires && expires < Time.now
7
+ end
8
+ end
9
+
10
+ module Anemone
11
+ class CookieStore < DelegateClass(Hash)
12
+
13
+ def initialize(cookies = nil)
14
+ @cookies = {}
15
+ cookies.each { |name, value| @cookies[name] = WEBrick::Cookie.new(name, value) } if cookies
16
+ super(@cookies)
17
+ end
18
+
19
+ def merge!(set_cookie_str)
20
+ begin
21
+ cookie_hash = WEBrick::Cookie.parse_set_cookies(set_cookie_str).inject({}) do |hash, cookie|
22
+ hash[cookie.name] = cookie if !!cookie
23
+ hash
24
+ end
25
+ @cookies.merge! cookie_hash
26
+ rescue
27
+ end
28
+ end
29
+
30
+ def to_s
31
+ @cookies.values.reject { |cookie| cookie.expired? }.map { |cookie| "#{cookie.name}=#{cookie.value}" }.join(';')
32
+ end
33
+
34
+ end
35
+ end
@@ -7,7 +7,7 @@ require 'anemone/storage'
7
7
 
8
8
  module Anemone
9
9
 
10
- VERSION = '0.3.1';
10
+ VERSION = '0.4.0';
11
11
 
12
12
  #
13
13
  # Convenience method to start a crawl
@@ -42,14 +42,18 @@ module Anemone
42
42
  :redirect_limit => 5,
43
43
  # storage engine defaults to Hash in +process_options+ if none specified
44
44
  :storage => nil,
45
+ # Hash of cookie name => value to send with HTTP requests
46
+ :cookies => nil,
47
+ # accept cookies from the server and send them back?
48
+ :accept_cookies => false,
45
49
  # Authentication
46
50
  :authorization => nil,
47
51
  }
48
52
 
49
53
  # Create setter methods for all options to be called from the crawl block
50
54
  DEFAULT_OPTS.keys.each do |key|
51
- define_method "#{key}=" do |*args|
52
- @opts[key.to_sym] = *args
55
+ define_method "#{key}=" do |value|
56
+ @opts[key.to_sym] = value
53
57
  end
54
58
  end
55
59
 
@@ -178,7 +182,7 @@ module Anemone
178
182
  end
179
183
  end
180
184
 
181
- @tentacles.each { |t| t.join }
185
+ @tentacles.each { |thread| thread.join }
182
186
  do_after_crawl_blocks
183
187
  self
184
188
  end
@@ -191,6 +195,18 @@ module Anemone
191
195
  @opts[:threads] = 1 if @opts[:delay] > 0
192
196
  @pages = PageStore.new(@opts[:storage] || Anemone::Storage.Hash)
193
197
  @robots = Robots.new(@opts[:user_agent]) if @opts[:obey_robots_txt]
198
+
199
+ freeze_options
200
+ end
201
+
202
+ #
203
+ # Freeze the opts Hash so that no options can be modified
204
+ # once the crawl begins
205
+ #
206
+ def freeze_options
207
+ @opts.freeze
208
+ @opts.each_key { |key| @opts[key].freeze }
209
+ @opts[:cookies].each_key { |key| @opts[:cookies][key].freeze } rescue nil
194
210
  end
195
211
 
196
212
  # Generate Authorization string and set authorization opts
@@ -213,19 +229,19 @@ module Anemone
213
229
  # Execute the after_crawl blocks
214
230
  #
215
231
  def do_after_crawl_blocks
216
- @after_crawl_blocks.each { |b| b.call(@pages) }
232
+ @after_crawl_blocks.each { |block| block.call(@pages) }
217
233
  end
218
234
 
219
235
  #
220
236
  # Execute the on_every_page blocks for *page*
221
237
  #
222
238
  def do_page_blocks(page)
223
- @on_every_page_blocks.each do |blk|
224
- blk.call(page)
239
+ @on_every_page_blocks.each do |block|
240
+ block.call(page)
225
241
  end
226
242
 
227
- @on_pages_like_blocks.each do |pattern, blks|
228
- blks.each { |blk| blk.call(page) } if page.url.to_s =~ pattern
243
+ @on_pages_like_blocks.each do |pattern, blocks|
244
+ blocks.each { |block| block.call(page) } if page.url.to_s =~ pattern
229
245
  end
230
246
  end
231
247
 
@@ -263,7 +279,7 @@ module Anemone
263
279
  # its URL matches a skip_link pattern.
264
280
  #
265
281
  def skip_link?(link)
266
- @skip_link_patterns.any? { |p| link.path =~ p }
282
+ @skip_link_patterns.any? { |pattern| link.path =~ pattern }
267
283
  end
268
284
 
269
285
  end
@@ -1,14 +1,19 @@
1
1
  require 'net/https'
2
2
  require 'anemone/page'
3
+ require 'anemone/cookie_store'
3
4
 
4
5
  module Anemone
5
6
  class HTTP
6
7
  # Maximum number of redirects to follow on each get_response
7
8
  REDIRECT_LIMIT = 5
8
9
 
10
+ # CookieStore for this HTTP client
11
+ attr_reader :cookie_store
12
+
9
13
  def initialize(opts = {})
10
14
  @connections = {}
11
15
  @opts = opts
16
+ @cookie_store = CookieStore.new(@opts[:cookies])
12
17
  end
13
18
 
14
19
  #
@@ -47,6 +52,28 @@ module Anemone
47
52
  end
48
53
  end
49
54
 
55
+ #
56
+ # The maximum number of redirects to follow
57
+ #
58
+ def redirect_limit
59
+ @opts[:redirect_limit] || REDIRECT_LIMIT
60
+ end
61
+
62
+ #
63
+ # The user-agent string which will be sent with each request,
64
+ # or nil if no such option is set
65
+ #
66
+ def user_agent
67
+ @opts[:user_agent]
68
+ end
69
+
70
+ #
71
+ # Does this HTTP client accept cookies from the server?
72
+ #
73
+ def accept_cookies?
74
+ @opts[:accept_cookies]
75
+ end
76
+
50
77
  private
51
78
 
52
79
  #
@@ -55,22 +82,19 @@ module Anemone
55
82
  # for each response.
56
83
  #
57
84
  def get(url, referer = nil)
58
- response, response_time = get_response(url, referer)
59
- code = Integer(response.code)
60
- loc = url
61
- redirect_to = response.is_a?(Net::HTTPRedirection) ? URI(response['location']) : nil
62
- yield response, code, loc, redirect_to, response_time
63
-
64
85
  limit = redirect_limit
65
- while response.is_a?(Net::HTTPRedirection) and limit > 0
66
- loc = redirect_to
86
+ loc = url
87
+ begin
88
+ # if redirected to a relative url, merge it with the host of the original
89
+ # request url
67
90
  loc = url.merge(loc) if loc.relative?
91
+
68
92
  response, response_time = get_response(loc, referer)
69
93
  code = Integer(response.code)
70
94
  redirect_to = response.is_a?(Net::HTTPRedirection) ? URI(response['location']) : nil
71
95
  yield response, code, loc, redirect_to, response_time
72
96
  limit -= 1
73
- end
97
+ end while (loc = redirect_to) && allowed?(redirect_to, url) && limit > 0
74
98
  end
75
99
 
76
100
  #
@@ -82,6 +106,7 @@ module Anemone
82
106
  opts = {}
83
107
  opts['User-Agent'] = user_agent if user_agent
84
108
  opts['Referer'] = referer.to_s if referer
109
+ opts['Cookie'] = @cookie_store.to_s unless @cookie_store.empty? || (!accept_cookies? && @opts[:cookies].nil?)
85
110
  opts['Authorization'] = authorization if authorization
86
111
 
87
112
  retries = 0
@@ -90,6 +115,7 @@ module Anemone
90
115
  response = connection(url).get(full_path, opts)
91
116
  finish = Time.now()
92
117
  response_time = ((finish - start) * 1000).round
118
+ @cookie_store.merge!(response['Set-Cookie']) if accept_cookies?
93
119
  return response, response_time
94
120
  rescue EOFError
95
121
  refresh_connection(url)
@@ -117,18 +143,17 @@ module Anemone
117
143
  @connections[url.host][url.port] = http.start
118
144
  end
119
145
 
120
- def redirect_limit
121
- @opts[:redirect_limit] || REDIRECT_LIMIT
122
- end
123
-
124
- def user_agent
125
- @opts[:user_agent]
126
- end
127
-
128
146
  def verbose?
129
147
  @opts[:verbose]
130
148
  end
131
149
 
150
+ #
151
+ # Allowed to connect to the requested url?
152
+ #
153
+ def allowed?(to_url, from_url)
154
+ to_url.host.nil? || (to_url.host == from_url.host)
155
+ end
156
+
132
157
  def authorization
133
158
  @opts[:authorization]
134
159
  end
@@ -1,5 +1,6 @@
1
1
  require 'nokogiri'
2
2
  require 'ostruct'
3
+ require 'webrick/cookie'
3
4
 
4
5
  module Anemone
5
6
  class Page
@@ -94,6 +95,13 @@ module Anemone
94
95
  @fetched
95
96
  end
96
97
 
98
+ #
99
+ # Array of cookies received with this page as WEBrick::Cookie objects.
100
+ #
101
+ def cookies
102
+ WEBrick::Cookie.parse_set_cookies(@headers['Set-Cookie']) rescue []
103
+ end
104
+
97
105
  #
98
106
  # The content-type returned by the HTTP request for this page
99
107
  #
@@ -0,0 +1,27 @@
1
+ require File.dirname(__FILE__) + '/spec_helper'
2
+
3
+ module Anemone
4
+ describe CookieStore do
5
+
6
+ it "should start out empty if no cookies are specified" do
7
+ CookieStore.new.empty?.should be true
8
+ end
9
+
10
+ it "should accept a Hash of cookies in the constructor" do
11
+ CookieStore.new({'test' => 'cookie'})['test'].value.should == 'cookie'
12
+ end
13
+
14
+ it "should be able to merge an HTTP cookie string" do
15
+ cs = CookieStore.new({'a' => 'a', 'b' => 'b'})
16
+ cs.merge! "a=A; path=/, c=C; path=/"
17
+ cs['a'].value.should == 'A'
18
+ cs['b'].value.should == 'b'
19
+ cs['c'].value.should == 'C'
20
+ end
21
+
22
+ it "should have a to_s method to turn the cookies into a string for the HTTP Cookie header" do
23
+ CookieStore.new({'a' => 'a', 'b' => 'b'}).to_s.should == 'a=a;b=b'
24
+ end
25
+
26
+ end
27
+ end
@@ -19,7 +19,7 @@ module Anemone
19
19
  Anemone.crawl(pages[0].url, @opts).should have(4).pages
20
20
  end
21
21
 
22
- it "should not leave the original domain" do
22
+ it "should not follow links that leave the original domain" do
23
23
  pages = []
24
24
  pages << FakePage.new('0', :links => ['1'], :hrefs => 'http://www.other.com/')
25
25
  pages << FakePage.new('1')
@@ -30,6 +30,17 @@ module Anemone
30
30
  core.pages.keys.should_not include('http://www.other.com/')
31
31
  end
32
32
 
33
+ it "should not follow redirects that leave the original domain" do
34
+ pages = []
35
+ pages << FakePage.new('0', :links => ['1'], :redirect => 'http://www.other.com/')
36
+ pages << FakePage.new('1')
37
+
38
+ core = Anemone.crawl(pages[0].url, @opts)
39
+
40
+ core.should have(2).pages
41
+ core.pages.keys.should_not include('http://www.other.com/')
42
+ end
43
+
33
44
  it "should follow http redirects" do
34
45
  pages = []
35
46
  pages << FakePage.new('0', :links => ['1'])
@@ -143,6 +154,24 @@ module Anemone
143
154
  urls.should_not include(pages[1].url)
144
155
  end
145
156
 
157
+ it "should be able to set cookies to send with HTTP requests" do
158
+ cookies = {:a => '1', :b => '2'}
159
+ core = Anemone.crawl(FakePage.new('0').url) do |anemone|
160
+ anemone.cookies = cookies
161
+ end
162
+ core.opts[:cookies].should == cookies
163
+ end
164
+
165
+ it "should freeze the options once the crawl begins" do
166
+ core = Anemone.crawl(FakePage.new('0').url) do |anemone|
167
+ anemone.threads = 4
168
+ anemone.on_every_page do
169
+ lambda {anemone.threads = 2}.should raise_error
170
+ end
171
+ end
172
+ core.opts[:threads].should == 4
173
+ end
174
+
146
175
  describe "many pages" do
147
176
  before(:each) do
148
177
  @pages, size = [], 5
@@ -9,12 +9,12 @@ FakeWeb.allow_net_connect = false
9
9
 
10
10
  module Anemone
11
11
  SPEC_DOMAIN = "http://www.example.com/"
12
-
12
+
13
13
  class FakePage
14
14
  attr_accessor :links
15
15
  attr_accessor :hrefs
16
16
  attr_accessor :body
17
-
17
+
18
18
  def initialize(name = '', options = {})
19
19
  @name = name
20
20
  @links = [options[:links]].flatten if options.has_key?(:links)
@@ -22,30 +22,38 @@ module Anemone
22
22
  @redirect = options[:redirect] if options.has_key?(:redirect)
23
23
  @content_type = options[:content_type] || "text/html"
24
24
  @body = options[:body]
25
-
25
+
26
26
  create_body unless @body
27
27
  add_to_fakeweb
28
28
  end
29
-
29
+
30
30
  def url
31
31
  SPEC_DOMAIN + @name
32
32
  end
33
-
33
+
34
34
  private
35
-
35
+
36
36
  def create_body
37
37
  @body = "<html><body>"
38
38
  @links.each{|l| @body += "<a href=\"#{SPEC_DOMAIN}#{l}\"></a>"} if @links
39
39
  @hrefs.each{|h| @body += "<a href=\"#{h}\"></a>"} if @hrefs
40
40
  @body += "</body></html>"
41
41
  end
42
-
42
+
43
43
  def add_to_fakeweb
44
44
  options = {:body => @body, :content_type => @content_type, :status => [200, "OK"]}
45
-
45
+
46
46
  if @redirect
47
- options[:status] = [301, "Permanently Moved"]
48
- options[:location] = SPEC_DOMAIN + @redirect
47
+ options[:status] = [301, "Permanently Moved"]
48
+
49
+ # only prepend SPEC_DOMAIN if a relative url (without an http scheme) was specified
50
+ redirect_url = (@redirect =~ /http/) ? @redirect : SPEC_DOMAIN + @redirect
51
+ options[:location] = redirect_url
52
+
53
+ # register the page this one redirects to
54
+ FakeWeb.register_uri(:get, redirect_url, {:body => '',
55
+ :content_type => @content_type,
56
+ :status => [200, "OK"]})
49
57
  end
50
58
 
51
59
  FakeWeb.register_uri(:get, SPEC_DOMAIN + @name, options)
@@ -15,9 +15,10 @@ module Anemone
15
15
  end
16
16
  end
17
17
 
18
- http = HTTP.new
18
+ http = Anemone::HTTP.new
19
19
  http.fetch_page(SPEC_DOMAIN).should be_an_instance_of(Page)
20
20
  end
21
+
21
22
  end
22
23
  end
23
- end
24
+ end
@@ -68,5 +68,10 @@ module Anemone
68
68
  @page.should respond_to(:response_time)
69
69
  end
70
70
 
71
+ it "should have the cookies received with the page" do
72
+ @page.should respond_to(:cookies)
73
+ @page.cookies.should == []
74
+ end
75
+
71
76
  end
72
77
  end
metadata CHANGED
@@ -1,7 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: spk-anemone
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.1
4
+ hash: 15
5
+ prerelease: false
6
+ segments:
7
+ - 0
8
+ - 4
9
+ - 0
10
+ version: 0.4.0
5
11
  platform: ruby
6
12
  authors:
7
13
  - Chris Kite
@@ -9,29 +15,41 @@ autorequire:
9
15
  bindir: bin
10
16
  cert_chain: []
11
17
 
12
- date: 2010-01-29 00:00:00 +01:00
18
+ date: 2010-08-18 00:00:00 +02:00
13
19
  default_executable:
14
20
  dependencies:
15
21
  - !ruby/object:Gem::Dependency
16
22
  name: nokogiri
17
- type: :runtime
18
- version_requirement:
19
- version_requirements: !ruby/object:Gem::Requirement
23
+ prerelease: false
24
+ requirement: &id001 !ruby/object:Gem::Requirement
25
+ none: false
20
26
  requirements:
21
27
  - - ">="
22
28
  - !ruby/object:Gem::Version
29
+ hash: 5
30
+ segments:
31
+ - 1
32
+ - 4
33
+ - 1
23
34
  version: 1.4.1
24
- version:
35
+ type: :runtime
36
+ version_requirements: *id001
25
37
  - !ruby/object:Gem::Dependency
26
38
  name: robots
27
- type: :runtime
28
- version_requirement:
29
- version_requirements: !ruby/object:Gem::Requirement
39
+ prerelease: false
40
+ requirement: &id002 !ruby/object:Gem::Requirement
41
+ none: false
30
42
  requirements:
31
43
  - - ">="
32
44
  - !ruby/object:Gem::Version
45
+ hash: 7
46
+ segments:
47
+ - 0
48
+ - 7
49
+ - 2
33
50
  version: 0.7.2
34
- version:
51
+ type: :runtime
52
+ version_requirements: *id002
35
53
  description:
36
54
  email:
37
55
  executables:
@@ -46,6 +64,7 @@ files:
46
64
  - README.rdoc
47
65
  - bin/anemone
48
66
  - lib/anemone.rb
67
+ - lib/anemone/cookie_store.rb
49
68
  - lib/anemone/core.rb
50
69
  - lib/anemone/http.rb
51
70
  - lib/anemone/page.rb
@@ -60,6 +79,15 @@ files:
60
79
  - lib/anemone/cli/count.rb
61
80
  - lib/anemone/cli/pagedepth.rb
62
81
  - lib/anemone/cli/serialize.rb
82
+ - spec/anemone_spec.rb
83
+ - spec/cookie_store_spec.rb
84
+ - spec/core_spec.rb
85
+ - spec/page_spec.rb
86
+ - spec/page_store_spec.rb
87
+ - spec/http_spec.rb
88
+ - spec/storage_spec.rb
89
+ - spec/fakeweb_helper.rb
90
+ - spec/spec_helper.rb
63
91
  has_rdoc: true
64
92
  homepage: http://anemone.rubyforge.org
65
93
  licenses: []
@@ -73,26 +101,33 @@ rdoc_options:
73
101
  require_paths:
74
102
  - lib
75
103
  required_ruby_version: !ruby/object:Gem::Requirement
104
+ none: false
76
105
  requirements:
77
106
  - - ">="
78
107
  - !ruby/object:Gem::Version
108
+ hash: 3
109
+ segments:
110
+ - 0
79
111
  version: "0"
80
- version:
81
112
  required_rubygems_version: !ruby/object:Gem::Requirement
113
+ none: false
82
114
  requirements:
83
115
  - - ">="
84
116
  - !ruby/object:Gem::Version
117
+ hash: 3
118
+ segments:
119
+ - 0
85
120
  version: "0"
86
- version:
87
121
  requirements: []
88
122
 
89
123
  rubyforge_project: anemone
90
- rubygems_version: 1.3.5
124
+ rubygems_version: 1.3.7
91
125
  signing_key:
92
126
  specification_version: 3
93
127
  summary: Anemone web-spider framework
94
128
  test_files:
95
129
  - spec/anemone_spec.rb
130
+ - spec/cookie_store_spec.rb
96
131
  - spec/core_spec.rb
97
132
  - spec/page_spec.rb
98
133
  - spec/page_store_spec.rb