spk-anemone 0.3.1 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,3 +1,15 @@
1
+ == 0.4.0 / 2010-04-08
2
+
3
+ * Major enhancements
4
+
5
+ * Cookies can be accepted and sent with each HTTP request.
6
+
7
+ == 0.3.2 / 2010-02-04
8
+
9
+ * Bug fixes
10
+
11
+ * Fixed issue that allowed following redirects off the original domain
12
+
1
13
  == 0.3.1 / 2010-01-22
2
14
 
3
15
  * Minor enhancements
@@ -0,0 +1,35 @@
1
+ require 'delegate'
2
+ require 'webrick/cookie'
3
+
4
+ class WEBrick::Cookie
5
+ def expired?
6
+ !!expires && expires < Time.now
7
+ end
8
+ end
9
+
10
+ module Anemone
11
+ class CookieStore < DelegateClass(Hash)
12
+
13
+ def initialize(cookies = nil)
14
+ @cookies = {}
15
+ cookies.each { |name, value| @cookies[name] = WEBrick::Cookie.new(name, value) } if cookies
16
+ super(@cookies)
17
+ end
18
+
19
+ def merge!(set_cookie_str)
20
+ begin
21
+ cookie_hash = WEBrick::Cookie.parse_set_cookies(set_cookie_str).inject({}) do |hash, cookie|
22
+ hash[cookie.name] = cookie if !!cookie
23
+ hash
24
+ end
25
+ @cookies.merge! cookie_hash
26
+ rescue
27
+ end
28
+ end
29
+
30
+ def to_s
31
+ @cookies.values.reject { |cookie| cookie.expired? }.map { |cookie| "#{cookie.name}=#{cookie.value}" }.join(';')
32
+ end
33
+
34
+ end
35
+ end
@@ -7,7 +7,7 @@ require 'anemone/storage'
7
7
 
8
8
  module Anemone
9
9
 
10
- VERSION = '0.3.1';
10
+ VERSION = '0.4.0';
11
11
 
12
12
  #
13
13
  # Convenience method to start a crawl
@@ -42,14 +42,18 @@ module Anemone
42
42
  :redirect_limit => 5,
43
43
  # storage engine defaults to Hash in +process_options+ if none specified
44
44
  :storage => nil,
45
+ # Hash of cookie name => value to send with HTTP requests
46
+ :cookies => nil,
47
+ # accept cookies from the server and send them back?
48
+ :accept_cookies => false,
45
49
  # Authentication
46
50
  :authorization => nil,
47
51
  }
48
52
 
49
53
  # Create setter methods for all options to be called from the crawl block
50
54
  DEFAULT_OPTS.keys.each do |key|
51
- define_method "#{key}=" do |*args|
52
- @opts[key.to_sym] = *args
55
+ define_method "#{key}=" do |value|
56
+ @opts[key.to_sym] = value
53
57
  end
54
58
  end
55
59
 
@@ -178,7 +182,7 @@ module Anemone
178
182
  end
179
183
  end
180
184
 
181
- @tentacles.each { |t| t.join }
185
+ @tentacles.each { |thread| thread.join }
182
186
  do_after_crawl_blocks
183
187
  self
184
188
  end
@@ -191,6 +195,18 @@ module Anemone
191
195
  @opts[:threads] = 1 if @opts[:delay] > 0
192
196
  @pages = PageStore.new(@opts[:storage] || Anemone::Storage.Hash)
193
197
  @robots = Robots.new(@opts[:user_agent]) if @opts[:obey_robots_txt]
198
+
199
+ freeze_options
200
+ end
201
+
202
+ #
203
+ # Freeze the opts Hash so that no options can be modified
204
+ # once the crawl begins
205
+ #
206
+ def freeze_options
207
+ @opts.freeze
208
+ @opts.each_key { |key| @opts[key].freeze }
209
+ @opts[:cookies].each_key { |key| @opts[:cookies][key].freeze } rescue nil
194
210
  end
195
211
 
196
212
  # Generate Authorization string and set authorization opts
@@ -213,19 +229,19 @@ module Anemone
213
229
  # Execute the after_crawl blocks
214
230
  #
215
231
  def do_after_crawl_blocks
216
- @after_crawl_blocks.each { |b| b.call(@pages) }
232
+ @after_crawl_blocks.each { |block| block.call(@pages) }
217
233
  end
218
234
 
219
235
  #
220
236
  # Execute the on_every_page blocks for *page*
221
237
  #
222
238
  def do_page_blocks(page)
223
- @on_every_page_blocks.each do |blk|
224
- blk.call(page)
239
+ @on_every_page_blocks.each do |block|
240
+ block.call(page)
225
241
  end
226
242
 
227
- @on_pages_like_blocks.each do |pattern, blks|
228
- blks.each { |blk| blk.call(page) } if page.url.to_s =~ pattern
243
+ @on_pages_like_blocks.each do |pattern, blocks|
244
+ blocks.each { |block| block.call(page) } if page.url.to_s =~ pattern
229
245
  end
230
246
  end
231
247
 
@@ -263,7 +279,7 @@ module Anemone
263
279
  # its URL matches a skip_link pattern.
264
280
  #
265
281
  def skip_link?(link)
266
- @skip_link_patterns.any? { |p| link.path =~ p }
282
+ @skip_link_patterns.any? { |pattern| link.path =~ pattern }
267
283
  end
268
284
 
269
285
  end
@@ -1,14 +1,19 @@
1
1
  require 'net/https'
2
2
  require 'anemone/page'
3
+ require 'anemone/cookie_store'
3
4
 
4
5
  module Anemone
5
6
  class HTTP
6
7
  # Maximum number of redirects to follow on each get_response
7
8
  REDIRECT_LIMIT = 5
8
9
 
10
+ # CookieStore for this HTTP client
11
+ attr_reader :cookie_store
12
+
9
13
  def initialize(opts = {})
10
14
  @connections = {}
11
15
  @opts = opts
16
+ @cookie_store = CookieStore.new(@opts[:cookies])
12
17
  end
13
18
 
14
19
  #
@@ -47,6 +52,28 @@ module Anemone
47
52
  end
48
53
  end
49
54
 
55
+ #
56
+ # The maximum number of redirects to follow
57
+ #
58
+ def redirect_limit
59
+ @opts[:redirect_limit] || REDIRECT_LIMIT
60
+ end
61
+
62
+ #
63
+ # The user-agent string which will be sent with each request,
64
+ # or nil if no such option is set
65
+ #
66
+ def user_agent
67
+ @opts[:user_agent]
68
+ end
69
+
70
+ #
71
+ # Does this HTTP client accept cookies from the server?
72
+ #
73
+ def accept_cookies?
74
+ @opts[:accept_cookies]
75
+ end
76
+
50
77
  private
51
78
 
52
79
  #
@@ -55,22 +82,19 @@ module Anemone
55
82
  # for each response.
56
83
  #
57
84
  def get(url, referer = nil)
58
- response, response_time = get_response(url, referer)
59
- code = Integer(response.code)
60
- loc = url
61
- redirect_to = response.is_a?(Net::HTTPRedirection) ? URI(response['location']) : nil
62
- yield response, code, loc, redirect_to, response_time
63
-
64
85
  limit = redirect_limit
65
- while response.is_a?(Net::HTTPRedirection) and limit > 0
66
- loc = redirect_to
86
+ loc = url
87
+ begin
88
+ # if redirected to a relative url, merge it with the host of the original
89
+ # request url
67
90
  loc = url.merge(loc) if loc.relative?
91
+
68
92
  response, response_time = get_response(loc, referer)
69
93
  code = Integer(response.code)
70
94
  redirect_to = response.is_a?(Net::HTTPRedirection) ? URI(response['location']) : nil
71
95
  yield response, code, loc, redirect_to, response_time
72
96
  limit -= 1
73
- end
97
+ end while (loc = redirect_to) && allowed?(redirect_to, url) && limit > 0
74
98
  end
75
99
 
76
100
  #
@@ -82,6 +106,7 @@ module Anemone
82
106
  opts = {}
83
107
  opts['User-Agent'] = user_agent if user_agent
84
108
  opts['Referer'] = referer.to_s if referer
109
+ opts['Cookie'] = @cookie_store.to_s unless @cookie_store.empty? || (!accept_cookies? && @opts[:cookies].nil?)
85
110
  opts['Authorization'] = authorization if authorization
86
111
 
87
112
  retries = 0
@@ -90,6 +115,7 @@ module Anemone
90
115
  response = connection(url).get(full_path, opts)
91
116
  finish = Time.now()
92
117
  response_time = ((finish - start) * 1000).round
118
+ @cookie_store.merge!(response['Set-Cookie']) if accept_cookies?
93
119
  return response, response_time
94
120
  rescue EOFError
95
121
  refresh_connection(url)
@@ -117,18 +143,17 @@ module Anemone
117
143
  @connections[url.host][url.port] = http.start
118
144
  end
119
145
 
120
- def redirect_limit
121
- @opts[:redirect_limit] || REDIRECT_LIMIT
122
- end
123
-
124
- def user_agent
125
- @opts[:user_agent]
126
- end
127
-
128
146
  def verbose?
129
147
  @opts[:verbose]
130
148
  end
131
149
 
150
+ #
151
+ # Allowed to connect to the requested url?
152
+ #
153
+ def allowed?(to_url, from_url)
154
+ to_url.host.nil? || (to_url.host == from_url.host)
155
+ end
156
+
132
157
  def authorization
133
158
  @opts[:authorization]
134
159
  end
@@ -1,5 +1,6 @@
1
1
  require 'nokogiri'
2
2
  require 'ostruct'
3
+ require 'webrick/cookie'
3
4
 
4
5
  module Anemone
5
6
  class Page
@@ -94,6 +95,13 @@ module Anemone
94
95
  @fetched
95
96
  end
96
97
 
98
+ #
99
+ # Array of cookies received with this page as WEBrick::Cookie objects.
100
+ #
101
+ def cookies
102
+ WEBrick::Cookie.parse_set_cookies(@headers['Set-Cookie']) rescue []
103
+ end
104
+
97
105
  #
98
106
  # The content-type returned by the HTTP request for this page
99
107
  #
@@ -0,0 +1,27 @@
1
+ require File.dirname(__FILE__) + '/spec_helper'
2
+
3
+ module Anemone
4
+ describe CookieStore do
5
+
6
+ it "should start out empty if no cookies are specified" do
7
+ CookieStore.new.empty?.should be true
8
+ end
9
+
10
+ it "should accept a Hash of cookies in the constructor" do
11
+ CookieStore.new({'test' => 'cookie'})['test'].value.should == 'cookie'
12
+ end
13
+
14
+ it "should be able to merge an HTTP cookie string" do
15
+ cs = CookieStore.new({'a' => 'a', 'b' => 'b'})
16
+ cs.merge! "a=A; path=/, c=C; path=/"
17
+ cs['a'].value.should == 'A'
18
+ cs['b'].value.should == 'b'
19
+ cs['c'].value.should == 'C'
20
+ end
21
+
22
+ it "should have a to_s method to turn the cookies into a string for the HTTP Cookie header" do
23
+ CookieStore.new({'a' => 'a', 'b' => 'b'}).to_s.should == 'a=a;b=b'
24
+ end
25
+
26
+ end
27
+ end
@@ -19,7 +19,7 @@ module Anemone
19
19
  Anemone.crawl(pages[0].url, @opts).should have(4).pages
20
20
  end
21
21
 
22
- it "should not leave the original domain" do
22
+ it "should not follow links that leave the original domain" do
23
23
  pages = []
24
24
  pages << FakePage.new('0', :links => ['1'], :hrefs => 'http://www.other.com/')
25
25
  pages << FakePage.new('1')
@@ -30,6 +30,17 @@ module Anemone
30
30
  core.pages.keys.should_not include('http://www.other.com/')
31
31
  end
32
32
 
33
+ it "should not follow redirects that leave the original domain" do
34
+ pages = []
35
+ pages << FakePage.new('0', :links => ['1'], :redirect => 'http://www.other.com/')
36
+ pages << FakePage.new('1')
37
+
38
+ core = Anemone.crawl(pages[0].url, @opts)
39
+
40
+ core.should have(2).pages
41
+ core.pages.keys.should_not include('http://www.other.com/')
42
+ end
43
+
33
44
  it "should follow http redirects" do
34
45
  pages = []
35
46
  pages << FakePage.new('0', :links => ['1'])
@@ -143,6 +154,24 @@ module Anemone
143
154
  urls.should_not include(pages[1].url)
144
155
  end
145
156
 
157
+ it "should be able to set cookies to send with HTTP requests" do
158
+ cookies = {:a => '1', :b => '2'}
159
+ core = Anemone.crawl(FakePage.new('0').url) do |anemone|
160
+ anemone.cookies = cookies
161
+ end
162
+ core.opts[:cookies].should == cookies
163
+ end
164
+
165
+ it "should freeze the options once the crawl begins" do
166
+ core = Anemone.crawl(FakePage.new('0').url) do |anemone|
167
+ anemone.threads = 4
168
+ anemone.on_every_page do
169
+ lambda {anemone.threads = 2}.should raise_error
170
+ end
171
+ end
172
+ core.opts[:threads].should == 4
173
+ end
174
+
146
175
  describe "many pages" do
147
176
  before(:each) do
148
177
  @pages, size = [], 5
@@ -9,12 +9,12 @@ FakeWeb.allow_net_connect = false
9
9
 
10
10
  module Anemone
11
11
  SPEC_DOMAIN = "http://www.example.com/"
12
-
12
+
13
13
  class FakePage
14
14
  attr_accessor :links
15
15
  attr_accessor :hrefs
16
16
  attr_accessor :body
17
-
17
+
18
18
  def initialize(name = '', options = {})
19
19
  @name = name
20
20
  @links = [options[:links]].flatten if options.has_key?(:links)
@@ -22,30 +22,38 @@ module Anemone
22
22
  @redirect = options[:redirect] if options.has_key?(:redirect)
23
23
  @content_type = options[:content_type] || "text/html"
24
24
  @body = options[:body]
25
-
25
+
26
26
  create_body unless @body
27
27
  add_to_fakeweb
28
28
  end
29
-
29
+
30
30
  def url
31
31
  SPEC_DOMAIN + @name
32
32
  end
33
-
33
+
34
34
  private
35
-
35
+
36
36
  def create_body
37
37
  @body = "<html><body>"
38
38
  @links.each{|l| @body += "<a href=\"#{SPEC_DOMAIN}#{l}\"></a>"} if @links
39
39
  @hrefs.each{|h| @body += "<a href=\"#{h}\"></a>"} if @hrefs
40
40
  @body += "</body></html>"
41
41
  end
42
-
42
+
43
43
  def add_to_fakeweb
44
44
  options = {:body => @body, :content_type => @content_type, :status => [200, "OK"]}
45
-
45
+
46
46
  if @redirect
47
- options[:status] = [301, "Permanently Moved"]
48
- options[:location] = SPEC_DOMAIN + @redirect
47
+ options[:status] = [301, "Permanently Moved"]
48
+
49
+ # only prepend SPEC_DOMAIN if a relative url (without an http scheme) was specified
50
+ redirect_url = (@redirect =~ /http/) ? @redirect : SPEC_DOMAIN + @redirect
51
+ options[:location] = redirect_url
52
+
53
+ # register the page this one redirects to
54
+ FakeWeb.register_uri(:get, redirect_url, {:body => '',
55
+ :content_type => @content_type,
56
+ :status => [200, "OK"]})
49
57
  end
50
58
 
51
59
  FakeWeb.register_uri(:get, SPEC_DOMAIN + @name, options)
@@ -15,9 +15,10 @@ module Anemone
15
15
  end
16
16
  end
17
17
 
18
- http = HTTP.new
18
+ http = Anemone::HTTP.new
19
19
  http.fetch_page(SPEC_DOMAIN).should be_an_instance_of(Page)
20
20
  end
21
+
21
22
  end
22
23
  end
23
- end
24
+ end
@@ -68,5 +68,10 @@ module Anemone
68
68
  @page.should respond_to(:response_time)
69
69
  end
70
70
 
71
+ it "should have the cookies received with the page" do
72
+ @page.should respond_to(:cookies)
73
+ @page.cookies.should == []
74
+ end
75
+
71
76
  end
72
77
  end
metadata CHANGED
@@ -1,7 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: spk-anemone
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.1
4
+ hash: 15
5
+ prerelease: false
6
+ segments:
7
+ - 0
8
+ - 4
9
+ - 0
10
+ version: 0.4.0
5
11
  platform: ruby
6
12
  authors:
7
13
  - Chris Kite
@@ -9,29 +15,41 @@ autorequire:
9
15
  bindir: bin
10
16
  cert_chain: []
11
17
 
12
- date: 2010-01-29 00:00:00 +01:00
18
+ date: 2010-08-18 00:00:00 +02:00
13
19
  default_executable:
14
20
  dependencies:
15
21
  - !ruby/object:Gem::Dependency
16
22
  name: nokogiri
17
- type: :runtime
18
- version_requirement:
19
- version_requirements: !ruby/object:Gem::Requirement
23
+ prerelease: false
24
+ requirement: &id001 !ruby/object:Gem::Requirement
25
+ none: false
20
26
  requirements:
21
27
  - - ">="
22
28
  - !ruby/object:Gem::Version
29
+ hash: 5
30
+ segments:
31
+ - 1
32
+ - 4
33
+ - 1
23
34
  version: 1.4.1
24
- version:
35
+ type: :runtime
36
+ version_requirements: *id001
25
37
  - !ruby/object:Gem::Dependency
26
38
  name: robots
27
- type: :runtime
28
- version_requirement:
29
- version_requirements: !ruby/object:Gem::Requirement
39
+ prerelease: false
40
+ requirement: &id002 !ruby/object:Gem::Requirement
41
+ none: false
30
42
  requirements:
31
43
  - - ">="
32
44
  - !ruby/object:Gem::Version
45
+ hash: 7
46
+ segments:
47
+ - 0
48
+ - 7
49
+ - 2
33
50
  version: 0.7.2
34
- version:
51
+ type: :runtime
52
+ version_requirements: *id002
35
53
  description:
36
54
  email:
37
55
  executables:
@@ -46,6 +64,7 @@ files:
46
64
  - README.rdoc
47
65
  - bin/anemone
48
66
  - lib/anemone.rb
67
+ - lib/anemone/cookie_store.rb
49
68
  - lib/anemone/core.rb
50
69
  - lib/anemone/http.rb
51
70
  - lib/anemone/page.rb
@@ -60,6 +79,15 @@ files:
60
79
  - lib/anemone/cli/count.rb
61
80
  - lib/anemone/cli/pagedepth.rb
62
81
  - lib/anemone/cli/serialize.rb
82
+ - spec/anemone_spec.rb
83
+ - spec/cookie_store_spec.rb
84
+ - spec/core_spec.rb
85
+ - spec/page_spec.rb
86
+ - spec/page_store_spec.rb
87
+ - spec/http_spec.rb
88
+ - spec/storage_spec.rb
89
+ - spec/fakeweb_helper.rb
90
+ - spec/spec_helper.rb
63
91
  has_rdoc: true
64
92
  homepage: http://anemone.rubyforge.org
65
93
  licenses: []
@@ -73,26 +101,33 @@ rdoc_options:
73
101
  require_paths:
74
102
  - lib
75
103
  required_ruby_version: !ruby/object:Gem::Requirement
104
+ none: false
76
105
  requirements:
77
106
  - - ">="
78
107
  - !ruby/object:Gem::Version
108
+ hash: 3
109
+ segments:
110
+ - 0
79
111
  version: "0"
80
- version:
81
112
  required_rubygems_version: !ruby/object:Gem::Requirement
113
+ none: false
82
114
  requirements:
83
115
  - - ">="
84
116
  - !ruby/object:Gem::Version
117
+ hash: 3
118
+ segments:
119
+ - 0
85
120
  version: "0"
86
- version:
87
121
  requirements: []
88
122
 
89
123
  rubyforge_project: anemone
90
- rubygems_version: 1.3.5
124
+ rubygems_version: 1.3.7
91
125
  signing_key:
92
126
  specification_version: 3
93
127
  summary: Anemone web-spider framework
94
128
  test_files:
95
129
  - spec/anemone_spec.rb
130
+ - spec/cookie_store_spec.rb
96
131
  - spec/core_spec.rb
97
132
  - spec/page_spec.rb
98
133
  - spec/page_store_spec.rb