anemone 0.3.2 → 0.4.0

Sign up to get free protection for your applications and to get access to all the features.
data/CHANGELOG.rdoc CHANGED
@@ -1,3 +1,9 @@
1
+ == 0.4.0 / 2010-04-08
2
+
3
+ * Major enhancements
4
+
5
+ * Cookies can be accepted and sent with each HTTP request.
6
+
1
7
  == 0.3.2 / 2010-02-04
2
8
 
3
9
  * Bug fixes
@@ -0,0 +1,35 @@
1
+ require 'delegate'
2
+ require 'webrick/cookie'
3
+
4
+ class WEBrick::Cookie
5
+ def expired?
6
+ !!expires && expires < Time.now
7
+ end
8
+ end
9
+
10
+ module Anemone
11
+ class CookieStore < DelegateClass(Hash)
12
+
13
+ def initialize(cookies = nil)
14
+ @cookies = {}
15
+ cookies.each { |name, value| @cookies[name] = WEBrick::Cookie.new(name, value) } if cookies
16
+ super(@cookies)
17
+ end
18
+
19
+ def merge!(set_cookie_str)
20
+ begin
21
+ cookie_hash = WEBrick::Cookie.parse_set_cookies(set_cookie_str).inject({}) do |hash, cookie|
22
+ hash[cookie.name] = cookie if !!cookie
23
+ hash
24
+ end
25
+ @cookies.merge! cookie_hash
26
+ rescue
27
+ end
28
+ end
29
+
30
+ def to_s
31
+ @cookies.values.reject { |cookie| cookie.expired? }.map { |cookie| "#{cookie.name}=#{cookie.value}" }.join(';')
32
+ end
33
+
34
+ end
35
+ end
data/lib/anemone/core.rb CHANGED
@@ -7,7 +7,7 @@ require 'anemone/storage'
7
7
 
8
8
  module Anemone
9
9
 
10
- VERSION = '0.3.2';
10
+ VERSION = '0.4.0';
11
11
 
12
12
  #
13
13
  # Convenience method to start a crawl
@@ -41,13 +41,17 @@ module Anemone
41
41
  # number of times HTTP redirects will be followed
42
42
  :redirect_limit => 5,
43
43
  # storage engine defaults to Hash in +process_options+ if none specified
44
- :storage => nil
44
+ :storage => nil,
45
+ # Hash of cookie name => value to send with HTTP requests
46
+ :cookies => nil,
47
+ # accept cookies from the server and send them back?
48
+ :accept_cookies => false
45
49
  }
46
50
 
47
51
  # Create setter methods for all options to be called from the crawl block
48
52
  DEFAULT_OPTS.keys.each do |key|
49
- define_method "#{key}=" do |*args|
50
- @opts[key.to_sym] = *args
53
+ define_method "#{key}=" do |value|
54
+ @opts[key.to_sym] = value
51
55
  end
52
56
  end
53
57
 
@@ -173,7 +177,7 @@ module Anemone
173
177
  end
174
178
  end
175
179
 
176
- @tentacles.each { |t| t.join }
180
+ @tentacles.each { |thread| thread.join }
177
181
  do_after_crawl_blocks
178
182
  self
179
183
  end
@@ -185,25 +189,37 @@ module Anemone
185
189
  @opts[:threads] = 1 if @opts[:delay] > 0
186
190
  @pages = PageStore.new(@opts[:storage] || Anemone::Storage.Hash)
187
191
  @robots = Robots.new(@opts[:user_agent]) if @opts[:obey_robots_txt]
192
+
193
+ freeze_options
194
+ end
195
+
196
+ #
197
+ # Freeze the opts Hash so that no options can be modified
198
+ # once the crawl begins
199
+ #
200
+ def freeze_options
201
+ @opts.freeze
202
+ @opts.each_key { |key| @opts[key].freeze }
203
+ @opts[:cookies].each_key { |key| @opts[:cookies][key].freeze } rescue nil
188
204
  end
189
205
 
190
206
  #
191
207
  # Execute the after_crawl blocks
192
208
  #
193
209
  def do_after_crawl_blocks
194
- @after_crawl_blocks.each { |b| b.call(@pages) }
210
+ @after_crawl_blocks.each { |block| block.call(@pages) }
195
211
  end
196
212
 
197
213
  #
198
214
  # Execute the on_every_page blocks for *page*
199
215
  #
200
216
  def do_page_blocks(page)
201
- @on_every_page_blocks.each do |blk|
202
- blk.call(page)
217
+ @on_every_page_blocks.each do |block|
218
+ block.call(page)
203
219
  end
204
220
 
205
- @on_pages_like_blocks.each do |pattern, blks|
206
- blks.each { |blk| blk.call(page) } if page.url.to_s =~ pattern
221
+ @on_pages_like_blocks.each do |pattern, blocks|
222
+ blocks.each { |block| block.call(page) } if page.url.to_s =~ pattern
207
223
  end
208
224
  end
209
225
 
@@ -241,7 +257,7 @@ module Anemone
241
257
  # its URL matches a skip_link pattern.
242
258
  #
243
259
  def skip_link?(link)
244
- @skip_link_patterns.any? { |p| link.path =~ p }
260
+ @skip_link_patterns.any? { |pattern| link.path =~ pattern }
245
261
  end
246
262
 
247
263
  end
data/lib/anemone/http.rb CHANGED
@@ -1,14 +1,19 @@
1
1
  require 'net/https'
2
2
  require 'anemone/page'
3
+ require 'anemone/cookie_store'
3
4
 
4
5
  module Anemone
5
6
  class HTTP
6
7
  # Maximum number of redirects to follow on each get_response
7
8
  REDIRECT_LIMIT = 5
8
9
 
10
+ # CookieStore for this HTTP client
11
+ attr_reader :cookie_store
12
+
9
13
  def initialize(opts = {})
10
14
  @connections = {}
11
15
  @opts = opts
16
+ @cookie_store = CookieStore.new(@opts[:cookies])
12
17
  end
13
18
 
14
19
  #
@@ -47,6 +52,28 @@ module Anemone
47
52
  end
48
53
  end
49
54
 
55
+ #
56
+ # The maximum number of redirects to follow
57
+ #
58
+ def redirect_limit
59
+ @opts[:redirect_limit] || REDIRECT_LIMIT
60
+ end
61
+
62
+ #
63
+ # The user-agent string which will be sent with each request,
64
+ # or nil if no such option is set
65
+ #
66
+ def user_agent
67
+ @opts[:user_agent]
68
+ end
69
+
70
+ #
71
+ # Does this HTTP client accept cookies from the server?
72
+ #
73
+ def accept_cookies?
74
+ @opts[:accept_cookies]
75
+ end
76
+
50
77
  private
51
78
 
52
79
  #
@@ -55,22 +82,19 @@ module Anemone
55
82
  # for each response.
56
83
  #
57
84
  def get(url, referer = nil)
58
- response, response_time = get_response(url, referer)
59
- code = Integer(response.code)
60
- loc = url
61
- redirect_to = response.is_a?(Net::HTTPRedirection) ? URI(response['location']) : nil
62
- yield response, code, loc, redirect_to, response_time
63
-
64
85
  limit = redirect_limit
65
- while redirect_to && allowed?(redirect_to, url) && limit > 0
66
- loc = redirect_to
86
+ loc = url
87
+ begin
88
+ # if redirected to a relative url, merge it with the host of the original
89
+ # request url
67
90
  loc = url.merge(loc) if loc.relative?
91
+
68
92
  response, response_time = get_response(loc, referer)
69
93
  code = Integer(response.code)
70
94
  redirect_to = response.is_a?(Net::HTTPRedirection) ? URI(response['location']) : nil
71
95
  yield response, code, loc, redirect_to, response_time
72
96
  limit -= 1
73
- end
97
+ end while (loc = redirect_to) && allowed?(redirect_to, url) && limit > 0
74
98
  end
75
99
 
76
100
  #
@@ -82,6 +106,7 @@ module Anemone
82
106
  opts = {}
83
107
  opts['User-Agent'] = user_agent if user_agent
84
108
  opts['Referer'] = referer.to_s if referer
109
+ opts['Cookie'] = @cookie_store.to_s unless @cookie_store.empty? || (!accept_cookies? && @opts[:cookies].nil?)
85
110
 
86
111
  retries = 0
87
112
  begin
@@ -89,6 +114,7 @@ module Anemone
89
114
  response = connection(url).get(full_path, opts)
90
115
  finish = Time.now()
91
116
  response_time = ((finish - start) * 1000).round
117
+ @cookie_store.merge!(response['Set-Cookie']) if accept_cookies?
92
118
  return response, response_time
93
119
  rescue EOFError
94
120
  refresh_connection(url)
@@ -116,14 +142,6 @@ module Anemone
116
142
  @connections[url.host][url.port] = http.start
117
143
  end
118
144
 
119
- def redirect_limit
120
- @opts[:redirect_limit] || REDIRECT_LIMIT
121
- end
122
-
123
- def user_agent
124
- @opts[:user_agent]
125
- end
126
-
127
145
  def verbose?
128
146
  @opts[:verbose]
129
147
  end
data/lib/anemone/page.rb CHANGED
@@ -1,5 +1,6 @@
1
1
  require 'nokogiri'
2
2
  require 'ostruct'
3
+ require 'webrick/cookie'
3
4
 
4
5
  module Anemone
5
6
  class Page
@@ -92,6 +93,13 @@ module Anemone
92
93
  @fetched
93
94
  end
94
95
 
96
+ #
97
+ # Array of cookies received with this page as WEBrick::Cookie objects.
98
+ #
99
+ def cookies
100
+ WEBrick::Cookie.parse_set_cookies(@headers['Set-Cookie']) rescue []
101
+ end
102
+
95
103
  #
96
104
  # The content-type returned by the HTTP request for this page
97
105
  #
@@ -0,0 +1,27 @@
1
+ require File.dirname(__FILE__) + '/spec_helper'
2
+
3
+ module Anemone
4
+ describe CookieStore do
5
+
6
+ it "should start out empty if no cookies are specified" do
7
+ CookieStore.new.empty?.should be true
8
+ end
9
+
10
+ it "should accept a Hash of cookies in the constructor" do
11
+ CookieStore.new({'test' => 'cookie'})['test'].value.should == 'cookie'
12
+ end
13
+
14
+ it "should be able to merge an HTTP cookie string" do
15
+ cs = CookieStore.new({'a' => 'a', 'b' => 'b'})
16
+ cs.merge! "a=A; path=/, c=C; path=/"
17
+ cs['a'].value.should == 'A'
18
+ cs['b'].value.should == 'b'
19
+ cs['c'].value.should == 'C'
20
+ end
21
+
22
+ it "should have a to_s method to turn the cookies into a string for the HTTP Cookie header" do
23
+ CookieStore.new({'a' => 'a', 'b' => 'b'}).to_s.should == 'a=a;b=b'
24
+ end
25
+
26
+ end
27
+ end
data/spec/core_spec.rb CHANGED
@@ -154,6 +154,24 @@ module Anemone
154
154
  urls.should_not include(pages[1].url)
155
155
  end
156
156
 
157
+ it "should be able to set cookies to send with HTTP requests" do
158
+ cookies = {:a => '1', :b => '2'}
159
+ core = Anemone.crawl(FakePage.new('0').url) do |anemone|
160
+ anemone.cookies = cookies
161
+ end
162
+ core.opts[:cookies].should == cookies
163
+ end
164
+
165
+ it "should freeze the options once the crawl begins" do
166
+ core = Anemone.crawl(FakePage.new('0').url) do |anemone|
167
+ anemone.threads = 4
168
+ anemone.on_every_page do
169
+ lambda {anemone.threads = 2}.should raise_error
170
+ end
171
+ end
172
+ core.opts[:threads].should == 4
173
+ end
174
+
157
175
  describe "many pages" do
158
176
  before(:each) do
159
177
  @pages, size = [], 5
data/spec/http_spec.rb CHANGED
@@ -15,9 +15,10 @@ module Anemone
15
15
  end
16
16
  end
17
17
 
18
- http = HTTP.new
18
+ http = Anemone::HTTP.new
19
19
  http.fetch_page(SPEC_DOMAIN).should be_an_instance_of(Page)
20
20
  end
21
+
21
22
  end
22
23
  end
23
- end
24
+ end
data/spec/page_spec.rb CHANGED
@@ -68,5 +68,10 @@ module Anemone
68
68
  @page.should respond_to(:response_time)
69
69
  end
70
70
 
71
+ it "should have the cookies received with the page" do
72
+ @page.should respond_to(:cookies)
73
+ @page.cookies.should == []
74
+ end
75
+
71
76
  end
72
77
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: anemone
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.2
4
+ version: 0.4.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Chris Kite
@@ -9,7 +9,7 @@ autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
11
 
12
- date: 2010-02-04 00:00:00 -06:00
12
+ date: 2010-04-08 00:00:00 -05:00
13
13
  default_executable:
14
14
  dependencies:
15
15
  - !ruby/object:Gem::Dependency
@@ -46,6 +46,7 @@ files:
46
46
  - README.rdoc
47
47
  - bin/anemone
48
48
  - lib/anemone.rb
49
+ - lib/anemone/cookie_store.rb
49
50
  - lib/anemone/core.rb
50
51
  - lib/anemone/http.rb
51
52
  - lib/anemone/page.rb
@@ -93,6 +94,7 @@ specification_version: 3
93
94
  summary: Anemone web-spider framework
94
95
  test_files:
95
96
  - spec/anemone_spec.rb
97
+ - spec/cookie_store_spec.rb
96
98
  - spec/core_spec.rb
97
99
  - spec/page_spec.rb
98
100
  - spec/page_store_spec.rb