anemone 0.3.2 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/CHANGELOG.rdoc CHANGED
@@ -1,3 +1,9 @@
1
+ == 0.4.0 / 2010-04-08
2
+
3
+ * Major enhancements
4
+
5
+ * Cookies can be accepted and sent with each HTTP request.
6
+
1
7
  == 0.3.2 / 2010-02-04
2
8
 
3
9
  * Bug fixes
@@ -0,0 +1,35 @@
1
+ require 'delegate'
2
+ require 'webrick/cookie'
3
+
4
+ class WEBrick::Cookie
5
+ def expired?
6
+ !!expires && expires < Time.now
7
+ end
8
+ end
9
+
10
+ module Anemone
11
+ class CookieStore < DelegateClass(Hash)
12
+
13
+ def initialize(cookies = nil)
14
+ @cookies = {}
15
+ cookies.each { |name, value| @cookies[name] = WEBrick::Cookie.new(name, value) } if cookies
16
+ super(@cookies)
17
+ end
18
+
19
+ def merge!(set_cookie_str)
20
+ begin
21
+ cookie_hash = WEBrick::Cookie.parse_set_cookies(set_cookie_str).inject({}) do |hash, cookie|
22
+ hash[cookie.name] = cookie if !!cookie
23
+ hash
24
+ end
25
+ @cookies.merge! cookie_hash
26
+ rescue
27
+ end
28
+ end
29
+
30
+ def to_s
31
+ @cookies.values.reject { |cookie| cookie.expired? }.map { |cookie| "#{cookie.name}=#{cookie.value}" }.join(';')
32
+ end
33
+
34
+ end
35
+ end
data/lib/anemone/core.rb CHANGED
@@ -7,7 +7,7 @@ require 'anemone/storage'
7
7
 
8
8
  module Anemone
9
9
 
10
- VERSION = '0.3.2';
10
+ VERSION = '0.4.0';
11
11
 
12
12
  #
13
13
  # Convenience method to start a crawl
@@ -41,13 +41,17 @@ module Anemone
41
41
  # number of times HTTP redirects will be followed
42
42
  :redirect_limit => 5,
43
43
  # storage engine defaults to Hash in +process_options+ if none specified
44
- :storage => nil
44
+ :storage => nil,
45
+ # Hash of cookie name => value to send with HTTP requests
46
+ :cookies => nil,
47
+ # accept cookies from the server and send them back?
48
+ :accept_cookies => false
45
49
  }
46
50
 
47
51
  # Create setter methods for all options to be called from the crawl block
48
52
  DEFAULT_OPTS.keys.each do |key|
49
- define_method "#{key}=" do |*args|
50
- @opts[key.to_sym] = *args
53
+ define_method "#{key}=" do |value|
54
+ @opts[key.to_sym] = value
51
55
  end
52
56
  end
53
57
 
@@ -173,7 +177,7 @@ module Anemone
173
177
  end
174
178
  end
175
179
 
176
- @tentacles.each { |t| t.join }
180
+ @tentacles.each { |thread| thread.join }
177
181
  do_after_crawl_blocks
178
182
  self
179
183
  end
@@ -185,25 +189,37 @@ module Anemone
185
189
  @opts[:threads] = 1 if @opts[:delay] > 0
186
190
  @pages = PageStore.new(@opts[:storage] || Anemone::Storage.Hash)
187
191
  @robots = Robots.new(@opts[:user_agent]) if @opts[:obey_robots_txt]
192
+
193
+ freeze_options
194
+ end
195
+
196
+ #
197
+ # Freeze the opts Hash so that no options can be modified
198
+ # once the crawl begins
199
+ #
200
+ def freeze_options
201
+ @opts.freeze
202
+ @opts.each_key { |key| @opts[key].freeze }
203
+ @opts[:cookies].each_key { |key| @opts[:cookies][key].freeze } rescue nil
188
204
  end
189
205
 
190
206
  #
191
207
  # Execute the after_crawl blocks
192
208
  #
193
209
  def do_after_crawl_blocks
194
- @after_crawl_blocks.each { |b| b.call(@pages) }
210
+ @after_crawl_blocks.each { |block| block.call(@pages) }
195
211
  end
196
212
 
197
213
  #
198
214
  # Execute the on_every_page blocks for *page*
199
215
  #
200
216
  def do_page_blocks(page)
201
- @on_every_page_blocks.each do |blk|
202
- blk.call(page)
217
+ @on_every_page_blocks.each do |block|
218
+ block.call(page)
203
219
  end
204
220
 
205
- @on_pages_like_blocks.each do |pattern, blks|
206
- blks.each { |blk| blk.call(page) } if page.url.to_s =~ pattern
221
+ @on_pages_like_blocks.each do |pattern, blocks|
222
+ blocks.each { |block| block.call(page) } if page.url.to_s =~ pattern
207
223
  end
208
224
  end
209
225
 
@@ -241,7 +257,7 @@ module Anemone
241
257
  # its URL matches a skip_link pattern.
242
258
  #
243
259
  def skip_link?(link)
244
- @skip_link_patterns.any? { |p| link.path =~ p }
260
+ @skip_link_patterns.any? { |pattern| link.path =~ pattern }
245
261
  end
246
262
 
247
263
  end
data/lib/anemone/http.rb CHANGED
@@ -1,14 +1,19 @@
1
1
  require 'net/https'
2
2
  require 'anemone/page'
3
+ require 'anemone/cookie_store'
3
4
 
4
5
  module Anemone
5
6
  class HTTP
6
7
  # Maximum number of redirects to follow on each get_response
7
8
  REDIRECT_LIMIT = 5
8
9
 
10
+ # CookieStore for this HTTP client
11
+ attr_reader :cookie_store
12
+
9
13
  def initialize(opts = {})
10
14
  @connections = {}
11
15
  @opts = opts
16
+ @cookie_store = CookieStore.new(@opts[:cookies])
12
17
  end
13
18
 
14
19
  #
@@ -47,6 +52,28 @@ module Anemone
47
52
  end
48
53
  end
49
54
 
55
+ #
56
+ # The maximum number of redirects to follow
57
+ #
58
+ def redirect_limit
59
+ @opts[:redirect_limit] || REDIRECT_LIMIT
60
+ end
61
+
62
+ #
63
+ # The user-agent string which will be sent with each request,
64
+ # or nil if no such option is set
65
+ #
66
+ def user_agent
67
+ @opts[:user_agent]
68
+ end
69
+
70
+ #
71
+ # Does this HTTP client accept cookies from the server?
72
+ #
73
+ def accept_cookies?
74
+ @opts[:accept_cookies]
75
+ end
76
+
50
77
  private
51
78
 
52
79
  #
@@ -55,22 +82,19 @@ module Anemone
55
82
  # for each response.
56
83
  #
57
84
  def get(url, referer = nil)
58
- response, response_time = get_response(url, referer)
59
- code = Integer(response.code)
60
- loc = url
61
- redirect_to = response.is_a?(Net::HTTPRedirection) ? URI(response['location']) : nil
62
- yield response, code, loc, redirect_to, response_time
63
-
64
85
  limit = redirect_limit
65
- while redirect_to && allowed?(redirect_to, url) && limit > 0
66
- loc = redirect_to
86
+ loc = url
87
+ begin
88
+ # if redirected to a relative url, merge it with the host of the original
89
+ # request url
67
90
  loc = url.merge(loc) if loc.relative?
91
+
68
92
  response, response_time = get_response(loc, referer)
69
93
  code = Integer(response.code)
70
94
  redirect_to = response.is_a?(Net::HTTPRedirection) ? URI(response['location']) : nil
71
95
  yield response, code, loc, redirect_to, response_time
72
96
  limit -= 1
73
- end
97
+ end while (loc = redirect_to) && allowed?(redirect_to, url) && limit > 0
74
98
  end
75
99
 
76
100
  #
@@ -82,6 +106,7 @@ module Anemone
82
106
  opts = {}
83
107
  opts['User-Agent'] = user_agent if user_agent
84
108
  opts['Referer'] = referer.to_s if referer
109
+ opts['Cookie'] = @cookie_store.to_s unless @cookie_store.empty? || (!accept_cookies? && @opts[:cookies].nil?)
85
110
 
86
111
  retries = 0
87
112
  begin
@@ -89,6 +114,7 @@ module Anemone
89
114
  response = connection(url).get(full_path, opts)
90
115
  finish = Time.now()
91
116
  response_time = ((finish - start) * 1000).round
117
+ @cookie_store.merge!(response['Set-Cookie']) if accept_cookies?
92
118
  return response, response_time
93
119
  rescue EOFError
94
120
  refresh_connection(url)
@@ -116,14 +142,6 @@ module Anemone
116
142
  @connections[url.host][url.port] = http.start
117
143
  end
118
144
 
119
- def redirect_limit
120
- @opts[:redirect_limit] || REDIRECT_LIMIT
121
- end
122
-
123
- def user_agent
124
- @opts[:user_agent]
125
- end
126
-
127
145
  def verbose?
128
146
  @opts[:verbose]
129
147
  end
data/lib/anemone/page.rb CHANGED
@@ -1,5 +1,6 @@
1
1
  require 'nokogiri'
2
2
  require 'ostruct'
3
+ require 'webrick/cookie'
3
4
 
4
5
  module Anemone
5
6
  class Page
@@ -92,6 +93,13 @@ module Anemone
92
93
  @fetched
93
94
  end
94
95
 
96
+ #
97
+ # Array of cookies received with this page as WEBrick::Cookie objects.
98
+ #
99
+ def cookies
100
+ WEBrick::Cookie.parse_set_cookies(@headers['Set-Cookie']) rescue []
101
+ end
102
+
95
103
  #
96
104
  # The content-type returned by the HTTP request for this page
97
105
  #
@@ -0,0 +1,27 @@
1
+ require File.dirname(__FILE__) + '/spec_helper'
2
+
3
+ module Anemone
4
+ describe CookieStore do
5
+
6
+ it "should start out empty if no cookies are specified" do
7
+ CookieStore.new.empty?.should be true
8
+ end
9
+
10
+ it "should accept a Hash of cookies in the constructor" do
11
+ CookieStore.new({'test' => 'cookie'})['test'].value.should == 'cookie'
12
+ end
13
+
14
+ it "should be able to merge an HTTP cookie string" do
15
+ cs = CookieStore.new({'a' => 'a', 'b' => 'b'})
16
+ cs.merge! "a=A; path=/, c=C; path=/"
17
+ cs['a'].value.should == 'A'
18
+ cs['b'].value.should == 'b'
19
+ cs['c'].value.should == 'C'
20
+ end
21
+
22
+ it "should have a to_s method to turn the cookies into a string for the HTTP Cookie header" do
23
+ CookieStore.new({'a' => 'a', 'b' => 'b'}).to_s.should == 'a=a;b=b'
24
+ end
25
+
26
+ end
27
+ end
data/spec/core_spec.rb CHANGED
@@ -154,6 +154,24 @@ module Anemone
154
154
  urls.should_not include(pages[1].url)
155
155
  end
156
156
 
157
+ it "should be able to set cookies to send with HTTP requests" do
158
+ cookies = {:a => '1', :b => '2'}
159
+ core = Anemone.crawl(FakePage.new('0').url) do |anemone|
160
+ anemone.cookies = cookies
161
+ end
162
+ core.opts[:cookies].should == cookies
163
+ end
164
+
165
+ it "should freeze the options once the crawl begins" do
166
+ core = Anemone.crawl(FakePage.new('0').url) do |anemone|
167
+ anemone.threads = 4
168
+ anemone.on_every_page do
169
+ lambda {anemone.threads = 2}.should raise_error
170
+ end
171
+ end
172
+ core.opts[:threads].should == 4
173
+ end
174
+
157
175
  describe "many pages" do
158
176
  before(:each) do
159
177
  @pages, size = [], 5
data/spec/http_spec.rb CHANGED
@@ -15,9 +15,10 @@ module Anemone
15
15
  end
16
16
  end
17
17
 
18
- http = HTTP.new
18
+ http = Anemone::HTTP.new
19
19
  http.fetch_page(SPEC_DOMAIN).should be_an_instance_of(Page)
20
20
  end
21
+
21
22
  end
22
23
  end
23
- end
24
+ end
data/spec/page_spec.rb CHANGED
@@ -68,5 +68,10 @@ module Anemone
68
68
  @page.should respond_to(:response_time)
69
69
  end
70
70
 
71
+ it "should have the cookies received with the page" do
72
+ @page.should respond_to(:cookies)
73
+ @page.cookies.should == []
74
+ end
75
+
71
76
  end
72
77
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: anemone
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.2
4
+ version: 0.4.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Chris Kite
@@ -9,7 +9,7 @@ autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
11
 
12
- date: 2010-02-04 00:00:00 -06:00
12
+ date: 2010-04-08 00:00:00 -05:00
13
13
  default_executable:
14
14
  dependencies:
15
15
  - !ruby/object:Gem::Dependency
@@ -46,6 +46,7 @@ files:
46
46
  - README.rdoc
47
47
  - bin/anemone
48
48
  - lib/anemone.rb
49
+ - lib/anemone/cookie_store.rb
49
50
  - lib/anemone/core.rb
50
51
  - lib/anemone/http.rb
51
52
  - lib/anemone/page.rb
@@ -93,6 +94,7 @@ specification_version: 3
93
94
  summary: Anemone web-spider framework
94
95
  test_files:
95
96
  - spec/anemone_spec.rb
97
+ - spec/cookie_store_spec.rb
96
98
  - spec/core_spec.rb
97
99
  - spec/page_spec.rb
98
100
  - spec/page_store_spec.rb