scrapey 0.0.5 → 0.0.7

Sign up to get free protection for your applications and to get access to all the features.
data/README.md CHANGED
@@ -17,12 +17,13 @@ A simple framework for solving common scraping problems
17
17
  ## Examples
18
18
 
19
19
  ### CSV
20
+ By default scrapey will save as 'output.csv'
21
+ You can change this with:
22
+
23
+ @output = 'mycsv.csv'
20
24
 
21
25
  ```ruby
22
26
  require 'scrapey'
23
- # By default scrapey will save as 'output.csv'
24
- # You can change this with:
25
- # @output = 'mycsv.csv'
26
27
 
27
28
  page = get 'http://www.alexa.com/topsites'
28
29
  page.search('li.site-listing').each do |li|
@@ -31,10 +32,9 @@ end
31
32
  ```
32
33
 
33
34
  ### Database
35
+ if you created a scrapey project you can fill out the database connection information in config/config.yml
34
36
  ```ruby
35
37
  require 'scrapey'
36
- # if you created a scrapey project you can fill out the database connection
37
- # information in config/config.yml
38
38
 
39
39
  tables 'Movie', 'Actor' # create ActiveRecord models
40
40
 
@@ -90,14 +90,13 @@ get 'some_throttled_website_url'
90
90
  Scrapey will ensure that the callbacks are threadsafe
91
91
  ```ruby
92
92
  require 'scrapey'
93
- require 'scrapey/multi'
94
93
 
95
94
  fields 'url', 'title'
96
95
 
97
- def scrape url, response, header
96
+ def scrape url, response
98
97
  doc = Nokogiri::HTML response
99
98
  save({'url' => url, 'title' => doc.at('title').text})
100
99
  end
101
100
 
102
- multi_get ['http://www.yahoo.com/', 'http://www.google.com.', 'http://www.bing.com/'], :threads => 3, :callback => :scrape
101
+ multi_get ['http://www.yahoo.com/', 'http://www.google.com.', 'http://www.bing.com/'], :threads => 3, :on_success => :scrape
103
102
  ```
data/examples/multi.rb CHANGED
@@ -1,11 +1,10 @@
1
1
  require 'scrapey'
2
- require 'scrapey/multi'
3
2
 
4
3
  fields 'url', 'title'
5
4
 
6
- def scrape url, response, header
5
+ def scrape url, response
7
6
  doc = Nokogiri::HTML response
8
- save({'url' => url, 'title' => doc.at('title').text})
7
+ save({'url' => url, 'title' => doc.title})
9
8
  end
10
9
 
11
- multi_get ['http://www.yahoo.com/', 'http://www.google.com.', 'http://www.bing.com/'], :threads => 3, :callback => :scrape
10
+ multi_get ['http://www.yahoo.com/', 'http://www.google.com.', 'http://www.bing.com/'], :threads => 3, :on_success => :scrape
data/examples/multi2.rb CHANGED
@@ -1,25 +1,24 @@
1
1
  require 'scrapey'
2
- require 'scrapey/multi'
3
2
 
4
3
  fields 'url', 'title'
5
4
 
6
5
  def scrape url, response, header
7
6
  doc = Nokogiri::HTML response
8
- save({'url' => url, 'title' => doc.at('title').text})
7
+ save({'url' => url, 'title' => doc.title})
9
8
  puts "scraped #{url}."
10
9
  end
11
10
 
12
11
  options = {
13
12
  :threads => 3,
14
- :callback => :scrape,
15
- :proxy => {:host => 'localhost', :port => 8888},
16
- :head => {
13
+ :on_success => :scrape,
14
+ :proxy => 'http://localhost:8888',
15
+ :headers => {
17
16
  "Accept" => "*/*",
18
- #"User-Agent" => "Scrapey #{Scrapey::VERSION}",
19
- "Keep-alive" => "true"
17
+ "Keep-alive" => "true",
18
+ "Cookie" => "foo=bar"
20
19
  }
21
20
  }
22
21
 
23
- multi_get ['http://www.yahoo.com/', 'http://www.google.com/', 'http://www.bing.com/'], options
22
+ multi_get ["https://twitter.com/", 'http://www.yahoo.com/', 'http://www.google.com.', 'http://www.bing.com/'], options
24
23
 
25
24
  puts "this happens after all callbacks."
@@ -0,0 +1,2 @@
1
+ url,title
2
+ https://twitter.com/,Twitter
@@ -1,15 +1,14 @@
1
1
  require 'scrapey'
2
2
  require 'scrapey/multi'
3
- require 'pry'
4
3
 
5
4
  fields 'url', 'status'
6
5
 
7
- def on_success url, response, header
8
- save({'url' => url, 'status' => header.status})
6
+ def on_success url, response
7
+ save({'url' => url, 'status' => response.status_code})
9
8
  end
10
9
 
11
10
  def on_error url, e
12
- save({'url' => url, 'status' => e})
11
+ save({'url' => url, 'status' => e.message})
13
12
  end
14
13
 
15
14
  multi_head ['http://locahlost2/foo', 'http://www.google.com/', 'http://www.bing.com/', 'http://www.bing.com/404.html']
data/lib/scrapey.rb CHANGED
@@ -7,6 +7,7 @@ require "scrapey/scrapey"
7
7
  require "scrapey/constants"
8
8
  require "scrapey/cache"
9
9
  require "scrapey/database"
10
+ require "scrapey/multi"
10
11
 
11
12
  include Scrapey
12
13
 
@@ -12,10 +12,14 @@ module Scrapey
12
12
  filename = cache_filename url
13
13
  return nil unless File::exists?(filename)
14
14
  debug "Loading #{filename} from cache"
15
- Nokogiri::HTML Marshal.load(File.read(filename))
15
+ begin
16
+ Nokogiri::HTML Marshal.load(File.read(filename))
17
+ rescue Exception => e
18
+ puts e.message
19
+ end
16
20
  end
17
21
 
18
22
  def save_cache url, doc, options = {}
19
- File.open(cache_filename(url), "w") {|f| f << Marshal.dump(doc) }
23
+ File.open(cache_filename(url), "wb") {|f| f << Marshal.dump(doc) }
20
24
  end
21
25
  end
@@ -9,9 +9,7 @@ module Scrapey
9
9
  def load_cache url
10
10
  debug "Loading #{url} from cache"
11
11
  return nil unless str = @redis.get(url)
12
- debug "found it"
13
- #binding.pry
14
- Nokogiri::HTML Marshal.load(str)
12
+ Nokogiri::HTML Marshal.load(str) rescue nil
15
13
  end
16
14
 
17
15
  def save_cache url, body, options = {}
@@ -1,6 +1,5 @@
1
1
  module Scrapey
2
- VERSION = "0.0.5"
2
+ VERSION = "0.0.7"
3
3
  BASEDIR = File.expand_path(File.dirname($0)).gsub(/\/src$/,'')
4
4
  URL = "https://github.com/monkeysuffrage/scrapey"
5
- #ENV['SSL_FILE'] = "#{Gem.dir}/gems/scrapey-#{Scrapey::VERSION}/ssl/cacert.pem"
6
5
  end
data/lib/scrapey/multi.rb CHANGED
@@ -1,47 +1,42 @@
1
- require 'em-http-request'
1
+ require 'httpclient'
2
2
 
3
3
  module Scrapey
4
4
  def multi_get_or_post method, all_urls, options = {}
5
- head = options.delete(:head) || {}
6
- request_options = {:redirects => 10, :head => {"User-Agent" => "Scrapey v#{Scrapey::VERSION} - #{Scrapey::URL}"}.merge(head)}
7
- threads = options[:threads] || 20
8
- on_success = options[:on_success] || :on_success
9
- on_error = options[:on_error] || :on_error
10
5
  all_urls.reject!{|url| is_cached? url} if @use_cache
11
- @lock = Mutex.new
6
+ return unless all_urls.size > 0
7
+
8
+ threads = options[:threads] || [10, all_urls.size].min
9
+ on_success = options[:on_success] || :on_success
10
+ on_error = options[:on_error] || :on_error
11
+ user_agent = options[:user_agent] || "Scrapey v#{Scrapey::VERSION} - #{Scrapey::URL}"
12
+ proxy = options[:proxy] || nil
13
+
14
+ @lock ||= Mutex.new
15
+ @http_clients ||= threads.times.map{HTTPClient.new(options[:proxies] ? options[:proxies].rotate!.first : proxy, user_agent).tap{|c| c.ssl_config.verify_mode, c.receive_timeout = OpenSSL::SSL::VERIFY_NONE, 10000}}
16
+ debug 'starting multi'
17
+
12
18
  all_urls.each_slice(threads) do |urls|
13
- next unless urls.size > 0
14
- EventMachine.run do
15
- multi = EventMachine::MultiRequest.new
16
- urls.each_with_index do |url, i|
17
- multi.add i, EventMachine::HttpRequest.new(url, options).send(method, request_options)
18
- end
19
- multi.callback do
20
- (0...multi.requests.length).each do |i|
21
- if multi.responses[:callback][i]
22
- @lock.synchronize do
23
- if defined? on_success
24
- send on_success, urls[i], multi.responses[:callback][i].response, multi.responses[:callback][i].response_header
25
- else
26
- raise "#{on_success} not defined!"
27
- end
28
- end
19
+ urls.each_with_index.map do |url, i|
20
+ Thread.new do
21
+ begin
22
+ response = @http_clients[i].send method, url, options[:query], options[:headers]
23
+ rescue Exception => e
24
+ error = e
25
+ end
26
+ @lock.synchronize do
27
+ if response
28
+ send on_success, url, response
29
29
  else
30
- if defined? on_error
31
- send on_error, urls[i], multi.requests[i].error
32
- else
33
- raise "#{on_error} not defined!"
34
- end
30
+ send on_error, url, e
35
31
  end
36
32
  end
37
- EventMachine.stop
38
33
  end
39
- end
34
+ end.each{|thread| thread.join}
40
35
  end
41
36
  end
42
37
 
43
- def multi_get *args; multi_get_or_post 'get', *args; end
44
- def multi_post *args; multi_get_or_post 'post', *args; end
38
+ def multi_get *args; multi_get_or_post 'get_content', *args; end
39
+ def multi_post *args; multi_get_or_post 'post_content', *args; end
45
40
  def multi_head *args; multi_get_or_post 'head', *args; end
46
41
 
47
42
  end
@@ -52,7 +52,9 @@ module Scrapey
52
52
  end
53
53
  case
54
54
  when item.is_a?(Array) then @csv << item
55
- when item.is_a?(Hash) then @csv << @fields.map{|f| item[f]}
55
+ when item.is_a?(Hash)
56
+ raise 'No fields defined!' unless @fields
57
+ @csv << @fields.map{|f| item[f]}
56
58
  else raise "unsupported type: #{item.class}"
57
59
  end
58
60
  end
@@ -12,7 +12,7 @@ module Scrapey
12
12
  FileUtils.mv fn, fn.gsub('template', name)
13
13
  end
14
14
  buf = File.read "#{name}.iss"
15
- buf.gsub! /Template/, "rightmove_rentals".tr('_', ' ').gsub(/\w+/){|x| x.capitalize}
15
+ buf.gsub! /Template/, name.tr('_', ' ').gsub(/\w+/){|x| x.capitalize}
16
16
  buf.gsub! /template/, name
17
17
  File.open("#{name}.iss", 'w'){|f| f << buf}
18
18
 
data/output.csv CHANGED
@@ -1,3 +1,5 @@
1
1
  url,status
2
2
  http://www.bing.com/,200
3
3
  http://www.bing.com/404.html,404
4
+ http://locahlost2/foo,getaddrinfo: No such host is known. (http://locahlost2:80)
5
+ http://www.google.com/,302
data/template/Rakefile CHANGED
@@ -13,10 +13,11 @@ end
13
13
 
14
14
  desc "Copy installer to dropbox folder"
15
15
  task 'dropbox' do
16
+ file = 'setup.exe'
16
17
  raise 'no dropbox folder!' unless ENV['DROPBOX']
17
18
  folder = [ENV['DROPBOX'], name].join('/').squeeze('/')
18
19
  FileUtils.mkdir(folder) unless File.exists?(folder)
19
- FileUtils.cp "Output/setup.exe", folder
20
- url = [ENV['DROPBOX_public_url'], name, 'setup.exe'].join('/').squeeze('/')
20
+ FileUtils.cp "Output/#{file}", folder
21
+ url = [ENV['DROPBOX_public_url'], name, file].join('/').squeeze('/')
21
22
  puts "uploaded to #{url}"
22
23
  end
@@ -0,0 +1,2 @@
1
+ name,address,zip
2
+ ,,
@@ -1,6 +1,30 @@
1
1
  require 'scrapey'
2
+ require 'pry'
3
+
4
+ # some skeleton code that I like to start with
2
5
  # require 'scrapey/multi' #=> requires em-http-request
3
6
 
4
7
  # sample customizations...
5
8
  # @agent.user_agent = 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.56 Safari/536.5'
6
9
  # @output = Time.now.strftime("#{BASEDIR}/Output/output_%Y_%m_%d_%H_%M_%S.csv")
10
+
11
+ def scrape div
12
+ a = div.at('a')
13
+ url = URI.join(@url, a[:href]).to_s
14
+ return unless visited? url
15
+ item = {}
16
+
17
+ save item
18
+ exit if defined? Ocra
19
+ rescue StandardError => e
20
+ binding.pry
21
+ end
22
+
23
+ #use_cache :redis
24
+
25
+ fields 'name', 'address', 'zip'
26
+
27
+ @url = "http://www.example.com/"
28
+
29
+ page = get @url
30
+ scrape page.at('div')
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: scrapey
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.5
4
+ version: 0.0.7
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2012-08-15 00:00:00.000000000 Z
12
+ date: 2012-08-27 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: mechanize
@@ -59,6 +59,7 @@ files:
59
59
  - examples/imdb.rb
60
60
  - examples/multi.rb
61
61
  - examples/multi2.rb
62
+ - examples/output.csv
62
63
  - examples/redis.rb
63
64
  - examples/status_check.rb
64
65
  - lib/scrapey/cache/disk.rb
@@ -71,11 +72,11 @@ files:
71
72
  - lib/scrapey/template.rb
72
73
  - lib/scrapey.rb
73
74
  - output.csv
74
- - ponsesq
75
75
  - scrapey.gemspec
76
76
  - template/config/config.yml
77
77
  - template/Gemfile
78
78
  - template/icon.ico
79
+ - template/output.csv
79
80
  - template/Rakefile
80
81
  - template/src/schema.rb
81
82
  - template/src/template.rb
data/ponsesq DELETED
@@ -1,593 +0,0 @@
1
- => #<EventMachine::MultiRequest:0x2237178
2
- @callbacks=[],
3
- @deferred_args=[#<EventMachine::MultiRequest:0x2237178 ...>],
4
- @deferred_status=:succeeded,
5
- @deferred_timeout=nil,
6
- @errbacks=nil,
7
- @requests=
8
- {0=>
9
- #<EventMachine::HttpClient:0x21adc28
10
- @callbacks=[],
11
- @conn=
12
- #<EventMachine::HttpConnection:0x21f3f30
13
- @connopts=
14
- #<HttpConnectionOptions:0x2231100
15
- @connect_timeout=5,
16
- @host="locahlost2",
17
- @inactivity_timeout=10,
18
- @port=80,
19
- @proxy=nil,
20
- @tls={}>,
21
- @deferred=true,
22
- @middleware=[],
23
- @uri="http://locahlost2/foo">,
24
- @content_charset=nil,
25
- @content_decoder=nil,
26
- @cookiejar=
27
- #<EventMachine::HttpClient::CookieJar:0x21adc70
28
- @jar=#<CookieJar::Jar:0x21adcb8 @domains={}>>,
29
- @cookies=[],
30
- @deferred_args=[#<EventMachine::HttpClient:0x21adc28 ...>],
31
- @deferred_status=:failed,
32
- @deferred_timeout=nil,
33
- @errbacks=[],
34
- @error="unable to resolve server address",
35
- @headers=nil,
36
- @req=
37
- #<HttpClientOptions:0x21f42f0
38
- @body=nil,
39
- @decoding=true,
40
- @file=nil,
41
- @followed=0,
42
- @headers=
43
- {"User-Agent"=>
44
- "Scrapey v0.0.5 - https://github.com/monkeysuffrage/scrapey"},
45
- @host="locahlost2",
46
- @keepalive=false,
47
- @method="HEAD",
48
- @pass_cookies=true,
49
- @path=nil,
50
- @port=80,
51
- @query=nil,
52
- @redirects=10,
53
- @uri=#<Addressable::URI:0x10e696c URI:http://locahlost2:80/foo>>,
54
- @response="",
55
- @response_header={},
56
- @state=:response_header,
57
- @stream=nil>,
58
- 1=>
59
- #<EventMachine::HttpClient:0x20ed610
60
- @callbacks=[],
61
- @conn=
62
- #<EventMachine::HttpConnection:0x2136d08
63
- @clients=[],
64
- @conn=
65
- #<EventMachine::HttpStubConnection:0x20c2e60
66
- @callbacks=[],
67
- @deferred_args=[],
68
- @deferred_status=:succeeded,
69
- @deferred_timeout=nil,
70
- @errbacks=nil,
71
- @parent=#<EventMachine::HttpConnection:0x2136d08 ...>,
72
- @signature=5>,
73
- @connopts=
74
- #<HttpConnectionOptions:0x215c7d8
75
- @connect_timeout=5,
76
- @host="www.google.com.",
77
- @inactivity_timeout=10,
78
- @port=80,
79
- @proxy=nil,
80
- @tls={}>,
81
- @deferred=false,
82
- @middleware=[],
83
- @p=#<HTTP::Parser:0x20da578>,
84
- @peer="\x02\x00\x00PJ}G^\x00\x00\x00\x00\x00\x00\x00\x00",
85
- @pending=[],
86
- @uri="http://www.google.com.">,
87
- @content_charset=nil,
88
- @content_decoder=nil,
89
- @cookiejar=
90
- #<EventMachine::HttpClient::CookieJar:0x20ed670
91
- @jar=
92
- #<CookieJar::Jar:0x20e5198
93
- @domains=
94
- {".google.com.ph"=>
95
- {"/"=>
96
- {"PREF"=>
97
- PREF=ID=ec6b270e7fe890fa:FF=0:TM=1344992862:LM=1344992862:S=WlLM9juILblGoi5k,
98
- "NID"=>
99
- NID=62=5O_DMdySUEeIJXvveuCr1U8UEfYEhurh0X2Is-a5f0xhTw5CxEY9gELcAyCmwqt4MxVLDpvT2anCV79hhXHfd-QPi0zRY8bCiqh7BlH1B3w0wfE3eg0PTR_KbXUJBBFm}}}>>,
100
- @cookies=[],
101
- @deferred_args=[#<EventMachine::HttpClient:0x20ed610 ...>],
102
- @deferred_status=:succeeded,
103
- @deferred_timeout=nil,
104
- @errbacks=
105
- [#<Proc:0x20297e8@C:/Ruby193/lib/ruby/gems/1.9.1/gems/em-http-request-1.0.2/lib/em-http/multi.rb:42>],
106
- @error=nil,
107
- @headers=nil,
108
- @req=
109
- #<HttpClientOptions:0x2129ec8
110
- @body=nil,
111
- @decoding=true,
112
- @file=nil,
113
- @followed=1,
114
- @headers=
115
- {"User-Agent"=>
116
- "Scrapey v0.0.5 - https://github.com/monkeysuffrage/scrapey"},
117
- @host="www.google.com.ph",
118
- @keepalive=false,
119
- @method="HEAD",
120
- @pass_cookies=true,
121
- @path=nil,
122
- @port=80,
123
- @query=nil,
124
- @redirects=10,
125
- @uri=#<Addressable::URI:0xe683ec URI:http://www.google.com.ph:80/>>,
126
- @response="",
127
- @response_header=
128
- {"DATE"=>"Wed, 15 Aug 2012 01:07:42 GMT",
129
- "EXPIRES"=>"-1",
130
- "CACHE_CONTROL"=>"private, max-age=0",
131
- "CONTENT_TYPE"=>"text/html; charset=ISO-8859-1",
132
- "SET_COOKIE"=>
133
- ["PREF=ID=ec6b270e7fe890fa:FF=0:TM=1344992862:LM=1344992862:S=WlLM9juILblGoi5k; expires=Fri, 15-Aug-2014 01:07:42 GMT; path=/; domain=.google.com.ph",
134
- "NID=62=5O_DMdySUEeIJXvveuCr1U8UEfYEhurh0X2Is-a5f0xhTw5CxEY9gELcAyCmwqt4MxVLDpvT2anCV79hhXHfd-QPi0zRY8bCiqh7BlH1B3w0wfE3eg0PTR_KbXUJBBFm; expires=Thu, 14-Feb-2013 01:07:42 GMT; path=/; domain=.google.com.ph; HttpOnly"],
135
- "P3P"=>
136
- "CP=\"This is not a P3P policy! See http://www.google.com/support/accounts/bin/answer.py?hl=en&answer=151657 for more info.\"",
137
- "SERVER"=>"gws",
138
- "X_XSS_PROTECTION"=>"1; mode=block",
139
- "X_FRAME_OPTIONS"=>"SAMEORIGIN",
140
- "CONNECTION"=>"close"},
141
- @state=:finished,
142
- @stream=nil>,
143
- 2=>
144
- #<EventMachine::HttpClient:0x08d8490
145
- @callbacks=[],
146
- @conn=
147
- #<EventMachine::HttpConnection:0x08d91b0
148
- @clients=[],
149
- @conn=
150
- #<EventMachine::HttpStubConnection:0x08d8148
151
- @callbacks=[],
152
- @deferred_args=[],
153
- @deferred_status=:succeeded,
154
- @deferred_timeout=nil,
155
- @errbacks=nil,
156
- @parent=#<EventMachine::HttpConnection:0x08d91b0 ...>,
157
- @signature=3>,
158
- @connopts=
159
- #<HttpConnectionOptions:0x1fed3b0
160
- @connect_timeout=5,
161
- @host="www.bing.com",
162
- @inactivity_timeout=10,
163
- @port=80,
164
- @proxy=nil,
165
- @tls={}>,
166
- @deferred=false,
167
- @middleware=[],
168
- @p=#<HTTP::Parser:0x08d8028>,
169
- @peer="\x02\x00\x00P|j\xAE\xB2\x00\x00\x00\x00\x00\x00\x00\x00",
170
- @pending=[],
171
- @uri="http://www.bing.com/">,
172
- @content_charset=nil,
173
- @content_decoder=nil,
174
- @cookiejar=
175
- #<EventMachine::HttpClient::CookieJar:0x08d8340
176
- @jar=
177
- #<CookieJar::Jar:0x08d8328
178
- @domains=
179
- {".bing.com"=>
180
- {"/"=>
181
- {"_FS"=>_FS=NU=1,
182
- "_SS"=>_SS=SID=847F099F99524E2F97F8236B4B203509,
183
- "SRCHD"=>SRCHD=D=2430787&MS=2430787&AF=NOFORM,
184
- "SRCHUSR"=>SRCHUSR=AUTOREDIR=0&GEOVAR=&DOB=20120815}},
185
- "www.bing.com"=>
186
- {"/"=>
187
- {"SRCHUID"=>
188
- SRCHUID=V=2&GUID=28C754BC00C346D19F70AD5235BC50B4}}}>>,
189
- @cookies=[],
190
- @deferred_args=[#<EventMachine::HttpClient:0x08d8490 ...>],
191
- @deferred_status=:succeeded,
192
- @deferred_timeout=nil,
193
- @errbacks=[],
194
- @error=nil,
195
- @headers=nil,
196
- @req=
197
- #<HttpClientOptions:0x08d9030
198
- @body=nil,
199
- @decoding=true,
200
- @file=nil,
201
- @followed=0,
202
- @headers=
203
- {"User-Agent"=>
204
- "Scrapey v0.0.5 - https://github.com/monkeysuffrage/scrapey"},
205
- @host="www.bing.com",
206
- @keepalive=false,
207
- @method="HEAD",
208
- @pass_cookies=true,
209
- @path=nil,
210
- @port=80,
211
- @query=nil,
212
- @redirects=10,
213
- @uri=#<Addressable::URI:0x46c50c URI:http://www.bing.com:80/>>,
214
- @response="",
215
- @response_header=
216
- {"CACHE_CONTROL"=>"private, max-age=0",
217
- "CONTENT_TYPE"=>"text/html",
218
- "P3P"=>"CP=\"NON UNI COM NAV STA LOC CURa DEVa PSAa PSDa OUR IND\"",
219
- "DATE"=>"Wed, 15 Aug 2012 01:07:42 GMT",
220
- "CONTENT_LENGTH"=>"1",
221
- "CONNECTION"=>"close",
222
- "SET_COOKIE"=>
223
- ["_FS=NU=1; domain=.bing.com; path=/",
224
- "_SS=SID=847F099F99524E2F97F8236B4B203509; domain=.bing.com; path=/",
225
- "SRCHD=D=2430787&MS=2430787&AF=NOFORM; expires=Fri, 15-Aug-2014 01:07:42 GMT; domain=.bing.com; path=/",
226
- "SRCHUID=V=2&GUID=28C754BC00C346D19F70AD5235BC50B4; expires=Fri, 15-Aug-2014 01:07:42 GMT; path=/",
227
- "SRCHUSR=AUTOREDIR=0&GEOVAR=&DOB=20120815; expires=Fri, 15-Aug-2014 01:07:42 GMT; domain=.bing.com; path=/"]},
228
- @state=:finished,
229
- @stream=nil>,
230
- 3=>
231
- #<EventMachine::HttpClient:0x1cc2a98
232
- @callbacks=[],
233
- @conn=
234
- #<EventMachine::HttpConnection:0x08d7698
235
- @clients=[],
236
- @conn=
237
- #<EventMachine::HttpStubConnection:0x1cc22e8
238
- @callbacks=[],
239
- @deferred_args=[],
240
- @deferred_status=:succeeded,
241
- @deferred_timeout=nil,
242
- @errbacks=nil,
243
- @parent=#<EventMachine::HttpConnection:0x08d7698 ...>,
244
- @signature=4>,
245
- @connopts=
246
- #<HttpConnectionOptions:0x08d7d70
247
- @connect_timeout=5,
248
- @host="www.bing.com",
249
- @inactivity_timeout=10,
250
- @port=80,
251
- @proxy=nil,
252
- @tls={}>,
253
- @deferred=false,
254
- @middleware=[],
255
- @p=#<HTTP::Parser:0x1cc2150>,
256
- @peer="\x02\x00\x00P|j\xAE\xB2\x00\x00\x00\x00\x00\x00\x00\x00",
257
- @pending=[],
258
- @uri="http://www.bing.com/404.html">,
259
- @content_charset=nil,
260
- @content_decoder=nil,
261
- @cookiejar=
262
- #<EventMachine::HttpClient::CookieJar:0x1cc27c8
263
- @jar=#<CookieJar::Jar:0x1cc2660 @domains={}>>,
264
- @cookies=[],
265
- @deferred_args=[#<EventMachine::HttpClient:0x1cc2a98 ...>],
266
- @deferred_status=:succeeded,
267
- @deferred_timeout=nil,
268
- @errbacks=[],
269
- @error=nil,
270
- @headers=nil,
271
- @req=
272
- #<HttpClientOptions:0x08d7650
273
- @body=nil,
274
- @decoding=true,
275
- @file=nil,
276
- @followed=0,
277
- @headers=
278
- {"User-Agent"=>
279
- "Scrapey v0.0.5 - https://github.com/monkeysuffrage/scrapey"},
280
- @host="www.bing.com",
281
- @keepalive=false,
282
- @method="HEAD",
283
- @pass_cookies=true,
284
- @path=nil,
285
- @port=80,
286
- @query=nil,
287
- @redirects=10,
288
- @uri=#<Addressable::URI:0xe61f18 URI:http://www.bing.com:80/404.html>>,
289
- @response="",
290
- @response_header=
291
- {"CACHE_CONTROL"=>"no-cache",
292
- "CONTENT_TYPE"=>"text/html",
293
- "P3P"=>"CP=\"NON UNI COM NAV STA LOC CURa DEVa PSAa PSDa OUR IND\"",
294
- "DATE"=>"Wed, 15 Aug 2012 01:07:42 GMT",
295
- "CONTENT_LENGTH"=>"1",
296
- "CONNECTION"=>"close"},
297
- @state=:finished,
298
- @stream=nil>},
299
- @responses=
300
- {:callback=>
301
- {2=>
302
- #<EventMachine::HttpClient:0x08d8490
303
- @callbacks=[],
304
- @conn=
305
- #<EventMachine::HttpConnection:0x08d91b0
306
- @clients=[],
307
- @conn=
308
- #<EventMachine::HttpStubConnection:0x08d8148
309
- @callbacks=[],
310
- @deferred_args=[],
311
- @deferred_status=:succeeded,
312
- @deferred_timeout=nil,
313
- @errbacks=nil,
314
- @parent=#<EventMachine::HttpConnection:0x08d91b0 ...>,
315
- @signature=3>,
316
- @connopts=
317
- #<HttpConnectionOptions:0x1fed3b0
318
- @connect_timeout=5,
319
- @host="www.bing.com",
320
- @inactivity_timeout=10,
321
- @port=80,
322
- @proxy=nil,
323
- @tls={}>,
324
- @deferred=false,
325
- @middleware=[],
326
- @p=#<HTTP::Parser:0x08d8028>,
327
- @peer="\x02\x00\x00P|j\xAE\xB2\x00\x00\x00\x00\x00\x00\x00\x00",
328
- @pending=[],
329
- @uri="http://www.bing.com/">,
330
- @content_charset=nil,
331
- @content_decoder=nil,
332
- @cookiejar=
333
- #<EventMachine::HttpClient::CookieJar:0x08d8340
334
- @jar=
335
- #<CookieJar::Jar:0x08d8328
336
- @domains=
337
- {".bing.com"=>
338
- {"/"=>
339
- {"_FS"=>_FS=NU=1,
340
- "_SS"=>_SS=SID=847F099F99524E2F97F8236B4B203509,
341
- "SRCHD"=>SRCHD=D=2430787&MS=2430787&AF=NOFORM,
342
- "SRCHUSR"=>SRCHUSR=AUTOREDIR=0&GEOVAR=&DOB=20120815}},
343
- "www.bing.com"=>
344
- {"/"=>
345
- {"SRCHUID"=>
346
- SRCHUID=V=2&GUID=28C754BC00C346D19F70AD5235BC50B4}}}>>,
347
- @cookies=[],
348
- @deferred_args=[#<EventMachine::HttpClient:0x08d8490 ...>],
349
- @deferred_status=:succeeded,
350
- @deferred_timeout=nil,
351
- @errbacks=[],
352
- @error=nil,
353
- @headers=nil,
354
- @req=
355
- #<HttpClientOptions:0x08d9030
356
- @body=nil,
357
- @decoding=true,
358
- @file=nil,
359
- @followed=0,
360
- @headers=
361
- {"User-Agent"=>
362
- "Scrapey v0.0.5 - https://github.com/monkeysuffrage/scrapey"},
363
- @host="www.bing.com",
364
- @keepalive=false,
365
- @method="HEAD",
366
- @pass_cookies=true,
367
- @path=nil,
368
- @port=80,
369
- @query=nil,
370
- @redirects=10,
371
- @uri=#<Addressable::URI:0x46c50c URI:http://www.bing.com:80/>>,
372
- @response="",
373
- @response_header=
374
- {"CACHE_CONTROL"=>"private, max-age=0",
375
- "CONTENT_TYPE"=>"text/html",
376
- "P3P"=>"CP=\"NON UNI COM NAV STA LOC CURa DEVa PSAa PSDa OUR IND\"",
377
- "DATE"=>"Wed, 15 Aug 2012 01:07:42 GMT",
378
- "CONTENT_LENGTH"=>"1",
379
- "CONNECTION"=>"close",
380
- "SET_COOKIE"=>
381
- ["_FS=NU=1; domain=.bing.com; path=/",
382
- "_SS=SID=847F099F99524E2F97F8236B4B203509; domain=.bing.com; path=/",
383
- "SRCHD=D=2430787&MS=2430787&AF=NOFORM; expires=Fri, 15-Aug-2014 01:07:42 GMT; domain=.bing.com; path=/",
384
- "SRCHUID=V=2&GUID=28C754BC00C346D19F70AD5235BC50B4; expires=Fri, 15-Aug-2014 01:07:42 GMT; path=/",
385
- "SRCHUSR=AUTOREDIR=0&GEOVAR=&DOB=20120815; expires=Fri, 15-Aug-2014 01:07:42 GMT; domain=.bing.com; path=/"]},
386
- @state=:finished,
387
- @stream=nil>,
388
- 3=>
389
- #<EventMachine::HttpClient:0x1cc2a98
390
- @callbacks=[],
391
- @conn=
392
- #<EventMachine::HttpConnection:0x08d7698
393
- @clients=[],
394
- @conn=
395
- #<EventMachine::HttpStubConnection:0x1cc22e8
396
- @callbacks=[],
397
- @deferred_args=[],
398
- @deferred_status=:succeeded,
399
- @deferred_timeout=nil,
400
- @errbacks=nil,
401
- @parent=#<EventMachine::HttpConnection:0x08d7698 ...>,
402
- @signature=4>,
403
- @connopts=
404
- #<HttpConnectionOptions:0x08d7d70
405
- @connect_timeout=5,
406
- @host="www.bing.com",
407
- @inactivity_timeout=10,
408
- @port=80,
409
- @proxy=nil,
410
- @tls={}>,
411
- @deferred=false,
412
- @middleware=[],
413
- @p=#<HTTP::Parser:0x1cc2150>,
414
- @peer="\x02\x00\x00P|j\xAE\xB2\x00\x00\x00\x00\x00\x00\x00\x00",
415
- @pending=[],
416
- @uri="http://www.bing.com/404.html">,
417
- @content_charset=nil,
418
- @content_decoder=nil,
419
- @cookiejar=
420
- #<EventMachine::HttpClient::CookieJar:0x1cc27c8
421
- @jar=#<CookieJar::Jar:0x1cc2660 @domains={}>>,
422
- @cookies=[],
423
- @deferred_args=[#<EventMachine::HttpClient:0x1cc2a98 ...>],
424
- @deferred_status=:succeeded,
425
- @deferred_timeout=nil,
426
- @errbacks=[],
427
- @error=nil,
428
- @headers=nil,
429
- @req=
430
- #<HttpClientOptions:0x08d7650
431
- @body=nil,
432
- @decoding=true,
433
- @file=nil,
434
- @followed=0,
435
- @headers=
436
- {"User-Agent"=>
437
- "Scrapey v0.0.5 - https://github.com/monkeysuffrage/scrapey"},
438
- @host="www.bing.com",
439
- @keepalive=false,
440
- @method="HEAD",
441
- @pass_cookies=true,
442
- @path=nil,
443
- @port=80,
444
- @query=nil,
445
- @redirects=10,
446
- @uri=
447
- #<Addressable::URI:0xe61f18 URI:http://www.bing.com:80/404.html>>,
448
- @response="",
449
- @response_header=
450
- {"CACHE_CONTROL"=>"no-cache",
451
- "CONTENT_TYPE"=>"text/html",
452
- "P3P"=>"CP=\"NON UNI COM NAV STA LOC CURa DEVa PSAa PSDa OUR IND\"",
453
- "DATE"=>"Wed, 15 Aug 2012 01:07:42 GMT",
454
- "CONTENT_LENGTH"=>"1",
455
- "CONNECTION"=>"close"},
456
- @state=:finished,
457
- @stream=nil>,
458
- 1=>
459
- #<EventMachine::HttpClient:0x20ed610
460
- @callbacks=[],
461
- @conn=
462
- #<EventMachine::HttpConnection:0x2136d08
463
- @clients=[],
464
- @conn=
465
- #<EventMachine::HttpStubConnection:0x20c2e60
466
- @callbacks=[],
467
- @deferred_args=[],
468
- @deferred_status=:succeeded,
469
- @deferred_timeout=nil,
470
- @errbacks=nil,
471
- @parent=#<EventMachine::HttpConnection:0x2136d08 ...>,
472
- @signature=5>,
473
- @connopts=
474
- #<HttpConnectionOptions:0x215c7d8
475
- @connect_timeout=5,
476
- @host="www.google.com.",
477
- @inactivity_timeout=10,
478
- @port=80,
479
- @proxy=nil,
480
- @tls={}>,
481
- @deferred=false,
482
- @middleware=[],
483
- @p=#<HTTP::Parser:0x20da578>,
484
- @peer="\x02\x00\x00PJ}G^\x00\x00\x00\x00\x00\x00\x00\x00",
485
- @pending=[],
486
- @uri="http://www.google.com.">,
487
- @content_charset=nil,
488
- @content_decoder=nil,
489
- @cookiejar=
490
- #<EventMachine::HttpClient::CookieJar:0x20ed670
491
- @jar=
492
- #<CookieJar::Jar:0x20e5198
493
- @domains=
494
- {".google.com.ph"=>
495
- {"/"=>
496
- {"PREF"=>
497
- PREF=ID=ec6b270e7fe890fa:FF=0:TM=1344992862:LM=1344992862:S=WlLM9juILblGoi5k,
498
- "NID"=>
499
- NID=62=5O_DMdySUEeIJXvveuCr1U8UEfYEhurh0X2Is-a5f0xhTw5CxEY9gELcAyCmwqt4MxVLDpvT2anCV79hhXHfd-QPi0zRY8bCiqh7BlH1B3w0wfE3eg0PTR_KbXUJBBFm}}}>>,
500
- @cookies=[],
501
- @deferred_args=[#<EventMachine::HttpClient:0x20ed610 ...>],
502
- @deferred_status=:succeeded,
503
- @deferred_timeout=nil,
504
- @errbacks=
505
- [#<Proc:0x20297e8@C:/Ruby193/lib/ruby/gems/1.9.1/gems/em-http-request-1.0.2/lib/em-http/multi.rb:42>],
506
- @error=nil,
507
- @headers=nil,
508
- @req=
509
- #<HttpClientOptions:0x2129ec8
510
- @body=nil,
511
- @decoding=true,
512
- @file=nil,
513
- @followed=1,
514
- @headers=
515
- {"User-Agent"=>
516
- "Scrapey v0.0.5 - https://github.com/monkeysuffrage/scrapey"},
517
- @host="www.google.com.ph",
518
- @keepalive=false,
519
- @method="HEAD",
520
- @pass_cookies=true,
521
- @path=nil,
522
- @port=80,
523
- @query=nil,
524
- @redirects=10,
525
- @uri=#<Addressable::URI:0xe683ec URI:http://www.google.com.ph:80/>>,
526
- @response="",
527
- @response_header=
528
- {"DATE"=>"Wed, 15 Aug 2012 01:07:42 GMT",
529
- "EXPIRES"=>"-1",
530
- "CACHE_CONTROL"=>"private, max-age=0",
531
- "CONTENT_TYPE"=>"text/html; charset=ISO-8859-1",
532
- "SET_COOKIE"=>
533
- ["PREF=ID=ec6b270e7fe890fa:FF=0:TM=1344992862:LM=1344992862:S=WlLM9juILblGoi5k; expires=Fri, 15-Aug-2014 01:07:42 GMT; path=/; domain=.google.com.ph",
534
- "NID=62=5O_DMdySUEeIJXvveuCr1U8UEfYEhurh0X2Is-a5f0xhTw5CxEY9gELcAyCmwqt4MxVLDpvT2anCV79hhXHfd-QPi0zRY8bCiqh7BlH1B3w0wfE3eg0PTR_KbXUJBBFm; expires=Thu, 14-Feb-2013 01:07:42 GMT; path=/; domain=.google.com.ph; HttpOnly"],
535
- "P3P"=>
536
- "CP=\"This is not a P3P policy! See http://www.google.com/support/accounts/bin/answer.py?hl=en&answer=151657 for more info.\"",
537
- "SERVER"=>"gws",
538
- "X_XSS_PROTECTION"=>"1; mode=block",
539
- "X_FRAME_OPTIONS"=>"SAMEORIGIN",
540
- "CONNECTION"=>"close"},
541
- @state=:finished,
542
- @stream=nil>},
543
- :errback=>
544
- {0=>
545
- #<EventMachine::HttpClient:0x21adc28
546
- @callbacks=[],
547
- @conn=
548
- #<EventMachine::HttpConnection:0x21f3f30
549
- @connopts=
550
- #<HttpConnectionOptions:0x2231100
551
- @connect_timeout=5,
552
- @host="locahlost2",
553
- @inactivity_timeout=10,
554
- @port=80,
555
- @proxy=nil,
556
- @tls={}>,
557
- @deferred=true,
558
- @middleware=[],
559
- @uri="http://locahlost2/foo">,
560
- @content_charset=nil,
561
- @content_decoder=nil,
562
- @cookiejar=
563
- #<EventMachine::HttpClient::CookieJar:0x21adc70
564
- @jar=#<CookieJar::Jar:0x21adcb8 @domains={}>>,
565
- @cookies=[],
566
- @deferred_args=[#<EventMachine::HttpClient:0x21adc28 ...>],
567
- @deferred_status=:failed,
568
- @deferred_timeout=nil,
569
- @errbacks=[],
570
- @error="unable to resolve server address",
571
- @headers=nil,
572
- @req=
573
- #<HttpClientOptions:0x21f42f0
574
- @body=nil,
575
- @decoding=true,
576
- @file=nil,
577
- @followed=0,
578
- @headers=
579
- {"User-Agent"=>
580
- "Scrapey v0.0.5 - https://github.com/monkeysuffrage/scrapey"},
581
- @host="locahlost2",
582
- @keepalive=false,
583
- @method="HEAD",
584
- @pass_cookies=true,
585
- @path=nil,
586
- @port=80,
587
- @query=nil,
588
- @redirects=10,
589
- @uri=#<Addressable::URI:0x10e696c URI:http://locahlost2:80/foo>>,
590
- @response="",
591
- @response_header={},
592
- @state=:response_header,
593
- @stream=nil>}}>