scrapey 0.0.5 → 0.0.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/README.md CHANGED
@@ -17,12 +17,13 @@ A simple framework for solving common scraping problems
17
17
  ## Examples
18
18
 
19
19
  ### CSV
20
+ By default scrapey will save as 'output.csv'
21
+ You can change this with:
22
+
23
+ @output = 'mycsv.csv'
20
24
 
21
25
  ```ruby
22
26
  require 'scrapey'
23
- # By default scrapey will save as 'output.csv'
24
- # You can change this with:
25
- # @output = 'mycsv.csv'
26
27
 
27
28
  page = get 'http://www.alexa.com/topsites'
28
29
  page.search('li.site-listing').each do |li|
@@ -31,10 +32,9 @@ end
31
32
  ```
32
33
 
33
34
  ### Database
35
+ if you created a scrapey project you can fill out the database connection information in config/config.yml
34
36
  ```ruby
35
37
  require 'scrapey'
36
- # if you created a scrapey project you can fill out the database connection
37
- # information in config/config.yml
38
38
 
39
39
  tables 'Movie', 'Actor' # create ActiveRecord models
40
40
 
@@ -90,14 +90,13 @@ get 'some_throttled_website_url'
90
90
  Scrapey will ensure that the callbacks are threadsafe
91
91
  ```ruby
92
92
  require 'scrapey'
93
- require 'scrapey/multi'
94
93
 
95
94
  fields 'url', 'title'
96
95
 
97
- def scrape url, response, header
96
+ def scrape url, response
98
97
  doc = Nokogiri::HTML response
99
98
  save({'url' => url, 'title' => doc.at('title').text})
100
99
  end
101
100
 
102
- multi_get ['http://www.yahoo.com/', 'http://www.google.com.', 'http://www.bing.com/'], :threads => 3, :callback => :scrape
101
+ multi_get ['http://www.yahoo.com/', 'http://www.google.com.', 'http://www.bing.com/'], :threads => 3, :on_success => :scrape
103
102
  ```
data/examples/multi.rb CHANGED
@@ -1,11 +1,10 @@
1
1
  require 'scrapey'
2
- require 'scrapey/multi'
3
2
 
4
3
  fields 'url', 'title'
5
4
 
6
- def scrape url, response, header
5
+ def scrape url, response
7
6
  doc = Nokogiri::HTML response
8
- save({'url' => url, 'title' => doc.at('title').text})
7
+ save({'url' => url, 'title' => doc.title})
9
8
  end
10
9
 
11
- multi_get ['http://www.yahoo.com/', 'http://www.google.com.', 'http://www.bing.com/'], :threads => 3, :callback => :scrape
10
+ multi_get ['http://www.yahoo.com/', 'http://www.google.com.', 'http://www.bing.com/'], :threads => 3, :on_success => :scrape
data/examples/multi2.rb CHANGED
@@ -1,25 +1,24 @@
1
1
  require 'scrapey'
2
- require 'scrapey/multi'
3
2
 
4
3
  fields 'url', 'title'
5
4
 
6
5
  def scrape url, response, header
7
6
  doc = Nokogiri::HTML response
8
- save({'url' => url, 'title' => doc.at('title').text})
7
+ save({'url' => url, 'title' => doc.title})
9
8
  puts "scraped #{url}."
10
9
  end
11
10
 
12
11
  options = {
13
12
  :threads => 3,
14
- :callback => :scrape,
15
- :proxy => {:host => 'localhost', :port => 8888},
16
- :head => {
13
+ :on_success => :scrape,
14
+ :proxy => 'http://localhost:8888',
15
+ :headers => {
17
16
  "Accept" => "*/*",
18
- #"User-Agent" => "Scrapey #{Scrapey::VERSION}",
19
- "Keep-alive" => "true"
17
+ "Keep-alive" => "true",
18
+ "Cookie" => "foo=bar"
20
19
  }
21
20
  }
22
21
 
23
- multi_get ['http://www.yahoo.com/', 'http://www.google.com/', 'http://www.bing.com/'], options
22
+ multi_get ["https://twitter.com/", 'http://www.yahoo.com/', 'http://www.google.com.', 'http://www.bing.com/'], options
24
23
 
25
24
  puts "this happens after all callbacks."
@@ -0,0 +1,2 @@
1
+ url,title
2
+ https://twitter.com/,Twitter
@@ -1,15 +1,14 @@
1
1
  require 'scrapey'
2
2
  require 'scrapey/multi'
3
- require 'pry'
4
3
 
5
4
  fields 'url', 'status'
6
5
 
7
- def on_success url, response, header
8
- save({'url' => url, 'status' => header.status})
6
+ def on_success url, response
7
+ save({'url' => url, 'status' => response.status_code})
9
8
  end
10
9
 
11
10
  def on_error url, e
12
- save({'url' => url, 'status' => e})
11
+ save({'url' => url, 'status' => e.message})
13
12
  end
14
13
 
15
14
  multi_head ['http://locahlost2/foo', 'http://www.google.com/', 'http://www.bing.com/', 'http://www.bing.com/404.html']
data/lib/scrapey.rb CHANGED
@@ -7,6 +7,7 @@ require "scrapey/scrapey"
7
7
  require "scrapey/constants"
8
8
  require "scrapey/cache"
9
9
  require "scrapey/database"
10
+ require "scrapey/multi"
10
11
 
11
12
  include Scrapey
12
13
 
@@ -12,10 +12,14 @@ module Scrapey
12
12
  filename = cache_filename url
13
13
  return nil unless File::exists?(filename)
14
14
  debug "Loading #{filename} from cache"
15
- Nokogiri::HTML Marshal.load(File.read(filename))
15
+ begin
16
+ Nokogiri::HTML Marshal.load(File.read(filename))
17
+ rescue Exception => e
18
+ puts e.message
19
+ end
16
20
  end
17
21
 
18
22
  def save_cache url, doc, options = {}
19
- File.open(cache_filename(url), "w") {|f| f << Marshal.dump(doc) }
23
+ File.open(cache_filename(url), "wb") {|f| f << Marshal.dump(doc) }
20
24
  end
21
25
  end
@@ -9,9 +9,7 @@ module Scrapey
9
9
  def load_cache url
10
10
  debug "Loading #{url} from cache"
11
11
  return nil unless str = @redis.get(url)
12
- debug "found it"
13
- #binding.pry
14
- Nokogiri::HTML Marshal.load(str)
12
+ Nokogiri::HTML Marshal.load(str) rescue nil
15
13
  end
16
14
 
17
15
  def save_cache url, body, options = {}
@@ -1,6 +1,5 @@
1
1
  module Scrapey
2
- VERSION = "0.0.5"
2
+ VERSION = "0.0.7"
3
3
  BASEDIR = File.expand_path(File.dirname($0)).gsub(/\/src$/,'')
4
4
  URL = "https://github.com/monkeysuffrage/scrapey"
5
- #ENV['SSL_FILE'] = "#{Gem.dir}/gems/scrapey-#{Scrapey::VERSION}/ssl/cacert.pem"
6
5
  end
data/lib/scrapey/multi.rb CHANGED
@@ -1,47 +1,42 @@
1
- require 'em-http-request'
1
+ require 'httpclient'
2
2
 
3
3
  module Scrapey
4
4
  def multi_get_or_post method, all_urls, options = {}
5
- head = options.delete(:head) || {}
6
- request_options = {:redirects => 10, :head => {"User-Agent" => "Scrapey v#{Scrapey::VERSION} - #{Scrapey::URL}"}.merge(head)}
7
- threads = options[:threads] || 20
8
- on_success = options[:on_success] || :on_success
9
- on_error = options[:on_error] || :on_error
10
5
  all_urls.reject!{|url| is_cached? url} if @use_cache
11
- @lock = Mutex.new
6
+ return unless all_urls.size > 0
7
+
8
+ threads = options[:threads] || [10, all_urls.size].min
9
+ on_success = options[:on_success] || :on_success
10
+ on_error = options[:on_error] || :on_error
11
+ user_agent = options[:user_agent] || "Scrapey v#{Scrapey::VERSION} - #{Scrapey::URL}"
12
+ proxy = options[:proxy] || nil
13
+
14
+ @lock ||= Mutex.new
15
+ @http_clients ||= threads.times.map{HTTPClient.new(options[:proxies] ? options[:proxies].rotate!.first : proxy, user_agent).tap{|c| c.ssl_config.verify_mode, c.receive_timeout = OpenSSL::SSL::VERIFY_NONE, 10000}}
16
+ debug 'starting multi'
17
+
12
18
  all_urls.each_slice(threads) do |urls|
13
- next unless urls.size > 0
14
- EventMachine.run do
15
- multi = EventMachine::MultiRequest.new
16
- urls.each_with_index do |url, i|
17
- multi.add i, EventMachine::HttpRequest.new(url, options).send(method, request_options)
18
- end
19
- multi.callback do
20
- (0...multi.requests.length).each do |i|
21
- if multi.responses[:callback][i]
22
- @lock.synchronize do
23
- if defined? on_success
24
- send on_success, urls[i], multi.responses[:callback][i].response, multi.responses[:callback][i].response_header
25
- else
26
- raise "#{on_success} not defined!"
27
- end
28
- end
19
+ urls.each_with_index.map do |url, i|
20
+ Thread.new do
21
+ begin
22
+ response = @http_clients[i].send method, url, options[:query], options[:headers]
23
+ rescue Exception => e
24
+ error = e
25
+ end
26
+ @lock.synchronize do
27
+ if response
28
+ send on_success, url, response
29
29
  else
30
- if defined? on_error
31
- send on_error, urls[i], multi.requests[i].error
32
- else
33
- raise "#{on_error} not defined!"
34
- end
30
+ send on_error, url, e
35
31
  end
36
32
  end
37
- EventMachine.stop
38
33
  end
39
- end
34
+ end.each{|thread| thread.join}
40
35
  end
41
36
  end
42
37
 
43
- def multi_get *args; multi_get_or_post 'get', *args; end
44
- def multi_post *args; multi_get_or_post 'post', *args; end
38
+ def multi_get *args; multi_get_or_post 'get_content', *args; end
39
+ def multi_post *args; multi_get_or_post 'post_content', *args; end
45
40
  def multi_head *args; multi_get_or_post 'head', *args; end
46
41
 
47
42
  end
@@ -52,7 +52,9 @@ module Scrapey
52
52
  end
53
53
  case
54
54
  when item.is_a?(Array) then @csv << item
55
- when item.is_a?(Hash) then @csv << @fields.map{|f| item[f]}
55
+ when item.is_a?(Hash)
56
+ raise 'No fields defined!' unless @fields
57
+ @csv << @fields.map{|f| item[f]}
56
58
  else raise "unsupported type: #{item.class}"
57
59
  end
58
60
  end
@@ -12,7 +12,7 @@ module Scrapey
12
12
  FileUtils.mv fn, fn.gsub('template', name)
13
13
  end
14
14
  buf = File.read "#{name}.iss"
15
- buf.gsub! /Template/, "rightmove_rentals".tr('_', ' ').gsub(/\w+/){|x| x.capitalize}
15
+ buf.gsub! /Template/, name.tr('_', ' ').gsub(/\w+/){|x| x.capitalize}
16
16
  buf.gsub! /template/, name
17
17
  File.open("#{name}.iss", 'w'){|f| f << buf}
18
18
 
data/output.csv CHANGED
@@ -1,3 +1,5 @@
1
1
  url,status
2
2
  http://www.bing.com/,200
3
3
  http://www.bing.com/404.html,404
4
+ http://locahlost2/foo,getaddrinfo: No such host is known. (http://locahlost2:80)
5
+ http://www.google.com/,302
data/template/Rakefile CHANGED
@@ -13,10 +13,11 @@ end
13
13
 
14
14
  desc "Copy installer to dropbox folder"
15
15
  task 'dropbox' do
16
+ file = 'setup.exe'
16
17
  raise 'no dropbox folder!' unless ENV['DROPBOX']
17
18
  folder = [ENV['DROPBOX'], name].join('/').squeeze('/')
18
19
  FileUtils.mkdir(folder) unless File.exists?(folder)
19
- FileUtils.cp "Output/setup.exe", folder
20
- url = [ENV['DROPBOX_public_url'], name, 'setup.exe'].join('/').squeeze('/')
20
+ FileUtils.cp "Output/#{file}", folder
21
+ url = [ENV['DROPBOX_public_url'], name, file].join('/').squeeze('/')
21
22
  puts "uploaded to #{url}"
22
23
  end
@@ -0,0 +1,2 @@
1
+ name,address,zip
2
+ ,,
@@ -1,6 +1,30 @@
1
1
  require 'scrapey'
2
+ require 'pry'
3
+
4
+ # some skeleton code that I like to start with
2
5
  # require 'scrapey/multi' #=> requires em-http-request
3
6
 
4
7
  # sample customizations...
5
8
  # @agent.user_agent = 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.56 Safari/536.5'
6
9
  # @output = Time.now.strftime("#{BASEDIR}/Output/output_%Y_%m_%d_%H_%M_%S.csv")
10
+
11
+ def scrape div
12
+ a = div.at('a')
13
+ url = URI.join(@url, a[:href]).to_s
14
+ return unless visited? url
15
+ item = {}
16
+
17
+ save item
18
+ exit if defined? Ocra
19
+ rescue StandardError => e
20
+ binding.pry
21
+ end
22
+
23
+ #use_cache :redis
24
+
25
+ fields 'name', 'address', 'zip'
26
+
27
+ @url = "http://www.example.com/"
28
+
29
+ page = get @url
30
+ scrape page.at('div')
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: scrapey
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.5
4
+ version: 0.0.7
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2012-08-15 00:00:00.000000000 Z
12
+ date: 2012-08-27 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: mechanize
@@ -59,6 +59,7 @@ files:
59
59
  - examples/imdb.rb
60
60
  - examples/multi.rb
61
61
  - examples/multi2.rb
62
+ - examples/output.csv
62
63
  - examples/redis.rb
63
64
  - examples/status_check.rb
64
65
  - lib/scrapey/cache/disk.rb
@@ -71,11 +72,11 @@ files:
71
72
  - lib/scrapey/template.rb
72
73
  - lib/scrapey.rb
73
74
  - output.csv
74
- - ponsesq
75
75
  - scrapey.gemspec
76
76
  - template/config/config.yml
77
77
  - template/Gemfile
78
78
  - template/icon.ico
79
+ - template/output.csv
79
80
  - template/Rakefile
80
81
  - template/src/schema.rb
81
82
  - template/src/template.rb
data/ponsesq DELETED
@@ -1,593 +0,0 @@
1
- => #<EventMachine::MultiRequest:0x2237178
2
- @callbacks=[],
3
- @deferred_args=[#<EventMachine::MultiRequest:0x2237178 ...>],
4
- @deferred_status=:succeeded,
5
- @deferred_timeout=nil,
6
- @errbacks=nil,
7
- @requests=
8
- {0=>
9
- #<EventMachine::HttpClient:0x21adc28
10
- @callbacks=[],
11
- @conn=
12
- #<EventMachine::HttpConnection:0x21f3f30
13
- @connopts=
14
- #<HttpConnectionOptions:0x2231100
15
- @connect_timeout=5,
16
- @host="locahlost2",
17
- @inactivity_timeout=10,
18
- @port=80,
19
- @proxy=nil,
20
- @tls={}>,
21
- @deferred=true,
22
- @middleware=[],
23
- @uri="http://locahlost2/foo">,
24
- @content_charset=nil,
25
- @content_decoder=nil,
26
- @cookiejar=
27
- #<EventMachine::HttpClient::CookieJar:0x21adc70
28
- @jar=#<CookieJar::Jar:0x21adcb8 @domains={}>>,
29
- @cookies=[],
30
- @deferred_args=[#<EventMachine::HttpClient:0x21adc28 ...>],
31
- @deferred_status=:failed,
32
- @deferred_timeout=nil,
33
- @errbacks=[],
34
- @error="unable to resolve server address",
35
- @headers=nil,
36
- @req=
37
- #<HttpClientOptions:0x21f42f0
38
- @body=nil,
39
- @decoding=true,
40
- @file=nil,
41
- @followed=0,
42
- @headers=
43
- {"User-Agent"=>
44
- "Scrapey v0.0.5 - https://github.com/monkeysuffrage/scrapey"},
45
- @host="locahlost2",
46
- @keepalive=false,
47
- @method="HEAD",
48
- @pass_cookies=true,
49
- @path=nil,
50
- @port=80,
51
- @query=nil,
52
- @redirects=10,
53
- @uri=#<Addressable::URI:0x10e696c URI:http://locahlost2:80/foo>>,
54
- @response="",
55
- @response_header={},
56
- @state=:response_header,
57
- @stream=nil>,
58
- 1=>
59
- #<EventMachine::HttpClient:0x20ed610
60
- @callbacks=[],
61
- @conn=
62
- #<EventMachine::HttpConnection:0x2136d08
63
- @clients=[],
64
- @conn=
65
- #<EventMachine::HttpStubConnection:0x20c2e60
66
- @callbacks=[],
67
- @deferred_args=[],
68
- @deferred_status=:succeeded,
69
- @deferred_timeout=nil,
70
- @errbacks=nil,
71
- @parent=#<EventMachine::HttpConnection:0x2136d08 ...>,
72
- @signature=5>,
73
- @connopts=
74
- #<HttpConnectionOptions:0x215c7d8
75
- @connect_timeout=5,
76
- @host="www.google.com.",
77
- @inactivity_timeout=10,
78
- @port=80,
79
- @proxy=nil,
80
- @tls={}>,
81
- @deferred=false,
82
- @middleware=[],
83
- @p=#<HTTP::Parser:0x20da578>,
84
- @peer="\x02\x00\x00PJ}G^\x00\x00\x00\x00\x00\x00\x00\x00",
85
- @pending=[],
86
- @uri="http://www.google.com.">,
87
- @content_charset=nil,
88
- @content_decoder=nil,
89
- @cookiejar=
90
- #<EventMachine::HttpClient::CookieJar:0x20ed670
91
- @jar=
92
- #<CookieJar::Jar:0x20e5198
93
- @domains=
94
- {".google.com.ph"=>
95
- {"/"=>
96
- {"PREF"=>
97
- PREF=ID=ec6b270e7fe890fa:FF=0:TM=1344992862:LM=1344992862:S=WlLM9juILblGoi5k,
98
- "NID"=>
99
- NID=62=5O_DMdySUEeIJXvveuCr1U8UEfYEhurh0X2Is-a5f0xhTw5CxEY9gELcAyCmwqt4MxVLDpvT2anCV79hhXHfd-QPi0zRY8bCiqh7BlH1B3w0wfE3eg0PTR_KbXUJBBFm}}}>>,
100
- @cookies=[],
101
- @deferred_args=[#<EventMachine::HttpClient:0x20ed610 ...>],
102
- @deferred_status=:succeeded,
103
- @deferred_timeout=nil,
104
- @errbacks=
105
- [#<Proc:0x20297e8@C:/Ruby193/lib/ruby/gems/1.9.1/gems/em-http-request-1.0.2/lib/em-http/multi.rb:42>],
106
- @error=nil,
107
- @headers=nil,
108
- @req=
109
- #<HttpClientOptions:0x2129ec8
110
- @body=nil,
111
- @decoding=true,
112
- @file=nil,
113
- @followed=1,
114
- @headers=
115
- {"User-Agent"=>
116
- "Scrapey v0.0.5 - https://github.com/monkeysuffrage/scrapey"},
117
- @host="www.google.com.ph",
118
- @keepalive=false,
119
- @method="HEAD",
120
- @pass_cookies=true,
121
- @path=nil,
122
- @port=80,
123
- @query=nil,
124
- @redirects=10,
125
- @uri=#<Addressable::URI:0xe683ec URI:http://www.google.com.ph:80/>>,
126
- @response="",
127
- @response_header=
128
- {"DATE"=>"Wed, 15 Aug 2012 01:07:42 GMT",
129
- "EXPIRES"=>"-1",
130
- "CACHE_CONTROL"=>"private, max-age=0",
131
- "CONTENT_TYPE"=>"text/html; charset=ISO-8859-1",
132
- "SET_COOKIE"=>
133
- ["PREF=ID=ec6b270e7fe890fa:FF=0:TM=1344992862:LM=1344992862:S=WlLM9juILblGoi5k; expires=Fri, 15-Aug-2014 01:07:42 GMT; path=/; domain=.google.com.ph",
134
- "NID=62=5O_DMdySUEeIJXvveuCr1U8UEfYEhurh0X2Is-a5f0xhTw5CxEY9gELcAyCmwqt4MxVLDpvT2anCV79hhXHfd-QPi0zRY8bCiqh7BlH1B3w0wfE3eg0PTR_KbXUJBBFm; expires=Thu, 14-Feb-2013 01:07:42 GMT; path=/; domain=.google.com.ph; HttpOnly"],
135
- "P3P"=>
136
- "CP=\"This is not a P3P policy! See http://www.google.com/support/accounts/bin/answer.py?hl=en&answer=151657 for more info.\"",
137
- "SERVER"=>"gws",
138
- "X_XSS_PROTECTION"=>"1; mode=block",
139
- "X_FRAME_OPTIONS"=>"SAMEORIGIN",
140
- "CONNECTION"=>"close"},
141
- @state=:finished,
142
- @stream=nil>,
143
- 2=>
144
- #<EventMachine::HttpClient:0x08d8490
145
- @callbacks=[],
146
- @conn=
147
- #<EventMachine::HttpConnection:0x08d91b0
148
- @clients=[],
149
- @conn=
150
- #<EventMachine::HttpStubConnection:0x08d8148
151
- @callbacks=[],
152
- @deferred_args=[],
153
- @deferred_status=:succeeded,
154
- @deferred_timeout=nil,
155
- @errbacks=nil,
156
- @parent=#<EventMachine::HttpConnection:0x08d91b0 ...>,
157
- @signature=3>,
158
- @connopts=
159
- #<HttpConnectionOptions:0x1fed3b0
160
- @connect_timeout=5,
161
- @host="www.bing.com",
162
- @inactivity_timeout=10,
163
- @port=80,
164
- @proxy=nil,
165
- @tls={}>,
166
- @deferred=false,
167
- @middleware=[],
168
- @p=#<HTTP::Parser:0x08d8028>,
169
- @peer="\x02\x00\x00P|j\xAE\xB2\x00\x00\x00\x00\x00\x00\x00\x00",
170
- @pending=[],
171
- @uri="http://www.bing.com/">,
172
- @content_charset=nil,
173
- @content_decoder=nil,
174
- @cookiejar=
175
- #<EventMachine::HttpClient::CookieJar:0x08d8340
176
- @jar=
177
- #<CookieJar::Jar:0x08d8328
178
- @domains=
179
- {".bing.com"=>
180
- {"/"=>
181
- {"_FS"=>_FS=NU=1,
182
- "_SS"=>_SS=SID=847F099F99524E2F97F8236B4B203509,
183
- "SRCHD"=>SRCHD=D=2430787&MS=2430787&AF=NOFORM,
184
- "SRCHUSR"=>SRCHUSR=AUTOREDIR=0&GEOVAR=&DOB=20120815}},
185
- "www.bing.com"=>
186
- {"/"=>
187
- {"SRCHUID"=>
188
- SRCHUID=V=2&GUID=28C754BC00C346D19F70AD5235BC50B4}}}>>,
189
- @cookies=[],
190
- @deferred_args=[#<EventMachine::HttpClient:0x08d8490 ...>],
191
- @deferred_status=:succeeded,
192
- @deferred_timeout=nil,
193
- @errbacks=[],
194
- @error=nil,
195
- @headers=nil,
196
- @req=
197
- #<HttpClientOptions:0x08d9030
198
- @body=nil,
199
- @decoding=true,
200
- @file=nil,
201
- @followed=0,
202
- @headers=
203
- {"User-Agent"=>
204
- "Scrapey v0.0.5 - https://github.com/monkeysuffrage/scrapey"},
205
- @host="www.bing.com",
206
- @keepalive=false,
207
- @method="HEAD",
208
- @pass_cookies=true,
209
- @path=nil,
210
- @port=80,
211
- @query=nil,
212
- @redirects=10,
213
- @uri=#<Addressable::URI:0x46c50c URI:http://www.bing.com:80/>>,
214
- @response="",
215
- @response_header=
216
- {"CACHE_CONTROL"=>"private, max-age=0",
217
- "CONTENT_TYPE"=>"text/html",
218
- "P3P"=>"CP=\"NON UNI COM NAV STA LOC CURa DEVa PSAa PSDa OUR IND\"",
219
- "DATE"=>"Wed, 15 Aug 2012 01:07:42 GMT",
220
- "CONTENT_LENGTH"=>"1",
221
- "CONNECTION"=>"close",
222
- "SET_COOKIE"=>
223
- ["_FS=NU=1; domain=.bing.com; path=/",
224
- "_SS=SID=847F099F99524E2F97F8236B4B203509; domain=.bing.com; path=/",
225
- "SRCHD=D=2430787&MS=2430787&AF=NOFORM; expires=Fri, 15-Aug-2014 01:07:42 GMT; domain=.bing.com; path=/",
226
- "SRCHUID=V=2&GUID=28C754BC00C346D19F70AD5235BC50B4; expires=Fri, 15-Aug-2014 01:07:42 GMT; path=/",
227
- "SRCHUSR=AUTOREDIR=0&GEOVAR=&DOB=20120815; expires=Fri, 15-Aug-2014 01:07:42 GMT; domain=.bing.com; path=/"]},
228
- @state=:finished,
229
- @stream=nil>,
230
- 3=>
231
- #<EventMachine::HttpClient:0x1cc2a98
232
- @callbacks=[],
233
- @conn=
234
- #<EventMachine::HttpConnection:0x08d7698
235
- @clients=[],
236
- @conn=
237
- #<EventMachine::HttpStubConnection:0x1cc22e8
238
- @callbacks=[],
239
- @deferred_args=[],
240
- @deferred_status=:succeeded,
241
- @deferred_timeout=nil,
242
- @errbacks=nil,
243
- @parent=#<EventMachine::HttpConnection:0x08d7698 ...>,
244
- @signature=4>,
245
- @connopts=
246
- #<HttpConnectionOptions:0x08d7d70
247
- @connect_timeout=5,
248
- @host="www.bing.com",
249
- @inactivity_timeout=10,
250
- @port=80,
251
- @proxy=nil,
252
- @tls={}>,
253
- @deferred=false,
254
- @middleware=[],
255
- @p=#<HTTP::Parser:0x1cc2150>,
256
- @peer="\x02\x00\x00P|j\xAE\xB2\x00\x00\x00\x00\x00\x00\x00\x00",
257
- @pending=[],
258
- @uri="http://www.bing.com/404.html">,
259
- @content_charset=nil,
260
- @content_decoder=nil,
261
- @cookiejar=
262
- #<EventMachine::HttpClient::CookieJar:0x1cc27c8
263
- @jar=#<CookieJar::Jar:0x1cc2660 @domains={}>>,
264
- @cookies=[],
265
- @deferred_args=[#<EventMachine::HttpClient:0x1cc2a98 ...>],
266
- @deferred_status=:succeeded,
267
- @deferred_timeout=nil,
268
- @errbacks=[],
269
- @error=nil,
270
- @headers=nil,
271
- @req=
272
- #<HttpClientOptions:0x08d7650
273
- @body=nil,
274
- @decoding=true,
275
- @file=nil,
276
- @followed=0,
277
- @headers=
278
- {"User-Agent"=>
279
- "Scrapey v0.0.5 - https://github.com/monkeysuffrage/scrapey"},
280
- @host="www.bing.com",
281
- @keepalive=false,
282
- @method="HEAD",
283
- @pass_cookies=true,
284
- @path=nil,
285
- @port=80,
286
- @query=nil,
287
- @redirects=10,
288
- @uri=#<Addressable::URI:0xe61f18 URI:http://www.bing.com:80/404.html>>,
289
- @response="",
290
- @response_header=
291
- {"CACHE_CONTROL"=>"no-cache",
292
- "CONTENT_TYPE"=>"text/html",
293
- "P3P"=>"CP=\"NON UNI COM NAV STA LOC CURa DEVa PSAa PSDa OUR IND\"",
294
- "DATE"=>"Wed, 15 Aug 2012 01:07:42 GMT",
295
- "CONTENT_LENGTH"=>"1",
296
- "CONNECTION"=>"close"},
297
- @state=:finished,
298
- @stream=nil>},
299
- @responses=
300
- {:callback=>
301
- {2=>
302
- #<EventMachine::HttpClient:0x08d8490
303
- @callbacks=[],
304
- @conn=
305
- #<EventMachine::HttpConnection:0x08d91b0
306
- @clients=[],
307
- @conn=
308
- #<EventMachine::HttpStubConnection:0x08d8148
309
- @callbacks=[],
310
- @deferred_args=[],
311
- @deferred_status=:succeeded,
312
- @deferred_timeout=nil,
313
- @errbacks=nil,
314
- @parent=#<EventMachine::HttpConnection:0x08d91b0 ...>,
315
- @signature=3>,
316
- @connopts=
317
- #<HttpConnectionOptions:0x1fed3b0
318
- @connect_timeout=5,
319
- @host="www.bing.com",
320
- @inactivity_timeout=10,
321
- @port=80,
322
- @proxy=nil,
323
- @tls={}>,
324
- @deferred=false,
325
- @middleware=[],
326
- @p=#<HTTP::Parser:0x08d8028>,
327
- @peer="\x02\x00\x00P|j\xAE\xB2\x00\x00\x00\x00\x00\x00\x00\x00",
328
- @pending=[],
329
- @uri="http://www.bing.com/">,
330
- @content_charset=nil,
331
- @content_decoder=nil,
332
- @cookiejar=
333
- #<EventMachine::HttpClient::CookieJar:0x08d8340
334
- @jar=
335
- #<CookieJar::Jar:0x08d8328
336
- @domains=
337
- {".bing.com"=>
338
- {"/"=>
339
- {"_FS"=>_FS=NU=1,
340
- "_SS"=>_SS=SID=847F099F99524E2F97F8236B4B203509,
341
- "SRCHD"=>SRCHD=D=2430787&MS=2430787&AF=NOFORM,
342
- "SRCHUSR"=>SRCHUSR=AUTOREDIR=0&GEOVAR=&DOB=20120815}},
343
- "www.bing.com"=>
344
- {"/"=>
345
- {"SRCHUID"=>
346
- SRCHUID=V=2&GUID=28C754BC00C346D19F70AD5235BC50B4}}}>>,
347
- @cookies=[],
348
- @deferred_args=[#<EventMachine::HttpClient:0x08d8490 ...>],
349
- @deferred_status=:succeeded,
350
- @deferred_timeout=nil,
351
- @errbacks=[],
352
- @error=nil,
353
- @headers=nil,
354
- @req=
355
- #<HttpClientOptions:0x08d9030
356
- @body=nil,
357
- @decoding=true,
358
- @file=nil,
359
- @followed=0,
360
- @headers=
361
- {"User-Agent"=>
362
- "Scrapey v0.0.5 - https://github.com/monkeysuffrage/scrapey"},
363
- @host="www.bing.com",
364
- @keepalive=false,
365
- @method="HEAD",
366
- @pass_cookies=true,
367
- @path=nil,
368
- @port=80,
369
- @query=nil,
370
- @redirects=10,
371
- @uri=#<Addressable::URI:0x46c50c URI:http://www.bing.com:80/>>,
372
- @response="",
373
- @response_header=
374
- {"CACHE_CONTROL"=>"private, max-age=0",
375
- "CONTENT_TYPE"=>"text/html",
376
- "P3P"=>"CP=\"NON UNI COM NAV STA LOC CURa DEVa PSAa PSDa OUR IND\"",
377
- "DATE"=>"Wed, 15 Aug 2012 01:07:42 GMT",
378
- "CONTENT_LENGTH"=>"1",
379
- "CONNECTION"=>"close",
380
- "SET_COOKIE"=>
381
- ["_FS=NU=1; domain=.bing.com; path=/",
382
- "_SS=SID=847F099F99524E2F97F8236B4B203509; domain=.bing.com; path=/",
383
- "SRCHD=D=2430787&MS=2430787&AF=NOFORM; expires=Fri, 15-Aug-2014 01:07:42 GMT; domain=.bing.com; path=/",
384
- "SRCHUID=V=2&GUID=28C754BC00C346D19F70AD5235BC50B4; expires=Fri, 15-Aug-2014 01:07:42 GMT; path=/",
385
- "SRCHUSR=AUTOREDIR=0&GEOVAR=&DOB=20120815; expires=Fri, 15-Aug-2014 01:07:42 GMT; domain=.bing.com; path=/"]},
386
- @state=:finished,
387
- @stream=nil>,
388
- 3=>
389
- #<EventMachine::HttpClient:0x1cc2a98
390
- @callbacks=[],
391
- @conn=
392
- #<EventMachine::HttpConnection:0x08d7698
393
- @clients=[],
394
- @conn=
395
- #<EventMachine::HttpStubConnection:0x1cc22e8
396
- @callbacks=[],
397
- @deferred_args=[],
398
- @deferred_status=:succeeded,
399
- @deferred_timeout=nil,
400
- @errbacks=nil,
401
- @parent=#<EventMachine::HttpConnection:0x08d7698 ...>,
402
- @signature=4>,
403
- @connopts=
404
- #<HttpConnectionOptions:0x08d7d70
405
- @connect_timeout=5,
406
- @host="www.bing.com",
407
- @inactivity_timeout=10,
408
- @port=80,
409
- @proxy=nil,
410
- @tls={}>,
411
- @deferred=false,
412
- @middleware=[],
413
- @p=#<HTTP::Parser:0x1cc2150>,
414
- @peer="\x02\x00\x00P|j\xAE\xB2\x00\x00\x00\x00\x00\x00\x00\x00",
415
- @pending=[],
416
- @uri="http://www.bing.com/404.html">,
417
- @content_charset=nil,
418
- @content_decoder=nil,
419
- @cookiejar=
420
- #<EventMachine::HttpClient::CookieJar:0x1cc27c8
421
- @jar=#<CookieJar::Jar:0x1cc2660 @domains={}>>,
422
- @cookies=[],
423
- @deferred_args=[#<EventMachine::HttpClient:0x1cc2a98 ...>],
424
- @deferred_status=:succeeded,
425
- @deferred_timeout=nil,
426
- @errbacks=[],
427
- @error=nil,
428
- @headers=nil,
429
- @req=
430
- #<HttpClientOptions:0x08d7650
431
- @body=nil,
432
- @decoding=true,
433
- @file=nil,
434
- @followed=0,
435
- @headers=
436
- {"User-Agent"=>
437
- "Scrapey v0.0.5 - https://github.com/monkeysuffrage/scrapey"},
438
- @host="www.bing.com",
439
- @keepalive=false,
440
- @method="HEAD",
441
- @pass_cookies=true,
442
- @path=nil,
443
- @port=80,
444
- @query=nil,
445
- @redirects=10,
446
- @uri=
447
- #<Addressable::URI:0xe61f18 URI:http://www.bing.com:80/404.html>>,
448
- @response="",
449
- @response_header=
450
- {"CACHE_CONTROL"=>"no-cache",
451
- "CONTENT_TYPE"=>"text/html",
452
- "P3P"=>"CP=\"NON UNI COM NAV STA LOC CURa DEVa PSAa PSDa OUR IND\"",
453
- "DATE"=>"Wed, 15 Aug 2012 01:07:42 GMT",
454
- "CONTENT_LENGTH"=>"1",
455
- "CONNECTION"=>"close"},
456
- @state=:finished,
457
- @stream=nil>,
458
- 1=>
459
- #<EventMachine::HttpClient:0x20ed610
460
- @callbacks=[],
461
- @conn=
462
- #<EventMachine::HttpConnection:0x2136d08
463
- @clients=[],
464
- @conn=
465
- #<EventMachine::HttpStubConnection:0x20c2e60
466
- @callbacks=[],
467
- @deferred_args=[],
468
- @deferred_status=:succeeded,
469
- @deferred_timeout=nil,
470
- @errbacks=nil,
471
- @parent=#<EventMachine::HttpConnection:0x2136d08 ...>,
472
- @signature=5>,
473
- @connopts=
474
- #<HttpConnectionOptions:0x215c7d8
475
- @connect_timeout=5,
476
- @host="www.google.com.",
477
- @inactivity_timeout=10,
478
- @port=80,
479
- @proxy=nil,
480
- @tls={}>,
481
- @deferred=false,
482
- @middleware=[],
483
- @p=#<HTTP::Parser:0x20da578>,
484
- @peer="\x02\x00\x00PJ}G^\x00\x00\x00\x00\x00\x00\x00\x00",
485
- @pending=[],
486
- @uri="http://www.google.com.">,
487
- @content_charset=nil,
488
- @content_decoder=nil,
489
- @cookiejar=
490
- #<EventMachine::HttpClient::CookieJar:0x20ed670
491
- @jar=
492
- #<CookieJar::Jar:0x20e5198
493
- @domains=
494
- {".google.com.ph"=>
495
- {"/"=>
496
- {"PREF"=>
497
- PREF=ID=ec6b270e7fe890fa:FF=0:TM=1344992862:LM=1344992862:S=WlLM9juILblGoi5k,
498
- "NID"=>
499
- NID=62=5O_DMdySUEeIJXvveuCr1U8UEfYEhurh0X2Is-a5f0xhTw5CxEY9gELcAyCmwqt4MxVLDpvT2anCV79hhXHfd-QPi0zRY8bCiqh7BlH1B3w0wfE3eg0PTR_KbXUJBBFm}}}>>,
500
- @cookies=[],
501
- @deferred_args=[#<EventMachine::HttpClient:0x20ed610 ...>],
502
- @deferred_status=:succeeded,
503
- @deferred_timeout=nil,
504
- @errbacks=
505
- [#<Proc:0x20297e8@C:/Ruby193/lib/ruby/gems/1.9.1/gems/em-http-request-1.0.2/lib/em-http/multi.rb:42>],
506
- @error=nil,
507
- @headers=nil,
508
- @req=
509
- #<HttpClientOptions:0x2129ec8
510
- @body=nil,
511
- @decoding=true,
512
- @file=nil,
513
- @followed=1,
514
- @headers=
515
- {"User-Agent"=>
516
- "Scrapey v0.0.5 - https://github.com/monkeysuffrage/scrapey"},
517
- @host="www.google.com.ph",
518
- @keepalive=false,
519
- @method="HEAD",
520
- @pass_cookies=true,
521
- @path=nil,
522
- @port=80,
523
- @query=nil,
524
- @redirects=10,
525
- @uri=#<Addressable::URI:0xe683ec URI:http://www.google.com.ph:80/>>,
526
- @response="",
527
- @response_header=
528
- {"DATE"=>"Wed, 15 Aug 2012 01:07:42 GMT",
529
- "EXPIRES"=>"-1",
530
- "CACHE_CONTROL"=>"private, max-age=0",
531
- "CONTENT_TYPE"=>"text/html; charset=ISO-8859-1",
532
- "SET_COOKIE"=>
533
- ["PREF=ID=ec6b270e7fe890fa:FF=0:TM=1344992862:LM=1344992862:S=WlLM9juILblGoi5k; expires=Fri, 15-Aug-2014 01:07:42 GMT; path=/; domain=.google.com.ph",
534
- "NID=62=5O_DMdySUEeIJXvveuCr1U8UEfYEhurh0X2Is-a5f0xhTw5CxEY9gELcAyCmwqt4MxVLDpvT2anCV79hhXHfd-QPi0zRY8bCiqh7BlH1B3w0wfE3eg0PTR_KbXUJBBFm; expires=Thu, 14-Feb-2013 01:07:42 GMT; path=/; domain=.google.com.ph; HttpOnly"],
535
- "P3P"=>
536
- "CP=\"This is not a P3P policy! See http://www.google.com/support/accounts/bin/answer.py?hl=en&answer=151657 for more info.\"",
537
- "SERVER"=>"gws",
538
- "X_XSS_PROTECTION"=>"1; mode=block",
539
- "X_FRAME_OPTIONS"=>"SAMEORIGIN",
540
- "CONNECTION"=>"close"},
541
- @state=:finished,
542
- @stream=nil>},
543
- :errback=>
544
- {0=>
545
- #<EventMachine::HttpClient:0x21adc28
546
- @callbacks=[],
547
- @conn=
548
- #<EventMachine::HttpConnection:0x21f3f30
549
- @connopts=
550
- #<HttpConnectionOptions:0x2231100
551
- @connect_timeout=5,
552
- @host="locahlost2",
553
- @inactivity_timeout=10,
554
- @port=80,
555
- @proxy=nil,
556
- @tls={}>,
557
- @deferred=true,
558
- @middleware=[],
559
- @uri="http://locahlost2/foo">,
560
- @content_charset=nil,
561
- @content_decoder=nil,
562
- @cookiejar=
563
- #<EventMachine::HttpClient::CookieJar:0x21adc70
564
- @jar=#<CookieJar::Jar:0x21adcb8 @domains={}>>,
565
- @cookies=[],
566
- @deferred_args=[#<EventMachine::HttpClient:0x21adc28 ...>],
567
- @deferred_status=:failed,
568
- @deferred_timeout=nil,
569
- @errbacks=[],
570
- @error="unable to resolve server address",
571
- @headers=nil,
572
- @req=
573
- #<HttpClientOptions:0x21f42f0
574
- @body=nil,
575
- @decoding=true,
576
- @file=nil,
577
- @followed=0,
578
- @headers=
579
- {"User-Agent"=>
580
- "Scrapey v0.0.5 - https://github.com/monkeysuffrage/scrapey"},
581
- @host="locahlost2",
582
- @keepalive=false,
583
- @method="HEAD",
584
- @pass_cookies=true,
585
- @path=nil,
586
- @port=80,
587
- @query=nil,
588
- @redirects=10,
589
- @uri=#<Addressable::URI:0x10e696c URI:http://locahlost2:80/foo>>,
590
- @response="",
591
- @response_header={},
592
- @state=:response_header,
593
- @stream=nil>}}>