polipus 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +15 -0
- data/.document +5 -0
- data/.gitignore +53 -0
- data/.rspec +2 -0
- data/Gemfile +3 -0
- data/LICENSE.txt +20 -0
- data/README.md +55 -0
- data/README.rdoc +3 -0
- data/Rakefile +9 -0
- data/examples/basic.rb +58 -0
- data/examples/survival.rb +9 -0
- data/lib/polipus.rb +451 -0
- data/lib/polipus/http.rb +195 -0
- data/lib/polipus/page.rb +219 -0
- data/lib/polipus/plugin.rb +13 -0
- data/lib/polipus/plugins/cleaner.rb +25 -0
- data/lib/polipus/plugins/sample.rb +17 -0
- data/lib/polipus/plugins/sleeper.rb +22 -0
- data/lib/polipus/queue_overflow.rb +24 -0
- data/lib/polipus/queue_overflow/base.rb +6 -0
- data/lib/polipus/queue_overflow/dev_null_queue.rb +33 -0
- data/lib/polipus/queue_overflow/manager.rb +50 -0
- data/lib/polipus/queue_overflow/mongo_queue.rb +61 -0
- data/lib/polipus/queue_overflow/mongo_queue_capped.rb +28 -0
- data/lib/polipus/storage.rb +31 -0
- data/lib/polipus/storage/base.rb +17 -0
- data/lib/polipus/storage/dev_null.rb +35 -0
- data/lib/polipus/storage/mongo_store.rb +86 -0
- data/lib/polipus/storage/s3_store.rb +100 -0
- data/lib/polipus/url_tracker.rb +20 -0
- data/lib/polipus/url_tracker/bloomfilter.rb +27 -0
- data/lib/polipus/url_tracker/redis_set.rb +27 -0
- data/lib/polipus/version.rb +4 -0
- data/polipus.gemspec +39 -0
- data/spec/cassettes/08b228db424a926e1ed6ab63b38d847e.yml +166 -0
- data/spec/cassettes/20aa41f181b49f00078c3ca30bad5afe.yml +166 -0
- data/spec/cassettes/4640919145753505af2d0f8423de37f3.yml +270 -0
- data/spec/cassettes/66aae15a03f4aab8efd15e40d2d7882a.yml +194 -0
- data/spec/cassettes/76b7c197c95a5bf9b1e882c567192d72.yml +183 -0
- data/spec/cassettes/9b1d523b7f5db7214f8a8bd9272cccba.yml +221 -0
- data/spec/cassettes/ab333f89535a2efb284913fede6aa7c7.yml +221 -0
- data/spec/cassettes/ae5d7cffde3f53122cdf79f3d1367e8e.yml +221 -0
- data/spec/cassettes/ffe3d588b6df4b9de35e5a7ccaf5a81b.yml +695 -0
- data/spec/cassettes/http_test.yml +1418 -0
- data/spec/cassettes/http_test_redirect.yml +71 -0
- data/spec/clear.rb +11 -0
- data/spec/http_spec.rb +31 -0
- data/spec/page_spec.rb +22 -0
- data/spec/queue_overflow_manager_spec.rb +89 -0
- data/spec/queue_overflow_spec.rb +71 -0
- data/spec/spec_helper.rb +34 -0
- data/spec/storage_mongo_spec.rb +102 -0
- data/spec/storage_s3_spec.rb +115 -0
- data/spec/url_tracker_spec.rb +28 -0
- metadata +313 -0
data/lib/polipus/http.rb
ADDED
@@ -0,0 +1,195 @@
|
|
1
|
+
require "net/https"
require "stringio"
require "uri"
require "zlib"
require "polipus/page"
require 'http/cookie'
|
5
|
+
|
6
|
+
module Polipus
  # HTTP client used by the crawler: fetches pages, follows redirects,
  # keeps one persistent Net::HTTP connection per host/port, and optionally
  # handles cookies, proxies and gzip-encoded bodies.
  class HTTP
    # Maximum number of redirects to follow on each get_response
    REDIRECT_LIMIT = 5

    def initialize(opts = {})
      # Persistent connections, keyed by host and then by port.
      @connections = {}
      @opts = opts
    end

    #
    # Fetch a single Page from the response of an HTTP request to *url*.
    # Just gets the final destination page.
    #
    def fetch_page(url, referer = nil, depth = nil)
      fetch_pages(url, referer, depth).last
    end

    #
    # Create new Pages from the response of an HTTP request to *url*,
    # including redirects.
    # On error, returns a single Page whose +error+ is the raised exception.
    #
    def fetch_pages(url, referer = nil, depth = nil)
      url = URI(url) unless url.is_a?(URI)
      pages = []
      get(url, referer) do |response, code, location, redirect_to, response_time|
        body = response.body.dup
        # Transparently inflate gzip-compressed response bodies.
        if response.to_hash.fetch('content-encoding', [])[0] == 'gzip'
          gzip = Zlib::GzipReader.new(StringIO.new(body))
          body = gzip.read
        end
        # Bug fix: hand the (possibly gunzipped) +body+ to the Page instead of
        # the raw response body — previously the inflated data was discarded.
        pages << Page.new(location, :body => body,
                                    :code => code,
                                    :headers => response.to_hash,
                                    :referer => referer,
                                    :depth => depth,
                                    :redirect_to => redirect_to,
                                    :response_time => response_time)
      end

      pages
    rescue StandardError => e
      if verbose?
        puts e.inspect
        puts e.backtrace
      end
      [Page.new(url, :error => e)]
    end

    #
    # The maximum number of redirects to follow
    #
    def redirect_limit
      @opts[:redirect_limit] || REDIRECT_LIMIT
    end

    #
    # The user-agent string which will be sent with each request,
    # or nil if no such option is set
    #
    def user_agent
      @opts[:user_agent]
    end

    #
    # The proxy address string
    #
    def proxy_host
      @opts[:proxy_host]
    end

    #
    # The proxy port
    #
    def proxy_port
      @opts[:proxy_port]
    end

    #
    # HTTP read timeout in seconds
    #
    def read_timeout
      @opts[:read_timeout]
    end

    #
    # Does this HTTP client accept cookies from the server?
    #
    def accept_cookies?
      @opts[:accept_cookies]
    end

    #
    # Lazily-created cookie jar shared by all requests made by this client.
    #
    def cookie_jar
      @opts[:cookie_jar] ||= ::HTTP::CookieJar.new
      @opts[:cookie_jar]
    end

    private

    #
    # Retrieve HTTP responses for *url*, including redirects.
    # Yields the response object, response code, URI location, redirect
    # target (or nil) and response time in ms for each hop.
    #
    def get(url, referer = nil)
      limit = redirect_limit
      loc = url
      loop do
        # if redirected to a relative url, merge it with the host of the
        # original request url
        loc = url.merge(loc) if loc.relative?

        response, response_time = get_response(loc, referer)
        code = Integer(response.code)
        redirect_to = response.is_a?(Net::HTTPRedirection) ? URI(response['location']).normalize : nil
        yield response, code, loc, redirect_to, response_time
        limit -= 1
        # Stop when there is no redirect, the target is on another host,
        # or the redirect budget is exhausted.
        break unless redirect_to && allowed?(redirect_to, url) && limit > 0
        loc = redirect_to
      end
    end

    #
    # Get an HTTPResponse for *url*, sending the appropriate User-Agent string.
    # Retries up to 3 times on transient errors, refreshing the connection
    # each time; re-raises once exhausted.
    #
    def get_response(url, referer = nil)
      full_path = url.query.nil? ? url.path : "#{url.path}?#{url.query}"

      opts = {}
      opts['User-Agent'] = user_agent if user_agent
      opts['Referer'] = referer.to_s if referer
      opts['Cookie'] = ::HTTP::Cookie.cookie_value(cookie_jar.cookies(url)) if accept_cookies?

      retries = 0
      begin
        start = Time.now
        # format request
        req = Net::HTTP::Get.new(full_path, opts)
        # HTTP Basic authentication
        req.basic_auth url.user, url.password if url.user
        response = connection(url).request(req)
        finish = Time.now
        response_time = ((finish - start) * 1000).round
        cookie_jar.parse(response["Set-Cookie"], url) if accept_cookies?
        return response, response_time
      rescue Timeout::Error, Net::HTTPBadResponse, EOFError => e
        puts e.inspect if verbose?
        refresh_connection(url)
        retries += 1
        retry unless retries > 3
        # Bug fix: previously this fell through and returned nil, which blew
        # up upstream with a misleading NoMethodError; surface the real error.
        raise e
      end
    end

    # Return the cached connection for url's host/port, creating one on demand.
    def connection(url)
      @connections[url.host] ||= {}

      if conn = @connections[url.host][url.port]
        return conn
      end

      refresh_connection url
    end

    # (Re)open a connection for url's host/port and cache it.
    def refresh_connection(url)
      http = Net::HTTP.new(url.host, url.port, proxy_host, proxy_port)

      http.read_timeout = read_timeout if !!read_timeout

      if url.scheme == 'https'
        http.use_ssl = true
        # NOTE(review): certificate verification is deliberately disabled for
        # crawling; revisit if TLS authenticity matters for your deployment.
        http.verify_mode = OpenSSL::SSL::VERIFY_NONE
      end

      @connections[url.host][url.port] = http.start
    end

    def verbose?
      @opts[:verbose]
    end

    #
    # Allowed to connect to the requested url?
    # True when the redirect target is host-relative or stays on the same host.
    #
    def allowed?(to_url, from_url)
      to_url.host.nil? || (to_url.host == from_url.host)
    end

  end
end
|
data/lib/polipus/page.rb
ADDED
@@ -0,0 +1,219 @@
|
|
1
|
+
require 'nokogiri'
|
2
|
+
require 'json'
|
3
|
+
require 'ostruct'
|
4
|
+
require 'set'
|
5
|
+
module Polipus
  # A fetched (or to-be-fetched) page: URL, HTTP response data, parsed
  # document and link extraction, plus hash/JSON (de)serialization helpers.
  class Page

    # The URL of the page
    attr_reader :url
    # The raw HTTP response body of the page
    attr_reader :body
    # Headers of the HTTP response
    attr_reader :headers
    # URL of the page this one redirected to, if any
    attr_reader :redirect_to
    # Exception object, if one was raised during HTTP#fetch_page
    attr_reader :error
    # Integer response code of the page
    attr_accessor :code
    # Depth of this page from the root of the crawl. This is not necessarily the
    # shortest path; use PageStore#shortest_paths! to find that value.
    attr_accessor :depth
    # URL of the page that brought us to this page
    attr_accessor :referer
    # Response time of the request for this page in milliseconds
    attr_accessor :response_time
    # OpenStruct holding user-defined data
    attr_accessor :user_data

    # Alternative URLs this page is known by (from the :aka param)
    attr_accessor :aliases

    # Extra hostnames treated as part of this page's domain by #in_domain?
    attr_accessor :domain_aliases

    #
    # Create a new page
    #
    def initialize(url, params = {})
      @url = url.kind_of?(URI) ? url : URI(url)
      @code = params[:code]
      @headers = params[:headers] || {}
      @headers['content-type'] ||= ['']
      @aliases = Array(params[:aka]).compact
      @referer = params[:referer]
      @depth = params[:depth] || 0
      @redirect_to = to_absolute(params[:redirect_to])
      @response_time = params[:response_time]
      @body = params[:body]
      @error = params[:error]
      # A page counts as fetched once it carries a response code.
      @fetched = !params[:code].nil?
      @user_data = OpenStruct.new
      # Bug fix: plain || — the previous ||= inserted a :domain_aliases key
      # into the caller's params hash as a side effect.
      @domain_aliases = params[:domain_aliases] || []
    end

    #
    # Array of distinct A tag HREFs from the page
    #
    def links
      return @links.to_a unless @links.nil?
      @links = Set.new
      return [] if !doc

      doc.search("//a[@href]").each do |a|
        u = a['href']
        next if u.nil? or u.empty?
        abs = to_absolute(u) rescue next
        @links << abs if in_domain?(abs)
      end
      @links.to_a
    end

    #
    # Nokogiri document for the HTML body, or nil if the body is missing,
    # not HTML, or fails to parse.
    #
    def doc
      return @doc if @doc
      @doc = Nokogiri::HTML(@body) if @body && html? rescue nil
    end

    #
    # Discard links, a next call of page.links will return an empty array
    #
    def discard_links!
      @links = []
    end

    #
    # Delete the Nokogiri document and response body to conserve memory
    #
    def discard_doc!
      links # force parsing of page links before we trash the document
      @doc = @body = nil
    end

    #
    # Was the page successfully fetched?
    # +true+ if the page was fetched with no error, +false+ otherwise.
    #
    def fetched?
      @fetched
    end

    #
    # The content-type returned by the HTTP request for this page
    #
    def content_type
      headers['content-type'].first
    end

    #
    # Returns +true+ if the page is a HTML document, returns +false+
    # otherwise.
    #
    def html?
      # Bug fix: the '+' in 'xhtml+xml' must be escaped — unescaped it is a
      # regex quantifier, so real application/xhtml+xml types never matched.
      !!(content_type =~ %r{^(text/html|application/xhtml\+xml)\b})
    end

    #
    # Returns +true+ if the page is a HTTP redirect, returns +false+
    # otherwise.
    #
    def redirect?
      # 308 Permanent Redirect (RFC 7538) included alongside the classic 3xx codes.
      (300..308).include?(@code)
    end

    #
    # Returns +true+ if the page was not found (returned 404 code),
    # returns +false+ otherwise.
    #
    def not_found?
      404 == @code
    end

    #
    # Base URI from the HTML doc head element
    # http://www.w3.org/TR/html4/struct/links.html#edef-BASE
    #
    def base
      @base = if doc
        href = doc.search('//head/base/@href')
        URI(href.to_s) unless href.nil? rescue nil
      end unless @base

      return nil if @base && @base.to_s().empty?
      @base
    end

    #
    # Converts relative URL *link* into an absolute URL based on the
    # location of the page
    #
    def to_absolute(link)
      return nil if link.nil?

      # remove anchor
      # Bug fix: URI.encode/URI.decode were removed in Ruby 3; the default
      # parser's escape/unescape are the drop-in equivalents.
      link = URI::DEFAULT_PARSER.escape(URI::DEFAULT_PARSER.unescape(link.to_s.gsub(/#[a-zA-Z0-9_-]*$/, '')))

      relative = URI(link)
      absolute = base ? base.merge(relative) : @url.merge(relative)

      absolute.path = '/' if absolute.path.empty?

      return absolute
    end

    #
    # Returns +true+ if *uri* is in the same domain as the page, returns
    # +false+ otherwise
    #
    def in_domain?(uri)
      @domain_aliases ||= []
      uri.host == @url.host || @domain_aliases.include?(uri.host)
    end

    # Serialize the page to a plain Hash (headers are Marshal-dumped).
    def to_hash
      {'url' => @url.to_s,
       'headers' => Marshal.dump(@headers),
       'body' => @body,
       'links' => links.map(&:to_s),
       'code' => @code,
       'depth' => @depth,
       'referer' => @referer.to_s,
       'redirect_to' => @redirect_to.to_s,
       'response_time' => @response_time,
       'fetched' => @fetched,
       'user_data' => @user_data.nil? ? {} : @user_data.marshal_dump
      }
    end

    # JSON representation with nil/empty values stripped out.
    # Accepts (and ignores) generator state args so the JSON library can
    # call it when this page is nested inside another structure.
    def to_json(*_args)
      th = to_hash.dup
      th.each {|k,v| th.delete(k) if v.nil? || (v.respond_to?(:empty?) && v.empty?)}
      th.delete('headers') if content_type.empty?
      th.to_json
    end

    # Rebuild a Page from the Hash produced by #to_hash.
    def self.from_hash(hash)
      page = self.new(URI(hash['url']))
      {'@headers' => hash['headers'] ? Marshal.load(hash['headers']) : {'content-type' => ['']},
       '@body' => hash['body'],
       '@links' => hash['links'] ? hash['links'].map { |link| URI(link) } : [],
       '@code' => hash['code'].to_i,
       '@depth' => hash['depth'].to_i,
       '@referer' => hash['referer'],
       '@redirect_to' => (!!hash['redirect_to'] && !hash['redirect_to'].empty?) ? URI(hash['redirect_to']) : nil,
       '@response_time' => hash['response_time'].to_i,
       '@fetched' => hash['fetched'],
       '@user_data' => hash['user_data'] ? OpenStruct.new(hash['user_data']) : nil
      }.each do |var, value|
        page.instance_variable_set(var, value)
      end
      page
    end

    # Rebuild a Page from the JSON produced by #to_json.
    def self.from_json(json)
      hash = JSON.parse json
      self.from_hash hash
    end
  end
end
|
@@ -0,0 +1,25 @@
|
|
1
|
+
module Polipus
  module Plugin
    # Crawler plugin that, when enabled with :reset => true, wipes the
    # url tracker, the storage and the queue before the crawl starts.
    class Cleaner

      def initialize(options = {})
        # Bug fix: plain || — the previous ||= mutated the caller's options
        # hash by inserting a :reset key as a side effect.
        @reset = options[:reset] || false
      end

      # Crawler plugin hook. Returns nil when the plugin is disabled,
      # otherwise a Proc (evaluated by the crawler in its own context,
      # where url_tracker/storage/queue_factory/@options resolve).
      def on_initialize crawler
        crawler.logger.info {"Cleaner plugin loaded"}
        unless @reset
          # "disabled" typo fixed in the log message.
          crawler.logger.info {"Cleaner plugin is disabled, add :reset => true to the plugin if you really know what you are doing"}
          return nil
        end
        crawler.logger.info {"Cleaning all: url_tracker, storage, queue"}
        Proc.new {
          url_tracker.clear
          storage.clear
          queue_factory.clear
          @options[:queue_overflow_adapter].clear if @options[:queue_overflow_adapter]
        }
      end
    end
  end
end
|