polipus 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +15 -0
- data/.document +5 -0
- data/.gitignore +53 -0
- data/.rspec +2 -0
- data/Gemfile +3 -0
- data/LICENSE.txt +20 -0
- data/README.md +55 -0
- data/README.rdoc +3 -0
- data/Rakefile +9 -0
- data/examples/basic.rb +58 -0
- data/examples/survival.rb +9 -0
- data/lib/polipus.rb +451 -0
- data/lib/polipus/http.rb +195 -0
- data/lib/polipus/page.rb +219 -0
- data/lib/polipus/plugin.rb +13 -0
- data/lib/polipus/plugins/cleaner.rb +25 -0
- data/lib/polipus/plugins/sample.rb +17 -0
- data/lib/polipus/plugins/sleeper.rb +22 -0
- data/lib/polipus/queue_overflow.rb +24 -0
- data/lib/polipus/queue_overflow/base.rb +6 -0
- data/lib/polipus/queue_overflow/dev_null_queue.rb +33 -0
- data/lib/polipus/queue_overflow/manager.rb +50 -0
- data/lib/polipus/queue_overflow/mongo_queue.rb +61 -0
- data/lib/polipus/queue_overflow/mongo_queue_capped.rb +28 -0
- data/lib/polipus/storage.rb +31 -0
- data/lib/polipus/storage/base.rb +17 -0
- data/lib/polipus/storage/dev_null.rb +35 -0
- data/lib/polipus/storage/mongo_store.rb +86 -0
- data/lib/polipus/storage/s3_store.rb +100 -0
- data/lib/polipus/url_tracker.rb +20 -0
- data/lib/polipus/url_tracker/bloomfilter.rb +27 -0
- data/lib/polipus/url_tracker/redis_set.rb +27 -0
- data/lib/polipus/version.rb +4 -0
- data/polipus.gemspec +39 -0
- data/spec/cassettes/08b228db424a926e1ed6ab63b38d847e.yml +166 -0
- data/spec/cassettes/20aa41f181b49f00078c3ca30bad5afe.yml +166 -0
- data/spec/cassettes/4640919145753505af2d0f8423de37f3.yml +270 -0
- data/spec/cassettes/66aae15a03f4aab8efd15e40d2d7882a.yml +194 -0
- data/spec/cassettes/76b7c197c95a5bf9b1e882c567192d72.yml +183 -0
- data/spec/cassettes/9b1d523b7f5db7214f8a8bd9272cccba.yml +221 -0
- data/spec/cassettes/ab333f89535a2efb284913fede6aa7c7.yml +221 -0
- data/spec/cassettes/ae5d7cffde3f53122cdf79f3d1367e8e.yml +221 -0
- data/spec/cassettes/ffe3d588b6df4b9de35e5a7ccaf5a81b.yml +695 -0
- data/spec/cassettes/http_test.yml +1418 -0
- data/spec/cassettes/http_test_redirect.yml +71 -0
- data/spec/clear.rb +11 -0
- data/spec/http_spec.rb +31 -0
- data/spec/page_spec.rb +22 -0
- data/spec/queue_overflow_manager_spec.rb +89 -0
- data/spec/queue_overflow_spec.rb +71 -0
- data/spec/spec_helper.rb +34 -0
- data/spec/storage_mongo_spec.rb +102 -0
- data/spec/storage_s3_spec.rb +115 -0
- data/spec/url_tracker_spec.rb +28 -0
- metadata +313 -0
data/lib/polipus/http.rb
ADDED
@@ -0,0 +1,195 @@
require "net/https"
require "polipus/page"
require "zlib"
require 'http/cookie'

module Polipus
  # HTTP client used by the crawler.
  #
  # Keeps one persistent Net::HTTP connection per host:port pair,
  # follows redirects (up to #redirect_limit), transparently inflates
  # gzip-encoded response bodies and can optionally store/send cookies.
  class HTTP
    # Maximum number of redirects to follow on each get_response
    REDIRECT_LIMIT = 5

    def initialize(opts = {})
      @connections = {} # { host => { port => Net::HTTP } }
      @opts = opts
    end

    #
    # Fetch a single Page from the response of an HTTP request to *url*.
    # Just gets the final destination page.
    #
    def fetch_page(url, referer = nil, depth = nil)
      fetch_pages(url, referer, depth).last
    end

    #
    # Create new Pages from the response of an HTTP request to *url*,
    # including redirects. On any error a single Page carrying the
    # exception (Page#error) is returned.
    #
    def fetch_pages(url, referer = nil, depth = nil)
      begin
        url = URI(url) unless url.is_a?(URI)
        pages = []
        get(url, referer) do |response, code, location, redirect_to, response_time|
          body = response.body.dup
          # Transparently inflate gzip-compressed responses.
          if response.to_hash.fetch('content-encoding', [])[0] == 'gzip'
            gzip = Zlib::GzipReader.new(StringIO.new(body))
            body = gzip.read
          end
          # BUGFIX: pass the (possibly inflated) body. The original code
          # passed response.body.dup here, silently discarding the gzip
          # decompression performed just above.
          pages << Page.new(location, :body => body,
                                      :code => code,
                                      :headers => response.to_hash,
                                      :referer => referer,
                                      :depth => depth,
                                      :redirect_to => redirect_to,
                                      :response_time => response_time)
        end

        return pages
      rescue StandardError => e
        if verbose?
          puts e.inspect
          puts e.backtrace
        end
        # Report failures back as a single error Page instead of raising.
        return [Page.new(url, :error => e)]
      end
    end

    #
    # The maximum number of redirects to follow
    #
    def redirect_limit
      @opts[:redirect_limit] || REDIRECT_LIMIT
    end

    #
    # The user-agent string which will be sent with each request,
    # or nil if no such option is set
    #
    def user_agent
      @opts[:user_agent]
    end

    #
    # The proxy address string
    #
    def proxy_host
      @opts[:proxy_host]
    end

    #
    # The proxy port
    #
    def proxy_port
      @opts[:proxy_port]
    end

    #
    # HTTP read timeout in seconds
    #
    def read_timeout
      @opts[:read_timeout]
    end

    #
    # Does this HTTP client accept cookies from the server?
    #
    def accept_cookies?
      @opts[:accept_cookies]
    end

    # Lazily-created jar holding the cookies collected during the crawl.
    def cookie_jar
      @opts[:cookie_jar] ||= ::HTTP::CookieJar.new
      @opts[:cookie_jar]
    end

    private

    #
    # Retrieve HTTP responses for *url*, including redirects.
    # Yields the response object, response code, URI location,
    # redirect target (or nil) and response time for each response.
    #
    def get(url, referer = nil)
      limit = redirect_limit
      loc = url
      begin
        # if redirected to a relative url, merge it with the host of the original
        # request url
        loc = url.merge(loc) if loc.relative?

        response, response_time = get_response(loc, referer)
        code = Integer(response.code)
        redirect_to = response.is_a?(Net::HTTPRedirection) ? URI(response['location']).normalize : nil
        yield response, code, loc, redirect_to, response_time
        limit -= 1
      end while (loc = redirect_to) && allowed?(redirect_to, url) && limit > 0
    end

    #
    # Get an HTTPResponse for *url*, sending the appropriate User-Agent string.
    # Returns [response, response_time_ms]; retries transient failures up to
    # 3 times before giving up.
    #
    def get_response(url, referer = nil)
      full_path = url.query.nil? ? url.path : "#{url.path}?#{url.query}"

      opts = {}
      opts['User-Agent'] = user_agent if user_agent
      opts['Referer'] = referer.to_s if referer
      opts['Cookie'] = ::HTTP::Cookie.cookie_value(cookie_jar.cookies(url)) if accept_cookies?

      retries = 0
      begin
        start = Time.now()
        # format request
        req = Net::HTTP::Get.new(full_path, opts)
        # HTTP Basic authentication
        req.basic_auth url.user, url.password if url.user
        response = connection(url).request(req)
        finish = Time.now()
        response_time = ((finish - start) * 1000).round
        cookie_jar.parse(response["Set-Cookie"], url) if accept_cookies?
        return response, response_time
      rescue Timeout::Error, Net::HTTPBadResponse, EOFError => e
        puts e.inspect if verbose?
        refresh_connection(url)
        retries += 1
        retry unless retries > 3
        # BUGFIX: the original fell through here and implicitly returned nil
        # once retries were exhausted, which later surfaced as a confusing
        # NoMethodError in #get. Re-raise so fetch_pages reports the real
        # error (still caught by its rescue StandardError).
        raise e
      end
    end

    # Return the cached connection for url's host:port, opening one on demand.
    def connection(url)
      @connections[url.host] ||= {}

      if conn = @connections[url.host][url.port]
        return conn
      end

      refresh_connection url
    end

    # (Re)open the connection for url's host:port and cache it.
    def refresh_connection(url)
      http = Net::HTTP.new(url.host, url.port, proxy_host, proxy_port)

      http.read_timeout = read_timeout if !!read_timeout

      if url.scheme == 'https'
        http.use_ssl = true
        # NOTE(review): certificate verification is disabled. Acceptable for
        # a generic crawler, but a MITM risk if fetched data is trusted.
        http.verify_mode = OpenSSL::SSL::VERIFY_NONE
      end

      @connections[url.host][url.port] = http.start
    end

    def verbose?
      @opts[:verbose]
    end

    #
    # Allowed to connect to the requested url?
    # Redirects are only followed within the same host.
    #
    def allowed?(to_url, from_url)
      to_url.host.nil? || (to_url.host == from_url.host)
    end

  end
end
data/lib/polipus/page.rb
ADDED
@@ -0,0 +1,219 @@
require 'nokogiri'
require 'json'
require 'ostruct'
require 'set'
module Polipus
  # A single fetched (or attempted) page: URL, raw body, headers,
  # extracted same-domain links and serialization to/from Hash / JSON.
  class Page

    # The URL of the page
    attr_reader :url
    # The raw HTTP response body of the page
    attr_reader :body
    # Headers of the HTTP response
    attr_reader :headers
    # URL of the page this one redirected to, if any
    attr_reader :redirect_to
    # Exception object, if one was raised during HTTP#fetch_page
    attr_reader :error
    # Integer response code of the page
    attr_accessor :code
    # Depth of this page from the root of the crawl. This is not necessarily the
    # shortest path; use PageStore#shortest_paths! to find that value.
    attr_accessor :depth
    # URL of the page that brought us to this page
    attr_accessor :referer
    # Response time of the request for this page in milliseconds
    attr_accessor :response_time
    # OpenStruct it holds users defined data
    attr_accessor :user_data

    attr_accessor :aliases

    attr_accessor :domain_aliases

    #
    # Create a new page
    #
    def initialize(url, params = {})
      @url = url.kind_of?(URI) ? url : URI(url)
      @code = params[:code]
      @headers = params[:headers] || {}
      @headers['content-type'] ||= ['']
      @aliases = Array(params[:aka]).compact
      @referer = params[:referer]
      @depth = params[:depth] || 0
      @redirect_to = to_absolute(params[:redirect_to])
      @response_time = params[:response_time]
      @body = params[:body]
      @error = params[:error]
      # A page counts as fetched once it carries a response code.
      @fetched = !params[:code].nil?
      @user_data = OpenStruct.new
      # BUGFIX: the original used `params[:domain_aliases] ||= []`, which
      # mutated the caller's params hash as a side effect.
      @domain_aliases = params[:domain_aliases] || []
    end

    #
    # Array of distinct A tag HREFs from the page
    #
    def links
      return @links.to_a unless @links.nil?
      @links = Set.new
      return [] if !doc

      doc.search("//a[@href]").each do |a|
        u = a['href']
        next if u.nil? or u.empty?
        abs = to_absolute(u) rescue next
        # Only keep links inside the crawled domain (or its aliases).
        @links << abs if in_domain?(abs)
      end
      @links.to_a
    end

    #
    # Nokogiri document for the HTML body
    #
    def doc
      return @doc if @doc
      @doc = Nokogiri::HTML(@body) if @body && html? rescue nil
    end

    #
    # Discard links, a next call of page.links will return an empty array
    #
    def discard_links!
      @links = []
    end

    #
    # Delete the Nokogiri document and response body to conserve memory
    #
    def discard_doc!
      links # force parsing of page links before we trash the document
      @doc = @body = nil
    end

    #
    # Was the page successfully fetched?
    # +true+ if the page was fetched with no error, +false+ otherwise.
    #
    def fetched?
      @fetched
    end

    #
    # The content-type returned by the HTTP request for this page
    #
    def content_type
      headers['content-type'].first
    end

    #
    # Returns +true+ if the page is a HTML document, returns +false+
    # otherwise.
    #
    def html?
      # BUGFIX: the `+` in application/xhtml+xml must be escaped; unescaped
      # it is a regex quantifier ("xhtml" then one or more "l") and the
      # pattern could never match the real xhtml content type.
      !!(content_type =~ %r{^(text/html|application/xhtml\+xml)\b})
    end

    #
    # Returns +true+ if the page is a HTTP redirect, returns +false+
    # otherwise.
    #
    def redirect?
      (300..307).include?(@code)
    end

    #
    # Returns +true+ if the page was not found (returned 404 code),
    # returns +false+ otherwise.
    #
    def not_found?
      404 == @code
    end

    #
    # Base URI from the HTML doc head element
    # http://www.w3.org/TR/html4/struct/links.html#edef-BASE
    #
    def base
      @base = if doc
        href = doc.search('//head/base/@href')
        URI(href.to_s) unless href.nil? rescue nil
      end unless @base

      return nil if @base && @base.to_s().empty?
      @base
    end

    #
    # Converts relative URL *link* into an absolute URL based on the
    # location of the page
    #
    def to_absolute(link)
      return nil if link.nil?

      # remove anchor
      # NOTE(review): URI.encode/URI.decode are deprecated (removed in
      # Ruby 3); kept here to preserve the original escaping behavior.
      link = URI.encode(URI.decode(link.to_s.gsub(/#[a-zA-Z0-9_-]*$/,'')))

      relative = URI(link)
      absolute = base ? base.merge(relative) : @url.merge(relative)

      absolute.path = '/' if absolute.path.empty?

      return absolute
    end

    #
    # Returns +true+ if *uri* is in the same domain as the page, returns
    # +false+ otherwise
    #
    def in_domain?(uri)
      @domain_aliases ||= []
      uri.host == @url.host || @domain_aliases.include?(uri.host)
    end

    # Serialize to a plain Hash (headers are Marshal-dumped).
    def to_hash
      {'url' => @url.to_s,
       'headers' => Marshal.dump(@headers),
       'body' => @body,
       'links' => links.map(&:to_s),
       'code' => @code,
       'depth' => @depth,
       'referer' => @referer.to_s,
       'redirect_to' => @redirect_to.to_s,
       'response_time' => @response_time,
       'fetched' => @fetched,
       'user_data' => @user_data.nil? ? {} : @user_data.marshal_dump
      }
    end

    # JSON form of #to_hash with nil/empty entries stripped.
    def to_json
      th = to_hash.dup
      th.each {|k,v| th.delete(k) if v.nil? || (v.respond_to?(:empty?) && v.empty?)}
      th.delete('headers') if content_type.empty?
      th.to_json
    end

    # Rebuild a Page from a #to_hash-shaped Hash.
    def self.from_hash(hash)
      page = self.new(URI(hash['url']))
      {'@headers' => hash['headers'] ? Marshal.load(hash['headers']) : {'content-type' => ['']},
       '@body' => hash['body'],
       '@links' => hash['links'] ? hash['links'].map { |link| URI(link) } : [],
       '@code' => hash['code'].to_i,
       '@depth' => hash['depth'].to_i,
       '@referer' => hash['referer'],
       '@redirect_to' => (!!hash['redirect_to'] && !hash['redirect_to'].empty?) ? URI(hash['redirect_to']) : nil,
       '@response_time' => hash['response_time'].to_i,
       '@fetched' => hash['fetched'],
       '@user_data' => hash['user_data'] ? OpenStruct.new(hash['user_data']) : nil
      }.each do |var, value|
        page.instance_variable_set(var, value)
      end
      page
    end

    # Rebuild a Page from its JSON serialization.
    def self.from_json(json)
      hash = JSON.parse json
      self.from_hash hash
    end
  end
end
@@ -0,0 +1,25 @@
module Polipus
  module Plugin
    # Plugin that wipes all crawler state (url tracker, storage, queue and
    # any queue-overflow adapter) when the crawler starts. It is a loaded
    # gun: it does nothing unless explicitly armed with :reset => true.
    class Cleaner

      # @param options [Hash] plugin options
      # @option options [Boolean] :reset must be true to actually clean
      def initialize(options = {})
        # BUGFIX: the original used `options[:reset] ||= false`, mutating
        # the caller's options hash; fetch reads without side effects.
        @reset = options.fetch(:reset, false)
      end

      # Called by the crawler during initialization.
      # Returns a Proc (evaluated later in the crawler's context, where
      # url_tracker / storage / queue_factory / @options are defined) that
      # performs the cleanup, or nil when the plugin is not armed.
      def on_initialize crawler
        crawler.logger.info {"Cleaner plugin loaded"}
        unless @reset
          # Typo fixed: "is disable" -> "is disabled"
          crawler.logger.info {"Cleaner plugin is disabled, add :reset => true to the plugin if you really know what you are doing"}
          return nil
        end
        crawler.logger.info {"Cleaning all: url_tracker, storage, queue"}
        Proc.new {
          url_tracker.clear
          storage.clear
          queue_factory.clear
          @options[:queue_overflow_adapter].clear if @options[:queue_overflow_adapter]
        }
      end
    end
  end
end