polipus 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (55)
  1. checksums.yaml +15 -0
  2. data/.document +5 -0
  3. data/.gitignore +53 -0
  4. data/.rspec +2 -0
  5. data/Gemfile +3 -0
  6. data/LICENSE.txt +20 -0
  7. data/README.md +55 -0
  8. data/README.rdoc +3 -0
  9. data/Rakefile +9 -0
  10. data/examples/basic.rb +58 -0
  11. data/examples/survival.rb +9 -0
  12. data/lib/polipus.rb +451 -0
  13. data/lib/polipus/http.rb +195 -0
  14. data/lib/polipus/page.rb +219 -0
  15. data/lib/polipus/plugin.rb +13 -0
  16. data/lib/polipus/plugins/cleaner.rb +25 -0
  17. data/lib/polipus/plugins/sample.rb +17 -0
  18. data/lib/polipus/plugins/sleeper.rb +22 -0
  19. data/lib/polipus/queue_overflow.rb +24 -0
  20. data/lib/polipus/queue_overflow/base.rb +6 -0
  21. data/lib/polipus/queue_overflow/dev_null_queue.rb +33 -0
  22. data/lib/polipus/queue_overflow/manager.rb +50 -0
  23. data/lib/polipus/queue_overflow/mongo_queue.rb +61 -0
  24. data/lib/polipus/queue_overflow/mongo_queue_capped.rb +28 -0
  25. data/lib/polipus/storage.rb +31 -0
  26. data/lib/polipus/storage/base.rb +17 -0
  27. data/lib/polipus/storage/dev_null.rb +35 -0
  28. data/lib/polipus/storage/mongo_store.rb +86 -0
  29. data/lib/polipus/storage/s3_store.rb +100 -0
  30. data/lib/polipus/url_tracker.rb +20 -0
  31. data/lib/polipus/url_tracker/bloomfilter.rb +27 -0
  32. data/lib/polipus/url_tracker/redis_set.rb +27 -0
  33. data/lib/polipus/version.rb +4 -0
  34. data/polipus.gemspec +39 -0
  35. data/spec/cassettes/08b228db424a926e1ed6ab63b38d847e.yml +166 -0
  36. data/spec/cassettes/20aa41f181b49f00078c3ca30bad5afe.yml +166 -0
  37. data/spec/cassettes/4640919145753505af2d0f8423de37f3.yml +270 -0
  38. data/spec/cassettes/66aae15a03f4aab8efd15e40d2d7882a.yml +194 -0
  39. data/spec/cassettes/76b7c197c95a5bf9b1e882c567192d72.yml +183 -0
  40. data/spec/cassettes/9b1d523b7f5db7214f8a8bd9272cccba.yml +221 -0
  41. data/spec/cassettes/ab333f89535a2efb284913fede6aa7c7.yml +221 -0
  42. data/spec/cassettes/ae5d7cffde3f53122cdf79f3d1367e8e.yml +221 -0
  43. data/spec/cassettes/ffe3d588b6df4b9de35e5a7ccaf5a81b.yml +695 -0
  44. data/spec/cassettes/http_test.yml +1418 -0
  45. data/spec/cassettes/http_test_redirect.yml +71 -0
  46. data/spec/clear.rb +11 -0
  47. data/spec/http_spec.rb +31 -0
  48. data/spec/page_spec.rb +22 -0
  49. data/spec/queue_overflow_manager_spec.rb +89 -0
  50. data/spec/queue_overflow_spec.rb +71 -0
  51. data/spec/spec_helper.rb +34 -0
  52. data/spec/storage_mongo_spec.rb +102 -0
  53. data/spec/storage_s3_spec.rb +115 -0
  54. data/spec/url_tracker_spec.rb +28 -0
  55. metadata +313 -0
@@ -0,0 +1,195 @@
1
+ require "net/https"
2
+ require "polipus/page"
3
+ require "zlib"
4
+ require 'http/cookie'
5
+
6
+ module Polipus
7
+ class HTTP
8
+ # Maximum number of redirects to follow on each get_response
9
+ REDIRECT_LIMIT = 5
10
+
11
+ def initialize(opts = {})
12
+ @connections = {}
13
+ @opts = opts
14
+ end
15
+
16
+ #
17
+ # Fetch a single Page from the response of an HTTP request to *url*.
18
+ # Just gets the final destination page.
19
+ #
20
+ def fetch_page(url, referer = nil, depth = nil)
21
+ fetch_pages(url, referer, depth).last
22
+ end
23
+
24
+ #
25
+ # Create new Pages from the response of an HTTP request to *url*,
26
+ # including redirects
27
+ #
28
+ def fetch_pages(url, referer = nil, depth = nil)
29
+ begin
30
+ url = URI(url) unless url.is_a?(URI)
31
+ pages = []
32
+ get(url, referer) do |response, code, location, redirect_to, response_time|
33
+ body = response.body.dup
34
+ if response.to_hash.fetch('content-encoding', [])[0] == 'gzip'
35
+ gzip = Zlib::GzipReader.new(StringIO.new(body))
36
+ body = gzip.read
37
+ end
38
+ pages << Page.new(location, :body => response.body.dup,
39
+ :code => code,
40
+ :headers => response.to_hash,
41
+ :referer => referer,
42
+ :depth => depth,
43
+ :redirect_to => redirect_to,
44
+ :response_time => response_time)
45
+ end
46
+
47
+ return pages
48
+ rescue StandardError => e
49
+ if verbose?
50
+ puts e.inspect
51
+ puts e.backtrace
52
+ end
53
+ return [Page.new(url, :error => e)]
54
+ end
55
+ end
56
+
57
+ #
58
+ # The maximum number of redirects to follow
59
+ #
60
+ def redirect_limit
61
+ @opts[:redirect_limit] || REDIRECT_LIMIT
62
+ end
63
+
64
+ #
65
+ # The user-agent string which will be sent with each request,
66
+ # or nil if no such option is set
67
+ #
68
+ def user_agent
69
+ @opts[:user_agent]
70
+ end
71
+
72
+
73
+ #
74
+ # The proxy address string
75
+ #
76
+ def proxy_host
77
+ @opts[:proxy_host]
78
+ end
79
+
80
+ #
81
+ # The proxy port
82
+ #
83
+ def proxy_port
84
+ @opts[:proxy_port]
85
+ end
86
+
87
+ #
88
+ # HTTP read timeout in seconds
89
+ #
90
+ def read_timeout
91
+ @opts[:read_timeout]
92
+ end
93
+
94
+ # Does this HTTP client accept cookies from the server?
95
+ #
96
+ def accept_cookies?
97
+ @opts[:accept_cookies]
98
+ end
99
+
100
+ def cookie_jar
101
+ @opts[:cookie_jar] ||= ::HTTP::CookieJar.new
102
+ @opts[:cookie_jar]
103
+ end
104
+
105
+ private
106
+
107
+ #
108
+ # Retrieve HTTP responses for *url*, including redirects.
109
+ # Yields the response object, response code, and URI location
110
+ # for each response.
111
+ #
112
+ def get(url, referer = nil)
113
+ limit = redirect_limit
114
+ loc = url
115
+ begin
116
+ # if redirected to a relative url, merge it with the host of the original
117
+ # request url
118
+ loc = url.merge(loc) if loc.relative?
119
+
120
+ response, response_time = get_response(loc, referer)
121
+ code = Integer(response.code)
122
+ redirect_to = response.is_a?(Net::HTTPRedirection) ? URI(response['location']).normalize : nil
123
+ yield response, code, loc, redirect_to, response_time
124
+ limit -= 1
125
+ end while (loc = redirect_to) && allowed?(redirect_to, url) && limit > 0
126
+ end
127
+
128
+ #
129
+ # Get an HTTPResponse for *url*, sending the appropriate User-Agent string
130
+ #
131
+ def get_response(url, referer = nil)
132
+ full_path = url.query.nil? ? url.path : "#{url.path}?#{url.query}"
133
+
134
+ opts = {}
135
+ opts['User-Agent'] = user_agent if user_agent
136
+ opts['Referer'] = referer.to_s if referer
137
+ opts['Cookie'] = ::HTTP::Cookie.cookie_value(cookie_jar.cookies(url)) if accept_cookies?
138
+
139
+ retries = 0
140
+ begin
141
+ start = Time.now()
142
+ # format request
143
+ req = Net::HTTP::Get.new(full_path, opts)
144
+ # HTTP Basic authentication
145
+ req.basic_auth url.user, url.password if url.user
146
+ response = connection(url).request(req)
147
+ finish = Time.now()
148
+ response_time = ((finish - start) * 1000).round
149
+ cookie_jar.parse(response["Set-Cookie"], url) if accept_cookies?
150
+ return response, response_time
151
+ rescue Timeout::Error, Net::HTTPBadResponse, EOFError => e
152
+
153
+ puts e.inspect if verbose?
154
+ refresh_connection(url)
155
+ retries += 1
156
+ retry unless retries > 3
157
+ end
158
+ end
159
+
160
+ def connection(url)
161
+ @connections[url.host] ||= {}
162
+
163
+ if conn = @connections[url.host][url.port]
164
+ return conn
165
+ end
166
+
167
+ refresh_connection url
168
+ end
169
+
170
+ def refresh_connection(url)
171
+ http = Net::HTTP.new(url.host, url.port, proxy_host, proxy_port)
172
+
173
+ http.read_timeout = read_timeout if !!read_timeout
174
+
175
+ if url.scheme == 'https'
176
+ http.use_ssl = true
177
+ http.verify_mode = OpenSSL::SSL::VERIFY_NONE
178
+ end
179
+
180
+ @connections[url.host][url.port] = http.start
181
+ end
182
+
183
+ def verbose?
184
+ @opts[:verbose]
185
+ end
186
+
187
+ #
188
+ # Allowed to connect to the requested url?
189
+ #
190
+ def allowed?(to_url, from_url)
191
+ to_url.host.nil? || (to_url.host == from_url.host)
192
+ end
193
+
194
+ end
195
+ end
@@ -0,0 +1,219 @@
1
+ require 'nokogiri'
2
+ require 'json'
3
+ require 'ostruct'
4
+ require 'set'
5
+ module Polipus
6
+ class Page
7
+
8
+ # The URL of the page
9
+ attr_reader :url
10
+ # The raw HTTP response body of the page
11
+ attr_reader :body
12
+ # Headers of the HTTP response
13
+ attr_reader :headers
14
+ # URL of the page this one redirected to, if any
15
+ attr_reader :redirect_to
16
+ # Exception object, if one was raised during HTTP#fetch_page
17
+ attr_reader :error
18
+ # Integer response code of the page
19
+ attr_accessor :code
20
+ # Depth of this page from the root of the crawl. This is not necessarily the
21
+ # shortest path; use PageStore#shortest_paths! to find that value.
22
+ attr_accessor :depth
23
+ # URL of the page that brought us to this page
24
+ attr_accessor :referer
25
+ # Response time of the request for this page in milliseconds
26
+ attr_accessor :response_time
27
+ # OpenStruct it holds users defined data
28
+ attr_accessor :user_data
29
+
30
+ attr_accessor :aliases
31
+
32
+ attr_accessor :domain_aliases
33
+
34
+ #
35
+ # Create a new page
36
+ #
37
+ def initialize(url, params = {})
38
+ @url = url.kind_of?(URI) ? url : URI(url)
39
+ @code = params[:code]
40
+ @headers = params[:headers] || {}
41
+ @headers['content-type'] ||= ['']
42
+ @aliases = Array(params[:aka]).compact
43
+ @referer = params[:referer]
44
+ @depth = params[:depth] || 0
45
+ @redirect_to = to_absolute(params[:redirect_to])
46
+ @response_time = params[:response_time]
47
+ @body = params[:body]
48
+ @error = params[:error]
49
+ @fetched = !params[:code].nil?
50
+ @user_data = OpenStruct.new
51
+ @domain_aliases = params[:domain_aliases] ||= []
52
+ end
53
+
54
+ #
55
+ # Array of distinct A tag HREFs from the page
56
+ #
57
+ def links
58
+ return @links.to_a unless @links.nil?
59
+ @links = Set.new
60
+ return [] if !doc
61
+
62
+ doc.search("//a[@href]").each do |a|
63
+ u = a['href']
64
+ next if u.nil? or u.empty?
65
+ abs = to_absolute(u) rescue next
66
+ @links << abs if in_domain?(abs)
67
+ end
68
+ @links.to_a
69
+ end
70
+
71
+ #
72
+ # Nokogiri document for the HTML body
73
+ #
74
+ def doc
75
+ return @doc if @doc
76
+ @doc = Nokogiri::HTML(@body) if @body && html? rescue nil
77
+ end
78
+
79
+ #
80
+ # Discard links, a next call of page.links will return an empty array
81
+ #
82
+ def discard_links!
83
+ @links = []
84
+ end
85
+
86
+ #
87
+ # Delete the Nokogiri document and response body to conserve memory
88
+ #
89
+ def discard_doc!
90
+ links # force parsing of page links before we trash the document
91
+ @doc = @body = nil
92
+ end
93
+
94
+ #
95
+ # Was the page successfully fetched?
96
+ # +true+ if the page was fetched with no error, +false+ otherwise.
97
+ #
98
+ def fetched?
99
+ @fetched
100
+ end
101
+
102
+ #
103
+ # The content-type returned by the HTTP request for this page
104
+ #
105
+ def content_type
106
+ headers['content-type'].first
107
+ end
108
+
109
+ #
110
+ # Returns +true+ if the page is a HTML document, returns +false+
111
+ # otherwise.
112
+ #
113
+ def html?
114
+ !!(content_type =~ %r{^(text/html|application/xhtml+xml)\b})
115
+ end
116
+
117
+ #
118
+ # Returns +true+ if the page is a HTTP redirect, returns +false+
119
+ # otherwise.
120
+ #
121
+ def redirect?
122
+ (300..307).include?(@code)
123
+ end
124
+
125
+ #
126
+ # Returns +true+ if the page was not found (returned 404 code),
127
+ # returns +false+ otherwise.
128
+ #
129
+ def not_found?
130
+ 404 == @code
131
+ end
132
+
133
+ #
134
+ # Base URI from the HTML doc head element
135
+ # http://www.w3.org/TR/html4/struct/links.html#edef-BASE
136
+ #
137
+ def base
138
+ @base = if doc
139
+ href = doc.search('//head/base/@href')
140
+ URI(href.to_s) unless href.nil? rescue nil
141
+ end unless @base
142
+
143
+ return nil if @base && @base.to_s().empty?
144
+ @base
145
+ end
146
+
147
+ #
148
+ # Converts relative URL *link* into an absolute URL based on the
149
+ # location of the page
150
+ #
151
+ def to_absolute(link)
152
+ return nil if link.nil?
153
+
154
+ # remove anchor
155
+ link = URI.encode(URI.decode(link.to_s.gsub(/#[a-zA-Z0-9_-]*$/,'')))
156
+
157
+ relative = URI(link)
158
+ absolute = base ? base.merge(relative) : @url.merge(relative)
159
+
160
+ absolute.path = '/' if absolute.path.empty?
161
+
162
+ return absolute
163
+ end
164
+
165
+ #
166
+ # Returns +true+ if *uri* is in the same domain as the page, returns
167
+ # +false+ otherwise
168
+ #
169
+ def in_domain?(uri)
170
+ @domain_aliases ||= []
171
+ uri.host == @url.host || @domain_aliases.include?(uri.host)
172
+ end
173
+
174
+ def to_hash
175
+ {'url' => @url.to_s,
176
+ 'headers' => Marshal.dump(@headers),
177
+ 'body' => @body,
178
+ 'links' => links.map(&:to_s),
179
+ 'code' => @code,
180
+ 'depth' => @depth,
181
+ 'referer' => @referer.to_s,
182
+ 'redirect_to' => @redirect_to.to_s,
183
+ 'response_time' => @response_time,
184
+ 'fetched' => @fetched,
185
+ 'user_data' => @user_data.nil? ? {} : @user_data.marshal_dump
186
+ }
187
+ end
188
+
189
+ def to_json
190
+ th = to_hash.dup
191
+ th.each {|k,v| th.delete(k) if v.nil? || (v.respond_to?(:empty?) && v.empty?)}
192
+ th.delete('headers') if content_type.empty?
193
+ th.to_json
194
+ end
195
+
196
+ def self.from_hash(hash)
197
+ page = self.new(URI(hash['url']))
198
+ {'@headers' => hash['headers'] ? Marshal.load(hash['headers']) : {'content-type' => ['']},
199
+ '@body' => hash['body'],
200
+ '@links' => hash['links'] ? hash['links'].map { |link| URI(link) } : [],
201
+ '@code' => hash['code'].to_i,
202
+ '@depth' => hash['depth'].to_i,
203
+ '@referer' => hash['referer'],
204
+ '@redirect_to' => (!!hash['redirect_to'] && !hash['redirect_to'].empty?) ? URI(hash['redirect_to']) : nil,
205
+ '@response_time' => hash['response_time'].to_i,
206
+ '@fetched' => hash['fetched'],
207
+ '@user_data' => hash['user_data'] ? OpenStruct.new(hash['user_data']) : nil
208
+ }.each do |var, value|
209
+ page.instance_variable_set(var, value)
210
+ end
211
+ page
212
+ end
213
+
214
+ def self.from_json(json)
215
+ hash = JSON.parse json
216
+ self.from_hash hash
217
+ end
218
+ end
219
+ end
@@ -0,0 +1,13 @@
1
+ module Polipus
2
+ module Plugin
3
+ @@plugins = {}
4
+ def self.register plugin, options = {}
5
+ o = plugin.new(options)
6
+ @@plugins[o.class.name] = o
7
+ end
8
+
9
+ def self.plugins
10
+ @@plugins
11
+ end
12
+ end
13
+ end
@@ -0,0 +1,25 @@
1
+ module Polipus
2
+ module Plugin
3
+ class Cleaner
4
+
5
+ def initialize(options = {})
6
+ @reset = options[:reset] ||= false
7
+ end
8
+
9
+ def on_initialize crawler
10
+ crawler.logger.info {"Cleaner plugin loaded"}
11
+ unless @reset
12
+ crawler.logger.info {"Cleaner plugin is disable, add :reset => true to the plugin if you really know what you are doing"}
13
+ return nil
14
+ end
15
+ crawler.logger.info {"Cleaning all: url_tracker, storage, queue"}
16
+ Proc.new {
17
+ url_tracker.clear
18
+ storage.clear
19
+ queue_factory.clear
20
+ @options[:queue_overflow_adapter].clear if @options[:queue_overflow_adapter]
21
+ }
22
+ end
23
+ end
24
+ end
25
+ end