polipus 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55)
  1. checksums.yaml +15 -0
  2. data/.document +5 -0
  3. data/.gitignore +53 -0
  4. data/.rspec +2 -0
  5. data/Gemfile +3 -0
  6. data/LICENSE.txt +20 -0
  7. data/README.md +55 -0
  8. data/README.rdoc +3 -0
  9. data/Rakefile +9 -0
  10. data/examples/basic.rb +58 -0
  11. data/examples/survival.rb +9 -0
  12. data/lib/polipus.rb +451 -0
  13. data/lib/polipus/http.rb +195 -0
  14. data/lib/polipus/page.rb +219 -0
  15. data/lib/polipus/plugin.rb +13 -0
  16. data/lib/polipus/plugins/cleaner.rb +25 -0
  17. data/lib/polipus/plugins/sample.rb +17 -0
  18. data/lib/polipus/plugins/sleeper.rb +22 -0
  19. data/lib/polipus/queue_overflow.rb +24 -0
  20. data/lib/polipus/queue_overflow/base.rb +6 -0
  21. data/lib/polipus/queue_overflow/dev_null_queue.rb +33 -0
  22. data/lib/polipus/queue_overflow/manager.rb +50 -0
  23. data/lib/polipus/queue_overflow/mongo_queue.rb +61 -0
  24. data/lib/polipus/queue_overflow/mongo_queue_capped.rb +28 -0
  25. data/lib/polipus/storage.rb +31 -0
  26. data/lib/polipus/storage/base.rb +17 -0
  27. data/lib/polipus/storage/dev_null.rb +35 -0
  28. data/lib/polipus/storage/mongo_store.rb +86 -0
  29. data/lib/polipus/storage/s3_store.rb +100 -0
  30. data/lib/polipus/url_tracker.rb +20 -0
  31. data/lib/polipus/url_tracker/bloomfilter.rb +27 -0
  32. data/lib/polipus/url_tracker/redis_set.rb +27 -0
  33. data/lib/polipus/version.rb +4 -0
  34. data/polipus.gemspec +39 -0
  35. data/spec/cassettes/08b228db424a926e1ed6ab63b38d847e.yml +166 -0
  36. data/spec/cassettes/20aa41f181b49f00078c3ca30bad5afe.yml +166 -0
  37. data/spec/cassettes/4640919145753505af2d0f8423de37f3.yml +270 -0
  38. data/spec/cassettes/66aae15a03f4aab8efd15e40d2d7882a.yml +194 -0
  39. data/spec/cassettes/76b7c197c95a5bf9b1e882c567192d72.yml +183 -0
  40. data/spec/cassettes/9b1d523b7f5db7214f8a8bd9272cccba.yml +221 -0
  41. data/spec/cassettes/ab333f89535a2efb284913fede6aa7c7.yml +221 -0
  42. data/spec/cassettes/ae5d7cffde3f53122cdf79f3d1367e8e.yml +221 -0
  43. data/spec/cassettes/ffe3d588b6df4b9de35e5a7ccaf5a81b.yml +695 -0
  44. data/spec/cassettes/http_test.yml +1418 -0
  45. data/spec/cassettes/http_test_redirect.yml +71 -0
  46. data/spec/clear.rb +11 -0
  47. data/spec/http_spec.rb +31 -0
  48. data/spec/page_spec.rb +22 -0
  49. data/spec/queue_overflow_manager_spec.rb +89 -0
  50. data/spec/queue_overflow_spec.rb +71 -0
  51. data/spec/spec_helper.rb +34 -0
  52. data/spec/storage_mongo_spec.rb +102 -0
  53. data/spec/storage_s3_spec.rb +115 -0
  54. data/spec/url_tracker_spec.rb +28 -0
  55. metadata +313 -0
data/lib/polipus/http.rb
@@ -0,0 +1,195 @@
+ require "net/https"
+ require "polipus/page"
+ require "zlib"
+ require 'http/cookie'
+
+ module Polipus
+   class HTTP
+     # Maximum number of redirects to follow on each get_response
+     REDIRECT_LIMIT = 5
+
+     def initialize(opts = {})
+       @connections = {}
+       @opts = opts
+     end
+
+     #
+     # Fetch a single Page from the response of an HTTP request to *url*.
+     # Just gets the final destination page.
+     #
+     def fetch_page(url, referer = nil, depth = nil)
+       fetch_pages(url, referer, depth).last
+     end
+
+     #
+     # Create new Pages from the response of an HTTP request to *url*,
+     # including redirects
+     #
+     def fetch_pages(url, referer = nil, depth = nil)
+       begin
+         url = URI(url) unless url.is_a?(URI)
+         pages = []
+         get(url, referer) do |response, code, location, redirect_to, response_time|
+           body = response.body.dup
+           if response.to_hash.fetch('content-encoding', [])[0] == 'gzip'
+             gzip = Zlib::GzipReader.new(StringIO.new(body))
+             body = gzip.read
+           end
+           pages << Page.new(location, :body => response.body.dup,
+                             :code => code,
+                             :headers => response.to_hash,
+                             :referer => referer,
+                             :depth => depth,
+                             :redirect_to => redirect_to,
+                             :response_time => response_time)
+         end
+
+         return pages
+       rescue StandardError => e
+         if verbose?
+           puts e.inspect
+           puts e.backtrace
+         end
+         return [Page.new(url, :error => e)]
+       end
+     end
+
+     #
+     # The maximum number of redirects to follow
+     #
+     def redirect_limit
+       @opts[:redirect_limit] || REDIRECT_LIMIT
+     end
+
+     #
+     # The user-agent string which will be sent with each request,
+     # or nil if no such option is set
+     #
+     def user_agent
+       @opts[:user_agent]
+     end
+
+
+     #
+     # The proxy address string
+     #
+     def proxy_host
+       @opts[:proxy_host]
+     end
+
+     #
+     # The proxy port
+     #
+     def proxy_port
+       @opts[:proxy_port]
+     end
+
+     #
+     # HTTP read timeout in seconds
+     #
+     def read_timeout
+       @opts[:read_timeout]
+     end
+
+     # Does this HTTP client accept cookies from the server?
+     #
+     def accept_cookies?
+       @opts[:accept_cookies]
+     end
+
+     def cookie_jar
+       @opts[:cookie_jar] ||= ::HTTP::CookieJar.new
+       @opts[:cookie_jar]
+     end
+
+     private
+
+     #
+     # Retrieve HTTP responses for *url*, including redirects.
+     # Yields the response object, response code, and URI location
+     # for each response.
+     #
+     def get(url, referer = nil)
+       limit = redirect_limit
+       loc = url
+       begin
+         # if redirected to a relative url, merge it with the host of the original
+         # request url
+         loc = url.merge(loc) if loc.relative?
+
+         response, response_time = get_response(loc, referer)
+         code = Integer(response.code)
+         redirect_to = response.is_a?(Net::HTTPRedirection) ? URI(response['location']).normalize : nil
+         yield response, code, loc, redirect_to, response_time
+         limit -= 1
+       end while (loc = redirect_to) && allowed?(redirect_to, url) && limit > 0
+     end
+
+     #
+     # Get an HTTPResponse for *url*, sending the appropriate User-Agent string
+     #
+     def get_response(url, referer = nil)
+       full_path = url.query.nil? ? url.path : "#{url.path}?#{url.query}"
+
+       opts = {}
+       opts['User-Agent'] = user_agent if user_agent
+       opts['Referer'] = referer.to_s if referer
+       opts['Cookie'] = ::HTTP::Cookie.cookie_value(cookie_jar.cookies(url)) if accept_cookies?
+
+       retries = 0
+       begin
+         start = Time.now()
+         # format request
+         req = Net::HTTP::Get.new(full_path, opts)
+         # HTTP Basic authentication
+         req.basic_auth url.user, url.password if url.user
+         response = connection(url).request(req)
+         finish = Time.now()
+         response_time = ((finish - start) * 1000).round
+         cookie_jar.parse(response["Set-Cookie"], url) if accept_cookies?
+         return response, response_time
+       rescue Timeout::Error, Net::HTTPBadResponse, EOFError => e
+
+         puts e.inspect if verbose?
+         refresh_connection(url)
+         retries += 1
+         retry unless retries > 3
+       end
+     end
+
+     def connection(url)
+       @connections[url.host] ||= {}
+
+       if conn = @connections[url.host][url.port]
+         return conn
+       end
+
+       refresh_connection url
+     end
+
+     def refresh_connection(url)
+       http = Net::HTTP.new(url.host, url.port, proxy_host, proxy_port)
+
+       http.read_timeout = read_timeout if !!read_timeout
+
+       if url.scheme == 'https'
+         http.use_ssl = true
+         http.verify_mode = OpenSSL::SSL::VERIFY_NONE
+       end
+
+       @connections[url.host][url.port] = http.start
+     end
+
+     def verbose?
+       @opts[:verbose]
+     end
+
+     #
+     # Allowed to connect to the requested url?
+     #
+     def allowed?(to_url, from_url)
+       to_url.host.nil? || (to_url.host == from_url.host)
+     end
+
+   end
+ end
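
Polipus::HTTP above is the gem's fetch layer. The following is a minimal usage sketch, based only on the public methods visible in this hunk (fetch_page, fetch_pages, and option keys such as :user_agent and :read_timeout read by the accessors); the URL is a placeholder:

  require 'polipus/http'

  # Option keys mirror the accessors defined above.
  http = Polipus::HTTP.new(:user_agent => 'polipus-example', :read_timeout => 10)

  # Final destination page only; redirects are followed internally up to redirect_limit.
  page = http.fetch_page('http://www.example.com/')
  puts page.code

  # Every page in the redirect chain, intermediate redirects included.
  http.fetch_pages('http://www.example.com/old-path').each do |p|
    puts "#{p.url} -> #{p.redirect_to}"
  end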
data/lib/polipus/page.rb
@@ -0,0 +1,219 @@
+ require 'nokogiri'
+ require 'json'
+ require 'ostruct'
+ require 'set'
+ module Polipus
+   class Page
+
+     # The URL of the page
+     attr_reader :url
+     # The raw HTTP response body of the page
+     attr_reader :body
+     # Headers of the HTTP response
+     attr_reader :headers
+     # URL of the page this one redirected to, if any
+     attr_reader :redirect_to
+     # Exception object, if one was raised during HTTP#fetch_page
+     attr_reader :error
+     # Integer response code of the page
+     attr_accessor :code
+     # Depth of this page from the root of the crawl. This is not necessarily the
+     # shortest path; use PageStore#shortest_paths! to find that value.
+     attr_accessor :depth
+     # URL of the page that brought us to this page
+     attr_accessor :referer
+     # Response time of the request for this page in milliseconds
+     attr_accessor :response_time
+     # OpenStruct it holds users defined data
+     attr_accessor :user_data
+
+     attr_accessor :aliases
+
+     attr_accessor :domain_aliases
+
+     #
+     # Create a new page
+     #
+     def initialize(url, params = {})
+       @url = url.kind_of?(URI) ? url : URI(url)
+       @code = params[:code]
+       @headers = params[:headers] || {}
+       @headers['content-type'] ||= ['']
+       @aliases = Array(params[:aka]).compact
+       @referer = params[:referer]
+       @depth = params[:depth] || 0
+       @redirect_to = to_absolute(params[:redirect_to])
+       @response_time = params[:response_time]
+       @body = params[:body]
+       @error = params[:error]
+       @fetched = !params[:code].nil?
+       @user_data = OpenStruct.new
+       @domain_aliases = params[:domain_aliases] ||= []
+     end
+
+     #
+     # Array of distinct A tag HREFs from the page
+     #
+     def links
+       return @links.to_a unless @links.nil?
+       @links = Set.new
+       return [] if !doc
+
+       doc.search("//a[@href]").each do |a|
+         u = a['href']
+         next if u.nil? or u.empty?
+         abs = to_absolute(u) rescue next
+         @links << abs if in_domain?(abs)
+       end
+       @links.to_a
+     end
+
+     #
+     # Nokogiri document for the HTML body
+     #
+     def doc
+       return @doc if @doc
+       @doc = Nokogiri::HTML(@body) if @body && html? rescue nil
+     end
+
+     #
+     # Discard links, a next call of page.links will return an empty array
+     #
+     def discard_links!
+       @links = []
+     end
+
+     #
+     # Delete the Nokogiri document and response body to conserve memory
+     #
+     def discard_doc!
+       links # force parsing of page links before we trash the document
+       @doc = @body = nil
+     end
+
+     #
+     # Was the page successfully fetched?
+     # +true+ if the page was fetched with no error, +false+ otherwise.
+     #
+     def fetched?
+       @fetched
+     end
+
+     #
+     # The content-type returned by the HTTP request for this page
+     #
+     def content_type
+       headers['content-type'].first
+     end
+
+     #
+     # Returns +true+ if the page is a HTML document, returns +false+
+     # otherwise.
+     #
+     def html?
+       !!(content_type =~ %r{^(text/html|application/xhtml+xml)\b})
+     end
+
+     #
+     # Returns +true+ if the page is a HTTP redirect, returns +false+
+     # otherwise.
+     #
+     def redirect?
+       (300..307).include?(@code)
+     end
+
+     #
+     # Returns +true+ if the page was not found (returned 404 code),
+     # returns +false+ otherwise.
+     #
+     def not_found?
+       404 == @code
+     end
+
+     #
+     # Base URI from the HTML doc head element
+     # http://www.w3.org/TR/html4/struct/links.html#edef-BASE
+     #
+     def base
+       @base = if doc
+         href = doc.search('//head/base/@href')
+         URI(href.to_s) unless href.nil? rescue nil
+       end unless @base
+
+       return nil if @base && @base.to_s().empty?
+       @base
+     end
+
+     #
+     # Converts relative URL *link* into an absolute URL based on the
+     # location of the page
+     #
+     def to_absolute(link)
+       return nil if link.nil?
+
+       # remove anchor
+       link = URI.encode(URI.decode(link.to_s.gsub(/#[a-zA-Z0-9_-]*$/,'')))
+
+       relative = URI(link)
+       absolute = base ? base.merge(relative) : @url.merge(relative)
+
+       absolute.path = '/' if absolute.path.empty?
+
+       return absolute
+     end
+
+     #
+     # Returns +true+ if *uri* is in the same domain as the page, returns
+     # +false+ otherwise
+     #
+     def in_domain?(uri)
+       @domain_aliases ||= []
+       uri.host == @url.host || @domain_aliases.include?(uri.host)
+     end
+
+     def to_hash
+       {'url' => @url.to_s,
+        'headers' => Marshal.dump(@headers),
+        'body' => @body,
+        'links' => links.map(&:to_s),
+        'code' => @code,
+        'depth' => @depth,
+        'referer' => @referer.to_s,
+        'redirect_to' => @redirect_to.to_s,
+        'response_time' => @response_time,
+        'fetched' => @fetched,
+        'user_data' => @user_data.nil? ? {} : @user_data.marshal_dump
+       }
+     end
+
+     def to_json
+       th = to_hash.dup
+       th.each {|k,v| th.delete(k) if v.nil? || (v.respond_to?(:empty?) && v.empty?)}
+       th.delete('headers') if content_type.empty?
+       th.to_json
+     end
+
+     def self.from_hash(hash)
+       page = self.new(URI(hash['url']))
+       {'@headers' => hash['headers'] ? Marshal.load(hash['headers']) : {'content-type' => ['']},
+        '@body' => hash['body'],
+        '@links' => hash['links'] ? hash['links'].map { |link| URI(link) } : [],
+        '@code' => hash['code'].to_i,
+        '@depth' => hash['depth'].to_i,
+        '@referer' => hash['referer'],
+        '@redirect_to' => (!!hash['redirect_to'] && !hash['redirect_to'].empty?) ? URI(hash['redirect_to']) : nil,
+        '@response_time' => hash['response_time'].to_i,
+        '@fetched' => hash['fetched'],
+        '@user_data' => hash['user_data'] ? OpenStruct.new(hash['user_data']) : nil
+       }.each do |var, value|
+         page.instance_variable_set(var, value)
+       end
+       page
+     end
+
+     def self.from_json(json)
+       hash = JSON.parse json
+       self.from_hash hash
+     end
+   end
+ end
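
The Page class above carries both the fetched response and the link-extraction logic. A rough illustration using only methods shown in this hunk (the URL, body, and headers are made-up sample values; headers follow the 'content-type' => [...] shape the constructor expects):

  require 'polipus/page'

  page = Polipus::Page.new('http://www.example.com/',
                           :code    => 200,
                           :body    => '<html><body><a href="/about">About</a></body></html>',
                           :headers => { 'content-type' => ['text/html'] })

  page.html?   # => true
  page.links   # => [#<URI::HTTP http://www.example.com/about>] (absolute, same-domain only)

  # Serialization round-trip via to_hash/to_json and from_hash/from_json.
  copy = Polipus::Page.from_json(page.to_json)
  copy.code    # => 200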
data/lib/polipus/plugin.rb
@@ -0,0 +1,13 @@
+ module Polipus
+   module Plugin
+     @@plugins = {}
+     def self.register plugin, options = {}
+       o = plugin.new(options)
+       @@plugins[o.class.name] = o
+     end
+
+     def self.plugins
+       @@plugins
+     end
+   end
+ end
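
Polipus::Plugin above is a minimal registry: register instantiates the given class with the supplied options and stores the instance under its class name. A hypothetical example (MyStatsPlugin and its :label option are invented for illustration; the on_initialize hook name is the one used by the Cleaner plugin below):

  class MyStatsPlugin
    def initialize(options = {})
      @label = options[:label]
    end

    def on_initialize(crawler)
      crawler.logger.info { "MyStatsPlugin (#{@label}) loaded" }
    end
  end

  Polipus::Plugin.register MyStatsPlugin, :label => 'demo'
  Polipus::Plugin.plugins  # => { "MyStatsPlugin" => #<MyStatsPlugin:...> }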
data/lib/polipus/plugins/cleaner.rb
@@ -0,0 +1,25 @@
+ module Polipus
+   module Plugin
+     class Cleaner
+
+       def initialize(options = {})
+         @reset = options[:reset] ||= false
+       end
+
+       def on_initialize crawler
+         crawler.logger.info {"Cleaner plugin loaded"}
+         unless @reset
+           crawler.logger.info {"Cleaner plugin is disable, add :reset => true to the plugin if you really know what you are doing"}
+           return nil
+         end
+         crawler.logger.info {"Cleaning all: url_tracker, storage, queue"}
+         Proc.new {
+           url_tracker.clear
+           storage.clear
+           queue_factory.clear
+           @options[:queue_overflow_adapter].clear if @options[:queue_overflow_adapter]
+         }
+       end
+     end
+   end
+ end
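
Judging from the constructor above, the Cleaner plugin stays inert unless it is registered with :reset => true; how the crawler consumes the Proc returned by on_initialize is defined elsewhere (presumably in data/lib/polipus.rb) and is not shown in this hunk. A hedged registration sketch:

  require 'polipus'
  require 'polipus/plugins/cleaner'

  # Explicit opt-in: without :reset => true the plugin only logs that it is disabled.
  Polipus::Plugin.register Polipus::Plugin::Cleaner, :reset => true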