polipus 0.3.3 → 0.3.4
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +8 -8
- data/lib/polipus.rb +7 -8
- data/lib/polipus/http.rb +0 -2
- data/lib/polipus/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,15 +1,15 @@
|
|
1
1
|
---
|
2
2
|
!binary "U0hBMQ==":
|
3
3
|
metadata.gz: !binary |-
|
4
|
-
|
4
|
+
MjkyMTAxZWU0ODJmMzI5OTcwZjI0ZTFlNzZjOTYxNzY1MGUxOTJjZQ==
|
5
5
|
data.tar.gz: !binary |-
|
6
|
-
|
6
|
+
OTg4M2UzY2I2ODNkMWMxZDYwMzkxOGI5MmRhMWJkN2I0N2ViYTMyMg==
|
7
7
|
SHA512:
|
8
8
|
metadata.gz: !binary |-
|
9
|
-
|
10
|
-
|
11
|
-
|
9
|
+
YWE4ZjM0NDA1NWJiMGVkYzUyMTM5ZWJjN2I3YTU5YTI2NDIwNmI0ZTNkZTRl
|
10
|
+
NTg0MzQyOTgxMGYzZjBlZGVkMmE2ODJkZTA0ZTg3NzY0MTJjZTljYWEwN2Fm
|
11
|
+
YTc4MjI3NGViMmNlMjQzNWIxODVlMmNlNWJjNDFhMzE1MzQxMjk=
|
12
12
|
data.tar.gz: !binary |-
|
13
|
-
|
14
|
-
|
15
|
-
|
13
|
+
YjRlMmVjYzA4MDVlYzAxMmM0NGYwMWFiYWMwNmYxNDg3MmMyOTY4YzQzZDQ2
|
14
|
+
OWZjNWUzYjJlMzk3MDg5NjE4ZmQxYTk5MWQ1Y2M0ZmQ5MjhkYjQxZjdkNzRk
|
15
|
+
ZjZjNzkyMjJmM2M0MzNiNDNkOTQxOWYyMDc2MDdhZTJlNWE1Y2M=
|
data/lib/polipus.rb
CHANGED
@@ -106,9 +106,7 @@ module Polipus
|
|
106
106
|
|
107
107
|
@storage = @options[:storage] ||= Storage.dev_null
|
108
108
|
|
109
|
-
@http_pool = []
|
110
109
|
@workers_pool = []
|
111
|
-
@queues_pool = []
|
112
110
|
|
113
111
|
@follow_links_like = []
|
114
112
|
@skip_links_like = []
|
@@ -150,8 +148,8 @@ module Polipus
|
|
150
148
|
@options[:workers].times do |worker_number|
|
151
149
|
@workers_pool << Thread.new do
|
152
150
|
@logger.debug { "Start worker #{worker_number}" }
|
153
|
-
http =
|
154
|
-
queue =
|
151
|
+
http = HTTP.new(@options)
|
152
|
+
queue = queue_factory
|
155
153
|
queue.process(false, @options[:queue_timeout]) do |message|
|
156
154
|
|
157
155
|
next if message.nil?
|
@@ -216,7 +214,7 @@ module Polipus
|
|
216
214
|
if @options[:depth_limit] == false || @options[:depth_limit] > page.depth
|
217
215
|
links_for(page).each do |url_to_visit|
|
218
216
|
next unless should_be_visited?(url_to_visit)
|
219
|
-
enqueue url_to_visit, page
|
217
|
+
enqueue url_to_visit, page
|
220
218
|
end
|
221
219
|
else
|
222
220
|
@logger.info { "[worker ##{worker_number}] Depth limit reached #{page.depth}" }
|
@@ -396,12 +394,12 @@ module Polipus
|
|
396
394
|
end
|
397
395
|
|
398
396
|
# The url is enqueued for a later visit
|
399
|
-
def enqueue(url_to_visit, current_page
|
397
|
+
def enqueue(url_to_visit, current_page)
|
400
398
|
page_to_visit = Page.new(url_to_visit.to_s, referer: current_page.url.to_s, depth: current_page.depth + 1)
|
401
|
-
|
399
|
+
internal_queue << page_to_visit.to_json
|
402
400
|
to_track = @options[:include_query_string_in_saved_page] ? url_to_visit.to_s : url_to_visit.to_s.gsub(/\?.*$/, '')
|
403
401
|
url_tracker.visit to_track
|
404
|
-
@logger.debug { "Added
|
402
|
+
@logger.debug { "Added (#{url_to_visit}) to the queue" }
|
405
403
|
end
|
406
404
|
|
407
405
|
# It creates a redis client
|
@@ -444,6 +442,7 @@ module Polipus
|
|
444
442
|
removed, restored = @overflow_manager.perform
|
445
443
|
@logger.info { "Overflow Manager: items removed=#{removed}, items restored=#{restored}, items stored=#{queue_overflow_adapter.size}" }
|
446
444
|
sleep @options[:queue_overflow_manager_check_time]
|
445
|
+
break if SignalHandler.terminated?
|
447
446
|
end
|
448
447
|
|
449
448
|
end
|
data/lib/polipus/http.rb
CHANGED
@@ -209,8 +209,6 @@ module Polipus
|
|
209
209
|
end
|
210
210
|
|
211
211
|
def refresh_connection(url)
|
212
|
-
proxy_host, proxy_port = proxy_host_port unless @opts[:proxy_host_port].nil?
|
213
|
-
|
214
212
|
if @opts[:logger] && proxy_host && proxy_port
|
215
213
|
@opts[:logger].debug { "Request #{url} using proxy: #{proxy_host}:#{proxy_port}" }
|
216
214
|
end
|
data/lib/polipus/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: polipus
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.3.
|
4
|
+
version: 0.3.4
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Francesco Laurita
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2014-
|
11
|
+
date: 2014-07-23 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: redis-bloomfilter
|