polipus 0.3.7 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +8 -8
- data/.rspec +1 -1
- data/.rubocop.yml +3 -3
- data/.rubocop_todo.yml +1 -1
- data/.travis.yml +14 -4
- data/AUTHORS.md +1 -0
- data/CHANGELOG.md +9 -1
- data/Gemfile +9 -0
- data/README.md +2 -3
- data/Rakefile +1 -3
- data/examples/basic.rb +8 -1
- data/lib/polipus.rb +25 -13
- data/lib/polipus/queue_overflow.rb +1 -0
- data/lib/polipus/queue_overflow/manager.rb +1 -0
- data/lib/polipus/queue_overflow/mongo_queue.rb +1 -1
- data/lib/polipus/queue_overflow/worker.rb +24 -0
- data/lib/polipus/storage.rb +10 -16
- data/lib/polipus/storage/mongo_store.rb +6 -1
- data/lib/polipus/storage/rethink_store.rb +90 -0
- data/lib/polipus/version.rb +1 -1
- data/polipus.gemspec +16 -18
- data/spec/{http_spec.rb → polipus/http_spec.rb} +26 -37
- data/spec/{page_spec.rb → polipus/page_spec.rb} +7 -11
- data/spec/{queue_overflow_manager_spec.rb → polipus/queue_overflow/manager_spec.rb} +22 -29
- data/spec/{queue_overflow_spec.rb → polipus/queue_overflow_spec.rb} +14 -20
- data/spec/{robotex_spec.rb → polipus/robotex_spec.rb} +10 -11
- data/spec/{signal_handler_spec.rb → polipus/signal_handler_spec.rb} +2 -6
- data/spec/{storage_memory_spec.rb → polipus/storage/memory_store_spec.rb} +18 -21
- data/spec/{storage_mongo_spec.rb → polipus/storage/mongo_store_spec.rb} +23 -25
- data/spec/polipus/storage/rethink_store_spec.rb +117 -0
- data/spec/{url_tracker_spec.rb → polipus/url_tracker_spec.rb} +4 -4
- data/spec/polipus_spec.rb +13 -15
- data/spec/spec_helper.rb +13 -12
- metadata +76 -154
- data/lib/polipus/storage/s3_store.rb +0 -96
- data/spec/cassettes/08b228db424a926e1ed6ab63b38d847e.yml +0 -166
- data/spec/cassettes/20aa41f181b49f00078c3ca30bad5afe.yml +0 -166
- data/spec/cassettes/4640919145753505af2d0f8423de37f3.yml +0 -270
- data/spec/cassettes/66aae15a03f4aab8efd15e40d2d7882a.yml +0 -194
- data/spec/cassettes/76b7c197c95a5bf9b1e882c567192d72.yml +0 -183
- data/spec/cassettes/9b1d523b7f5db7214f8a8bd9272cccba.yml +0 -221
- data/spec/cassettes/ab333f89535a2efb284913fede6aa7c7.yml +0 -221
- data/spec/cassettes/ae5d7cffde3f53122cdf79f3d1367e8e.yml +0 -221
- data/spec/cassettes/ffe3d588b6df4b9de35e5a7ccaf5a81b.yml +0 -695
- data/spec/storage_s3_spec.rb +0 -115
checksums.yaml
CHANGED

@@ -1,15 +1,15 @@
 ---
 !binary "U0hBMQ==":
   metadata.gz: !binary |-
-
+    NGU0MWZlMWIwMGM2MWNhMmJiOTU3MDM0YWI0ZjY0MzI3NmNkYzgyNA==
   data.tar.gz: !binary |-
-
+    MTAxYjZmYWNjMTlkYzk3Y2ZhNjdjMDFmODM2OTQ5YmQ3ZDcyNTQzNw==
 SHA512:
   metadata.gz: !binary |-
-
-
-
+    ZDE5NDExNmE1M2M4MDM0MTExNzVkNmM0ZmVjNTQyOTg5Y2JkYTdmZjBhY2Fi
+    NmM1MDc5ZTI0ODkxNzlhY2ZkZGVjZjI2ZTMyNjZhNTMwOWZhZjQ2OGY4ZTJl
+    ZmI4ZjUwNmQ3YjAxOTdlODI2MWRmMWUzNjY3Yzk1MmNiZTk0Y2E=
   data.tar.gz: !binary |-
-
-
-
+    YjNlYTZlMjU5OTBmN2ZjMTI1OTY1MzgzNzBhYjBjYTAzN2MwYzY0M2U1ZGYw
+    MjcyMmMxM2I1ZTY5OWEzZDQ3MjQ1OTgzMzcyN2I5NGQ5NTAzZTEzY2FmMDRl
+    MmJiZDdjOTFmMjQwZWU1MzM1OGJhN2E4NTRhNGExZTAyMTVmN2I=
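Each `!binary` entry above is just a Base64 wrapping of the hex digest RubyGems records for the gem's metadata.gz and data.tar.gz. A quick check in Ruby, decoding the new SHA1 of metadata.gz shown above:

```ruby
require 'base64'

# Decoding the checksum entry yields the plain 40-character hex SHA1 digest.
Base64.decode64('NGU0MWZlMWIwMGM2MWNhMmJiOTU3MDM0YWI0ZjY0MzI3NmNkYzgyNA==')
# => "4e41fe1b00c61ca2bb957034ab4f643276cdc824"
```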
data/.rspec
CHANGED

@@ -1,2 +1,2 @@
 --color
---format
+--format documentation
data/.rubocop.yml
CHANGED

@@ -4,14 +4,14 @@ AllCops:
     - my_test/**/*
     - examples/**/*
 
-
+Metrics/LineLength:
   Enabled: false
 
 Style/TrivialAccessors:
   Enabled: false
 
-
+Metrics/ClassLength:
   Enabled: false
 
-
+Metrics/MethodLength:
   Enabled: false
data/.rubocop_todo.yml
CHANGED
data/.travis.yml
CHANGED

@@ -3,10 +3,20 @@ rvm:
   - jruby
   - 1.9.3
   - 2.0.0
-  - 2.1.
+  - 2.1.5
+  - 2.2.0
   - rbx-2
 
-
-
-
+# Until travis supports rethinkdb as service...
+before_install:
+  - source /etc/lsb-release && echo "deb http://download.rethinkdb.com/apt $DISTRIB_CODENAME main" | sudo tee /etc/apt/sources.list.d/rethinkdb.list
+  - wget -qO- http://download.rethinkdb.com/apt/pubkey.gpg | sudo apt-key add -
+  - sudo apt-get update -q
+  - sudo apt-get install rethinkdb
+  - sudo cp /etc/rethinkdb/default.conf.sample /etc/rethinkdb/instances.d/instance1.conf
+  - sudo service rethinkdb restart
 
+services:
+  - redis
+  - mongodb
+# - rethinkdb
data/AUTHORS.md
CHANGED
data/CHANGELOG.md
CHANGED

@@ -1,5 +1,13 @@
 # Changelog
 
+## 0.4.0 (2015-01-12)
+
+[Compare changes in gem](https://github.com/taganaka/polipus/compare/0.3.3...0.4.0)
+
+* Adds RethinkDB Storage
+* BugFix: Update and fix mongo driver v1.11.1 'upsert: 1' -> 'upsert: true'
+* Organize and update specs to rspec 3
+
 ## 0.3.3 (2015-06-26)
 
 [Compare changes in gem](https://github.com/taganaka/polipus/compare/0.3.2...0.3.3)
@@ -27,7 +35,7 @@
 ```ruby
 enable_signal_handler: true / false
 ```
-
+
 * Zlib::GzipFile::Error handling
 [da3b927](https://github.com/taganaka/polipus/commit/da3b927acb1b50c26276ed458da0a365c22fd98b)
 * Faster and easier overflow management
data/Gemfile
CHANGED
data/README.md
CHANGED

@@ -1,9 +1,8 @@
 [](https://travis-ci.org/taganaka/polipus)
-[](https://coveralls.io/r/taganaka/polipus?branch=master)
+[](https://codeclimate.com/github/taganaka/polipus)
 [](https://rubygems.org/gems/polipus)
 
-
 # Polipus #
 
 A distributed web crawler written in ruby, backed by Redis
data/Rakefile
CHANGED
data/examples/basic.rb
CHANGED

@@ -20,7 +20,14 @@ options = {
   user_agent: 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9) AppleWebKit/537.71 (KHTML, like Gecko) Version/7.0 Safari/537.71',
   # Use 5 threads
   workers: 5,
-  #
+  # Queue overflow settings:
+  # * No more than 5000 elements on the Redis queue
+  # * Exceeded Items will stored on Mongo into 'rubygems_queue_overflow' collection
+  # * Check cycle is done every 60 sec
+  queue_items_limit: 5_000,
+  queue_overflow_adapter: Polipus::QueueOverflow.mongo_queue(mongo, 'rubygems_queue_overflow'),
+  queue_overflow_manager_check_time: 60,
+  # Logs goes to the stdout
   logger: Logger.new(STDOUT)
 }
 Polipus::Plugin.register Polipus::Plugin::Cleaner, reset: true
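Pieced together, the overflow options above translate into a runnable crawler along these lines (a minimal sketch, assuming local Redis and MongoDB; the job name and overflow collection are the example's own):

```ruby
require 'polipus'
require 'mongo'
require 'logger'

mongo = Mongo::Connection.new('localhost', 27017).db('polipus')

options = {
  workers: 5,
  # Cap the Redis queue at 5000 items; spill the excess into MongoDB
  # and rebalance every 60 seconds.
  queue_items_limit: 5_000,
  queue_overflow_adapter: Polipus::QueueOverflow.mongo_queue(mongo, 'rubygems_queue_overflow'),
  queue_overflow_manager_check_time: 60,
  logger: Logger.new(STDOUT)
}

Polipus.crawler('rubygems', 'http://rubygems.org/', options) do |crawler|
  crawler.on_page_downloaded do |page|
    puts "Fetched: #{page.url}"
  end
end
```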
data/lib/polipus.rb
CHANGED

@@ -118,6 +118,7 @@ module Polipus
       @on_before_save = []
       @on_page_error = []
       @focus_crawl_block = nil
+      @on_crawl_start = []
       @on_crawl_end = []
       @redis_factory = nil
@@ -131,6 +132,20 @@ module Polipus
       @robots = Polipus::Robotex.new(@options[:user_agent]) if @options[:obey_robots_txt]
       # Attach signal handling if enabled
       SignalHandler.enable if @options[:enable_signal_handler]
+
+      if queue_overflow_adapter
+        @on_crawl_start << lambda do |_|
+          Thread.new do
+            Thread.current[:name] = :overflow_items_controller
+            overflow_items_controller.run
+          end
+        end
+      end
+
+      @on_crawl_end << lambda do |_|
+        Thread.list.select { |thread| thread.status && Thread.current[:name] == :overflow_items_controller }.each(&:kill)
+      end
+
       execute_plugin 'on_initialize'
 
       yield self if block_given?
@@ -141,13 +156,13 @@ module Polipus
     end
 
     def takeover
-      overflow_items_controller if queue_overflow_adapter
-
       @urls.each do |u|
         add_url(u) { |page| page.user_data.p_seeded = true }
       end
       return if internal_queue.empty?
 
+      @on_crawl_start.each { |e| e.call(self) }
+
       execute_plugin 'on_crawl_start'
       @options[:workers].times do |worker_number|
         @workers_pool << Thread.new do
@@ -237,6 +252,7 @@ module Polipus
           end
         end
       end
+
       @workers_pool.each { |w| w.join }
       @on_crawl_end.each { |e| e.call(self) }
       execute_plugin 'on_crawl_end'
@@ -269,6 +285,12 @@ module Polipus
       self
     end
 
+    # A block of code will be executed when crawl session is starting
+    def on_crawl_start(&block)
+      @on_crawl_start << block
+      self
+    end
+
     # A block of code will be executed on every page downloaded
     # before being saved in the registered storage
     def on_before_save(&block)
@@ -439,17 +461,7 @@ module Polipus
       should_be_visited?(page.url, false)
     end
 
-
-
-      loop do
-        @logger.info { 'Overflow Manager: cycle started' }
-        removed, restored = @overflow_manager.perform
-        @logger.info { "Overflow Manager: items removed=#{removed}, items restored=#{restored}, items stored=#{queue_overflow_adapter.size}" }
-        sleep @options[:queue_overflow_manager_check_time]
-        break if SignalHandler.terminated?
-      end
-
-    end
+      QueueOverflow::Worker.new(@overflow_manager)
     end
 
     def internal_queue
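The upshot of these changes: `on_crawl_start` becomes a public hook, registered the same way as `on_crawl_end`, and the overflow loop that used to live inline in `overflow_items_controller` moves to `QueueOverflow::Worker`, started on a named thread when the crawl begins. A sketch of the new hook (hypothetical job name and URL):

```ruby
require 'polipus'

Polipus.crawler('rubygems', 'http://rubygems.org/') do |crawler|
  # Runs once, just before the worker pool starts draining the queue.
  crawler.on_crawl_start do |c|
    c.logger.info { 'crawl session starting' }
  end

  crawler.on_crawl_end do |c|
    c.logger.info { 'crawl session finished' }
  end
end
```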
data/lib/polipus/queue_overflow/mongo_queue.rb
CHANGED

@@ -28,7 +28,7 @@
 
       def push(data)
         if @options[:ensure_uniq]
-          @mongo_db[@collection_name].update({ payload: data }, { payload: data }, { upsert:
+          @mongo_db[@collection_name].update({ payload: data }, { payload: data }, { upsert: true, w: 1 })
         else
           @mongo_db[@collection_name].insert(payload: data)
         end
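This is the changelog's 'upsert: 1' -> 'upsert: true' fix: with mongo driver 1.11.1 the option must be a real boolean. What the `ensure_uniq` branch relies on, sketched standalone (hypothetical database and collection names; assumes a local MongoDB and the 1.x mongo gem):

```ruby
require 'mongo' # mongo gem 1.x API, as used by this file

db   = Mongo::Connection.new('localhost', 27017).db('polipus_test')
coll = db['queue_overflow_demo']

# With upsert: true, pushing the same payload twice keeps a single document,
# which is exactly the dedup behaviour push() depends on.
2.times do
  coll.update({ payload: 'http://example.com/' },
              { payload: 'http://example.com/' },
              upsert: true, w: 1)
end

puts coll.count # => 1
```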
data/lib/polipus/queue_overflow/worker.rb
ADDED

@@ -0,0 +1,24 @@
+# encoding: UTF-8
+module Polipus
+  module QueueOverflow
+    class Worker
+      def initialize(manager)
+        @logger = manager.polipus.logger
+        @delay = manager.polipus.options[:queue_overflow_manager_check_time]
+        @adapter = manager.polipus.queue_overflow_adapter
+        @manager = manager
+      end
+
+      def run
+        @logger.info { 'Overflow::Worker::run' }
+        loop do
+          @logger.info { 'Overflow Manager: cycle started' }
+          removed, restored = @manager.perform
+          @logger.info { "Overflow Manager: items removed=#{removed}, items restored=#{restored}, items stored=#{@adapter.size}" }
+          sleep @delay
+          break if SignalHandler.terminated?
+        end
+      end
+    end
+  end
+end
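`Worker` only needs a manager that responds to `perform` (returning a `[removed, restored]` pair) and exposes its crawler via `polipus`. A stubbed sketch of that contract, exercising the loop outside a real crawl (all stand-in objects hypothetical):

```ruby
require 'logger'
require 'ostruct'
require 'polipus'
require 'polipus/queue_overflow/worker'

# Stand-ins for the real crawler, overflow adapter, and QueueOverflow::Manager.
adapter = OpenStruct.new(size: 0)
crawler = OpenStruct.new(
  logger: Logger.new(STDOUT),
  options: { queue_overflow_manager_check_time: 1 },
  queue_overflow_adapter: adapter
)

manager = OpenStruct.new(polipus: crawler)
def manager.perform
  [0, 0] # [items removed from the queue, items restored to it]
end

worker = Polipus::QueueOverflow::Worker.new(manager)
Thread.new { worker.run }.join(5) # cycles once a second until SignalHandler.terminated?
```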
data/lib/polipus/storage.rb
CHANGED

@@ -1,27 +1,21 @@
-# encoding: UTF-8
 require 'polipus/storage/base'
+
 module Polipus
   module Storage
-
+    COLLECTION = 'pages'
+
+    def self.mongo_store(mongo = nil, collection = COLLECTION, except = [])
       require 'polipus/storage/mongo_store'
       mongo ||= Mongo::Connection.new('localhost', 27_017, pool_size: 15, pool_timeout: 5).db('polipus')
       fail 'First argument must be an instance of Mongo::DB' unless mongo.is_a?(Mongo::DB)
-      self::MongoStore.new(mongo: mongo, collection:
+      self::MongoStore.new(mongo: mongo, collection: collection, except: except)
    end
 
-    def self.
-      require 'polipus/storage/
-
-
-
-      end
-
-      self::S3Store.new(
-        bucket: bucket_name,
-        access_key_id: aws_credential[:access_key_id],
-        secret_access_key: aws_credential[:secret_access_key],
-        except: except
-      )
+    def self.rethink_store(conn = nil, table = COLLECTION, except = [])
+      require 'polipus/storage/rethink_store'
+      conn ||= RethinkDB::RQL.new.connect(host: 'localhost', port: 28_015, db: 'polipus' )
+      fail "First argument must be a RethinkDB::Connection, got `#{conn.class}`" unless conn.is_a?(RethinkDB::Connection)
+      self::RethinkStore.new(conn: conn, table: table, except: except)
     end
 
     def self.dev_null
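`s3_store` is gone (s3_store.rb is removed in this release, per the file list above) and `rethink_store` takes its place alongside `mongo_store`. Selecting a backend stays a one-liner (hypothetical job name; assumes a local MongoDB or RethinkDB):

```ruby
require 'polipus'

# MongoDB-backed page storage with an explicit collection name...
storage = Polipus::Storage.mongo_store(nil, 'pages')

# ...or the new RethinkDB-backed storage; with no connection given it
# dials localhost:28015 and uses the 'polipus' database:
# storage = Polipus::Storage.rethink_store

Polipus.crawler('rubygems', 'http://rubygems.org/', storage: storage)
```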
data/lib/polipus/storage/mongo_store.rb
CHANGED

@@ -2,6 +2,7 @@
 require 'mongo'
 require 'zlib'
 require 'thread'
+
 module Polipus
   module Storage
     class MongoStore < Base
@@ -10,7 +11,11 @@ module Polipus
         @mongo = options[:mongo]
         @collection = options[:collection]
         @mongo.create_collection(@collection)
-
+        begin
+          @mongo[@collection].ensure_index(:uuid, unique: true, dropDups: true, background: true)
+        rescue Exception
+        end
+
         @compress_body = options[:compress_body] ||= true
         @except = options[:except] ||= []
         @semaphore = Mutex.new
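The constructor now ensures a unique index on `uuid` and deliberately swallows index-creation failures (the index may already exist, or legacy duplicates may block it). The same guarded call, standalone (hypothetical database name; assumes a local MongoDB and the 1.x mongo gem):

```ruby
require 'mongo' # mongo gem 1.x

db = Mongo::Connection.new('localhost', 27017).db('polipus_test')
db.create_collection('pages')

begin
  # unique: dedupe pages on uuid; background: build without blocking writes
  db['pages'].ensure_index(:uuid, unique: true, dropDups: true, background: true)
rescue Exception
  # ignored: index already present, or duplicates prevent its creation
end
```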
data/lib/polipus/storage/rethink_store.rb
ADDED

@@ -0,0 +1,90 @@
+# encoding: UTF-8
+require 'rethinkdb'
+require 'thread'
+require 'zlib'
+
+module Polipus
+  module Storage
+    class RethinkStore < Base
+      BINARY_FIELDS = %w(body headers data)
+      def initialize(options = {})
+        @r = RethinkDB::RQL.new
+        @rethink = options[:conn]
+        @table = options[:table]
+
+        unless @r.table_list.run(@rethink).include?(@table)
+          @r.table_create(@table).run(@rethink)
+          @r.table(@table).index_create('created_at')
+        end
+
+        @compress_body = options[:compress_body] ||= true
+        @except = options[:except] ||= []
+        @semaphore = Mutex.new
+      end
+
+      def add(page)
+        @semaphore.synchronize do
+          obj = page.to_hash
+          @except.each { |e| obj.delete e.to_s }
+          obj[:id] = uuid(page)
+          obj['body'] = Zlib::Deflate.deflate(obj['body']) if @compress_body && obj['body']
+          obj['created_at'] ||= Time.now.to_i
+          BINARY_FIELDS.each do |field|
+            # Use some marshalling?
+            obj[field] = @r.binary(obj[field]) unless obj[field].nil?
+          end
+
+          @r.table(@table).insert(obj).run(@rethink, durability: 'soft')
+          obj[:id]
+        end
+      end
+
+      def exists?(page)
+        @semaphore.synchronize do
+          doc = @r.table(@table).get(uuid(page)).run(@rethink)
+          !doc.nil?
+        end
+      end
+
+      def get(page)
+        @semaphore.synchronize do
+          data = @r.table(@table).get(uuid(page)).run(@rethink)
+          return load_page(data) if data
+        end
+      end
+
+      def remove(page)
+        @semaphore.synchronize do
+          @r.table(@table).get(uuid(page)).delete.run(@rethink)
+        end
+      end
+
+      def count
+        @r.table(@table).count.run(@rethink)
+      end
+
+      def each
+        @r.table(@table).run(@rethink).each do |doc|
+          page = load_page(doc)
+          yield doc[:id], page
+        end
+      end
+
+      def clear
+        @r.table(@table).delete.run(@rethink)
+      end
+
+      private
+
+      def load_page(hash)
+        BINARY_FIELDS.each do |field|
+          hash[field] = hash[field].to_s
+        end
+        hash['body'] = Zlib::Inflate.inflate(hash['body']) if @compress_body && hash['body'] && !hash['body'].empty?
+        page = Page.from_hash(hash)
+        page.fetched_at ||= hash['created_at']
+        page
+      end
+    end
+  end
+end
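A round-trip through the new store (a sketch, assuming a local RethinkDB on the default port; the table name is hypothetical, and the page construction follows the `Polipus::Page.new(url, params)` form used elsewhere in the gem):

```ruby
require 'polipus'
require 'polipus/storage/rethink_store'

storage = Polipus::Storage.rethink_store(nil, 'pages_demo')

page = Polipus::Page.new('http://example.com/',
                         code: 200,
                         body: '<html><body>hello</body></html>')

storage.add(page)     # inserts the page (body deflated), keyed by its uuid
storage.exists?(page) # => true
storage.get(page)     # => the Page, body inflated back to the original HTML
storage.count         # => 1
storage.remove(page)
```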