polipus 0.3.7 → 0.4.0
- checksums.yaml +8 -8
- data/.rspec +1 -1
- data/.rubocop.yml +3 -3
- data/.rubocop_todo.yml +1 -1
- data/.travis.yml +14 -4
- data/AUTHORS.md +1 -0
- data/CHANGELOG.md +9 -1
- data/Gemfile +9 -0
- data/README.md +2 -3
- data/Rakefile +1 -3
- data/examples/basic.rb +8 -1
- data/lib/polipus.rb +25 -13
- data/lib/polipus/queue_overflow.rb +1 -0
- data/lib/polipus/queue_overflow/manager.rb +1 -0
- data/lib/polipus/queue_overflow/mongo_queue.rb +1 -1
- data/lib/polipus/queue_overflow/worker.rb +24 -0
- data/lib/polipus/storage.rb +10 -16
- data/lib/polipus/storage/mongo_store.rb +6 -1
- data/lib/polipus/storage/rethink_store.rb +90 -0
- data/lib/polipus/version.rb +1 -1
- data/polipus.gemspec +16 -18
- data/spec/{http_spec.rb → polipus/http_spec.rb} +26 -37
- data/spec/{page_spec.rb → polipus/page_spec.rb} +7 -11
- data/spec/{queue_overflow_manager_spec.rb → polipus/queue_overflow/manager_spec.rb} +22 -29
- data/spec/{queue_overflow_spec.rb → polipus/queue_overflow_spec.rb} +14 -20
- data/spec/{robotex_spec.rb → polipus/robotex_spec.rb} +10 -11
- data/spec/{signal_handler_spec.rb → polipus/signal_handler_spec.rb} +2 -6
- data/spec/{storage_memory_spec.rb → polipus/storage/memory_store_spec.rb} +18 -21
- data/spec/{storage_mongo_spec.rb → polipus/storage/mongo_store_spec.rb} +23 -25
- data/spec/polipus/storage/rethink_store_spec.rb +117 -0
- data/spec/{url_tracker_spec.rb → polipus/url_tracker_spec.rb} +4 -4
- data/spec/polipus_spec.rb +13 -15
- data/spec/spec_helper.rb +13 -12
- metadata +76 -154
- data/lib/polipus/storage/s3_store.rb +0 -96
- data/spec/cassettes/08b228db424a926e1ed6ab63b38d847e.yml +0 -166
- data/spec/cassettes/20aa41f181b49f00078c3ca30bad5afe.yml +0 -166
- data/spec/cassettes/4640919145753505af2d0f8423de37f3.yml +0 -270
- data/spec/cassettes/66aae15a03f4aab8efd15e40d2d7882a.yml +0 -194
- data/spec/cassettes/76b7c197c95a5bf9b1e882c567192d72.yml +0 -183
- data/spec/cassettes/9b1d523b7f5db7214f8a8bd9272cccba.yml +0 -221
- data/spec/cassettes/ab333f89535a2efb284913fede6aa7c7.yml +0 -221
- data/spec/cassettes/ae5d7cffde3f53122cdf79f3d1367e8e.yml +0 -221
- data/spec/cassettes/ffe3d588b6df4b9de35e5a7ccaf5a81b.yml +0 -695
- data/spec/storage_s3_spec.rb +0 -115
checksums.yaml
CHANGED
@@ -1,15 +1,15 @@
 ---
 !binary "U0hBMQ==":
   metadata.gz: !binary |-
-
+    NGU0MWZlMWIwMGM2MWNhMmJiOTU3MDM0YWI0ZjY0MzI3NmNkYzgyNA==
   data.tar.gz: !binary |-
-
+    MTAxYjZmYWNjMTlkYzk3Y2ZhNjdjMDFmODM2OTQ5YmQ3ZDcyNTQzNw==
 SHA512:
   metadata.gz: !binary |-
-
-
-
+    ZDE5NDExNmE1M2M4MDM0MTExNzVkNmM0ZmVjNTQyOTg5Y2JkYTdmZjBhY2Fi
+    NmM1MDc5ZTI0ODkxNzlhY2ZkZGVjZjI2ZTMyNjZhNTMwOWZhZjQ2OGY4ZTJl
+    ZmI4ZjUwNmQ3YjAxOTdlODI2MWRmMWUzNjY3Yzk1MmNiZTk0Y2E=
   data.tar.gz: !binary |-
-
-
-
+    YjNlYTZlMjU5OTBmN2ZjMTI1OTY1MzgzNzBhYjBjYTAzN2MwYzY0M2U1ZGYw
+    MjcyMmMxM2I1ZTY5OWEzZDQ3MjQ1OTgzMzcyN2I5NGQ5NTAzZTEzY2FmMDRl
+    MmJiZDdjOTFmMjQwZWU1MzM1OGJhN2E4NTRhNGExZTAyMTVmN2I=

data/.rspec
CHANGED
@@ -1,2 +1,2 @@
 --color
---format
+--format documentation

data/.rubocop.yml
CHANGED
@@ -4,14 +4,14 @@ AllCops:
     - my_test/**/*
     - examples/**/*
 
-
+Metrics/LineLength:
   Enabled: false
 
 Style/TrivialAccessors:
   Enabled: false
 
-
+Metrics/ClassLength:
   Enabled: false
 
-
+Metrics/MethodLength:
   Enabled: false

data/.rubocop_todo.yml
CHANGED
data/.travis.yml
CHANGED
@@ -3,10 +3,20 @@ rvm:
   - jruby
   - 1.9.3
   - 2.0.0
-  - 2.1.
+  - 2.1.5
+  - 2.2.0
   - rbx-2
 
-
-
-
+# Until travis supports rethinkdb as service...
+before_install:
+  - source /etc/lsb-release && echo "deb http://download.rethinkdb.com/apt $DISTRIB_CODENAME main" | sudo tee /etc/apt/sources.list.d/rethinkdb.list
+  - wget -qO- http://download.rethinkdb.com/apt/pubkey.gpg | sudo apt-key add -
+  - sudo apt-get update -q
+  - sudo apt-get install rethinkdb
+  - sudo cp /etc/rethinkdb/default.conf.sample /etc/rethinkdb/instances.d/instance1.conf
+  - sudo service rethinkdb restart
 
+services:
+  - redis
+  - mongodb
+# - rethinkdb

data/AUTHORS.md
CHANGED
data/CHANGELOG.md
CHANGED
@@ -1,5 +1,13 @@
 # Changelog
 
+## 0.4.0 (2015-01-12)
+
+[Compare changes in gem](https://github.com/taganaka/polipus/compare/0.3.3...0.4.0)
+
+* Adds RethinkDB Storage
+* BugFix: Update and fix mongo driver v1.11.1 'upsert: 1' -> 'upsert: true'
+* Organize and update specs to rspec 3
+
 ## 0.3.3 (2015-06-26)
 
 [Compare changes in gem](https://github.com/taganaka/polipus/compare/0.3.2...0.3.3)
@@ -27,7 +35,7 @@
 ```ruby
 enable_signal_handler: true / false
 ```
-
+
 * Zlib::GzipFile::Error handling
 [da3b927](https://github.com/taganaka/polipus/commit/da3b927acb1b50c26276ed458da0a365c22fd98b)
 * Faster and easier overflow management

data/Gemfile
CHANGED
data/README.md
CHANGED
@@ -1,9 +1,8 @@
 [![Build Status](https://travis-ci.org/taganaka/polipus.svg?branch=master)](https://travis-ci.org/taganaka/polipus)
-[![Coverage Status](https://
-[![Code Climate](https://codeclimate.com/github/taganaka/polipus.
+[![Coverage Status](https://img.shields.io/coveralls/taganaka/polipus/master.svg)](https://coveralls.io/r/taganaka/polipus?branch=master)
+[![Code Climate](https://codeclimate.com/github/taganaka/polipus.svg)](https://codeclimate.com/github/taganaka/polipus)
 [![RubyGems](http://img.shields.io/gem/v/polipus.svg)](https://rubygems.org/gems/polipus)
 
-
 # Polipus #
 
 A distributed web crawler written in ruby, backed by Redis

data/Rakefile
CHANGED
data/examples/basic.rb
CHANGED
@@ -20,7 +20,14 @@ options = {
   user_agent: 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9) AppleWebKit/537.71 (KHTML, like Gecko) Version/7.0 Safari/537.71',
   # Use 5 threads
   workers: 5,
-  #
+  # Queue overflow settings:
+  # * No more than 5000 elements on the Redis queue
+  # * Exceeded Items will stored on Mongo into 'rubygems_queue_overflow' collection
+  # * Check cycle is done every 60 sec
+  queue_items_limit: 5_000,
+  queue_overflow_adapter: Polipus::QueueOverflow.mongo_queue(mongo, 'rubygems_queue_overflow'),
+  queue_overflow_manager_check_time: 60,
   # Logs goes to the stdout
   logger: Logger.new(STDOUT)
 }
 Polipus::Plugin.register Polipus::Plugin::Cleaner, reset: true

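For orientation, the overflow options added to the example above plug into a crawler roughly like this. This is a hedged sketch, not part of the diff: the `mongo` connection, crawl name, seed URL, and the `on_page_downloaded` callback are illustrative placeholders.

```ruby
require 'polipus'
require 'mongo'
require 'logger'

# Placeholder connection; examples/basic.rb builds its own `mongo` object.
mongo = Mongo::Connection.new('localhost', 27_017).db('polipus')

options = {
  workers: 5,
  # Keep at most 5_000 URLs on the Redis queue; spill the rest into the
  # 'rubygems_queue_overflow' Mongo collection and re-check every 60 seconds.
  queue_items_limit: 5_000,
  queue_overflow_adapter: Polipus::QueueOverflow.mongo_queue(mongo, 'rubygems_queue_overflow'),
  queue_overflow_manager_check_time: 60,
  logger: Logger.new(STDOUT)
}

Polipus.crawler('rubygems', 'https://rubygems.org/', options) do |crawler|
  crawler.on_page_downloaded do |page|
    puts "Downloaded #{page.url}"
  end
end
```
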
data/lib/polipus.rb
CHANGED
@@ -118,6 +118,7 @@ module Polipus
       @on_before_save = []
       @on_page_error = []
       @focus_crawl_block = nil
+      @on_crawl_start = []
       @on_crawl_end = []
       @redis_factory = nil
 
@@ -131,6 +132,20 @@ module Polipus
       @robots = Polipus::Robotex.new(@options[:user_agent]) if @options[:obey_robots_txt]
       # Attach signal handling if enabled
       SignalHandler.enable if @options[:enable_signal_handler]
+
+      if queue_overflow_adapter
+        @on_crawl_start << lambda do |_|
+          Thread.new do
+            Thread.current[:name] = :overflow_items_controller
+            overflow_items_controller.run
+          end
+        end
+      end
+
+      @on_crawl_end << lambda do |_|
+        Thread.list.select { |thread| thread.status && Thread.current[:name] == :overflow_items_controller }.each(&:kill)
+      end
+
       execute_plugin 'on_initialize'
 
       yield self if block_given?
@@ -141,13 +156,13 @@ module Polipus
     end
 
     def takeover
-      overflow_items_controller if queue_overflow_adapter
-
       @urls.each do |u|
         add_url(u) { |page| page.user_data.p_seeded = true }
      end
      return if internal_queue.empty?
 
+      @on_crawl_start.each { |e| e.call(self) }
+
      execute_plugin 'on_crawl_start'
      @options[:workers].times do |worker_number|
        @workers_pool << Thread.new do
@@ -237,6 +252,7 @@ module Polipus
          end
        end
      end
+
      @workers_pool.each { |w| w.join }
      @on_crawl_end.each { |e| e.call(self) }
      execute_plugin 'on_crawl_end'
@@ -269,6 +285,12 @@ module Polipus
      self
    end
 
+    # A block of code will be executed when crawl session is starting
+    def on_crawl_start(&block)
+      @on_crawl_start << block
+      self
+    end
+
    # A block of code will be executed on every page downloaded
    # before being saved in the registered storage
    def on_before_save(&block)
@@ -439,17 +461,7 @@ module Polipus
      should_be_visited?(page.url, false)
    end
 
-
-
-        loop do
-          @logger.info { 'Overflow Manager: cycle started' }
-          removed, restored = @overflow_manager.perform
-          @logger.info { "Overflow Manager: items removed=#{removed}, items restored=#{restored}, items stored=#{queue_overflow_adapter.size}" }
-          sleep @options[:queue_overflow_manager_check_time]
-          break if SignalHandler.terminated?
-        end
-
-      end
+      QueueOverflow::Worker.new(@overflow_manager)
    end
 
    def internal_queue

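In short, the hunks above add a public `on_crawl_start` hook (fired in `#takeover` right before the worker threads start) and replace the inline overflow loop with `QueueOverflow::Worker`, which is spawned on a named thread whenever an overflow adapter is configured. A hedged usage sketch of the new hook follows; the crawl name, seed URL, and log messages are illustrative.

```ruby
Polipus.crawler('rubygems', 'https://rubygems.org/') do |crawler|
  # Runs once per crawl session, before any worker thread is spawned.
  crawler.on_crawl_start do |c|
    c.logger.info { 'Crawl session is starting' }
  end

  # The existing on_crawl_end hook still fires after all workers join.
  crawler.on_crawl_end do |c|
    c.logger.info { 'Crawl session finished' }
  end
end
```
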
data/lib/polipus/queue_overflow/mongo_queue.rb
CHANGED
@@ -28,7 +28,7 @@ module Polipus
 
   def push(data)
     if @options[:ensure_uniq]
-      @mongo_db[@collection_name].update({ payload: data }, { payload: data }, { upsert:
+      @mongo_db[@collection_name].update({ payload: data }, { payload: data }, { upsert: true, w: 1 })
     else
       @mongo_db[@collection_name].insert(payload: data)
     end

data/lib/polipus/queue_overflow/worker.rb
@@ -0,0 +1,24 @@
+# encoding: UTF-8
+module Polipus
+  module QueueOverflow
+    class Worker
+      def initialize(manager)
+        @logger = manager.polipus.logger
+        @delay = manager.polipus.options[:queue_overflow_manager_check_time]
+        @adapter = manager.polipus.queue_overflow_adapter
+        @manager = manager
+      end
+
+      def run
+        @logger.info { 'Overflow::Worker::run' }
+        loop do
+          @logger.info { 'Overflow Manager: cycle started' }
+          removed, restored = @manager.perform
+          @logger.info { "Overflow Manager: items removed=#{removed}, items restored=#{restored}, items stored=#{@adapter.size}" }
+          sleep @delay
+          break if SignalHandler.terminated?
+        end
+      end
+    end
+  end
+end

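As the `lib/polipus.rb` hunk earlier shows, this worker is not meant to be started by hand: the crawler registers a lambda that spawns it on a named thread when an overflow adapter is present, and kills that thread again from the `on_crawl_end` lambda. The sketch below only mirrors that hunk with explanatory comments; every name comes from the diff itself.

```ruby
# Inside PolipusCrawler#initialize, when queue_overflow_adapter is set:
@on_crawl_start << lambda do |_|
  Thread.new do
    # The thread is named so the on_crawl_end lambda can find and kill it.
    Thread.current[:name] = :overflow_items_controller
    # overflow_items_controller now returns QueueOverflow::Worker.new(@overflow_manager);
    # run loops until SignalHandler.terminated?, sleeping
    # queue_overflow_manager_check_time seconds between cycles.
    overflow_items_controller.run
  end
end
```
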
data/lib/polipus/storage.rb
CHANGED
@@ -1,27 +1,21 @@
-# encoding: UTF-8
 require 'polipus/storage/base'
+
 module Polipus
   module Storage
-
+    COLLECTION = 'pages'
+
+    def self.mongo_store(mongo = nil, collection = COLLECTION, except = [])
       require 'polipus/storage/mongo_store'
       mongo ||= Mongo::Connection.new('localhost', 27_017, pool_size: 15, pool_timeout: 5).db('polipus')
       fail 'First argument must be an instance of Mongo::DB' unless mongo.is_a?(Mongo::DB)
-      self::MongoStore.new(mongo: mongo, collection:
+      self::MongoStore.new(mongo: mongo, collection: collection, except: except)
     end
 
-    def self.
-      require 'polipus/storage/
-
-
-
-      end
-
-      self::S3Store.new(
-        bucket: bucket_name,
-        access_key_id: aws_credential[:access_key_id],
-        secret_access_key: aws_credential[:secret_access_key],
-        except: except
-      )
+    def self.rethink_store(conn = nil, table = COLLECTION, except = [])
+      require 'polipus/storage/rethink_store'
+      conn ||= RethinkDB::RQL.new.connect(host: 'localhost', port: 28_015, db: 'polipus' )
+      fail "First argument must be a RethinkDB::Connection, got `#{conn.class}`" unless conn.is_a?(RethinkDB::Connection)
+      self::RethinkStore.new(conn: conn, table: table, except: except)
     end
 
     def self.dev_null

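With the S3 backend removed, storage selection now goes through these two factory helpers. A hedged sketch of wiring one into a crawler follows; the connection defaults come from the helpers themselves, while the crawl name and seed URL are illustrative.

```ruby
require 'polipus'

# MongoDB-backed page storage; with no arguments the helper connects to
# localhost:27017 and uses the shared COLLECTION name ('pages').
storage = Polipus::Storage.mongo_store

# Or RethinkDB-backed storage; with no arguments the helper connects to
# localhost:28015, database 'polipus', table 'pages'.
# storage = Polipus::Storage.rethink_store

Polipus.crawler('rubygems', 'https://rubygems.org/', storage: storage)
```
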
data/lib/polipus/storage/mongo_store.rb
CHANGED
@@ -2,6 +2,7 @@
 require 'mongo'
 require 'zlib'
 require 'thread'
+
 module Polipus
   module Storage
     class MongoStore < Base
@@ -10,7 +11,11 @@ module Polipus
         @mongo = options[:mongo]
         @collection = options[:collection]
         @mongo.create_collection(@collection)
-
+        begin
+          @mongo[@collection].ensure_index(:uuid, unique: true, dropDups: true, background: true)
+        rescue Exception
+        end
+
         @compress_body = options[:compress_body] ||= true
         @except = options[:except] ||= []
         @semaphore = Mutex.new

data/lib/polipus/storage/rethink_store.rb
@@ -0,0 +1,90 @@
+# encoding: UTF-8
+require 'rethinkdb'
+require 'thread'
+require 'zlib'
+
+module Polipus
+  module Storage
+    class RethinkStore < Base
+      BINARY_FIELDS = %w(body headers data)
+      def initialize(options = {})
+        @r = RethinkDB::RQL.new
+        @rethink = options[:conn]
+        @table = options[:table]
+
+        unless @r.table_list.run(@rethink).include?(@table)
+          @r.table_create(@table).run(@rethink)
+          @r.table(@table).index_create('created_at')
+        end
+
+        @compress_body = options[:compress_body] ||= true
+        @except = options[:except] ||= []
+        @semaphore = Mutex.new
+      end
+
+      def add(page)
+        @semaphore.synchronize do
+          obj = page.to_hash
+          @except.each { |e| obj.delete e.to_s }
+          obj[:id] = uuid(page)
+          obj['body'] = Zlib::Deflate.deflate(obj['body']) if @compress_body && obj['body']
+          obj['created_at'] ||= Time.now.to_i
+          BINARY_FIELDS.each do |field|
+            # Use some marshalling?
+            obj[field] = @r.binary(obj[field]) unless obj[field].nil?
+          end
+
+          @r.table(@table).insert(obj).run(@rethink, durability: 'soft')
+          obj[:id]
+        end
+      end
+
+      def exists?(page)
+        @semaphore.synchronize do
+          doc = @r.table(@table).get(uuid(page)).run(@rethink)
+          !doc.nil?
+        end
+      end
+
+      def get(page)
+        @semaphore.synchronize do
+          data = @r.table(@table).get(uuid(page)).run(@rethink)
+          return load_page(data) if data
+        end
+      end
+
+      def remove(page)
+        @semaphore.synchronize do
+          @r.table(@table).get(uuid(page)).delete.run(@rethink)
+        end
+      end
+
+      def count
+        @r.table(@table).count.run(@rethink)
+      end
+
+      def each
+        @r.table(@table).run(@rethink).each do |doc|
+          page = load_page(doc)
+          yield doc[:id], page
+        end
+      end
+
+      def clear
+        @r.table(@table).delete.run(@rethink)
+      end
+
+      private
+
+      def load_page(hash)
+        BINARY_FIELDS.each do |field|
+          hash[field] = hash[field].to_s
+        end
+        hash['body'] = Zlib::Inflate.inflate(hash['body']) if @compress_body && hash['body'] && !hash['body'].empty?
+        page = Page.from_hash(hash)
+        page.fetched_at ||= hash['created_at']
+        page
+      end
+    end
+  end
+end

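A hedged sketch of exercising the new store directly: the explicit connection mirrors the default built in `Polipus::Storage.rethink_store`, and the `Polipus::Page.new(url, code:, body:)` construction is an assumption about the existing Page API rather than something shown in this diff.

```ruby
require 'polipus'
require 'rethinkdb'

conn  = RethinkDB::RQL.new.connect(host: 'localhost', port: 28_015, db: 'polipus')
store = Polipus::Storage.rethink_store(conn, 'pages')

# Assumed Page construction, for illustration only.
page = Polipus::Page.new('http://example.com/', code: 200, body: '<html></html>')

store.add(page)      # deflates the body and inserts with durability: 'soft'
store.exists?(page)  # => true
store.get(page)      # re-inflates the body via the private load_page helper
store.count          # => number of stored documents
```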