polipus 0.3.7 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45)
  1. checksums.yaml +8 -8
  2. data/.rspec +1 -1
  3. data/.rubocop.yml +3 -3
  4. data/.rubocop_todo.yml +1 -1
  5. data/.travis.yml +14 -4
  6. data/AUTHORS.md +1 -0
  7. data/CHANGELOG.md +9 -1
  8. data/Gemfile +9 -0
  9. data/README.md +2 -3
  10. data/Rakefile +1 -3
  11. data/examples/basic.rb +8 -1
  12. data/lib/polipus.rb +25 -13
  13. data/lib/polipus/queue_overflow.rb +1 -0
  14. data/lib/polipus/queue_overflow/manager.rb +1 -0
  15. data/lib/polipus/queue_overflow/mongo_queue.rb +1 -1
  16. data/lib/polipus/queue_overflow/worker.rb +24 -0
  17. data/lib/polipus/storage.rb +10 -16
  18. data/lib/polipus/storage/mongo_store.rb +6 -1
  19. data/lib/polipus/storage/rethink_store.rb +90 -0
  20. data/lib/polipus/version.rb +1 -1
  21. data/polipus.gemspec +16 -18
  22. data/spec/{http_spec.rb → polipus/http_spec.rb} +26 -37
  23. data/spec/{page_spec.rb → polipus/page_spec.rb} +7 -11
  24. data/spec/{queue_overflow_manager_spec.rb → polipus/queue_overflow/manager_spec.rb} +22 -29
  25. data/spec/{queue_overflow_spec.rb → polipus/queue_overflow_spec.rb} +14 -20
  26. data/spec/{robotex_spec.rb → polipus/robotex_spec.rb} +10 -11
  27. data/spec/{signal_handler_spec.rb → polipus/signal_handler_spec.rb} +2 -6
  28. data/spec/{storage_memory_spec.rb → polipus/storage/memory_store_spec.rb} +18 -21
  29. data/spec/{storage_mongo_spec.rb → polipus/storage/mongo_store_spec.rb} +23 -25
  30. data/spec/polipus/storage/rethink_store_spec.rb +117 -0
  31. data/spec/{url_tracker_spec.rb → polipus/url_tracker_spec.rb} +4 -4
  32. data/spec/polipus_spec.rb +13 -15
  33. data/spec/spec_helper.rb +13 -12
  34. metadata +76 -154
  35. data/lib/polipus/storage/s3_store.rb +0 -96
  36. data/spec/cassettes/08b228db424a926e1ed6ab63b38d847e.yml +0 -166
  37. data/spec/cassettes/20aa41f181b49f00078c3ca30bad5afe.yml +0 -166
  38. data/spec/cassettes/4640919145753505af2d0f8423de37f3.yml +0 -270
  39. data/spec/cassettes/66aae15a03f4aab8efd15e40d2d7882a.yml +0 -194
  40. data/spec/cassettes/76b7c197c95a5bf9b1e882c567192d72.yml +0 -183
  41. data/spec/cassettes/9b1d523b7f5db7214f8a8bd9272cccba.yml +0 -221
  42. data/spec/cassettes/ab333f89535a2efb284913fede6aa7c7.yml +0 -221
  43. data/spec/cassettes/ae5d7cffde3f53122cdf79f3d1367e8e.yml +0 -221
  44. data/spec/cassettes/ffe3d588b6df4b9de35e5a7ccaf5a81b.yml +0 -695
  45. data/spec/storage_s3_spec.rb +0 -115
checksums.yaml CHANGED
@@ -1,15 +1,15 @@
1
1
  ---
2
2
  !binary "U0hBMQ==":
3
3
  metadata.gz: !binary |-
4
- ZmMzMzJiMDQ1NGQzYTRhNTRmMGZiZWNhY2Y4YTgwNTU2OTljNzdkNQ==
4
+ NGU0MWZlMWIwMGM2MWNhMmJiOTU3MDM0YWI0ZjY0MzI3NmNkYzgyNA==
5
5
  data.tar.gz: !binary |-
6
- ZDQyNTBiNjkzYzQ0ZDc3ODc3N2MxMTcxOTAzOWFkZjliZGZlYzFjZQ==
6
+ MTAxYjZmYWNjMTlkYzk3Y2ZhNjdjMDFmODM2OTQ5YmQ3ZDcyNTQzNw==
7
7
  SHA512:
8
8
  metadata.gz: !binary |-
9
- MWMwNjYzMzY5ZmYzYzNiMTdlMTFkOTNhNGYzNTViZmQ4ZjdkYWFkMjE1YTMz
10
- MDc3ZDA1Y2JlN2FiNjVmZGZmYWRlNDU2YjcwYjdlNjFiYjViY2NiZTA1OTNm
11
- M2JhYjVhYjkwMzlhNWIxMmM2MzhmNWM5YmY5ZTQ1ZjE2OWY5MGM=
9
+ ZDE5NDExNmE1M2M4MDM0MTExNzVkNmM0ZmVjNTQyOTg5Y2JkYTdmZjBhY2Fi
10
+ NmM1MDc5ZTI0ODkxNzlhY2ZkZGVjZjI2ZTMyNjZhNTMwOWZhZjQ2OGY4ZTJl
11
+ ZmI4ZjUwNmQ3YjAxOTdlODI2MWRmMWUzNjY3Yzk1MmNiZTk0Y2E=
12
12
  data.tar.gz: !binary |-
13
- ODlkYzUyNTI1MTVmOTI1YzI2ZjFkMzY1NTMyZWIzMWEwYjI4MGFmODc4Y2Yx
14
- ZGVkZTA2NmM2OGU5ZThkOTg1ODJjYjMxNTJiNGZkZDVhYzkwZjkzOWI1ZGI2
15
- OTcxOGViODEwZDBiYmQ1ZDU4NmRmZWI2NWZkMTFlNzIzYjdlMjc=
13
+ YjNlYTZlMjU5OTBmN2ZjMTI1OTY1MzgzNzBhYjBjYTAzN2MwYzY0M2U1ZGYw
14
+ MjcyMmMxM2I1ZTY5OWEzZDQ3MjQ1OTgzMzcyN2I5NGQ5NTAzZTEzY2FmMDRl
15
+ MmJiZDdjOTFmMjQwZWU1MzM1OGJhN2E4NTRhNGExZTAyMTVmN2I=
data/.rspec CHANGED
@@ -1,2 +1,2 @@
1
1
  --color
2
- --format progress
2
+ --format documentation
@@ -4,14 +4,14 @@ AllCops:
4
4
  - my_test/**/*
5
5
  - examples/**/*
6
6
 
7
- Style/LineLength:
7
+ Metrics/LineLength:
8
8
  Enabled: false
9
9
 
10
10
  Style/TrivialAccessors:
11
11
  Enabled: false
12
12
 
13
- Style/ClassLength:
13
+ Metrics/ClassLength:
14
14
  Enabled: false
15
15
 
16
- Style/MethodLength:
16
+ Metrics/MethodLength:
17
17
  Enabled: false
@@ -10,7 +10,7 @@ Style/ClassVars:
10
10
  Enabled: false
11
11
 
12
12
  # Offense count: 10
13
- Style/CyclomaticComplexity:
13
+ Metrics/CyclomaticComplexity:
14
14
  Max: 16
15
15
 
16
16
  # Offense count: 26
@@ -3,10 +3,20 @@ rvm:
3
3
  - jruby
4
4
  - 1.9.3
5
5
  - 2.0.0
6
- - 2.1.2
6
+ - 2.1.5
7
+ - 2.2.0
7
8
  - rbx-2
8
9
 
9
- services:
10
- - mongodb
11
- - redis
10
+ # Until travis supports rethinkdb as service...
11
+ before_install:
12
+ - source /etc/lsb-release && echo "deb http://download.rethinkdb.com/apt $DISTRIB_CODENAME main" | sudo tee /etc/apt/sources.list.d/rethinkdb.list
13
+ - wget -qO- http://download.rethinkdb.com/apt/pubkey.gpg | sudo apt-key add -
14
+ - sudo apt-get update -q
15
+ - sudo apt-get install rethinkdb
16
+ - sudo cp /etc/rethinkdb/default.conf.sample /etc/rethinkdb/instances.d/instance1.conf
17
+ - sudo service rethinkdb restart
12
18
 
19
+ services:
20
+ - redis
21
+ - mongodb
22
+ # - rethinkdb
data/AUTHORS.md CHANGED
@@ -2,3 +2,4 @@
2
2
 
3
3
  * [Francesco Laurita](francesco.laurita@gmail.com)
4
4
  * [Tobias L. Maier](http://tobiasmaier.info/)
5
+ * [Marcos Piccinini](https://github.com/nofxx)
@@ -1,5 +1,13 @@
1
1
  # Changelog
2
2
 
3
+ ## 0.4.0 (2015-01-12)
4
+
5
+ [Compare changes in gem](https://github.com/taganaka/polipus/compare/0.3.3...0.4.0)
6
+
7
+ * Adds RethinkDB Storage
8
+ * BugFix: Update and fix mongo driver v1.11.1 'upsert: 1' -> 'upsert: true'
9
+ * Organize and update specs to rspec 3
10
+
3
11
  ## 0.3.3 (2015-06-26)
4
12
 
5
13
  [Compare changes in gem](https://github.com/taganaka/polipus/compare/0.3.2...0.3.3)
@@ -27,7 +35,7 @@
27
35
  ```ruby
28
36
  enable_signal_handler: true / false
29
37
  ```
30
-
38
+
31
39
  * Zlib::GzipFile::Error handling
32
40
  [da3b927](https://github.com/taganaka/polipus/commit/da3b927acb1b50c26276ed458da0a365c22fd98b)
33
41
  * Faster and easier overflow management
data/Gemfile CHANGED
@@ -1,3 +1,12 @@
1
1
  source 'https://rubygems.org'
2
2
 
3
3
  gemspec
4
+
5
+ platform :ruby do
6
+ gem 'bson_ext'
7
+ end
8
+
9
+ platform :jruby do
10
+ gem 'json'
11
+ gem 'bson'
12
+ end
data/README.md CHANGED
@@ -1,9 +1,8 @@
1
1
  [![Build Status](https://travis-ci.org/taganaka/polipus.svg?branch=master)](https://travis-ci.org/taganaka/polipus)
2
- [![Coverage Status](https://coveralls.io/repos/taganaka/polipus/badge.png?branch=master)](https://coveralls.io/r/taganaka/polipus?branch=master)
3
- [![Code Climate](https://codeclimate.com/github/taganaka/polipus.png)](https://codeclimate.com/github/taganaka/polipus)
2
+ [![Coverage Status](https://img.shields.io/coveralls/taganaka/polipus/master.svg)](https://coveralls.io/r/taganaka/polipus?branch=master)
3
+ [![Code Climate](https://codeclimate.com/github/taganaka/polipus.svg)](https://codeclimate.com/github/taganaka/polipus)
4
4
  [![RubyGems](http://img.shields.io/gem/v/polipus.svg)](https://rubygems.org/gems/polipus)
5
5
 
6
-
7
6
  # Polipus #
8
7
 
9
8
  A distributed web crawler written in ruby, backed by Redis
data/Rakefile CHANGED
@@ -2,9 +2,7 @@
2
2
  require 'bundler/gem_tasks'
3
3
  require 'rspec/core/rake_task'
4
4
 
5
- RSpec::Core::RakeTask.new(:spec) do |spec|
6
- spec.pattern = 'spec/*_spec.rb'
7
- end
5
+ RSpec::Core::RakeTask.new(:spec)
8
6
 
9
7
  task default: :spec
10
8
  task test: :spec
@@ -20,7 +20,14 @@ options = {
20
20
  user_agent: 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9) AppleWebKit/537.71 (KHTML, like Gecko) Version/7.0 Safari/537.71',
21
21
  # Use 5 threads
22
22
  workers: 5,
23
- # Logs goes to the crawler.log file
23
+ # Queue overflow settings:
24
+ # * No more than 5000 elements on the Redis queue
25
+ # * Exceeded Items will stored on Mongo into 'rubygems_queue_overflow' collection
26
+ # * Check cycle is done every 60 sec
27
+ queue_items_limit: 5_000,
28
+ queue_overflow_adapter: Polipus::QueueOverflow.mongo_queue(mongo, 'rubygems_queue_overflow'),
29
+ queue_overflow_manager_check_time: 60,
30
+ # Logs goes to the stdout
24
31
  logger: Logger.new(STDOUT)
25
32
  }
26
33
  Polipus::Plugin.register Polipus::Plugin::Cleaner, reset: true
@@ -118,6 +118,7 @@ module Polipus
118
118
  @on_before_save = []
119
119
  @on_page_error = []
120
120
  @focus_crawl_block = nil
121
+ @on_crawl_start = []
121
122
  @on_crawl_end = []
122
123
  @redis_factory = nil
123
124
 
@@ -131,6 +132,20 @@ module Polipus
131
132
  @robots = Polipus::Robotex.new(@options[:user_agent]) if @options[:obey_robots_txt]
132
133
  # Attach signal handling if enabled
133
134
  SignalHandler.enable if @options[:enable_signal_handler]
135
+
136
+ if queue_overflow_adapter
137
+ @on_crawl_start << lambda do |_|
138
+ Thread.new do
139
+ Thread.current[:name] = :overflow_items_controller
140
+ overflow_items_controller.run
141
+ end
142
+ end
143
+ end
144
+
145
+ @on_crawl_end << lambda do |_|
146
+ Thread.list.select { |thread| thread.status && Thread.current[:name] == :overflow_items_controller }.each(&:kill)
147
+ end
148
+
134
149
  execute_plugin 'on_initialize'
135
150
 
136
151
  yield self if block_given?
@@ -141,13 +156,13 @@ module Polipus
141
156
  end
142
157
 
143
158
  def takeover
144
- overflow_items_controller if queue_overflow_adapter
145
-
146
159
  @urls.each do |u|
147
160
  add_url(u) { |page| page.user_data.p_seeded = true }
148
161
  end
149
162
  return if internal_queue.empty?
150
163
 
164
+ @on_crawl_start.each { |e| e.call(self) }
165
+
151
166
  execute_plugin 'on_crawl_start'
152
167
  @options[:workers].times do |worker_number|
153
168
  @workers_pool << Thread.new do
@@ -237,6 +252,7 @@ module Polipus
237
252
  end
238
253
  end
239
254
  end
255
+
240
256
  @workers_pool.each { |w| w.join }
241
257
  @on_crawl_end.each { |e| e.call(self) }
242
258
  execute_plugin 'on_crawl_end'
@@ -269,6 +285,12 @@ module Polipus
269
285
  self
270
286
  end
271
287
 
288
+ # A block of code will be executed when crawl session is starting
289
+ def on_crawl_start(&block)
290
+ @on_crawl_start << block
291
+ self
292
+ end
293
+
272
294
  # A block of code will be executed on every page downloaded
273
295
  # before being saved in the registered storage
274
296
  def on_before_save(&block)
@@ -439,17 +461,7 @@ module Polipus
439
461
  should_be_visited?(page.url, false)
440
462
  end
441
463
 
442
- Thread.new do
443
-
444
- loop do
445
- @logger.info { 'Overflow Manager: cycle started' }
446
- removed, restored = @overflow_manager.perform
447
- @logger.info { "Overflow Manager: items removed=#{removed}, items restored=#{restored}, items stored=#{queue_overflow_adapter.size}" }
448
- sleep @options[:queue_overflow_manager_check_time]
449
- break if SignalHandler.terminated?
450
- end
451
-
452
- end
464
+ QueueOverflow::Worker.new(@overflow_manager)
453
465
  end
454
466
 
455
467
  def internal_queue
@@ -1,5 +1,6 @@
1
1
  # encoding: UTF-8
2
2
  require 'polipus/queue_overflow/manager'
3
+ require 'polipus/queue_overflow/worker'
3
4
  module Polipus
4
5
  module QueueOverflow
5
6
  def self.mongo_queue(mongo_db, queue_name, options = {})
@@ -3,6 +3,7 @@ module Polipus
3
3
  module QueueOverflow
4
4
  class Manager
5
5
  attr_accessor :url_filter
6
+ attr_reader :polipus
6
7
  def initialize(polipus, main_q, item_limit)
7
8
  @polipus = polipus
8
9
  @main_q = main_q
@@ -28,7 +28,7 @@ module Polipus
28
28
 
29
29
  def push(data)
30
30
  if @options[:ensure_uniq]
31
- @mongo_db[@collection_name].update({ payload: data }, { payload: data }, { upsert: 1, w: 1 })
31
+ @mongo_db[@collection_name].update({ payload: data }, { payload: data }, { upsert: true, w: 1 })
32
32
  else
33
33
  @mongo_db[@collection_name].insert(payload: data)
34
34
  end
@@ -0,0 +1,24 @@
1
+ # encoding: UTF-8
2
+ module Polipus
3
+ module QueueOverflow
4
+ class Worker
5
+ def initialize(manager)
6
+ @logger = manager.polipus.logger
7
+ @delay = manager.polipus.options[:queue_overflow_manager_check_time]
8
+ @adapter = manager.polipus.queue_overflow_adapter
9
+ @manager = manager
10
+ end
11
+
12
+ def run
13
+ @logger.info { 'Overflow::Worker::run' }
14
+ loop do
15
+ @logger.info { 'Overflow Manager: cycle started' }
16
+ removed, restored = @manager.perform
17
+ @logger.info { "Overflow Manager: items removed=#{removed}, items restored=#{restored}, items stored=#{@adapter.size}" }
18
+ sleep @delay
19
+ break if SignalHandler.terminated?
20
+ end
21
+ end
22
+ end
23
+ end
24
+ end
@@ -1,27 +1,21 @@
1
- # encoding: UTF-8
2
1
  require 'polipus/storage/base'
2
+
3
3
  module Polipus
4
4
  module Storage
5
- def self.mongo_store(mongo = nil, collection_name = 'pages', except = [])
5
+ COLLECTION = 'pages'
6
+
7
+ def self.mongo_store(mongo = nil, collection = COLLECTION, except = [])
6
8
  require 'polipus/storage/mongo_store'
7
9
  mongo ||= Mongo::Connection.new('localhost', 27_017, pool_size: 15, pool_timeout: 5).db('polipus')
8
10
  fail 'First argument must be an instance of Mongo::DB' unless mongo.is_a?(Mongo::DB)
9
- self::MongoStore.new(mongo: mongo, collection: collection_name, except: except)
11
+ self::MongoStore.new(mongo: mongo, collection: collection, except: except)
10
12
  end
11
13
 
12
- def self.s3_store(bucket_name = 'pages', aws_credential = {}, except = [])
13
- require 'polipus/storage/s3_store'
14
-
15
- if aws_credential[:access_key_id].nil? || aws_credential[:secret_access_key].nil?
16
- fail 'You have to specify AWS crediantials: :access_key_id and :secret_access_key'
17
- end
18
-
19
- self::S3Store.new(
20
- bucket: bucket_name,
21
- access_key_id: aws_credential[:access_key_id],
22
- secret_access_key: aws_credential[:secret_access_key],
23
- except: except
24
- )
14
+ def self.rethink_store(conn = nil, table = COLLECTION, except = [])
15
+ require 'polipus/storage/rethink_store'
16
+ conn ||= RethinkDB::RQL.new.connect(host: 'localhost', port: 28_015, db: 'polipus' )
17
+ fail "First argument must be a RethinkDB::Connection, got `#{conn.class}`" unless conn.is_a?(RethinkDB::Connection)
18
+ self::RethinkStore.new(conn: conn, table: table, except: except)
25
19
  end
26
20
 
27
21
  def self.dev_null
@@ -2,6 +2,7 @@
2
2
  require 'mongo'
3
3
  require 'zlib'
4
4
  require 'thread'
5
+
5
6
  module Polipus
6
7
  module Storage
7
8
  class MongoStore < Base
@@ -10,7 +11,11 @@ module Polipus
10
11
  @mongo = options[:mongo]
11
12
  @collection = options[:collection]
12
13
  @mongo.create_collection(@collection)
13
- @mongo[@collection].ensure_index(:uuid, unique: true, dropDups: true, background: true)
14
+ begin
15
+ @mongo[@collection].ensure_index(:uuid, unique: true, dropDups: true, background: true)
16
+ rescue Exception
17
+ end
18
+
14
19
  @compress_body = options[:compress_body] ||= true
15
20
  @except = options[:except] ||= []
16
21
  @semaphore = Mutex.new
@@ -0,0 +1,90 @@
1
+ # encoding: UTF-8
2
+ require 'rethinkdb'
3
+ require 'thread'
4
+ require 'zlib'
5
+
6
+ module Polipus
7
+ module Storage
8
+ class RethinkStore < Base
9
+ BINARY_FIELDS = %w(body headers data)
10
+ def initialize(options = {})
11
+ @r = RethinkDB::RQL.new
12
+ @rethink = options[:conn]
13
+ @table = options[:table]
14
+
15
+ unless @r.table_list.run(@rethink).include?(@table)
16
+ @r.table_create(@table).run(@rethink)
17
+ @r.table(@table).index_create('created_at')
18
+ end
19
+
20
+ @compress_body = options[:compress_body] ||= true
21
+ @except = options[:except] ||= []
22
+ @semaphore = Mutex.new
23
+ end
24
+
25
+ def add(page)
26
+ @semaphore.synchronize do
27
+ obj = page.to_hash
28
+ @except.each { |e| obj.delete e.to_s }
29
+ obj[:id] = uuid(page)
30
+ obj['body'] = Zlib::Deflate.deflate(obj['body']) if @compress_body && obj['body']
31
+ obj['created_at'] ||= Time.now.to_i
32
+ BINARY_FIELDS.each do |field|
33
+ # Use some marshalling?
34
+ obj[field] = @r.binary(obj[field]) unless obj[field].nil?
35
+ end
36
+
37
+ @r.table(@table).insert(obj).run(@rethink, durability: 'soft')
38
+ obj[:id]
39
+ end
40
+ end
41
+
42
+ def exists?(page)
43
+ @semaphore.synchronize do
44
+ doc = @r.table(@table).get(uuid(page)).run(@rethink)
45
+ !doc.nil?
46
+ end
47
+ end
48
+
49
+ def get(page)
50
+ @semaphore.synchronize do
51
+ data = @r.table(@table).get(uuid(page)).run(@rethink)
52
+ return load_page(data) if data
53
+ end
54
+ end
55
+
56
+ def remove(page)
57
+ @semaphore.synchronize do
58
+ @r.table(@table).get(uuid(page)).delete.run(@rethink)
59
+ end
60
+ end
61
+
62
+ def count
63
+ @r.table(@table).count.run(@rethink)
64
+ end
65
+
66
+ def each
67
+ @r.table(@table).run(@rethink).each do |doc|
68
+ page = load_page(doc)
69
+ yield doc[:id], page
70
+ end
71
+ end
72
+
73
+ def clear
74
+ @r.table(@table).delete.run(@rethink)
75
+ end
76
+
77
+ private
78
+
79
+ def load_page(hash)
80
+ BINARY_FIELDS.each do |field|
81
+ hash[field] = hash[field].to_s
82
+ end
83
+ hash['body'] = Zlib::Inflate.inflate(hash['body']) if @compress_body && hash['body'] && !hash['body'].empty?
84
+ page = Page.from_hash(hash)
85
+ page.fetched_at ||= hash['created_at']
86
+ page
87
+ end
88
+ end
89
+ end
90
+ end