polipus 0.3.7 → 0.4.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (45) hide show
  1. checksums.yaml +8 -8
  2. data/.rspec +1 -1
  3. data/.rubocop.yml +3 -3
  4. data/.rubocop_todo.yml +1 -1
  5. data/.travis.yml +14 -4
  6. data/AUTHORS.md +1 -0
  7. data/CHANGELOG.md +9 -1
  8. data/Gemfile +9 -0
  9. data/README.md +2 -3
  10. data/Rakefile +1 -3
  11. data/examples/basic.rb +8 -1
  12. data/lib/polipus.rb +25 -13
  13. data/lib/polipus/queue_overflow.rb +1 -0
  14. data/lib/polipus/queue_overflow/manager.rb +1 -0
  15. data/lib/polipus/queue_overflow/mongo_queue.rb +1 -1
  16. data/lib/polipus/queue_overflow/worker.rb +24 -0
  17. data/lib/polipus/storage.rb +10 -16
  18. data/lib/polipus/storage/mongo_store.rb +6 -1
  19. data/lib/polipus/storage/rethink_store.rb +90 -0
  20. data/lib/polipus/version.rb +1 -1
  21. data/polipus.gemspec +16 -18
  22. data/spec/{http_spec.rb → polipus/http_spec.rb} +26 -37
  23. data/spec/{page_spec.rb → polipus/page_spec.rb} +7 -11
  24. data/spec/{queue_overflow_manager_spec.rb → polipus/queue_overflow/manager_spec.rb} +22 -29
  25. data/spec/{queue_overflow_spec.rb → polipus/queue_overflow_spec.rb} +14 -20
  26. data/spec/{robotex_spec.rb → polipus/robotex_spec.rb} +10 -11
  27. data/spec/{signal_handler_spec.rb → polipus/signal_handler_spec.rb} +2 -6
  28. data/spec/{storage_memory_spec.rb → polipus/storage/memory_store_spec.rb} +18 -21
  29. data/spec/{storage_mongo_spec.rb → polipus/storage/mongo_store_spec.rb} +23 -25
  30. data/spec/polipus/storage/rethink_store_spec.rb +117 -0
  31. data/spec/{url_tracker_spec.rb → polipus/url_tracker_spec.rb} +4 -4
  32. data/spec/polipus_spec.rb +13 -15
  33. data/spec/spec_helper.rb +13 -12
  34. metadata +76 -154
  35. data/lib/polipus/storage/s3_store.rb +0 -96
  36. data/spec/cassettes/08b228db424a926e1ed6ab63b38d847e.yml +0 -166
  37. data/spec/cassettes/20aa41f181b49f00078c3ca30bad5afe.yml +0 -166
  38. data/spec/cassettes/4640919145753505af2d0f8423de37f3.yml +0 -270
  39. data/spec/cassettes/66aae15a03f4aab8efd15e40d2d7882a.yml +0 -194
  40. data/spec/cassettes/76b7c197c95a5bf9b1e882c567192d72.yml +0 -183
  41. data/spec/cassettes/9b1d523b7f5db7214f8a8bd9272cccba.yml +0 -221
  42. data/spec/cassettes/ab333f89535a2efb284913fede6aa7c7.yml +0 -221
  43. data/spec/cassettes/ae5d7cffde3f53122cdf79f3d1367e8e.yml +0 -221
  44. data/spec/cassettes/ffe3d588b6df4b9de35e5a7ccaf5a81b.yml +0 -695
  45. data/spec/storage_s3_spec.rb +0 -115
checksums.yaml CHANGED
@@ -1,15 +1,15 @@
1
1
  ---
2
2
  !binary "U0hBMQ==":
3
3
  metadata.gz: !binary |-
4
- ZmMzMzJiMDQ1NGQzYTRhNTRmMGZiZWNhY2Y4YTgwNTU2OTljNzdkNQ==
4
+ NGU0MWZlMWIwMGM2MWNhMmJiOTU3MDM0YWI0ZjY0MzI3NmNkYzgyNA==
5
5
  data.tar.gz: !binary |-
6
- ZDQyNTBiNjkzYzQ0ZDc3ODc3N2MxMTcxOTAzOWFkZjliZGZlYzFjZQ==
6
+ MTAxYjZmYWNjMTlkYzk3Y2ZhNjdjMDFmODM2OTQ5YmQ3ZDcyNTQzNw==
7
7
  SHA512:
8
8
  metadata.gz: !binary |-
9
- MWMwNjYzMzY5ZmYzYzNiMTdlMTFkOTNhNGYzNTViZmQ4ZjdkYWFkMjE1YTMz
10
- MDc3ZDA1Y2JlN2FiNjVmZGZmYWRlNDU2YjcwYjdlNjFiYjViY2NiZTA1OTNm
11
- M2JhYjVhYjkwMzlhNWIxMmM2MzhmNWM5YmY5ZTQ1ZjE2OWY5MGM=
9
+ ZDE5NDExNmE1M2M4MDM0MTExNzVkNmM0ZmVjNTQyOTg5Y2JkYTdmZjBhY2Fi
10
+ NmM1MDc5ZTI0ODkxNzlhY2ZkZGVjZjI2ZTMyNjZhNTMwOWZhZjQ2OGY4ZTJl
11
+ ZmI4ZjUwNmQ3YjAxOTdlODI2MWRmMWUzNjY3Yzk1MmNiZTk0Y2E=
12
12
  data.tar.gz: !binary |-
13
- ODlkYzUyNTI1MTVmOTI1YzI2ZjFkMzY1NTMyZWIzMWEwYjI4MGFmODc4Y2Yx
14
- ZGVkZTA2NmM2OGU5ZThkOTg1ODJjYjMxNTJiNGZkZDVhYzkwZjkzOWI1ZGI2
15
- OTcxOGViODEwZDBiYmQ1ZDU4NmRmZWI2NWZkMTFlNzIzYjdlMjc=
13
+ YjNlYTZlMjU5OTBmN2ZjMTI1OTY1MzgzNzBhYjBjYTAzN2MwYzY0M2U1ZGYw
14
+ MjcyMmMxM2I1ZTY5OWEzZDQ3MjQ1OTgzMzcyN2I5NGQ5NTAzZTEzY2FmMDRl
15
+ MmJiZDdjOTFmMjQwZWU1MzM1OGJhN2E4NTRhNGExZTAyMTVmN2I=
data/.rspec CHANGED
@@ -1,2 +1,2 @@
1
1
  --color
2
- --format progress
2
+ --format documentation
@@ -4,14 +4,14 @@ AllCops:
4
4
  - my_test/**/*
5
5
  - examples/**/*
6
6
 
7
- Style/LineLength:
7
+ Metrics/LineLength:
8
8
  Enabled: false
9
9
 
10
10
  Style/TrivialAccessors:
11
11
  Enabled: false
12
12
 
13
- Style/ClassLength:
13
+ Metrics/ClassLength:
14
14
  Enabled: false
15
15
 
16
- Style/MethodLength:
16
+ Metrics/MethodLength:
17
17
  Enabled: false
@@ -10,7 +10,7 @@ Style/ClassVars:
10
10
  Enabled: false
11
11
 
12
12
  # Offense count: 10
13
- Style/CyclomaticComplexity:
13
+ Metrics/CyclomaticComplexity:
14
14
  Max: 16
15
15
 
16
16
  # Offense count: 26
@@ -3,10 +3,20 @@ rvm:
3
3
  - jruby
4
4
  - 1.9.3
5
5
  - 2.0.0
6
- - 2.1.2
6
+ - 2.1.5
7
+ - 2.2.0
7
8
  - rbx-2
8
9
 
9
- services:
10
- - mongodb
11
- - redis
10
+ # Until travis supports rethinkdb as a service...
11
+ before_install:
12
+ - source /etc/lsb-release && echo "deb http://download.rethinkdb.com/apt $DISTRIB_CODENAME main" | sudo tee /etc/apt/sources.list.d/rethinkdb.list
13
+ - wget -qO- http://download.rethinkdb.com/apt/pubkey.gpg | sudo apt-key add -
14
+ - sudo apt-get update -q
15
+ - sudo apt-get install rethinkdb
16
+ - sudo cp /etc/rethinkdb/default.conf.sample /etc/rethinkdb/instances.d/instance1.conf
17
+ - sudo service rethinkdb restart
12
18
 
19
+ services:
20
+ - redis
21
+ - mongodb
22
+ # - rethinkdb
data/AUTHORS.md CHANGED
@@ -2,3 +2,4 @@
2
2
 
3
3
  * [Francesco Laurita](francesco.laurita@gmail.com)
4
4
  * [Tobias L. Maier](http://tobiasmaier.info/)
5
+ * [Marcos Piccinini](https://github.com/nofxx)
@@ -1,5 +1,13 @@
1
1
  # Changelog
2
2
 
3
+ ## 0.4.0 (2015-01-12)
4
+
5
+ [Compare changes in gem](https://github.com/taganaka/polipus/compare/0.3.3...0.4.0)
6
+
7
+ * Adds RethinkDB Storage
8
+ * BugFix: Update and fix mongo driver v1.11.1 'upsert: 1' -> 'upsert: true'
9
+ * Organize and update specs to rspec 3
10
+
3
11
  ## 0.3.3 (2015-06-26)
4
12
 
5
13
  [Compare changes in gem](https://github.com/taganaka/polipus/compare/0.3.2...0.3.3)
@@ -27,7 +35,7 @@
27
35
  ```ruby
28
36
  enable_signal_handler: true / false
29
37
  ```
30
-
38
+
31
39
  * Zlib::GzipFile::Error handling
32
40
  [da3b927](https://github.com/taganaka/polipus/commit/da3b927acb1b50c26276ed458da0a365c22fd98b)
33
41
  * Faster and easier overflow management
data/Gemfile CHANGED
@@ -1,3 +1,12 @@
1
1
  source 'https://rubygems.org'
2
2
 
3
3
  gemspec
4
+
5
+ platform :ruby do
6
+ gem 'bson_ext'
7
+ end
8
+
9
+ platform :jruby do
10
+ gem 'json'
11
+ gem 'bson'
12
+ end
data/README.md CHANGED
@@ -1,9 +1,8 @@
1
1
  [![Build Status](https://travis-ci.org/taganaka/polipus.svg?branch=master)](https://travis-ci.org/taganaka/polipus)
2
- [![Coverage Status](https://coveralls.io/repos/taganaka/polipus/badge.png?branch=master)](https://coveralls.io/r/taganaka/polipus?branch=master)
3
- [![Code Climate](https://codeclimate.com/github/taganaka/polipus.png)](https://codeclimate.com/github/taganaka/polipus)
2
+ [![Coverage Status](https://img.shields.io/coveralls/taganaka/polipus/master.svg)](https://coveralls.io/r/taganaka/polipus?branch=master)
3
+ [![Code Climate](https://codeclimate.com/github/taganaka/polipus.svg)](https://codeclimate.com/github/taganaka/polipus)
4
4
  [![RubyGems](http://img.shields.io/gem/v/polipus.svg)](https://rubygems.org/gems/polipus)
5
5
 
6
-
7
6
  # Polipus #
8
7
 
9
8
  A distributed web crawler written in ruby, backed by Redis
data/Rakefile CHANGED
@@ -2,9 +2,7 @@
2
2
  require 'bundler/gem_tasks'
3
3
  require 'rspec/core/rake_task'
4
4
 
5
- RSpec::Core::RakeTask.new(:spec) do |spec|
6
- spec.pattern = 'spec/*_spec.rb'
7
- end
5
+ RSpec::Core::RakeTask.new(:spec)
8
6
 
9
7
  task default: :spec
10
8
  task test: :spec
@@ -20,7 +20,14 @@ options = {
20
20
  user_agent: 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9) AppleWebKit/537.71 (KHTML, like Gecko) Version/7.0 Safari/537.71',
21
21
  # Use 5 threads
22
22
  workers: 5,
23
- # Logs goes to the crawler.log file
23
+ # Queue overflow settings:
24
+ # * No more than 5000 elements on the Redis queue
25
+ # * Exceeded items will be stored in Mongo in the 'rubygems_queue_overflow' collection
26
+ # * Check cycle is done every 60 sec
27
+ queue_items_limit: 5_000,
28
+ queue_overflow_adapter: Polipus::QueueOverflow.mongo_queue(mongo, 'rubygems_queue_overflow'),
29
+ queue_overflow_manager_check_time: 60,
30
+ # Logs go to stdout
24
31
  logger: Logger.new(STDOUT)
25
32
  }
26
33
  Polipus::Plugin.register Polipus::Plugin::Cleaner, reset: true
@@ -118,6 +118,7 @@ module Polipus
118
118
  @on_before_save = []
119
119
  @on_page_error = []
120
120
  @focus_crawl_block = nil
121
+ @on_crawl_start = []
121
122
  @on_crawl_end = []
122
123
  @redis_factory = nil
123
124
 
@@ -131,6 +132,20 @@ module Polipus
131
132
  @robots = Polipus::Robotex.new(@options[:user_agent]) if @options[:obey_robots_txt]
132
133
  # Attach signal handling if enabled
133
134
  SignalHandler.enable if @options[:enable_signal_handler]
135
+
136
+ if queue_overflow_adapter
137
+ @on_crawl_start << lambda do |_|
138
+ Thread.new do
139
+ Thread.current[:name] = :overflow_items_controller
140
+ overflow_items_controller.run
141
+ end
142
+ end
143
+ end
144
+
145
+ @on_crawl_end << lambda do |_|
146
+ Thread.list.select { |thread| thread.status && Thread.current[:name] == :overflow_items_controller }.each(&:kill)
147
+ end
148
+
134
149
  execute_plugin 'on_initialize'
135
150
 
136
151
  yield self if block_given?
@@ -141,13 +156,13 @@ module Polipus
141
156
  end
142
157
 
143
158
  def takeover
144
- overflow_items_controller if queue_overflow_adapter
145
-
146
159
  @urls.each do |u|
147
160
  add_url(u) { |page| page.user_data.p_seeded = true }
148
161
  end
149
162
  return if internal_queue.empty?
150
163
 
164
+ @on_crawl_start.each { |e| e.call(self) }
165
+
151
166
  execute_plugin 'on_crawl_start'
152
167
  @options[:workers].times do |worker_number|
153
168
  @workers_pool << Thread.new do
@@ -237,6 +252,7 @@ module Polipus
237
252
  end
238
253
  end
239
254
  end
255
+
240
256
  @workers_pool.each { |w| w.join }
241
257
  @on_crawl_end.each { |e| e.call(self) }
242
258
  execute_plugin 'on_crawl_end'
@@ -269,6 +285,12 @@ module Polipus
269
285
  self
270
286
  end
271
287
 
288
+ # A block of code will be executed when crawl session is starting
289
+ def on_crawl_start(&block)
290
+ @on_crawl_start << block
291
+ self
292
+ end
293
+
272
294
  # A block of code will be executed on every page downloaded
273
295
  # before being saved in the registered storage
274
296
  def on_before_save(&block)
@@ -439,17 +461,7 @@ module Polipus
439
461
  should_be_visited?(page.url, false)
440
462
  end
441
463
 
442
- Thread.new do
443
-
444
- loop do
445
- @logger.info { 'Overflow Manager: cycle started' }
446
- removed, restored = @overflow_manager.perform
447
- @logger.info { "Overflow Manager: items removed=#{removed}, items restored=#{restored}, items stored=#{queue_overflow_adapter.size}" }
448
- sleep @options[:queue_overflow_manager_check_time]
449
- break if SignalHandler.terminated?
450
- end
451
-
452
- end
464
+ QueueOverflow::Worker.new(@overflow_manager)
453
465
  end
454
466
 
455
467
  def internal_queue
@@ -1,5 +1,6 @@
1
1
  # encoding: UTF-8
2
2
  require 'polipus/queue_overflow/manager'
3
+ require 'polipus/queue_overflow/worker'
3
4
  module Polipus
4
5
  module QueueOverflow
5
6
  def self.mongo_queue(mongo_db, queue_name, options = {})
@@ -3,6 +3,7 @@ module Polipus
3
3
  module QueueOverflow
4
4
  class Manager
5
5
  attr_accessor :url_filter
6
+ attr_reader :polipus
6
7
  def initialize(polipus, main_q, item_limit)
7
8
  @polipus = polipus
8
9
  @main_q = main_q
@@ -28,7 +28,7 @@ module Polipus
28
28
 
29
29
  def push(data)
30
30
  if @options[:ensure_uniq]
31
- @mongo_db[@collection_name].update({ payload: data }, { payload: data }, { upsert: 1, w: 1 })
31
+ @mongo_db[@collection_name].update({ payload: data }, { payload: data }, { upsert: true, w: 1 })
32
32
  else
33
33
  @mongo_db[@collection_name].insert(payload: data)
34
34
  end
@@ -0,0 +1,24 @@
1
+ # encoding: UTF-8
2
+ module Polipus
3
+ module QueueOverflow
4
+ class Worker
5
+ def initialize(manager)
6
+ @logger = manager.polipus.logger
7
+ @delay = manager.polipus.options[:queue_overflow_manager_check_time]
8
+ @adapter = manager.polipus.queue_overflow_adapter
9
+ @manager = manager
10
+ end
11
+
12
+ def run
13
+ @logger.info { 'Overflow::Worker::run' }
14
+ loop do
15
+ @logger.info { 'Overflow Manager: cycle started' }
16
+ removed, restored = @manager.perform
17
+ @logger.info { "Overflow Manager: items removed=#{removed}, items restored=#{restored}, items stored=#{@adapter.size}" }
18
+ sleep @delay
19
+ break if SignalHandler.terminated?
20
+ end
21
+ end
22
+ end
23
+ end
24
+ end
@@ -1,27 +1,21 @@
1
- # encoding: UTF-8
2
1
  require 'polipus/storage/base'
2
+
3
3
  module Polipus
4
4
  module Storage
5
- def self.mongo_store(mongo = nil, collection_name = 'pages', except = [])
5
+ COLLECTION = 'pages'
6
+
7
+ def self.mongo_store(mongo = nil, collection = COLLECTION, except = [])
6
8
  require 'polipus/storage/mongo_store'
7
9
  mongo ||= Mongo::Connection.new('localhost', 27_017, pool_size: 15, pool_timeout: 5).db('polipus')
8
10
  fail 'First argument must be an instance of Mongo::DB' unless mongo.is_a?(Mongo::DB)
9
- self::MongoStore.new(mongo: mongo, collection: collection_name, except: except)
11
+ self::MongoStore.new(mongo: mongo, collection: collection, except: except)
10
12
  end
11
13
 
12
- def self.s3_store(bucket_name = 'pages', aws_credential = {}, except = [])
13
- require 'polipus/storage/s3_store'
14
-
15
- if aws_credential[:access_key_id].nil? || aws_credential[:secret_access_key].nil?
16
- fail 'You have to specify AWS crediantials: :access_key_id and :secret_access_key'
17
- end
18
-
19
- self::S3Store.new(
20
- bucket: bucket_name,
21
- access_key_id: aws_credential[:access_key_id],
22
- secret_access_key: aws_credential[:secret_access_key],
23
- except: except
24
- )
14
+ def self.rethink_store(conn = nil, table = COLLECTION, except = [])
15
+ require 'polipus/storage/rethink_store'
16
+ conn ||= RethinkDB::RQL.new.connect(host: 'localhost', port: 28_015, db: 'polipus' )
17
+ fail "First argument must be a RethinkDB::Connection, got `#{conn.class}`" unless conn.is_a?(RethinkDB::Connection)
18
+ self::RethinkStore.new(conn: conn, table: table, except: except)
25
19
  end
26
20
 
27
21
  def self.dev_null
@@ -2,6 +2,7 @@
2
2
  require 'mongo'
3
3
  require 'zlib'
4
4
  require 'thread'
5
+
5
6
  module Polipus
6
7
  module Storage
7
8
  class MongoStore < Base
@@ -10,7 +11,11 @@ module Polipus
10
11
  @mongo = options[:mongo]
11
12
  @collection = options[:collection]
12
13
  @mongo.create_collection(@collection)
13
- @mongo[@collection].ensure_index(:uuid, unique: true, dropDups: true, background: true)
14
+ begin
15
+ @mongo[@collection].ensure_index(:uuid, unique: true, dropDups: true, background: true)
16
+ rescue Exception
17
+ end
18
+
14
19
  @compress_body = options[:compress_body] ||= true
15
20
  @except = options[:except] ||= []
16
21
  @semaphore = Mutex.new
@@ -0,0 +1,90 @@
1
+ # encoding: UTF-8
2
+ require 'rethinkdb'
3
+ require 'thread'
4
+ require 'zlib'
5
+
6
+ module Polipus
7
+ module Storage
8
+ class RethinkStore < Base
9
+ BINARY_FIELDS = %w(body headers data)
10
+ def initialize(options = {})
11
+ @r = RethinkDB::RQL.new
12
+ @rethink = options[:conn]
13
+ @table = options[:table]
14
+
15
+ unless @r.table_list.run(@rethink).include?(@table)
16
+ @r.table_create(@table).run(@rethink)
17
+ @r.table(@table).index_create('created_at')
18
+ end
19
+
20
+ @compress_body = options[:compress_body] ||= true
21
+ @except = options[:except] ||= []
22
+ @semaphore = Mutex.new
23
+ end
24
+
25
+ def add(page)
26
+ @semaphore.synchronize do
27
+ obj = page.to_hash
28
+ @except.each { |e| obj.delete e.to_s }
29
+ obj[:id] = uuid(page)
30
+ obj['body'] = Zlib::Deflate.deflate(obj['body']) if @compress_body && obj['body']
31
+ obj['created_at'] ||= Time.now.to_i
32
+ BINARY_FIELDS.each do |field|
33
+ # Use some marshalling?
34
+ obj[field] = @r.binary(obj[field]) unless obj[field].nil?
35
+ end
36
+
37
+ @r.table(@table).insert(obj).run(@rethink, durability: 'soft')
38
+ obj[:id]
39
+ end
40
+ end
41
+
42
+ def exists?(page)
43
+ @semaphore.synchronize do
44
+ doc = @r.table(@table).get(uuid(page)).run(@rethink)
45
+ !doc.nil?
46
+ end
47
+ end
48
+
49
+ def get(page)
50
+ @semaphore.synchronize do
51
+ data = @r.table(@table).get(uuid(page)).run(@rethink)
52
+ return load_page(data) if data
53
+ end
54
+ end
55
+
56
+ def remove(page)
57
+ @semaphore.synchronize do
58
+ @r.table(@table).get(uuid(page)).delete.run(@rethink)
59
+ end
60
+ end
61
+
62
+ def count
63
+ @r.table(@table).count.run(@rethink)
64
+ end
65
+
66
+ def each
67
+ @r.table(@table).run(@rethink).each do |doc|
68
+ page = load_page(doc)
69
+ yield doc[:id], page
70
+ end
71
+ end
72
+
73
+ def clear
74
+ @r.table(@table).delete.run(@rethink)
75
+ end
76
+
77
+ private
78
+
79
+ def load_page(hash)
80
+ BINARY_FIELDS.each do |field|
81
+ hash[field] = hash[field].to_s
82
+ end
83
+ hash['body'] = Zlib::Inflate.inflate(hash['body']) if @compress_body && hash['body'] && !hash['body'].empty?
84
+ page = Page.from_hash(hash)
85
+ page.fetched_at ||= hash['created_at']
86
+ page
87
+ end
88
+ end
89
+ end
90
+ end