parallel588_polipus 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (72)
  1. checksums.yaml +7 -0
  2. data/.document +5 -0
  3. data/.gitignore +53 -0
  4. data/.rspec +2 -0
  5. data/.rubocop.yml +17 -0
  6. data/.rubocop_todo.yml +33 -0
  7. data/.travis.yml +22 -0
  8. data/AUTHORS.md +5 -0
  9. data/CHANGELOG.md +61 -0
  10. data/Gemfile +12 -0
  11. data/LICENSE.txt +20 -0
  12. data/README.md +70 -0
  13. data/Rakefile +8 -0
  14. data/examples/basic.rb +63 -0
  15. data/examples/error_handling.rb +23 -0
  16. data/examples/incremental.rb +63 -0
  17. data/examples/robots_txt_handling.rb +14 -0
  18. data/examples/survival.rb +10 -0
  19. data/lib/polipus.rb +488 -0
  20. data/lib/polipus/http.rb +282 -0
  21. data/lib/polipus/page.rb +256 -0
  22. data/lib/polipus/plugin.rb +14 -0
  23. data/lib/polipus/plugins/cleaner.rb +25 -0
  24. data/lib/polipus/plugins/sample.rb +15 -0
  25. data/lib/polipus/plugins/sleeper.rb +22 -0
  26. data/lib/polipus/queue_overflow.rb +26 -0
  27. data/lib/polipus/queue_overflow/base.rb +7 -0
  28. data/lib/polipus/queue_overflow/dev_null_queue.rb +34 -0
  29. data/lib/polipus/queue_overflow/manager.rb +57 -0
  30. data/lib/polipus/queue_overflow/mongo_queue.rb +60 -0
  31. data/lib/polipus/queue_overflow/mongo_queue_capped.rb +29 -0
  32. data/lib/polipus/queue_overflow/worker.rb +24 -0
  33. data/lib/polipus/robotex.rb +145 -0
  34. data/lib/polipus/signal_handler.rb +42 -0
  35. data/lib/polipus/storage.rb +31 -0
  36. data/lib/polipus/storage/base.rb +20 -0
  37. data/lib/polipus/storage/dev_null.rb +35 -0
  38. data/lib/polipus/storage/memory_store.rb +56 -0
  39. data/lib/polipus/storage/mongo_store.rb +90 -0
  40. data/lib/polipus/storage/rethink_store.rb +90 -0
  41. data/lib/polipus/url_tracker.rb +21 -0
  42. data/lib/polipus/url_tracker/bloomfilter.rb +27 -0
  43. data/lib/polipus/url_tracker/redis_set.rb +27 -0
  44. data/lib/polipus/version.rb +5 -0
  45. data/polipus.gemspec +44 -0
  46. data/spec/cassettes/11c3eb8bf35dfc179dc5ce44f6f5f458.yml +6144 -0
  47. data/spec/cassettes/1f6e1d7743ecaa86594b4e68a6462689.yml +11320 -0
  48. data/spec/cassettes/6adfecdb274dd26ffd3713169583ca91.yml +18236 -0
  49. data/spec/cassettes/978ac0eeb5df63a019b754cc8a965b06.yml +18296 -0
  50. data/spec/cassettes/b389efd1dcb8f09393b5aae1627c2a83.yml +36569 -0
  51. data/spec/cassettes/bc6fb220895689be7eeb05b09969a18d.yml +61 -0
  52. data/spec/cassettes/c5ce68499027d490adfbb6e5541881e4.yml +18165 -0
  53. data/spec/cassettes/ce16b11a7df0b70fe90c7f90063fdb8c.yml +11758 -0
  54. data/spec/cassettes/gzipped_on.yml +147 -0
  55. data/spec/cassettes/http_cookies.yml +133 -0
  56. data/spec/cassettes/http_tconnection_max_hits.yml +4138 -0
  57. data/spec/cassettes/http_test.yml +1418 -0
  58. data/spec/cassettes/http_test_redirect.yml +71 -0
  59. data/spec/clear.rb +12 -0
  60. data/spec/polipus/http_spec.rb +139 -0
  61. data/spec/polipus/page_spec.rb +68 -0
  62. data/spec/polipus/queue_overflow/manager_spec.rb +88 -0
  63. data/spec/polipus/queue_overflow_spec.rb +66 -0
  64. data/spec/polipus/robotex_spec.rb +85 -0
  65. data/spec/polipus/signal_handler_spec.rb +15 -0
  66. data/spec/polipus/storage/memory_store_spec.rb +87 -0
  67. data/spec/polipus/storage/mongo_store_spec.rb +119 -0
  68. data/spec/polipus/storage/rethink_store_spec.rb +117 -0
  69. data/spec/polipus/url_tracker_spec.rb +29 -0
  70. data/spec/polipus_spec.rb +107 -0
  71. data/spec/spec_helper.rb +42 -0
  72. metadata +348 -0
data/lib/polipus/plugin.rb
@@ -0,0 +1,14 @@
+ # encoding: UTF-8
+ module Polipus
+   module Plugin
+     @@plugins = {}
+     def self.register(plugin, options = {})
+       o = plugin.new(options)
+       @@plugins[o.class.name] = o
+     end
+
+     def self.plugins
+       @@plugins
+     end
+   end
+ end
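
For orientation: Polipus::Plugin.register above simply instantiates the given class and indexes it by class name, so any object exposing the hook methods the crawler looks for can be registered. A minimal sketch, using a hypothetical MyPlugin class; only register and plugins come from this file:

    # Hypothetical plugin: register() builds an instance and stores it under 'MyPlugin'.
    class MyPlugin
      def initialize(options = {})
        @options = options
      end

      def on_initialize(crawler)
        crawler.logger.info { 'MyPlugin loaded' }
      end
    end

    Polipus::Plugin.register MyPlugin, verbose: true
    Polipus::Plugin.plugins.keys  # => ['MyPlugin']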
data/lib/polipus/plugins/cleaner.rb
@@ -0,0 +1,25 @@
+ # encoding: UTF-8
+ module Polipus
+   module Plugin
+     class Cleaner
+       def initialize(options = {})
+         @reset = options[:reset] ||= false
+       end
+
+       def on_initialize(crawler)
+         crawler.logger.info { 'Cleaner plugin loaded' }
+         unless @reset
+           crawler.logger.info { 'Cleaner plugin is disabled, add :reset => true to the plugin if you really know what you are doing' }
+           return nil
+         end
+         crawler.logger.info { 'Cleaning all: url_tracker, storage, queue' }
+         proc do
+           url_tracker.clear
+           storage.clear
+           queue_factory.clear
+           @options[:queue_overflow_adapter].clear if @options[:queue_overflow_adapter]
+         end
+       end
+     end
+   end
+ end
data/lib/polipus/plugins/sample.rb
@@ -0,0 +1,15 @@
+ # encoding: UTF-8
+ module Polipus
+   module Plugin
+     class Sample
+       def initialize(_options = {})
+       end
+
+       def on_initialize(_crawler)
+         proc do
+           @options.each { |k, v| @logger.info { "Polipus configuration: #{k} => #{v}" } }
+         end
+       end
+     end
+   end
+ end
data/lib/polipus/plugins/sleeper.rb
@@ -0,0 +1,22 @@
+ # encoding: UTF-8
+ module Polipus
+   module Plugin
+     class Sleeper
+       def initialize(options = {})
+         @delay = options[:delay] ||= 1
+       end
+
+       def on_initialize(crawler)
+         crawler.logger.info { "Sleeper plugin loaded, sleep for #{@delay} after each request" }
+         proc do
+           # Force the number of worker threads to 1
+           @options[:workers] = 1
+         end
+       end
+
+       def on_message_processed(_crawler)
+         sleep @delay
+       end
+     end
+   end
+ end
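
A usage sketch for the Sleeper plugin above, assuming it is enabled through Polipus::Plugin.register: the proc returned by on_initialize is evaluated in the crawler's context (it sets @options[:workers]), and on_message_processed then pauses after every processed page.

    # Sleep 2 seconds after each request; the plugin also forces a single worker.
    Polipus::Plugin.register Polipus::Plugin::Sleeper, delay: 2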
data/lib/polipus/queue_overflow.rb
@@ -0,0 +1,26 @@
+ # encoding: UTF-8
+ require 'polipus/queue_overflow/manager'
+ require 'polipus/queue_overflow/worker'
+ module Polipus
+   module QueueOverflow
+     def self.mongo_queue(mongo_db, queue_name, options = {})
+       require 'polipus/queue_overflow/mongo_queue'
+       mongo_db ||= Mongo::Connection.new('localhost', 27_017, pool_size: 15, pool_timeout: 5).db('polipus')
+       fail 'First argument must be an instance of Mongo::DB' unless mongo_db.is_a?(Mongo::DB)
+       self::MongoQueue.new mongo_db, queue_name, options
+     end
+
+     def self.mongo_queue_capped(mongo_db, queue_name, options = {})
+       require 'polipus/queue_overflow/mongo_queue_capped'
+       mongo_db ||= Mongo::Connection.new('localhost', 27_017, pool_size: 15, pool_timeout: 5).db('polipus')
+       fail 'First argument must be an instance of Mongo::DB' unless mongo_db.is_a?(Mongo::DB)
+       options[:max] = 1_000_000 if options[:max].nil?
+       self::MongoQueueCapped.new mongo_db, queue_name, options
+     end
+
+     def self.dev_null_queue(_options = {})
+       require 'polipus/queue_overflow/dev_null_queue'
+       self::DevNullQueue.new
+     end
+   end
+ end
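
A sketch of how these factory helpers might be called, using the same legacy Mongo::Connection API the helpers themselves rely on; the queue name and payload are illustrative:

    require 'mongo'
    require 'polipus/queue_overflow'

    # Capped overflow queue holding at most 500_000 items; passing nil instead of
    # mongo would make the factory open its default localhost connection.
    mongo = Mongo::Connection.new('localhost', 27017).db('polipus')
    overflow = Polipus::QueueOverflow.mongo_queue_capped(mongo, 'mysite', max: 500_000)

    overflow << '{"url":"http://example.com/"}'
    overflow.size   # number of queued payloads
    overflow.pop    # oldest payload, or nil when empty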
data/lib/polipus/queue_overflow/base.rb
@@ -0,0 +1,7 @@
+ # encoding: UTF-8
+ module Polipus
+   module QueueOverflow
+     class Base
+     end
+   end
+ end
data/lib/polipus/queue_overflow/dev_null_queue.rb
@@ -0,0 +1,34 @@
+ # encoding: UTF-8
+ require 'thread'
+ module Polipus
+   module QueueOverflow
+     class DevNullQueue
+       def initialize
+       end
+
+       def length
+         0
+       end
+
+       def empty?
+         true
+       end
+
+       def clear
+       end
+
+       def push(_data)
+       end
+
+       def pop(_ = false)
+         nil
+       end
+
+       alias_method :size, :length
+       alias_method :dec, :pop
+       alias_method :shift, :pop
+       alias_method :enc, :push
+       alias_method :<<, :push
+     end
+   end
+ end
data/lib/polipus/queue_overflow/manager.rb
@@ -0,0 +1,57 @@
+ # encoding: UTF-8
+ module Polipus
+   module QueueOverflow
+     class Manager
+       attr_accessor :url_filter
+       attr_reader :polipus
+       def initialize(polipus, main_q, item_limit)
+         @polipus = polipus
+         @main_q = main_q
+         @adapter = @polipus.queue_overflow_adapter
+         @item_limit = item_limit
+         @redis = @polipus.redis
+       end
+
+       def url_filter(&block)
+         @url_filter = block
+       end
+
+       def perform
+         removed = 0
+         restored = 0
+         main_q_size = @main_q.size
+         if main_q_size > @item_limit
+           @polipus.logger.info { "Overflow Manager: Going to offload items from redis: ~#{main_q_size - @item_limit}" }
+           removed = rotate(@main_q, @adapter) { @main_q.size > @item_limit }
+         elsif main_q_size < @item_limit && !@adapter.empty?
+           @polipus.logger.info { "Overflow Manager: Going to restore items into redis: ~#{@item_limit - main_q_size}" }
+           restored = rotate(@adapter, @main_q) { @main_q.size <= @item_limit }
+         end
+         [removed, restored]
+       end
+
+       private
+
+       def rotate(source, dest)
+         performed = 0
+         loop do
+           message = source.pop(true)
+           if message
+             page = Page.from_json message
+             unless @polipus.storage.exists?(page)
+               allowed = @url_filter.nil? ? true : @url_filter.call(page)
+               if allowed
+                 dest << message
+                 performed += 1
+               end
+             end
+           end
+           source.commit if source.respond_to? :commit
+           break if !message || source.empty?
+           break unless yield source, dest
+         end
+         performed
+       end
+     end
+   end
+ end
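
A sketch of the url_filter hook above; crawler and redis_queue stand in for a configured Polipus crawler and its main queue and are not defined in this diff:

    # Keep only pages under a given prefix while items rotate between the main
    # queue and the overflow adapter; everything else is silently dropped.
    manager = Polipus::QueueOverflow::Manager.new(crawler, redis_queue, 10_000)
    manager.url_filter do |page|
      page.url.to_s.start_with?('http://example.com/')
    end
    removed, restored = manager.perform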
data/lib/polipus/queue_overflow/mongo_queue.rb
@@ -0,0 +1,60 @@
+ # encoding: UTF-8
+ require 'thread'
+ require 'mongo'
+ module Polipus
+   module QueueOverflow
+     class MongoQueue
+       def initialize(mongo_db, queue_name, options = {})
+         @mongo_db = mongo_db
+         @collection_name = "polipus_q_overflow_#{queue_name}"
+         @semaphore = Mutex.new
+         @options = options
+         @options[:ensure_uniq] ||= false
+         @options[:ensure_uniq] && ensure_index
+       end
+
+       def length
+         @mongo_db[@collection_name].count
+       end
+
+       def empty?
+         !(length > 0)
+       end
+
+       def clear
+         @mongo_db[@collection_name].drop
+         @options[:ensure_uniq] && ensure_index
+       end
+
+       def push(data)
+         if @options[:ensure_uniq]
+           @mongo_db[@collection_name].update({ payload: data }, { payload: data }, { upsert: true, w: 1 })
+         else
+           @mongo_db[@collection_name].insert(payload: data)
+         end
+         true
+       end
+
+       def pop(_ = false)
+         @semaphore.synchronize do
+           doc = @mongo_db[@collection_name].find({}, sort: { _id: 1 }).limit(1).first
+           return nil if doc.nil?
+           @mongo_db[@collection_name].remove(_id: doc['_id'])
+           doc && doc['payload'] ? doc['payload'] : nil
+         end
+       end
+
+       alias_method :size, :length
+       alias_method :dec, :pop
+       alias_method :shift, :pop
+       alias_method :enc, :push
+       alias_method :<<, :push
+
+       protected
+
+       def ensure_index
+         @mongo_db[@collection_name].ensure_index({ payload: 1 }, { background: 1, unique: 1, drop_dups: 1 })
+       end
+     end
+   end
+ end
data/lib/polipus/queue_overflow/mongo_queue_capped.rb
@@ -0,0 +1,29 @@
+ # encoding: UTF-8
+ require 'polipus/queue_overflow/mongo_queue'
+ module Polipus
+   module QueueOverflow
+     class MongoQueueCapped < MongoQueue
+       def initialize(mongo_db, queue_name, options = {})
+         super
+         @max = @options[:max]
+       end
+
+       def push(data)
+         super
+         @semaphore.synchronize do
+           s = size
+           if s > @max
+             docs = @mongo_db[@collection_name].find({}, { sort: { _id: 1 }, fields: [:_id] }).limit(s - @max).map { |e| e['_id'] }
+             @mongo_db[@collection_name].remove(:_id => { '$in' => docs }, '$isolated' => 1)
+           end
+         end
+       end
+
+       alias_method :size, :length
+       alias_method :dec, :pop
+       alias_method :shift, :pop
+       alias_method :enc, :push
+       alias_method :<<, :push
+     end
+   end
+ end
data/lib/polipus/queue_overflow/worker.rb
@@ -0,0 +1,24 @@
+ # encoding: UTF-8
+ module Polipus
+   module QueueOverflow
+     class Worker
+       def initialize(manager)
+         @logger = manager.polipus.logger
+         @delay = manager.polipus.options[:queue_overflow_manager_check_time]
+         @adapter = manager.polipus.queue_overflow_adapter
+         @manager = manager
+       end
+
+       def run
+         @logger.info { 'Overflow::Worker::run' }
+         loop do
+           @logger.info { 'Overflow Manager: cycle started' }
+           removed, restored = @manager.perform
+           @logger.info { "Overflow Manager: items removed=#{removed}, items restored=#{restored}, items stored=#{@adapter.size}" }
+           sleep @delay
+           break if SignalHandler.terminated?
+         end
+       end
+     end
+   end
+ end
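
The worker's run method loops until the signal handler reports termination, so it is suited to a background thread rather than the main one; a sketch, reusing the manager built in the earlier example:

    # Run the overflow check loop alongside the crawl.
    Thread.new { Polipus::QueueOverflow::Worker.new(manager).run }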
data/lib/polipus/robotex.rb
@@ -0,0 +1,145 @@
+ # encoding: UTF-8
+ require 'open-uri'
+ require 'uri'
+ require 'timeout'
+ module Polipus
+   # Original code taken from
+   # https://github.com/chriskite/robotex/blob/master/lib/robotex.rb
+
+   class Robotex
+     DEFAULT_TIMEOUT = 3
+     VERSION = '1.0.0'
+
+     attr_reader :user_agent
+
+     class ParsedRobots
+       def initialize(uri, user_agent)
+         io = Robotex.get_robots_txt(uri, user_agent)
+         if !io || io.content_type != 'text/plain' || io.status != %w(200 OK)
+           io = StringIO.new("User-agent: *\nAllow: /\n")
+         end
+
+         @disallows = {}
+         @allows = {}
+         @delays = {}
+         agent = /.*/
+         io.each do |line|
+           next if line =~ /^\s*(#.*|$)/
+           arr = line.split(':')
+           key = arr.shift
+           value = arr.join(':').strip
+           value.strip!
+           case key.downcase
+           when 'user-agent'
+             agent = to_regex(value)
+           when 'allow'
+             unless value.empty?
+               @allows[agent] ||= []
+               @allows[agent] << to_regex(value)
+             end
+           when 'disallow'
+             unless value.empty?
+               @disallows[agent] ||= []
+               @disallows[agent] << to_regex(value)
+             end
+           when 'crawl-delay'
+             @delays[agent] = value.to_i
+           end
+         end
+         @parsed = true
+       end
+
+       def allowed?(uri, user_agent)
+         return true unless @parsed
+         allowed = true
+         uri = URI.parse(uri.to_s) unless uri.is_a?(URI)
+         path = uri.request_uri
+
+         @allows.each do |key, value|
+           unless allowed
+             if user_agent =~ key
+               value.each do |rule|
+                 path =~ rule && allowed = true
+               end
+             end
+           end
+         end
+
+         @disallows.each do |key, value|
+           if user_agent =~ key
+             value.each do |rule|
+               path =~ rule && allowed = false
+             end
+           end
+         end
+
+         allowed
+       end
+
+       def delay(user_agent)
+         @delays.each do |agent, delay|
+           return delay if agent =~ user_agent
+         end
+         nil
+       end
+
+       protected
+
+       def to_regex(pattern)
+         pattern = Regexp.escape(pattern)
+         pattern.gsub!(Regexp.escape('*'), '.*')
+         Regexp.compile("^#{pattern}")
+       end
+     end
+
+     def self.get_robots_txt(uri, user_agent)
+       Timeout.timeout(Robotex.timeout) do
+         URI.join(uri.to_s, '/robots.txt').open('User-Agent' => user_agent) rescue nil
+       end
+     rescue Timeout::Error
+       STDERR.puts 'robots.txt request timed out'
+     end
+
+     class << self
+       attr_writer :timeout
+     end
+
+     def self.timeout
+       @timeout || DEFAULT_TIMEOUT
+     end
+
+     def initialize(user_agent = nil)
+       user_agent = "Robotex/#{VERSION} (http://www.github.com/chriskite/robotex)" if user_agent.nil?
+       @user_agent = user_agent
+       @last_accessed = Time.at(1)
+       @parsed = {}
+     end
+
+     def parse_host(uri)
+       uri = URI.parse(uri.to_s) unless uri.is_a?(URI)
+       @parsed[uri.host] ||= ParsedRobots.new(uri, @user_agent)
+     end
+
+     #
+     # Download the server's robots.txt, and return true if we are allowed to access the url, false otherwise
+     #
+     def allowed?(uri)
+       parse_host(uri).allowed?(uri, @user_agent)
+     end
+
+     #
+     # Return the value of the Crawl-Delay directive, or nil if none
+     def delay(uri)
+       parse_host(uri).delay(@user_agent)
+     end
+
+     #
+     # Sleep for the amount of time necessary to obey the Crawl-Delay specified by the server
+     #
+     def delay!(uri)
+       delay = delay(uri)
+       sleep delay - (Time.now - @last_accessed) if delay
+       @last_accessed = Time.now
+     end
+   end
+ end
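
A usage sketch for the vendored Robotex class above, with an illustrative user agent and URL:

    # Check robots.txt before fetching and honour any Crawl-Delay directive.
    robotex = Polipus::Robotex.new('MyCrawler/1.0')
    url = 'http://example.com/private/page'
    if robotex.allowed?(url)
      robotex.delay!(url)   # sleeps only if the site declares a Crawl-Delay
      # ... fetch the page ...
    end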