parallel588_polipus 0.4.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (72) hide show
  1. checksums.yaml +7 -0
  2. data/.document +5 -0
  3. data/.gitignore +53 -0
  4. data/.rspec +2 -0
  5. data/.rubocop.yml +17 -0
  6. data/.rubocop_todo.yml +33 -0
  7. data/.travis.yml +22 -0
  8. data/AUTHORS.md +5 -0
  9. data/CHANGELOG.md +61 -0
  10. data/Gemfile +12 -0
  11. data/LICENSE.txt +20 -0
  12. data/README.md +70 -0
  13. data/Rakefile +8 -0
  14. data/examples/basic.rb +63 -0
  15. data/examples/error_handling.rb +23 -0
  16. data/examples/incremental.rb +63 -0
  17. data/examples/robots_txt_handling.rb +14 -0
  18. data/examples/survival.rb +10 -0
  19. data/lib/polipus.rb +488 -0
  20. data/lib/polipus/http.rb +282 -0
  21. data/lib/polipus/page.rb +256 -0
  22. data/lib/polipus/plugin.rb +14 -0
  23. data/lib/polipus/plugins/cleaner.rb +25 -0
  24. data/lib/polipus/plugins/sample.rb +15 -0
  25. data/lib/polipus/plugins/sleeper.rb +22 -0
  26. data/lib/polipus/queue_overflow.rb +26 -0
  27. data/lib/polipus/queue_overflow/base.rb +7 -0
  28. data/lib/polipus/queue_overflow/dev_null_queue.rb +34 -0
  29. data/lib/polipus/queue_overflow/manager.rb +57 -0
  30. data/lib/polipus/queue_overflow/mongo_queue.rb +60 -0
  31. data/lib/polipus/queue_overflow/mongo_queue_capped.rb +29 -0
  32. data/lib/polipus/queue_overflow/worker.rb +24 -0
  33. data/lib/polipus/robotex.rb +145 -0
  34. data/lib/polipus/signal_handler.rb +42 -0
  35. data/lib/polipus/storage.rb +31 -0
  36. data/lib/polipus/storage/base.rb +20 -0
  37. data/lib/polipus/storage/dev_null.rb +35 -0
  38. data/lib/polipus/storage/memory_store.rb +56 -0
  39. data/lib/polipus/storage/mongo_store.rb +90 -0
  40. data/lib/polipus/storage/rethink_store.rb +90 -0
  41. data/lib/polipus/url_tracker.rb +21 -0
  42. data/lib/polipus/url_tracker/bloomfilter.rb +27 -0
  43. data/lib/polipus/url_tracker/redis_set.rb +27 -0
  44. data/lib/polipus/version.rb +5 -0
  45. data/polipus.gemspec +44 -0
  46. data/spec/cassettes/11c3eb8bf35dfc179dc5ce44f6f5f458.yml +6144 -0
  47. data/spec/cassettes/1f6e1d7743ecaa86594b4e68a6462689.yml +11320 -0
  48. data/spec/cassettes/6adfecdb274dd26ffd3713169583ca91.yml +18236 -0
  49. data/spec/cassettes/978ac0eeb5df63a019b754cc8a965b06.yml +18296 -0
  50. data/spec/cassettes/b389efd1dcb8f09393b5aae1627c2a83.yml +36569 -0
  51. data/spec/cassettes/bc6fb220895689be7eeb05b09969a18d.yml +61 -0
  52. data/spec/cassettes/c5ce68499027d490adfbb6e5541881e4.yml +18165 -0
  53. data/spec/cassettes/ce16b11a7df0b70fe90c7f90063fdb8c.yml +11758 -0
  54. data/spec/cassettes/gzipped_on.yml +147 -0
  55. data/spec/cassettes/http_cookies.yml +133 -0
  56. data/spec/cassettes/http_tconnection_max_hits.yml +4138 -0
  57. data/spec/cassettes/http_test.yml +1418 -0
  58. data/spec/cassettes/http_test_redirect.yml +71 -0
  59. data/spec/clear.rb +12 -0
  60. data/spec/polipus/http_spec.rb +139 -0
  61. data/spec/polipus/page_spec.rb +68 -0
  62. data/spec/polipus/queue_overflow/manager_spec.rb +88 -0
  63. data/spec/polipus/queue_overflow_spec.rb +66 -0
  64. data/spec/polipus/robotex_spec.rb +85 -0
  65. data/spec/polipus/signal_handler_spec.rb +15 -0
  66. data/spec/polipus/storage/memory_store_spec.rb +87 -0
  67. data/spec/polipus/storage/mongo_store_spec.rb +119 -0
  68. data/spec/polipus/storage/rethink_store_spec.rb +117 -0
  69. data/spec/polipus/url_tracker_spec.rb +29 -0
  70. data/spec/polipus_spec.rb +107 -0
  71. data/spec/spec_helper.rb +42 -0
  72. metadata +348 -0
# encoding: UTF-8
module Polipus
  # Registry of crawler plugins, keyed by plugin class name.
  module Plugin
    @@plugins = {}

    # Instantiate +plugin+ with +options+ and store the instance in the
    # registry. Registering the same class twice replaces the earlier
    # instance. Returns the stored instance.
    def self.register(plugin, options = {})
      instance = plugin.new(options)
      @@plugins[instance.class.name] = instance
    end

    # All registered plugin instances, keyed by class name.
    def self.plugins
      @@plugins
    end
  end
end
# encoding: UTF-8
module Polipus
  module Plugin
    # Plugin that wipes the crawler's persistent state (url tracker,
    # storage, queue, overflow queue) before a crawl starts. Does nothing
    # unless the :reset option is explicitly truthy.
    class Cleaner
      def initialize(options = {})
        # NOTE: ||= also writes the default (false) back into the
        # caller's options hash.
        @reset = options[:reset] ||= false
      end

      # Crawler initialization hook. Returns nil when :reset is not set.
      # Otherwise returns a Proc that is presumably instance_eval'd in
      # the crawler's own context — it references url_tracker, storage,
      # queue_factory and @options, none of which are defined on Cleaner
      # itself. TODO(review): confirm against Polipus#initialize.
      def on_initialize(crawler)
        crawler.logger.info { 'Cleaner plugin loaded' }
        unless @reset
          crawler.logger.info { 'Cleaner plugin is disabled, add :reset => true to the plugin if you really know what you are doing' }
          return nil
        end
        crawler.logger.info { 'Cleaning all: url_tracker, storage, queue' }
        proc do
          url_tracker.clear
          storage.clear
          queue_factory.clear
          @options[:queue_overflow_adapter].clear if @options[:queue_overflow_adapter]
        end
      end
    end
  end
end
# encoding: UTF-8
module Polipus
  module Plugin
    # Minimal example plugin: logs every crawler configuration option.
    class Sample
      def initialize(_options = {})
      end

      # Crawler initialization hook. Returns a Proc presumably evaluated
      # in the crawler's own context — @options and @logger are the
      # crawler's instance variables, not Sample's. TODO(review): confirm
      # against Polipus#initialize.
      def on_initialize(_crawler)
        proc do
          @options.each { |k, v| @logger.info { "Polipus configuration: #{k} => #{v}" } }
        end
      end
    end
  end
end
# encoding: UTF-8
module Polipus
  module Plugin
    # Throttling plugin: forces the crawler to a single worker and
    # sleeps after each processed message.
    class Sleeper
      def initialize(options = {})
        # NOTE: ||= also writes the default (1 second) back into the
        # caller's options hash.
        @delay = options[:delay] ||= 1
      end

      # Crawler initialization hook. Returns a Proc presumably evaluated
      # in the crawler's context (@options is the crawler's ivar) —
      # TODO(review): confirm against Polipus#initialize.
      def on_initialize(crawler)
        crawler.logger.info { "Sleeper plugin loaded, sleep for #{@delay} after each request" }
        proc do
          # Set to 1 the number of threads
          @options[:workers] = 1
        end
      end

      # Hook invoked after each processed message; enforces the delay.
      def on_message_processed(_crawler)
        sleep @delay
      end
    end
  end
end
# encoding: UTF-8
require 'polipus/queue_overflow/manager'
require 'polipus/queue_overflow/worker'
module Polipus
  # Factory methods for queue-overflow backends.
  module QueueOverflow
    # Mongo-backed overflow queue. +mongo_db+ may be nil, in which case a
    # default localhost connection is created.
    def self.mongo_queue(mongo_db, queue_name, options = {})
      require 'polipus/queue_overflow/mongo_queue'
      self::MongoQueue.new ensure_mongo_db(mongo_db), queue_name, options
    end

    # Size-capped variant; defaults options[:max] to 1,000,000 documents.
    def self.mongo_queue_capped(mongo_db, queue_name, options = {})
      require 'polipus/queue_overflow/mongo_queue_capped'
      db = ensure_mongo_db(mongo_db)
      # Default applied only after validation so a bad handle does not
      # mutate the caller's options hash (matches previous behavior).
      options[:max] = 1_000_000 if options[:max].nil?
      self::MongoQueueCapped.new db, queue_name, options
    end

    # Overflow queue that discards everything (see DevNullQueue).
    def self.dev_null_queue(_options = {})
      require 'polipus/queue_overflow/dev_null_queue'
      self::DevNullQueue.new
    end

    # Shared helper: default the Mongo handle to a localhost connection
    # and validate its type. Previously duplicated in both mongo factories.
    def self.ensure_mongo_db(mongo_db)
      mongo_db ||= Mongo::Connection.new('localhost', 27_017, pool_size: 15, pool_timeout: 5).db('polipus')
      fail 'First argument must be an instance of Mongo::DB' unless mongo_db.is_a?(Mongo::DB)
      mongo_db
    end
    private_class_method :ensure_mongo_db
  end
end
# encoding: UTF-8
module Polipus
  module QueueOverflow
    # Placeholder base class for overflow queue adapters. Currently empty;
    # the adapters visible in this gem (MongoQueue, DevNullQueue) do not
    # subclass it — presumably kept for external adapters. TODO confirm.
    class Base
    end
  end
end
# encoding: UTF-8
require 'thread'
module Polipus
  module QueueOverflow
    # A no-op overflow queue: every pushed item is discarded and the
    # queue always reports itself empty. Useful when overflow
    # persistence is not wanted.
    class DevNullQueue
      def initialize; end

      # Always zero — nothing is ever retained.
      def length
        0
      end
      alias_method :size, :length

      # Empty by definition.
      def empty?
        true
      end

      # Nothing to remove.
      def clear; end

      # Swallow the item.
      def push(_data); end
      alias_method :enc, :push
      alias_method :<<, :push

      # Nothing ever comes back out.
      def pop(_ = false)
        nil
      end
      alias_method :dec, :pop
      alias_method :shift, :pop
    end
  end
end
# encoding: UTF-8
module Polipus
  module QueueOverflow
    # Moves items between the main (redis-backed) queue and the overflow
    # adapter so the main queue stays close to +item_limit+.
    class Manager
      attr_writer :url_filter
      attr_reader :polipus

      # polipus    - the crawler (supplies logger, storage, redis, adapter)
      # main_q     - the primary queue
      # item_limit - soft cap on the main queue's size
      def initialize(polipus, main_q, item_limit)
        @polipus = polipus
        @main_q = main_q
        @adapter = @polipus.queue_overflow_adapter
        @item_limit = item_limit
        @redis = @polipus.redis
      end

      # With a block: install it as the URL filter (an item is moved only
      # when the filter returns truthy for its Page). Without a block:
      # return the current filter.
      #
      # BUGFIX: previously this method assigned unconditionally, so it
      # shadowed the attr_accessor reader and a bare read
      # (manager.url_filter) silently reset the filter to nil.
      def url_filter(&block)
        @url_filter = block if block_given?
        @url_filter
      end

      # Run one rebalance cycle: offload when the main queue is over the
      # limit, restore when it is under and the adapter has items.
      # Returns [removed_count, restored_count].
      def perform
        removed = 0
        restored = 0
        main_q_size = @main_q.size
        if main_q_size > @item_limit
          @polipus.logger.info { "Overflow Manager: Going to offload items from redis: ~#{main_q_size - @item_limit}" }
          removed = rotate(@main_q, @adapter) { @main_q.size > @item_limit }
        elsif main_q_size < @item_limit && !@adapter.empty?
          @polipus.logger.info { "Overflow Manager: Going to restore items into redis: ~#{@item_limit - main_q_size}" }
          restored = rotate(@adapter, @main_q) { @main_q.size <= @item_limit }
        end
        [removed, restored]
      end

      private

      # Pop messages from +source+ and push onto +dest+ while the block
      # condition holds. Already-stored pages and pages rejected by the
      # url filter are dropped (popped but not forwarded). Returns the
      # number of messages actually moved.
      def rotate(source, dest)
        performed = 0
        loop do
          message = source.pop(true)
          if message
            page = Page.from_json message
            unless @polipus.storage.exists?(page)
              allowed = @url_filter.nil? ? true : @url_filter.call(page)
              if allowed
                dest << message
                performed += 1
              end
            end
          end
          source.commit if source.respond_to? :commit
          break if !message || source.empty?
          break unless yield source, dest
        end
        performed
      end
    end
  end
end
# encoding: UTF-8
require 'thread'
require 'mongo'
module Polipus
  module QueueOverflow
    # Overflow queue persisted in a MongoDB collection named
    # "polipus_q_overflow_<queue_name>". FIFO ordering by _id.
    class MongoQueue
      # options[:ensure_uniq] - when true, payloads are deduplicated via
      # a unique index and upsert writes.
      def initialize(mongo_db, queue_name, options = {})
        @mongo_db = mongo_db
        @collection_name = "polipus_q_overflow_#{queue_name}"
        # Guards pop's find-then-remove pair, which is not atomic in the
        # legacy driver API used here.
        @semaphore = Mutex.new
        @options = options
        # NOTE: ||= also writes the default back into the caller's hash.
        @options[:ensure_uniq] ||= false
        @options[:ensure_uniq] && ensure_index
      end

      # Number of queued documents.
      def length
        @mongo_db[@collection_name].count
      end

      def empty?
        !(length > 0)
      end

      # Drop the whole collection; recreate the unique index if needed.
      def clear
        @mongo_db[@collection_name].drop
        @options[:ensure_uniq] && ensure_index
      end

      # Enqueue +data+. With :ensure_uniq an upsert keyed on the payload
      # prevents duplicates; otherwise a plain insert. Always returns true.
      def push(data)
        if @options[:ensure_uniq]
          @mongo_db[@collection_name].update({ payload: data }, { payload: data }, { upsert: true, w: 1 })
        else
          @mongo_db[@collection_name].insert(payload: data)
        end
        true
      end

      # Dequeue the oldest payload, or nil when the queue is empty.
      # The mutex makes the read+delete pair safe against other threads
      # in this process only — NOTE(review): not safe across processes;
      # confirm single-process consumers.
      def pop(_ = false)
        @semaphore.synchronize do
          doc = @mongo_db[@collection_name].find({}, sort: { _id: 1 }).limit(1).first
          return nil if doc.nil?
          @mongo_db[@collection_name].remove(_id: doc['_id'])
          doc && doc['payload'] ? doc['payload'] : nil
        end
      end

      alias_method :size, :length
      alias_method :dec, :pop
      alias_method :shift, :pop
      alias_method :enc, :push
      alias_method :<<, :push

      protected

      # Unique background index on payload; drop_dups is a legacy-driver
      # option that removes pre-existing duplicates at index build time.
      def ensure_index
        @mongo_db[@collection_name].ensure_index({ payload: 1 }, { background: 1, unique: 1, drop_dups: 1 })
      end
    end
  end
end
# encoding: UTF-8
require 'polipus/queue_overflow/mongo_queue'
module Polipus
  module QueueOverflow
    # MongoQueue variant with a soft maximum size: after each push the
    # oldest documents are trimmed so that at most options[:max] remain.
    class MongoQueueCapped < MongoQueue
      # options[:max] - maximum number of retained documents (required;
      # the QueueOverflow factory defaults it to 1,000,000).
      def initialize(mongo_db, queue_name, options = {})
        super
        @max = @options[:max]
      end

      # Push via the parent, then delete the oldest surplus documents in
      # a single '$isolated' remove. NOTE(review): the size check and
      # remove are only mutex-safe within this process.
      def push(data)
        super
        @semaphore.synchronize do
          s = size
          if s > @max
            docs = @mongo_db[@collection_name].find({}, { sort: { _id: 1 }, fields: [:_id] }).limit(s - @max).map { |e| e['_id'] }
            @mongo_db[@collection_name].remove(:_id => { '$in' => docs }, '$isolated' => 1)
          end
        end
      end

      # Re-declared so the aliases bind to this class's overridden #push.
      alias_method :size, :length
      alias_method :dec, :pop
      alias_method :shift, :pop
      alias_method :enc, :push
      alias_method :<<, :push
    end
  end
end
# encoding: UTF-8
module Polipus
  module QueueOverflow
    # Background loop that periodically asks the Manager to rebalance the
    # main queue against the overflow adapter.
    class Worker
      def initialize(manager)
        @manager = manager
        crawler = manager.polipus
        @logger = crawler.logger
        @delay = crawler.options[:queue_overflow_manager_check_time]
        @adapter = crawler.queue_overflow_adapter
      end

      # Rebalance forever, sleeping @delay seconds between cycles, until
      # the crawler is signalled to terminate.
      def run
        @logger.info { 'Overflow::Worker::run' }
        loop do
          @logger.info { 'Overflow Manager: cycle started' }
          removed, restored = @manager.perform
          @logger.info { "Overflow Manager: items removed=#{removed}, items restored=#{restored}, items stored=#{@adapter.size}" }
          sleep @delay
          break if SignalHandler.terminated?
        end
      end
    end
  end
end
# encoding: UTF-8
require 'open-uri'
require 'uri'
require 'timeout'
module Polipus
  # Original code taken from
  # https://github.com/chriskite/robotex/blob/master/lib/robotex.rb
  #
  # Fetches and evaluates robots.txt so the crawler can honour
  # Allow / Disallow / Crawl-delay directives.
  class Robotex
    DEFAULT_TIMEOUT = 3
    VERSION = '1.0.0'

    attr_reader :user_agent

    # Parsed representation of a single host's robots.txt.
    class ParsedRobots
      def initialize(uri, user_agent)
        io = Robotex.get_robots_txt(uri, user_agent)
        # Treat a missing / non-plain / non-200 robots.txt as "allow all".
        if !io || io.content_type != 'text/plain' || io.status != %w(200 OK)
          io = StringIO.new("User-agent: *\nAllow: /\n")
        end

        @disallows = {}
        @allows = {}
        @delays = {}
        agent = /.*/
        io.each do |line|
          next if line =~ /^\s*(#.*|$)/ # skip comments and blank lines
          arr = line.split(':')
          key = arr.shift
          value = arr.join(':').strip # re-join so URLs with ':' survive
          value.strip!
          case key.downcase
          when 'user-agent'
            agent = to_regex(value)
          when 'allow'
            unless value.empty?
              @allows[agent] ||= []
              @allows[agent] << to_regex(value)
            end
          when 'disallow'
            unless value.empty?
              @disallows[agent] ||= []
              @disallows[agent] << to_regex(value)
            end
          when 'crawl-delay'
            @delays[agent] = value.to_i
          end
        end
        @parsed = true
      end

      # True if +user_agent+ may fetch +uri+'s path.
      #
      # BUGFIX: Disallow rules are evaluated first; Allow rules may then
      # re-allow a matching path (as in upstream robotex). Previously the
      # Allow pass ran first — while nothing had been disallowed yet and
      # guarded by `unless allowed` — so Allow directives were dead code.
      def allowed?(uri, user_agent)
        return true unless @parsed
        allowed = true
        uri = URI.parse(uri.to_s) unless uri.is_a?(URI)
        path = uri.request_uri

        @disallows.each do |key, value|
          if user_agent =~ key
            value.each do |rule|
              allowed = false if path =~ rule
            end
          end
        end

        @allows.each do |key, value|
          next if allowed
          if user_agent =~ key
            value.each do |rule|
              allowed = true if path =~ rule
            end
          end
        end

        allowed
      end

      # Crawl-delay (seconds) for +user_agent+, or nil when none matches.
      def delay(user_agent)
        @delays.each do |agent, delay|
          return delay if agent =~ user_agent
        end
        nil
      end

      protected

      # Convert a robots.txt path pattern into an anchored Regexp
      # ('*' becomes '.*'; everything else is escaped literally).
      def to_regex(pattern)
        pattern = Regexp.escape(pattern)
        pattern.gsub!(Regexp.escape('*'), '.*')
        Regexp.compile("^#{pattern}")
      end
    end

    # Fetch /robots.txt for +uri+. Returns an open-uri IO, or nil on any
    # fetch error (inline rescue) or timeout.
    def self.get_robots_txt(uri, user_agent)
      Timeout.timeout(Robotex.timeout) do
        URI.join(uri.to_s, '/robots.txt').open('User-Agent' => user_agent) rescue nil
      end
    rescue Timeout::Error
      STDERR.puts 'robots.txt request timed out'
    end

    class << self
      attr_writer :timeout
    end

    # Fetch timeout in seconds (settable via Robotex.timeout=).
    def self.timeout
      @timeout || DEFAULT_TIMEOUT
    end

    def initialize(user_agent = nil)
      user_agent = "Robotex/#{VERSION} (http://www.github.com/chriskite/robotex)" if user_agent.nil?
      @user_agent = user_agent
      @last_accessed = Time.at(1)
      @parsed = {}
    end

    # Parse (and cache, per host) the robots.txt governing +uri+.
    def parse_host(uri)
      uri = URI.parse(uri.to_s) unless uri.is_a?(URI)
      @parsed[uri.host] ||= ParsedRobots.new(uri, @user_agent)
    end

    #
    # Download the server's robots.txt, and return true if we are allowed
    # to access the url, false otherwise
    #
    def allowed?(uri)
      parse_host(uri).allowed?(uri, @user_agent)
    end

    #
    # Return the value of the Crawl-Delay directive, or nil if none
    def delay(uri)
      parse_host(uri).delay(@user_agent)
    end

    #
    # Sleep for the amount of time necessary to obey the Crawl-Delay
    # specified by the server
    #
    def delay!(uri)
      delay = delay(uri)
      sleep delay - (Time.now - @last_accessed) if delay
      @last_accessed = Time.now
    end
  end
end