polipus 0.4.0 → 0.5.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +5 -13
- data/.rubocop_todo.yml +23 -8
- data/AUTHORS.md +1 -0
- data/CHANGELOG.md +7 -0
- data/lib/polipus.rb +11 -5
- data/lib/polipus/http.rb +14 -4
- data/lib/polipus/page.rb +21 -4
- data/lib/polipus/queue_overflow.rb +4 -4
- data/lib/polipus/queue_overflow/mongo_queue.rb +7 -6
- data/lib/polipus/queue_overflow/mongo_queue_capped.rb +2 -2
- data/lib/polipus/storage.rb +3 -3
- data/lib/polipus/storage/memory_store.rb +1 -1
- data/lib/polipus/storage/mongo_store.rb +13 -9
- data/lib/polipus/storage/rethink_store.rb +3 -0
- data/lib/polipus/version.rb +1 -1
- data/polipus.gemspec +7 -5
- data/spec/polipus/http_spec.rb +28 -0
- data/spec/polipus/queue_overflow/manager_spec.rb +1 -1
- data/spec/polipus/storage/memory_store_spec.rb +1 -1
- data/spec/polipus/storage/mongo_store_spec.rb +6 -6
- data/spec/polipus_spec.rb +9 -0
- metadata +68 -55
checksums.yaml
CHANGED
@@ -1,15 +1,7 @@
|
|
1
1
|
---
|
2
|
-
|
3
|
-
metadata.gz:
|
4
|
-
|
5
|
-
data.tar.gz: !binary |-
|
6
|
-
MTAxYjZmYWNjMTlkYzk3Y2ZhNjdjMDFmODM2OTQ5YmQ3ZDcyNTQzNw==
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 0e3c15f4ab5c3b4eca8f4703b80d7d8fa74210c3
|
4
|
+
data.tar.gz: ae0dd32d81bbcbef350949e0c51fecaaf59ac8ff
|
7
5
|
SHA512:
|
8
|
-
metadata.gz:
|
9
|
-
|
10
|
-
NmM1MDc5ZTI0ODkxNzlhY2ZkZGVjZjI2ZTMyNjZhNTMwOWZhZjQ2OGY4ZTJl
|
11
|
-
ZmI4ZjUwNmQ3YjAxOTdlODI2MWRmMWUzNjY3Yzk1MmNiZTk0Y2E=
|
12
|
-
data.tar.gz: !binary |-
|
13
|
-
YjNlYTZlMjU5OTBmN2ZjMTI1OTY1MzgzNzBhYjBjYTAzN2MwYzY0M2U1ZGYw
|
14
|
-
MjcyMmMxM2I1ZTY5OWEzZDQ3MjQ1OTgzMzcyN2I5NGQ5NTAzZTEzY2FmMDRl
|
15
|
-
MmJiZDdjOTFmMjQwZWU1MzM1OGJhN2E4NTRhNGExZTAyMTVmN2I=
|
6
|
+
metadata.gz: c3a57d1cbfbab4c77cf3b746c7b73eb84cf675e7998eb5979af6d17d997ee26781803e519eae50de6762590fc7287d09ccf4ac0d640dff40602b8a507a356de2
|
7
|
+
data.tar.gz: 743b128ee0dd8fefbbdd1bd7443afbc4a2c62dd2f3d3751d3d693719e9a8e2e145668ccd58945b77f5f780f50757a11248df2b1963c8e7cc35706e937627e5e0
|
data/.rubocop_todo.yml
CHANGED
@@ -1,25 +1,36 @@
|
|
1
1
|
# This configuration was generated by `rubocop --auto-gen-config`
|
2
|
-
# on
|
2
|
+
# on 2015-07-08 20:22:49 -0700 using RuboCop version 0.29.1.
|
3
3
|
# The point is for the user to remove these configuration records
|
4
4
|
# one by one as the offenses are removed from the code base.
|
5
5
|
# Note that changes in the inspected code, or installation of new
|
6
6
|
# versions of RuboCop, may require this file to be generated again.
|
7
7
|
|
8
8
|
# Offense count: 1
|
9
|
-
|
9
|
+
Lint/HandleExceptions:
|
10
10
|
Enabled: false
|
11
11
|
|
12
|
-
# Offense count:
|
12
|
+
# Offense count: 21
|
13
|
+
Metrics/AbcSize:
|
14
|
+
Max: 103
|
15
|
+
|
16
|
+
# Offense count: 12
|
13
17
|
Metrics/CyclomaticComplexity:
|
14
|
-
Max:
|
18
|
+
Max: 15
|
15
19
|
|
16
|
-
# Offense count:
|
17
|
-
|
20
|
+
# Offense count: 10
|
21
|
+
Metrics/PerceivedComplexity:
|
22
|
+
Max: 17
|
23
|
+
|
24
|
+
# Offense count: 1
|
25
|
+
Style/ClassVars:
|
18
26
|
Enabled: false
|
19
27
|
|
28
|
+
# Offense count: 27
|
29
|
+
Style/Documentation:
|
30
|
+
Enabled: false
|
20
31
|
|
21
32
|
# Offense count: 2
|
22
|
-
# Configuration parameters: EnforcedStyle, SupportedStyles.
|
33
|
+
# Configuration parameters: EnforcedStyle, MinBodyLength, SupportedStyles.
|
23
34
|
Style/Next:
|
24
35
|
Enabled: false
|
25
36
|
|
@@ -28,6 +39,10 @@ Style/Next:
|
|
28
39
|
Style/RegexpLiteral:
|
29
40
|
Enabled: false
|
30
41
|
|
31
|
-
# Offense count:
|
42
|
+
# Offense count: 3
|
32
43
|
Style/RescueModifier:
|
33
44
|
Enabled: false
|
45
|
+
|
46
|
+
# Offense count: 1
|
47
|
+
Style/UnlessElse:
|
48
|
+
Enabled: false
|
data/AUTHORS.md
CHANGED
data/CHANGELOG.md
CHANGED
@@ -1,5 +1,12 @@
|
|
1
1
|
# Changelog
|
2
2
|
|
3
|
+
## 0.5.0 (2015-07-08)
|
4
|
+
|
5
|
+
[Compare changes in gem](https://github.com/taganaka/polipus/compare/0.4.0...0.5.0)
|
6
|
+
|
7
|
+
* Support for MongoDB driver ~> 2.0.6 has been added
|
8
|
+
* Minor code cleanup
|
9
|
+
|
3
10
|
## 0.4.0 (2015-01-12)
|
4
11
|
|
5
12
|
[Compare changes in gem](https://github.com/taganaka/polipus/compare/0.3.3...0.4.0)
|
data/lib/polipus.rb
CHANGED
@@ -129,7 +129,14 @@ module Polipus
|
|
129
129
|
|
130
130
|
@urls = [urls].flatten.map { |url| URI(url) }
|
131
131
|
@urls.each { |url| url.path = '/' if url.path.empty? }
|
132
|
-
|
132
|
+
if @options[:obey_robots_txt]
|
133
|
+
@robots =
|
134
|
+
if @options[:user_agent].respond_to?(:sample)
|
135
|
+
Polipus::Robotex.new(@options[:user_agent].sample)
|
136
|
+
else
|
137
|
+
Polipus::Robotex.new(@options[:user_agent])
|
138
|
+
end
|
139
|
+
end
|
133
140
|
# Attach signal handling if enabled
|
134
141
|
SignalHandler.enable if @options[:enable_signal_handler]
|
135
142
|
|
@@ -170,7 +177,6 @@ module Polipus
|
|
170
177
|
http = HTTP.new(@options)
|
171
178
|
queue = queue_factory
|
172
179
|
queue.process(false, @options[:queue_timeout]) do |message|
|
173
|
-
|
174
180
|
next if message.nil?
|
175
181
|
|
176
182
|
execute_plugin 'on_message_received'
|
@@ -199,7 +205,7 @@ module Polipus
|
|
199
205
|
rurls = pages.map { |e| e.url.to_s }.join(' --> ')
|
200
206
|
@logger.info { "Got redirects! #{rurls}" }
|
201
207
|
page = pages.pop
|
202
|
-
page.aliases = pages.map
|
208
|
+
page.aliases = pages.map(&:url)
|
203
209
|
if page_exists? page
|
204
210
|
@logger.info { "[worker ##{worker_number}] Page (#{page.url}) already stored." }
|
205
211
|
queue.commit
|
@@ -253,7 +259,7 @@ module Polipus
|
|
253
259
|
end
|
254
260
|
end
|
255
261
|
|
256
|
-
@workers_pool.each
|
262
|
+
@workers_pool.each(&:join)
|
257
263
|
@on_crawl_end.each { |e| e.call(self) }
|
258
264
|
execute_plugin 'on_crawl_end'
|
259
265
|
end
|
@@ -474,7 +480,7 @@ module Polipus
|
|
474
480
|
next unless p.respond_to?(method)
|
475
481
|
@logger.info { "Running plugin method #{method} on #{k}" }
|
476
482
|
ret_val = p.send(method, self)
|
477
|
-
instance_eval(&ret_val) if ret_val.
|
483
|
+
instance_eval(&ret_val) if ret_val.is_a? Proc
|
478
484
|
end
|
479
485
|
end
|
480
486
|
end
|
data/lib/polipus/http.rb
CHANGED
@@ -80,7 +80,11 @@ module Polipus
|
|
80
80
|
# or nil if no such option is set
|
81
81
|
#
|
82
82
|
def user_agent
|
83
|
-
@opts[:user_agent]
|
83
|
+
if @opts[:user_agent].respond_to?(:sample)
|
84
|
+
@opts[:user_agent].sample
|
85
|
+
else
|
86
|
+
@opts[:user_agent]
|
87
|
+
end
|
84
88
|
end
|
85
89
|
|
86
90
|
#
|
@@ -108,7 +112,7 @@ module Polipus
|
|
108
112
|
# The proxy password
|
109
113
|
#
|
110
114
|
def proxy_pass
|
111
|
-
#return proxy_host_port[3] unless @opts[:proxy_host_port].nil?
|
115
|
+
# return proxy_host_port[3] unless @opts[:proxy_host_port].nil?
|
112
116
|
@opts[:proxy_pass].respond_to?(:call) ? @opts[:proxy_pass].call(self) : @opts[:proxy_pass]
|
113
117
|
end
|
114
118
|
|
@@ -162,7 +166,13 @@ module Polipus
|
|
162
166
|
|
163
167
|
response, response_time = get_response(loc, referer)
|
164
168
|
code = Integer(response.code)
|
165
|
-
redirect_to =
|
169
|
+
redirect_to =
|
170
|
+
begin
|
171
|
+
response.is_a?(Net::HTTPRedirection) ? URI(response['location']).normalize : nil
|
172
|
+
rescue URI::InvalidURIError => e
|
173
|
+
@opts[:logger].debug { "Request #{url} got #{e}" } if @opts[:logger]
|
174
|
+
nil
|
175
|
+
end
|
166
176
|
yield response, code, loc, redirect_to, response_time
|
167
177
|
limit -= 1
|
168
178
|
break unless (loc = redirect_to) && allowed?(redirect_to, url) && limit > 0
|
@@ -233,7 +243,7 @@ module Polipus
|
|
233
243
|
|
234
244
|
# Block has higher priority
|
235
245
|
unless @opts[:proxy_host_port].nil?
|
236
|
-
p_host, p_port, p_user, p_pass = proxy_host_port
|
246
|
+
p_host, p_port, p_user, p_pass = proxy_host_port
|
237
247
|
else
|
238
248
|
p_host = proxy_host
|
239
249
|
p_port = proxy_port
|
data/lib/polipus/page.rb
CHANGED
@@ -3,6 +3,8 @@ require 'nokogiri'
|
|
3
3
|
require 'json'
|
4
4
|
require 'ostruct'
|
5
5
|
require 'set'
|
6
|
+
require 'kconv'
|
7
|
+
|
6
8
|
module Polipus
|
7
9
|
class Page
|
8
10
|
# The URL of the page
|
@@ -70,7 +72,7 @@ module Polipus
|
|
70
72
|
u = a['href']
|
71
73
|
next if u.nil? || u.empty?
|
72
74
|
abs = to_absolute(u) rescue next
|
73
|
-
@links << abs if in_domain?(abs)
|
75
|
+
@links << abs if abs && in_domain?(abs)
|
74
76
|
end
|
75
77
|
@links.to_a
|
76
78
|
end
|
@@ -80,7 +82,10 @@ module Polipus
|
|
80
82
|
#
|
81
83
|
def doc
|
82
84
|
return @doc if @doc
|
83
|
-
@
|
85
|
+
@body ||= ''
|
86
|
+
@body = @body.encode('utf-8', 'binary', invalid: :replace,
|
87
|
+
undef: :replace, replace: '')
|
88
|
+
@doc = Nokogiri::HTML(@body.toutf8, nil, 'utf-8') if @body && html?
|
84
89
|
end
|
85
90
|
|
86
91
|
#
|
@@ -166,10 +171,22 @@ module Polipus
|
|
166
171
|
def to_absolute(link)
|
167
172
|
return nil if link.nil?
|
168
173
|
|
174
|
+
valid_link = link.to_s.encode('utf-8', 'binary', invalid: :replace,
|
175
|
+
undef: :replace, replace: '')
|
176
|
+
|
169
177
|
# remove anchor
|
170
|
-
link =
|
178
|
+
link =
|
179
|
+
begin
|
180
|
+
URI.encode(URI.decode(valid_link.gsub(/#[a-zA-Z0-9_-]*$/, '')))
|
181
|
+
rescue URI::Error
|
182
|
+
return nil
|
183
|
+
end
|
171
184
|
|
172
|
-
relative =
|
185
|
+
relative = begin
|
186
|
+
URI(link)
|
187
|
+
rescue URI::Error
|
188
|
+
return nil
|
189
|
+
end
|
173
190
|
absolute = base ? base.merge(relative) : @url.merge(relative)
|
174
191
|
|
175
192
|
absolute.path = '/' if absolute.path.empty?
|
@@ -5,15 +5,15 @@ module Polipus
|
|
5
5
|
module QueueOverflow
|
6
6
|
def self.mongo_queue(mongo_db, queue_name, options = {})
|
7
7
|
require 'polipus/queue_overflow/mongo_queue'
|
8
|
-
mongo_db ||= Mongo::
|
9
|
-
fail 'First argument must be an instance of Mongo::
|
8
|
+
mongo_db ||= Mongo::Client.new(['localhost:27_017'], database: 'polipus')
|
9
|
+
fail 'First argument must be an instance of Mongo::Client' unless mongo_db.is_a?(Mongo::Client)
|
10
10
|
self::MongoQueue.new mongo_db, queue_name, options
|
11
11
|
end
|
12
12
|
|
13
13
|
def self.mongo_queue_capped(mongo_db, queue_name, options = {})
|
14
14
|
require 'polipus/queue_overflow/mongo_queue_capped'
|
15
|
-
mongo_db ||= Mongo::
|
16
|
-
fail 'First argument must be an instance of Mongo::
|
15
|
+
mongo_db ||= Mongo::Client.new(['localhost:27_017'], database: 'polipus')
|
16
|
+
fail 'First argument must be an instance of Mongo::Client' unless mongo_db.is_a?(Mongo::Client)
|
17
17
|
options[:max] = 1_000_000 if options[:max].nil?
|
18
18
|
self::MongoQueueCapped.new mongo_db, queue_name, options
|
19
19
|
end
|
@@ -14,7 +14,7 @@ module Polipus
|
|
14
14
|
end
|
15
15
|
|
16
16
|
def length
|
17
|
-
@mongo_db[@collection_name].count
|
17
|
+
@mongo_db[@collection_name].find.count
|
18
18
|
end
|
19
19
|
|
20
20
|
def empty?
|
@@ -28,18 +28,18 @@ module Polipus
|
|
28
28
|
|
29
29
|
def push(data)
|
30
30
|
if @options[:ensure_uniq]
|
31
|
-
@mongo_db[@collection_name].
|
31
|
+
@mongo_db[@collection_name].find(payload: data).replace_one({ payload: data }, upsert: true)
|
32
32
|
else
|
33
|
-
@mongo_db[@collection_name].
|
33
|
+
@mongo_db[@collection_name].insert_one(payload: data)
|
34
34
|
end
|
35
35
|
true
|
36
36
|
end
|
37
37
|
|
38
38
|
def pop(_ = false)
|
39
39
|
@semaphore.synchronize do
|
40
|
-
doc = @mongo_db[@collection_name].find(
|
40
|
+
doc = @mongo_db[@collection_name].find.sort(_id: 1).limit(1).first
|
41
41
|
return nil if doc.nil?
|
42
|
-
@mongo_db[@collection_name].
|
42
|
+
@mongo_db[@collection_name].find(_id: doc['_id']).delete_one
|
43
43
|
doc && doc['payload'] ? doc['payload'] : nil
|
44
44
|
end
|
45
45
|
end
|
@@ -53,7 +53,8 @@ module Polipus
|
|
53
53
|
protected
|
54
54
|
|
55
55
|
def ensure_index
|
56
|
-
@
|
56
|
+
# @TODO: Drop dups option was removed. We may want to add something here to remove duplications
|
57
|
+
@mongo_db[@collection_name].indexes.create_one({ payload: 1 }, background: true, unique: true)
|
57
58
|
end
|
58
59
|
end
|
59
60
|
end
|
@@ -13,8 +13,8 @@ module Polipus
|
|
13
13
|
@semaphore.synchronize do
|
14
14
|
s = size
|
15
15
|
if s > @max
|
16
|
-
docs = @mongo_db[@collection_name].find(
|
17
|
-
@mongo_db[@collection_name].
|
16
|
+
docs = @mongo_db[@collection_name].find.sort(_id: 1).projection(_id: 1).limit(s - @max).map { |e| e['_id'] }
|
17
|
+
@mongo_db[@collection_name].find(_id: { '$in' => docs }).delete_many
|
18
18
|
end
|
19
19
|
end
|
20
20
|
end
|
data/lib/polipus/storage.rb
CHANGED
@@ -6,14 +6,14 @@ module Polipus
|
|
6
6
|
|
7
7
|
def self.mongo_store(mongo = nil, collection = COLLECTION, except = [])
|
8
8
|
require 'polipus/storage/mongo_store'
|
9
|
-
mongo ||= Mongo::
|
10
|
-
fail 'First argument must be an instance of Mongo::
|
9
|
+
mongo ||= Mongo::Client.new(['localhost:27_017'], database: 'polipus')
|
10
|
+
fail 'First argument must be an instance of Mongo::Client' unless mongo.is_a?(Mongo::Client)
|
11
11
|
self::MongoStore.new(mongo: mongo, collection: collection, except: except)
|
12
12
|
end
|
13
13
|
|
14
14
|
def self.rethink_store(conn = nil, table = COLLECTION, except = [])
|
15
15
|
require 'polipus/storage/rethink_store'
|
16
|
-
conn ||= RethinkDB::RQL.new.connect(host: 'localhost', port: 28_015, db: 'polipus'
|
16
|
+
conn ||= RethinkDB::RQL.new.connect(host: 'localhost', port: 28_015, db: 'polipus')
|
17
17
|
fail "First argument must be a RethinkDB::Connection, got `#{conn.class}`" unless conn.is_a?(RethinkDB::Connection)
|
18
18
|
self::RethinkStore.new(conn: conn, table: table, except: except)
|
19
19
|
end
|
@@ -2,6 +2,7 @@
|
|
2
2
|
require 'mongo'
|
3
3
|
require 'zlib'
|
4
4
|
require 'thread'
|
5
|
+
require 'pry'
|
5
6
|
|
6
7
|
module Polipus
|
7
8
|
module Storage
|
@@ -10,10 +11,9 @@ module Polipus
|
|
10
11
|
def initialize(options = {})
|
11
12
|
@mongo = options[:mongo]
|
12
13
|
@collection = options[:collection]
|
13
|
-
@mongo.create_collection(@collection)
|
14
14
|
begin
|
15
15
|
@mongo[@collection].ensure_index(:uuid, unique: true, dropDups: true, background: true)
|
16
|
-
rescue
|
16
|
+
rescue StandardError
|
17
17
|
end
|
18
18
|
|
19
19
|
@compress_body = options[:compress_body] ||= true
|
@@ -28,16 +28,20 @@ module Polipus
|
|
28
28
|
obj['uuid'] = uuid(page)
|
29
29
|
obj['body'] = Zlib::Deflate.deflate(obj['body']) if @compress_body && obj['body']
|
30
30
|
BINARY_FIELDS.each do |field|
|
31
|
-
obj[field] = BSON::Binary.new(obj[field]) unless obj[field].nil?
|
31
|
+
obj[field] = BSON::Binary.new(obj[field].force_encoding('UTF-8').encode('UTF-8')) unless obj[field].nil?
|
32
32
|
end
|
33
|
-
|
33
|
+
|
34
|
+
# We really need 2.0.6+ version for this to work
|
35
|
+
# https://jira.mongodb.org/browse/RUBY-881
|
36
|
+
@mongo[@collection].find(uuid: uuid(page)).replace_one(obj, upsert: true)
|
37
|
+
|
34
38
|
obj['uuid']
|
35
39
|
end
|
36
40
|
end
|
37
41
|
|
38
42
|
def exists?(page)
|
39
43
|
@semaphore.synchronize do
|
40
|
-
doc = @mongo[@collection].find(
|
44
|
+
doc = @mongo[@collection].find(uuid: uuid(page)).projection(_id: 1).limit(1).first
|
41
45
|
!doc.nil?
|
42
46
|
end
|
43
47
|
end
|
@@ -51,16 +55,16 @@ module Polipus
|
|
51
55
|
|
52
56
|
def remove(page)
|
53
57
|
@semaphore.synchronize do
|
54
|
-
@mongo[@collection].
|
58
|
+
@mongo[@collection].find(uuid: uuid(page)).delete_one
|
55
59
|
end
|
56
60
|
end
|
57
61
|
|
58
62
|
def count
|
59
|
-
@mongo[@collection].count
|
63
|
+
@mongo[@collection].find.count
|
60
64
|
end
|
61
65
|
|
62
66
|
def each
|
63
|
-
@mongo[@collection].find
|
67
|
+
@mongo[@collection].find.no_cursor_timeout do |cursor|
|
64
68
|
cursor.each do |doc|
|
65
69
|
page = load_page(doc)
|
66
70
|
yield doc['uuid'], page
|
@@ -76,7 +80,7 @@ module Polipus
|
|
76
80
|
|
77
81
|
def load_page(hash)
|
78
82
|
BINARY_FIELDS.each do |field|
|
79
|
-
hash[field] = hash[field].
|
83
|
+
hash[field] = hash[field].data unless hash[field].nil?
|
80
84
|
end
|
81
85
|
hash['body'] = Zlib::Inflate.inflate(hash['body']) if @compress_body && hash['body'] && !hash['body'].empty?
|
82
86
|
page = Page.from_hash(hash)
|
@@ -12,6 +12,9 @@ module Polipus
|
|
12
12
|
@rethink = options[:conn]
|
13
13
|
@table = options[:table]
|
14
14
|
|
15
|
+
unless @r.db_list.run(@rethink).include?(@rethink.default_db)
|
16
|
+
@r.db_create(@rethink.default_db).run(@rethink)
|
17
|
+
end
|
15
18
|
unless @r.table_list.run(@rethink).include?(@table)
|
16
19
|
@r.table_create(@table).run(@rethink)
|
17
20
|
@r.table(@table).index_create('created_at')
|
data/lib/polipus/version.rb
CHANGED
data/polipus.gemspec
CHANGED
@@ -8,10 +8,10 @@ Gem::Specification.new do |s|
|
|
8
8
|
s.authors = ['Francesco Laurita']
|
9
9
|
s.email = ['francesco.laurita@gmail.com']
|
10
10
|
s.homepage = Polipus::HOMEPAGE
|
11
|
-
s.summary =
|
12
|
-
s.description =
|
11
|
+
s.summary = 'Polipus distributed web-crawler framework'
|
12
|
+
s.description = '
|
13
13
|
An easy to use distributed web-crawler framework based on Redis
|
14
|
-
|
14
|
+
'
|
15
15
|
s.licenses = ['MIT']
|
16
16
|
s.platform = Gem::Platform::RUBY
|
17
17
|
|
@@ -30,8 +30,8 @@ Gem::Specification.new do |s|
|
|
30
30
|
s.add_runtime_dependency 'redis-queue', '~> 0.0', '>= 0.0.4'
|
31
31
|
s.add_runtime_dependency 'redis-bloomfilter', '~> 0.0', '>= 0.0.3'
|
32
32
|
|
33
|
-
s.add_development_dependency 'mongo', '~>
|
34
|
-
s.add_development_dependency 'rethinkdb', '~>1.15.0'
|
33
|
+
s.add_development_dependency 'mongo', '~> 2.0.6'
|
34
|
+
s.add_development_dependency 'rethinkdb', '~> 1.15.0'
|
35
35
|
|
36
36
|
s.add_development_dependency 'rake', '~> 10.3'
|
37
37
|
s.add_development_dependency 'rspec', '~> 3.1.0'
|
@@ -41,4 +41,6 @@ Gem::Specification.new do |s|
|
|
41
41
|
s.add_development_dependency 'webmock', '~> 1.20.0'
|
42
42
|
|
43
43
|
s.add_development_dependency 'coveralls'
|
44
|
+
|
45
|
+
s.add_development_dependency 'pry'
|
44
46
|
end
|
data/spec/polipus/http_spec.rb
CHANGED
@@ -108,4 +108,32 @@ describe Polipus::HTTP do
|
|
108
108
|
end
|
109
109
|
end
|
110
110
|
end
|
111
|
+
|
112
|
+
describe 'random user_agent' do
|
113
|
+
context 'when user_agent is string' do
|
114
|
+
it '#user_agent' do
|
115
|
+
http = Polipus::HTTP.new(open_timeout: 1, read_timeout: 1, user_agent: 'Googlebot')
|
116
|
+
expect(http.user_agent).to eq('Googlebot')
|
117
|
+
end
|
118
|
+
end
|
119
|
+
|
120
|
+
context 'when user_agent is list' do
|
121
|
+
let(:user_agents) do
|
122
|
+
['Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/13.0.782.24 Safari/535.1',
|
123
|
+
'Mozilla/5.0 (Windows NT 6.0) AppleWebKit/535.2 (KHTML, like Gecko) Chrome/15.0.874.120 Safari/535.2',
|
124
|
+
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.7 (KHTML, like Gecko) Chrome/16.0.912.36 Safari/535.7',
|
125
|
+
'Mozilla/5.0 (Windows; U; Windows NT 6.0 x64; en-US; rv:1.9pre) Gecko/2008072421 Minefield/3.0.2pre',
|
126
|
+
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.0.10) Gecko/2009042316 Firefox/3.0.10',
|
127
|
+
'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-GB; rv:1.9.0.11) Gecko/2009060215 Firefox/3.0.11 (.NET CLR 3.5.30729)',
|
128
|
+
'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6 GTB5',
|
129
|
+
'Mozilla/5.0 (Windows; U; Windows NT 5.1; tr; rv:1.9.2.8) Gecko/20100722 Firefox/3.6.8 ( .NET CLR 3.5.30729; .NET4.0E)'
|
130
|
+
]
|
131
|
+
end
|
132
|
+
|
133
|
+
it '#user_agent' do
|
134
|
+
http = Polipus::HTTP.new(open_timeout: 1, read_timeout: 1, user_agent: user_agents)
|
135
|
+
expect(user_agents).to include(http.user_agent)
|
136
|
+
end
|
137
|
+
end
|
138
|
+
end
|
111
139
|
end
|
@@ -6,7 +6,7 @@ require 'redis-queue'
|
|
6
6
|
|
7
7
|
describe Polipus::QueueOverflow::Manager do
|
8
8
|
before(:all) do
|
9
|
-
@mongo = Mongo::
|
9
|
+
@mongo = Mongo::Client.new(['localhost:27_017'], database: '_test_polipus')
|
10
10
|
@mongo['_test_pages'].drop
|
11
11
|
@storage = Polipus::Storage.mongo_store(@mongo, '_test_pages')
|
12
12
|
@redis_q = Redis::Queue.new('queue_test', 'bp_queue_test', redis: Redis.new)
|
@@ -4,7 +4,7 @@ require 'mongo'
|
|
4
4
|
require 'polipus/storage/mongo_store'
|
5
5
|
describe Polipus::Storage::MongoStore do
|
6
6
|
before(:all)do
|
7
|
-
@mongo = Mongo::
|
7
|
+
@mongo = Mongo::Client.new(['localhost:27_017'], database: '_test_polipus')
|
8
8
|
@mongo['_test_pages'].drop
|
9
9
|
@storage = Polipus::Storage.mongo_store(@mongo, '_test_pages')
|
10
10
|
end
|
@@ -21,8 +21,8 @@ describe Polipus::Storage::MongoStore do
|
|
21
21
|
p = page_factory 'http://www.google.com', code: 200, body: '<html></html>'
|
22
22
|
uuid = @storage.add p
|
23
23
|
expect(uuid).to eq('ed646a3334ca891fd3467db131372140')
|
24
|
-
expect(@storage.count).to be 1
|
25
|
-
expect(@mongo['_test_pages'].count).to be 1
|
24
|
+
expect(@storage.count.to_i).to be 1
|
25
|
+
expect(@mongo['_test_pages'].find.count.to_i).to be 1
|
26
26
|
p = @storage.get p
|
27
27
|
expect(p.url.to_s).to eq('http://www.google.com')
|
28
28
|
expect(p.body).to eq('<html></html>')
|
@@ -33,7 +33,7 @@ describe Polipus::Storage::MongoStore do
|
|
33
33
|
@storage.add p
|
34
34
|
p = @storage.get p
|
35
35
|
expect(p.code).to eq(301)
|
36
|
-
expect(@mongo['_test_pages'].count).to be 1
|
36
|
+
expect(@mongo['_test_pages'].find.count.to_i).to be 1
|
37
37
|
end
|
38
38
|
|
39
39
|
it 'should iterate over stored pages' do
|
@@ -47,7 +47,7 @@ describe Polipus::Storage::MongoStore do
|
|
47
47
|
p = page_factory 'http://www.google.com', code: 301, body: '<html></html>'
|
48
48
|
@storage.remove p
|
49
49
|
expect(@storage.get(p)).to be_nil
|
50
|
-
expect(@storage.count).to be 0
|
50
|
+
expect(@storage.count.to_i).to be 0
|
51
51
|
end
|
52
52
|
|
53
53
|
it 'should store a page removing a query string from the uuid generation' do
|
@@ -83,7 +83,7 @@ describe Polipus::Storage::MongoStore do
|
|
83
83
|
p = page_factory 'http://www.user-doo.com', code: 200, body: '<html></html>'
|
84
84
|
storage.add p
|
85
85
|
p = storage.get p
|
86
|
-
expect(p.body).to
|
86
|
+
expect(p.body).to be_nil
|
87
87
|
storage.clear
|
88
88
|
end
|
89
89
|
|
data/spec/polipus_spec.rb
CHANGED
@@ -93,5 +93,14 @@ describe Polipus::PolipusCrawler do
|
|
93
93
|
polipus.takeover
|
94
94
|
polipus.storage.each { |_id, page| expect(page.url.path =~ /$\/downloads\//).to be_falsey }
|
95
95
|
end
|
96
|
+
|
97
|
+
it 'should obey to the robots.txt file with list user_agent' do
|
98
|
+
user_agent = 'Mozilla/5.0 (Windows; U; Windows NT 5.1; tr; rv:1.9.2.8) Gecko/20100722 Firefox/3.6.8 ( .NET CLR 3.5.30729; .NET4.0E)'
|
99
|
+
lopt = p_options
|
100
|
+
lopt[:obey_robots_txt] = true
|
101
|
+
lopt[:user_agent] = [user_agent]
|
102
|
+
flexmock(Polipus::Robotex).should_receive(:new).with(user_agent)
|
103
|
+
Polipus::PolipusCrawler.new('polipus-rspec', ['https://rubygems.org/gems/polipus'], lopt)
|
104
|
+
end
|
96
105
|
end
|
97
106
|
end
|
metadata
CHANGED
@@ -1,248 +1,262 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: polipus
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.5.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Francesco Laurita
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2015-
|
11
|
+
date: 2015-07-09 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: nokogiri
|
15
15
|
requirement: !ruby/object:Gem::Requirement
|
16
16
|
requirements:
|
17
|
-
- - ~>
|
17
|
+
- - "~>"
|
18
18
|
- !ruby/object:Gem::Version
|
19
19
|
version: '1.6'
|
20
|
-
- -
|
20
|
+
- - ">="
|
21
21
|
- !ruby/object:Gem::Version
|
22
22
|
version: 1.6.0
|
23
23
|
type: :runtime
|
24
24
|
prerelease: false
|
25
25
|
version_requirements: !ruby/object:Gem::Requirement
|
26
26
|
requirements:
|
27
|
-
- - ~>
|
27
|
+
- - "~>"
|
28
28
|
- !ruby/object:Gem::Version
|
29
29
|
version: '1.6'
|
30
|
-
- -
|
30
|
+
- - ">="
|
31
31
|
- !ruby/object:Gem::Version
|
32
32
|
version: 1.6.0
|
33
33
|
- !ruby/object:Gem::Dependency
|
34
34
|
name: http-cookie
|
35
35
|
requirement: !ruby/object:Gem::Requirement
|
36
36
|
requirements:
|
37
|
-
- - ~>
|
37
|
+
- - "~>"
|
38
38
|
- !ruby/object:Gem::Version
|
39
39
|
version: '1.0'
|
40
|
-
- -
|
40
|
+
- - ">="
|
41
41
|
- !ruby/object:Gem::Version
|
42
42
|
version: 1.0.1
|
43
43
|
type: :runtime
|
44
44
|
prerelease: false
|
45
45
|
version_requirements: !ruby/object:Gem::Requirement
|
46
46
|
requirements:
|
47
|
-
- - ~>
|
47
|
+
- - "~>"
|
48
48
|
- !ruby/object:Gem::Version
|
49
49
|
version: '1.0'
|
50
|
-
- -
|
50
|
+
- - ">="
|
51
51
|
- !ruby/object:Gem::Version
|
52
52
|
version: 1.0.1
|
53
53
|
- !ruby/object:Gem::Dependency
|
54
54
|
name: redis
|
55
55
|
requirement: !ruby/object:Gem::Requirement
|
56
56
|
requirements:
|
57
|
-
- - ~>
|
57
|
+
- - "~>"
|
58
58
|
- !ruby/object:Gem::Version
|
59
59
|
version: '3.0'
|
60
|
-
- -
|
60
|
+
- - ">="
|
61
61
|
- !ruby/object:Gem::Version
|
62
62
|
version: 3.0.4
|
63
63
|
type: :runtime
|
64
64
|
prerelease: false
|
65
65
|
version_requirements: !ruby/object:Gem::Requirement
|
66
66
|
requirements:
|
67
|
-
- - ~>
|
67
|
+
- - "~>"
|
68
68
|
- !ruby/object:Gem::Version
|
69
69
|
version: '3.0'
|
70
|
-
- -
|
70
|
+
- - ">="
|
71
71
|
- !ruby/object:Gem::Version
|
72
72
|
version: 3.0.4
|
73
73
|
- !ruby/object:Gem::Dependency
|
74
74
|
name: hiredis
|
75
75
|
requirement: !ruby/object:Gem::Requirement
|
76
76
|
requirements:
|
77
|
-
- - ~>
|
77
|
+
- - "~>"
|
78
78
|
- !ruby/object:Gem::Version
|
79
79
|
version: '0.5'
|
80
|
-
- -
|
80
|
+
- - ">="
|
81
81
|
- !ruby/object:Gem::Version
|
82
82
|
version: 0.4.5
|
83
83
|
type: :runtime
|
84
84
|
prerelease: false
|
85
85
|
version_requirements: !ruby/object:Gem::Requirement
|
86
86
|
requirements:
|
87
|
-
- - ~>
|
87
|
+
- - "~>"
|
88
88
|
- !ruby/object:Gem::Version
|
89
89
|
version: '0.5'
|
90
|
-
- -
|
90
|
+
- - ">="
|
91
91
|
- !ruby/object:Gem::Version
|
92
92
|
version: 0.4.5
|
93
93
|
- !ruby/object:Gem::Dependency
|
94
94
|
name: redis-queue
|
95
95
|
requirement: !ruby/object:Gem::Requirement
|
96
96
|
requirements:
|
97
|
-
- - ~>
|
97
|
+
- - "~>"
|
98
98
|
- !ruby/object:Gem::Version
|
99
99
|
version: '0.0'
|
100
|
-
- -
|
100
|
+
- - ">="
|
101
101
|
- !ruby/object:Gem::Version
|
102
102
|
version: 0.0.4
|
103
103
|
type: :runtime
|
104
104
|
prerelease: false
|
105
105
|
version_requirements: !ruby/object:Gem::Requirement
|
106
106
|
requirements:
|
107
|
-
- - ~>
|
107
|
+
- - "~>"
|
108
108
|
- !ruby/object:Gem::Version
|
109
109
|
version: '0.0'
|
110
|
-
- -
|
110
|
+
- - ">="
|
111
111
|
- !ruby/object:Gem::Version
|
112
112
|
version: 0.0.4
|
113
113
|
- !ruby/object:Gem::Dependency
|
114
114
|
name: redis-bloomfilter
|
115
115
|
requirement: !ruby/object:Gem::Requirement
|
116
116
|
requirements:
|
117
|
-
- - ~>
|
117
|
+
- - "~>"
|
118
118
|
- !ruby/object:Gem::Version
|
119
119
|
version: '0.0'
|
120
|
-
- -
|
120
|
+
- - ">="
|
121
121
|
- !ruby/object:Gem::Version
|
122
122
|
version: 0.0.3
|
123
123
|
type: :runtime
|
124
124
|
prerelease: false
|
125
125
|
version_requirements: !ruby/object:Gem::Requirement
|
126
126
|
requirements:
|
127
|
-
- - ~>
|
127
|
+
- - "~>"
|
128
128
|
- !ruby/object:Gem::Version
|
129
129
|
version: '0.0'
|
130
|
-
- -
|
130
|
+
- - ">="
|
131
131
|
- !ruby/object:Gem::Version
|
132
132
|
version: 0.0.3
|
133
133
|
- !ruby/object:Gem::Dependency
|
134
134
|
name: mongo
|
135
135
|
requirement: !ruby/object:Gem::Requirement
|
136
136
|
requirements:
|
137
|
-
- - ~>
|
137
|
+
- - "~>"
|
138
138
|
- !ruby/object:Gem::Version
|
139
|
-
version:
|
139
|
+
version: 2.0.6
|
140
140
|
type: :development
|
141
141
|
prerelease: false
|
142
142
|
version_requirements: !ruby/object:Gem::Requirement
|
143
143
|
requirements:
|
144
|
-
- - ~>
|
144
|
+
- - "~>"
|
145
145
|
- !ruby/object:Gem::Version
|
146
|
-
version:
|
146
|
+
version: 2.0.6
|
147
147
|
- !ruby/object:Gem::Dependency
|
148
148
|
name: rethinkdb
|
149
149
|
requirement: !ruby/object:Gem::Requirement
|
150
150
|
requirements:
|
151
|
-
- - ~>
|
151
|
+
- - "~>"
|
152
152
|
- !ruby/object:Gem::Version
|
153
153
|
version: 1.15.0
|
154
154
|
type: :development
|
155
155
|
prerelease: false
|
156
156
|
version_requirements: !ruby/object:Gem::Requirement
|
157
157
|
requirements:
|
158
|
-
- - ~>
|
158
|
+
- - "~>"
|
159
159
|
- !ruby/object:Gem::Version
|
160
160
|
version: 1.15.0
|
161
161
|
- !ruby/object:Gem::Dependency
|
162
162
|
name: rake
|
163
163
|
requirement: !ruby/object:Gem::Requirement
|
164
164
|
requirements:
|
165
|
-
- - ~>
|
165
|
+
- - "~>"
|
166
166
|
- !ruby/object:Gem::Version
|
167
167
|
version: '10.3'
|
168
168
|
type: :development
|
169
169
|
prerelease: false
|
170
170
|
version_requirements: !ruby/object:Gem::Requirement
|
171
171
|
requirements:
|
172
|
-
- - ~>
|
172
|
+
- - "~>"
|
173
173
|
- !ruby/object:Gem::Version
|
174
174
|
version: '10.3'
|
175
175
|
- !ruby/object:Gem::Dependency
|
176
176
|
name: rspec
|
177
177
|
requirement: !ruby/object:Gem::Requirement
|
178
178
|
requirements:
|
179
|
-
- - ~>
|
179
|
+
- - "~>"
|
180
180
|
- !ruby/object:Gem::Version
|
181
181
|
version: 3.1.0
|
182
182
|
type: :development
|
183
183
|
prerelease: false
|
184
184
|
version_requirements: !ruby/object:Gem::Requirement
|
185
185
|
requirements:
|
186
|
-
- - ~>
|
186
|
+
- - "~>"
|
187
187
|
- !ruby/object:Gem::Version
|
188
188
|
version: 3.1.0
|
189
189
|
- !ruby/object:Gem::Dependency
|
190
190
|
name: flexmock
|
191
191
|
requirement: !ruby/object:Gem::Requirement
|
192
192
|
requirements:
|
193
|
-
- - ~>
|
193
|
+
- - "~>"
|
194
194
|
- !ruby/object:Gem::Version
|
195
195
|
version: '1.3'
|
196
196
|
type: :development
|
197
197
|
prerelease: false
|
198
198
|
version_requirements: !ruby/object:Gem::Requirement
|
199
199
|
requirements:
|
200
|
-
- - ~>
|
200
|
+
- - "~>"
|
201
201
|
- !ruby/object:Gem::Version
|
202
202
|
version: '1.3'
|
203
203
|
- !ruby/object:Gem::Dependency
|
204
204
|
name: vcr
|
205
205
|
requirement: !ruby/object:Gem::Requirement
|
206
206
|
requirements:
|
207
|
-
- - ~>
|
207
|
+
- - "~>"
|
208
208
|
- !ruby/object:Gem::Version
|
209
209
|
version: 2.9.0
|
210
210
|
type: :development
|
211
211
|
prerelease: false
|
212
212
|
version_requirements: !ruby/object:Gem::Requirement
|
213
213
|
requirements:
|
214
|
-
- - ~>
|
214
|
+
- - "~>"
|
215
215
|
- !ruby/object:Gem::Version
|
216
216
|
version: 2.9.0
|
217
217
|
- !ruby/object:Gem::Dependency
|
218
218
|
name: webmock
|
219
219
|
requirement: !ruby/object:Gem::Requirement
|
220
220
|
requirements:
|
221
|
-
- - ~>
|
221
|
+
- - "~>"
|
222
222
|
- !ruby/object:Gem::Version
|
223
223
|
version: 1.20.0
|
224
224
|
type: :development
|
225
225
|
prerelease: false
|
226
226
|
version_requirements: !ruby/object:Gem::Requirement
|
227
227
|
requirements:
|
228
|
-
- - ~>
|
228
|
+
- - "~>"
|
229
229
|
- !ruby/object:Gem::Version
|
230
230
|
version: 1.20.0
|
231
231
|
- !ruby/object:Gem::Dependency
|
232
232
|
name: coveralls
|
233
233
|
requirement: !ruby/object:Gem::Requirement
|
234
234
|
requirements:
|
235
|
-
- -
|
235
|
+
- - ">="
|
236
236
|
- !ruby/object:Gem::Version
|
237
237
|
version: '0'
|
238
238
|
type: :development
|
239
239
|
prerelease: false
|
240
240
|
version_requirements: !ruby/object:Gem::Requirement
|
241
241
|
requirements:
|
242
|
-
- -
|
242
|
+
- - ">="
|
243
243
|
- !ruby/object:Gem::Version
|
244
244
|
version: '0'
|
245
|
-
|
245
|
+
- !ruby/object:Gem::Dependency
|
246
|
+
name: pry
|
247
|
+
requirement: !ruby/object:Gem::Requirement
|
248
|
+
requirements:
|
249
|
+
- - ">="
|
250
|
+
- !ruby/object:Gem::Version
|
251
|
+
version: '0'
|
252
|
+
type: :development
|
253
|
+
prerelease: false
|
254
|
+
version_requirements: !ruby/object:Gem::Requirement
|
255
|
+
requirements:
|
256
|
+
- - ">="
|
257
|
+
- !ruby/object:Gem::Version
|
258
|
+
version: '0'
|
259
|
+
description: "\n An easy to use distributed web-crawler framework based on Redis\n
|
246
260
|
\ "
|
247
261
|
email:
|
248
262
|
- francesco.laurita@gmail.com
|
@@ -250,12 +264,12 @@ executables: []
|
|
250
264
|
extensions: []
|
251
265
|
extra_rdoc_files: []
|
252
266
|
files:
|
253
|
-
- .document
|
254
|
-
- .gitignore
|
255
|
-
- .rspec
|
256
|
-
- .rubocop.yml
|
257
|
-
- .rubocop_todo.yml
|
258
|
-
- .travis.yml
|
267
|
+
- ".document"
|
268
|
+
- ".gitignore"
|
269
|
+
- ".rspec"
|
270
|
+
- ".rubocop.yml"
|
271
|
+
- ".rubocop_todo.yml"
|
272
|
+
- ".travis.yml"
|
259
273
|
- AUTHORS.md
|
260
274
|
- CHANGELOG.md
|
261
275
|
- Gemfile
|
@@ -330,17 +344,17 @@ require_paths:
|
|
330
344
|
- lib
|
331
345
|
required_ruby_version: !ruby/object:Gem::Requirement
|
332
346
|
requirements:
|
333
|
-
- -
|
347
|
+
- - ">="
|
334
348
|
- !ruby/object:Gem::Version
|
335
349
|
version: '0'
|
336
350
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
337
351
|
requirements:
|
338
|
-
- -
|
352
|
+
- - ">="
|
339
353
|
- !ruby/object:Gem::Version
|
340
354
|
version: '0'
|
341
355
|
requirements: []
|
342
356
|
rubyforge_project: polipus
|
343
|
-
rubygems_version: 2.
|
357
|
+
rubygems_version: 2.2.2
|
344
358
|
signing_key:
|
345
359
|
specification_version: 4
|
346
360
|
summary: Polipus distributed web-crawler framework
|
@@ -371,4 +385,3 @@ test_files:
|
|
371
385
|
- spec/polipus/url_tracker_spec.rb
|
372
386
|
- spec/polipus_spec.rb
|
373
387
|
- spec/spec_helper.rb
|
374
|
-
has_rdoc:
|