polipus 0.4.0 → 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +5 -13
- data/.rubocop_todo.yml +23 -8
- data/AUTHORS.md +1 -0
- data/CHANGELOG.md +7 -0
- data/lib/polipus.rb +11 -5
- data/lib/polipus/http.rb +14 -4
- data/lib/polipus/page.rb +21 -4
- data/lib/polipus/queue_overflow.rb +4 -4
- data/lib/polipus/queue_overflow/mongo_queue.rb +7 -6
- data/lib/polipus/queue_overflow/mongo_queue_capped.rb +2 -2
- data/lib/polipus/storage.rb +3 -3
- data/lib/polipus/storage/memory_store.rb +1 -1
- data/lib/polipus/storage/mongo_store.rb +13 -9
- data/lib/polipus/storage/rethink_store.rb +3 -0
- data/lib/polipus/version.rb +1 -1
- data/polipus.gemspec +7 -5
- data/spec/polipus/http_spec.rb +28 -0
- data/spec/polipus/queue_overflow/manager_spec.rb +1 -1
- data/spec/polipus/storage/memory_store_spec.rb +1 -1
- data/spec/polipus/storage/mongo_store_spec.rb +6 -6
- data/spec/polipus_spec.rb +9 -0
- metadata +68 -55
checksums.yaml
CHANGED
@@ -1,15 +1,7 @@
|
|
1
1
|
---
|
2
|
-
|
3
|
-
metadata.gz:
|
4
|
-
|
5
|
-
data.tar.gz: !binary |-
|
6
|
-
MTAxYjZmYWNjMTlkYzk3Y2ZhNjdjMDFmODM2OTQ5YmQ3ZDcyNTQzNw==
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 0e3c15f4ab5c3b4eca8f4703b80d7d8fa74210c3
|
4
|
+
data.tar.gz: ae0dd32d81bbcbef350949e0c51fecaaf59ac8ff
|
7
5
|
SHA512:
|
8
|
-
metadata.gz:
|
9
|
-
|
10
|
-
NmM1MDc5ZTI0ODkxNzlhY2ZkZGVjZjI2ZTMyNjZhNTMwOWZhZjQ2OGY4ZTJl
|
11
|
-
ZmI4ZjUwNmQ3YjAxOTdlODI2MWRmMWUzNjY3Yzk1MmNiZTk0Y2E=
|
12
|
-
data.tar.gz: !binary |-
|
13
|
-
YjNlYTZlMjU5OTBmN2ZjMTI1OTY1MzgzNzBhYjBjYTAzN2MwYzY0M2U1ZGYw
|
14
|
-
MjcyMmMxM2I1ZTY5OWEzZDQ3MjQ1OTgzMzcyN2I5NGQ5NTAzZTEzY2FmMDRl
|
15
|
-
MmJiZDdjOTFmMjQwZWU1MzM1OGJhN2E4NTRhNGExZTAyMTVmN2I=
|
6
|
+
metadata.gz: c3a57d1cbfbab4c77cf3b746c7b73eb84cf675e7998eb5979af6d17d997ee26781803e519eae50de6762590fc7287d09ccf4ac0d640dff40602b8a507a356de2
|
7
|
+
data.tar.gz: 743b128ee0dd8fefbbdd1bd7443afbc4a2c62dd2f3d3751d3d693719e9a8e2e145668ccd58945b77f5f780f50757a11248df2b1963c8e7cc35706e937627e5e0
|
data/.rubocop_todo.yml
CHANGED
@@ -1,25 +1,36 @@
|
|
1
1
|
# This configuration was generated by `rubocop --auto-gen-config`
|
2
|
-
# on
|
2
|
+
# on 2015-07-08 20:22:49 -0700 using RuboCop version 0.29.1.
|
3
3
|
# The point is for the user to remove these configuration records
|
4
4
|
# one by one as the offenses are removed from the code base.
|
5
5
|
# Note that changes in the inspected code, or installation of new
|
6
6
|
# versions of RuboCop, may require this file to be generated again.
|
7
7
|
|
8
8
|
# Offense count: 1
|
9
|
-
|
9
|
+
Lint/HandleExceptions:
|
10
10
|
Enabled: false
|
11
11
|
|
12
|
-
# Offense count:
|
12
|
+
# Offense count: 21
|
13
|
+
Metrics/AbcSize:
|
14
|
+
Max: 103
|
15
|
+
|
16
|
+
# Offense count: 12
|
13
17
|
Metrics/CyclomaticComplexity:
|
14
|
-
Max:
|
18
|
+
Max: 15
|
15
19
|
|
16
|
-
# Offense count:
|
17
|
-
|
20
|
+
# Offense count: 10
|
21
|
+
Metrics/PerceivedComplexity:
|
22
|
+
Max: 17
|
23
|
+
|
24
|
+
# Offense count: 1
|
25
|
+
Style/ClassVars:
|
18
26
|
Enabled: false
|
19
27
|
|
28
|
+
# Offense count: 27
|
29
|
+
Style/Documentation:
|
30
|
+
Enabled: false
|
20
31
|
|
21
32
|
# Offense count: 2
|
22
|
-
# Configuration parameters: EnforcedStyle, SupportedStyles.
|
33
|
+
# Configuration parameters: EnforcedStyle, MinBodyLength, SupportedStyles.
|
23
34
|
Style/Next:
|
24
35
|
Enabled: false
|
25
36
|
|
@@ -28,6 +39,10 @@ Style/Next:
|
|
28
39
|
Style/RegexpLiteral:
|
29
40
|
Enabled: false
|
30
41
|
|
31
|
-
# Offense count:
|
42
|
+
# Offense count: 3
|
32
43
|
Style/RescueModifier:
|
33
44
|
Enabled: false
|
45
|
+
|
46
|
+
# Offense count: 1
|
47
|
+
Style/UnlessElse:
|
48
|
+
Enabled: false
|
data/AUTHORS.md
CHANGED
data/CHANGELOG.md
CHANGED
@@ -1,5 +1,12 @@
|
|
1
1
|
# Changelog
|
2
2
|
|
3
|
+
## 0.5.0 (2015-07-08)
|
4
|
+
|
5
|
+
[Compare changes in gem](https://github.com/taganaka/polipus/compare/0.4.0...0.5.0)
|
6
|
+
|
7
|
+
* Support for MongoDB driver ~> 2.0.6 has been added
|
8
|
+
* Minor code cleanup
|
9
|
+
|
3
10
|
## 0.4.0 (2015-01-12)
|
4
11
|
|
5
12
|
[Compare changes in gem](https://github.com/taganaka/polipus/compare/0.3.3...0.4.0)
|
data/lib/polipus.rb
CHANGED
@@ -129,7 +129,14 @@ module Polipus
|
|
129
129
|
|
130
130
|
@urls = [urls].flatten.map { |url| URI(url) }
|
131
131
|
@urls.each { |url| url.path = '/' if url.path.empty? }
|
132
|
-
|
132
|
+
if @options[:obey_robots_txt]
|
133
|
+
@robots =
|
134
|
+
if @options[:user_agent].respond_to?(:sample)
|
135
|
+
Polipus::Robotex.new(@options[:user_agent].sample)
|
136
|
+
else
|
137
|
+
Polipus::Robotex.new(@options[:user_agent])
|
138
|
+
end
|
139
|
+
end
|
133
140
|
# Attach signal handling if enabled
|
134
141
|
SignalHandler.enable if @options[:enable_signal_handler]
|
135
142
|
|
@@ -170,7 +177,6 @@ module Polipus
|
|
170
177
|
http = HTTP.new(@options)
|
171
178
|
queue = queue_factory
|
172
179
|
queue.process(false, @options[:queue_timeout]) do |message|
|
173
|
-
|
174
180
|
next if message.nil?
|
175
181
|
|
176
182
|
execute_plugin 'on_message_received'
|
@@ -199,7 +205,7 @@ module Polipus
|
|
199
205
|
rurls = pages.map { |e| e.url.to_s }.join(' --> ')
|
200
206
|
@logger.info { "Got redirects! #{rurls}" }
|
201
207
|
page = pages.pop
|
202
|
-
page.aliases = pages.map
|
208
|
+
page.aliases = pages.map(&:url)
|
203
209
|
if page_exists? page
|
204
210
|
@logger.info { "[worker ##{worker_number}] Page (#{page.url}) already stored." }
|
205
211
|
queue.commit
|
@@ -253,7 +259,7 @@ module Polipus
|
|
253
259
|
end
|
254
260
|
end
|
255
261
|
|
256
|
-
@workers_pool.each
|
262
|
+
@workers_pool.each(&:join)
|
257
263
|
@on_crawl_end.each { |e| e.call(self) }
|
258
264
|
execute_plugin 'on_crawl_end'
|
259
265
|
end
|
@@ -474,7 +480,7 @@ module Polipus
|
|
474
480
|
next unless p.respond_to?(method)
|
475
481
|
@logger.info { "Running plugin method #{method} on #{k}" }
|
476
482
|
ret_val = p.send(method, self)
|
477
|
-
instance_eval(&ret_val) if ret_val.
|
483
|
+
instance_eval(&ret_val) if ret_val.is_a? Proc
|
478
484
|
end
|
479
485
|
end
|
480
486
|
end
|
data/lib/polipus/http.rb
CHANGED
@@ -80,7 +80,11 @@ module Polipus
|
|
80
80
|
# or nil if no such option is set
|
81
81
|
#
|
82
82
|
def user_agent
|
83
|
-
@opts[:user_agent]
|
83
|
+
if @opts[:user_agent].respond_to?(:sample)
|
84
|
+
@opts[:user_agent].sample
|
85
|
+
else
|
86
|
+
@opts[:user_agent]
|
87
|
+
end
|
84
88
|
end
|
85
89
|
|
86
90
|
#
|
@@ -108,7 +112,7 @@ module Polipus
|
|
108
112
|
# The proxy password
|
109
113
|
#
|
110
114
|
def proxy_pass
|
111
|
-
#return proxy_host_port[3] unless @opts[:proxy_host_port].nil?
|
115
|
+
# return proxy_host_port[3] unless @opts[:proxy_host_port].nil?
|
112
116
|
@opts[:proxy_pass].respond_to?(:call) ? @opts[:proxy_pass].call(self) : @opts[:proxy_pass]
|
113
117
|
end
|
114
118
|
|
@@ -162,7 +166,13 @@ module Polipus
|
|
162
166
|
|
163
167
|
response, response_time = get_response(loc, referer)
|
164
168
|
code = Integer(response.code)
|
165
|
-
redirect_to =
|
169
|
+
redirect_to =
|
170
|
+
begin
|
171
|
+
response.is_a?(Net::HTTPRedirection) ? URI(response['location']).normalize : nil
|
172
|
+
rescue URI::InvalidURIError => e
|
173
|
+
@opts[:logger].debug { "Request #{url} got #{e}" } if @opts[:logger]
|
174
|
+
nil
|
175
|
+
end
|
166
176
|
yield response, code, loc, redirect_to, response_time
|
167
177
|
limit -= 1
|
168
178
|
break unless (loc = redirect_to) && allowed?(redirect_to, url) && limit > 0
|
@@ -233,7 +243,7 @@ module Polipus
|
|
233
243
|
|
234
244
|
# Block has higher priority
|
235
245
|
unless @opts[:proxy_host_port].nil?
|
236
|
-
p_host, p_port, p_user, p_pass = proxy_host_port
|
246
|
+
p_host, p_port, p_user, p_pass = proxy_host_port
|
237
247
|
else
|
238
248
|
p_host = proxy_host
|
239
249
|
p_port = proxy_port
|
data/lib/polipus/page.rb
CHANGED
@@ -3,6 +3,8 @@ require 'nokogiri'
|
|
3
3
|
require 'json'
|
4
4
|
require 'ostruct'
|
5
5
|
require 'set'
|
6
|
+
require 'kconv'
|
7
|
+
|
6
8
|
module Polipus
|
7
9
|
class Page
|
8
10
|
# The URL of the page
|
@@ -70,7 +72,7 @@ module Polipus
|
|
70
72
|
u = a['href']
|
71
73
|
next if u.nil? || u.empty?
|
72
74
|
abs = to_absolute(u) rescue next
|
73
|
-
@links << abs if in_domain?(abs)
|
75
|
+
@links << abs if abs && in_domain?(abs)
|
74
76
|
end
|
75
77
|
@links.to_a
|
76
78
|
end
|
@@ -80,7 +82,10 @@ module Polipus
|
|
80
82
|
#
|
81
83
|
def doc
|
82
84
|
return @doc if @doc
|
83
|
-
@
|
85
|
+
@body ||= ''
|
86
|
+
@body = @body.encode('utf-8', 'binary', invalid: :replace,
|
87
|
+
undef: :replace, replace: '')
|
88
|
+
@doc = Nokogiri::HTML(@body.toutf8, nil, 'utf-8') if @body && html?
|
84
89
|
end
|
85
90
|
|
86
91
|
#
|
@@ -166,10 +171,22 @@ module Polipus
|
|
166
171
|
def to_absolute(link)
|
167
172
|
return nil if link.nil?
|
168
173
|
|
174
|
+
valid_link = link.to_s.encode('utf-8', 'binary', invalid: :replace,
|
175
|
+
undef: :replace, replace: '')
|
176
|
+
|
169
177
|
# remove anchor
|
170
|
-
link =
|
178
|
+
link =
|
179
|
+
begin
|
180
|
+
URI.encode(URI.decode(valid_link.gsub(/#[a-zA-Z0-9_-]*$/, '')))
|
181
|
+
rescue URI::Error
|
182
|
+
return nil
|
183
|
+
end
|
171
184
|
|
172
|
-
relative =
|
185
|
+
relative = begin
|
186
|
+
URI(link)
|
187
|
+
rescue URI::Error
|
188
|
+
return nil
|
189
|
+
end
|
173
190
|
absolute = base ? base.merge(relative) : @url.merge(relative)
|
174
191
|
|
175
192
|
absolute.path = '/' if absolute.path.empty?
|
@@ -5,15 +5,15 @@ module Polipus
|
|
5
5
|
module QueueOverflow
|
6
6
|
def self.mongo_queue(mongo_db, queue_name, options = {})
|
7
7
|
require 'polipus/queue_overflow/mongo_queue'
|
8
|
-
mongo_db ||= Mongo::
|
9
|
-
fail 'First argument must be an instance of Mongo::
|
8
|
+
mongo_db ||= Mongo::Client.new(['localhost:27_017'], database: 'polipus')
|
9
|
+
fail 'First argument must be an instance of Mongo::Client' unless mongo_db.is_a?(Mongo::Client)
|
10
10
|
self::MongoQueue.new mongo_db, queue_name, options
|
11
11
|
end
|
12
12
|
|
13
13
|
def self.mongo_queue_capped(mongo_db, queue_name, options = {})
|
14
14
|
require 'polipus/queue_overflow/mongo_queue_capped'
|
15
|
-
mongo_db ||= Mongo::
|
16
|
-
fail 'First argument must be an instance of Mongo::
|
15
|
+
mongo_db ||= Mongo::Client.new(['localhost:27_017'], database: 'polipus')
|
16
|
+
fail 'First argument must be an instance of Mongo::Client' unless mongo_db.is_a?(Mongo::Client)
|
17
17
|
options[:max] = 1_000_000 if options[:max].nil?
|
18
18
|
self::MongoQueueCapped.new mongo_db, queue_name, options
|
19
19
|
end
|
@@ -14,7 +14,7 @@ module Polipus
|
|
14
14
|
end
|
15
15
|
|
16
16
|
def length
|
17
|
-
@mongo_db[@collection_name].count
|
17
|
+
@mongo_db[@collection_name].find.count
|
18
18
|
end
|
19
19
|
|
20
20
|
def empty?
|
@@ -28,18 +28,18 @@ module Polipus
|
|
28
28
|
|
29
29
|
def push(data)
|
30
30
|
if @options[:ensure_uniq]
|
31
|
-
@mongo_db[@collection_name].
|
31
|
+
@mongo_db[@collection_name].find(payload: data).replace_one({ payload: data }, upsert: true)
|
32
32
|
else
|
33
|
-
@mongo_db[@collection_name].
|
33
|
+
@mongo_db[@collection_name].insert_one(payload: data)
|
34
34
|
end
|
35
35
|
true
|
36
36
|
end
|
37
37
|
|
38
38
|
def pop(_ = false)
|
39
39
|
@semaphore.synchronize do
|
40
|
-
doc = @mongo_db[@collection_name].find(
|
40
|
+
doc = @mongo_db[@collection_name].find.sort(_id: 1).limit(1).first
|
41
41
|
return nil if doc.nil?
|
42
|
-
@mongo_db[@collection_name].
|
42
|
+
@mongo_db[@collection_name].find(_id: doc['_id']).delete_one
|
43
43
|
doc && doc['payload'] ? doc['payload'] : nil
|
44
44
|
end
|
45
45
|
end
|
@@ -53,7 +53,8 @@ module Polipus
|
|
53
53
|
protected
|
54
54
|
|
55
55
|
def ensure_index
|
56
|
-
@
|
56
|
+
# @TODO: Drop dups option was removed. We may want to add something here to remove duplications
|
57
|
+
@mongo_db[@collection_name].indexes.create_one({ payload: 1 }, background: true, unique: true)
|
57
58
|
end
|
58
59
|
end
|
59
60
|
end
|
@@ -13,8 +13,8 @@ module Polipus
|
|
13
13
|
@semaphore.synchronize do
|
14
14
|
s = size
|
15
15
|
if s > @max
|
16
|
-
docs = @mongo_db[@collection_name].find(
|
17
|
-
@mongo_db[@collection_name].
|
16
|
+
docs = @mongo_db[@collection_name].find.sort(_id: 1).projection(_id: 1).limit(s - @max).map { |e| e['_id'] }
|
17
|
+
@mongo_db[@collection_name].find(_id: { '$in' => docs }).delete_many
|
18
18
|
end
|
19
19
|
end
|
20
20
|
end
|
data/lib/polipus/storage.rb
CHANGED
@@ -6,14 +6,14 @@ module Polipus
|
|
6
6
|
|
7
7
|
def self.mongo_store(mongo = nil, collection = COLLECTION, except = [])
|
8
8
|
require 'polipus/storage/mongo_store'
|
9
|
-
mongo ||= Mongo::
|
10
|
-
fail 'First argument must be an instance of Mongo::
|
9
|
+
mongo ||= Mongo::Client.new(['localhost:27_017'], database: 'polipus')
|
10
|
+
fail 'First argument must be an instance of Mongo::Client' unless mongo.is_a?(Mongo::Client)
|
11
11
|
self::MongoStore.new(mongo: mongo, collection: collection, except: except)
|
12
12
|
end
|
13
13
|
|
14
14
|
def self.rethink_store(conn = nil, table = COLLECTION, except = [])
|
15
15
|
require 'polipus/storage/rethink_store'
|
16
|
-
conn ||= RethinkDB::RQL.new.connect(host: 'localhost', port: 28_015, db: 'polipus'
|
16
|
+
conn ||= RethinkDB::RQL.new.connect(host: 'localhost', port: 28_015, db: 'polipus')
|
17
17
|
fail "First argument must be a RethinkDB::Connection, got `#{conn.class}`" unless conn.is_a?(RethinkDB::Connection)
|
18
18
|
self::RethinkStore.new(conn: conn, table: table, except: except)
|
19
19
|
end
|
@@ -2,6 +2,7 @@
|
|
2
2
|
require 'mongo'
|
3
3
|
require 'zlib'
|
4
4
|
require 'thread'
|
5
|
+
require 'pry'
|
5
6
|
|
6
7
|
module Polipus
|
7
8
|
module Storage
|
@@ -10,10 +11,9 @@ module Polipus
|
|
10
11
|
def initialize(options = {})
|
11
12
|
@mongo = options[:mongo]
|
12
13
|
@collection = options[:collection]
|
13
|
-
@mongo.create_collection(@collection)
|
14
14
|
begin
|
15
15
|
@mongo[@collection].ensure_index(:uuid, unique: true, dropDups: true, background: true)
|
16
|
-
rescue
|
16
|
+
rescue StandardError
|
17
17
|
end
|
18
18
|
|
19
19
|
@compress_body = options[:compress_body] ||= true
|
@@ -28,16 +28,20 @@ module Polipus
|
|
28
28
|
obj['uuid'] = uuid(page)
|
29
29
|
obj['body'] = Zlib::Deflate.deflate(obj['body']) if @compress_body && obj['body']
|
30
30
|
BINARY_FIELDS.each do |field|
|
31
|
-
obj[field] = BSON::Binary.new(obj[field]) unless obj[field].nil?
|
31
|
+
obj[field] = BSON::Binary.new(obj[field].force_encoding('UTF-8').encode('UTF-8')) unless obj[field].nil?
|
32
32
|
end
|
33
|
-
|
33
|
+
|
34
|
+
# We really need 2.0.6+ version for this to work
|
35
|
+
# https://jira.mongodb.org/browse/RUBY-881
|
36
|
+
@mongo[@collection].find(uuid: uuid(page)).replace_one(obj, upsert: true)
|
37
|
+
|
34
38
|
obj['uuid']
|
35
39
|
end
|
36
40
|
end
|
37
41
|
|
38
42
|
def exists?(page)
|
39
43
|
@semaphore.synchronize do
|
40
|
-
doc = @mongo[@collection].find(
|
44
|
+
doc = @mongo[@collection].find(uuid: uuid(page)).projection(_id: 1).limit(1).first
|
41
45
|
!doc.nil?
|
42
46
|
end
|
43
47
|
end
|
@@ -51,16 +55,16 @@ module Polipus
|
|
51
55
|
|
52
56
|
def remove(page)
|
53
57
|
@semaphore.synchronize do
|
54
|
-
@mongo[@collection].
|
58
|
+
@mongo[@collection].find(uuid: uuid(page)).delete_one
|
55
59
|
end
|
56
60
|
end
|
57
61
|
|
58
62
|
def count
|
59
|
-
@mongo[@collection].count
|
63
|
+
@mongo[@collection].find.count
|
60
64
|
end
|
61
65
|
|
62
66
|
def each
|
63
|
-
@mongo[@collection].find
|
67
|
+
@mongo[@collection].find.no_cursor_timeout do |cursor|
|
64
68
|
cursor.each do |doc|
|
65
69
|
page = load_page(doc)
|
66
70
|
yield doc['uuid'], page
|
@@ -76,7 +80,7 @@ module Polipus
|
|
76
80
|
|
77
81
|
def load_page(hash)
|
78
82
|
BINARY_FIELDS.each do |field|
|
79
|
-
hash[field] = hash[field].
|
83
|
+
hash[field] = hash[field].data unless hash[field].nil?
|
80
84
|
end
|
81
85
|
hash['body'] = Zlib::Inflate.inflate(hash['body']) if @compress_body && hash['body'] && !hash['body'].empty?
|
82
86
|
page = Page.from_hash(hash)
|
@@ -12,6 +12,9 @@ module Polipus
|
|
12
12
|
@rethink = options[:conn]
|
13
13
|
@table = options[:table]
|
14
14
|
|
15
|
+
unless @r.db_list.run(@rethink).include?(@rethink.default_db)
|
16
|
+
@r.db_create(@rethink.default_db).run(@rethink)
|
17
|
+
end
|
15
18
|
unless @r.table_list.run(@rethink).include?(@table)
|
16
19
|
@r.table_create(@table).run(@rethink)
|
17
20
|
@r.table(@table).index_create('created_at')
|
data/lib/polipus/version.rb
CHANGED
data/polipus.gemspec
CHANGED
@@ -8,10 +8,10 @@ Gem::Specification.new do |s|
|
|
8
8
|
s.authors = ['Francesco Laurita']
|
9
9
|
s.email = ['francesco.laurita@gmail.com']
|
10
10
|
s.homepage = Polipus::HOMEPAGE
|
11
|
-
s.summary =
|
12
|
-
s.description =
|
11
|
+
s.summary = 'Polipus distributed web-crawler framework'
|
12
|
+
s.description = '
|
13
13
|
An easy to use distributed web-crawler framework based on Redis
|
14
|
-
|
14
|
+
'
|
15
15
|
s.licenses = ['MIT']
|
16
16
|
s.platform = Gem::Platform::RUBY
|
17
17
|
|
@@ -30,8 +30,8 @@ Gem::Specification.new do |s|
|
|
30
30
|
s.add_runtime_dependency 'redis-queue', '~> 0.0', '>= 0.0.4'
|
31
31
|
s.add_runtime_dependency 'redis-bloomfilter', '~> 0.0', '>= 0.0.3'
|
32
32
|
|
33
|
-
s.add_development_dependency 'mongo', '~>
|
34
|
-
s.add_development_dependency 'rethinkdb', '~>1.15.0'
|
33
|
+
s.add_development_dependency 'mongo', '~> 2.0.6'
|
34
|
+
s.add_development_dependency 'rethinkdb', '~> 1.15.0'
|
35
35
|
|
36
36
|
s.add_development_dependency 'rake', '~> 10.3'
|
37
37
|
s.add_development_dependency 'rspec', '~> 3.1.0'
|
@@ -41,4 +41,6 @@ Gem::Specification.new do |s|
|
|
41
41
|
s.add_development_dependency 'webmock', '~> 1.20.0'
|
42
42
|
|
43
43
|
s.add_development_dependency 'coveralls'
|
44
|
+
|
45
|
+
s.add_development_dependency 'pry'
|
44
46
|
end
|
data/spec/polipus/http_spec.rb
CHANGED
@@ -108,4 +108,32 @@ describe Polipus::HTTP do
|
|
108
108
|
end
|
109
109
|
end
|
110
110
|
end
|
111
|
+
|
112
|
+
describe 'random user_agent' do
|
113
|
+
context 'when user_agent is string' do
|
114
|
+
it '#user_agent' do
|
115
|
+
http = Polipus::HTTP.new(open_timeout: 1, read_timeout: 1, user_agent: 'Googlebot')
|
116
|
+
expect(http.user_agent).to eq('Googlebot')
|
117
|
+
end
|
118
|
+
end
|
119
|
+
|
120
|
+
context 'when user_agent is list' do
|
121
|
+
let(:user_agents) do
|
122
|
+
['Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/13.0.782.24 Safari/535.1',
|
123
|
+
'Mozilla/5.0 (Windows NT 6.0) AppleWebKit/535.2 (KHTML, like Gecko) Chrome/15.0.874.120 Safari/535.2',
|
124
|
+
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.7 (KHTML, like Gecko) Chrome/16.0.912.36 Safari/535.7',
|
125
|
+
'Mozilla/5.0 (Windows; U; Windows NT 6.0 x64; en-US; rv:1.9pre) Gecko/2008072421 Minefield/3.0.2pre',
|
126
|
+
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.0.10) Gecko/2009042316 Firefox/3.0.10',
|
127
|
+
'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-GB; rv:1.9.0.11) Gecko/2009060215 Firefox/3.0.11 (.NET CLR 3.5.30729)',
|
128
|
+
'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6 GTB5',
|
129
|
+
'Mozilla/5.0 (Windows; U; Windows NT 5.1; tr; rv:1.9.2.8) Gecko/20100722 Firefox/3.6.8 ( .NET CLR 3.5.30729; .NET4.0E)'
|
130
|
+
]
|
131
|
+
end
|
132
|
+
|
133
|
+
it '#user_agent' do
|
134
|
+
http = Polipus::HTTP.new(open_timeout: 1, read_timeout: 1, user_agent: user_agents)
|
135
|
+
expect(user_agents).to include(http.user_agent)
|
136
|
+
end
|
137
|
+
end
|
138
|
+
end
|
111
139
|
end
|
@@ -6,7 +6,7 @@ require 'redis-queue'
|
|
6
6
|
|
7
7
|
describe Polipus::QueueOverflow::Manager do
|
8
8
|
before(:all) do
|
9
|
-
@mongo = Mongo::
|
9
|
+
@mongo = Mongo::Client.new(['localhost:27_017'], database: '_test_polipus')
|
10
10
|
@mongo['_test_pages'].drop
|
11
11
|
@storage = Polipus::Storage.mongo_store(@mongo, '_test_pages')
|
12
12
|
@redis_q = Redis::Queue.new('queue_test', 'bp_queue_test', redis: Redis.new)
|
@@ -4,7 +4,7 @@ require 'mongo'
|
|
4
4
|
require 'polipus/storage/mongo_store'
|
5
5
|
describe Polipus::Storage::MongoStore do
|
6
6
|
before(:all)do
|
7
|
-
@mongo = Mongo::
|
7
|
+
@mongo = Mongo::Client.new(['localhost:27_017'], database: '_test_polipus')
|
8
8
|
@mongo['_test_pages'].drop
|
9
9
|
@storage = Polipus::Storage.mongo_store(@mongo, '_test_pages')
|
10
10
|
end
|
@@ -21,8 +21,8 @@ describe Polipus::Storage::MongoStore do
|
|
21
21
|
p = page_factory 'http://www.google.com', code: 200, body: '<html></html>'
|
22
22
|
uuid = @storage.add p
|
23
23
|
expect(uuid).to eq('ed646a3334ca891fd3467db131372140')
|
24
|
-
expect(@storage.count).to be 1
|
25
|
-
expect(@mongo['_test_pages'].count).to be 1
|
24
|
+
expect(@storage.count.to_i).to be 1
|
25
|
+
expect(@mongo['_test_pages'].find.count.to_i).to be 1
|
26
26
|
p = @storage.get p
|
27
27
|
expect(p.url.to_s).to eq('http://www.google.com')
|
28
28
|
expect(p.body).to eq('<html></html>')
|
@@ -33,7 +33,7 @@ describe Polipus::Storage::MongoStore do
|
|
33
33
|
@storage.add p
|
34
34
|
p = @storage.get p
|
35
35
|
expect(p.code).to eq(301)
|
36
|
-
expect(@mongo['_test_pages'].count).to be 1
|
36
|
+
expect(@mongo['_test_pages'].find.count.to_i).to be 1
|
37
37
|
end
|
38
38
|
|
39
39
|
it 'should iterate over stored pages' do
|
@@ -47,7 +47,7 @@ describe Polipus::Storage::MongoStore do
|
|
47
47
|
p = page_factory 'http://www.google.com', code: 301, body: '<html></html>'
|
48
48
|
@storage.remove p
|
49
49
|
expect(@storage.get(p)).to be_nil
|
50
|
-
expect(@storage.count).to be 0
|
50
|
+
expect(@storage.count.to_i).to be 0
|
51
51
|
end
|
52
52
|
|
53
53
|
it 'should store a page removing a query string from the uuid generation' do
|
@@ -83,7 +83,7 @@ describe Polipus::Storage::MongoStore do
|
|
83
83
|
p = page_factory 'http://www.user-doo.com', code: 200, body: '<html></html>'
|
84
84
|
storage.add p
|
85
85
|
p = storage.get p
|
86
|
-
expect(p.body).to
|
86
|
+
expect(p.body).to be_nil
|
87
87
|
storage.clear
|
88
88
|
end
|
89
89
|
|
data/spec/polipus_spec.rb
CHANGED
@@ -93,5 +93,14 @@ describe Polipus::PolipusCrawler do
|
|
93
93
|
polipus.takeover
|
94
94
|
polipus.storage.each { |_id, page| expect(page.url.path =~ /$\/downloads\//).to be_falsey }
|
95
95
|
end
|
96
|
+
|
97
|
+
it 'should obey to the robots.txt file with list user_agent' do
|
98
|
+
user_agent = 'Mozilla/5.0 (Windows; U; Windows NT 5.1; tr; rv:1.9.2.8) Gecko/20100722 Firefox/3.6.8 ( .NET CLR 3.5.30729; .NET4.0E)'
|
99
|
+
lopt = p_options
|
100
|
+
lopt[:obey_robots_txt] = true
|
101
|
+
lopt[:user_agent] = [user_agent]
|
102
|
+
flexmock(Polipus::Robotex).should_receive(:new).with(user_agent)
|
103
|
+
Polipus::PolipusCrawler.new('polipus-rspec', ['https://rubygems.org/gems/polipus'], lopt)
|
104
|
+
end
|
96
105
|
end
|
97
106
|
end
|
metadata
CHANGED
@@ -1,248 +1,262 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: polipus
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.5.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Francesco Laurita
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2015-
|
11
|
+
date: 2015-07-09 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: nokogiri
|
15
15
|
requirement: !ruby/object:Gem::Requirement
|
16
16
|
requirements:
|
17
|
-
- - ~>
|
17
|
+
- - "~>"
|
18
18
|
- !ruby/object:Gem::Version
|
19
19
|
version: '1.6'
|
20
|
-
- -
|
20
|
+
- - ">="
|
21
21
|
- !ruby/object:Gem::Version
|
22
22
|
version: 1.6.0
|
23
23
|
type: :runtime
|
24
24
|
prerelease: false
|
25
25
|
version_requirements: !ruby/object:Gem::Requirement
|
26
26
|
requirements:
|
27
|
-
- - ~>
|
27
|
+
- - "~>"
|
28
28
|
- !ruby/object:Gem::Version
|
29
29
|
version: '1.6'
|
30
|
-
- -
|
30
|
+
- - ">="
|
31
31
|
- !ruby/object:Gem::Version
|
32
32
|
version: 1.6.0
|
33
33
|
- !ruby/object:Gem::Dependency
|
34
34
|
name: http-cookie
|
35
35
|
requirement: !ruby/object:Gem::Requirement
|
36
36
|
requirements:
|
37
|
-
- - ~>
|
37
|
+
- - "~>"
|
38
38
|
- !ruby/object:Gem::Version
|
39
39
|
version: '1.0'
|
40
|
-
- -
|
40
|
+
- - ">="
|
41
41
|
- !ruby/object:Gem::Version
|
42
42
|
version: 1.0.1
|
43
43
|
type: :runtime
|
44
44
|
prerelease: false
|
45
45
|
version_requirements: !ruby/object:Gem::Requirement
|
46
46
|
requirements:
|
47
|
-
- - ~>
|
47
|
+
- - "~>"
|
48
48
|
- !ruby/object:Gem::Version
|
49
49
|
version: '1.0'
|
50
|
-
- -
|
50
|
+
- - ">="
|
51
51
|
- !ruby/object:Gem::Version
|
52
52
|
version: 1.0.1
|
53
53
|
- !ruby/object:Gem::Dependency
|
54
54
|
name: redis
|
55
55
|
requirement: !ruby/object:Gem::Requirement
|
56
56
|
requirements:
|
57
|
-
- - ~>
|
57
|
+
- - "~>"
|
58
58
|
- !ruby/object:Gem::Version
|
59
59
|
version: '3.0'
|
60
|
-
- -
|
60
|
+
- - ">="
|
61
61
|
- !ruby/object:Gem::Version
|
62
62
|
version: 3.0.4
|
63
63
|
type: :runtime
|
64
64
|
prerelease: false
|
65
65
|
version_requirements: !ruby/object:Gem::Requirement
|
66
66
|
requirements:
|
67
|
-
- - ~>
|
67
|
+
- - "~>"
|
68
68
|
- !ruby/object:Gem::Version
|
69
69
|
version: '3.0'
|
70
|
-
- -
|
70
|
+
- - ">="
|
71
71
|
- !ruby/object:Gem::Version
|
72
72
|
version: 3.0.4
|
73
73
|
- !ruby/object:Gem::Dependency
|
74
74
|
name: hiredis
|
75
75
|
requirement: !ruby/object:Gem::Requirement
|
76
76
|
requirements:
|
77
|
-
- - ~>
|
77
|
+
- - "~>"
|
78
78
|
- !ruby/object:Gem::Version
|
79
79
|
version: '0.5'
|
80
|
-
- -
|
80
|
+
- - ">="
|
81
81
|
- !ruby/object:Gem::Version
|
82
82
|
version: 0.4.5
|
83
83
|
type: :runtime
|
84
84
|
prerelease: false
|
85
85
|
version_requirements: !ruby/object:Gem::Requirement
|
86
86
|
requirements:
|
87
|
-
- - ~>
|
87
|
+
- - "~>"
|
88
88
|
- !ruby/object:Gem::Version
|
89
89
|
version: '0.5'
|
90
|
-
- -
|
90
|
+
- - ">="
|
91
91
|
- !ruby/object:Gem::Version
|
92
92
|
version: 0.4.5
|
93
93
|
- !ruby/object:Gem::Dependency
|
94
94
|
name: redis-queue
|
95
95
|
requirement: !ruby/object:Gem::Requirement
|
96
96
|
requirements:
|
97
|
-
- - ~>
|
97
|
+
- - "~>"
|
98
98
|
- !ruby/object:Gem::Version
|
99
99
|
version: '0.0'
|
100
|
-
- -
|
100
|
+
- - ">="
|
101
101
|
- !ruby/object:Gem::Version
|
102
102
|
version: 0.0.4
|
103
103
|
type: :runtime
|
104
104
|
prerelease: false
|
105
105
|
version_requirements: !ruby/object:Gem::Requirement
|
106
106
|
requirements:
|
107
|
-
- - ~>
|
107
|
+
- - "~>"
|
108
108
|
- !ruby/object:Gem::Version
|
109
109
|
version: '0.0'
|
110
|
-
- -
|
110
|
+
- - ">="
|
111
111
|
- !ruby/object:Gem::Version
|
112
112
|
version: 0.0.4
|
113
113
|
- !ruby/object:Gem::Dependency
|
114
114
|
name: redis-bloomfilter
|
115
115
|
requirement: !ruby/object:Gem::Requirement
|
116
116
|
requirements:
|
117
|
-
- - ~>
|
117
|
+
- - "~>"
|
118
118
|
- !ruby/object:Gem::Version
|
119
119
|
version: '0.0'
|
120
|
-
- -
|
120
|
+
- - ">="
|
121
121
|
- !ruby/object:Gem::Version
|
122
122
|
version: 0.0.3
|
123
123
|
type: :runtime
|
124
124
|
prerelease: false
|
125
125
|
version_requirements: !ruby/object:Gem::Requirement
|
126
126
|
requirements:
|
127
|
-
- - ~>
|
127
|
+
- - "~>"
|
128
128
|
- !ruby/object:Gem::Version
|
129
129
|
version: '0.0'
|
130
|
-
- -
|
130
|
+
- - ">="
|
131
131
|
- !ruby/object:Gem::Version
|
132
132
|
version: 0.0.3
|
133
133
|
- !ruby/object:Gem::Dependency
|
134
134
|
name: mongo
|
135
135
|
requirement: !ruby/object:Gem::Requirement
|
136
136
|
requirements:
|
137
|
-
- - ~>
|
137
|
+
- - "~>"
|
138
138
|
- !ruby/object:Gem::Version
|
139
|
-
version:
|
139
|
+
version: 2.0.6
|
140
140
|
type: :development
|
141
141
|
prerelease: false
|
142
142
|
version_requirements: !ruby/object:Gem::Requirement
|
143
143
|
requirements:
|
144
|
-
- - ~>
|
144
|
+
- - "~>"
|
145
145
|
- !ruby/object:Gem::Version
|
146
|
-
version:
|
146
|
+
version: 2.0.6
|
147
147
|
- !ruby/object:Gem::Dependency
|
148
148
|
name: rethinkdb
|
149
149
|
requirement: !ruby/object:Gem::Requirement
|
150
150
|
requirements:
|
151
|
-
- - ~>
|
151
|
+
- - "~>"
|
152
152
|
- !ruby/object:Gem::Version
|
153
153
|
version: 1.15.0
|
154
154
|
type: :development
|
155
155
|
prerelease: false
|
156
156
|
version_requirements: !ruby/object:Gem::Requirement
|
157
157
|
requirements:
|
158
|
-
- - ~>
|
158
|
+
- - "~>"
|
159
159
|
- !ruby/object:Gem::Version
|
160
160
|
version: 1.15.0
|
161
161
|
- !ruby/object:Gem::Dependency
|
162
162
|
name: rake
|
163
163
|
requirement: !ruby/object:Gem::Requirement
|
164
164
|
requirements:
|
165
|
-
- - ~>
|
165
|
+
- - "~>"
|
166
166
|
- !ruby/object:Gem::Version
|
167
167
|
version: '10.3'
|
168
168
|
type: :development
|
169
169
|
prerelease: false
|
170
170
|
version_requirements: !ruby/object:Gem::Requirement
|
171
171
|
requirements:
|
172
|
-
- - ~>
|
172
|
+
- - "~>"
|
173
173
|
- !ruby/object:Gem::Version
|
174
174
|
version: '10.3'
|
175
175
|
- !ruby/object:Gem::Dependency
|
176
176
|
name: rspec
|
177
177
|
requirement: !ruby/object:Gem::Requirement
|
178
178
|
requirements:
|
179
|
-
- - ~>
|
179
|
+
- - "~>"
|
180
180
|
- !ruby/object:Gem::Version
|
181
181
|
version: 3.1.0
|
182
182
|
type: :development
|
183
183
|
prerelease: false
|
184
184
|
version_requirements: !ruby/object:Gem::Requirement
|
185
185
|
requirements:
|
186
|
-
- - ~>
|
186
|
+
- - "~>"
|
187
187
|
- !ruby/object:Gem::Version
|
188
188
|
version: 3.1.0
|
189
189
|
- !ruby/object:Gem::Dependency
|
190
190
|
name: flexmock
|
191
191
|
requirement: !ruby/object:Gem::Requirement
|
192
192
|
requirements:
|
193
|
-
- - ~>
|
193
|
+
- - "~>"
|
194
194
|
- !ruby/object:Gem::Version
|
195
195
|
version: '1.3'
|
196
196
|
type: :development
|
197
197
|
prerelease: false
|
198
198
|
version_requirements: !ruby/object:Gem::Requirement
|
199
199
|
requirements:
|
200
|
-
- - ~>
|
200
|
+
- - "~>"
|
201
201
|
- !ruby/object:Gem::Version
|
202
202
|
version: '1.3'
|
203
203
|
- !ruby/object:Gem::Dependency
|
204
204
|
name: vcr
|
205
205
|
requirement: !ruby/object:Gem::Requirement
|
206
206
|
requirements:
|
207
|
-
- - ~>
|
207
|
+
- - "~>"
|
208
208
|
- !ruby/object:Gem::Version
|
209
209
|
version: 2.9.0
|
210
210
|
type: :development
|
211
211
|
prerelease: false
|
212
212
|
version_requirements: !ruby/object:Gem::Requirement
|
213
213
|
requirements:
|
214
|
-
- - ~>
|
214
|
+
- - "~>"
|
215
215
|
- !ruby/object:Gem::Version
|
216
216
|
version: 2.9.0
|
217
217
|
- !ruby/object:Gem::Dependency
|
218
218
|
name: webmock
|
219
219
|
requirement: !ruby/object:Gem::Requirement
|
220
220
|
requirements:
|
221
|
-
- - ~>
|
221
|
+
- - "~>"
|
222
222
|
- !ruby/object:Gem::Version
|
223
223
|
version: 1.20.0
|
224
224
|
type: :development
|
225
225
|
prerelease: false
|
226
226
|
version_requirements: !ruby/object:Gem::Requirement
|
227
227
|
requirements:
|
228
|
-
- - ~>
|
228
|
+
- - "~>"
|
229
229
|
- !ruby/object:Gem::Version
|
230
230
|
version: 1.20.0
|
231
231
|
- !ruby/object:Gem::Dependency
|
232
232
|
name: coveralls
|
233
233
|
requirement: !ruby/object:Gem::Requirement
|
234
234
|
requirements:
|
235
|
-
- -
|
235
|
+
- - ">="
|
236
236
|
- !ruby/object:Gem::Version
|
237
237
|
version: '0'
|
238
238
|
type: :development
|
239
239
|
prerelease: false
|
240
240
|
version_requirements: !ruby/object:Gem::Requirement
|
241
241
|
requirements:
|
242
|
-
- -
|
242
|
+
- - ">="
|
243
243
|
- !ruby/object:Gem::Version
|
244
244
|
version: '0'
|
245
|
-
|
245
|
+
- !ruby/object:Gem::Dependency
|
246
|
+
name: pry
|
247
|
+
requirement: !ruby/object:Gem::Requirement
|
248
|
+
requirements:
|
249
|
+
- - ">="
|
250
|
+
- !ruby/object:Gem::Version
|
251
|
+
version: '0'
|
252
|
+
type: :development
|
253
|
+
prerelease: false
|
254
|
+
version_requirements: !ruby/object:Gem::Requirement
|
255
|
+
requirements:
|
256
|
+
- - ">="
|
257
|
+
- !ruby/object:Gem::Version
|
258
|
+
version: '0'
|
259
|
+
description: "\n An easy to use distributed web-crawler framework based on Redis\n
|
246
260
|
\ "
|
247
261
|
email:
|
248
262
|
- francesco.laurita@gmail.com
|
@@ -250,12 +264,12 @@ executables: []
|
|
250
264
|
extensions: []
|
251
265
|
extra_rdoc_files: []
|
252
266
|
files:
|
253
|
-
- .document
|
254
|
-
- .gitignore
|
255
|
-
- .rspec
|
256
|
-
- .rubocop.yml
|
257
|
-
- .rubocop_todo.yml
|
258
|
-
- .travis.yml
|
267
|
+
- ".document"
|
268
|
+
- ".gitignore"
|
269
|
+
- ".rspec"
|
270
|
+
- ".rubocop.yml"
|
271
|
+
- ".rubocop_todo.yml"
|
272
|
+
- ".travis.yml"
|
259
273
|
- AUTHORS.md
|
260
274
|
- CHANGELOG.md
|
261
275
|
- Gemfile
|
@@ -330,17 +344,17 @@ require_paths:
|
|
330
344
|
- lib
|
331
345
|
required_ruby_version: !ruby/object:Gem::Requirement
|
332
346
|
requirements:
|
333
|
-
- -
|
347
|
+
- - ">="
|
334
348
|
- !ruby/object:Gem::Version
|
335
349
|
version: '0'
|
336
350
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
337
351
|
requirements:
|
338
|
-
- -
|
352
|
+
- - ">="
|
339
353
|
- !ruby/object:Gem::Version
|
340
354
|
version: '0'
|
341
355
|
requirements: []
|
342
356
|
rubyforge_project: polipus
|
343
|
-
rubygems_version: 2.
|
357
|
+
rubygems_version: 2.2.2
|
344
358
|
signing_key:
|
345
359
|
specification_version: 4
|
346
360
|
summary: Polipus distributed web-crawler framework
|
@@ -371,4 +385,3 @@ test_files:
|
|
371
385
|
- spec/polipus/url_tracker_spec.rb
|
372
386
|
- spec/polipus_spec.rb
|
373
387
|
- spec/spec_helper.rb
|
374
|
-
has_rdoc:
|