polipus 0.4.0 → 0.5.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,15 +1,7 @@
1
1
  ---
2
- !binary "U0hBMQ==":
3
- metadata.gz: !binary |-
4
- NGU0MWZlMWIwMGM2MWNhMmJiOTU3MDM0YWI0ZjY0MzI3NmNkYzgyNA==
5
- data.tar.gz: !binary |-
6
- MTAxYjZmYWNjMTlkYzk3Y2ZhNjdjMDFmODM2OTQ5YmQ3ZDcyNTQzNw==
2
+ SHA1:
3
+ metadata.gz: 0e3c15f4ab5c3b4eca8f4703b80d7d8fa74210c3
4
+ data.tar.gz: ae0dd32d81bbcbef350949e0c51fecaaf59ac8ff
7
5
  SHA512:
8
- metadata.gz: !binary |-
9
- ZDE5NDExNmE1M2M4MDM0MTExNzVkNmM0ZmVjNTQyOTg5Y2JkYTdmZjBhY2Fi
10
- NmM1MDc5ZTI0ODkxNzlhY2ZkZGVjZjI2ZTMyNjZhNTMwOWZhZjQ2OGY4ZTJl
11
- ZmI4ZjUwNmQ3YjAxOTdlODI2MWRmMWUzNjY3Yzk1MmNiZTk0Y2E=
12
- data.tar.gz: !binary |-
13
- YjNlYTZlMjU5OTBmN2ZjMTI1OTY1MzgzNzBhYjBjYTAzN2MwYzY0M2U1ZGYw
14
- MjcyMmMxM2I1ZTY5OWEzZDQ3MjQ1OTgzMzcyN2I5NGQ5NTAzZTEzY2FmMDRl
15
- MmJiZDdjOTFmMjQwZWU1MzM1OGJhN2E4NTRhNGExZTAyMTVmN2I=
6
+ metadata.gz: c3a57d1cbfbab4c77cf3b746c7b73eb84cf675e7998eb5979af6d17d997ee26781803e519eae50de6762590fc7287d09ccf4ac0d640dff40602b8a507a356de2
7
+ data.tar.gz: 743b128ee0dd8fefbbdd1bd7443afbc4a2c62dd2f3d3751d3d693719e9a8e2e145668ccd58945b77f5f780f50757a11248df2b1963c8e7cc35706e937627e5e0
@@ -1,25 +1,36 @@
1
1
  # This configuration was generated by `rubocop --auto-gen-config`
2
- # on 2014-06-08 11:25:39 -0700 using RuboCop version 0.23.0.
2
+ # on 2015-07-08 20:22:49 -0700 using RuboCop version 0.29.1.
3
3
  # The point is for the user to remove these configuration records
4
4
  # one by one as the offenses are removed from the code base.
5
5
  # Note that changes in the inspected code, or installation of new
6
6
  # versions of RuboCop, may require this file to be generated again.
7
7
 
8
8
  # Offense count: 1
9
- Style/ClassVars:
9
+ Lint/HandleExceptions:
10
10
  Enabled: false
11
11
 
12
- # Offense count: 10
12
+ # Offense count: 21
13
+ Metrics/AbcSize:
14
+ Max: 103
15
+
16
+ # Offense count: 12
13
17
  Metrics/CyclomaticComplexity:
14
- Max: 16
18
+ Max: 15
15
19
 
16
- # Offense count: 26
17
- Style/Documentation:
20
+ # Offense count: 10
21
+ Metrics/PerceivedComplexity:
22
+ Max: 17
23
+
24
+ # Offense count: 1
25
+ Style/ClassVars:
18
26
  Enabled: false
19
27
 
28
+ # Offense count: 27
29
+ Style/Documentation:
30
+ Enabled: false
20
31
 
21
32
  # Offense count: 2
22
- # Configuration parameters: EnforcedStyle, SupportedStyles.
33
+ # Configuration parameters: EnforcedStyle, MinBodyLength, SupportedStyles.
23
34
  Style/Next:
24
35
  Enabled: false
25
36
 
@@ -28,6 +39,10 @@ Style/Next:
28
39
  Style/RegexpLiteral:
29
40
  Enabled: false
30
41
 
31
- # Offense count: 4
42
+ # Offense count: 3
32
43
  Style/RescueModifier:
33
44
  Enabled: false
45
+
46
+ # Offense count: 1
47
+ Style/UnlessElse:
48
+ Enabled: false
data/AUTHORS.md CHANGED
@@ -3,3 +3,4 @@
3
3
  * [Francesco Laurita](francesco.laurita@gmail.com)
4
4
  * [Tobias L. Maier](http://tobiasmaier.info/)
5
5
  * [Marcos Piccinini](https://github.com/nofxx)
6
+ * [Martin Bianculli](https://github.com/lepek)
@@ -1,5 +1,12 @@
1
1
  # Changelog
2
2
 
3
+ ## 0.5.0 (2015-07-08)
4
+
5
+ [Compare changes in gem](https://github.com/taganaka/polipus/compare/0.4.0...0.5.0)
6
+
7
+ * Support for MongoDB driver ~> 2.0.6 has been added
8
+ * Minor code cleanup
9
+
3
10
  ## 0.4.0 (2015-01-12)
4
11
 
5
12
  [Compare changes in gem](https://github.com/taganaka/polipus/compare/0.3.3...0.4.0)
@@ -129,7 +129,14 @@ module Polipus
129
129
 
130
130
  @urls = [urls].flatten.map { |url| URI(url) }
131
131
  @urls.each { |url| url.path = '/' if url.path.empty? }
132
- @robots = Polipus::Robotex.new(@options[:user_agent]) if @options[:obey_robots_txt]
132
+ if @options[:obey_robots_txt]
133
+ @robots =
134
+ if @options[:user_agent].respond_to?(:sample)
135
+ Polipus::Robotex.new(@options[:user_agent].sample)
136
+ else
137
+ Polipus::Robotex.new(@options[:user_agent])
138
+ end
139
+ end
133
140
  # Attach signal handling if enabled
134
141
  SignalHandler.enable if @options[:enable_signal_handler]
135
142
 
@@ -170,7 +177,6 @@ module Polipus
170
177
  http = HTTP.new(@options)
171
178
  queue = queue_factory
172
179
  queue.process(false, @options[:queue_timeout]) do |message|
173
-
174
180
  next if message.nil?
175
181
 
176
182
  execute_plugin 'on_message_received'
@@ -199,7 +205,7 @@ module Polipus
199
205
  rurls = pages.map { |e| e.url.to_s }.join(' --> ')
200
206
  @logger.info { "Got redirects! #{rurls}" }
201
207
  page = pages.pop
202
- page.aliases = pages.map { |e| e.url }
208
+ page.aliases = pages.map(&:url)
203
209
  if page_exists? page
204
210
  @logger.info { "[worker ##{worker_number}] Page (#{page.url}) already stored." }
205
211
  queue.commit
@@ -253,7 +259,7 @@ module Polipus
253
259
  end
254
260
  end
255
261
 
256
- @workers_pool.each { |w| w.join }
262
+ @workers_pool.each(&:join)
257
263
  @on_crawl_end.each { |e| e.call(self) }
258
264
  execute_plugin 'on_crawl_end'
259
265
  end
@@ -474,7 +480,7 @@ module Polipus
474
480
  next unless p.respond_to?(method)
475
481
  @logger.info { "Running plugin method #{method} on #{k}" }
476
482
  ret_val = p.send(method, self)
477
- instance_eval(&ret_val) if ret_val.kind_of? Proc
483
+ instance_eval(&ret_val) if ret_val.is_a? Proc
478
484
  end
479
485
  end
480
486
  end
@@ -80,7 +80,11 @@ module Polipus
80
80
  # or nil if no such option is set
81
81
  #
82
82
  def user_agent
83
- @opts[:user_agent]
83
+ if @opts[:user_agent].respond_to?(:sample)
84
+ @opts[:user_agent].sample
85
+ else
86
+ @opts[:user_agent]
87
+ end
84
88
  end
85
89
 
86
90
  #
@@ -108,7 +112,7 @@ module Polipus
108
112
  # The proxy password
109
113
  #
110
114
  def proxy_pass
111
- #return proxy_host_port[3] unless @opts[:proxy_host_port].nil?
115
+ # return proxy_host_port[3] unless @opts[:proxy_host_port].nil?
112
116
  @opts[:proxy_pass].respond_to?(:call) ? @opts[:proxy_pass].call(self) : @opts[:proxy_pass]
113
117
  end
114
118
 
@@ -162,7 +166,13 @@ module Polipus
162
166
 
163
167
  response, response_time = get_response(loc, referer)
164
168
  code = Integer(response.code)
165
- redirect_to = response.is_a?(Net::HTTPRedirection) ? URI(response['location']).normalize : nil
169
+ redirect_to =
170
+ begin
171
+ response.is_a?(Net::HTTPRedirection) ? URI(response['location']).normalize : nil
172
+ rescue URI::InvalidURIError => e
173
+ @opts[:logger].debug { "Request #{url} got #{e}" } if @opts[:logger]
174
+ nil
175
+ end
166
176
  yield response, code, loc, redirect_to, response_time
167
177
  limit -= 1
168
178
  break unless (loc = redirect_to) && allowed?(redirect_to, url) && limit > 0
@@ -233,7 +243,7 @@ module Polipus
233
243
 
234
244
  # Block has higher priority
235
245
  unless @opts[:proxy_host_port].nil?
236
- p_host, p_port, p_user, p_pass = proxy_host_port
246
+ p_host, p_port, p_user, p_pass = proxy_host_port
237
247
  else
238
248
  p_host = proxy_host
239
249
  p_port = proxy_port
@@ -3,6 +3,8 @@ require 'nokogiri'
3
3
  require 'json'
4
4
  require 'ostruct'
5
5
  require 'set'
6
+ require 'kconv'
7
+
6
8
  module Polipus
7
9
  class Page
8
10
  # The URL of the page
@@ -70,7 +72,7 @@ module Polipus
70
72
  u = a['href']
71
73
  next if u.nil? || u.empty?
72
74
  abs = to_absolute(u) rescue next
73
- @links << abs if in_domain?(abs)
75
+ @links << abs if abs && in_domain?(abs)
74
76
  end
75
77
  @links.to_a
76
78
  end
@@ -80,7 +82,10 @@ module Polipus
80
82
  #
81
83
  def doc
82
84
  return @doc if @doc
83
- @doc = Nokogiri::HTML(@body) if @body && html? rescue nil
85
+ @body ||= ''
86
+ @body = @body.encode('utf-8', 'binary', invalid: :replace,
87
+ undef: :replace, replace: '')
88
+ @doc = Nokogiri::HTML(@body.toutf8, nil, 'utf-8') if @body && html?
84
89
  end
85
90
 
86
91
  #
@@ -166,10 +171,22 @@ module Polipus
166
171
  def to_absolute(link)
167
172
  return nil if link.nil?
168
173
 
174
+ valid_link = link.to_s.encode('utf-8', 'binary', invalid: :replace,
175
+ undef: :replace, replace: '')
176
+
169
177
  # remove anchor
170
- link = URI.encode(URI.decode(link.to_s.gsub(/#[a-zA-Z0-9_-]*$/, '')))
178
+ link =
179
+ begin
180
+ URI.encode(URI.decode(valid_link.gsub(/#[a-zA-Z0-9_-]*$/, '')))
181
+ rescue URI::Error
182
+ return nil
183
+ end
171
184
 
172
- relative = URI(link)
185
+ relative = begin
186
+ URI(link)
187
+ rescue URI::Error
188
+ return nil
189
+ end
173
190
  absolute = base ? base.merge(relative) : @url.merge(relative)
174
191
 
175
192
  absolute.path = '/' if absolute.path.empty?
@@ -5,15 +5,15 @@ module Polipus
5
5
  module QueueOverflow
6
6
  def self.mongo_queue(mongo_db, queue_name, options = {})
7
7
  require 'polipus/queue_overflow/mongo_queue'
8
- mongo_db ||= Mongo::Connection.new('localhost', 27_017, pool_size: 15, pool_timeout: 5).db('polipus')
9
- fail 'First argument must be an instance of Mongo::DB' unless mongo_db.is_a?(Mongo::DB)
8
+ mongo_db ||= Mongo::Client.new(['localhost:27_017'], database: 'polipus')
9
+ fail 'First argument must be an instance of Mongo::Client' unless mongo_db.is_a?(Mongo::Client)
10
10
  self::MongoQueue.new mongo_db, queue_name, options
11
11
  end
12
12
 
13
13
  def self.mongo_queue_capped(mongo_db, queue_name, options = {})
14
14
  require 'polipus/queue_overflow/mongo_queue_capped'
15
- mongo_db ||= Mongo::Connection.new('localhost', 27_017, pool_size: 15, pool_timeout: 5).db('polipus')
16
- fail 'First argument must be an instance of Mongo::DB' unless mongo_db.is_a?(Mongo::DB)
15
+ mongo_db ||= Mongo::Client.new(['localhost:27_017'], database: 'polipus')
16
+ fail 'First argument must be an instance of Mongo::Client' unless mongo_db.is_a?(Mongo::Client)
17
17
  options[:max] = 1_000_000 if options[:max].nil?
18
18
  self::MongoQueueCapped.new mongo_db, queue_name, options
19
19
  end
@@ -14,7 +14,7 @@ module Polipus
14
14
  end
15
15
 
16
16
  def length
17
- @mongo_db[@collection_name].count
17
+ @mongo_db[@collection_name].find.count
18
18
  end
19
19
 
20
20
  def empty?
@@ -28,18 +28,18 @@ module Polipus
28
28
 
29
29
  def push(data)
30
30
  if @options[:ensure_uniq]
31
- @mongo_db[@collection_name].update({ payload: data }, { payload: data }, { upsert: true, w: 1 })
31
+ @mongo_db[@collection_name].find(payload: data).replace_one({ payload: data }, upsert: true)
32
32
  else
33
- @mongo_db[@collection_name].insert(payload: data)
33
+ @mongo_db[@collection_name].insert_one(payload: data)
34
34
  end
35
35
  true
36
36
  end
37
37
 
38
38
  def pop(_ = false)
39
39
  @semaphore.synchronize do
40
- doc = @mongo_db[@collection_name].find({}, sort: { _id: 1 }).limit(1).first
40
+ doc = @mongo_db[@collection_name].find.sort(_id: 1).limit(1).first
41
41
  return nil if doc.nil?
42
- @mongo_db[@collection_name].remove(_id: doc['_id'])
42
+ @mongo_db[@collection_name].find(_id: doc['_id']).delete_one
43
43
  doc && doc['payload'] ? doc['payload'] : nil
44
44
  end
45
45
  end
@@ -53,7 +53,8 @@ module Polipus
53
53
  protected
54
54
 
55
55
  def ensure_index
56
- @mongo_db[@collection_name].ensure_index({ payload: 1 }, { background: 1, unique: 1, drop_dups: 1 })
56
+ # @TODO: Drop dups option was removed. We may want to add something here to remove duplications
57
+ @mongo_db[@collection_name].indexes.create_one({ payload: 1 }, background: true, unique: true)
57
58
  end
58
59
  end
59
60
  end
@@ -13,8 +13,8 @@ module Polipus
13
13
  @semaphore.synchronize do
14
14
  s = size
15
15
  if s > @max
16
- docs = @mongo_db[@collection_name].find({}, { sort: { _id: 1 }, fields: [:_id] }).limit(s - @max).map { |e| e['_id'] }
17
- @mongo_db[@collection_name].remove(:_id => { '$in' => docs }, '$isolated' => 1)
16
+ docs = @mongo_db[@collection_name].find.sort(_id: 1).projection(_id: 1).limit(s - @max).map { |e| e['_id'] }
17
+ @mongo_db[@collection_name].find(_id: { '$in' => docs }).delete_many
18
18
  end
19
19
  end
20
20
  end
@@ -6,14 +6,14 @@ module Polipus
6
6
 
7
7
  def self.mongo_store(mongo = nil, collection = COLLECTION, except = [])
8
8
  require 'polipus/storage/mongo_store'
9
- mongo ||= Mongo::Connection.new('localhost', 27_017, pool_size: 15, pool_timeout: 5).db('polipus')
10
- fail 'First argument must be an instance of Mongo::DB' unless mongo.is_a?(Mongo::DB)
9
+ mongo ||= Mongo::Client.new(['localhost:27_017'], database: 'polipus')
10
+ fail 'First argument must be an instance of Mongo::Client' unless mongo.is_a?(Mongo::Client)
11
11
  self::MongoStore.new(mongo: mongo, collection: collection, except: except)
12
12
  end
13
13
 
14
14
  def self.rethink_store(conn = nil, table = COLLECTION, except = [])
15
15
  require 'polipus/storage/rethink_store'
16
- conn ||= RethinkDB::RQL.new.connect(host: 'localhost', port: 28_015, db: 'polipus' )
16
+ conn ||= RethinkDB::RQL.new.connect(host: 'localhost', port: 28_015, db: 'polipus')
17
17
  fail "First argument must be a RethinkDB::Connection, got `#{conn.class}`" unless conn.is_a?(RethinkDB::Connection)
18
18
  self::RethinkStore.new(conn: conn, table: table, except: except)
19
19
  end
@@ -48,7 +48,7 @@ module Polipus
48
48
 
49
49
  def clear
50
50
  @semaphore.synchronize do
51
- @store = Hash.new
51
+ @store = {}
52
52
  end
53
53
  end
54
54
  end
@@ -2,6 +2,7 @@
2
2
  require 'mongo'
3
3
  require 'zlib'
4
4
  require 'thread'
5
+ require 'pry'
5
6
 
6
7
  module Polipus
7
8
  module Storage
@@ -10,10 +11,9 @@ module Polipus
10
11
  def initialize(options = {})
11
12
  @mongo = options[:mongo]
12
13
  @collection = options[:collection]
13
- @mongo.create_collection(@collection)
14
14
  begin
15
15
  @mongo[@collection].ensure_index(:uuid, unique: true, dropDups: true, background: true)
16
- rescue Exception
16
+ rescue StandardError
17
17
  end
18
18
 
19
19
  @compress_body = options[:compress_body] ||= true
@@ -28,16 +28,20 @@ module Polipus
28
28
  obj['uuid'] = uuid(page)
29
29
  obj['body'] = Zlib::Deflate.deflate(obj['body']) if @compress_body && obj['body']
30
30
  BINARY_FIELDS.each do |field|
31
- obj[field] = BSON::Binary.new(obj[field]) unless obj[field].nil?
31
+ obj[field] = BSON::Binary.new(obj[field].force_encoding('UTF-8').encode('UTF-8')) unless obj[field].nil?
32
32
  end
33
- @mongo[@collection].update({ uuid: obj['uuid'] }, obj, upsert: true, w: 1)
33
+
34
+ # We really need 2.0.6+ version for this to work
35
+ # https://jira.mongodb.org/browse/RUBY-881
36
+ @mongo[@collection].find(uuid: uuid(page)).replace_one(obj, upsert: true)
37
+
34
38
  obj['uuid']
35
39
  end
36
40
  end
37
41
 
38
42
  def exists?(page)
39
43
  @semaphore.synchronize do
40
- doc = @mongo[@collection].find({ uuid: uuid(page) }, { fields: [:_id] }).limit(1).first
44
+ doc = @mongo[@collection].find(uuid: uuid(page)).projection(_id: 1).limit(1).first
41
45
  !doc.nil?
42
46
  end
43
47
  end
@@ -51,16 +55,16 @@ module Polipus
51
55
 
52
56
  def remove(page)
53
57
  @semaphore.synchronize do
54
- @mongo[@collection].remove(uuid: uuid(page))
58
+ @mongo[@collection].find(uuid: uuid(page)).delete_one
55
59
  end
56
60
  end
57
61
 
58
62
  def count
59
- @mongo[@collection].count
63
+ @mongo[@collection].find.count
60
64
  end
61
65
 
62
66
  def each
63
- @mongo[@collection].find({}, timeout: false) do |cursor|
67
+ @mongo[@collection].find.no_cursor_timeout do |cursor|
64
68
  cursor.each do |doc|
65
69
  page = load_page(doc)
66
70
  yield doc['uuid'], page
@@ -76,7 +80,7 @@ module Polipus
76
80
 
77
81
  def load_page(hash)
78
82
  BINARY_FIELDS.each do |field|
79
- hash[field] = hash[field].to_s
83
+ hash[field] = hash[field].data unless hash[field].nil?
80
84
  end
81
85
  hash['body'] = Zlib::Inflate.inflate(hash['body']) if @compress_body && hash['body'] && !hash['body'].empty?
82
86
  page = Page.from_hash(hash)
@@ -12,6 +12,9 @@ module Polipus
12
12
  @rethink = options[:conn]
13
13
  @table = options[:table]
14
14
 
15
+ unless @r.db_list.run(@rethink).include?(@rethink.default_db)
16
+ @r.db_create(@rethink.default_db).run(@rethink)
17
+ end
15
18
  unless @r.table_list.run(@rethink).include?(@table)
16
19
  @r.table_create(@table).run(@rethink)
17
20
  @r.table(@table).index_create('created_at')
@@ -1,5 +1,5 @@
1
1
  # encoding: UTF-8
2
2
  module Polipus
3
- VERSION = '0.4.0'
3
+ VERSION = '0.5.0'
4
4
  HOMEPAGE = 'https://github.com/taganaka/polipus'
5
5
  end
@@ -8,10 +8,10 @@ Gem::Specification.new do |s|
8
8
  s.authors = ['Francesco Laurita']
9
9
  s.email = ['francesco.laurita@gmail.com']
10
10
  s.homepage = Polipus::HOMEPAGE
11
- s.summary = %q(Polipus distributed web-crawler framework)
12
- s.description = %q(
11
+ s.summary = 'Polipus distributed web-crawler framework'
12
+ s.description = '
13
13
  An easy to use distributed web-crawler framework based on Redis
14
- )
14
+ '
15
15
  s.licenses = ['MIT']
16
16
  s.platform = Gem::Platform::RUBY
17
17
 
@@ -30,8 +30,8 @@ Gem::Specification.new do |s|
30
30
  s.add_runtime_dependency 'redis-queue', '~> 0.0', '>= 0.0.4'
31
31
  s.add_runtime_dependency 'redis-bloomfilter', '~> 0.0', '>= 0.0.3'
32
32
 
33
- s.add_development_dependency 'mongo', '~>1.11.0'
34
- s.add_development_dependency 'rethinkdb', '~>1.15.0'
33
+ s.add_development_dependency 'mongo', '~> 2.0.6'
34
+ s.add_development_dependency 'rethinkdb', '~> 1.15.0'
35
35
 
36
36
  s.add_development_dependency 'rake', '~> 10.3'
37
37
  s.add_development_dependency 'rspec', '~> 3.1.0'
@@ -41,4 +41,6 @@ Gem::Specification.new do |s|
41
41
  s.add_development_dependency 'webmock', '~> 1.20.0'
42
42
 
43
43
  s.add_development_dependency 'coveralls'
44
+
45
+ s.add_development_dependency 'pry'
44
46
  end
@@ -108,4 +108,32 @@ describe Polipus::HTTP do
108
108
  end
109
109
  end
110
110
  end
111
+
112
+ describe 'random user_agent' do
113
+ context 'when user_agent is string' do
114
+ it '#user_agent' do
115
+ http = Polipus::HTTP.new(open_timeout: 1, read_timeout: 1, user_agent: 'Googlebot')
116
+ expect(http.user_agent).to eq('Googlebot')
117
+ end
118
+ end
119
+
120
+ context 'when user_agent is list' do
121
+ let(:user_agents) do
122
+ ['Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/13.0.782.24 Safari/535.1',
123
+ 'Mozilla/5.0 (Windows NT 6.0) AppleWebKit/535.2 (KHTML, like Gecko) Chrome/15.0.874.120 Safari/535.2',
124
+ 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.7 (KHTML, like Gecko) Chrome/16.0.912.36 Safari/535.7',
125
+ 'Mozilla/5.0 (Windows; U; Windows NT 6.0 x64; en-US; rv:1.9pre) Gecko/2008072421 Minefield/3.0.2pre',
126
+ 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.0.10) Gecko/2009042316 Firefox/3.0.10',
127
+ 'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-GB; rv:1.9.0.11) Gecko/2009060215 Firefox/3.0.11 (.NET CLR 3.5.30729)',
128
+ 'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6 GTB5',
129
+ 'Mozilla/5.0 (Windows; U; Windows NT 5.1; tr; rv:1.9.2.8) Gecko/20100722 Firefox/3.6.8 ( .NET CLR 3.5.30729; .NET4.0E)'
130
+ ]
131
+ end
132
+
133
+ it '#user_agent' do
134
+ http = Polipus::HTTP.new(open_timeout: 1, read_timeout: 1, user_agent: user_agents)
135
+ expect(user_agents).to include(http.user_agent)
136
+ end
137
+ end
138
+ end
111
139
  end
@@ -6,7 +6,7 @@ require 'redis-queue'
6
6
 
7
7
  describe Polipus::QueueOverflow::Manager do
8
8
  before(:all) do
9
- @mongo = Mongo::Connection.new('localhost', 27_017, pool_size: 15, pool_timeout: 5).db('_test_polipus')
9
+ @mongo = Mongo::Client.new(['localhost:27_017'], database: '_test_polipus')
10
10
  @mongo['_test_pages'].drop
11
11
  @storage = Polipus::Storage.mongo_store(@mongo, '_test_pages')
12
12
  @redis_q = Redis::Queue.new('queue_test', 'bp_queue_test', redis: Redis.new)
@@ -69,7 +69,7 @@ describe Polipus::Storage::MemoryStore do
69
69
  p = page_factory 'http://www.user-doo.com', code: 200, body: '<html></html>'
70
70
  storage.add p
71
71
  p = storage.get p
72
- expect(p.body).to be_empty
72
+ expect(p.body).to be_nil
73
73
  storage.clear
74
74
  end
75
75
 
@@ -4,7 +4,7 @@ require 'mongo'
4
4
  require 'polipus/storage/mongo_store'
5
5
  describe Polipus::Storage::MongoStore do
6
6
  before(:all)do
7
- @mongo = Mongo::Connection.new('localhost', 27_017, pool_size: 15, pool_timeout: 5).db('_test_polipus')
7
+ @mongo = Mongo::Client.new(['localhost:27_017'], database: '_test_polipus')
8
8
  @mongo['_test_pages'].drop
9
9
  @storage = Polipus::Storage.mongo_store(@mongo, '_test_pages')
10
10
  end
@@ -21,8 +21,8 @@ describe Polipus::Storage::MongoStore do
21
21
  p = page_factory 'http://www.google.com', code: 200, body: '<html></html>'
22
22
  uuid = @storage.add p
23
23
  expect(uuid).to eq('ed646a3334ca891fd3467db131372140')
24
- expect(@storage.count).to be 1
25
- expect(@mongo['_test_pages'].count).to be 1
24
+ expect(@storage.count.to_i).to be 1
25
+ expect(@mongo['_test_pages'].find.count.to_i).to be 1
26
26
  p = @storage.get p
27
27
  expect(p.url.to_s).to eq('http://www.google.com')
28
28
  expect(p.body).to eq('<html></html>')
@@ -33,7 +33,7 @@ describe Polipus::Storage::MongoStore do
33
33
  @storage.add p
34
34
  p = @storage.get p
35
35
  expect(p.code).to eq(301)
36
- expect(@mongo['_test_pages'].count).to be 1
36
+ expect(@mongo['_test_pages'].find.count.to_i).to be 1
37
37
  end
38
38
 
39
39
  it 'should iterate over stored pages' do
@@ -47,7 +47,7 @@ describe Polipus::Storage::MongoStore do
47
47
  p = page_factory 'http://www.google.com', code: 301, body: '<html></html>'
48
48
  @storage.remove p
49
49
  expect(@storage.get(p)).to be_nil
50
- expect(@storage.count).to be 0
50
+ expect(@storage.count.to_i).to be 0
51
51
  end
52
52
 
53
53
  it 'should store a page removing a query string from the uuid generation' do
@@ -83,7 +83,7 @@ describe Polipus::Storage::MongoStore do
83
83
  p = page_factory 'http://www.user-doo.com', code: 200, body: '<html></html>'
84
84
  storage.add p
85
85
  p = storage.get p
86
- expect(p.body).to be_empty
86
+ expect(p.body).to be_nil
87
87
  storage.clear
88
88
  end
89
89
 
@@ -93,5 +93,14 @@ describe Polipus::PolipusCrawler do
93
93
  polipus.takeover
94
94
  polipus.storage.each { |_id, page| expect(page.url.path =~ /$\/downloads\//).to be_falsey }
95
95
  end
96
+
97
+ it 'should obey to the robots.txt file with list user_agent' do
98
+ user_agent = 'Mozilla/5.0 (Windows; U; Windows NT 5.1; tr; rv:1.9.2.8) Gecko/20100722 Firefox/3.6.8 ( .NET CLR 3.5.30729; .NET4.0E)'
99
+ lopt = p_options
100
+ lopt[:obey_robots_txt] = true
101
+ lopt[:user_agent] = [user_agent]
102
+ flexmock(Polipus::Robotex).should_receive(:new).with(user_agent)
103
+ Polipus::PolipusCrawler.new('polipus-rspec', ['https://rubygems.org/gems/polipus'], lopt)
104
+ end
96
105
  end
97
106
  end
metadata CHANGED
@@ -1,248 +1,262 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: polipus
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.4.0
4
+ version: 0.5.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Francesco Laurita
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2015-01-26 00:00:00.000000000 Z
11
+ date: 2015-07-09 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: nokogiri
15
15
  requirement: !ruby/object:Gem::Requirement
16
16
  requirements:
17
- - - ~>
17
+ - - "~>"
18
18
  - !ruby/object:Gem::Version
19
19
  version: '1.6'
20
- - - ! '>='
20
+ - - ">="
21
21
  - !ruby/object:Gem::Version
22
22
  version: 1.6.0
23
23
  type: :runtime
24
24
  prerelease: false
25
25
  version_requirements: !ruby/object:Gem::Requirement
26
26
  requirements:
27
- - - ~>
27
+ - - "~>"
28
28
  - !ruby/object:Gem::Version
29
29
  version: '1.6'
30
- - - ! '>='
30
+ - - ">="
31
31
  - !ruby/object:Gem::Version
32
32
  version: 1.6.0
33
33
  - !ruby/object:Gem::Dependency
34
34
  name: http-cookie
35
35
  requirement: !ruby/object:Gem::Requirement
36
36
  requirements:
37
- - - ~>
37
+ - - "~>"
38
38
  - !ruby/object:Gem::Version
39
39
  version: '1.0'
40
- - - ! '>='
40
+ - - ">="
41
41
  - !ruby/object:Gem::Version
42
42
  version: 1.0.1
43
43
  type: :runtime
44
44
  prerelease: false
45
45
  version_requirements: !ruby/object:Gem::Requirement
46
46
  requirements:
47
- - - ~>
47
+ - - "~>"
48
48
  - !ruby/object:Gem::Version
49
49
  version: '1.0'
50
- - - ! '>='
50
+ - - ">="
51
51
  - !ruby/object:Gem::Version
52
52
  version: 1.0.1
53
53
  - !ruby/object:Gem::Dependency
54
54
  name: redis
55
55
  requirement: !ruby/object:Gem::Requirement
56
56
  requirements:
57
- - - ~>
57
+ - - "~>"
58
58
  - !ruby/object:Gem::Version
59
59
  version: '3.0'
60
- - - ! '>='
60
+ - - ">="
61
61
  - !ruby/object:Gem::Version
62
62
  version: 3.0.4
63
63
  type: :runtime
64
64
  prerelease: false
65
65
  version_requirements: !ruby/object:Gem::Requirement
66
66
  requirements:
67
- - - ~>
67
+ - - "~>"
68
68
  - !ruby/object:Gem::Version
69
69
  version: '3.0'
70
- - - ! '>='
70
+ - - ">="
71
71
  - !ruby/object:Gem::Version
72
72
  version: 3.0.4
73
73
  - !ruby/object:Gem::Dependency
74
74
  name: hiredis
75
75
  requirement: !ruby/object:Gem::Requirement
76
76
  requirements:
77
- - - ~>
77
+ - - "~>"
78
78
  - !ruby/object:Gem::Version
79
79
  version: '0.5'
80
- - - ! '>='
80
+ - - ">="
81
81
  - !ruby/object:Gem::Version
82
82
  version: 0.4.5
83
83
  type: :runtime
84
84
  prerelease: false
85
85
  version_requirements: !ruby/object:Gem::Requirement
86
86
  requirements:
87
- - - ~>
87
+ - - "~>"
88
88
  - !ruby/object:Gem::Version
89
89
  version: '0.5'
90
- - - ! '>='
90
+ - - ">="
91
91
  - !ruby/object:Gem::Version
92
92
  version: 0.4.5
93
93
  - !ruby/object:Gem::Dependency
94
94
  name: redis-queue
95
95
  requirement: !ruby/object:Gem::Requirement
96
96
  requirements:
97
- - - ~>
97
+ - - "~>"
98
98
  - !ruby/object:Gem::Version
99
99
  version: '0.0'
100
- - - ! '>='
100
+ - - ">="
101
101
  - !ruby/object:Gem::Version
102
102
  version: 0.0.4
103
103
  type: :runtime
104
104
  prerelease: false
105
105
  version_requirements: !ruby/object:Gem::Requirement
106
106
  requirements:
107
- - - ~>
107
+ - - "~>"
108
108
  - !ruby/object:Gem::Version
109
109
  version: '0.0'
110
- - - ! '>='
110
+ - - ">="
111
111
  - !ruby/object:Gem::Version
112
112
  version: 0.0.4
113
113
  - !ruby/object:Gem::Dependency
114
114
  name: redis-bloomfilter
115
115
  requirement: !ruby/object:Gem::Requirement
116
116
  requirements:
117
- - - ~>
117
+ - - "~>"
118
118
  - !ruby/object:Gem::Version
119
119
  version: '0.0'
120
- - - ! '>='
120
+ - - ">="
121
121
  - !ruby/object:Gem::Version
122
122
  version: 0.0.3
123
123
  type: :runtime
124
124
  prerelease: false
125
125
  version_requirements: !ruby/object:Gem::Requirement
126
126
  requirements:
127
- - - ~>
127
+ - - "~>"
128
128
  - !ruby/object:Gem::Version
129
129
  version: '0.0'
130
- - - ! '>='
130
+ - - ">="
131
131
  - !ruby/object:Gem::Version
132
132
  version: 0.0.3
133
133
  - !ruby/object:Gem::Dependency
134
134
  name: mongo
135
135
  requirement: !ruby/object:Gem::Requirement
136
136
  requirements:
137
- - - ~>
137
+ - - "~>"
138
138
  - !ruby/object:Gem::Version
139
- version: 1.11.0
139
+ version: 2.0.6
140
140
  type: :development
141
141
  prerelease: false
142
142
  version_requirements: !ruby/object:Gem::Requirement
143
143
  requirements:
144
- - - ~>
144
+ - - "~>"
145
145
  - !ruby/object:Gem::Version
146
- version: 1.11.0
146
+ version: 2.0.6
147
147
  - !ruby/object:Gem::Dependency
148
148
  name: rethinkdb
149
149
  requirement: !ruby/object:Gem::Requirement
150
150
  requirements:
151
- - - ~>
151
+ - - "~>"
152
152
  - !ruby/object:Gem::Version
153
153
  version: 1.15.0
154
154
  type: :development
155
155
  prerelease: false
156
156
  version_requirements: !ruby/object:Gem::Requirement
157
157
  requirements:
158
- - - ~>
158
+ - - "~>"
159
159
  - !ruby/object:Gem::Version
160
160
  version: 1.15.0
161
161
  - !ruby/object:Gem::Dependency
162
162
  name: rake
163
163
  requirement: !ruby/object:Gem::Requirement
164
164
  requirements:
165
- - - ~>
165
+ - - "~>"
166
166
  - !ruby/object:Gem::Version
167
167
  version: '10.3'
168
168
  type: :development
169
169
  prerelease: false
170
170
  version_requirements: !ruby/object:Gem::Requirement
171
171
  requirements:
172
- - - ~>
172
+ - - "~>"
173
173
  - !ruby/object:Gem::Version
174
174
  version: '10.3'
175
175
  - !ruby/object:Gem::Dependency
176
176
  name: rspec
177
177
  requirement: !ruby/object:Gem::Requirement
178
178
  requirements:
179
- - - ~>
179
+ - - "~>"
180
180
  - !ruby/object:Gem::Version
181
181
  version: 3.1.0
182
182
  type: :development
183
183
  prerelease: false
184
184
  version_requirements: !ruby/object:Gem::Requirement
185
185
  requirements:
186
- - - ~>
186
+ - - "~>"
187
187
  - !ruby/object:Gem::Version
188
188
  version: 3.1.0
189
189
  - !ruby/object:Gem::Dependency
190
190
  name: flexmock
191
191
  requirement: !ruby/object:Gem::Requirement
192
192
  requirements:
193
- - - ~>
193
+ - - "~>"
194
194
  - !ruby/object:Gem::Version
195
195
  version: '1.3'
196
196
  type: :development
197
197
  prerelease: false
198
198
  version_requirements: !ruby/object:Gem::Requirement
199
199
  requirements:
200
- - - ~>
200
+ - - "~>"
201
201
  - !ruby/object:Gem::Version
202
202
  version: '1.3'
203
203
  - !ruby/object:Gem::Dependency
204
204
  name: vcr
205
205
  requirement: !ruby/object:Gem::Requirement
206
206
  requirements:
207
- - - ~>
207
+ - - "~>"
208
208
  - !ruby/object:Gem::Version
209
209
  version: 2.9.0
210
210
  type: :development
211
211
  prerelease: false
212
212
  version_requirements: !ruby/object:Gem::Requirement
213
213
  requirements:
214
- - - ~>
214
+ - - "~>"
215
215
  - !ruby/object:Gem::Version
216
216
  version: 2.9.0
217
217
  - !ruby/object:Gem::Dependency
218
218
  name: webmock
219
219
  requirement: !ruby/object:Gem::Requirement
220
220
  requirements:
221
- - - ~>
221
+ - - "~>"
222
222
  - !ruby/object:Gem::Version
223
223
  version: 1.20.0
224
224
  type: :development
225
225
  prerelease: false
226
226
  version_requirements: !ruby/object:Gem::Requirement
227
227
  requirements:
228
- - - ~>
228
+ - - "~>"
229
229
  - !ruby/object:Gem::Version
230
230
  version: 1.20.0
231
231
  - !ruby/object:Gem::Dependency
232
232
  name: coveralls
233
233
  requirement: !ruby/object:Gem::Requirement
234
234
  requirements:
235
- - - ! '>='
235
+ - - ">="
236
236
  - !ruby/object:Gem::Version
237
237
  version: '0'
238
238
  type: :development
239
239
  prerelease: false
240
240
  version_requirements: !ruby/object:Gem::Requirement
241
241
  requirements:
242
- - - ! '>='
242
+ - - ">="
243
243
  - !ruby/object:Gem::Version
244
244
  version: '0'
245
- description: ! "\n An easy to use distributed web-crawler framework based on Redis\n
245
+ - !ruby/object:Gem::Dependency
246
+ name: pry
247
+ requirement: !ruby/object:Gem::Requirement
248
+ requirements:
249
+ - - ">="
250
+ - !ruby/object:Gem::Version
251
+ version: '0'
252
+ type: :development
253
+ prerelease: false
254
+ version_requirements: !ruby/object:Gem::Requirement
255
+ requirements:
256
+ - - ">="
257
+ - !ruby/object:Gem::Version
258
+ version: '0'
259
+ description: "\n An easy to use distributed web-crawler framework based on Redis\n
246
260
  \ "
247
261
  email:
248
262
  - francesco.laurita@gmail.com
@@ -250,12 +264,12 @@ executables: []
250
264
  extensions: []
251
265
  extra_rdoc_files: []
252
266
  files:
253
- - .document
254
- - .gitignore
255
- - .rspec
256
- - .rubocop.yml
257
- - .rubocop_todo.yml
258
- - .travis.yml
267
+ - ".document"
268
+ - ".gitignore"
269
+ - ".rspec"
270
+ - ".rubocop.yml"
271
+ - ".rubocop_todo.yml"
272
+ - ".travis.yml"
259
273
  - AUTHORS.md
260
274
  - CHANGELOG.md
261
275
  - Gemfile
@@ -330,17 +344,17 @@ require_paths:
330
344
  - lib
331
345
  required_ruby_version: !ruby/object:Gem::Requirement
332
346
  requirements:
333
- - - ! '>='
347
+ - - ">="
334
348
  - !ruby/object:Gem::Version
335
349
  version: '0'
336
350
  required_rubygems_version: !ruby/object:Gem::Requirement
337
351
  requirements:
338
- - - ! '>='
352
+ - - ">="
339
353
  - !ruby/object:Gem::Version
340
354
  version: '0'
341
355
  requirements: []
342
356
  rubyforge_project: polipus
343
- rubygems_version: 2.4.5
357
+ rubygems_version: 2.2.2
344
358
  signing_key:
345
359
  specification_version: 4
346
360
  summary: Polipus distributed web-crawler framework
@@ -371,4 +385,3 @@ test_files:
371
385
  - spec/polipus/url_tracker_spec.rb
372
386
  - spec/polipus_spec.rb
373
387
  - spec/spec_helper.rb
374
- has_rdoc: