polipus 0.4.0 → 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,15 +1,7 @@
1
1
  ---
2
- !binary "U0hBMQ==":
3
- metadata.gz: !binary |-
4
- NGU0MWZlMWIwMGM2MWNhMmJiOTU3MDM0YWI0ZjY0MzI3NmNkYzgyNA==
5
- data.tar.gz: !binary |-
6
- MTAxYjZmYWNjMTlkYzk3Y2ZhNjdjMDFmODM2OTQ5YmQ3ZDcyNTQzNw==
2
+ SHA1:
3
+ metadata.gz: 0e3c15f4ab5c3b4eca8f4703b80d7d8fa74210c3
4
+ data.tar.gz: ae0dd32d81bbcbef350949e0c51fecaaf59ac8ff
7
5
  SHA512:
8
- metadata.gz: !binary |-
9
- ZDE5NDExNmE1M2M4MDM0MTExNzVkNmM0ZmVjNTQyOTg5Y2JkYTdmZjBhY2Fi
10
- NmM1MDc5ZTI0ODkxNzlhY2ZkZGVjZjI2ZTMyNjZhNTMwOWZhZjQ2OGY4ZTJl
11
- ZmI4ZjUwNmQ3YjAxOTdlODI2MWRmMWUzNjY3Yzk1MmNiZTk0Y2E=
12
- data.tar.gz: !binary |-
13
- YjNlYTZlMjU5OTBmN2ZjMTI1OTY1MzgzNzBhYjBjYTAzN2MwYzY0M2U1ZGYw
14
- MjcyMmMxM2I1ZTY5OWEzZDQ3MjQ1OTgzMzcyN2I5NGQ5NTAzZTEzY2FmMDRl
15
- MmJiZDdjOTFmMjQwZWU1MzM1OGJhN2E4NTRhNGExZTAyMTVmN2I=
6
+ metadata.gz: c3a57d1cbfbab4c77cf3b746c7b73eb84cf675e7998eb5979af6d17d997ee26781803e519eae50de6762590fc7287d09ccf4ac0d640dff40602b8a507a356de2
7
+ data.tar.gz: 743b128ee0dd8fefbbdd1bd7443afbc4a2c62dd2f3d3751d3d693719e9a8e2e145668ccd58945b77f5f780f50757a11248df2b1963c8e7cc35706e937627e5e0
@@ -1,25 +1,36 @@
1
1
  # This configuration was generated by `rubocop --auto-gen-config`
2
- # on 2014-06-08 11:25:39 -0700 using RuboCop version 0.23.0.
2
+ # on 2015-07-08 20:22:49 -0700 using RuboCop version 0.29.1.
3
3
  # The point is for the user to remove these configuration records
4
4
  # one by one as the offenses are removed from the code base.
5
5
  # Note that changes in the inspected code, or installation of new
6
6
  # versions of RuboCop, may require this file to be generated again.
7
7
 
8
8
  # Offense count: 1
9
- Style/ClassVars:
9
+ Lint/HandleExceptions:
10
10
  Enabled: false
11
11
 
12
- # Offense count: 10
12
+ # Offense count: 21
13
+ Metrics/AbcSize:
14
+ Max: 103
15
+
16
+ # Offense count: 12
13
17
  Metrics/CyclomaticComplexity:
14
- Max: 16
18
+ Max: 15
15
19
 
16
- # Offense count: 26
17
- Style/Documentation:
20
+ # Offense count: 10
21
+ Metrics/PerceivedComplexity:
22
+ Max: 17
23
+
24
+ # Offense count: 1
25
+ Style/ClassVars:
18
26
  Enabled: false
19
27
 
28
+ # Offense count: 27
29
+ Style/Documentation:
30
+ Enabled: false
20
31
 
21
32
  # Offense count: 2
22
- # Configuration parameters: EnforcedStyle, SupportedStyles.
33
+ # Configuration parameters: EnforcedStyle, MinBodyLength, SupportedStyles.
23
34
  Style/Next:
24
35
  Enabled: false
25
36
 
@@ -28,6 +39,10 @@ Style/Next:
28
39
  Style/RegexpLiteral:
29
40
  Enabled: false
30
41
 
31
- # Offense count: 4
42
+ # Offense count: 3
32
43
  Style/RescueModifier:
33
44
  Enabled: false
45
+
46
+ # Offense count: 1
47
+ Style/UnlessElse:
48
+ Enabled: false
data/AUTHORS.md CHANGED
@@ -3,3 +3,4 @@
3
3
  * [Francesco Laurita](francesco.laurita@gmail.com)
4
4
  * [Tobias L. Maier](http://tobiasmaier.info/)
5
5
  * [Marcos Piccinini](https://github.com/nofxx)
6
+ * [Martin Bianculli](https://github.com/lepek)
@@ -1,5 +1,12 @@
1
1
  # Changelog
2
2
 
3
+ ## 0.5.0 (2015-07-08)
4
+
5
+ [Compare changes in gem](https://github.com/taganaka/polipus/compare/0.4.0...0.5.0)
6
+
7
+ * Support for MongoDB driver ~> 2.0.6 has been added
8
+ * Minor code cleanup
9
+
3
10
  ## 0.4.0 (2015-01-12)
4
11
 
5
12
  [Compare changes in gem](https://github.com/taganaka/polipus/compare/0.3.3...0.4.0)
@@ -129,7 +129,14 @@ module Polipus
129
129
 
130
130
  @urls = [urls].flatten.map { |url| URI(url) }
131
131
  @urls.each { |url| url.path = '/' if url.path.empty? }
132
- @robots = Polipus::Robotex.new(@options[:user_agent]) if @options[:obey_robots_txt]
132
+ if @options[:obey_robots_txt]
133
+ @robots =
134
+ if @options[:user_agent].respond_to?(:sample)
135
+ Polipus::Robotex.new(@options[:user_agent].sample)
136
+ else
137
+ Polipus::Robotex.new(@options[:user_agent])
138
+ end
139
+ end
133
140
  # Attach signal handling if enabled
134
141
  SignalHandler.enable if @options[:enable_signal_handler]
135
142
 
@@ -170,7 +177,6 @@ module Polipus
170
177
  http = HTTP.new(@options)
171
178
  queue = queue_factory
172
179
  queue.process(false, @options[:queue_timeout]) do |message|
173
-
174
180
  next if message.nil?
175
181
 
176
182
  execute_plugin 'on_message_received'
@@ -199,7 +205,7 @@ module Polipus
199
205
  rurls = pages.map { |e| e.url.to_s }.join(' --> ')
200
206
  @logger.info { "Got redirects! #{rurls}" }
201
207
  page = pages.pop
202
- page.aliases = pages.map { |e| e.url }
208
+ page.aliases = pages.map(&:url)
203
209
  if page_exists? page
204
210
  @logger.info { "[worker ##{worker_number}] Page (#{page.url}) already stored." }
205
211
  queue.commit
@@ -253,7 +259,7 @@ module Polipus
253
259
  end
254
260
  end
255
261
 
256
- @workers_pool.each { |w| w.join }
262
+ @workers_pool.each(&:join)
257
263
  @on_crawl_end.each { |e| e.call(self) }
258
264
  execute_plugin 'on_crawl_end'
259
265
  end
@@ -474,7 +480,7 @@ module Polipus
474
480
  next unless p.respond_to?(method)
475
481
  @logger.info { "Running plugin method #{method} on #{k}" }
476
482
  ret_val = p.send(method, self)
477
- instance_eval(&ret_val) if ret_val.kind_of? Proc
483
+ instance_eval(&ret_val) if ret_val.is_a? Proc
478
484
  end
479
485
  end
480
486
  end
@@ -80,7 +80,11 @@ module Polipus
80
80
  # or nil if no such option is set
81
81
  #
82
82
  def user_agent
83
- @opts[:user_agent]
83
+ if @opts[:user_agent].respond_to?(:sample)
84
+ @opts[:user_agent].sample
85
+ else
86
+ @opts[:user_agent]
87
+ end
84
88
  end
85
89
 
86
90
  #
@@ -108,7 +112,7 @@ module Polipus
108
112
  # The proxy password
109
113
  #
110
114
  def proxy_pass
111
- #return proxy_host_port[3] unless @opts[:proxy_host_port].nil?
115
+ # return proxy_host_port[3] unless @opts[:proxy_host_port].nil?
112
116
  @opts[:proxy_pass].respond_to?(:call) ? @opts[:proxy_pass].call(self) : @opts[:proxy_pass]
113
117
  end
114
118
 
@@ -162,7 +166,13 @@ module Polipus
162
166
 
163
167
  response, response_time = get_response(loc, referer)
164
168
  code = Integer(response.code)
165
- redirect_to = response.is_a?(Net::HTTPRedirection) ? URI(response['location']).normalize : nil
169
+ redirect_to =
170
+ begin
171
+ response.is_a?(Net::HTTPRedirection) ? URI(response['location']).normalize : nil
172
+ rescue URI::InvalidURIError => e
173
+ @opts[:logger].debug { "Request #{url} got #{e}" } if @opts[:logger]
174
+ nil
175
+ end
166
176
  yield response, code, loc, redirect_to, response_time
167
177
  limit -= 1
168
178
  break unless (loc = redirect_to) && allowed?(redirect_to, url) && limit > 0
@@ -233,7 +243,7 @@ module Polipus
233
243
 
234
244
  # Block has higher priority
235
245
  unless @opts[:proxy_host_port].nil?
236
- p_host, p_port, p_user, p_pass = proxy_host_port
246
+ p_host, p_port, p_user, p_pass = proxy_host_port
237
247
  else
238
248
  p_host = proxy_host
239
249
  p_port = proxy_port
@@ -3,6 +3,8 @@ require 'nokogiri'
3
3
  require 'json'
4
4
  require 'ostruct'
5
5
  require 'set'
6
+ require 'kconv'
7
+
6
8
  module Polipus
7
9
  class Page
8
10
  # The URL of the page
@@ -70,7 +72,7 @@ module Polipus
70
72
  u = a['href']
71
73
  next if u.nil? || u.empty?
72
74
  abs = to_absolute(u) rescue next
73
- @links << abs if in_domain?(abs)
75
+ @links << abs if abs && in_domain?(abs)
74
76
  end
75
77
  @links.to_a
76
78
  end
@@ -80,7 +82,10 @@ module Polipus
80
82
  #
81
83
  def doc
82
84
  return @doc if @doc
83
- @doc = Nokogiri::HTML(@body) if @body && html? rescue nil
85
+ @body ||= ''
86
+ @body = @body.encode('utf-8', 'binary', invalid: :replace,
87
+ undef: :replace, replace: '')
88
+ @doc = Nokogiri::HTML(@body.toutf8, nil, 'utf-8') if @body && html?
84
89
  end
85
90
 
86
91
  #
@@ -166,10 +171,22 @@ module Polipus
166
171
  def to_absolute(link)
167
172
  return nil if link.nil?
168
173
 
174
+ valid_link = link.to_s.encode('utf-8', 'binary', invalid: :replace,
175
+ undef: :replace, replace: '')
176
+
169
177
  # remove anchor
170
- link = URI.encode(URI.decode(link.to_s.gsub(/#[a-zA-Z0-9_-]*$/, '')))
178
+ link =
179
+ begin
180
+ URI.encode(URI.decode(valid_link.gsub(/#[a-zA-Z0-9_-]*$/, '')))
181
+ rescue URI::Error
182
+ return nil
183
+ end
171
184
 
172
- relative = URI(link)
185
+ relative = begin
186
+ URI(link)
187
+ rescue URI::Error
188
+ return nil
189
+ end
173
190
  absolute = base ? base.merge(relative) : @url.merge(relative)
174
191
 
175
192
  absolute.path = '/' if absolute.path.empty?
@@ -5,15 +5,15 @@ module Polipus
5
5
  module QueueOverflow
6
6
  def self.mongo_queue(mongo_db, queue_name, options = {})
7
7
  require 'polipus/queue_overflow/mongo_queue'
8
- mongo_db ||= Mongo::Connection.new('localhost', 27_017, pool_size: 15, pool_timeout: 5).db('polipus')
9
- fail 'First argument must be an instance of Mongo::DB' unless mongo_db.is_a?(Mongo::DB)
8
+ mongo_db ||= Mongo::Client.new(['localhost:27_017'], database: 'polipus')
9
+ fail 'First argument must be an instance of Mongo::Client' unless mongo_db.is_a?(Mongo::Client)
10
10
  self::MongoQueue.new mongo_db, queue_name, options
11
11
  end
12
12
 
13
13
  def self.mongo_queue_capped(mongo_db, queue_name, options = {})
14
14
  require 'polipus/queue_overflow/mongo_queue_capped'
15
- mongo_db ||= Mongo::Connection.new('localhost', 27_017, pool_size: 15, pool_timeout: 5).db('polipus')
16
- fail 'First argument must be an instance of Mongo::DB' unless mongo_db.is_a?(Mongo::DB)
15
+ mongo_db ||= Mongo::Client.new(['localhost:27_017'], database: 'polipus')
16
+ fail 'First argument must be an instance of Mongo::Client' unless mongo_db.is_a?(Mongo::Client)
17
17
  options[:max] = 1_000_000 if options[:max].nil?
18
18
  self::MongoQueueCapped.new mongo_db, queue_name, options
19
19
  end
@@ -14,7 +14,7 @@ module Polipus
14
14
  end
15
15
 
16
16
  def length
17
- @mongo_db[@collection_name].count
17
+ @mongo_db[@collection_name].find.count
18
18
  end
19
19
 
20
20
  def empty?
@@ -28,18 +28,18 @@ module Polipus
28
28
 
29
29
  def push(data)
30
30
  if @options[:ensure_uniq]
31
- @mongo_db[@collection_name].update({ payload: data }, { payload: data }, { upsert: true, w: 1 })
31
+ @mongo_db[@collection_name].find(payload: data).replace_one({ payload: data }, upsert: true)
32
32
  else
33
- @mongo_db[@collection_name].insert(payload: data)
33
+ @mongo_db[@collection_name].insert_one(payload: data)
34
34
  end
35
35
  true
36
36
  end
37
37
 
38
38
  def pop(_ = false)
39
39
  @semaphore.synchronize do
40
- doc = @mongo_db[@collection_name].find({}, sort: { _id: 1 }).limit(1).first
40
+ doc = @mongo_db[@collection_name].find.sort(_id: 1).limit(1).first
41
41
  return nil if doc.nil?
42
- @mongo_db[@collection_name].remove(_id: doc['_id'])
42
+ @mongo_db[@collection_name].find(_id: doc['_id']).delete_one
43
43
  doc && doc['payload'] ? doc['payload'] : nil
44
44
  end
45
45
  end
@@ -53,7 +53,8 @@ module Polipus
53
53
  protected
54
54
 
55
55
  def ensure_index
56
- @mongo_db[@collection_name].ensure_index({ payload: 1 }, { background: 1, unique: 1, drop_dups: 1 })
56
+ # @TODO: Drop dups option was removed. We may want to add something here to remove duplications
57
+ @mongo_db[@collection_name].indexes.create_one({ payload: 1 }, background: true, unique: true)
57
58
  end
58
59
  end
59
60
  end
@@ -13,8 +13,8 @@ module Polipus
13
13
  @semaphore.synchronize do
14
14
  s = size
15
15
  if s > @max
16
- docs = @mongo_db[@collection_name].find({}, { sort: { _id: 1 }, fields: [:_id] }).limit(s - @max).map { |e| e['_id'] }
17
- @mongo_db[@collection_name].remove(:_id => { '$in' => docs }, '$isolated' => 1)
16
+ docs = @mongo_db[@collection_name].find.sort(_id: 1).projection(_id: 1).limit(s - @max).map { |e| e['_id'] }
17
+ @mongo_db[@collection_name].find(_id: { '$in' => docs }).delete_many
18
18
  end
19
19
  end
20
20
  end
@@ -6,14 +6,14 @@ module Polipus
6
6
 
7
7
  def self.mongo_store(mongo = nil, collection = COLLECTION, except = [])
8
8
  require 'polipus/storage/mongo_store'
9
- mongo ||= Mongo::Connection.new('localhost', 27_017, pool_size: 15, pool_timeout: 5).db('polipus')
10
- fail 'First argument must be an instance of Mongo::DB' unless mongo.is_a?(Mongo::DB)
9
+ mongo ||= Mongo::Client.new(['localhost:27_017'], database: 'polipus')
10
+ fail 'First argument must be an instance of Mongo::Client' unless mongo.is_a?(Mongo::Client)
11
11
  self::MongoStore.new(mongo: mongo, collection: collection, except: except)
12
12
  end
13
13
 
14
14
  def self.rethink_store(conn = nil, table = COLLECTION, except = [])
15
15
  require 'polipus/storage/rethink_store'
16
- conn ||= RethinkDB::RQL.new.connect(host: 'localhost', port: 28_015, db: 'polipus' )
16
+ conn ||= RethinkDB::RQL.new.connect(host: 'localhost', port: 28_015, db: 'polipus')
17
17
  fail "First argument must be a RethinkDB::Connection, got `#{conn.class}`" unless conn.is_a?(RethinkDB::Connection)
18
18
  self::RethinkStore.new(conn: conn, table: table, except: except)
19
19
  end
@@ -48,7 +48,7 @@ module Polipus
48
48
 
49
49
  def clear
50
50
  @semaphore.synchronize do
51
- @store = Hash.new
51
+ @store = {}
52
52
  end
53
53
  end
54
54
  end
@@ -2,6 +2,7 @@
2
2
  require 'mongo'
3
3
  require 'zlib'
4
4
  require 'thread'
5
+ require 'pry'
5
6
 
6
7
  module Polipus
7
8
  module Storage
@@ -10,10 +11,9 @@ module Polipus
10
11
  def initialize(options = {})
11
12
  @mongo = options[:mongo]
12
13
  @collection = options[:collection]
13
- @mongo.create_collection(@collection)
14
14
  begin
15
15
  @mongo[@collection].ensure_index(:uuid, unique: true, dropDups: true, background: true)
16
- rescue Exception
16
+ rescue StandardError
17
17
  end
18
18
 
19
19
  @compress_body = options[:compress_body] ||= true
@@ -28,16 +28,20 @@ module Polipus
28
28
  obj['uuid'] = uuid(page)
29
29
  obj['body'] = Zlib::Deflate.deflate(obj['body']) if @compress_body && obj['body']
30
30
  BINARY_FIELDS.each do |field|
31
- obj[field] = BSON::Binary.new(obj[field]) unless obj[field].nil?
31
+ obj[field] = BSON::Binary.new(obj[field].force_encoding('UTF-8').encode('UTF-8')) unless obj[field].nil?
32
32
  end
33
- @mongo[@collection].update({ uuid: obj['uuid'] }, obj, upsert: true, w: 1)
33
+
34
+ # We really need 2.0.6+ version for this to work
35
+ # https://jira.mongodb.org/browse/RUBY-881
36
+ @mongo[@collection].find(uuid: uuid(page)).replace_one(obj, upsert: true)
37
+
34
38
  obj['uuid']
35
39
  end
36
40
  end
37
41
 
38
42
  def exists?(page)
39
43
  @semaphore.synchronize do
40
- doc = @mongo[@collection].find({ uuid: uuid(page) }, { fields: [:_id] }).limit(1).first
44
+ doc = @mongo[@collection].find(uuid: uuid(page)).projection(_id: 1).limit(1).first
41
45
  !doc.nil?
42
46
  end
43
47
  end
@@ -51,16 +55,16 @@ module Polipus
51
55
 
52
56
  def remove(page)
53
57
  @semaphore.synchronize do
54
- @mongo[@collection].remove(uuid: uuid(page))
58
+ @mongo[@collection].find(uuid: uuid(page)).delete_one
55
59
  end
56
60
  end
57
61
 
58
62
  def count
59
- @mongo[@collection].count
63
+ @mongo[@collection].find.count
60
64
  end
61
65
 
62
66
  def each
63
- @mongo[@collection].find({}, timeout: false) do |cursor|
67
+ @mongo[@collection].find.no_cursor_timeout do |cursor|
64
68
  cursor.each do |doc|
65
69
  page = load_page(doc)
66
70
  yield doc['uuid'], page
@@ -76,7 +80,7 @@ module Polipus
76
80
 
77
81
  def load_page(hash)
78
82
  BINARY_FIELDS.each do |field|
79
- hash[field] = hash[field].to_s
83
+ hash[field] = hash[field].data unless hash[field].nil?
80
84
  end
81
85
  hash['body'] = Zlib::Inflate.inflate(hash['body']) if @compress_body && hash['body'] && !hash['body'].empty?
82
86
  page = Page.from_hash(hash)
@@ -12,6 +12,9 @@ module Polipus
12
12
  @rethink = options[:conn]
13
13
  @table = options[:table]
14
14
 
15
+ unless @r.db_list.run(@rethink).include?(@rethink.default_db)
16
+ @r.db_create(@rethink.default_db).run(@rethink)
17
+ end
15
18
  unless @r.table_list.run(@rethink).include?(@table)
16
19
  @r.table_create(@table).run(@rethink)
17
20
  @r.table(@table).index_create('created_at')
@@ -1,5 +1,5 @@
1
1
  # encoding: UTF-8
2
2
  module Polipus
3
- VERSION = '0.4.0'
3
+ VERSION = '0.5.0'
4
4
  HOMEPAGE = 'https://github.com/taganaka/polipus'
5
5
  end
@@ -8,10 +8,10 @@ Gem::Specification.new do |s|
8
8
  s.authors = ['Francesco Laurita']
9
9
  s.email = ['francesco.laurita@gmail.com']
10
10
  s.homepage = Polipus::HOMEPAGE
11
- s.summary = %q(Polipus distributed web-crawler framework)
12
- s.description = %q(
11
+ s.summary = 'Polipus distributed web-crawler framework'
12
+ s.description = '
13
13
  An easy to use distributed web-crawler framework based on Redis
14
- )
14
+ '
15
15
  s.licenses = ['MIT']
16
16
  s.platform = Gem::Platform::RUBY
17
17
 
@@ -30,8 +30,8 @@ Gem::Specification.new do |s|
30
30
  s.add_runtime_dependency 'redis-queue', '~> 0.0', '>= 0.0.4'
31
31
  s.add_runtime_dependency 'redis-bloomfilter', '~> 0.0', '>= 0.0.3'
32
32
 
33
- s.add_development_dependency 'mongo', '~>1.11.0'
34
- s.add_development_dependency 'rethinkdb', '~>1.15.0'
33
+ s.add_development_dependency 'mongo', '~> 2.0.6'
34
+ s.add_development_dependency 'rethinkdb', '~> 1.15.0'
35
35
 
36
36
  s.add_development_dependency 'rake', '~> 10.3'
37
37
  s.add_development_dependency 'rspec', '~> 3.1.0'
@@ -41,4 +41,6 @@ Gem::Specification.new do |s|
41
41
  s.add_development_dependency 'webmock', '~> 1.20.0'
42
42
 
43
43
  s.add_development_dependency 'coveralls'
44
+
45
+ s.add_development_dependency 'pry'
44
46
  end
@@ -108,4 +108,32 @@ describe Polipus::HTTP do
108
108
  end
109
109
  end
110
110
  end
111
+
112
+ describe 'random user_agent' do
113
+ context 'when user_agent is string' do
114
+ it '#user_agent' do
115
+ http = Polipus::HTTP.new(open_timeout: 1, read_timeout: 1, user_agent: 'Googlebot')
116
+ expect(http.user_agent).to eq('Googlebot')
117
+ end
118
+ end
119
+
120
+ context 'when user_agent is list' do
121
+ let(:user_agents) do
122
+ ['Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/13.0.782.24 Safari/535.1',
123
+ 'Mozilla/5.0 (Windows NT 6.0) AppleWebKit/535.2 (KHTML, like Gecko) Chrome/15.0.874.120 Safari/535.2',
124
+ 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.7 (KHTML, like Gecko) Chrome/16.0.912.36 Safari/535.7',
125
+ 'Mozilla/5.0 (Windows; U; Windows NT 6.0 x64; en-US; rv:1.9pre) Gecko/2008072421 Minefield/3.0.2pre',
126
+ 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.0.10) Gecko/2009042316 Firefox/3.0.10',
127
+ 'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-GB; rv:1.9.0.11) Gecko/2009060215 Firefox/3.0.11 (.NET CLR 3.5.30729)',
128
+ 'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6 GTB5',
129
+ 'Mozilla/5.0 (Windows; U; Windows NT 5.1; tr; rv:1.9.2.8) Gecko/20100722 Firefox/3.6.8 ( .NET CLR 3.5.30729; .NET4.0E)'
130
+ ]
131
+ end
132
+
133
+ it '#user_agent' do
134
+ http = Polipus::HTTP.new(open_timeout: 1, read_timeout: 1, user_agent: user_agents)
135
+ expect(user_agents).to include(http.user_agent)
136
+ end
137
+ end
138
+ end
111
139
  end
@@ -6,7 +6,7 @@ require 'redis-queue'
6
6
 
7
7
  describe Polipus::QueueOverflow::Manager do
8
8
  before(:all) do
9
- @mongo = Mongo::Connection.new('localhost', 27_017, pool_size: 15, pool_timeout: 5).db('_test_polipus')
9
+ @mongo = Mongo::Client.new(['localhost:27_017'], database: '_test_polipus')
10
10
  @mongo['_test_pages'].drop
11
11
  @storage = Polipus::Storage.mongo_store(@mongo, '_test_pages')
12
12
  @redis_q = Redis::Queue.new('queue_test', 'bp_queue_test', redis: Redis.new)
@@ -69,7 +69,7 @@ describe Polipus::Storage::MemoryStore do
69
69
  p = page_factory 'http://www.user-doo.com', code: 200, body: '<html></html>'
70
70
  storage.add p
71
71
  p = storage.get p
72
- expect(p.body).to be_empty
72
+ expect(p.body).to be_nil
73
73
  storage.clear
74
74
  end
75
75
 
@@ -4,7 +4,7 @@ require 'mongo'
4
4
  require 'polipus/storage/mongo_store'
5
5
  describe Polipus::Storage::MongoStore do
6
6
  before(:all)do
7
- @mongo = Mongo::Connection.new('localhost', 27_017, pool_size: 15, pool_timeout: 5).db('_test_polipus')
7
+ @mongo = Mongo::Client.new(['localhost:27_017'], database: '_test_polipus')
8
8
  @mongo['_test_pages'].drop
9
9
  @storage = Polipus::Storage.mongo_store(@mongo, '_test_pages')
10
10
  end
@@ -21,8 +21,8 @@ describe Polipus::Storage::MongoStore do
21
21
  p = page_factory 'http://www.google.com', code: 200, body: '<html></html>'
22
22
  uuid = @storage.add p
23
23
  expect(uuid).to eq('ed646a3334ca891fd3467db131372140')
24
- expect(@storage.count).to be 1
25
- expect(@mongo['_test_pages'].count).to be 1
24
+ expect(@storage.count.to_i).to be 1
25
+ expect(@mongo['_test_pages'].find.count.to_i).to be 1
26
26
  p = @storage.get p
27
27
  expect(p.url.to_s).to eq('http://www.google.com')
28
28
  expect(p.body).to eq('<html></html>')
@@ -33,7 +33,7 @@ describe Polipus::Storage::MongoStore do
33
33
  @storage.add p
34
34
  p = @storage.get p
35
35
  expect(p.code).to eq(301)
36
- expect(@mongo['_test_pages'].count).to be 1
36
+ expect(@mongo['_test_pages'].find.count.to_i).to be 1
37
37
  end
38
38
 
39
39
  it 'should iterate over stored pages' do
@@ -47,7 +47,7 @@ describe Polipus::Storage::MongoStore do
47
47
  p = page_factory 'http://www.google.com', code: 301, body: '<html></html>'
48
48
  @storage.remove p
49
49
  expect(@storage.get(p)).to be_nil
50
- expect(@storage.count).to be 0
50
+ expect(@storage.count.to_i).to be 0
51
51
  end
52
52
 
53
53
  it 'should store a page removing a query string from the uuid generation' do
@@ -83,7 +83,7 @@ describe Polipus::Storage::MongoStore do
83
83
  p = page_factory 'http://www.user-doo.com', code: 200, body: '<html></html>'
84
84
  storage.add p
85
85
  p = storage.get p
86
- expect(p.body).to be_empty
86
+ expect(p.body).to be_nil
87
87
  storage.clear
88
88
  end
89
89
 
@@ -93,5 +93,14 @@ describe Polipus::PolipusCrawler do
93
93
  polipus.takeover
94
94
  polipus.storage.each { |_id, page| expect(page.url.path =~ /$\/downloads\//).to be_falsey }
95
95
  end
96
+
97
+ it 'should obey to the robots.txt file with list user_agent' do
98
+ user_agent = 'Mozilla/5.0 (Windows; U; Windows NT 5.1; tr; rv:1.9.2.8) Gecko/20100722 Firefox/3.6.8 ( .NET CLR 3.5.30729; .NET4.0E)'
99
+ lopt = p_options
100
+ lopt[:obey_robots_txt] = true
101
+ lopt[:user_agent] = [user_agent]
102
+ flexmock(Polipus::Robotex).should_receive(:new).with(user_agent)
103
+ Polipus::PolipusCrawler.new('polipus-rspec', ['https://rubygems.org/gems/polipus'], lopt)
104
+ end
96
105
  end
97
106
  end
metadata CHANGED
@@ -1,248 +1,262 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: polipus
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.4.0
4
+ version: 0.5.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Francesco Laurita
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2015-01-26 00:00:00.000000000 Z
11
+ date: 2015-07-09 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: nokogiri
15
15
  requirement: !ruby/object:Gem::Requirement
16
16
  requirements:
17
- - - ~>
17
+ - - "~>"
18
18
  - !ruby/object:Gem::Version
19
19
  version: '1.6'
20
- - - ! '>='
20
+ - - ">="
21
21
  - !ruby/object:Gem::Version
22
22
  version: 1.6.0
23
23
  type: :runtime
24
24
  prerelease: false
25
25
  version_requirements: !ruby/object:Gem::Requirement
26
26
  requirements:
27
- - - ~>
27
+ - - "~>"
28
28
  - !ruby/object:Gem::Version
29
29
  version: '1.6'
30
- - - ! '>='
30
+ - - ">="
31
31
  - !ruby/object:Gem::Version
32
32
  version: 1.6.0
33
33
  - !ruby/object:Gem::Dependency
34
34
  name: http-cookie
35
35
  requirement: !ruby/object:Gem::Requirement
36
36
  requirements:
37
- - - ~>
37
+ - - "~>"
38
38
  - !ruby/object:Gem::Version
39
39
  version: '1.0'
40
- - - ! '>='
40
+ - - ">="
41
41
  - !ruby/object:Gem::Version
42
42
  version: 1.0.1
43
43
  type: :runtime
44
44
  prerelease: false
45
45
  version_requirements: !ruby/object:Gem::Requirement
46
46
  requirements:
47
- - - ~>
47
+ - - "~>"
48
48
  - !ruby/object:Gem::Version
49
49
  version: '1.0'
50
- - - ! '>='
50
+ - - ">="
51
51
  - !ruby/object:Gem::Version
52
52
  version: 1.0.1
53
53
  - !ruby/object:Gem::Dependency
54
54
  name: redis
55
55
  requirement: !ruby/object:Gem::Requirement
56
56
  requirements:
57
- - - ~>
57
+ - - "~>"
58
58
  - !ruby/object:Gem::Version
59
59
  version: '3.0'
60
- - - ! '>='
60
+ - - ">="
61
61
  - !ruby/object:Gem::Version
62
62
  version: 3.0.4
63
63
  type: :runtime
64
64
  prerelease: false
65
65
  version_requirements: !ruby/object:Gem::Requirement
66
66
  requirements:
67
- - - ~>
67
+ - - "~>"
68
68
  - !ruby/object:Gem::Version
69
69
  version: '3.0'
70
- - - ! '>='
70
+ - - ">="
71
71
  - !ruby/object:Gem::Version
72
72
  version: 3.0.4
73
73
  - !ruby/object:Gem::Dependency
74
74
  name: hiredis
75
75
  requirement: !ruby/object:Gem::Requirement
76
76
  requirements:
77
- - - ~>
77
+ - - "~>"
78
78
  - !ruby/object:Gem::Version
79
79
  version: '0.5'
80
- - - ! '>='
80
+ - - ">="
81
81
  - !ruby/object:Gem::Version
82
82
  version: 0.4.5
83
83
  type: :runtime
84
84
  prerelease: false
85
85
  version_requirements: !ruby/object:Gem::Requirement
86
86
  requirements:
87
- - - ~>
87
+ - - "~>"
88
88
  - !ruby/object:Gem::Version
89
89
  version: '0.5'
90
- - - ! '>='
90
+ - - ">="
91
91
  - !ruby/object:Gem::Version
92
92
  version: 0.4.5
93
93
  - !ruby/object:Gem::Dependency
94
94
  name: redis-queue
95
95
  requirement: !ruby/object:Gem::Requirement
96
96
  requirements:
97
- - - ~>
97
+ - - "~>"
98
98
  - !ruby/object:Gem::Version
99
99
  version: '0.0'
100
- - - ! '>='
100
+ - - ">="
101
101
  - !ruby/object:Gem::Version
102
102
  version: 0.0.4
103
103
  type: :runtime
104
104
  prerelease: false
105
105
  version_requirements: !ruby/object:Gem::Requirement
106
106
  requirements:
107
- - - ~>
107
+ - - "~>"
108
108
  - !ruby/object:Gem::Version
109
109
  version: '0.0'
110
- - - ! '>='
110
+ - - ">="
111
111
  - !ruby/object:Gem::Version
112
112
  version: 0.0.4
113
113
  - !ruby/object:Gem::Dependency
114
114
  name: redis-bloomfilter
115
115
  requirement: !ruby/object:Gem::Requirement
116
116
  requirements:
117
- - - ~>
117
+ - - "~>"
118
118
  - !ruby/object:Gem::Version
119
119
  version: '0.0'
120
- - - ! '>='
120
+ - - ">="
121
121
  - !ruby/object:Gem::Version
122
122
  version: 0.0.3
123
123
  type: :runtime
124
124
  prerelease: false
125
125
  version_requirements: !ruby/object:Gem::Requirement
126
126
  requirements:
127
- - - ~>
127
+ - - "~>"
128
128
  - !ruby/object:Gem::Version
129
129
  version: '0.0'
130
- - - ! '>='
130
+ - - ">="
131
131
  - !ruby/object:Gem::Version
132
132
  version: 0.0.3
133
133
  - !ruby/object:Gem::Dependency
134
134
  name: mongo
135
135
  requirement: !ruby/object:Gem::Requirement
136
136
  requirements:
137
- - - ~>
137
+ - - "~>"
138
138
  - !ruby/object:Gem::Version
139
- version: 1.11.0
139
+ version: 2.0.6
140
140
  type: :development
141
141
  prerelease: false
142
142
  version_requirements: !ruby/object:Gem::Requirement
143
143
  requirements:
144
- - - ~>
144
+ - - "~>"
145
145
  - !ruby/object:Gem::Version
146
- version: 1.11.0
146
+ version: 2.0.6
147
147
  - !ruby/object:Gem::Dependency
148
148
  name: rethinkdb
149
149
  requirement: !ruby/object:Gem::Requirement
150
150
  requirements:
151
- - - ~>
151
+ - - "~>"
152
152
  - !ruby/object:Gem::Version
153
153
  version: 1.15.0
154
154
  type: :development
155
155
  prerelease: false
156
156
  version_requirements: !ruby/object:Gem::Requirement
157
157
  requirements:
158
- - - ~>
158
+ - - "~>"
159
159
  - !ruby/object:Gem::Version
160
160
  version: 1.15.0
161
161
  - !ruby/object:Gem::Dependency
162
162
  name: rake
163
163
  requirement: !ruby/object:Gem::Requirement
164
164
  requirements:
165
- - - ~>
165
+ - - "~>"
166
166
  - !ruby/object:Gem::Version
167
167
  version: '10.3'
168
168
  type: :development
169
169
  prerelease: false
170
170
  version_requirements: !ruby/object:Gem::Requirement
171
171
  requirements:
172
- - - ~>
172
+ - - "~>"
173
173
  - !ruby/object:Gem::Version
174
174
  version: '10.3'
175
175
  - !ruby/object:Gem::Dependency
176
176
  name: rspec
177
177
  requirement: !ruby/object:Gem::Requirement
178
178
  requirements:
179
- - - ~>
179
+ - - "~>"
180
180
  - !ruby/object:Gem::Version
181
181
  version: 3.1.0
182
182
  type: :development
183
183
  prerelease: false
184
184
  version_requirements: !ruby/object:Gem::Requirement
185
185
  requirements:
186
- - - ~>
186
+ - - "~>"
187
187
  - !ruby/object:Gem::Version
188
188
  version: 3.1.0
189
189
  - !ruby/object:Gem::Dependency
190
190
  name: flexmock
191
191
  requirement: !ruby/object:Gem::Requirement
192
192
  requirements:
193
- - - ~>
193
+ - - "~>"
194
194
  - !ruby/object:Gem::Version
195
195
  version: '1.3'
196
196
  type: :development
197
197
  prerelease: false
198
198
  version_requirements: !ruby/object:Gem::Requirement
199
199
  requirements:
200
- - - ~>
200
+ - - "~>"
201
201
  - !ruby/object:Gem::Version
202
202
  version: '1.3'
203
203
  - !ruby/object:Gem::Dependency
204
204
  name: vcr
205
205
  requirement: !ruby/object:Gem::Requirement
206
206
  requirements:
207
- - - ~>
207
+ - - "~>"
208
208
  - !ruby/object:Gem::Version
209
209
  version: 2.9.0
210
210
  type: :development
211
211
  prerelease: false
212
212
  version_requirements: !ruby/object:Gem::Requirement
213
213
  requirements:
214
- - - ~>
214
+ - - "~>"
215
215
  - !ruby/object:Gem::Version
216
216
  version: 2.9.0
217
217
  - !ruby/object:Gem::Dependency
218
218
  name: webmock
219
219
  requirement: !ruby/object:Gem::Requirement
220
220
  requirements:
221
- - - ~>
221
+ - - "~>"
222
222
  - !ruby/object:Gem::Version
223
223
  version: 1.20.0
224
224
  type: :development
225
225
  prerelease: false
226
226
  version_requirements: !ruby/object:Gem::Requirement
227
227
  requirements:
228
- - - ~>
228
+ - - "~>"
229
229
  - !ruby/object:Gem::Version
230
230
  version: 1.20.0
231
231
  - !ruby/object:Gem::Dependency
232
232
  name: coveralls
233
233
  requirement: !ruby/object:Gem::Requirement
234
234
  requirements:
235
- - - ! '>='
235
+ - - ">="
236
236
  - !ruby/object:Gem::Version
237
237
  version: '0'
238
238
  type: :development
239
239
  prerelease: false
240
240
  version_requirements: !ruby/object:Gem::Requirement
241
241
  requirements:
242
- - - ! '>='
242
+ - - ">="
243
243
  - !ruby/object:Gem::Version
244
244
  version: '0'
245
- description: ! "\n An easy to use distributed web-crawler framework based on Redis\n
245
+ - !ruby/object:Gem::Dependency
246
+ name: pry
247
+ requirement: !ruby/object:Gem::Requirement
248
+ requirements:
249
+ - - ">="
250
+ - !ruby/object:Gem::Version
251
+ version: '0'
252
+ type: :development
253
+ prerelease: false
254
+ version_requirements: !ruby/object:Gem::Requirement
255
+ requirements:
256
+ - - ">="
257
+ - !ruby/object:Gem::Version
258
+ version: '0'
259
+ description: "\n An easy to use distributed web-crawler framework based on Redis\n
246
260
  \ "
247
261
  email:
248
262
  - francesco.laurita@gmail.com
@@ -250,12 +264,12 @@ executables: []
250
264
  extensions: []
251
265
  extra_rdoc_files: []
252
266
  files:
253
- - .document
254
- - .gitignore
255
- - .rspec
256
- - .rubocop.yml
257
- - .rubocop_todo.yml
258
- - .travis.yml
267
+ - ".document"
268
+ - ".gitignore"
269
+ - ".rspec"
270
+ - ".rubocop.yml"
271
+ - ".rubocop_todo.yml"
272
+ - ".travis.yml"
259
273
  - AUTHORS.md
260
274
  - CHANGELOG.md
261
275
  - Gemfile
@@ -330,17 +344,17 @@ require_paths:
330
344
  - lib
331
345
  required_ruby_version: !ruby/object:Gem::Requirement
332
346
  requirements:
333
- - - ! '>='
347
+ - - ">="
334
348
  - !ruby/object:Gem::Version
335
349
  version: '0'
336
350
  required_rubygems_version: !ruby/object:Gem::Requirement
337
351
  requirements:
338
- - - ! '>='
352
+ - - ">="
339
353
  - !ruby/object:Gem::Version
340
354
  version: '0'
341
355
  requirements: []
342
356
  rubyforge_project: polipus
343
- rubygems_version: 2.4.5
357
+ rubygems_version: 2.2.2
344
358
  signing_key:
345
359
  specification_version: 4
346
360
  summary: Polipus distributed web-crawler framework
@@ -371,4 +385,3 @@ test_files:
371
385
  - spec/polipus/url_tracker_spec.rb
372
386
  - spec/polipus_spec.rb
373
387
  - spec/spec_helper.rb
374
- has_rdoc: