polipus 0.2.2 → 0.3.0

data/spec/http_spec.rb CHANGED
@@ -12,6 +12,7 @@ describe Polipus::HTTP do
  page.should be_an_instance_of(Polipus::Page)
  page.doc.search("title").text.strip.should eq "SF bay area apts/housing for rent classifieds - craigslist"
  page.fetched_at.should_not be_nil
+ page.fetched?.should be_true
  end
  end
 
@@ -52,16 +53,22 @@ describe Polipus::HTTP do
  end
 
 
- describe 'gzipped content handling' do
+ describe 'compressed content handling' do
 
  it 'should decode gzip content' do
  VCR.use_cassette('gzipped_on') do
- http = Polipus::HTTP.new(gzip_enabled: true, logger: Logger.new(STDOUT))
+ http = Polipus::HTTP.new(logger: Logger.new(STDOUT))
  page = http.fetch_page("http://www.whatsmyip.org/http-compression-test/")
  page.doc.css('.gzip_yes').should_not be_empty
  end
  end
 
+ it 'should decode deflate content' do
+ http = Polipus::HTTP.new(logger: Logger.new(STDOUT))
+ page = http.fetch_page("http://david.fullrecall.com/browser-http-compression-test?compression=deflate-http")
+ page.headers.fetch('content-encoding').first.should eq 'deflate'
+ page.body.include?("deflate-http").should be_true
+ end
 
  end
 
@@ -82,6 +89,29 @@ describe Polipus::HTTP do
  http.connections['www.yahoo.com'][443].should_not be old_conn
  end
  end
+
+ end
+
+ describe 'cookies' do
+
+ it 'should handle cookies correctly' do
+ VCR.use_cassette('http_cookies') do
+ http = Polipus::HTTP.new(accept_cookies: true)
+ http.fetch_page "http://www.whatarecookies.com/cookietest.asp"
+ http.accept_cookies?.should be_true
+ http.cookie_jar.cookies(URI("http://www.whatarecookies.com/cookietest.asp")).should_not be_empty
+ end
+ end
+
+ end
+
+ describe 'net errors' do
+ it 'should handle net errors correctly' do
+ VCR.use_cassette('http_errors') do
+ http = Polipus::HTTP.new(open_timeout:1, read_timeout: 1)
+ http.fetch_page("http://www.wrong-domain.lol/").error.should_not be_nil
+ end
+ end
  end
 
  end
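
Taken together, the updated specs describe the 0.3.0 fetcher: gzip and deflate responses are decoded transparently (the old gzip_enabled flag is gone), cookies can be accepted and kept in a cookie jar, and network failures are attached to the returned page instead of raising. A minimal sketch of that behaviour, using only option names and methods that appear in the spec above (URLs are illustrative):

  require 'polipus'

  # Option names (accept_cookies, open_timeout, read_timeout) are taken from the spec above.
  http = Polipus::HTTP.new(accept_cookies: true, open_timeout: 1, read_timeout: 1)

  page = http.fetch_page("http://www.whatsmyip.org/http-compression-test/")
  puts page.fetched?                                      # true once a response was received
  puts http.cookie_jar.cookies(URI(page.url.to_s)).size   # cookies collected so far

  # A DNS error or timeout does not raise; it is surfaced on the page object.
  bad = http.fetch_page("http://www.wrong-domain.lol/")
  puts bad.error unless bad.error.nil?
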
data/spec/page_spec.rb CHANGED
@@ -48,5 +48,24 @@ EOF
  end
  end
 
+ context 'page error' do
+
+ let(:page) do
+ Polipus::Page.new 'http://www.google.com/', error: 'an error'
+ end
+
+ it 'should serialize an error' do
+ page.to_hash['error'].should eq 'an error'
+ end
+
+ end
+
+ context 'page code' do
+ it 'should identify HTTPSuccess code' do
+ Polipus::Page.new('http://www.google.com/', code: 201).success?.should be_true
+ Polipus::Page.new('http://www.google.com/', code: 404).success?.should be_false
+ end
+
+ end
 
  end
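
The new Page specs pin down two small pieces of behaviour: an error attached to a page survives serialization through to_hash, and success? reports whether the stored HTTP code is a 2xx. A short sketch using only constructors and methods shown in the spec (URLs are placeholders):

  require 'polipus'

  # success? follows HTTP success semantics: any 2xx code counts.
  Polipus::Page.new('http://www.example.com/', code: 201).success?   # => true
  Polipus::Page.new('http://www.example.com/', code: 404).success?   # => false

  # An error carried by a page is preserved when the page is serialized.
  page = Polipus::Page.new('http://www.example.com/', error: 'an error')
  page.to_hash['error']                                              # => "an error"
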
data/spec/polipus_spec.rb CHANGED
@@ -73,5 +73,23 @@ describe Polipus::PolipusCrawler do
  cache_hit["http://rubygems.org/gems"].should be 2
  end
 
+ it "should call on_page_error code blocks when a page has error" do
+ p = Polipus::PolipusCrawler.new("polipus-rspec", ["http://dasd.adad.dom/"], p_options.merge(open_timeout:1, read_timeout: 1))
+ a_page = nil
+ p.on_page_error {|page| a_page = page}
+ p.takeover
+ a_page.should_not be_nil
+ a_page.error.should_not be_nil
+ end
+
+ it "should obey to the robots.txt file" do
+ lopt = p_options
+ lopt[:obey_robots_txt] = true
+ polipus = Polipus::PolipusCrawler.new("polipus-rspec", ["https://rubygems.org/gems/polipus"], lopt)
+ polipus.depth_limit = 1
+ polipus.takeover
+ polipus.storage.each {|id, page| (page.url.path =~ /$\/downloads\//).should be_false}
+ end
+
  end
  end
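
These two new crawler specs cover the headline 0.3.0 features: an on_page_error callback that fires when a page could not be fetched, and an obey_robots_txt option that keeps disallowed paths out of storage. A rough usage sketch built from the calls in the spec above; it assumes the same kind of Redis/MongoDB-backed options the spec passes as p_options, and the job name and seed URL are illustrative:

  require 'polipus'

  options = {
    obey_robots_txt: true,   # skip URLs disallowed by the site's robots.txt
    open_timeout: 1,         # fail fast on unreachable hosts
    read_timeout: 1
  }

  crawler = Polipus::PolipusCrawler.new("example-job", ["https://rubygems.org/gems/polipus"], options)
  crawler.depth_limit = 1

  # Pages that fail (DNS errors, timeouts, ...) are reported here instead of raising.
  crawler.on_page_error do |page|
    puts "#{page.url} failed: #{page.error}"
  end

  crawler.takeover
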
data/spec/robotex_spec.rb ADDED
@@ -0,0 +1,86 @@
+ require 'spec_helper'
+ require "polipus/robotex"
+ describe Polipus::Robotex do
+ let(:spec_domain){"http://www.example.com/"}
+ before(:each) do
+ robots = <<-END
+ User-Agent: msnbot
+ Crawl-Delay: 20
+
+ User-Agent: bender
+ Disallow: /my_shiny_metal_ass
+
+ User-Agent: *
+ Disallow: /login
+ Allow: /
+
+ Disallow: /locked
+ Allow: /locked
+ END
+ stub_request(:get, 'http://www.example.com/robots.txt')
+ .to_return(:body => robots, :status => [200, "OK"], :headers => { "Content-Type" => 'text/plain' })
+ end
+
+
+ describe '#initialize' do
+ context 'when no arguments are supplied' do
+ it 'returns a Robotex with the default user-agent' do
+ Polipus::Robotex.new.user_agent.should == "Robotex/#{Polipus::Robotex::VERSION} (http://www.github.com/chriskite/robotex)"
+ end
+ end
+
+ context 'when a user-agent is specified' do
+ it 'returns a Robotex with the specified user-agent' do
+ ua = 'My User Agent'
+ Polipus::Robotex.new(ua).user_agent.should == ua
+ end
+ end
+ end
+
+ describe '#allowed?' do
+ context 'when the robots.txt disallows the user-agent to the url' do
+ it 'returns false' do
+ robotex = Polipus::Robotex.new('bender')
+ robotex.allowed?(spec_domain + 'my_shiny_metal_ass').should be_false
+ end
+ end
+
+ context 'when the robots.txt disallows the user-agent to some urls, but allows this one' do
+ it 'returns true' do
+ robotex = Polipus::Robotex.new('bender')
+ robotex.allowed?(spec_domain + 'cigars').should be_true
+ end
+ end
+
+ context 'when the robots.txt disallows any user-agent to the url' do
+ it 'returns false' do
+ robotex = Polipus::Robotex.new
+ robotex.allowed?(spec_domain + 'login').should be_false
+ end
+ end
+
+ context 'when the robots.txt disallows and then allows the url' do
+ it 'returns false' do
+ robotex = Polipus::Robotex.new
+ robotex.allowed?(spec_domain + 'locked').should be_false
+ end
+ end
+ end
+
+ describe '#delay' do
+ context 'when no Crawl-Delay is specified for the user-agent' do
+ it 'returns nil' do
+ robotex = Polipus::Robotex.new
+ robotex.delay(spec_domain).should be_nil
+ end
+
+ context 'when Crawl-Delay is specified for the user-agent' do
+ it 'returns the delay as a Fixnum' do
+ robotex = Polipus::Robotex.new('msnbot')
+ robotex.delay(spec_domain).should == 20
+ end
+ end
+ end
+ end
+
+ end
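
Polipus::Robotex, added here under lib/polipus/robotex.rb, appears to be a vendored copy of the robotex gem and presumably backs the new obey_robots_txt option. As the spec shows, it fetches a site's robots.txt and then answers allowed? for individual URLs and delay for the Crawl-Delay directive. A small standalone sketch using the same API (the user-agent string and URLs are illustrative):

  require 'polipus/robotex'

  bot = Polipus::Robotex.new('MyCrawler')   # per-user-agent rules are honoured

  url = 'http://www.example.com/some/page'
  if bot.allowed?(url)
    # Respect Crawl-Delay when the site declares one; delay returns nil otherwise.
    sleep(bot.delay(url) || 0)
    # ... fetch the page ...
  end
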
data/spec/spec_helper.rb CHANGED
@@ -6,6 +6,7 @@
  # See http://rubydoc.info/gems/rspec-core/RSpec/Core/Configuration
  require "digest/md5"
  require "coveralls"
+ require "webmock/rspec"
 
  Coveralls.wear!
 
metadata CHANGED
@@ -1,14 +1,14 @@
  --- !ruby/object:Gem::Specification
  name: polipus
  version: !ruby/object:Gem::Version
- version: 0.2.2
+ version: 0.3.0
  platform: ruby
  authors:
  - Francesco Laurita
  autorequire:
  bindir: bin
  cert_chain: []
- date: 2014-05-23 00:00:00.000000000 Z
+ date: 2014-06-02 00:00:00.000000000 Z
  dependencies:
  - !ruby/object:Gem::Dependency
  name: redis-bloomfilter
@@ -316,14 +316,16 @@ files:
  - .gitignore
  - .rspec
  - .travis.yml
- - AUTHORS
+ - AUTHORS.md
+ - CHANGELOG.md
  - Gemfile
  - LICENSE.txt
  - README.md
- - README.rdoc
  - Rakefile
  - examples/basic.rb
+ - examples/error_handling.rb
  - examples/incremental.rb
+ - examples/robots_txt_handling.rb
  - examples/survival.rb
  - lib/polipus.rb
  - lib/polipus/http.rb
@@ -338,6 +340,7 @@ files:
  - lib/polipus/queue_overflow/manager.rb
  - lib/polipus/queue_overflow/mongo_queue.rb
  - lib/polipus/queue_overflow/mongo_queue_capped.rb
+ - lib/polipus/robotex.rb
  - lib/polipus/storage.rb
  - lib/polipus/storage/base.rb
  - lib/polipus/storage/dev_null.rb
@@ -350,6 +353,7 @@ files:
  - lib/polipus/version.rb
  - polipus.gemspec
  - spec/cassettes/08b228db424a926e1ed6ab63b38d847e.yml
+ - spec/cassettes/11c3eb8bf35dfc179dc5ce44f6f5f458.yml
  - spec/cassettes/1f6e1d7743ecaa86594b4e68a6462689.yml
  - spec/cassettes/20aa41f181b49f00078c3ca30bad5afe.yml
  - spec/cassettes/4640919145753505af2d0f8423de37f3.yml
@@ -361,10 +365,12 @@ files:
  - spec/cassettes/ab333f89535a2efb284913fede6aa7c7.yml
  - spec/cassettes/ae5d7cffde3f53122cdf79f3d1367e8e.yml
  - spec/cassettes/b389efd1dcb8f09393b5aae1627c2a83.yml
+ - spec/cassettes/bc6fb220895689be7eeb05b09969a18d.yml
  - spec/cassettes/c5ce68499027d490adfbb6e5541881e4.yml
  - spec/cassettes/ce16b11a7df0b70fe90c7f90063fdb8c.yml
  - spec/cassettes/ffe3d588b6df4b9de35e5a7ccaf5a81b.yml
  - spec/cassettes/gzipped_on.yml
+ - spec/cassettes/http_cookies.yml
  - spec/cassettes/http_tconnection_max_hits.yml
  - spec/cassettes/http_test.yml
  - spec/cassettes/http_test_redirect.yml
@@ -374,6 +380,7 @@ files:
  - spec/polipus_spec.rb
  - spec/queue_overflow_manager_spec.rb
  - spec/queue_overflow_spec.rb
+ - spec/robotex_spec.rb
  - spec/spec_helper.rb
  - spec/storage_memory_spec.rb
  - spec/storage_mongo_spec.rb
@@ -405,6 +412,7 @@ specification_version: 4
  summary: Polipus distributed web-crawler framework
  test_files:
  - spec/cassettes/08b228db424a926e1ed6ab63b38d847e.yml
+ - spec/cassettes/11c3eb8bf35dfc179dc5ce44f6f5f458.yml
  - spec/cassettes/1f6e1d7743ecaa86594b4e68a6462689.yml
  - spec/cassettes/20aa41f181b49f00078c3ca30bad5afe.yml
  - spec/cassettes/4640919145753505af2d0f8423de37f3.yml
@@ -416,10 +424,12 @@ test_files:
  - spec/cassettes/ab333f89535a2efb284913fede6aa7c7.yml
  - spec/cassettes/ae5d7cffde3f53122cdf79f3d1367e8e.yml
  - spec/cassettes/b389efd1dcb8f09393b5aae1627c2a83.yml
+ - spec/cassettes/bc6fb220895689be7eeb05b09969a18d.yml
  - spec/cassettes/c5ce68499027d490adfbb6e5541881e4.yml
  - spec/cassettes/ce16b11a7df0b70fe90c7f90063fdb8c.yml
  - spec/cassettes/ffe3d588b6df4b9de35e5a7ccaf5a81b.yml
  - spec/cassettes/gzipped_on.yml
+ - spec/cassettes/http_cookies.yml
  - spec/cassettes/http_tconnection_max_hits.yml
  - spec/cassettes/http_test.yml
  - spec/cassettes/http_test_redirect.yml
@@ -429,6 +439,7 @@ test_files:
  - spec/polipus_spec.rb
  - spec/queue_overflow_manager_spec.rb
  - spec/queue_overflow_spec.rb
+ - spec/robotex_spec.rb
  - spec/spec_helper.rb
  - spec/storage_memory_spec.rb
  - spec/storage_mongo_spec.rb
data/AUTHORS DELETED
@@ -1,2 +0,0 @@
- Francesco Laurita <francesco.laurita@gmail.com>
- Tobias L. Maier <http://tobiasmaier.info/>
data/README.rdoc DELETED
@@ -1,3 +0,0 @@
- = polipus
-
- Visit https://github.com/taganaka/polipus for further details.