polipus 0.2.2 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/spec/http_spec.rb CHANGED
@@ -12,6 +12,7 @@ describe Polipus::HTTP do
12
12
  page.should be_an_instance_of(Polipus::Page)
13
13
  page.doc.search("title").text.strip.should eq "SF bay area apts/housing for rent classifieds - craigslist"
14
14
  page.fetched_at.should_not be_nil
15
+ page.fetched?.should be_true
15
16
  end
16
17
  end
17
18
 
@@ -52,16 +53,22 @@ describe Polipus::HTTP do
52
53
  end
53
54
 
54
55
 
55
- describe 'gzipped content handling' do
56
+ describe 'compressed content handling' do
56
57
 
57
58
  it 'should decode gzip content' do
58
59
  VCR.use_cassette('gzipped_on') do
59
- http = Polipus::HTTP.new(gzip_enabled: true, logger: Logger.new(STDOUT))
60
+ http = Polipus::HTTP.new(logger: Logger.new(STDOUT))
60
61
  page = http.fetch_page("http://www.whatsmyip.org/http-compression-test/")
61
62
  page.doc.css('.gzip_yes').should_not be_empty
62
63
  end
63
64
  end
64
65
 
66
+ it 'should decode deflate content' do # NOTE(review): not wrapped in VCR.use_cassette, unlike the gzip example above — presumably hits the live site; confirm
67
+ http = Polipus::HTTP.new(logger: Logger.new(STDOUT))
68
+ page = http.fetch_page("http://david.fullrecall.com/browser-http-compression-test?compression=deflate-http")
69
+ page.headers.fetch('content-encoding').first.should eq 'deflate' # first content-encoding header value must be 'deflate'
70
+ page.body.include?("deflate-http").should be_true # body must contain the literal marker text "deflate-http"
71
+ end
65
72
 
66
73
  end
67
74
 
@@ -82,6 +89,29 @@ describe Polipus::HTTP do
82
89
  http.connections['www.yahoo.com'][443].should_not be old_conn
83
90
  end
84
91
  end
92
+
93
+ end
94
+
95
+ describe 'cookies' do
96
+
97
+ it 'should handle cookies correctly' do
98
+ VCR.use_cassette('http_cookies') do
99
+ http = Polipus::HTTP.new(accept_cookies: true)
100
+ http.fetch_page "http://www.whatarecookies.com/cookietest.asp"
101
+ http.accept_cookies?.should be_true
102
+ http.cookie_jar.cookies(URI("http://www.whatarecookies.com/cookietest.asp")).should_not be_empty
103
+ end
104
+ end
105
+
106
+ end
107
+
108
+ describe 'net errors' do
109
+ it 'should handle net errors correctly' do
110
+ VCR.use_cassette('http_errors') do
111
+ http = Polipus::HTTP.new(open_timeout:1, read_timeout: 1)
112
+ http.fetch_page("http://www.wrong-domain.lol/").error.should_not be_nil
113
+ end
114
+ end
85
115
  end
86
116
 
87
117
  end
data/spec/page_spec.rb CHANGED
@@ -48,5 +48,24 @@ EOF
48
48
  end
49
49
  end
50
50
 
51
+ context 'page error' do
52
+
53
+ let(:page) do
54
+ Polipus::Page.new 'http://www.google.com/', error: 'an error'
55
+ end
56
+
57
+ it 'should serialize an error' do
58
+ page.to_hash['error'].should eq 'an error'
59
+ end
60
+
61
+ end
62
+
63
+ context 'page code' do
64
+ it 'should identify HTTPSuccess code' do
65
+ Polipus::Page.new('http://www.google.com/', code: 201).success?.should be_true
66
+ Polipus::Page.new('http://www.google.com/', code: 404).success?.should be_false
67
+ end
68
+
69
+ end
51
70
 
52
71
  end
data/spec/polipus_spec.rb CHANGED
@@ -73,5 +73,23 @@ describe Polipus::PolipusCrawler do
73
73
  cache_hit["http://rubygems.org/gems"].should be 2
74
74
  end
75
75
 
76
+ it "should call on_page_error code blocks when a page has error" do
77
+ crawler = Polipus::PolipusCrawler.new("polipus-rspec", ["http://dasd.adad.dom/"], p_options.merge(open_timeout: 1, read_timeout: 1))
78
+ failed_page = nil # assigned by the on_page_error callback registered below
79
+ crawler.on_page_error { |page| failed_page = page }
80
+ crawler.takeover
81
+ failed_page.should_not be_nil
82
+ failed_page.error.should_not be_nil
83
+ end
84
+
85
+ it "should obey to the robots.txt file" do
86
+ lopt = p_options
87
+ lopt[:obey_robots_txt] = true
88
+ polipus = Polipus::PolipusCrawler.new("polipus-rspec", ["https://rubygems.org/gems/polipus"], lopt)
89
+ polipus.depth_limit = 1
90
+ polipus.takeover
91
+ polipus.storage.each {|id, page| (page.url.path =~ /$\/downloads\//).should be_false}
92
+ end
93
+
76
94
  end
77
95
  end
@@ -0,0 +1,86 @@
1
+ require 'spec_helper'
2
+ require "polipus/robotex"
3
+ describe Polipus::Robotex do
4
+ let(:spec_domain){"http://www.example.com/"}
5
+ before(:each) do
6
+ robots = <<-END
7
+ User-Agent: msnbot
8
+ Crawl-Delay: 20
9
+
10
+ User-Agent: bender
11
+ Disallow: /my_shiny_metal_ass
12
+
13
+ User-Agent: *
14
+ Disallow: /login
15
+ Allow: /
16
+
17
+ Disallow: /locked
18
+ Allow: /locked
19
+ END
20
+ stub_request(:get, 'http://www.example.com/robots.txt')
21
+ .to_return(:body => robots, :status => [200, "OK"], :headers => { "Content-Type" => 'text/plain' })
22
+ end
23
+
24
+
25
+ describe '#initialize' do
26
+ context 'when no arguments are supplied' do
27
+ it 'returns a Robotex with the default user-agent' do
28
+ Polipus::Robotex.new.user_agent.should == "Robotex/#{Polipus::Robotex::VERSION} (http://www.github.com/chriskite/robotex)"
29
+ end
30
+ end
31
+
32
+ context 'when a user-agent is specified' do
33
+ it 'returns a Robotex with the specified user-agent' do
34
+ ua = 'My User Agent'
35
+ Polipus::Robotex.new(ua).user_agent.should == ua
36
+ end
37
+ end
38
+ end
39
+
40
+ describe '#allowed?' do
41
+ context 'when the robots.txt disallows the user-agent to the url' do
42
+ it 'returns false' do
43
+ robotex = Polipus::Robotex.new('bender')
44
+ robotex.allowed?(spec_domain + 'my_shiny_metal_ass').should be_false
45
+ end
46
+ end
47
+
48
+ context 'when the robots.txt disallows the user-agent to some urls, but allows this one' do
49
+ it 'returns true' do
50
+ robotex = Polipus::Robotex.new('bender')
51
+ robotex.allowed?(spec_domain + 'cigars').should be_true
52
+ end
53
+ end
54
+
55
+ context 'when the robots.txt disallows any user-agent to the url' do
56
+ it 'returns false' do
57
+ robotex = Polipus::Robotex.new
58
+ robotex.allowed?(spec_domain + 'login').should be_false
59
+ end
60
+ end
61
+
62
+ context 'when the robots.txt disallows and then allows the url' do
63
+ it 'returns false' do
64
+ robotex = Polipus::Robotex.new
65
+ robotex.allowed?(spec_domain + 'locked').should be_false
66
+ end
67
+ end
68
+ end
69
+
70
+ describe '#delay' do
71
+ context 'when no Crawl-Delay is specified for the user-agent' do
72
+ it 'returns nil' do
73
+ robotex = Polipus::Robotex.new
74
+ robotex.delay(spec_domain).should be_nil
75
+ end
76
+
77
+ context 'when Crawl-Delay is specified for the user-agent' do
78
+ it 'returns the delay as a Fixnum' do
79
+ robotex = Polipus::Robotex.new('msnbot')
80
+ robotex.delay(spec_domain).should == 20
81
+ end
82
+ end
83
+ end
84
+ end
85
+
86
+ end
data/spec/spec_helper.rb CHANGED
@@ -6,6 +6,7 @@
6
6
  # See http://rubydoc.info/gems/rspec-core/RSpec/Core/Configuration
7
7
  require "digest/md5"
8
8
  require "coveralls"
9
+ require "webmock/rspec"
9
10
 
10
11
  Coveralls.wear!
11
12
 
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: polipus
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.2
4
+ version: 0.3.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Francesco Laurita
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2014-05-23 00:00:00.000000000 Z
11
+ date: 2014-06-02 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: redis-bloomfilter
@@ -316,14 +316,16 @@ files:
316
316
  - .gitignore
317
317
  - .rspec
318
318
  - .travis.yml
319
- - AUTHORS
319
+ - AUTHORS.md
320
+ - CHANGELOG.md
320
321
  - Gemfile
321
322
  - LICENSE.txt
322
323
  - README.md
323
- - README.rdoc
324
324
  - Rakefile
325
325
  - examples/basic.rb
326
+ - examples/error_handling.rb
326
327
  - examples/incremental.rb
328
+ - examples/robots_txt_handling.rb
327
329
  - examples/survival.rb
328
330
  - lib/polipus.rb
329
331
  - lib/polipus/http.rb
@@ -338,6 +340,7 @@ files:
338
340
  - lib/polipus/queue_overflow/manager.rb
339
341
  - lib/polipus/queue_overflow/mongo_queue.rb
340
342
  - lib/polipus/queue_overflow/mongo_queue_capped.rb
343
+ - lib/polipus/robotex.rb
341
344
  - lib/polipus/storage.rb
342
345
  - lib/polipus/storage/base.rb
343
346
  - lib/polipus/storage/dev_null.rb
@@ -350,6 +353,7 @@ files:
350
353
  - lib/polipus/version.rb
351
354
  - polipus.gemspec
352
355
  - spec/cassettes/08b228db424a926e1ed6ab63b38d847e.yml
356
+ - spec/cassettes/11c3eb8bf35dfc179dc5ce44f6f5f458.yml
353
357
  - spec/cassettes/1f6e1d7743ecaa86594b4e68a6462689.yml
354
358
  - spec/cassettes/20aa41f181b49f00078c3ca30bad5afe.yml
355
359
  - spec/cassettes/4640919145753505af2d0f8423de37f3.yml
@@ -361,10 +365,12 @@ files:
361
365
  - spec/cassettes/ab333f89535a2efb284913fede6aa7c7.yml
362
366
  - spec/cassettes/ae5d7cffde3f53122cdf79f3d1367e8e.yml
363
367
  - spec/cassettes/b389efd1dcb8f09393b5aae1627c2a83.yml
368
+ - spec/cassettes/bc6fb220895689be7eeb05b09969a18d.yml
364
369
  - spec/cassettes/c5ce68499027d490adfbb6e5541881e4.yml
365
370
  - spec/cassettes/ce16b11a7df0b70fe90c7f90063fdb8c.yml
366
371
  - spec/cassettes/ffe3d588b6df4b9de35e5a7ccaf5a81b.yml
367
372
  - spec/cassettes/gzipped_on.yml
373
+ - spec/cassettes/http_cookies.yml
368
374
  - spec/cassettes/http_tconnection_max_hits.yml
369
375
  - spec/cassettes/http_test.yml
370
376
  - spec/cassettes/http_test_redirect.yml
@@ -374,6 +380,7 @@ files:
374
380
  - spec/polipus_spec.rb
375
381
  - spec/queue_overflow_manager_spec.rb
376
382
  - spec/queue_overflow_spec.rb
383
+ - spec/robotex_spec.rb
377
384
  - spec/spec_helper.rb
378
385
  - spec/storage_memory_spec.rb
379
386
  - spec/storage_mongo_spec.rb
@@ -405,6 +412,7 @@ specification_version: 4
405
412
  summary: Polipus distributed web-crawler framework
406
413
  test_files:
407
414
  - spec/cassettes/08b228db424a926e1ed6ab63b38d847e.yml
415
+ - spec/cassettes/11c3eb8bf35dfc179dc5ce44f6f5f458.yml
408
416
  - spec/cassettes/1f6e1d7743ecaa86594b4e68a6462689.yml
409
417
  - spec/cassettes/20aa41f181b49f00078c3ca30bad5afe.yml
410
418
  - spec/cassettes/4640919145753505af2d0f8423de37f3.yml
@@ -416,10 +424,12 @@ test_files:
416
424
  - spec/cassettes/ab333f89535a2efb284913fede6aa7c7.yml
417
425
  - spec/cassettes/ae5d7cffde3f53122cdf79f3d1367e8e.yml
418
426
  - spec/cassettes/b389efd1dcb8f09393b5aae1627c2a83.yml
427
+ - spec/cassettes/bc6fb220895689be7eeb05b09969a18d.yml
419
428
  - spec/cassettes/c5ce68499027d490adfbb6e5541881e4.yml
420
429
  - spec/cassettes/ce16b11a7df0b70fe90c7f90063fdb8c.yml
421
430
  - spec/cassettes/ffe3d588b6df4b9de35e5a7ccaf5a81b.yml
422
431
  - spec/cassettes/gzipped_on.yml
432
+ - spec/cassettes/http_cookies.yml
423
433
  - spec/cassettes/http_tconnection_max_hits.yml
424
434
  - spec/cassettes/http_test.yml
425
435
  - spec/cassettes/http_test_redirect.yml
@@ -429,6 +439,7 @@ test_files:
429
439
  - spec/polipus_spec.rb
430
440
  - spec/queue_overflow_manager_spec.rb
431
441
  - spec/queue_overflow_spec.rb
442
+ - spec/robotex_spec.rb
432
443
  - spec/spec_helper.rb
433
444
  - spec/storage_memory_spec.rb
434
445
  - spec/storage_mongo_spec.rb
data/AUTHORS DELETED
@@ -1,2 +0,0 @@
1
- Francesco Laurita <francesco.laurita@gmail.com>
2
- Tobias L. Maier <http://tobiasmaier.info/>
data/README.rdoc DELETED
@@ -1,3 +0,0 @@
1
- = polipus
2
-
3
- Visit https://github.com/taganaka/polipus for further details.