polipus 0.2.2 → 0.3.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +8 -8
- data/AUTHORS.md +4 -0
- data/CHANGELOG.md +20 -0
- data/Gemfile +2 -2
- data/examples/error_handling.rb +22 -0
- data/examples/robots_txt_handling.rb +13 -0
- data/lib/polipus.rb +54 -21
- data/lib/polipus/http.rb +29 -14
- data/lib/polipus/page.rb +15 -6
- data/lib/polipus/robotex.rb +154 -0
- data/lib/polipus/version.rb +1 -1
- data/spec/cassettes/11c3eb8bf35dfc179dc5ce44f6f5f458.yml +6144 -0
- data/spec/cassettes/bc6fb220895689be7eeb05b09969a18d.yml +61 -0
- data/spec/cassettes/gzipped_on.yml +80 -70
- data/spec/cassettes/http_cookies.yml +133 -0
- data/spec/cassettes/http_tconnection_max_hits.yml +4091 -7461
- data/spec/http_spec.rb +32 -2
- data/spec/page_spec.rb +19 -0
- data/spec/polipus_spec.rb +18 -0
- data/spec/robotex_spec.rb +86 -0
- data/spec/spec_helper.rb +1 -0
- metadata +15 -4
- data/AUTHORS +0 -2
- data/README.rdoc +0 -3
data/spec/http_spec.rb
CHANGED
@@ -12,6 +12,7 @@ describe Polipus::HTTP do
|
|
12
12
|
page.should be_an_instance_of(Polipus::Page)
|
13
13
|
page.doc.search("title").text.strip.should eq "SF bay area apts/housing for rent classifieds - craigslist"
|
14
14
|
page.fetched_at.should_not be_nil
|
15
|
+
page.fetched?.should be_true
|
15
16
|
end
|
16
17
|
end
|
17
18
|
|
@@ -52,16 +53,22 @@ describe Polipus::HTTP do
|
|
52
53
|
end
|
53
54
|
|
54
55
|
|
55
|
-
describe '
|
56
|
+
describe 'compressed content handling' do
|
56
57
|
|
57
58
|
it 'should decode gzip content' do
|
58
59
|
VCR.use_cassette('gzipped_on') do
|
59
|
-
http = Polipus::HTTP.new(
|
60
|
+
http = Polipus::HTTP.new(logger: Logger.new(STDOUT))
|
60
61
|
page = http.fetch_page("http://www.whatsmyip.org/http-compression-test/")
|
61
62
|
page.doc.css('.gzip_yes').should_not be_empty
|
62
63
|
end
|
63
64
|
end
|
64
65
|
|
66
|
+
it 'should decode deflate content' do
|
67
|
+
http = Polipus::HTTP.new(logger: Logger.new(STDOUT))
|
68
|
+
page = http.fetch_page("http://david.fullrecall.com/browser-http-compression-test?compression=deflate-http")
|
69
|
+
page.headers.fetch('content-encoding').first.should eq 'deflate'
|
70
|
+
page.body.include?("deflate-http").should be_true
|
71
|
+
end
|
65
72
|
|
66
73
|
end
|
67
74
|
|
@@ -82,6 +89,29 @@ describe Polipus::HTTP do
|
|
82
89
|
http.connections['www.yahoo.com'][443].should_not be old_conn
|
83
90
|
end
|
84
91
|
end
|
92
|
+
|
93
|
+
end
|
94
|
+
|
95
|
+
describe 'cookies' do
|
96
|
+
|
97
|
+
it 'should handle cookies correctly' do
|
98
|
+
VCR.use_cassette('http_cookies') do
|
99
|
+
http = Polipus::HTTP.new(accept_cookies: true)
|
100
|
+
http.fetch_page "http://www.whatarecookies.com/cookietest.asp"
|
101
|
+
http.accept_cookies?.should be_true
|
102
|
+
http.cookie_jar.cookies(URI("http://www.whatarecookies.com/cookietest.asp")).should_not be_empty
|
103
|
+
end
|
104
|
+
end
|
105
|
+
|
106
|
+
end
|
107
|
+
|
108
|
+
describe 'net errors' do
|
109
|
+
it 'should handle net errors correctly' do
|
110
|
+
VCR.use_cassette('http_errors') do
|
111
|
+
http = Polipus::HTTP.new(open_timeout:1, read_timeout: 1)
|
112
|
+
http.fetch_page("http://www.wrong-domain.lol/").error.should_not be_nil
|
113
|
+
end
|
114
|
+
end
|
85
115
|
end
|
86
116
|
|
87
117
|
end
|
data/spec/page_spec.rb
CHANGED
@@ -48,5 +48,24 @@ EOF
|
|
48
48
|
end
|
49
49
|
end
|
50
50
|
|
51
|
+
context 'page error' do
|
52
|
+
|
53
|
+
let(:page) do
|
54
|
+
Polipus::Page.new 'http://www.google.com/', error: 'an error'
|
55
|
+
end
|
56
|
+
|
57
|
+
it 'should serialize an error' do
|
58
|
+
page.to_hash['error'].should eq 'an error'
|
59
|
+
end
|
60
|
+
|
61
|
+
end
|
62
|
+
|
63
|
+
context 'page code' do
|
64
|
+
it 'should identify HTTPSuccess code' do
|
65
|
+
Polipus::Page.new('http://www.google.com/', code: 201).success?.should be_true
|
66
|
+
Polipus::Page.new('http://www.google.com/', code: 404).success?.should be_false
|
67
|
+
end
|
68
|
+
|
69
|
+
end
|
51
70
|
|
52
71
|
end
|
data/spec/polipus_spec.rb
CHANGED
@@ -73,5 +73,23 @@ describe Polipus::PolipusCrawler do
|
|
73
73
|
cache_hit["http://rubygems.org/gems"].should be 2
|
74
74
|
end
|
75
75
|
|
76
|
+
it "should call on_page_error code blocks when a page has error" do
|
77
|
+
p = Polipus::PolipusCrawler.new("polipus-rspec", ["http://dasd.adad.dom/"], p_options.merge(open_timeout:1, read_timeout: 1))
|
78
|
+
a_page = nil
|
79
|
+
p.on_page_error {|page| a_page = page}
|
80
|
+
p.takeover
|
81
|
+
a_page.should_not be_nil
|
82
|
+
a_page.error.should_not be_nil
|
83
|
+
end
|
84
|
+
|
85
|
+
it "should obey to the robots.txt file" do
|
86
|
+
lopt = p_options
|
87
|
+
lopt[:obey_robots_txt] = true
|
88
|
+
polipus = Polipus::PolipusCrawler.new("polipus-rspec", ["https://rubygems.org/gems/polipus"], lopt)
|
89
|
+
polipus.depth_limit = 1
|
90
|
+
polipus.takeover
|
91
|
+
polipus.storage.each {|id, page| (page.url.path =~ /$\/downloads\//).should be_false}
|
92
|
+
end
|
93
|
+
|
76
94
|
end
|
77
95
|
end
|
data/spec/robotex_spec.rb
ADDED
@@ -0,0 +1,86 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
require "polipus/robotex"
|
3
|
+
describe Polipus::Robotex do
|
4
|
+
let(:spec_domain){"http://www.example.com/"}
|
5
|
+
before(:each) do
|
6
|
+
robots = <<-END
|
7
|
+
User-Agent: msnbot
|
8
|
+
Crawl-Delay: 20
|
9
|
+
|
10
|
+
User-Agent: bender
|
11
|
+
Disallow: /my_shiny_metal_ass
|
12
|
+
|
13
|
+
User-Agent: *
|
14
|
+
Disallow: /login
|
15
|
+
Allow: /
|
16
|
+
|
17
|
+
Disallow: /locked
|
18
|
+
Allow: /locked
|
19
|
+
END
|
20
|
+
stub_request(:get, 'http://www.example.com/robots.txt')
|
21
|
+
.to_return(:body => robots, :status => [200, "OK"], :headers => { "Content-Type" => 'text/plain' })
|
22
|
+
end
|
23
|
+
|
24
|
+
|
25
|
+
describe '#initialize' do
|
26
|
+
context 'when no arguments are supplied' do
|
27
|
+
it 'returns a Robotex with the default user-agent' do
|
28
|
+
Polipus::Robotex.new.user_agent.should == "Robotex/#{Polipus::Robotex::VERSION} (http://www.github.com/chriskite/robotex)"
|
29
|
+
end
|
30
|
+
end
|
31
|
+
|
32
|
+
context 'when a user-agent is specified' do
|
33
|
+
it 'returns a Robotex with the specified user-agent' do
|
34
|
+
ua = 'My User Agent'
|
35
|
+
Polipus::Robotex.new(ua).user_agent.should == ua
|
36
|
+
end
|
37
|
+
end
|
38
|
+
end
|
39
|
+
|
40
|
+
describe '#allowed?' do
|
41
|
+
context 'when the robots.txt disallows the user-agent to the url' do
|
42
|
+
it 'returns false' do
|
43
|
+
robotex = Polipus::Robotex.new('bender')
|
44
|
+
robotex.allowed?(spec_domain + 'my_shiny_metal_ass').should be_false
|
45
|
+
end
|
46
|
+
end
|
47
|
+
|
48
|
+
context 'when the robots.txt disallows the user-agent to some urls, but allows this one' do
|
49
|
+
it 'returns true' do
|
50
|
+
robotex = Polipus::Robotex.new('bender')
|
51
|
+
robotex.allowed?(spec_domain + 'cigars').should be_true
|
52
|
+
end
|
53
|
+
end
|
54
|
+
|
55
|
+
context 'when the robots.txt disallows any user-agent to the url' do
|
56
|
+
it 'returns false' do
|
57
|
+
robotex = Polipus::Robotex.new
|
58
|
+
robotex.allowed?(spec_domain + 'login').should be_false
|
59
|
+
end
|
60
|
+
end
|
61
|
+
|
62
|
+
context 'when the robots.txt disallows and then allows the url' do
|
63
|
+
it 'returns false' do
|
64
|
+
robotex = Polipus::Robotex.new
|
65
|
+
robotex.allowed?(spec_domain + 'locked').should be_false
|
66
|
+
end
|
67
|
+
end
|
68
|
+
end
|
69
|
+
|
70
|
+
describe '#delay' do
|
71
|
+
context 'when no Crawl-Delay is specified for the user-agent' do
|
72
|
+
it 'returns nil' do
|
73
|
+
robotex = Polipus::Robotex.new
|
74
|
+
robotex.delay(spec_domain).should be_nil
|
75
|
+
end
|
76
|
+
|
77
|
+
context 'when Crawl-Delay is specified for the user-agent' do
|
78
|
+
it 'returns the delay as a Fixnum' do
|
79
|
+
robotex = Polipus::Robotex.new('msnbot')
|
80
|
+
robotex.delay(spec_domain).should == 20
|
81
|
+
end
|
82
|
+
end
|
83
|
+
end
|
84
|
+
end
|
85
|
+
|
86
|
+
end
|
data/spec/spec_helper.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: polipus
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.2
|
4
|
+
version: 0.3.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Francesco Laurita
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2014-
|
11
|
+
date: 2014-06-02 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: redis-bloomfilter
|
@@ -316,14 +316,16 @@ files:
|
|
316
316
|
- .gitignore
|
317
317
|
- .rspec
|
318
318
|
- .travis.yml
|
319
|
-
- AUTHORS
|
319
|
+
- AUTHORS.md
|
320
|
+
- CHANGELOG.md
|
320
321
|
- Gemfile
|
321
322
|
- LICENSE.txt
|
322
323
|
- README.md
|
323
|
-
- README.rdoc
|
324
324
|
- Rakefile
|
325
325
|
- examples/basic.rb
|
326
|
+
- examples/error_handling.rb
|
326
327
|
- examples/incremental.rb
|
328
|
+
- examples/robots_txt_handling.rb
|
327
329
|
- examples/survival.rb
|
328
330
|
- lib/polipus.rb
|
329
331
|
- lib/polipus/http.rb
|
@@ -338,6 +340,7 @@ files:
|
|
338
340
|
- lib/polipus/queue_overflow/manager.rb
|
339
341
|
- lib/polipus/queue_overflow/mongo_queue.rb
|
340
342
|
- lib/polipus/queue_overflow/mongo_queue_capped.rb
|
343
|
+
- lib/polipus/robotex.rb
|
341
344
|
- lib/polipus/storage.rb
|
342
345
|
- lib/polipus/storage/base.rb
|
343
346
|
- lib/polipus/storage/dev_null.rb
|
@@ -350,6 +353,7 @@ files:
|
|
350
353
|
- lib/polipus/version.rb
|
351
354
|
- polipus.gemspec
|
352
355
|
- spec/cassettes/08b228db424a926e1ed6ab63b38d847e.yml
|
356
|
+
- spec/cassettes/11c3eb8bf35dfc179dc5ce44f6f5f458.yml
|
353
357
|
- spec/cassettes/1f6e1d7743ecaa86594b4e68a6462689.yml
|
354
358
|
- spec/cassettes/20aa41f181b49f00078c3ca30bad5afe.yml
|
355
359
|
- spec/cassettes/4640919145753505af2d0f8423de37f3.yml
|
@@ -361,10 +365,12 @@ files:
|
|
361
365
|
- spec/cassettes/ab333f89535a2efb284913fede6aa7c7.yml
|
362
366
|
- spec/cassettes/ae5d7cffde3f53122cdf79f3d1367e8e.yml
|
363
367
|
- spec/cassettes/b389efd1dcb8f09393b5aae1627c2a83.yml
|
368
|
+
- spec/cassettes/bc6fb220895689be7eeb05b09969a18d.yml
|
364
369
|
- spec/cassettes/c5ce68499027d490adfbb6e5541881e4.yml
|
365
370
|
- spec/cassettes/ce16b11a7df0b70fe90c7f90063fdb8c.yml
|
366
371
|
- spec/cassettes/ffe3d588b6df4b9de35e5a7ccaf5a81b.yml
|
367
372
|
- spec/cassettes/gzipped_on.yml
|
373
|
+
- spec/cassettes/http_cookies.yml
|
368
374
|
- spec/cassettes/http_tconnection_max_hits.yml
|
369
375
|
- spec/cassettes/http_test.yml
|
370
376
|
- spec/cassettes/http_test_redirect.yml
|
@@ -374,6 +380,7 @@ files:
|
|
374
380
|
- spec/polipus_spec.rb
|
375
381
|
- spec/queue_overflow_manager_spec.rb
|
376
382
|
- spec/queue_overflow_spec.rb
|
383
|
+
- spec/robotex_spec.rb
|
377
384
|
- spec/spec_helper.rb
|
378
385
|
- spec/storage_memory_spec.rb
|
379
386
|
- spec/storage_mongo_spec.rb
|
@@ -405,6 +412,7 @@ specification_version: 4
|
|
405
412
|
summary: Polipus distributed web-crawler framework
|
406
413
|
test_files:
|
407
414
|
- spec/cassettes/08b228db424a926e1ed6ab63b38d847e.yml
|
415
|
+
- spec/cassettes/11c3eb8bf35dfc179dc5ce44f6f5f458.yml
|
408
416
|
- spec/cassettes/1f6e1d7743ecaa86594b4e68a6462689.yml
|
409
417
|
- spec/cassettes/20aa41f181b49f00078c3ca30bad5afe.yml
|
410
418
|
- spec/cassettes/4640919145753505af2d0f8423de37f3.yml
|
@@ -416,10 +424,12 @@ test_files:
|
|
416
424
|
- spec/cassettes/ab333f89535a2efb284913fede6aa7c7.yml
|
417
425
|
- spec/cassettes/ae5d7cffde3f53122cdf79f3d1367e8e.yml
|
418
426
|
- spec/cassettes/b389efd1dcb8f09393b5aae1627c2a83.yml
|
427
|
+
- spec/cassettes/bc6fb220895689be7eeb05b09969a18d.yml
|
419
428
|
- spec/cassettes/c5ce68499027d490adfbb6e5541881e4.yml
|
420
429
|
- spec/cassettes/ce16b11a7df0b70fe90c7f90063fdb8c.yml
|
421
430
|
- spec/cassettes/ffe3d588b6df4b9de35e5a7ccaf5a81b.yml
|
422
431
|
- spec/cassettes/gzipped_on.yml
|
432
|
+
- spec/cassettes/http_cookies.yml
|
423
433
|
- spec/cassettes/http_tconnection_max_hits.yml
|
424
434
|
- spec/cassettes/http_test.yml
|
425
435
|
- spec/cassettes/http_test_redirect.yml
|
@@ -429,6 +439,7 @@ test_files:
|
|
429
439
|
- spec/polipus_spec.rb
|
430
440
|
- spec/queue_overflow_manager_spec.rb
|
431
441
|
- spec/queue_overflow_spec.rb
|
442
|
+
- spec/robotex_spec.rb
|
432
443
|
- spec/spec_helper.rb
|
433
444
|
- spec/storage_memory_spec.rb
|
434
445
|
- spec/storage_mongo_spec.rb
|
data/AUTHORS
DELETED
data/README.rdoc
DELETED