polipus 0.2.2 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +8 -8
- data/AUTHORS.md +4 -0
- data/CHANGELOG.md +20 -0
- data/Gemfile +2 -2
- data/examples/error_handling.rb +22 -0
- data/examples/robots_txt_handling.rb +13 -0
- data/lib/polipus.rb +54 -21
- data/lib/polipus/http.rb +29 -14
- data/lib/polipus/page.rb +15 -6
- data/lib/polipus/robotex.rb +154 -0
- data/lib/polipus/version.rb +1 -1
- data/spec/cassettes/11c3eb8bf35dfc179dc5ce44f6f5f458.yml +6144 -0
- data/spec/cassettes/bc6fb220895689be7eeb05b09969a18d.yml +61 -0
- data/spec/cassettes/gzipped_on.yml +80 -70
- data/spec/cassettes/http_cookies.yml +133 -0
- data/spec/cassettes/http_tconnection_max_hits.yml +4091 -7461
- data/spec/http_spec.rb +32 -2
- data/spec/page_spec.rb +19 -0
- data/spec/polipus_spec.rb +18 -0
- data/spec/robotex_spec.rb +86 -0
- data/spec/spec_helper.rb +1 -0
- metadata +15 -4
- data/AUTHORS +0 -2
- data/README.rdoc +0 -3
data/spec/http_spec.rb
CHANGED
@@ -12,6 +12,7 @@ describe Polipus::HTTP do
|
|
12
12
|
page.should be_an_instance_of(Polipus::Page)
|
13
13
|
page.doc.search("title").text.strip.should eq "SF bay area apts/housing for rent classifieds - craigslist"
|
14
14
|
page.fetched_at.should_not be_nil
|
15
|
+
page.fetched?.should be_true
|
15
16
|
end
|
16
17
|
end
|
17
18
|
|
@@ -52,16 +53,22 @@ describe Polipus::HTTP do
|
|
52
53
|
end
|
53
54
|
|
54
55
|
|
55
|
-
describe '
|
56
|
+
describe 'compressed content handling' do
|
56
57
|
|
57
58
|
it 'should decode gzip content' do
|
58
59
|
VCR.use_cassette('gzipped_on') do
|
59
|
-
http = Polipus::HTTP.new(
|
60
|
+
http = Polipus::HTTP.new(logger: Logger.new(STDOUT))
|
60
61
|
page = http.fetch_page("http://www.whatsmyip.org/http-compression-test/")
|
61
62
|
page.doc.css('.gzip_yes').should_not be_empty
|
62
63
|
end
|
63
64
|
end
|
64
65
|
|
66
|
+
it 'should decode deflate content' do
|
67
|
+
http = Polipus::HTTP.new(logger: Logger.new(STDOUT))
|
68
|
+
page = http.fetch_page("http://david.fullrecall.com/browser-http-compression-test?compression=deflate-http")
|
69
|
+
page.headers.fetch('content-encoding').first.should eq 'deflate'
|
70
|
+
page.body.include?("deflate-http").should be_true
|
71
|
+
end
|
65
72
|
|
66
73
|
end
|
67
74
|
|
@@ -82,6 +89,29 @@ describe Polipus::HTTP do
|
|
82
89
|
http.connections['www.yahoo.com'][443].should_not be old_conn
|
83
90
|
end
|
84
91
|
end
|
92
|
+
|
93
|
+
end
|
94
|
+
|
95
|
+
describe 'cookies' do
|
96
|
+
|
97
|
+
it 'should handle cookies correctly' do
|
98
|
+
VCR.use_cassette('http_cookies') do
|
99
|
+
http = Polipus::HTTP.new(accept_cookies: true)
|
100
|
+
http.fetch_page "http://www.whatarecookies.com/cookietest.asp"
|
101
|
+
http.accept_cookies?.should be_true
|
102
|
+
http.cookie_jar.cookies(URI("http://www.whatarecookies.com/cookietest.asp")).should_not be_empty
|
103
|
+
end
|
104
|
+
end
|
105
|
+
|
106
|
+
end
|
107
|
+
|
108
|
+
describe 'net errors' do
|
109
|
+
it 'should handle net errors correctly' do
|
110
|
+
VCR.use_cassette('http_errors') do
|
111
|
+
http = Polipus::HTTP.new(open_timeout:1, read_timeout: 1)
|
112
|
+
http.fetch_page("http://www.wrong-domain.lol/").error.should_not be_nil
|
113
|
+
end
|
114
|
+
end
|
85
115
|
end
|
86
116
|
|
87
117
|
end
|
data/spec/page_spec.rb
CHANGED
@@ -48,5 +48,24 @@ EOF
|
|
48
48
|
end
|
49
49
|
end
|
50
50
|
|
51
|
+
context 'page error' do
|
52
|
+
|
53
|
+
let(:page) do
|
54
|
+
Polipus::Page.new 'http://www.google.com/', error: 'an error'
|
55
|
+
end
|
56
|
+
|
57
|
+
it 'should serialize an error' do
|
58
|
+
page.to_hash['error'].should eq 'an error'
|
59
|
+
end
|
60
|
+
|
61
|
+
end
|
62
|
+
|
63
|
+
context 'page code' do
|
64
|
+
it 'should identify HTTPSuccess code' do
|
65
|
+
Polipus::Page.new('http://www.google.com/', code: 201).success?.should be_true
|
66
|
+
Polipus::Page.new('http://www.google.com/', code: 404).success?.should be_false
|
67
|
+
end
|
68
|
+
|
69
|
+
end
|
51
70
|
|
52
71
|
end
|
data/spec/polipus_spec.rb
CHANGED
@@ -73,5 +73,23 @@ describe Polipus::PolipusCrawler do
|
|
73
73
|
cache_hit["http://rubygems.org/gems"].should be 2
|
74
74
|
end
|
75
75
|
|
76
|
+
it "should call on_page_error code blocks when a page has error" do
|
77
|
+
p = Polipus::PolipusCrawler.new("polipus-rspec", ["http://dasd.adad.dom/"], p_options.merge(open_timeout:1, read_timeout: 1))
|
78
|
+
a_page = nil
|
79
|
+
p.on_page_error {|page| a_page = page}
|
80
|
+
p.takeover
|
81
|
+
a_page.should_not be_nil
|
82
|
+
a_page.error.should_not be_nil
|
83
|
+
end
|
84
|
+
|
85
|
+
it "should obey to the robots.txt file" do
|
86
|
+
lopt = p_options
|
87
|
+
lopt[:obey_robots_txt] = true
|
88
|
+
polipus = Polipus::PolipusCrawler.new("polipus-rspec", ["https://rubygems.org/gems/polipus"], lopt)
|
89
|
+
polipus.depth_limit = 1
|
90
|
+
polipus.takeover
|
91
|
+
polipus.storage.each {|id, page| (page.url.path =~ /$\/downloads\//).should be_false}
|
92
|
+
end
|
93
|
+
|
76
94
|
end
|
77
95
|
end
|
data/spec/robotex_spec.rb
ADDED
@@ -0,0 +1,86 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
require "polipus/robotex"
|
3
|
+
describe Polipus::Robotex do
|
4
|
+
let(:spec_domain){"http://www.example.com/"}
|
5
|
+
before(:each) do
|
6
|
+
robots = <<-END
|
7
|
+
User-Agent: msnbot
|
8
|
+
Crawl-Delay: 20
|
9
|
+
|
10
|
+
User-Agent: bender
|
11
|
+
Disallow: /my_shiny_metal_ass
|
12
|
+
|
13
|
+
User-Agent: *
|
14
|
+
Disallow: /login
|
15
|
+
Allow: /
|
16
|
+
|
17
|
+
Disallow: /locked
|
18
|
+
Allow: /locked
|
19
|
+
END
|
20
|
+
stub_request(:get, 'http://www.example.com/robots.txt')
|
21
|
+
.to_return(:body => robots, :status => [200, "OK"], :headers => { "Content-Type" => 'text/plain' })
|
22
|
+
end
|
23
|
+
|
24
|
+
|
25
|
+
describe '#initialize' do
|
26
|
+
context 'when no arguments are supplied' do
|
27
|
+
it 'returns a Robotex with the default user-agent' do
|
28
|
+
Polipus::Robotex.new.user_agent.should == "Robotex/#{Polipus::Robotex::VERSION} (http://www.github.com/chriskite/robotex)"
|
29
|
+
end
|
30
|
+
end
|
31
|
+
|
32
|
+
context 'when a user-agent is specified' do
|
33
|
+
it 'returns a Robotex with the specified user-agent' do
|
34
|
+
ua = 'My User Agent'
|
35
|
+
Polipus::Robotex.new(ua).user_agent.should == ua
|
36
|
+
end
|
37
|
+
end
|
38
|
+
end
|
39
|
+
|
40
|
+
describe '#allowed?' do
|
41
|
+
context 'when the robots.txt disallows the user-agent to the url' do
|
42
|
+
it 'returns false' do
|
43
|
+
robotex = Polipus::Robotex.new('bender')
|
44
|
+
robotex.allowed?(spec_domain + 'my_shiny_metal_ass').should be_false
|
45
|
+
end
|
46
|
+
end
|
47
|
+
|
48
|
+
context 'when the robots.txt disallows the user-agent to some urls, but allows this one' do
|
49
|
+
it 'returns true' do
|
50
|
+
robotex = Polipus::Robotex.new('bender')
|
51
|
+
robotex.allowed?(spec_domain + 'cigars').should be_true
|
52
|
+
end
|
53
|
+
end
|
54
|
+
|
55
|
+
context 'when the robots.txt disallows any user-agent to the url' do
|
56
|
+
it 'returns false' do
|
57
|
+
robotex = Polipus::Robotex.new
|
58
|
+
robotex.allowed?(spec_domain + 'login').should be_false
|
59
|
+
end
|
60
|
+
end
|
61
|
+
|
62
|
+
context 'when the robots.txt disallows and then allows the url' do
|
63
|
+
it 'returns false' do
|
64
|
+
robotex = Polipus::Robotex.new
|
65
|
+
robotex.allowed?(spec_domain + 'locked').should be_false
|
66
|
+
end
|
67
|
+
end
|
68
|
+
end
|
69
|
+
|
70
|
+
describe '#delay' do
|
71
|
+
context 'when no Crawl-Delay is specified for the user-agent' do
|
72
|
+
it 'returns nil' do
|
73
|
+
robotex = Polipus::Robotex.new
|
74
|
+
robotex.delay(spec_domain).should be_nil
|
75
|
+
end
|
76
|
+
|
77
|
+
context 'when Crawl-Delay is specified for the user-agent' do
|
78
|
+
it 'returns the delay as a Fixnum' do
|
79
|
+
robotex = Polipus::Robotex.new('msnbot')
|
80
|
+
robotex.delay(spec_domain).should == 20
|
81
|
+
end
|
82
|
+
end
|
83
|
+
end
|
84
|
+
end
|
85
|
+
|
86
|
+
end
|
data/spec/spec_helper.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: polipus
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.2
|
4
|
+
version: 0.3.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Francesco Laurita
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2014-
|
11
|
+
date: 2014-06-02 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: redis-bloomfilter
|
@@ -316,14 +316,16 @@ files:
|
|
316
316
|
- .gitignore
|
317
317
|
- .rspec
|
318
318
|
- .travis.yml
|
319
|
-
- AUTHORS
|
319
|
+
- AUTHORS.md
|
320
|
+
- CHANGELOG.md
|
320
321
|
- Gemfile
|
321
322
|
- LICENSE.txt
|
322
323
|
- README.md
|
323
|
-
- README.rdoc
|
324
324
|
- Rakefile
|
325
325
|
- examples/basic.rb
|
326
|
+
- examples/error_handling.rb
|
326
327
|
- examples/incremental.rb
|
328
|
+
- examples/robots_txt_handling.rb
|
327
329
|
- examples/survival.rb
|
328
330
|
- lib/polipus.rb
|
329
331
|
- lib/polipus/http.rb
|
@@ -338,6 +340,7 @@ files:
|
|
338
340
|
- lib/polipus/queue_overflow/manager.rb
|
339
341
|
- lib/polipus/queue_overflow/mongo_queue.rb
|
340
342
|
- lib/polipus/queue_overflow/mongo_queue_capped.rb
|
343
|
+
- lib/polipus/robotex.rb
|
341
344
|
- lib/polipus/storage.rb
|
342
345
|
- lib/polipus/storage/base.rb
|
343
346
|
- lib/polipus/storage/dev_null.rb
|
@@ -350,6 +353,7 @@ files:
|
|
350
353
|
- lib/polipus/version.rb
|
351
354
|
- polipus.gemspec
|
352
355
|
- spec/cassettes/08b228db424a926e1ed6ab63b38d847e.yml
|
356
|
+
- spec/cassettes/11c3eb8bf35dfc179dc5ce44f6f5f458.yml
|
353
357
|
- spec/cassettes/1f6e1d7743ecaa86594b4e68a6462689.yml
|
354
358
|
- spec/cassettes/20aa41f181b49f00078c3ca30bad5afe.yml
|
355
359
|
- spec/cassettes/4640919145753505af2d0f8423de37f3.yml
|
@@ -361,10 +365,12 @@ files:
|
|
361
365
|
- spec/cassettes/ab333f89535a2efb284913fede6aa7c7.yml
|
362
366
|
- spec/cassettes/ae5d7cffde3f53122cdf79f3d1367e8e.yml
|
363
367
|
- spec/cassettes/b389efd1dcb8f09393b5aae1627c2a83.yml
|
368
|
+
- spec/cassettes/bc6fb220895689be7eeb05b09969a18d.yml
|
364
369
|
- spec/cassettes/c5ce68499027d490adfbb6e5541881e4.yml
|
365
370
|
- spec/cassettes/ce16b11a7df0b70fe90c7f90063fdb8c.yml
|
366
371
|
- spec/cassettes/ffe3d588b6df4b9de35e5a7ccaf5a81b.yml
|
367
372
|
- spec/cassettes/gzipped_on.yml
|
373
|
+
- spec/cassettes/http_cookies.yml
|
368
374
|
- spec/cassettes/http_tconnection_max_hits.yml
|
369
375
|
- spec/cassettes/http_test.yml
|
370
376
|
- spec/cassettes/http_test_redirect.yml
|
@@ -374,6 +380,7 @@ files:
|
|
374
380
|
- spec/polipus_spec.rb
|
375
381
|
- spec/queue_overflow_manager_spec.rb
|
376
382
|
- spec/queue_overflow_spec.rb
|
383
|
+
- spec/robotex_spec.rb
|
377
384
|
- spec/spec_helper.rb
|
378
385
|
- spec/storage_memory_spec.rb
|
379
386
|
- spec/storage_mongo_spec.rb
|
@@ -405,6 +412,7 @@ specification_version: 4
|
|
405
412
|
summary: Polipus distributed web-crawler framework
|
406
413
|
test_files:
|
407
414
|
- spec/cassettes/08b228db424a926e1ed6ab63b38d847e.yml
|
415
|
+
- spec/cassettes/11c3eb8bf35dfc179dc5ce44f6f5f458.yml
|
408
416
|
- spec/cassettes/1f6e1d7743ecaa86594b4e68a6462689.yml
|
409
417
|
- spec/cassettes/20aa41f181b49f00078c3ca30bad5afe.yml
|
410
418
|
- spec/cassettes/4640919145753505af2d0f8423de37f3.yml
|
@@ -416,10 +424,12 @@ test_files:
|
|
416
424
|
- spec/cassettes/ab333f89535a2efb284913fede6aa7c7.yml
|
417
425
|
- spec/cassettes/ae5d7cffde3f53122cdf79f3d1367e8e.yml
|
418
426
|
- spec/cassettes/b389efd1dcb8f09393b5aae1627c2a83.yml
|
427
|
+
- spec/cassettes/bc6fb220895689be7eeb05b09969a18d.yml
|
419
428
|
- spec/cassettes/c5ce68499027d490adfbb6e5541881e4.yml
|
420
429
|
- spec/cassettes/ce16b11a7df0b70fe90c7f90063fdb8c.yml
|
421
430
|
- spec/cassettes/ffe3d588b6df4b9de35e5a7ccaf5a81b.yml
|
422
431
|
- spec/cassettes/gzipped_on.yml
|
432
|
+
- spec/cassettes/http_cookies.yml
|
423
433
|
- spec/cassettes/http_tconnection_max_hits.yml
|
424
434
|
- spec/cassettes/http_test.yml
|
425
435
|
- spec/cassettes/http_test_redirect.yml
|
@@ -429,6 +439,7 @@ test_files:
|
|
429
439
|
- spec/polipus_spec.rb
|
430
440
|
- spec/queue_overflow_manager_spec.rb
|
431
441
|
- spec/queue_overflow_spec.rb
|
442
|
+
- spec/robotex_spec.rb
|
432
443
|
- spec/spec_helper.rb
|
433
444
|
- spec/storage_memory_spec.rb
|
434
445
|
- spec/storage_mongo_spec.rb
|
data/AUTHORS
DELETED
data/README.rdoc
DELETED