parallel588_polipus 0.4.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (72) hide show
  1. checksums.yaml +7 -0
  2. data/.document +5 -0
  3. data/.gitignore +53 -0
  4. data/.rspec +2 -0
  5. data/.rubocop.yml +17 -0
  6. data/.rubocop_todo.yml +33 -0
  7. data/.travis.yml +22 -0
  8. data/AUTHORS.md +5 -0
  9. data/CHANGELOG.md +61 -0
  10. data/Gemfile +12 -0
  11. data/LICENSE.txt +20 -0
  12. data/README.md +70 -0
  13. data/Rakefile +8 -0
  14. data/examples/basic.rb +63 -0
  15. data/examples/error_handling.rb +23 -0
  16. data/examples/incremental.rb +63 -0
  17. data/examples/robots_txt_handling.rb +14 -0
  18. data/examples/survival.rb +10 -0
  19. data/lib/polipus.rb +488 -0
  20. data/lib/polipus/http.rb +282 -0
  21. data/lib/polipus/page.rb +256 -0
  22. data/lib/polipus/plugin.rb +14 -0
  23. data/lib/polipus/plugins/cleaner.rb +25 -0
  24. data/lib/polipus/plugins/sample.rb +15 -0
  25. data/lib/polipus/plugins/sleeper.rb +22 -0
  26. data/lib/polipus/queue_overflow.rb +26 -0
  27. data/lib/polipus/queue_overflow/base.rb +7 -0
  28. data/lib/polipus/queue_overflow/dev_null_queue.rb +34 -0
  29. data/lib/polipus/queue_overflow/manager.rb +57 -0
  30. data/lib/polipus/queue_overflow/mongo_queue.rb +60 -0
  31. data/lib/polipus/queue_overflow/mongo_queue_capped.rb +29 -0
  32. data/lib/polipus/queue_overflow/worker.rb +24 -0
  33. data/lib/polipus/robotex.rb +145 -0
  34. data/lib/polipus/signal_handler.rb +42 -0
  35. data/lib/polipus/storage.rb +31 -0
  36. data/lib/polipus/storage/base.rb +20 -0
  37. data/lib/polipus/storage/dev_null.rb +35 -0
  38. data/lib/polipus/storage/memory_store.rb +56 -0
  39. data/lib/polipus/storage/mongo_store.rb +90 -0
  40. data/lib/polipus/storage/rethink_store.rb +90 -0
  41. data/lib/polipus/url_tracker.rb +21 -0
  42. data/lib/polipus/url_tracker/bloomfilter.rb +27 -0
  43. data/lib/polipus/url_tracker/redis_set.rb +27 -0
  44. data/lib/polipus/version.rb +5 -0
  45. data/polipus.gemspec +44 -0
  46. data/spec/cassettes/11c3eb8bf35dfc179dc5ce44f6f5f458.yml +6144 -0
  47. data/spec/cassettes/1f6e1d7743ecaa86594b4e68a6462689.yml +11320 -0
  48. data/spec/cassettes/6adfecdb274dd26ffd3713169583ca91.yml +18236 -0
  49. data/spec/cassettes/978ac0eeb5df63a019b754cc8a965b06.yml +18296 -0
  50. data/spec/cassettes/b389efd1dcb8f09393b5aae1627c2a83.yml +36569 -0
  51. data/spec/cassettes/bc6fb220895689be7eeb05b09969a18d.yml +61 -0
  52. data/spec/cassettes/c5ce68499027d490adfbb6e5541881e4.yml +18165 -0
  53. data/spec/cassettes/ce16b11a7df0b70fe90c7f90063fdb8c.yml +11758 -0
  54. data/spec/cassettes/gzipped_on.yml +147 -0
  55. data/spec/cassettes/http_cookies.yml +133 -0
  56. data/spec/cassettes/http_tconnection_max_hits.yml +4138 -0
  57. data/spec/cassettes/http_test.yml +1418 -0
  58. data/spec/cassettes/http_test_redirect.yml +71 -0
  59. data/spec/clear.rb +12 -0
  60. data/spec/polipus/http_spec.rb +139 -0
  61. data/spec/polipus/page_spec.rb +68 -0
  62. data/spec/polipus/queue_overflow/manager_spec.rb +88 -0
  63. data/spec/polipus/queue_overflow_spec.rb +66 -0
  64. data/spec/polipus/robotex_spec.rb +85 -0
  65. data/spec/polipus/signal_handler_spec.rb +15 -0
  66. data/spec/polipus/storage/memory_store_spec.rb +87 -0
  67. data/spec/polipus/storage/mongo_store_spec.rb +119 -0
  68. data/spec/polipus/storage/rethink_store_spec.rb +117 -0
  69. data/spec/polipus/url_tracker_spec.rb +29 -0
  70. data/spec/polipus_spec.rb +107 -0
  71. data/spec/spec_helper.rb +42 -0
  72. metadata +348 -0
@@ -0,0 +1,85 @@
1
+ require 'spec_helper'
2
+ require 'polipus/robotex'
3
+
4
# Specs for Polipus::Robotex: default/custom user-agent, allow/disallow
# decisions and Crawl-Delay lookup, all driven by a stubbed robots.txt.
describe Polipus::Robotex do
  let(:spec_domain) { 'http://www.example.com/' }

  # Serve the same robots.txt for www.example.com to every example.
  # NOTE(review): the heredoc body is kept flush-left exactly as it appears in
  # the diff; confirm against the original file if leading whitespace matters.
  before(:each) do
    robots = <<-END
User-Agent: msnbot
Crawl-Delay: 20

User-Agent: bender
Disallow: /my_shiny_metal_ass

User-Agent: *
Disallow: /login
Allow: /

Disallow: /locked
Allow: /locked
END
    stub_request(:get, 'http://www.example.com/robots.txt')
      .to_return(body: robots, status: [200, 'OK'], headers: { 'Content-Type' => 'text/plain' })
  end

  describe '#initialize' do
    context 'when no arguments are supplied' do
      it 'returns a Robotex with the default user-agent' do
        expect(Polipus::Robotex.new.user_agent).to eq("Robotex/#{Polipus::Robotex::VERSION} (http://www.github.com/chriskite/robotex)")
      end
    end

    context 'when a user-agent is specified' do
      it 'returns a Robotex with the specified user-agent' do
        ua = 'My User Agent'
        expect(Polipus::Robotex.new(ua).user_agent).to eq(ua)
      end
    end
  end

  describe '#allowed?' do
    context 'when the robots.txt disallows the user-agent to the url' do
      it 'returns false' do
        robotex = Polipus::Robotex.new('bender')
        expect(robotex.allowed?(spec_domain + 'my_shiny_metal_ass')).to be_falsey
      end
    end

    context 'when the robots.txt disallows the user-agent to some urls, but allows this one' do
      it 'returns true' do
        robotex = Polipus::Robotex.new('bender')
        expect(robotex.allowed?(spec_domain + 'cigars')).to be_truthy
      end
    end

    context 'when the robots.txt disallows any user-agent to the url' do
      it 'returns false' do
        robotex = Polipus::Robotex.new
        expect(robotex.allowed?(spec_domain + 'login')).to be_falsey
      end
    end

    context 'when the robots.txt disallows and then allows the url' do
      it 'returns false' do
        robotex = Polipus::Robotex.new
        expect(robotex.allowed?(spec_domain + 'locked')).to be_falsey
      end
    end
  end

  describe '#delay' do
    context 'when no Crawl-Delay is specified for the user-agent' do
      it 'returns nil' do
        robotex = Polipus::Robotex.new
        expect(robotex.delay(spec_domain)).to be_nil
      end
    end

    # Sibling of the context above. It was previously nested inside the
    # "no Crawl-Delay" context, which made the reported example name
    # contradict itself.
    context 'when Crawl-Delay is specified for the user-agent' do
      it 'returns the delay as a Fixnum' do
        robotex = Polipus::Robotex.new('msnbot')
        expect(robotex.delay(spec_domain)).to eq(20)
      end
    end
  end
end
@@ -0,0 +1,15 @@
1
+ require 'spec_helper'
2
+
3
# Verifies that building a PolipusCrawler leaves SignalHandler.enabled? in the
# state requested through the :enable_signal_handler option.
describe Polipus::SignalHandler do
  context 'signal handler' do
    it 'should be enabled by default' do
      # No options given: the handler is expected to be switched on.
      Polipus::PolipusCrawler.new('polipus-rspec', [])

      expect(described_class.enabled?).to be(true)
    end

    it 'should be disabled if specified' do
      crawler_options = { enable_signal_handler: false }
      Polipus::PolipusCrawler.new('polipus-rspec', [], crawler_options)

      expect(described_class.enabled?).to be(false)
    end
  end
end
@@ -0,0 +1,87 @@
1
+ # encoding: UTF-8
2
+ require 'spec_helper'
3
+ require 'mongo'
4
+ require 'polipus/storage/memory_store'
5
# Specs for Polipus::Storage::MemoryStore: add/get/remove/each, uuid
# generation with and without the query string, and user data round-trips.
describe Polipus::Storage::MemoryStore do
  # `let` is evaluated once per example.
  # NOTE(review): presumably Polipus::Storage.memory_store returns a fresh
  # store on every call; confirm against lib/polipus/storage.rb.
  let(:storage) { Polipus::Storage.memory_store }

  it 'should store a page' do
    page = page_factory 'http://www.google.com', code: 200, body: '<html></html>'
    uuid = storage.add page
    expect(uuid).to eq('ed646a3334ca891fd3467db131372140')
    expect(storage.count).to be 1
    page = storage.get page
    expect(page.url.to_s).to eq('http://www.google.com')
    expect(page.body).to eq('<html></html>')
  end

  # NOTE(review): only a single add happens here, so this checks that a 301
  # page round-trips rather than that an existing entry is overwritten.
  it 'should update a page' do
    page = page_factory 'http://www.google.com', code: 301, body: '<html></html>'
    storage.add page
    page = storage.get page
    expect(page.code).to eq(301)
  end

  it 'should iterate over stored pages' do
    # Seed the store and count the yields so the expectations inside the
    # block cannot be skipped silently on an empty store.
    storage.add page_factory('http://www.google.com', code: 200, body: '<html></html>')
    visited = 0
    storage.each do |k, page|
      visited += 1
      expect(k).to eq('ed646a3334ca891fd3467db131372140')
      expect(page.url.to_s).to eq('http://www.google.com')
    end
    expect(visited).to eq(1)
  end

  it 'should delete a page' do
    page = page_factory 'http://www.google.com', code: 301, body: '<html></html>'
    # Add first so that remove actually has something to delete.
    storage.add page
    storage.remove page
    expect(storage.get(page)).to be_nil
    expect(storage.count).to be 0
  end

  it 'should store a page removing a query string from the uuid generation' do
    page = page_factory 'http://www.asd.com/?asd=lol', code: 200, body: '<html></html>'
    p_no_query = page_factory 'http://www.asd.com/?asdas=dasda&adsda=1', code: 200, body: '<html></html>'
    storage.include_query_string_in_uuid = false
    storage.add page
    expect(storage.exists?(p_no_query)).to be_truthy
    storage.remove page
  end

  it 'should store a page removing a query string from the uuid generation no ending slash' do
    page = page_factory 'http://www.asd.com?asd=lol', code: 200, body: '<html></html>'
    p_no_query = page_factory 'http://www.asd.com', code: 200, body: '<html></html>'
    storage.include_query_string_in_uuid = false
    storage.add page
    expect(storage.exists?(p_no_query)).to be_truthy
    storage.remove page
  end

  it 'should store a page with user data associated' do
    page = page_factory 'http://www.user.com', code: 200, body: '<html></html>'
    page.user_data.name = 'Test User Data'
    storage.add page
    expect(storage.exists?(page)).to be_truthy
    page = storage.get(page)
    expect(page.user_data.name).to eq('Test User Data')
    storage.remove page
  end

  # NOTE(review): this example builds a mongo_store from @mongo, which is not
  # assigned anywhere in this file, so it does not exercise MemoryStore.
  # It looks copied from the MongoStore spec; confirm whether it belongs here.
  it 'should honor the except parameters' do
    storage = Polipus::Storage.mongo_store(@mongo, '_test_pages', ['body'])
    page = page_factory 'http://www.user-doo.com', code: 200, body: '<html></html>'
    storage.add page
    page = storage.get page
    expect(page.body).to be_empty
    storage.clear
  end

  it 'should return false if a doc not exists' do
    storage.include_query_string_in_uuid = false
    p_other = page_factory 'http://www.asdrrrr.com', code: 200, body: '<html></html>'
    expect(storage.exists?(p_other)).to be_falsey
    storage.add p_other
    expect(storage.exists?(p_other)).to be_truthy
    p_other = page_factory 'http://www.asdrrrr.com?trk=asd-lol', code: 200, body: '<html></html>'
    expect(storage.exists?(p_other)).to be_truthy
    storage.include_query_string_in_uuid = true
    expect(storage.exists?(p_other)).to be_falsey
  end
end
@@ -0,0 +1,119 @@
1
+ # encoding: UTF-8
2
+ require 'spec_helper'
3
+ require 'mongo'
4
+ require 'polipus/storage/mongo_store'
5
# Specs for Polipus::Storage::MongoStore, run against a MongoDB on
# localhost:27017 (database '_test_polipus', collection '_test_pages').
describe Polipus::Storage::MongoStore do
  before(:all) do
    @mongo = Mongo::Connection.new('localhost', 27_017, pool_size: 15, pool_timeout: 5).db('_test_polipus')
    @mongo['_test_pages'].drop
    @storage = Polipus::Storage.mongo_store(@mongo, '_test_pages')
  end

  after(:all) do
    @mongo['_test_pages'].drop
  end

  # The collection is dropped after every example, so each example has to
  # create the documents it relies on.
  after(:each) do
    @mongo['_test_pages'].drop
  end

  it 'should store a page' do
    page = page_factory 'http://www.google.com', code: 200, body: '<html></html>'
    uuid = @storage.add page
    expect(uuid).to eq('ed646a3334ca891fd3467db131372140')
    expect(@storage.count).to be 1
    expect(@mongo['_test_pages'].count).to be 1
    page = @storage.get page
    expect(page.url.to_s).to eq('http://www.google.com')
    expect(page.body).to eq('<html></html>')
  end

  # NOTE(review): because the collection is empty at the start of the
  # example, this is a single insert of a 301 page, not an in-place update.
  it 'should update a page' do
    page = page_factory 'http://www.google.com', code: 301, body: '<html></html>'
    @storage.add page
    page = @storage.get page
    expect(page.code).to eq(301)
    expect(@mongo['_test_pages'].count).to be 1
  end

  it 'should iterate over stored pages' do
    # Seed the collection and count the yields; on an empty collection the
    # block would never run and the example would pass without asserting.
    @storage.add page_factory('http://www.google.com', code: 200, body: '<html></html>')
    visited = 0
    @storage.each do |k, page|
      visited += 1
      expect(k).to eq('ed646a3334ca891fd3467db131372140')
      expect(page.url.to_s).to eq('http://www.google.com')
    end
    expect(visited).to eq(1)
  end

  it 'should delete a page' do
    page = page_factory 'http://www.google.com', code: 301, body: '<html></html>'
    # Add first so that remove actually deletes an existing document.
    @storage.add page
    @storage.remove page
    expect(@storage.get(page)).to be_nil
    expect(@storage.count).to be 0
  end

  it 'should store a page removing a query string from the uuid generation' do
    page = page_factory 'http://www.asd.com/?asd=lol', code: 200, body: '<html></html>'
    p_no_query = page_factory 'http://www.asd.com/?asdas=dasda&adsda=1', code: 200, body: '<html></html>'
    @storage.include_query_string_in_uuid = false
    @storage.add page
    expect(@storage.exists?(p_no_query)).to be_truthy
    @storage.remove page
  end

  it 'should store a page removing a query string from the uuid generation no ending slash' do
    page = page_factory 'http://www.asd.com?asd=lol', code: 200, body: '<html></html>'
    p_no_query = page_factory 'http://www.asd.com', code: 200, body: '<html></html>'
    @storage.include_query_string_in_uuid = false
    @storage.add page
    expect(@storage.exists?(p_no_query)).to be_truthy
    @storage.remove page
  end

  it 'should store a page with user data associated' do
    page = page_factory 'http://www.user.com', code: 200, body: '<html></html>'
    page.user_data.name = 'Test User Data'
    @storage.add page
    expect(@storage.exists?(page)).to be_truthy
    page = @storage.get(page)
    expect(page.user_data.name).to eq('Test User Data')
    @storage.remove page
  end

  it 'should honor the except parameters' do
    # A separate store configured with ['body'] as the third argument.
    storage = Polipus::Storage.mongo_store(@mongo, '_test_pages', ['body'])
    page = page_factory 'http://www.user-doo.com', code: 200, body: '<html></html>'
    storage.add page
    page = storage.get page
    expect(page.body).to be_empty
    storage.clear
  end

  it 'should return false if a doc not exists' do
    @storage.include_query_string_in_uuid = false
    p_other = page_factory 'http://www.asdrrrr.com', code: 200, body: '<html></html>'
    expect(@storage.exists?(p_other)).to be_falsey
    @storage.add p_other
    expect(@storage.exists?(p_other)).to be_truthy
    p_other = page_factory 'http://www.asdrrrr.com?trk=asd-lol', code: 200, body: '<html></html>'
    expect(@storage.exists?(p_other)).to be_truthy
    @storage.include_query_string_in_uuid = true
    expect(@storage.exists?(p_other)).to be_falsey
  end

  it 'should set page.fetched_at based on the id creation' do
    storage = Polipus::Storage.mongo_store(@mongo, '_test_pages')
    page = page_factory 'http://www.user-doojo.com', code: 200, body: '<html></html>'
    storage.add page
    expect(page.fetched_at).to be_nil
    page = storage.get page
    expect(page.fetched_at).not_to be_nil
  end

  it 'should NOT set page.fetched_at if already present' do
    storage = Polipus::Storage.mongo_store(@mongo, '_test_pages')
    page = page_factory 'http://www.user-doojooo.com', code: 200, body: '<html></html>'
    page.fetched_at = 10
    storage.add page
    page = storage.get page
    expect(page.fetched_at).to be 10
  end
end
@@ -0,0 +1,117 @@
1
+ # encoding: UTF-8
2
+ require 'spec_helper'
3
+ require 'polipus/storage/rethink_store'
4
+
5
# Specs for Polipus::Storage::RethinkStore, run against a RethinkDB on
# localhost:28015 (database 'polipus_spec', table 'test_pages').
describe Polipus::Storage::RethinkStore do
  before(:all) do
    @r = RethinkDB::RQL.new
    @rethink = @r.connect(host: 'localhost', port: 28_015, db: 'polipus_spec')
    # Create the spec database only when it is not already there.
    @r.db_create('polipus_spec').run(@rethink) unless @r.db_list.run(@rethink).include?('polipus_spec')
    @table = 'test_pages'
    @storage = Polipus::Storage.rethink_store(@rethink, @table)
  end

  # The table is emptied after every example, so each example has to create
  # the rows it relies on.
  after(:each) do
    @r.table(@table).delete.run(@rethink)
  end

  it 'should store a page' do
    page = page_factory 'http://www.google.com', code: 200, body: '<html></html>'
    uuid = @storage.add page
    expect(uuid).to eq('ed646a3334ca891fd3467db131372140')
    expect(@storage.count).to eq(1)
    expect(@r.table(@table).count.run(@rethink)).to eq(1)
    page = @storage.get page
    expect(page.url.to_s).to eq('http://www.google.com')
    expect(page.body).to eq('<html></html>')
  end

  # NOTE(review): because the table is empty at the start of the example,
  # this is a single insert of a 301 page, not an in-place update.
  it 'should update a page' do
    page = page_factory 'http://www.google.com', code: 301, body: '<html></html>'
    @storage.add page
    page = @storage.get page
    expect(page.code).to eq(301)
    expect(@r.table(@table).count.run(@rethink)).to eq(1)
  end

  it 'should iterate over stored pages' do
    # Seed the table and count the yields; on an empty table the block would
    # never run and the example would pass without asserting anything.
    @storage.add page_factory('http://www.google.com', code: 200, body: '<html></html>')
    visited = 0
    @storage.each do |k, page|
      visited += 1
      expect(k).to eq('ed646a3334ca891fd3467db131372140')
      expect(page.url.to_s).to eq('http://www.google.com')
    end
    expect(visited).to eq(1)
  end

  it 'should delete a page' do
    page = page_factory 'http://www.google.com', code: 301, body: '<html></html>'
    # Add first so that remove actually deletes an existing row.
    @storage.add page
    @storage.remove page
    expect(@storage.get(page)).to be_nil
    expect(@storage.count).to be 0
  end

  it 'should store a page removing a query string from the uuid generation' do
    page = page_factory 'http://www.asd.com/?asd=lol', code: 200, body: '<html></html>'
    p_no_query = page_factory 'http://www.asd.com/?asdas=dasda&adsda=1', code: 200, body: '<html></html>'
    @storage.include_query_string_in_uuid = false
    @storage.add page
    expect(@storage.exists?(p_no_query)).to be_truthy
    @storage.remove page
  end

  it 'should store a page removing a query string from the uuid generation no ending slash' do
    page = page_factory 'http://www.asd.com?asd=lol', code: 200, body: '<html></html>'
    p_no_query = page_factory 'http://www.asd.com', code: 200, body: '<html></html>'
    @storage.include_query_string_in_uuid = false
    @storage.add page
    expect(@storage.exists?(p_no_query)).to be_truthy
    @storage.remove page
  end

  it 'should store a page with user data associated' do
    page = page_factory 'http://www.user.com', code: 200, body: '<html></html>'
    page.user_data.name = 'Test User Data'
    @storage.add page
    expect(@storage.exists?(page)).to be_truthy
    page = @storage.get(page)
    expect(page.user_data.name).to eq('Test User Data')
    @storage.remove page
  end

  it 'should honor the except parameters' do
    # A separate store configured with ['body'] as the third argument.
    storage = Polipus::Storage.rethink_store(@rethink, @table, ['body'])
    page = page_factory 'http://www.user-doo.com', code: 200, body: '<html></html>'
    storage.add page
    page = storage.get page
    expect(page.body).to be_empty
    storage.clear
  end

  it 'should return false if a doc not exists' do
    @storage.include_query_string_in_uuid = false
    p_other = page_factory 'http://www.asdrrrr.com', code: 200, body: '<html></html>'
    expect(@storage.exists?(p_other)).to be_falsey
    @storage.add p_other
    expect(@storage.exists?(p_other)).to be_truthy
    p_other = page_factory 'http://www.asdrrrr.com?trk=asd-lol', code: 200, body: '<html></html>'
    expect(@storage.exists?(p_other)).to be_truthy
    @storage.include_query_string_in_uuid = true
    expect(@storage.exists?(p_other)).to be_falsey
  end

  it 'should set page.fetched_at based on the id creation' do
    storage = Polipus::Storage.rethink_store(@rethink, @table)
    page = page_factory 'http://www.user-doojo.com', code: 200, body: '<html></html>'
    storage.add page
    expect(page.fetched_at).to be_nil
    page = storage.get page
    expect(page.fetched_at).not_to be_nil
  end

  it 'should NOT set page.fetched_at if already present' do
    storage = Polipus::Storage.rethink_store(@rethink, @table)
    page = page_factory 'http://www.user-doojooo.com', code: 200, body: '<html></html>'
    page.fetched_at = 10
    storage.add page
    page = storage.get page
    expect(page.fetched_at).to be 10
  end
end
@@ -0,0 +1,29 @@
1
+ # encoding: UTF-8
2
+ require 'spec_helper'
3
+ require 'polipus/url_tracker'
4
+
5
# Exercises both UrlTracker backends (bloom filter and Redis set): a visited
# URL must be reported as visited, an unvisited one must not.
describe Polipus::UrlTracker do
  before(:all) do
    @bf  = Polipus::UrlTracker.bloomfilter
    @set = Polipus::UrlTracker.redis_set
  end

  # Clear both trackers once the whole group has run (bloom filter first).
  after(:all) do
    [@bf, @set].each(&:clear)
  end

  it 'should work (bf)' do
    seen = 'http://www.asd.com/asd/lol'
    @bf.visit(seen)

    expect(@bf.visited?(seen)).to be_truthy
    expect(@bf.visited?('http://www.google.com')).to be_falsey
  end

  it 'should work (redis_set)' do
    seen = 'http://www.asd.com/asd/lol'
    @set.visit(seen)

    expect(@set.visited?(seen)).to be_truthy
    expect(@set.visited?('http://www.google.com')).to be_falsey
  end
end