parallel588_polipus 0.4.0

This diff shows the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (72)
  1. checksums.yaml +7 -0
  2. data/.document +5 -0
  3. data/.gitignore +53 -0
  4. data/.rspec +2 -0
  5. data/.rubocop.yml +17 -0
  6. data/.rubocop_todo.yml +33 -0
  7. data/.travis.yml +22 -0
  8. data/AUTHORS.md +5 -0
  9. data/CHANGELOG.md +61 -0
  10. data/Gemfile +12 -0
  11. data/LICENSE.txt +20 -0
  12. data/README.md +70 -0
  13. data/Rakefile +8 -0
  14. data/examples/basic.rb +63 -0
  15. data/examples/error_handling.rb +23 -0
  16. data/examples/incremental.rb +63 -0
  17. data/examples/robots_txt_handling.rb +14 -0
  18. data/examples/survival.rb +10 -0
  19. data/lib/polipus.rb +488 -0
  20. data/lib/polipus/http.rb +282 -0
  21. data/lib/polipus/page.rb +256 -0
  22. data/lib/polipus/plugin.rb +14 -0
  23. data/lib/polipus/plugins/cleaner.rb +25 -0
  24. data/lib/polipus/plugins/sample.rb +15 -0
  25. data/lib/polipus/plugins/sleeper.rb +22 -0
  26. data/lib/polipus/queue_overflow.rb +26 -0
  27. data/lib/polipus/queue_overflow/base.rb +7 -0
  28. data/lib/polipus/queue_overflow/dev_null_queue.rb +34 -0
  29. data/lib/polipus/queue_overflow/manager.rb +57 -0
  30. data/lib/polipus/queue_overflow/mongo_queue.rb +60 -0
  31. data/lib/polipus/queue_overflow/mongo_queue_capped.rb +29 -0
  32. data/lib/polipus/queue_overflow/worker.rb +24 -0
  33. data/lib/polipus/robotex.rb +145 -0
  34. data/lib/polipus/signal_handler.rb +42 -0
  35. data/lib/polipus/storage.rb +31 -0
  36. data/lib/polipus/storage/base.rb +20 -0
  37. data/lib/polipus/storage/dev_null.rb +35 -0
  38. data/lib/polipus/storage/memory_store.rb +56 -0
  39. data/lib/polipus/storage/mongo_store.rb +90 -0
  40. data/lib/polipus/storage/rethink_store.rb +90 -0
  41. data/lib/polipus/url_tracker.rb +21 -0
  42. data/lib/polipus/url_tracker/bloomfilter.rb +27 -0
  43. data/lib/polipus/url_tracker/redis_set.rb +27 -0
  44. data/lib/polipus/version.rb +5 -0
  45. data/polipus.gemspec +44 -0
  46. data/spec/cassettes/11c3eb8bf35dfc179dc5ce44f6f5f458.yml +6144 -0
  47. data/spec/cassettes/1f6e1d7743ecaa86594b4e68a6462689.yml +11320 -0
  48. data/spec/cassettes/6adfecdb274dd26ffd3713169583ca91.yml +18236 -0
  49. data/spec/cassettes/978ac0eeb5df63a019b754cc8a965b06.yml +18296 -0
  50. data/spec/cassettes/b389efd1dcb8f09393b5aae1627c2a83.yml +36569 -0
  51. data/spec/cassettes/bc6fb220895689be7eeb05b09969a18d.yml +61 -0
  52. data/spec/cassettes/c5ce68499027d490adfbb6e5541881e4.yml +18165 -0
  53. data/spec/cassettes/ce16b11a7df0b70fe90c7f90063fdb8c.yml +11758 -0
  54. data/spec/cassettes/gzipped_on.yml +147 -0
  55. data/spec/cassettes/http_cookies.yml +133 -0
  56. data/spec/cassettes/http_tconnection_max_hits.yml +4138 -0
  57. data/spec/cassettes/http_test.yml +1418 -0
  58. data/spec/cassettes/http_test_redirect.yml +71 -0
  59. data/spec/clear.rb +12 -0
  60. data/spec/polipus/http_spec.rb +139 -0
  61. data/spec/polipus/page_spec.rb +68 -0
  62. data/spec/polipus/queue_overflow/manager_spec.rb +88 -0
  63. data/spec/polipus/queue_overflow_spec.rb +66 -0
  64. data/spec/polipus/robotex_spec.rb +85 -0
  65. data/spec/polipus/signal_handler_spec.rb +15 -0
  66. data/spec/polipus/storage/memory_store_spec.rb +87 -0
  67. data/spec/polipus/storage/mongo_store_spec.rb +119 -0
  68. data/spec/polipus/storage/rethink_store_spec.rb +117 -0
  69. data/spec/polipus/url_tracker_spec.rb +29 -0
  70. data/spec/polipus_spec.rb +107 -0
  71. data/spec/spec_helper.rb +42 -0
  72. metadata +348 -0
data/spec/polipus/robotex_spec.rb
@@ -0,0 +1,85 @@
+require 'spec_helper'
+require 'polipus/robotex'
+
+describe Polipus::Robotex do
+  let(:spec_domain) { 'http://www.example.com/' }
+  before(:each) do
+    robots = <<-END
+User-Agent: msnbot
+Crawl-Delay: 20
+
+User-Agent: bender
+Disallow: /my_shiny_metal_ass
+
+User-Agent: *
+Disallow: /login
+Allow: /
+
+Disallow: /locked
+Allow: /locked
+END
+    stub_request(:get, 'http://www.example.com/robots.txt')
+      .to_return(body: robots, status: [200, 'OK'], headers: { 'Content-Type' => 'text/plain' })
+  end
+
+  describe '#initialize' do
+    context 'when no arguments are supplied' do
+      it 'returns a Robotex with the default user-agent' do
+        expect(Polipus::Robotex.new.user_agent).to eq("Robotex/#{Polipus::Robotex::VERSION} (http://www.github.com/chriskite/robotex)")
+      end
+    end
+
+    context 'when a user-agent is specified' do
+      it 'returns a Robotex with the specified user-agent' do
+        ua = 'My User Agent'
+        expect(Polipus::Robotex.new(ua).user_agent).to eq(ua)
+      end
+    end
+  end
+
+  describe '#allowed?' do
+    context 'when the robots.txt disallows the user-agent to the url' do
+      it 'returns false' do
+        robotex = Polipus::Robotex.new('bender')
+        expect(robotex.allowed?(spec_domain + 'my_shiny_metal_ass')).to be_falsey
+      end
+    end
+
+    context 'when the robots.txt disallows the user-agent to some urls, but allows this one' do
+      it 'returns true' do
+        robotex = Polipus::Robotex.new('bender')
+        expect(robotex.allowed?(spec_domain + 'cigars')).to be_truthy
+      end
+    end
+
+    context 'when the robots.txt disallows any user-agent to the url' do
+      it 'returns false' do
+        robotex = Polipus::Robotex.new
+        expect(robotex.allowed?(spec_domain + 'login')).to be_falsey
+      end
+    end
+
+    context 'when the robots.txt disallows and then allows the url' do
+      it 'returns false' do
+        robotex = Polipus::Robotex.new
+        expect(robotex.allowed?(spec_domain + 'locked')).to be_falsey
+      end
+    end
+  end
+
+  describe '#delay' do
+    context 'when no Crawl-Delay is specified for the user-agent' do
+      it 'returns nil' do
+        robotex = Polipus::Robotex.new
+        expect(robotex.delay(spec_domain)).to be_nil
+      end
+
+      context 'when Crawl-Delay is specified for the user-agent' do
+        it 'returns the delay as a Fixnum' do
+          robotex = Polipus::Robotex.new('msnbot')
+          expect(robotex.delay(spec_domain)).to eq(20)
+        end
+      end
+    end
+  end
+end
data/spec/polipus/signal_handler_spec.rb
@@ -0,0 +1,15 @@
+require 'spec_helper'
+
+describe Polipus::SignalHandler do
+  context 'signal handler' do
+    it 'should be enabled by default' do
+      Polipus::PolipusCrawler.new('polipus-rspec', [])
+      expect(Polipus::SignalHandler.enabled?).to be true
+    end
+
+    it 'should be disabled if specified' do
+      Polipus::PolipusCrawler.new('polipus-rspec', [], enable_signal_handler: false)
+      expect(Polipus::SignalHandler.enabled?).to be false
+    end
+  end
+end
data/spec/polipus/storage/memory_store_spec.rb
@@ -0,0 +1,87 @@
+# encoding: UTF-8
+require 'spec_helper'
+require 'mongo'
+require 'polipus/storage/memory_store'
+describe Polipus::Storage::MemoryStore do
+  let(:storage) { Polipus::Storage.memory_store }
+
+  it 'should store a page' do
+    p = page_factory 'http://www.google.com', code: 200, body: '<html></html>'
+    uuid = storage.add p
+    expect(uuid).to eq('ed646a3334ca891fd3467db131372140')
+    expect(storage.count).to be 1
+    p = storage.get p
+    expect(p.url.to_s).to eq('http://www.google.com')
+    expect(p.body).to eq('<html></html>')
+  end
+
+  it 'should update a page' do
+    p = page_factory 'http://www.google.com', code: 301, body: '<html></html>'
+    storage.add p
+    p = storage.get p
+    expect(p.code).to eq(301)
+  end
+
+  it 'should iterate over stored pages' do
+    storage.each do |k, page|
+      expect(k).to eq('ed646a3334ca891fd3467db131372140')
+      expect(page.url.to_s).to eq('http://www.google.com')
+    end
+  end
+
+  it 'should delete a page' do
+    p = page_factory 'http://www.google.com', code: 301, body: '<html></html>'
+    storage.remove p
+    expect(storage.get(p)).to be_nil
+    expect(storage.count).to be 0
+  end
+
+  it 'should store a page removing a query string from the uuid generation' do
+    p = page_factory 'http://www.asd.com/?asd=lol', code: 200, body: '<html></html>'
+    p_no_query = page_factory 'http://www.asd.com/?asdas=dasda&adsda=1', code: 200, body: '<html></html>'
+    storage.include_query_string_in_uuid = false
+    storage.add p
+    expect(storage.exists?(p_no_query)).to be_truthy
+    storage.remove p
+  end
+
+  it 'should store a page removing a query string from the uuid generation no ending slash' do
+    p = page_factory 'http://www.asd.com?asd=lol', code: 200, body: '<html></html>'
+    p_no_query = page_factory 'http://www.asd.com', code: 200, body: '<html></html>'
+    storage.include_query_string_in_uuid = false
+    storage.add p
+    expect(storage.exists?(p_no_query)).to be_truthy
+    storage.remove p
+  end
+
+  it 'should store a page with user data associated' do
+    p = page_factory 'http://www.user.com', code: 200, body: '<html></html>'
+    p.user_data.name = 'Test User Data'
+    storage.add p
+    expect(storage.exists?(p)).to be_truthy
+    p = storage.get(p)
+    expect(p.user_data.name).to eq('Test User Data')
+    storage.remove p
+  end
+
+  it 'should honor the except parameters' do
+    storage = Polipus::Storage.mongo_store(@mongo, '_test_pages', ['body'])
+    p = page_factory 'http://www.user-doo.com', code: 200, body: '<html></html>'
+    storage.add p
+    p = storage.get p
+    expect(p.body).to be_empty
+    storage.clear
+  end
+
+  it 'should return false if a doc not exists' do
+    storage.include_query_string_in_uuid = false
+    p_other = page_factory 'http://www.asdrrrr.com', code: 200, body: '<html></html>'
+    expect(storage.exists?(p_other)).to be_falsey
+    storage.add p_other
+    expect(storage.exists?(p_other)).to be_truthy
+    p_other = page_factory 'http://www.asdrrrr.com?trk=asd-lol', code: 200, body: '<html></html>'
+    expect(storage.exists?(p_other)).to be_truthy
+    storage.include_query_string_in_uuid = true
+    expect(storage.exists?(p_other)).to be_falsey
+  end
+end
data/spec/polipus/storage/mongo_store_spec.rb
@@ -0,0 +1,119 @@
+# encoding: UTF-8
+require 'spec_helper'
+require 'mongo'
+require 'polipus/storage/mongo_store'
+describe Polipus::Storage::MongoStore do
+  before(:all)do
+    @mongo = Mongo::Connection.new('localhost', 27_017, pool_size: 15, pool_timeout: 5).db('_test_polipus')
+    @mongo['_test_pages'].drop
+    @storage = Polipus::Storage.mongo_store(@mongo, '_test_pages')
+  end
+
+  after(:all) do
+    @mongo['_test_pages'].drop
+  end
+
+  after(:each) do
+    @mongo['_test_pages'].drop
+  end
+
+  it 'should store a page' do
+    p = page_factory 'http://www.google.com', code: 200, body: '<html></html>'
+    uuid = @storage.add p
+    expect(uuid).to eq('ed646a3334ca891fd3467db131372140')
+    expect(@storage.count).to be 1
+    expect(@mongo['_test_pages'].count).to be 1
+    p = @storage.get p
+    expect(p.url.to_s).to eq('http://www.google.com')
+    expect(p.body).to eq('<html></html>')
+  end
+
+  it 'should update a page' do
+    p = page_factory 'http://www.google.com', code: 301, body: '<html></html>'
+    @storage.add p
+    p = @storage.get p
+    expect(p.code).to eq(301)
+    expect(@mongo['_test_pages'].count).to be 1
+  end
+
+  it 'should iterate over stored pages' do
+    @storage.each do |k, page|
+      expect(k).to eq('ed646a3334ca891fd3467db131372140')
+      expect(page.url.to_s).to eq('http://www.google.com')
+    end
+  end
+
+  it 'should delete a page' do
+    p = page_factory 'http://www.google.com', code: 301, body: '<html></html>'
+    @storage.remove p
+    expect(@storage.get(p)).to be_nil
+    expect(@storage.count).to be 0
+  end
+
+  it 'should store a page removing a query string from the uuid generation' do
+    p = page_factory 'http://www.asd.com/?asd=lol', code: 200, body: '<html></html>'
+    p_no_query = page_factory 'http://www.asd.com/?asdas=dasda&adsda=1', code: 200, body: '<html></html>'
+    @storage.include_query_string_in_uuid = false
+    @storage.add p
+    expect(@storage.exists?(p_no_query)).to be_truthy
+    @storage.remove p
+  end
+
+  it 'should store a page removing a query string from the uuid generation no ending slash' do
+    p = page_factory 'http://www.asd.com?asd=lol', code: 200, body: '<html></html>'
+    p_no_query = page_factory 'http://www.asd.com', code: 200, body: '<html></html>'
+    @storage.include_query_string_in_uuid = false
+    @storage.add p
+    expect(@storage.exists?(p_no_query)).to be_truthy
+    @storage.remove p
+  end
+
+  it 'should store a page with user data associated' do
+    p = page_factory 'http://www.user.com', code: 200, body: '<html></html>'
+    p.user_data.name = 'Test User Data'
+    @storage.add p
+    expect(@storage.exists?(p)).to be_truthy
+    p = @storage.get(p)
+    expect(p.user_data.name).to eq('Test User Data')
+    @storage.remove p
+  end
+
+  it 'should honor the except parameters' do
+    storage = Polipus::Storage.mongo_store(@mongo, '_test_pages', ['body'])
+    p = page_factory 'http://www.user-doo.com', code: 200, body: '<html></html>'
+    storage.add p
+    p = storage.get p
+    expect(p.body).to be_empty
+    storage.clear
+  end
+
+  it 'should return false if a doc not exists' do
+    @storage.include_query_string_in_uuid = false
+    p_other = page_factory 'http://www.asdrrrr.com', code: 200, body: '<html></html>'
+    expect(@storage.exists?(p_other)).to be_falsey
+    @storage.add p_other
+    expect(@storage.exists?(p_other)).to be_truthy
+    p_other = page_factory 'http://www.asdrrrr.com?trk=asd-lol', code: 200, body: '<html></html>'
+    expect(@storage.exists?(p_other)).to be_truthy
+    @storage.include_query_string_in_uuid = true
+    expect(@storage.exists?(p_other)).to be_falsey
+  end
+
+  it 'should set page.fetched_at based on the id creation' do
+    storage = Polipus::Storage.mongo_store(@mongo, '_test_pages')
+    p = page_factory 'http://www.user-doojo.com', code: 200, body: '<html></html>'
+    storage.add p
+    expect(p.fetched_at).to be_nil
+    p = storage.get p
+    expect(p.fetched_at).not_to be_nil
+  end
+
+  it 'should NOT set page.fetched_at if already present' do
+    storage = Polipus::Storage.mongo_store(@mongo, '_test_pages')
+    p = page_factory 'http://www.user-doojooo.com', code: 200, body: '<html></html>'
+    p.fetched_at = 10
+    storage.add p
+    p = storage.get p
+    expect(p.fetched_at).to be 10
+  end
+end
data/spec/polipus/storage/rethink_store_spec.rb
@@ -0,0 +1,117 @@
+# encoding: UTF-8
+require 'spec_helper'
+require 'polipus/storage/rethink_store'
+
+describe Polipus::Storage::RethinkStore do
+  before(:all)do
+    @r = RethinkDB::RQL.new
+    @rethink = @r.connect(host: 'localhost', port: 28_015, db: 'polipus_spec')
+    @r.db_create('polipus_spec').run(@rethink) unless @r.db_list.run(@rethink).include?('polipus_spec')
+    @table = 'test_pages'
+    @storage = Polipus::Storage.rethink_store(@rethink, @table)
+  end
+
+  after(:each) do
+    @r.table(@table).delete.run(@rethink)
+  end
+
+  it 'should store a page' do
+    page = page_factory 'http://www.google.com', code: 200, body: '<html></html>'
+    uuid = @storage.add page
+    expect(uuid).to eq('ed646a3334ca891fd3467db131372140')
+    expect(@storage.count).to eq(1)
+    expect(@r.table(@table).count.run(@rethink)).to eq(1)
+    page = @storage.get page
+    expect(page.url.to_s).to eq('http://www.google.com')
+    expect(page.body).to eq('<html></html>')
+  end
+
+  it 'should update a page' do
+    page = page_factory 'http://www.google.com', code: 301, body: '<html></html>'
+    @storage.add page
+    page = @storage.get page
+    expect(page.code).to eq(301)
+    expect(@r.table(@table).count.run(@rethink)).to eq(1)
+  end
+
+  it 'should iterate over stored pages' do
+    @storage.each do |k, page|
+      expect(k).to eq('ed646a3334ca891fd3467db131372140')
+      expect(page.url.to_s).to eq('http://www.google.com')
+    end
+  end
+
+  it 'should delete a page' do
+    page = page_factory 'http://www.google.com', code: 301, body: '<html></html>'
+    @storage.remove page
+    expect(@storage.get(page)).to be_nil
+    expect(@storage.count).to be 0
+  end
+
+  it 'should store a page removing a query string from the uuid generation' do
+    page = page_factory 'http://www.asd.com/?asd=lol', code: 200, body: '<html></html>'
+    p_no_query = page_factory 'http://www.asd.com/?asdas=dasda&adsda=1', code: 200, body: '<html></html>'
+    @storage.include_query_string_in_uuid = false
+    @storage.add page
+    expect(@storage.exists?(p_no_query)).to be_truthy
+    @storage.remove page
+  end
+
+  it 'should store a page removing a query string from the uuid generation no ending slash' do
+    page = page_factory 'http://www.asd.com?asd=lol', code: 200, body: '<html></html>'
+    p_no_query = page_factory 'http://www.asd.com', code: 200, body: '<html></html>'
+    @storage.include_query_string_in_uuid = false
+    @storage.add page
+    expect(@storage.exists?(p_no_query)).to be_truthy
+    @storage.remove page
+  end
+
+  it 'should store a page with user data associated' do
+    page = page_factory 'http://www.user.com', code: 200, body: '<html></html>'
+    page.user_data.name = 'Test User Data'
+    @storage.add page
+    expect(@storage.exists?(page)).to be_truthy
+    page = @storage.get(page)
+    expect(page.user_data.name).to eq('Test User Data')
+    @storage.remove page
+  end
+
+  it 'should honor the except parameters' do
+    storage = Polipus::Storage.rethink_store(@rethink, @table, ['body'])
+    page = page_factory 'http://www.user-doo.com', code: 200, body: '<html></html>'
+    storage.add page
+    page = storage.get page
+    expect(page.body).to be_empty
+    storage.clear
+  end
+
+  it 'should return false if a doc not exists' do
+    @storage.include_query_string_in_uuid = false
+    p_other = page_factory 'http://www.asdrrrr.com', code: 200, body: '<html></html>'
+    expect(@storage.exists?(p_other)).to be_falsey
+    @storage.add p_other
+    expect(@storage.exists?(p_other)).to be_truthy
+    p_other = page_factory 'http://www.asdrrrr.com?trk=asd-lol', code: 200, body: '<html></html>'
+    expect(@storage.exists?(p_other)).to be_truthy
+    @storage.include_query_string_in_uuid = true
+    expect(@storage.exists?(p_other)).to be_falsey
+  end
+
+  it 'should set page.fetched_at based on the id creation' do
+    storage = Polipus::Storage.rethink_store(@rethink, @table)
+    page = page_factory 'http://www.user-doojo.com', code: 200, body: '<html></html>'
+    storage.add page
+    expect(page.fetched_at).to be_nil
+    page = storage.get page
+    expect(page.fetched_at).not_to be_nil
+  end
+
+  it 'should NOT set page.fetched_at if already present' do
+    storage = Polipus::Storage.rethink_store(@rethink, @table)
+    page = page_factory 'http://www.user-doojooo.com', code: 200, body: '<html></html>'
+    page.fetched_at = 10
+    storage.add page
+    page = storage.get page
+    expect(page.fetched_at).to be 10
+  end
+end
data/spec/polipus/url_tracker_spec.rb
@@ -0,0 +1,29 @@
+# encoding: UTF-8
+require 'spec_helper'
+require 'polipus/url_tracker'
+
+describe Polipus::UrlTracker do
+  before(:all) do
+    @bf = Polipus::UrlTracker.bloomfilter
+    @set = Polipus::UrlTracker.redis_set
+  end
+
+  after(:all) do
+    @bf.clear
+    @set.clear
+  end
+
+  it 'should work (bf)' do
+    url = 'http://www.asd.com/asd/lol'
+    @bf.visit url
+    expect(@bf.visited?(url)).to be_truthy
+    expect(@bf.visited?('http://www.google.com')).to be_falsey
+  end
+
+  it 'should work (redis_set)' do
+    url = 'http://www.asd.com/asd/lol'
+    @set.visit url
+    expect(@set.visited?(url)).to be_truthy
+    expect(@set.visited?('http://www.google.com')).to be_falsey
+  end
+end