polipus 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (55) hide show
  1. checksums.yaml +15 -0
  2. data/.document +5 -0
  3. data/.gitignore +53 -0
  4. data/.rspec +2 -0
  5. data/Gemfile +3 -0
  6. data/LICENSE.txt +20 -0
  7. data/README.md +55 -0
  8. data/README.rdoc +3 -0
  9. data/Rakefile +9 -0
  10. data/examples/basic.rb +58 -0
  11. data/examples/survival.rb +9 -0
  12. data/lib/polipus.rb +451 -0
  13. data/lib/polipus/http.rb +195 -0
  14. data/lib/polipus/page.rb +219 -0
  15. data/lib/polipus/plugin.rb +13 -0
  16. data/lib/polipus/plugins/cleaner.rb +25 -0
  17. data/lib/polipus/plugins/sample.rb +17 -0
  18. data/lib/polipus/plugins/sleeper.rb +22 -0
  19. data/lib/polipus/queue_overflow.rb +24 -0
  20. data/lib/polipus/queue_overflow/base.rb +6 -0
  21. data/lib/polipus/queue_overflow/dev_null_queue.rb +33 -0
  22. data/lib/polipus/queue_overflow/manager.rb +50 -0
  23. data/lib/polipus/queue_overflow/mongo_queue.rb +61 -0
  24. data/lib/polipus/queue_overflow/mongo_queue_capped.rb +28 -0
  25. data/lib/polipus/storage.rb +31 -0
  26. data/lib/polipus/storage/base.rb +17 -0
  27. data/lib/polipus/storage/dev_null.rb +35 -0
  28. data/lib/polipus/storage/mongo_store.rb +86 -0
  29. data/lib/polipus/storage/s3_store.rb +100 -0
  30. data/lib/polipus/url_tracker.rb +20 -0
  31. data/lib/polipus/url_tracker/bloomfilter.rb +27 -0
  32. data/lib/polipus/url_tracker/redis_set.rb +27 -0
  33. data/lib/polipus/version.rb +4 -0
  34. data/polipus.gemspec +39 -0
  35. data/spec/cassettes/08b228db424a926e1ed6ab63b38d847e.yml +166 -0
  36. data/spec/cassettes/20aa41f181b49f00078c3ca30bad5afe.yml +166 -0
  37. data/spec/cassettes/4640919145753505af2d0f8423de37f3.yml +270 -0
  38. data/spec/cassettes/66aae15a03f4aab8efd15e40d2d7882a.yml +194 -0
  39. data/spec/cassettes/76b7c197c95a5bf9b1e882c567192d72.yml +183 -0
  40. data/spec/cassettes/9b1d523b7f5db7214f8a8bd9272cccba.yml +221 -0
  41. data/spec/cassettes/ab333f89535a2efb284913fede6aa7c7.yml +221 -0
  42. data/spec/cassettes/ae5d7cffde3f53122cdf79f3d1367e8e.yml +221 -0
  43. data/spec/cassettes/ffe3d588b6df4b9de35e5a7ccaf5a81b.yml +695 -0
  44. data/spec/cassettes/http_test.yml +1418 -0
  45. data/spec/cassettes/http_test_redirect.yml +71 -0
  46. data/spec/clear.rb +11 -0
  47. data/spec/http_spec.rb +31 -0
  48. data/spec/page_spec.rb +22 -0
  49. data/spec/queue_overflow_manager_spec.rb +89 -0
  50. data/spec/queue_overflow_spec.rb +71 -0
  51. data/spec/spec_helper.rb +34 -0
  52. data/spec/storage_mongo_spec.rb +102 -0
  53. data/spec/storage_s3_spec.rb +115 -0
  54. data/spec/url_tracker_spec.rb +28 -0
  55. metadata +313 -0
@@ -0,0 +1,20 @@
1
module Polipus
  # Factory methods for the URL trackers used to remember which URLs have
  # already been visited. Each factory lazily requires its implementation
  # file, fills in defaults for missing settings and returns a new tracker.
  module UrlTracker
    # Builds a tracker backed by Polipus::UrlTracker::Bloomfilter.
    #
    # Settings that are missing (or nil/false) get these defaults:
    # size 1_000_000, error_rate 0.01, key_name 'polipus-bloomfilter',
    # redis Redis.current, driver 'lua'.
    #
    # @param options [Hash] settings forwarded to Bloomfilter.new
    # @return [Polipus::UrlTracker::Bloomfilter]
    def self.bloomfilter(options = {})
      require "polipus/url_tracker/bloomfilter"
      # Work on a copy so the caller's hash is never modified.
      settings = options.dup
      settings[:size]       ||= 1_000_000
      settings[:error_rate] ||= 0.01
      settings[:key_name]   ||= 'polipus-bloomfilter'
      # Redis.current is only evaluated when no connection was supplied.
      settings[:redis]      ||= Redis.current
      settings[:driver]     ||= 'lua'
      self::Bloomfilter.new settings
    end

    # Builds a tracker backed by Polipus::UrlTracker::RedisSet.
    #
    # Settings that are missing (or nil/false) get these defaults:
    # redis Redis.current, key_name 'polipus-set'.
    #
    # @param options [Hash] settings forwarded to RedisSet.new
    # @return [Polipus::UrlTracker::RedisSet]
    def self.redis_set(options = {})
      require "polipus/url_tracker/redis_set"
      # Work on a copy so the caller's hash is never modified.
      settings = options.dup
      settings[:redis]    ||= Redis.current
      settings[:key_name] ||= 'polipus-set'
      self::RedisSet.new settings
    end
  end
end
@@ -0,0 +1,27 @@
1
+ require "redis-bloomfilter"
2
module Polipus
  module UrlTracker
    # URL tracker that delegates every operation to a Redis::Bloomfilter
    # instance created from the given options.
    class Bloomfilter
      # @param options [Hash] passed unchanged to Redis::Bloomfilter.new
      def initialize(options = {})
        @filter = Redis::Bloomfilter.new(options)
      end

      # @param url [Object] value to look up
      # @return whatever the filter's include? returns for +url+
      def visited?(url)
        @filter.include?(url)
      end

      # Records +url+ in the filter.
      # @return whatever the filter's insert returns
      def visit(url)
        @filter.insert(url)
      end

      # Asks the filter to drop +url+.
      # @return whatever the filter's remove returns
      def remove(url)
        @filter.remove(url)
      end

      # Empties the filter.
      # @return whatever the filter's clear returns
      def clear
        @filter.clear
      end
    end
  end
end
@@ -0,0 +1,27 @@
1
module Polipus
  module UrlTracker
    # URL tracker that stores visited URLs as members of a single Redis set.
    class RedisSet
      # @param options [Hash]
      #   :redis    - Redis connection to use; falls back to Redis.current
      #   :key_name - name of the Redis set (nil when not supplied)
      def initialize(options = {})
        @redis    = options[:redis] || Redis.current
        @set_name = options[:key_name]
      end

      # @param url [String] URL to look up
      # @return the result of SISMEMBER for +url+ on the tracked set
      def visited?(url)
        @redis.sismember(@set_name, url)
      end

      # Adds +url+ to the tracked set.
      # @return the result of SADD
      def visit(url)
        @redis.sadd(@set_name, url)
      end

      # Removes +url+ from the tracked set.
      #
      # The method previously took no argument yet referenced +url+, so any
      # call raised NameError; it also handed an extra positional argument
      # to srem. It now takes the URL, like Bloomfilter#remove.
      #
      # @param url [String] URL to forget
      # @return the result of SREM
      def remove(url)
        @redis.srem(@set_name, url)
      end

      # Deletes the whole set.
      # @return the result of DEL
      def clear
        @redis.del(@set_name)
      end
    end
  end
end
@@ -0,0 +1,4 @@
1
# Gem-wide constants for Polipus.
module Polipus
  # Release version of the gem.
  VERSION  = '0.0.1'
  # Project home page.
  HOMEPAGE = 'https://github.com/taganaka/polipus'
end
data/polipus.gemspec ADDED
@@ -0,0 +1,39 @@
1
+ # -*- encoding: utf-8 -*-
2
+ $:.push File.expand_path("../lib", __FILE__)
3
+ require "polipus/version"
4
+
5
# Gem specification for polipus. The version is read from Polipus::VERSION
# and the file lists are taken from what git tracks.
Gem::Specification.new do |spec|
  spec.name        = "polipus"
  spec.version     = Polipus::VERSION
  spec.authors     = ["Francesco Laurita"]
  spec.email       = ["francesco.laurita@gmail.com"]
  spec.homepage    = "https://github.com/taganaka/polipus"
  spec.summary     = %q{Polipus distributed web-crawler framework}
  spec.description = %q{
    An easy to use distributed web-crawler framework based on Redis
  }

  spec.rubyforge_project = "polipus"

  # Everything below shells out to git, so the gem must be built from a checkout.
  spec.files         = `git ls-files`.split("\n")
  spec.test_files    = `git ls-files -- {test,spec,features}/*`.split("\n")
  spec.executables   = `git ls-files -- bin/*`.split("\n").map { |path| File.basename(path) }
  spec.require_paths = ["lib"]

  # Runtime dependencies, declared in the same order as before.
  {
    "redis-bloomfilter" => "~> 0.0.1",
    "redis-queue"       => "~> 0.0.3",
    "nokogiri"          => "~> 1.6.0",
    "hiredis"           => "~> 0.4.5",
    "redis"             => "~> 3.0.4",
    "mongo"             => "~> 1.8.6",
    "bson_ext"          => "~> 1.8.6",
    "json"              => "~> 1.8.0",
    "aws-s3"            => "~> 0.6.3",
    "http-cookie"       => "~> 1.0.1"
  }.each do |gem_name, requirement|
    spec.add_dependency gem_name, requirement
  end

  # Development-only dependencies; entries without a requirement are unpinned.
  [
    ["rspec"],
    ["vcr", "~> 2.5.0"],
    ["webmock"],
    ["flexmock", "~> 1.3.2"]
  ].each do |dependency|
    spec.add_development_dependency(*dependency)
  end
end
@@ -0,0 +1,166 @@
1
+ ---
2
+ http_interactions:
3
+ - request:
4
+ method: get
5
+ uri: http://s3.amazonaws.com/com.polipus.pages._test_pages
6
+ body:
7
+ encoding: US-ASCII
8
+ string: ''
9
+ response:
10
+ status:
11
+ code: 200
12
+ message: OK
13
+ body:
14
+ encoding: US-ASCII
15
+ string: ! '<?xml version="1.0" encoding="UTF-8"?>
16
+
17
+ <ListBucketResult xmlns="http://s3.amazonaws.com/doc/2006-03-01/"><Name>com.polipus.pages._test_pages</Name><Prefix></Prefix><Marker></Marker><MaxKeys>1000</MaxKeys><IsTruncated>false</IsTruncated></ListBucketResult>'
18
+ http_version:
19
+ recorded_at: Thu, 18 Jul 2013 11:04:53 GMT
20
+ - request:
21
+ method: put
22
+ uri: http://s3.amazonaws.com/com.polipus.pages._test_pages/8b86345aba1588c6b0b0c1729e077c7b
23
+ body:
24
+ encoding: ASCII-8BIT
25
+ string: !binary |-
26
+ eJxVTUtugzAQvcusaTBpAtRJ21Uj9QyhQv5MZFSDkRkLRSh37xTURVfvO28W
27
+ SNGDBEc0yjyf53mnJrszoX9nfPXBQwYOlcU4ca1JQohDo5eVlJ/N6hSFCQPh
28
+ QE90H3GL5AYfl+u/qhCbPG3iwus62DtPnx31/u2cr8C274Zvfnn9ysAEiyD3
29
+ QmRgcSQHklnEG0aMfAm/wnYRDbUU/oxpDMOELXU93w7J+wxuSMahBUkxYQZp
30
+ wthaRQrk8mCZOo6g1nX5fDgqrYpjXZtSCy1MUe1fUFSVqTQ8fgD3C19t
31
+ response:
32
+ status:
33
+ code: 200
34
+ message: OK
35
+ body:
36
+ encoding: US-ASCII
37
+ string: ''
38
+ http_version:
39
+ recorded_at: Thu, 18 Jul 2013 11:04:53 GMT
40
+ - request:
41
+ method: head
42
+ uri: http://s3.amazonaws.com/com.polipus.pages._test_pages/8b86345aba1588c6b0b0c1729e077c7b
43
+ body:
44
+ encoding: US-ASCII
45
+ string: ''
46
+ response:
47
+ status:
48
+ code: 200
49
+ message: OK
50
+ body:
51
+ encoding: US-ASCII
52
+ string: ''
53
+ http_version:
54
+ recorded_at: Thu, 18 Jul 2013 11:04:54 GMT
55
+ - request:
56
+ method: head
57
+ uri: http://s3.amazonaws.com/com.polipus.pages._test_pages/8b86345aba1588c6b0b0c1729e077c7b
58
+ body:
59
+ encoding: US-ASCII
60
+ string: ''
61
+ response:
62
+ status:
63
+ code: 200
64
+ message: OK
65
+ body:
66
+ encoding: US-ASCII
67
+ string: ''
68
+ http_version:
69
+ recorded_at: Thu, 18 Jul 2013 11:04:54 GMT
70
+ - request:
71
+ method: delete
72
+ uri: http://s3.amazonaws.com/com.polipus.pages._test_pages/8b86345aba1588c6b0b0c1729e077c7b
73
+ body:
74
+ encoding: US-ASCII
75
+ string: ''
76
+ response:
77
+ status:
78
+ code: 204
79
+ message: No Content
80
+ body:
81
+ encoding: US-ASCII
82
+ string: ''
83
+ http_version:
84
+ recorded_at: Thu, 18 Jul 2013 11:04:54 GMT
85
+ - request:
86
+ method: get
87
+ uri: http://s3.amazonaws.com/com.polipus.pages._test_pages
88
+ body:
89
+ encoding: US-ASCII
90
+ string: ''
91
+ response:
92
+ status:
93
+ code: 200
94
+ message: OK
95
+ body:
96
+ encoding: US-ASCII
97
+ string: ! '<?xml version="1.0" encoding="UTF-8"?>
98
+
99
+ <ListBucketResult xmlns="http://s3.amazonaws.com/doc/2006-03-01/"><Name>com.polipus.pages._test_pages</Name><Prefix></Prefix><Marker></Marker><MaxKeys>1000</MaxKeys><IsTruncated>false</IsTruncated></ListBucketResult>'
100
+ http_version:
101
+ recorded_at: Thu, 18 Jul 2013 11:04:54 GMT
102
+ - request:
103
+ method: get
104
+ uri: http://s3.amazonaws.com/com.polipus.pages._test_pages
105
+ body:
106
+ encoding: US-ASCII
107
+ string: ''
108
+ response:
109
+ status:
110
+ code: 200
111
+ message: OK
112
+ body:
113
+ encoding: US-ASCII
114
+ string: ! '<?xml version="1.0" encoding="UTF-8"?>
115
+
116
+ <ListBucketResult xmlns="http://s3.amazonaws.com/doc/2006-03-01/"><Name>com.polipus.pages._test_pages</Name><Prefix></Prefix><Marker></Marker><MaxKeys>1000</MaxKeys><IsTruncated>false</IsTruncated></ListBucketResult>'
117
+ http_version:
118
+ recorded_at: Thu, 18 Jul 2013 11:04:55 GMT
119
+ - request:
120
+ method: delete
121
+ uri: http://s3.amazonaws.com/com.polipus.pages._test_pages
122
+ body:
123
+ encoding: US-ASCII
124
+ string: ''
125
+ response:
126
+ status:
127
+ code: 204
128
+ message: No Content
129
+ body:
130
+ encoding: US-ASCII
131
+ string: ''
132
+ http_version:
133
+ recorded_at: Thu, 18 Jul 2013 11:04:55 GMT
134
+ - request:
135
+ method: put
136
+ uri: http://s3.amazonaws.com/com.polipus.pages._test_pages
137
+ body:
138
+ encoding: US-ASCII
139
+ string: ''
140
+ response:
141
+ status:
142
+ code: 200
143
+ message: OK
144
+ body:
145
+ encoding: US-ASCII
146
+ string: ''
147
+ http_version:
148
+ recorded_at: Thu, 18 Jul 2013 11:04:55 GMT
149
+ - request:
150
+ method: get
151
+ uri: http://s3.amazonaws.com/com.polipus.pages._test_pages
152
+ body:
153
+ encoding: US-ASCII
154
+ string: ''
155
+ response:
156
+ status:
157
+ code: 200
158
+ message: OK
159
+ body:
160
+ encoding: US-ASCII
161
+ string: ! '<?xml version="1.0" encoding="UTF-8"?>
162
+
163
+ <ListBucketResult xmlns="http://s3.amazonaws.com/doc/2006-03-01/"><Name>com.polipus.pages._test_pages</Name><Prefix></Prefix><Marker></Marker><MaxKeys>1000</MaxKeys><IsTruncated>false</IsTruncated></ListBucketResult>'
164
+ http_version:
165
+ recorded_at: Thu, 18 Jul 2013 11:04:56 GMT
166
+ recorded_with: VCR 2.5.0
@@ -0,0 +1,166 @@
1
+ ---
2
+ http_interactions:
3
+ - request:
4
+ method: get
5
+ uri: http://s3.amazonaws.com/com.polipus.pages._test_pages
6
+ body:
7
+ encoding: US-ASCII
8
+ string: ''
9
+ response:
10
+ status:
11
+ code: 200
12
+ message: OK
13
+ body:
14
+ encoding: US-ASCII
15
+ string: ! '<?xml version="1.0" encoding="UTF-8"?>
16
+
17
+ <ListBucketResult xmlns="http://s3.amazonaws.com/doc/2006-03-01/"><Name>com.polipus.pages._test_pages</Name><Prefix></Prefix><Marker></Marker><MaxKeys>1000</MaxKeys><IsTruncated>false</IsTruncated></ListBucketResult>'
18
+ http_version:
19
+ recorded_at: Thu, 18 Jul 2013 11:04:38 GMT
20
+ - request:
21
+ method: put
22
+ uri: http://s3.amazonaws.com/com.polipus.pages._test_pages/ef727a04499928b997ca7f26fcedd1b4
23
+ body:
24
+ encoding: ASCII-8BIT
25
+ string: !binary |-
26
+ eJxVTc1qhDAQfpc5p2sUWWu6bU9d6DOsRWIyojQaSSbIIvvunSo99PT9zjcb
27
+ pOBAwUC0qCxb1/Wkoz0ZP2XvTF6ddyBgQG0xRO41SUpZNt22k/Nnszt5bvxM
28
+ ONMT3Rc8InXAx/X2ryrlIV8OceX1zts7T18GmtzbJduBbTfO3/zy9iXAeIug
29
+ CikFWFxoAMUsYI8BA1/Cr7BjQEMt+T8jLn6O2NI48e2cnBPQI5kBLSgKCQWk
30
+ iKG1mjSo7cEyjRwB9lVRaVmWdV0Xz11dV0ZXfXHuDVqbdyU8fgA35WAW
31
+ response:
32
+ status:
33
+ code: 200
34
+ message: OK
35
+ body:
36
+ encoding: US-ASCII
37
+ string: ''
38
+ http_version:
39
+ recorded_at: Thu, 18 Jul 2013 11:04:38 GMT
40
+ - request:
41
+ method: head
42
+ uri: http://s3.amazonaws.com/com.polipus.pages._test_pages/ef727a04499928b997ca7f26fcedd1b4
43
+ body:
44
+ encoding: US-ASCII
45
+ string: ''
46
+ response:
47
+ status:
48
+ code: 200
49
+ message: OK
50
+ body:
51
+ encoding: US-ASCII
52
+ string: ''
53
+ http_version:
54
+ recorded_at: Thu, 18 Jul 2013 11:04:39 GMT
55
+ - request:
56
+ method: head
57
+ uri: http://s3.amazonaws.com/com.polipus.pages._test_pages/ef727a04499928b997ca7f26fcedd1b4
58
+ body:
59
+ encoding: US-ASCII
60
+ string: ''
61
+ response:
62
+ status:
63
+ code: 200
64
+ message: OK
65
+ body:
66
+ encoding: US-ASCII
67
+ string: ''
68
+ http_version:
69
+ recorded_at: Thu, 18 Jul 2013 11:04:39 GMT
70
+ - request:
71
+ method: delete
72
+ uri: http://s3.amazonaws.com/com.polipus.pages._test_pages/ef727a04499928b997ca7f26fcedd1b4
73
+ body:
74
+ encoding: US-ASCII
75
+ string: ''
76
+ response:
77
+ status:
78
+ code: 204
79
+ message: No Content
80
+ body:
81
+ encoding: US-ASCII
82
+ string: ''
83
+ http_version:
84
+ recorded_at: Thu, 18 Jul 2013 11:04:39 GMT
85
+ - request:
86
+ method: get
87
+ uri: http://s3.amazonaws.com/com.polipus.pages._test_pages
88
+ body:
89
+ encoding: US-ASCII
90
+ string: ''
91
+ response:
92
+ status:
93
+ code: 200
94
+ message: OK
95
+ body:
96
+ encoding: US-ASCII
97
+ string: ! '<?xml version="1.0" encoding="UTF-8"?>
98
+
99
+ <ListBucketResult xmlns="http://s3.amazonaws.com/doc/2006-03-01/"><Name>com.polipus.pages._test_pages</Name><Prefix></Prefix><Marker></Marker><MaxKeys>1000</MaxKeys><IsTruncated>false</IsTruncated></ListBucketResult>'
100
+ http_version:
101
+ recorded_at: Thu, 18 Jul 2013 11:04:39 GMT
102
+ - request:
103
+ method: get
104
+ uri: http://s3.amazonaws.com/com.polipus.pages._test_pages
105
+ body:
106
+ encoding: US-ASCII
107
+ string: ''
108
+ response:
109
+ status:
110
+ code: 200
111
+ message: OK
112
+ body:
113
+ encoding: US-ASCII
114
+ string: ! '<?xml version="1.0" encoding="UTF-8"?>
115
+
116
+ <ListBucketResult xmlns="http://s3.amazonaws.com/doc/2006-03-01/"><Name>com.polipus.pages._test_pages</Name><Prefix></Prefix><Marker></Marker><MaxKeys>1000</MaxKeys><IsTruncated>false</IsTruncated></ListBucketResult>'
117
+ http_version:
118
+ recorded_at: Thu, 18 Jul 2013 11:04:40 GMT
119
+ - request:
120
+ method: delete
121
+ uri: http://s3.amazonaws.com/com.polipus.pages._test_pages
122
+ body:
123
+ encoding: US-ASCII
124
+ string: ''
125
+ response:
126
+ status:
127
+ code: 204
128
+ message: No Content
129
+ body:
130
+ encoding: US-ASCII
131
+ string: ''
132
+ http_version:
133
+ recorded_at: Thu, 18 Jul 2013 11:04:40 GMT
134
+ - request:
135
+ method: put
136
+ uri: http://s3.amazonaws.com/com.polipus.pages._test_pages
137
+ body:
138
+ encoding: US-ASCII
139
+ string: ''
140
+ response:
141
+ status:
142
+ code: 200
143
+ message: OK
144
+ body:
145
+ encoding: US-ASCII
146
+ string: ''
147
+ http_version:
148
+ recorded_at: Thu, 18 Jul 2013 11:04:40 GMT
149
+ - request:
150
+ method: get
151
+ uri: http://s3.amazonaws.com/com.polipus.pages._test_pages
152
+ body:
153
+ encoding: US-ASCII
154
+ string: ''
155
+ response:
156
+ status:
157
+ code: 200
158
+ message: OK
159
+ body:
160
+ encoding: US-ASCII
161
+ string: ! '<?xml version="1.0" encoding="UTF-8"?>
162
+
163
+ <ListBucketResult xmlns="http://s3.amazonaws.com/doc/2006-03-01/"><Name>com.polipus.pages._test_pages</Name><Prefix></Prefix><Marker></Marker><MaxKeys>1000</MaxKeys><IsTruncated>false</IsTruncated></ListBucketResult>'
164
+ http_version:
165
+ recorded_at: Thu, 18 Jul 2013 11:04:41 GMT
166
+ recorded_with: VCR 2.5.0