polipus 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55) hide show
  1. checksums.yaml +15 -0
  2. data/.document +5 -0
  3. data/.gitignore +53 -0
  4. data/.rspec +2 -0
  5. data/Gemfile +3 -0
  6. data/LICENSE.txt +20 -0
  7. data/README.md +55 -0
  8. data/README.rdoc +3 -0
  9. data/Rakefile +9 -0
  10. data/examples/basic.rb +58 -0
  11. data/examples/survival.rb +9 -0
  12. data/lib/polipus.rb +451 -0
  13. data/lib/polipus/http.rb +195 -0
  14. data/lib/polipus/page.rb +219 -0
  15. data/lib/polipus/plugin.rb +13 -0
  16. data/lib/polipus/plugins/cleaner.rb +25 -0
  17. data/lib/polipus/plugins/sample.rb +17 -0
  18. data/lib/polipus/plugins/sleeper.rb +22 -0
  19. data/lib/polipus/queue_overflow.rb +24 -0
  20. data/lib/polipus/queue_overflow/base.rb +6 -0
  21. data/lib/polipus/queue_overflow/dev_null_queue.rb +33 -0
  22. data/lib/polipus/queue_overflow/manager.rb +50 -0
  23. data/lib/polipus/queue_overflow/mongo_queue.rb +61 -0
  24. data/lib/polipus/queue_overflow/mongo_queue_capped.rb +28 -0
  25. data/lib/polipus/storage.rb +31 -0
  26. data/lib/polipus/storage/base.rb +17 -0
  27. data/lib/polipus/storage/dev_null.rb +35 -0
  28. data/lib/polipus/storage/mongo_store.rb +86 -0
  29. data/lib/polipus/storage/s3_store.rb +100 -0
  30. data/lib/polipus/url_tracker.rb +20 -0
  31. data/lib/polipus/url_tracker/bloomfilter.rb +27 -0
  32. data/lib/polipus/url_tracker/redis_set.rb +27 -0
  33. data/lib/polipus/version.rb +4 -0
  34. data/polipus.gemspec +39 -0
  35. data/spec/cassettes/08b228db424a926e1ed6ab63b38d847e.yml +166 -0
  36. data/spec/cassettes/20aa41f181b49f00078c3ca30bad5afe.yml +166 -0
  37. data/spec/cassettes/4640919145753505af2d0f8423de37f3.yml +270 -0
  38. data/spec/cassettes/66aae15a03f4aab8efd15e40d2d7882a.yml +194 -0
  39. data/spec/cassettes/76b7c197c95a5bf9b1e882c567192d72.yml +183 -0
  40. data/spec/cassettes/9b1d523b7f5db7214f8a8bd9272cccba.yml +221 -0
  41. data/spec/cassettes/ab333f89535a2efb284913fede6aa7c7.yml +221 -0
  42. data/spec/cassettes/ae5d7cffde3f53122cdf79f3d1367e8e.yml +221 -0
  43. data/spec/cassettes/ffe3d588b6df4b9de35e5a7ccaf5a81b.yml +695 -0
  44. data/spec/cassettes/http_test.yml +1418 -0
  45. data/spec/cassettes/http_test_redirect.yml +71 -0
  46. data/spec/clear.rb +11 -0
  47. data/spec/http_spec.rb +31 -0
  48. data/spec/page_spec.rb +22 -0
  49. data/spec/queue_overflow_manager_spec.rb +89 -0
  50. data/spec/queue_overflow_spec.rb +71 -0
  51. data/spec/spec_helper.rb +34 -0
  52. data/spec/storage_mongo_spec.rb +102 -0
  53. data/spec/storage_s3_spec.rb +115 -0
  54. data/spec/url_tracker_spec.rb +28 -0
  55. metadata +313 -0
@@ -0,0 +1,20 @@
1
module Polipus
  # Factory methods for the URL-tracker backends.
  #
  # Each factory requires its backend file on first use, fills in every
  # option that is missing (or nil/false) with a default, and returns a new
  # tracker instance built from the resulting options.
  module UrlTracker
    # Builds a Bloom-filter backed tracker.
    #
    # Defaults applied when the key is absent or falsy:
    #   :size       => 1_000_000
    #   :error_rate => 0.01
    #   :key_name   => 'polipus-bloomfilter'
    #   :redis      => Redis.current
    #   :driver     => 'lua'
    #
    # @param options [Hash] tracker options; the caller's hash is not modified
    # @return [Polipus::UrlTracker::Bloomfilter]
    def self.bloomfilter(options = {})
      require "polipus/url_tracker/bloomfilter"
      # Work on a copy: applying defaults in place would leak them into the
      # caller's hash and would raise if that hash is frozen.
      opts = options.dup
      opts[:size]       ||= 1_000_000
      opts[:error_rate] ||= 0.01
      opts[:key_name]   ||= 'polipus-bloomfilter'
      # ||= keeps Redis.current from being evaluated when a client is supplied.
      opts[:redis]      ||= Redis.current
      opts[:driver]     ||= 'lua'
      self::Bloomfilter.new opts
    end

    # Builds a Redis-set backed tracker.
    #
    # Defaults applied when the key is absent or falsy:
    #   :redis    => Redis.current
    #   :key_name => 'polipus-set'
    #
    # @param options [Hash] tracker options; the caller's hash is not modified
    # @return [Polipus::UrlTracker::RedisSet]
    def self.redis_set(options = {})
      require "polipus/url_tracker/redis_set"
      # Same reasoning as in .bloomfilter: never mutate the caller's hash.
      opts = options.dup
      opts[:redis]    ||= Redis.current
      opts[:key_name] ||= 'polipus-set'
      self::RedisSet.new opts
    end
  end
end
@@ -0,0 +1,27 @@
1
+ require "redis-bloomfilter"
2
module Polipus
  module UrlTracker
    # URL tracker that forwards every operation to a Redis::Bloomfilter
    # instance created from the options given at construction time.
    class Bloomfilter
      # @param options [Hash] handed unchanged to Redis::Bloomfilter.new
      def initialize(options = {})
        @bf = Redis::Bloomfilter.new(options)
      end

      # @param url [Object] value to look up in the filter
      # @return whatever the underlying filter's #include? returns
      def visited?(url)
        @bf.include? url
      end

      # Records +url+ by inserting it into the underlying filter.
      def visit(url)
        @bf.insert(url)
      end

      # Asks the underlying filter to remove +url+.
      def remove(url)
        @bf.remove(url)
      end

      # Asks the underlying filter to clear itself.
      def clear
        @bf.clear
      end
    end
  end
end
@@ -0,0 +1,27 @@
1
module Polipus
  module UrlTracker
    # URL tracker that stores visited URLs as members of a single Redis set.
    class RedisSet
      # @param options [Hash]
      #   :redis    - client used for every command; Redis.current when absent
      #   :key_name - name of the Redis key that holds the set
      def initialize(options = {})
        @redis    = options[:redis] || Redis.current
        @set_name = options[:key_name]
      end

      # @param url [String] URL to look up
      # @return the client's SISMEMBER result for +url+
      def visited?(url)
        @redis.sismember(@set_name, url)
      end

      # Adds +url+ to the set.
      #
      # @param url [String]
      # @return the client's SADD result
      def visit(url)
        @redis.sadd(@set_name, url)
      end

      # Removes +url+ from the set.
      #
      # The method previously took no parameter while its body referenced
      # +url+, and it passed a stray third argument to SREM, so it could never
      # succeed. It now mirrors Bloomfilter#remove and issues SREM key member.
      #
      # @param url [String]
      # @return the client's SREM result
      def remove(url)
        @redis.srem(@set_name, url)
      end

      # Deletes the whole set key.
      #
      # @return the client's DEL result
      def clear
        @redis.del(@set_name)
      end
    end
  end
end
@@ -0,0 +1,4 @@
1
# Gem-wide constants for Polipus.
module Polipus
  # Version string of this release.
  VERSION  = '0.0.1'
  # URL of the project's home page.
  HOMEPAGE = 'https://github.com/taganaka/polipus'
end
data/polipus.gemspec ADDED
@@ -0,0 +1,39 @@
1
+ # -*- encoding: utf-8 -*-
2
+ $:.push File.expand_path("../lib", __FILE__)
3
+ require "polipus/version"
4
+
5
# Gem specification for polipus: identity metadata, packaged files (taken from
# what git tracks), and runtime / development dependencies.
Gem::Specification.new do |spec|
  # --- identity -----------------------------------------------------------
  spec.name        = "polipus"
  spec.version     = Polipus::VERSION
  spec.authors     = ["Francesco Laurita"]
  spec.email       = ["francesco.laurita@gmail.com"]
  spec.homepage    = "https://github.com/taganaka/polipus"
  spec.summary     = "Polipus distributed web-crawler framework"
  # NOTE(review): whitespace inside this literal is reproduced as it appears
  # in this view of the file — confirm against the repository copy.
  spec.description = "\nAn easy to use distributed web-crawler framework based on Redis\n"

  spec.rubyforge_project = "polipus"

  # --- packaged files (everything tracked by git) --------------------------
  tracked     = %x(git ls-files).split("\n")
  test_paths  = %x(git ls-files -- {test,spec,features}/*).split("\n")
  bin_paths   = %x(git ls-files -- bin/*).split("\n")

  spec.files         = tracked
  spec.test_files    = test_paths
  spec.executables   = bin_paths.map { |path| File.basename(path) }
  spec.require_paths = ["lib"]

  # --- runtime dependencies -------------------------------------------------
  [
    ["redis-bloomfilter", "~> 0.0.1"],
    ["redis-queue",       "~> 0.0.3"],
    ["nokogiri",          "~> 1.6.0"],
    ["hiredis",           "~> 0.4.5"],
    ["redis",             "~> 3.0.4"],
    ["mongo",             "~> 1.8.6"],
    ["bson_ext",          "~> 1.8.6"],
    ["json",              "~> 1.8.0"],
    ["aws-s3",            "~> 0.6.3"],
    ["http-cookie",       "~> 1.0.1"]
  ].each do |gem_name, requirement|
    spec.add_dependency gem_name, requirement
  end

  # --- development dependencies (rspec and webmock are unpinned) -----------
  [
    ["rspec"],
    ["vcr", "~> 2.5.0"],
    ["webmock"],
    ["flexmock", "~> 1.3.2"]
  ].each do |dependency|
    spec.add_development_dependency(*dependency)
  end
end
@@ -0,0 +1,166 @@
1
+ ---
2
+ http_interactions:
3
+ - request:
4
+ method: get
5
+ uri: http://s3.amazonaws.com/com.polipus.pages._test_pages
6
+ body:
7
+ encoding: US-ASCII
8
+ string: ''
9
+ response:
10
+ status:
11
+ code: 200
12
+ message: OK
13
+ body:
14
+ encoding: US-ASCII
15
+ string: ! '<?xml version="1.0" encoding="UTF-8"?>
16
+
17
+ <ListBucketResult xmlns="http://s3.amazonaws.com/doc/2006-03-01/"><Name>com.polipus.pages._test_pages</Name><Prefix></Prefix><Marker></Marker><MaxKeys>1000</MaxKeys><IsTruncated>false</IsTruncated></ListBucketResult>'
18
+ http_version:
19
+ recorded_at: Thu, 18 Jul 2013 11:04:53 GMT
20
+ - request:
21
+ method: put
22
+ uri: http://s3.amazonaws.com/com.polipus.pages._test_pages/8b86345aba1588c6b0b0c1729e077c7b
23
+ body:
24
+ encoding: ASCII-8BIT
25
+ string: !binary |-
26
+ eJxVTUtugzAQvcusaTBpAtRJ21Uj9QyhQv5MZFSDkRkLRSh37xTURVfvO28W
27
+ SNGDBEc0yjyf53mnJrszoX9nfPXBQwYOlcU4ca1JQohDo5eVlJ/N6hSFCQPh
28
+ QE90H3GL5AYfl+u/qhCbPG3iwus62DtPnx31/u2cr8C274Zvfnn9ysAEiyD3
29
+ QmRgcSQHklnEG0aMfAm/wnYRDbUU/oxpDMOELXU93w7J+wxuSMahBUkxYQZp
30
+ wthaRQrk8mCZOo6g1nX5fDgqrYpjXZtSCy1MUe1fUFSVqTQ8fgD3C19t
31
+ response:
32
+ status:
33
+ code: 200
34
+ message: OK
35
+ body:
36
+ encoding: US-ASCII
37
+ string: ''
38
+ http_version:
39
+ recorded_at: Thu, 18 Jul 2013 11:04:53 GMT
40
+ - request:
41
+ method: head
42
+ uri: http://s3.amazonaws.com/com.polipus.pages._test_pages/8b86345aba1588c6b0b0c1729e077c7b
43
+ body:
44
+ encoding: US-ASCII
45
+ string: ''
46
+ response:
47
+ status:
48
+ code: 200
49
+ message: OK
50
+ body:
51
+ encoding: US-ASCII
52
+ string: ''
53
+ http_version:
54
+ recorded_at: Thu, 18 Jul 2013 11:04:54 GMT
55
+ - request:
56
+ method: head
57
+ uri: http://s3.amazonaws.com/com.polipus.pages._test_pages/8b86345aba1588c6b0b0c1729e077c7b
58
+ body:
59
+ encoding: US-ASCII
60
+ string: ''
61
+ response:
62
+ status:
63
+ code: 200
64
+ message: OK
65
+ body:
66
+ encoding: US-ASCII
67
+ string: ''
68
+ http_version:
69
+ recorded_at: Thu, 18 Jul 2013 11:04:54 GMT
70
+ - request:
71
+ method: delete
72
+ uri: http://s3.amazonaws.com/com.polipus.pages._test_pages/8b86345aba1588c6b0b0c1729e077c7b
73
+ body:
74
+ encoding: US-ASCII
75
+ string: ''
76
+ response:
77
+ status:
78
+ code: 204
79
+ message: No Content
80
+ body:
81
+ encoding: US-ASCII
82
+ string: ''
83
+ http_version:
84
+ recorded_at: Thu, 18 Jul 2013 11:04:54 GMT
85
+ - request:
86
+ method: get
87
+ uri: http://s3.amazonaws.com/com.polipus.pages._test_pages
88
+ body:
89
+ encoding: US-ASCII
90
+ string: ''
91
+ response:
92
+ status:
93
+ code: 200
94
+ message: OK
95
+ body:
96
+ encoding: US-ASCII
97
+ string: ! '<?xml version="1.0" encoding="UTF-8"?>
98
+
99
+ <ListBucketResult xmlns="http://s3.amazonaws.com/doc/2006-03-01/"><Name>com.polipus.pages._test_pages</Name><Prefix></Prefix><Marker></Marker><MaxKeys>1000</MaxKeys><IsTruncated>false</IsTruncated></ListBucketResult>'
100
+ http_version:
101
+ recorded_at: Thu, 18 Jul 2013 11:04:54 GMT
102
+ - request:
103
+ method: get
104
+ uri: http://s3.amazonaws.com/com.polipus.pages._test_pages
105
+ body:
106
+ encoding: US-ASCII
107
+ string: ''
108
+ response:
109
+ status:
110
+ code: 200
111
+ message: OK
112
+ body:
113
+ encoding: US-ASCII
114
+ string: ! '<?xml version="1.0" encoding="UTF-8"?>
115
+
116
+ <ListBucketResult xmlns="http://s3.amazonaws.com/doc/2006-03-01/"><Name>com.polipus.pages._test_pages</Name><Prefix></Prefix><Marker></Marker><MaxKeys>1000</MaxKeys><IsTruncated>false</IsTruncated></ListBucketResult>'
117
+ http_version:
118
+ recorded_at: Thu, 18 Jul 2013 11:04:55 GMT
119
+ - request:
120
+ method: delete
121
+ uri: http://s3.amazonaws.com/com.polipus.pages._test_pages
122
+ body:
123
+ encoding: US-ASCII
124
+ string: ''
125
+ response:
126
+ status:
127
+ code: 204
128
+ message: No Content
129
+ body:
130
+ encoding: US-ASCII
131
+ string: ''
132
+ http_version:
133
+ recorded_at: Thu, 18 Jul 2013 11:04:55 GMT
134
+ - request:
135
+ method: put
136
+ uri: http://s3.amazonaws.com/com.polipus.pages._test_pages
137
+ body:
138
+ encoding: US-ASCII
139
+ string: ''
140
+ response:
141
+ status:
142
+ code: 200
143
+ message: OK
144
+ body:
145
+ encoding: US-ASCII
146
+ string: ''
147
+ http_version:
148
+ recorded_at: Thu, 18 Jul 2013 11:04:55 GMT
149
+ - request:
150
+ method: get
151
+ uri: http://s3.amazonaws.com/com.polipus.pages._test_pages
152
+ body:
153
+ encoding: US-ASCII
154
+ string: ''
155
+ response:
156
+ status:
157
+ code: 200
158
+ message: OK
159
+ body:
160
+ encoding: US-ASCII
161
+ string: ! '<?xml version="1.0" encoding="UTF-8"?>
162
+
163
+ <ListBucketResult xmlns="http://s3.amazonaws.com/doc/2006-03-01/"><Name>com.polipus.pages._test_pages</Name><Prefix></Prefix><Marker></Marker><MaxKeys>1000</MaxKeys><IsTruncated>false</IsTruncated></ListBucketResult>'
164
+ http_version:
165
+ recorded_at: Thu, 18 Jul 2013 11:04:56 GMT
166
+ recorded_with: VCR 2.5.0
@@ -0,0 +1,166 @@
1
+ ---
2
+ http_interactions:
3
+ - request:
4
+ method: get
5
+ uri: http://s3.amazonaws.com/com.polipus.pages._test_pages
6
+ body:
7
+ encoding: US-ASCII
8
+ string: ''
9
+ response:
10
+ status:
11
+ code: 200
12
+ message: OK
13
+ body:
14
+ encoding: US-ASCII
15
+ string: ! '<?xml version="1.0" encoding="UTF-8"?>
16
+
17
+ <ListBucketResult xmlns="http://s3.amazonaws.com/doc/2006-03-01/"><Name>com.polipus.pages._test_pages</Name><Prefix></Prefix><Marker></Marker><MaxKeys>1000</MaxKeys><IsTruncated>false</IsTruncated></ListBucketResult>'
18
+ http_version:
19
+ recorded_at: Thu, 18 Jul 2013 11:04:38 GMT
20
+ - request:
21
+ method: put
22
+ uri: http://s3.amazonaws.com/com.polipus.pages._test_pages/ef727a04499928b997ca7f26fcedd1b4
23
+ body:
24
+ encoding: ASCII-8BIT
25
+ string: !binary |-
26
+ eJxVTc1qhDAQfpc5p2sUWWu6bU9d6DOsRWIyojQaSSbIIvvunSo99PT9zjcb
27
+ pOBAwUC0qCxb1/Wkoz0ZP2XvTF6ddyBgQG0xRO41SUpZNt22k/Nnszt5bvxM
28
+ ONMT3Rc8InXAx/X2ryrlIV8OceX1zts7T18GmtzbJduBbTfO3/zy9iXAeIug
29
+ CikFWFxoAMUsYI8BA1/Cr7BjQEMt+T8jLn6O2NI48e2cnBPQI5kBLSgKCQWk
30
+ iKG1mjSo7cEyjRwB9lVRaVmWdV0Xz11dV0ZXfXHuDVqbdyU8fgA35WAW
31
+ response:
32
+ status:
33
+ code: 200
34
+ message: OK
35
+ body:
36
+ encoding: US-ASCII
37
+ string: ''
38
+ http_version:
39
+ recorded_at: Thu, 18 Jul 2013 11:04:38 GMT
40
+ - request:
41
+ method: head
42
+ uri: http://s3.amazonaws.com/com.polipus.pages._test_pages/ef727a04499928b997ca7f26fcedd1b4
43
+ body:
44
+ encoding: US-ASCII
45
+ string: ''
46
+ response:
47
+ status:
48
+ code: 200
49
+ message: OK
50
+ body:
51
+ encoding: US-ASCII
52
+ string: ''
53
+ http_version:
54
+ recorded_at: Thu, 18 Jul 2013 11:04:39 GMT
55
+ - request:
56
+ method: head
57
+ uri: http://s3.amazonaws.com/com.polipus.pages._test_pages/ef727a04499928b997ca7f26fcedd1b4
58
+ body:
59
+ encoding: US-ASCII
60
+ string: ''
61
+ response:
62
+ status:
63
+ code: 200
64
+ message: OK
65
+ body:
66
+ encoding: US-ASCII
67
+ string: ''
68
+ http_version:
69
+ recorded_at: Thu, 18 Jul 2013 11:04:39 GMT
70
+ - request:
71
+ method: delete
72
+ uri: http://s3.amazonaws.com/com.polipus.pages._test_pages/ef727a04499928b997ca7f26fcedd1b4
73
+ body:
74
+ encoding: US-ASCII
75
+ string: ''
76
+ response:
77
+ status:
78
+ code: 204
79
+ message: No Content
80
+ body:
81
+ encoding: US-ASCII
82
+ string: ''
83
+ http_version:
84
+ recorded_at: Thu, 18 Jul 2013 11:04:39 GMT
85
+ - request:
86
+ method: get
87
+ uri: http://s3.amazonaws.com/com.polipus.pages._test_pages
88
+ body:
89
+ encoding: US-ASCII
90
+ string: ''
91
+ response:
92
+ status:
93
+ code: 200
94
+ message: OK
95
+ body:
96
+ encoding: US-ASCII
97
+ string: ! '<?xml version="1.0" encoding="UTF-8"?>
98
+
99
+ <ListBucketResult xmlns="http://s3.amazonaws.com/doc/2006-03-01/"><Name>com.polipus.pages._test_pages</Name><Prefix></Prefix><Marker></Marker><MaxKeys>1000</MaxKeys><IsTruncated>false</IsTruncated></ListBucketResult>'
100
+ http_version:
101
+ recorded_at: Thu, 18 Jul 2013 11:04:39 GMT
102
+ - request:
103
+ method: get
104
+ uri: http://s3.amazonaws.com/com.polipus.pages._test_pages
105
+ body:
106
+ encoding: US-ASCII
107
+ string: ''
108
+ response:
109
+ status:
110
+ code: 200
111
+ message: OK
112
+ body:
113
+ encoding: US-ASCII
114
+ string: ! '<?xml version="1.0" encoding="UTF-8"?>
115
+
116
+ <ListBucketResult xmlns="http://s3.amazonaws.com/doc/2006-03-01/"><Name>com.polipus.pages._test_pages</Name><Prefix></Prefix><Marker></Marker><MaxKeys>1000</MaxKeys><IsTruncated>false</IsTruncated></ListBucketResult>'
117
+ http_version:
118
+ recorded_at: Thu, 18 Jul 2013 11:04:40 GMT
119
+ - request:
120
+ method: delete
121
+ uri: http://s3.amazonaws.com/com.polipus.pages._test_pages
122
+ body:
123
+ encoding: US-ASCII
124
+ string: ''
125
+ response:
126
+ status:
127
+ code: 204
128
+ message: No Content
129
+ body:
130
+ encoding: US-ASCII
131
+ string: ''
132
+ http_version:
133
+ recorded_at: Thu, 18 Jul 2013 11:04:40 GMT
134
+ - request:
135
+ method: put
136
+ uri: http://s3.amazonaws.com/com.polipus.pages._test_pages
137
+ body:
138
+ encoding: US-ASCII
139
+ string: ''
140
+ response:
141
+ status:
142
+ code: 200
143
+ message: OK
144
+ body:
145
+ encoding: US-ASCII
146
+ string: ''
147
+ http_version:
148
+ recorded_at: Thu, 18 Jul 2013 11:04:40 GMT
149
+ - request:
150
+ method: get
151
+ uri: http://s3.amazonaws.com/com.polipus.pages._test_pages
152
+ body:
153
+ encoding: US-ASCII
154
+ string: ''
155
+ response:
156
+ status:
157
+ code: 200
158
+ message: OK
159
+ body:
160
+ encoding: US-ASCII
161
+ string: ! '<?xml version="1.0" encoding="UTF-8"?>
162
+
163
+ <ListBucketResult xmlns="http://s3.amazonaws.com/doc/2006-03-01/"><Name>com.polipus.pages._test_pages</Name><Prefix></Prefix><Marker></Marker><MaxKeys>1000</MaxKeys><IsTruncated>false</IsTruncated></ListBucketResult>'
164
+ http_version:
165
+ recorded_at: Thu, 18 Jul 2013 11:04:41 GMT
166
+ recorded_with: VCR 2.5.0