polipus 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (55) hide show
  1. checksums.yaml +15 -0
  2. data/.document +5 -0
  3. data/.gitignore +53 -0
  4. data/.rspec +2 -0
  5. data/Gemfile +3 -0
  6. data/LICENSE.txt +20 -0
  7. data/README.md +55 -0
  8. data/README.rdoc +3 -0
  9. data/Rakefile +9 -0
  10. data/examples/basic.rb +58 -0
  11. data/examples/survival.rb +9 -0
  12. data/lib/polipus.rb +451 -0
  13. data/lib/polipus/http.rb +195 -0
  14. data/lib/polipus/page.rb +219 -0
  15. data/lib/polipus/plugin.rb +13 -0
  16. data/lib/polipus/plugins/cleaner.rb +25 -0
  17. data/lib/polipus/plugins/sample.rb +17 -0
  18. data/lib/polipus/plugins/sleeper.rb +22 -0
  19. data/lib/polipus/queue_overflow.rb +24 -0
  20. data/lib/polipus/queue_overflow/base.rb +6 -0
  21. data/lib/polipus/queue_overflow/dev_null_queue.rb +33 -0
  22. data/lib/polipus/queue_overflow/manager.rb +50 -0
  23. data/lib/polipus/queue_overflow/mongo_queue.rb +61 -0
  24. data/lib/polipus/queue_overflow/mongo_queue_capped.rb +28 -0
  25. data/lib/polipus/storage.rb +31 -0
  26. data/lib/polipus/storage/base.rb +17 -0
  27. data/lib/polipus/storage/dev_null.rb +35 -0
  28. data/lib/polipus/storage/mongo_store.rb +86 -0
  29. data/lib/polipus/storage/s3_store.rb +100 -0
  30. data/lib/polipus/url_tracker.rb +20 -0
  31. data/lib/polipus/url_tracker/bloomfilter.rb +27 -0
  32. data/lib/polipus/url_tracker/redis_set.rb +27 -0
  33. data/lib/polipus/version.rb +4 -0
  34. data/polipus.gemspec +39 -0
  35. data/spec/cassettes/08b228db424a926e1ed6ab63b38d847e.yml +166 -0
  36. data/spec/cassettes/20aa41f181b49f00078c3ca30bad5afe.yml +166 -0
  37. data/spec/cassettes/4640919145753505af2d0f8423de37f3.yml +270 -0
  38. data/spec/cassettes/66aae15a03f4aab8efd15e40d2d7882a.yml +194 -0
  39. data/spec/cassettes/76b7c197c95a5bf9b1e882c567192d72.yml +183 -0
  40. data/spec/cassettes/9b1d523b7f5db7214f8a8bd9272cccba.yml +221 -0
  41. data/spec/cassettes/ab333f89535a2efb284913fede6aa7c7.yml +221 -0
  42. data/spec/cassettes/ae5d7cffde3f53122cdf79f3d1367e8e.yml +221 -0
  43. data/spec/cassettes/ffe3d588b6df4b9de35e5a7ccaf5a81b.yml +695 -0
  44. data/spec/cassettes/http_test.yml +1418 -0
  45. data/spec/cassettes/http_test_redirect.yml +71 -0
  46. data/spec/clear.rb +11 -0
  47. data/spec/http_spec.rb +31 -0
  48. data/spec/page_spec.rb +22 -0
  49. data/spec/queue_overflow_manager_spec.rb +89 -0
  50. data/spec/queue_overflow_spec.rb +71 -0
  51. data/spec/spec_helper.rb +34 -0
  52. data/spec/storage_mongo_spec.rb +102 -0
  53. data/spec/storage_s3_spec.rb +115 -0
  54. data/spec/url_tracker_spec.rb +28 -0
  55. metadata +313 -0
@@ -0,0 +1,28 @@
1
+ require "spec_helper"
2
+ require "polipus/url_tracker"
3
+
4
+ describe Polipus::UrlTracker do
5
+ before(:all) do
6
+ @bf = Polipus::UrlTracker.bloomfilter
7
+ @set = Polipus::UrlTracker.redis_set
8
+ end
9
+
10
+ after(:all) do
11
+ @bf.clear
12
+ @set.clear
13
+ end
14
+
15
+ it 'should work (bf)' do
16
+ url = "http://www.asd.com/asd/lol"
17
+ @bf.visit url
18
+ @bf.visited?(url).should be_true
19
+ @bf.visited?("http://www.google.com").should be_false
20
+ end
21
+
22
+ it 'should work (redis_set)' do
23
+ url = "http://www.asd.com/asd/lol"
24
+ @set.visit url
25
+ @set.visited?(url).should be_true
26
+ @set.visited?("http://www.google.com").should be_false
27
+ end
28
+ end
metadata ADDED
@@ -0,0 +1,313 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: polipus
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ platform: ruby
6
+ authors:
7
+ - Francesco Laurita
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2014-01-05 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: redis-bloomfilter
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - ~>
18
+ - !ruby/object:Gem::Version
19
+ version: 0.0.1
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - ~>
25
+ - !ruby/object:Gem::Version
26
+ version: 0.0.1
27
+ - !ruby/object:Gem::Dependency
28
+ name: redis-queue
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ~>
32
+ - !ruby/object:Gem::Version
33
+ version: 0.0.3
34
+ type: :runtime
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - ~>
39
+ - !ruby/object:Gem::Version
40
+ version: 0.0.3
41
+ - !ruby/object:Gem::Dependency
42
+ name: nokogiri
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - ~>
46
+ - !ruby/object:Gem::Version
47
+ version: 1.6.0
48
+ type: :runtime
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - ~>
53
+ - !ruby/object:Gem::Version
54
+ version: 1.6.0
55
+ - !ruby/object:Gem::Dependency
56
+ name: hiredis
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - ~>
60
+ - !ruby/object:Gem::Version
61
+ version: 0.4.5
62
+ type: :runtime
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - ~>
67
+ - !ruby/object:Gem::Version
68
+ version: 0.4.5
69
+ - !ruby/object:Gem::Dependency
70
+ name: redis
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - ~>
74
+ - !ruby/object:Gem::Version
75
+ version: 3.0.4
76
+ type: :runtime
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - ~>
81
+ - !ruby/object:Gem::Version
82
+ version: 3.0.4
83
+ - !ruby/object:Gem::Dependency
84
+ name: mongo
85
+ requirement: !ruby/object:Gem::Requirement
86
+ requirements:
87
+ - - ~>
88
+ - !ruby/object:Gem::Version
89
+ version: 1.8.6
90
+ type: :runtime
91
+ prerelease: false
92
+ version_requirements: !ruby/object:Gem::Requirement
93
+ requirements:
94
+ - - ~>
95
+ - !ruby/object:Gem::Version
96
+ version: 1.8.6
97
+ - !ruby/object:Gem::Dependency
98
+ name: bson_ext
99
+ requirement: !ruby/object:Gem::Requirement
100
+ requirements:
101
+ - - ~>
102
+ - !ruby/object:Gem::Version
103
+ version: 1.8.6
104
+ type: :runtime
105
+ prerelease: false
106
+ version_requirements: !ruby/object:Gem::Requirement
107
+ requirements:
108
+ - - ~>
109
+ - !ruby/object:Gem::Version
110
+ version: 1.8.6
111
+ - !ruby/object:Gem::Dependency
112
+ name: json
113
+ requirement: !ruby/object:Gem::Requirement
114
+ requirements:
115
+ - - ~>
116
+ - !ruby/object:Gem::Version
117
+ version: 1.8.0
118
+ type: :runtime
119
+ prerelease: false
120
+ version_requirements: !ruby/object:Gem::Requirement
121
+ requirements:
122
+ - - ~>
123
+ - !ruby/object:Gem::Version
124
+ version: 1.8.0
125
+ - !ruby/object:Gem::Dependency
126
+ name: aws-s3
127
+ requirement: !ruby/object:Gem::Requirement
128
+ requirements:
129
+ - - ~>
130
+ - !ruby/object:Gem::Version
131
+ version: 0.6.3
132
+ type: :runtime
133
+ prerelease: false
134
+ version_requirements: !ruby/object:Gem::Requirement
135
+ requirements:
136
+ - - ~>
137
+ - !ruby/object:Gem::Version
138
+ version: 0.6.3
139
+ - !ruby/object:Gem::Dependency
140
+ name: http-cookie
141
+ requirement: !ruby/object:Gem::Requirement
142
+ requirements:
143
+ - - ~>
144
+ - !ruby/object:Gem::Version
145
+ version: 1.0.1
146
+ type: :runtime
147
+ prerelease: false
148
+ version_requirements: !ruby/object:Gem::Requirement
149
+ requirements:
150
+ - - ~>
151
+ - !ruby/object:Gem::Version
152
+ version: 1.0.1
153
+ - !ruby/object:Gem::Dependency
154
+ name: rspec
155
+ requirement: !ruby/object:Gem::Requirement
156
+ requirements:
157
+ - - ! '>='
158
+ - !ruby/object:Gem::Version
159
+ version: '0'
160
+ type: :development
161
+ prerelease: false
162
+ version_requirements: !ruby/object:Gem::Requirement
163
+ requirements:
164
+ - - ! '>='
165
+ - !ruby/object:Gem::Version
166
+ version: '0'
167
+ - !ruby/object:Gem::Dependency
168
+ name: vcr
169
+ requirement: !ruby/object:Gem::Requirement
170
+ requirements:
171
+ - - ~>
172
+ - !ruby/object:Gem::Version
173
+ version: 2.5.0
174
+ type: :development
175
+ prerelease: false
176
+ version_requirements: !ruby/object:Gem::Requirement
177
+ requirements:
178
+ - - ~>
179
+ - !ruby/object:Gem::Version
180
+ version: 2.5.0
181
+ - !ruby/object:Gem::Dependency
182
+ name: webmock
183
+ requirement: !ruby/object:Gem::Requirement
184
+ requirements:
185
+ - - ! '>='
186
+ - !ruby/object:Gem::Version
187
+ version: '0'
188
+ type: :development
189
+ prerelease: false
190
+ version_requirements: !ruby/object:Gem::Requirement
191
+ requirements:
192
+ - - ! '>='
193
+ - !ruby/object:Gem::Version
194
+ version: '0'
195
+ - !ruby/object:Gem::Dependency
196
+ name: flexmock
197
+ requirement: !ruby/object:Gem::Requirement
198
+ requirements:
199
+ - - ~>
200
+ - !ruby/object:Gem::Version
201
+ version: 1.3.2
202
+ type: :development
203
+ prerelease: false
204
+ version_requirements: !ruby/object:Gem::Requirement
205
+ requirements:
206
+ - - ~>
207
+ - !ruby/object:Gem::Version
208
+ version: 1.3.2
209
+ description: ! "\n An easy to use distributed web-crawler framework based on Redis\n
210
+ \ "
211
+ email:
212
+ - francesco.laurita@gmail.com
213
+ executables: []
214
+ extensions: []
215
+ extra_rdoc_files: []
216
+ files:
217
+ - .document
218
+ - .gitignore
219
+ - .rspec
220
+ - Gemfile
221
+ - LICENSE.txt
222
+ - README.md
223
+ - README.rdoc
224
+ - Rakefile
225
+ - examples/basic.rb
226
+ - examples/survival.rb
227
+ - lib/polipus.rb
228
+ - lib/polipus/http.rb
229
+ - lib/polipus/page.rb
230
+ - lib/polipus/plugin.rb
231
+ - lib/polipus/plugins/cleaner.rb
232
+ - lib/polipus/plugins/sample.rb
233
+ - lib/polipus/plugins/sleeper.rb
234
+ - lib/polipus/queue_overflow.rb
235
+ - lib/polipus/queue_overflow/base.rb
236
+ - lib/polipus/queue_overflow/dev_null_queue.rb
237
+ - lib/polipus/queue_overflow/manager.rb
238
+ - lib/polipus/queue_overflow/mongo_queue.rb
239
+ - lib/polipus/queue_overflow/mongo_queue_capped.rb
240
+ - lib/polipus/storage.rb
241
+ - lib/polipus/storage/base.rb
242
+ - lib/polipus/storage/dev_null.rb
243
+ - lib/polipus/storage/mongo_store.rb
244
+ - lib/polipus/storage/s3_store.rb
245
+ - lib/polipus/url_tracker.rb
246
+ - lib/polipus/url_tracker/bloomfilter.rb
247
+ - lib/polipus/url_tracker/redis_set.rb
248
+ - lib/polipus/version.rb
249
+ - polipus.gemspec
250
+ - spec/cassettes/08b228db424a926e1ed6ab63b38d847e.yml
251
+ - spec/cassettes/20aa41f181b49f00078c3ca30bad5afe.yml
252
+ - spec/cassettes/4640919145753505af2d0f8423de37f3.yml
253
+ - spec/cassettes/66aae15a03f4aab8efd15e40d2d7882a.yml
254
+ - spec/cassettes/76b7c197c95a5bf9b1e882c567192d72.yml
255
+ - spec/cassettes/9b1d523b7f5db7214f8a8bd9272cccba.yml
256
+ - spec/cassettes/ab333f89535a2efb284913fede6aa7c7.yml
257
+ - spec/cassettes/ae5d7cffde3f53122cdf79f3d1367e8e.yml
258
+ - spec/cassettes/ffe3d588b6df4b9de35e5a7ccaf5a81b.yml
259
+ - spec/cassettes/http_test.yml
260
+ - spec/cassettes/http_test_redirect.yml
261
+ - spec/clear.rb
262
+ - spec/http_spec.rb
263
+ - spec/page_spec.rb
264
+ - spec/queue_overflow_manager_spec.rb
265
+ - spec/queue_overflow_spec.rb
266
+ - spec/spec_helper.rb
267
+ - spec/storage_mongo_spec.rb
268
+ - spec/storage_s3_spec.rb
269
+ - spec/url_tracker_spec.rb
270
+ homepage: https://github.com/taganaka/polipus
271
+ licenses: []
272
+ metadata: {}
273
+ post_install_message:
274
+ rdoc_options: []
275
+ require_paths:
276
+ - lib
277
+ required_ruby_version: !ruby/object:Gem::Requirement
278
+ requirements:
279
+ - - ! '>='
280
+ - !ruby/object:Gem::Version
281
+ version: '0'
282
+ required_rubygems_version: !ruby/object:Gem::Requirement
283
+ requirements:
284
+ - - ! '>='
285
+ - !ruby/object:Gem::Version
286
+ version: '0'
287
+ requirements: []
288
+ rubyforge_project: polipus
289
+ rubygems_version: 2.0.7
290
+ signing_key:
291
+ specification_version: 4
292
+ summary: Polipus distributed web-crawler framework
293
+ test_files:
294
+ - spec/cassettes/08b228db424a926e1ed6ab63b38d847e.yml
295
+ - spec/cassettes/20aa41f181b49f00078c3ca30bad5afe.yml
296
+ - spec/cassettes/4640919145753505af2d0f8423de37f3.yml
297
+ - spec/cassettes/66aae15a03f4aab8efd15e40d2d7882a.yml
298
+ - spec/cassettes/76b7c197c95a5bf9b1e882c567192d72.yml
299
+ - spec/cassettes/9b1d523b7f5db7214f8a8bd9272cccba.yml
300
+ - spec/cassettes/ab333f89535a2efb284913fede6aa7c7.yml
301
+ - spec/cassettes/ae5d7cffde3f53122cdf79f3d1367e8e.yml
302
+ - spec/cassettes/ffe3d588b6df4b9de35e5a7ccaf5a81b.yml
303
+ - spec/cassettes/http_test.yml
304
+ - spec/cassettes/http_test_redirect.yml
305
+ - spec/clear.rb
306
+ - spec/http_spec.rb
307
+ - spec/page_spec.rb
308
+ - spec/queue_overflow_manager_spec.rb
309
+ - spec/queue_overflow_spec.rb
310
+ - spec/spec_helper.rb
311
+ - spec/storage_mongo_spec.rb
312
+ - spec/storage_s3_spec.rb
313
+ - spec/url_tracker_spec.rb