polipus 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55)
  1. checksums.yaml +15 -0
  2. data/.document +5 -0
  3. data/.gitignore +53 -0
  4. data/.rspec +2 -0
  5. data/Gemfile +3 -0
  6. data/LICENSE.txt +20 -0
  7. data/README.md +55 -0
  8. data/README.rdoc +3 -0
  9. data/Rakefile +9 -0
  10. data/examples/basic.rb +58 -0
  11. data/examples/survival.rb +9 -0
  12. data/lib/polipus.rb +451 -0
  13. data/lib/polipus/http.rb +195 -0
  14. data/lib/polipus/page.rb +219 -0
  15. data/lib/polipus/plugin.rb +13 -0
  16. data/lib/polipus/plugins/cleaner.rb +25 -0
  17. data/lib/polipus/plugins/sample.rb +17 -0
  18. data/lib/polipus/plugins/sleeper.rb +22 -0
  19. data/lib/polipus/queue_overflow.rb +24 -0
  20. data/lib/polipus/queue_overflow/base.rb +6 -0
  21. data/lib/polipus/queue_overflow/dev_null_queue.rb +33 -0
  22. data/lib/polipus/queue_overflow/manager.rb +50 -0
  23. data/lib/polipus/queue_overflow/mongo_queue.rb +61 -0
  24. data/lib/polipus/queue_overflow/mongo_queue_capped.rb +28 -0
  25. data/lib/polipus/storage.rb +31 -0
  26. data/lib/polipus/storage/base.rb +17 -0
  27. data/lib/polipus/storage/dev_null.rb +35 -0
  28. data/lib/polipus/storage/mongo_store.rb +86 -0
  29. data/lib/polipus/storage/s3_store.rb +100 -0
  30. data/lib/polipus/url_tracker.rb +20 -0
  31. data/lib/polipus/url_tracker/bloomfilter.rb +27 -0
  32. data/lib/polipus/url_tracker/redis_set.rb +27 -0
  33. data/lib/polipus/version.rb +4 -0
  34. data/polipus.gemspec +39 -0
  35. data/spec/cassettes/08b228db424a926e1ed6ab63b38d847e.yml +166 -0
  36. data/spec/cassettes/20aa41f181b49f00078c3ca30bad5afe.yml +166 -0
  37. data/spec/cassettes/4640919145753505af2d0f8423de37f3.yml +270 -0
  38. data/spec/cassettes/66aae15a03f4aab8efd15e40d2d7882a.yml +194 -0
  39. data/spec/cassettes/76b7c197c95a5bf9b1e882c567192d72.yml +183 -0
  40. data/spec/cassettes/9b1d523b7f5db7214f8a8bd9272cccba.yml +221 -0
  41. data/spec/cassettes/ab333f89535a2efb284913fede6aa7c7.yml +221 -0
  42. data/spec/cassettes/ae5d7cffde3f53122cdf79f3d1367e8e.yml +221 -0
  43. data/spec/cassettes/ffe3d588b6df4b9de35e5a7ccaf5a81b.yml +695 -0
  44. data/spec/cassettes/http_test.yml +1418 -0
  45. data/spec/cassettes/http_test_redirect.yml +71 -0
  46. data/spec/clear.rb +11 -0
  47. data/spec/http_spec.rb +31 -0
  48. data/spec/page_spec.rb +22 -0
  49. data/spec/queue_overflow_manager_spec.rb +89 -0
  50. data/spec/queue_overflow_spec.rb +71 -0
  51. data/spec/spec_helper.rb +34 -0
  52. data/spec/storage_mongo_spec.rb +102 -0
  53. data/spec/storage_s3_spec.rb +115 -0
  54. data/spec/url_tracker_spec.rb +28 -0
  55. metadata +313 -0
@@ -0,0 +1,28 @@
1
+ require "spec_helper"
2
+ require "polipus/url_tracker"
3
+
4
+ describe Polipus::UrlTracker do # Spec group: checks visit / visited? on the two trackers built in before(:all).
5
+ before(:all) do # Runs once for the whole group, so both trackers are shared by every example below.
6
+ @bf = Polipus::UrlTracker.bloomfilter # Tracker returned by UrlTracker.bloomfilter. NOTE(review): presumably needs a reachable Redis - confirm.
7
+ @set = Polipus::UrlTracker.redis_set # Tracker returned by UrlTracker.redis_set.
8
+ end
9
+
10
+ after(:all) do # Runs once after the last example: calls clear on both trackers.
11
+ @bf.clear
12
+ @set.clear
13
+ end
14
+
15
+ it 'should work (bf)' do # Bloomfilter tracker: a visited URL is reported as visited, another URL is not.
16
+ url = "http://www.asd.com/asd/lol"
17
+ @bf.visit url # Record the URL in the tracker.
18
+ @bf.visited?(url).should be_true
19
+ @bf.visited?("http://www.google.com").should be_false # This URL is never passed to visit anywhere in this spec.
20
+ end
21
+
22
+ it 'should work (redis_set)' do # Same scenario as above, run against the redis_set tracker.
23
+ url = "http://www.asd.com/asd/lol"
24
+ @set.visit url # Record the URL in the tracker.
25
+ @set.visited?(url).should be_true
26
+ @set.visited?("http://www.google.com").should be_false # This URL is never passed to visit anywhere in this spec.
27
+ end
28
+ end
metadata ADDED
@@ -0,0 +1,313 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: polipus
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ platform: ruby
6
+ authors:
7
+ - Francesco Laurita
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2014-01-05 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: redis-bloomfilter
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - ~>
18
+ - !ruby/object:Gem::Version
19
+ version: 0.0.1
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - ~>
25
+ - !ruby/object:Gem::Version
26
+ version: 0.0.1
27
+ - !ruby/object:Gem::Dependency
28
+ name: redis-queue
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ~>
32
+ - !ruby/object:Gem::Version
33
+ version: 0.0.3
34
+ type: :runtime
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - ~>
39
+ - !ruby/object:Gem::Version
40
+ version: 0.0.3
41
+ - !ruby/object:Gem::Dependency
42
+ name: nokogiri
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - ~>
46
+ - !ruby/object:Gem::Version
47
+ version: 1.6.0
48
+ type: :runtime
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - ~>
53
+ - !ruby/object:Gem::Version
54
+ version: 1.6.0
55
+ - !ruby/object:Gem::Dependency
56
+ name: hiredis
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - ~>
60
+ - !ruby/object:Gem::Version
61
+ version: 0.4.5
62
+ type: :runtime
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - ~>
67
+ - !ruby/object:Gem::Version
68
+ version: 0.4.5
69
+ - !ruby/object:Gem::Dependency
70
+ name: redis
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - ~>
74
+ - !ruby/object:Gem::Version
75
+ version: 3.0.4
76
+ type: :runtime
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - ~>
81
+ - !ruby/object:Gem::Version
82
+ version: 3.0.4
83
+ - !ruby/object:Gem::Dependency
84
+ name: mongo
85
+ requirement: !ruby/object:Gem::Requirement
86
+ requirements:
87
+ - - ~>
88
+ - !ruby/object:Gem::Version
89
+ version: 1.8.6
90
+ type: :runtime
91
+ prerelease: false
92
+ version_requirements: !ruby/object:Gem::Requirement
93
+ requirements:
94
+ - - ~>
95
+ - !ruby/object:Gem::Version
96
+ version: 1.8.6
97
+ - !ruby/object:Gem::Dependency
98
+ name: bson_ext
99
+ requirement: !ruby/object:Gem::Requirement
100
+ requirements:
101
+ - - ~>
102
+ - !ruby/object:Gem::Version
103
+ version: 1.8.6
104
+ type: :runtime
105
+ prerelease: false
106
+ version_requirements: !ruby/object:Gem::Requirement
107
+ requirements:
108
+ - - ~>
109
+ - !ruby/object:Gem::Version
110
+ version: 1.8.6
111
+ - !ruby/object:Gem::Dependency
112
+ name: json
113
+ requirement: !ruby/object:Gem::Requirement
114
+ requirements:
115
+ - - ~>
116
+ - !ruby/object:Gem::Version
117
+ version: 1.8.0
118
+ type: :runtime
119
+ prerelease: false
120
+ version_requirements: !ruby/object:Gem::Requirement
121
+ requirements:
122
+ - - ~>
123
+ - !ruby/object:Gem::Version
124
+ version: 1.8.0
125
+ - !ruby/object:Gem::Dependency
126
+ name: aws-s3
127
+ requirement: !ruby/object:Gem::Requirement
128
+ requirements:
129
+ - - ~>
130
+ - !ruby/object:Gem::Version
131
+ version: 0.6.3
132
+ type: :runtime
133
+ prerelease: false
134
+ version_requirements: !ruby/object:Gem::Requirement
135
+ requirements:
136
+ - - ~>
137
+ - !ruby/object:Gem::Version
138
+ version: 0.6.3
139
+ - !ruby/object:Gem::Dependency
140
+ name: http-cookie
141
+ requirement: !ruby/object:Gem::Requirement
142
+ requirements:
143
+ - - ~>
144
+ - !ruby/object:Gem::Version
145
+ version: 1.0.1
146
+ type: :runtime
147
+ prerelease: false
148
+ version_requirements: !ruby/object:Gem::Requirement
149
+ requirements:
150
+ - - ~>
151
+ - !ruby/object:Gem::Version
152
+ version: 1.0.1
153
+ - !ruby/object:Gem::Dependency
154
+ name: rspec
155
+ requirement: !ruby/object:Gem::Requirement
156
+ requirements:
157
+ - - ! '>='
158
+ - !ruby/object:Gem::Version
159
+ version: '0'
160
+ type: :development
161
+ prerelease: false
162
+ version_requirements: !ruby/object:Gem::Requirement
163
+ requirements:
164
+ - - ! '>='
165
+ - !ruby/object:Gem::Version
166
+ version: '0'
167
+ - !ruby/object:Gem::Dependency
168
+ name: vcr
169
+ requirement: !ruby/object:Gem::Requirement
170
+ requirements:
171
+ - - ~>
172
+ - !ruby/object:Gem::Version
173
+ version: 2.5.0
174
+ type: :development
175
+ prerelease: false
176
+ version_requirements: !ruby/object:Gem::Requirement
177
+ requirements:
178
+ - - ~>
179
+ - !ruby/object:Gem::Version
180
+ version: 2.5.0
181
+ - !ruby/object:Gem::Dependency
182
+ name: webmock
183
+ requirement: !ruby/object:Gem::Requirement
184
+ requirements:
185
+ - - ! '>='
186
+ - !ruby/object:Gem::Version
187
+ version: '0'
188
+ type: :development
189
+ prerelease: false
190
+ version_requirements: !ruby/object:Gem::Requirement
191
+ requirements:
192
+ - - ! '>='
193
+ - !ruby/object:Gem::Version
194
+ version: '0'
195
+ - !ruby/object:Gem::Dependency
196
+ name: flexmock
197
+ requirement: !ruby/object:Gem::Requirement
198
+ requirements:
199
+ - - ~>
200
+ - !ruby/object:Gem::Version
201
+ version: 1.3.2
202
+ type: :development
203
+ prerelease: false
204
+ version_requirements: !ruby/object:Gem::Requirement
205
+ requirements:
206
+ - - ~>
207
+ - !ruby/object:Gem::Version
208
+ version: 1.3.2
209
+ description: ! "\n An easy to use distributed web-crawler framework based on Redis\n
210
+ \ "
211
+ email:
212
+ - francesco.laurita@gmail.com
213
+ executables: []
214
+ extensions: []
215
+ extra_rdoc_files: []
216
+ files:
217
+ - .document
218
+ - .gitignore
219
+ - .rspec
220
+ - Gemfile
221
+ - LICENSE.txt
222
+ - README.md
223
+ - README.rdoc
224
+ - Rakefile
225
+ - examples/basic.rb
226
+ - examples/survival.rb
227
+ - lib/polipus.rb
228
+ - lib/polipus/http.rb
229
+ - lib/polipus/page.rb
230
+ - lib/polipus/plugin.rb
231
+ - lib/polipus/plugins/cleaner.rb
232
+ - lib/polipus/plugins/sample.rb
233
+ - lib/polipus/plugins/sleeper.rb
234
+ - lib/polipus/queue_overflow.rb
235
+ - lib/polipus/queue_overflow/base.rb
236
+ - lib/polipus/queue_overflow/dev_null_queue.rb
237
+ - lib/polipus/queue_overflow/manager.rb
238
+ - lib/polipus/queue_overflow/mongo_queue.rb
239
+ - lib/polipus/queue_overflow/mongo_queue_capped.rb
240
+ - lib/polipus/storage.rb
241
+ - lib/polipus/storage/base.rb
242
+ - lib/polipus/storage/dev_null.rb
243
+ - lib/polipus/storage/mongo_store.rb
244
+ - lib/polipus/storage/s3_store.rb
245
+ - lib/polipus/url_tracker.rb
246
+ - lib/polipus/url_tracker/bloomfilter.rb
247
+ - lib/polipus/url_tracker/redis_set.rb
248
+ - lib/polipus/version.rb
249
+ - polipus.gemspec
250
+ - spec/cassettes/08b228db424a926e1ed6ab63b38d847e.yml
251
+ - spec/cassettes/20aa41f181b49f00078c3ca30bad5afe.yml
252
+ - spec/cassettes/4640919145753505af2d0f8423de37f3.yml
253
+ - spec/cassettes/66aae15a03f4aab8efd15e40d2d7882a.yml
254
+ - spec/cassettes/76b7c197c95a5bf9b1e882c567192d72.yml
255
+ - spec/cassettes/9b1d523b7f5db7214f8a8bd9272cccba.yml
256
+ - spec/cassettes/ab333f89535a2efb284913fede6aa7c7.yml
257
+ - spec/cassettes/ae5d7cffde3f53122cdf79f3d1367e8e.yml
258
+ - spec/cassettes/ffe3d588b6df4b9de35e5a7ccaf5a81b.yml
259
+ - spec/cassettes/http_test.yml
260
+ - spec/cassettes/http_test_redirect.yml
261
+ - spec/clear.rb
262
+ - spec/http_spec.rb
263
+ - spec/page_spec.rb
264
+ - spec/queue_overflow_manager_spec.rb
265
+ - spec/queue_overflow_spec.rb
266
+ - spec/spec_helper.rb
267
+ - spec/storage_mongo_spec.rb
268
+ - spec/storage_s3_spec.rb
269
+ - spec/url_tracker_spec.rb
270
+ homepage: https://github.com/taganaka/polipus
271
+ licenses: []
272
+ metadata: {}
273
+ post_install_message:
274
+ rdoc_options: []
275
+ require_paths:
276
+ - lib
277
+ required_ruby_version: !ruby/object:Gem::Requirement
278
+ requirements:
279
+ - - ! '>='
280
+ - !ruby/object:Gem::Version
281
+ version: '0'
282
+ required_rubygems_version: !ruby/object:Gem::Requirement
283
+ requirements:
284
+ - - ! '>='
285
+ - !ruby/object:Gem::Version
286
+ version: '0'
287
+ requirements: []
288
+ rubyforge_project: polipus
289
+ rubygems_version: 2.0.7
290
+ signing_key:
291
+ specification_version: 4
292
+ summary: Polipus distributed web-crawler framework
293
+ test_files:
294
+ - spec/cassettes/08b228db424a926e1ed6ab63b38d847e.yml
295
+ - spec/cassettes/20aa41f181b49f00078c3ca30bad5afe.yml
296
+ - spec/cassettes/4640919145753505af2d0f8423de37f3.yml
297
+ - spec/cassettes/66aae15a03f4aab8efd15e40d2d7882a.yml
298
+ - spec/cassettes/76b7c197c95a5bf9b1e882c567192d72.yml
299
+ - spec/cassettes/9b1d523b7f5db7214f8a8bd9272cccba.yml
300
+ - spec/cassettes/ab333f89535a2efb284913fede6aa7c7.yml
301
+ - spec/cassettes/ae5d7cffde3f53122cdf79f3d1367e8e.yml
302
+ - spec/cassettes/ffe3d588b6df4b9de35e5a7ccaf5a81b.yml
303
+ - spec/cassettes/http_test.yml
304
+ - spec/cassettes/http_test_redirect.yml
305
+ - spec/clear.rb
306
+ - spec/http_spec.rb
307
+ - spec/page_spec.rb
308
+ - spec/queue_overflow_manager_spec.rb
309
+ - spec/queue_overflow_spec.rb
310
+ - spec/spec_helper.rb
311
+ - spec/storage_mongo_spec.rb
312
+ - spec/storage_s3_spec.rb
313
+ - spec/url_tracker_spec.rb