parallel588_polipus 0.4.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (72) hide show
  1. checksums.yaml +7 -0
  2. data/.document +5 -0
  3. data/.gitignore +53 -0
  4. data/.rspec +2 -0
  5. data/.rubocop.yml +17 -0
  6. data/.rubocop_todo.yml +33 -0
  7. data/.travis.yml +22 -0
  8. data/AUTHORS.md +5 -0
  9. data/CHANGELOG.md +61 -0
  10. data/Gemfile +12 -0
  11. data/LICENSE.txt +20 -0
  12. data/README.md +70 -0
  13. data/Rakefile +8 -0
  14. data/examples/basic.rb +63 -0
  15. data/examples/error_handling.rb +23 -0
  16. data/examples/incremental.rb +63 -0
  17. data/examples/robots_txt_handling.rb +14 -0
  18. data/examples/survival.rb +10 -0
  19. data/lib/polipus.rb +488 -0
  20. data/lib/polipus/http.rb +282 -0
  21. data/lib/polipus/page.rb +256 -0
  22. data/lib/polipus/plugin.rb +14 -0
  23. data/lib/polipus/plugins/cleaner.rb +25 -0
  24. data/lib/polipus/plugins/sample.rb +15 -0
  25. data/lib/polipus/plugins/sleeper.rb +22 -0
  26. data/lib/polipus/queue_overflow.rb +26 -0
  27. data/lib/polipus/queue_overflow/base.rb +7 -0
  28. data/lib/polipus/queue_overflow/dev_null_queue.rb +34 -0
  29. data/lib/polipus/queue_overflow/manager.rb +57 -0
  30. data/lib/polipus/queue_overflow/mongo_queue.rb +60 -0
  31. data/lib/polipus/queue_overflow/mongo_queue_capped.rb +29 -0
  32. data/lib/polipus/queue_overflow/worker.rb +24 -0
  33. data/lib/polipus/robotex.rb +145 -0
  34. data/lib/polipus/signal_handler.rb +42 -0
  35. data/lib/polipus/storage.rb +31 -0
  36. data/lib/polipus/storage/base.rb +20 -0
  37. data/lib/polipus/storage/dev_null.rb +35 -0
  38. data/lib/polipus/storage/memory_store.rb +56 -0
  39. data/lib/polipus/storage/mongo_store.rb +90 -0
  40. data/lib/polipus/storage/rethink_store.rb +90 -0
  41. data/lib/polipus/url_tracker.rb +21 -0
  42. data/lib/polipus/url_tracker/bloomfilter.rb +27 -0
  43. data/lib/polipus/url_tracker/redis_set.rb +27 -0
  44. data/lib/polipus/version.rb +5 -0
  45. data/polipus.gemspec +44 -0
  46. data/spec/cassettes/11c3eb8bf35dfc179dc5ce44f6f5f458.yml +6144 -0
  47. data/spec/cassettes/1f6e1d7743ecaa86594b4e68a6462689.yml +11320 -0
  48. data/spec/cassettes/6adfecdb274dd26ffd3713169583ca91.yml +18236 -0
  49. data/spec/cassettes/978ac0eeb5df63a019b754cc8a965b06.yml +18296 -0
  50. data/spec/cassettes/b389efd1dcb8f09393b5aae1627c2a83.yml +36569 -0
  51. data/spec/cassettes/bc6fb220895689be7eeb05b09969a18d.yml +61 -0
  52. data/spec/cassettes/c5ce68499027d490adfbb6e5541881e4.yml +18165 -0
  53. data/spec/cassettes/ce16b11a7df0b70fe90c7f90063fdb8c.yml +11758 -0
  54. data/spec/cassettes/gzipped_on.yml +147 -0
  55. data/spec/cassettes/http_cookies.yml +133 -0
  56. data/spec/cassettes/http_tconnection_max_hits.yml +4138 -0
  57. data/spec/cassettes/http_test.yml +1418 -0
  58. data/spec/cassettes/http_test_redirect.yml +71 -0
  59. data/spec/clear.rb +12 -0
  60. data/spec/polipus/http_spec.rb +139 -0
  61. data/spec/polipus/page_spec.rb +68 -0
  62. data/spec/polipus/queue_overflow/manager_spec.rb +88 -0
  63. data/spec/polipus/queue_overflow_spec.rb +66 -0
  64. data/spec/polipus/robotex_spec.rb +85 -0
  65. data/spec/polipus/signal_handler_spec.rb +15 -0
  66. data/spec/polipus/storage/memory_store_spec.rb +87 -0
  67. data/spec/polipus/storage/mongo_store_spec.rb +119 -0
  68. data/spec/polipus/storage/rethink_store_spec.rb +117 -0
  69. data/spec/polipus/url_tracker_spec.rb +29 -0
  70. data/spec/polipus_spec.rb +107 -0
  71. data/spec/spec_helper.rb +42 -0
  72. metadata +348 -0
@@ -0,0 +1,107 @@
1
+ # encoding: UTF-8
2
+ require 'spec_helper'
3
+
4
+ describe Polipus::PolipusCrawler do
5
+ after(:each) { Redis.new(db: 10).flushdb }
6
+ let(:p_options) do
7
+ {
8
+ workers: 1,
9
+ redis_options: { host: 'localhost', db: 10 },
10
+ depth_limit: 1,
11
+ queue_timeout: 1,
12
+ user_agent: 'polipus-rspec',
13
+ logger: logger,
14
+ logger_level: Logger::DEBUG,
15
+ storage: Polipus::Storage.memory_store
16
+ }
17
+ end
18
+ let(:polipus) do
19
+ Polipus::PolipusCrawler.new('polipus-rspec', ['http://rubygems.org/gems'], p_options)
20
+ end
21
+
22
+ let(:init_page)do
23
+ Polipus::Page.new 'http://rubygems.org/gems'
24
+ end
25
+
26
+ let(:logger) { Logger.new(nil) }
27
+
28
+ context 'polipus' do
29
+ it 'should create a polipus instance' do
30
+ expect(polipus).to be_an_instance_of Polipus::PolipusCrawler
31
+ end
32
+
33
+ it 'should execute a crawling session' do
34
+ polipus.takeover
35
+ expect(polipus.storage.exists?(init_page)).to be_truthy
36
+ expect(polipus.storage.get(init_page).links.count).to be polipus.storage.count
37
+ end
38
+
39
+ it 'should filter unwanted urls' do
40
+ polipus.skip_links_like(/\/pages\//)
41
+ polipus.takeover
42
+ expect(polipus.storage.get(init_page).links
43
+ .reject { |e| e.path.to_s =~ /\/pages\// }.count).to be polipus.storage.count
44
+ end
45
+
46
+ it 'should follow only wanted urls' do
47
+ polipus.follow_links_like(/\/pages\//)
48
+ polipus.follow_links_like(/\/gems$/)
49
+ polipus.takeover
50
+ expect(polipus.storage.get(init_page).links
51
+ .reject { |e| ![/\/pages\//, /\/gems$/].any? { |p| e.path =~ p } }
52
+ .count).to be polipus.storage.count
53
+ end
54
+
55
+ it 'should refresh expired pages' do
56
+ polipus.ttl_page = 3600
57
+ polipus.takeover
58
+ polipus.storage.each do |_id, page|
59
+ page.fetched_at = page.fetched_at - 3600
60
+ polipus.storage.add(page)
61
+ end
62
+ polipus.storage.each { |_id, page| expect(page.expired?(3600)).to be_truthy }
63
+ polipus.takeover
64
+ polipus.storage.each { |_id, page| expect(page.expired?(3600)).to be_falsey }
65
+ end
66
+
67
+ it 'should re-download seeder urls no matter what' do
68
+ cache_hit = {}
69
+ polipus.follow_links_like(/\/gems$/)
70
+ polipus.on_page_downloaded do |page|
71
+ cache_hit[page.url.to_s] ||= 0
72
+ cache_hit[page.url.to_s] += 1
73
+ end
74
+ polipus.takeover
75
+ polipus.takeover
76
+ expect(cache_hit['http://rubygems.org/gems']).to be 2
77
+ end
78
+
79
+ it 'should call on_page_error code blocks when a page has error' do
80
+ p = Polipus::PolipusCrawler.new('polipus-rspec', ['http://dasd.adad.dom/'], p_options.merge(open_timeout: 1, read_timeout: 1))
81
+ a_page = nil
82
+ p.on_page_error { |page| a_page = page }
83
+ p.takeover
84
+ expect(a_page).not_to be_nil
85
+ expect(a_page.error).not_to be_nil
86
+ end
87
+
88
+ it 'should obey to the robots.txt file' do
89
+ lopt = p_options
90
+ lopt[:obey_robots_txt] = true
91
+ polipus = Polipus::PolipusCrawler.new('polipus-rspec', ['https://rubygems.org/gems/polipus'], lopt)
92
+ polipus.depth_limit = 1
93
+ polipus.takeover
94
+ polipus.storage.each { |_id, page| expect(page.url.path =~ /$\/downloads\//).to be_falsey }
95
+ end
96
+
97
+ it 'should obey to the robots.txt file with list user_agent' do
98
+ user_agent = "Mozilla/5.0 (Windows; U; Windows NT 5.1; tr; rv:1.9.2.8) Gecko/20100722 Firefox/3.6.8 ( .NET CLR 3.5.30729; .NET4.0E)"
99
+ lopt = p_options
100
+ lopt[:obey_robots_txt] = true
101
+ lopt[:user_agent] = [user_agent]
102
+ flexmock(Polipus::Robotex).should_receive(:new).with(user_agent)
103
+ Polipus::PolipusCrawler.new('polipus-rspec', ['https://rubygems.org/gems/polipus'], lopt)
104
+
105
+ end
106
+ end
107
+ end
@@ -0,0 +1,42 @@
1
+ # Require this file using `require "spec_helper"`
2
+ # to ensure that it is only loaded once.
3
+ #
4
+ # See http://rubydoc.info/gems/rspec-core/RSpec/Core/Configuration
5
+ require 'digest/md5'
6
+ require 'coveralls'
7
+ require 'vcr'
8
+ require 'webmock/rspec'
9
+
10
+ Coveralls.wear!
11
+
12
+ VCR.configure do |c|
13
+ c.cassette_library_dir = "#{File.dirname(__FILE__)}/cassettes"
14
+ c.hook_into :webmock
15
+ end
16
+
17
+ require 'polipus'
18
+
19
+ RSpec.configure do |config|
20
+ config.run_all_when_everything_filtered = true
21
+ config.filter_run :focus
22
+
23
+ # Run specs in random order to surface order dependencies. If you find an
24
+ # order dependency and want to debug it, you can fix the order by providing
25
+ # the seed, which is printed after each run.
26
+ # --seed 1234
27
+ config.order = 'random'
28
+ config.mock_with :flexmock
29
+ config.around(:each) do |example|
30
+ t = Time.now
31
+ print example.metadata[:full_description]
32
+ VCR.use_cassette(Digest::MD5.hexdigest(example.metadata[:full_description])) do
33
+ example.run
34
+ puts " [#{Time.now - t}s]"
35
+ end
36
+ end
37
+ config.before(:each) { Polipus::SignalHandler.disable }
38
+ end
39
+
40
+ def page_factory(url, params = {})
41
+ Polipus::Page.new url, params
42
+ end
metadata ADDED
@@ -0,0 +1,348 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: parallel588_polipus
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.4.0
5
+ platform: ruby
6
+ authors:
7
+ - Francesco Laurita
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2015-03-03 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: nokogiri
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '1.6'
20
+ - - ">="
21
+ - !ruby/object:Gem::Version
22
+ version: 1.6.0
23
+ type: :runtime
24
+ prerelease: false
25
+ version_requirements: !ruby/object:Gem::Requirement
26
+ requirements:
27
+ - - "~>"
28
+ - !ruby/object:Gem::Version
29
+ version: '1.6'
30
+ - - ">="
31
+ - !ruby/object:Gem::Version
32
+ version: 1.6.0
33
+ - !ruby/object:Gem::Dependency
34
+ name: http-cookie
35
+ requirement: !ruby/object:Gem::Requirement
36
+ requirements:
37
+ - - "~>"
38
+ - !ruby/object:Gem::Version
39
+ version: '1.0'
40
+ - - ">="
41
+ - !ruby/object:Gem::Version
42
+ version: 1.0.1
43
+ type: :runtime
44
+ prerelease: false
45
+ version_requirements: !ruby/object:Gem::Requirement
46
+ requirements:
47
+ - - "~>"
48
+ - !ruby/object:Gem::Version
49
+ version: '1.0'
50
+ - - ">="
51
+ - !ruby/object:Gem::Version
52
+ version: 1.0.1
53
+ - !ruby/object:Gem::Dependency
54
+ name: redis
55
+ requirement: !ruby/object:Gem::Requirement
56
+ requirements:
57
+ - - "~>"
58
+ - !ruby/object:Gem::Version
59
+ version: '3.0'
60
+ - - ">="
61
+ - !ruby/object:Gem::Version
62
+ version: 3.0.4
63
+ type: :runtime
64
+ prerelease: false
65
+ version_requirements: !ruby/object:Gem::Requirement
66
+ requirements:
67
+ - - "~>"
68
+ - !ruby/object:Gem::Version
69
+ version: '3.0'
70
+ - - ">="
71
+ - !ruby/object:Gem::Version
72
+ version: 3.0.4
73
+ - !ruby/object:Gem::Dependency
74
+ name: hiredis
75
+ requirement: !ruby/object:Gem::Requirement
76
+ requirements:
77
+ - - "~>"
78
+ - !ruby/object:Gem::Version
79
+ version: '0.5'
80
+ - - ">="
81
+ - !ruby/object:Gem::Version
82
+ version: 0.4.5
83
+ type: :runtime
84
+ prerelease: false
85
+ version_requirements: !ruby/object:Gem::Requirement
86
+ requirements:
87
+ - - "~>"
88
+ - !ruby/object:Gem::Version
89
+ version: '0.5'
90
+ - - ">="
91
+ - !ruby/object:Gem::Version
92
+ version: 0.4.5
93
+ - !ruby/object:Gem::Dependency
94
+ name: redis-queue
95
+ requirement: !ruby/object:Gem::Requirement
96
+ requirements:
97
+ - - "~>"
98
+ - !ruby/object:Gem::Version
99
+ version: '0.0'
100
+ - - ">="
101
+ - !ruby/object:Gem::Version
102
+ version: 0.0.4
103
+ type: :runtime
104
+ prerelease: false
105
+ version_requirements: !ruby/object:Gem::Requirement
106
+ requirements:
107
+ - - "~>"
108
+ - !ruby/object:Gem::Version
109
+ version: '0.0'
110
+ - - ">="
111
+ - !ruby/object:Gem::Version
112
+ version: 0.0.4
113
+ - !ruby/object:Gem::Dependency
114
+ name: redis-bloomfilter
115
+ requirement: !ruby/object:Gem::Requirement
116
+ requirements:
117
+ - - "~>"
118
+ - !ruby/object:Gem::Version
119
+ version: '0.0'
120
+ - - ">="
121
+ - !ruby/object:Gem::Version
122
+ version: 0.0.3
123
+ type: :runtime
124
+ prerelease: false
125
+ version_requirements: !ruby/object:Gem::Requirement
126
+ requirements:
127
+ - - "~>"
128
+ - !ruby/object:Gem::Version
129
+ version: '0.0'
130
+ - - ">="
131
+ - !ruby/object:Gem::Version
132
+ version: 0.0.3
133
+ - !ruby/object:Gem::Dependency
134
+ name: mongo
135
+ requirement: !ruby/object:Gem::Requirement
136
+ requirements:
137
+ - - "~>"
138
+ - !ruby/object:Gem::Version
139
+ version: 1.11.0
140
+ type: :development
141
+ prerelease: false
142
+ version_requirements: !ruby/object:Gem::Requirement
143
+ requirements:
144
+ - - "~>"
145
+ - !ruby/object:Gem::Version
146
+ version: 1.11.0
147
+ - !ruby/object:Gem::Dependency
148
+ name: rethinkdb
149
+ requirement: !ruby/object:Gem::Requirement
150
+ requirements:
151
+ - - "~>"
152
+ - !ruby/object:Gem::Version
153
+ version: 1.15.0
154
+ type: :development
155
+ prerelease: false
156
+ version_requirements: !ruby/object:Gem::Requirement
157
+ requirements:
158
+ - - "~>"
159
+ - !ruby/object:Gem::Version
160
+ version: 1.15.0
161
+ - !ruby/object:Gem::Dependency
162
+ name: rake
163
+ requirement: !ruby/object:Gem::Requirement
164
+ requirements:
165
+ - - "~>"
166
+ - !ruby/object:Gem::Version
167
+ version: '10.3'
168
+ type: :development
169
+ prerelease: false
170
+ version_requirements: !ruby/object:Gem::Requirement
171
+ requirements:
172
+ - - "~>"
173
+ - !ruby/object:Gem::Version
174
+ version: '10.3'
175
+ - !ruby/object:Gem::Dependency
176
+ name: rspec
177
+ requirement: !ruby/object:Gem::Requirement
178
+ requirements:
179
+ - - "~>"
180
+ - !ruby/object:Gem::Version
181
+ version: 3.1.0
182
+ type: :development
183
+ prerelease: false
184
+ version_requirements: !ruby/object:Gem::Requirement
185
+ requirements:
186
+ - - "~>"
187
+ - !ruby/object:Gem::Version
188
+ version: 3.1.0
189
+ - !ruby/object:Gem::Dependency
190
+ name: flexmock
191
+ requirement: !ruby/object:Gem::Requirement
192
+ requirements:
193
+ - - "~>"
194
+ - !ruby/object:Gem::Version
195
+ version: '1.3'
196
+ type: :development
197
+ prerelease: false
198
+ version_requirements: !ruby/object:Gem::Requirement
199
+ requirements:
200
+ - - "~>"
201
+ - !ruby/object:Gem::Version
202
+ version: '1.3'
203
+ - !ruby/object:Gem::Dependency
204
+ name: vcr
205
+ requirement: !ruby/object:Gem::Requirement
206
+ requirements:
207
+ - - "~>"
208
+ - !ruby/object:Gem::Version
209
+ version: 2.9.0
210
+ type: :development
211
+ prerelease: false
212
+ version_requirements: !ruby/object:Gem::Requirement
213
+ requirements:
214
+ - - "~>"
215
+ - !ruby/object:Gem::Version
216
+ version: 2.9.0
217
+ - !ruby/object:Gem::Dependency
218
+ name: webmock
219
+ requirement: !ruby/object:Gem::Requirement
220
+ requirements:
221
+ - - "~>"
222
+ - !ruby/object:Gem::Version
223
+ version: 1.20.0
224
+ type: :development
225
+ prerelease: false
226
+ version_requirements: !ruby/object:Gem::Requirement
227
+ requirements:
228
+ - - "~>"
229
+ - !ruby/object:Gem::Version
230
+ version: 1.20.0
231
+ - !ruby/object:Gem::Dependency
232
+ name: coveralls
233
+ requirement: !ruby/object:Gem::Requirement
234
+ requirements:
235
+ - - ">="
236
+ - !ruby/object:Gem::Version
237
+ version: '0'
238
+ type: :development
239
+ prerelease: false
240
+ version_requirements: !ruby/object:Gem::Requirement
241
+ requirements:
242
+ - - ">="
243
+ - !ruby/object:Gem::Version
244
+ version: '0'
245
+ description: "\n An easy to use distributed web-crawler framework based on Redis\n
246
+ \ "
247
+ email:
248
+ - francesco.laurita@gmail.com
249
+ executables: []
250
+ extensions: []
251
+ extra_rdoc_files: []
252
+ files:
253
+ - ".document"
254
+ - ".gitignore"
255
+ - ".rspec"
256
+ - ".rubocop.yml"
257
+ - ".rubocop_todo.yml"
258
+ - ".travis.yml"
259
+ - AUTHORS.md
260
+ - CHANGELOG.md
261
+ - Gemfile
262
+ - LICENSE.txt
263
+ - README.md
264
+ - Rakefile
265
+ - examples/basic.rb
266
+ - examples/error_handling.rb
267
+ - examples/incremental.rb
268
+ - examples/robots_txt_handling.rb
269
+ - examples/survival.rb
270
+ - lib/polipus.rb
271
+ - lib/polipus/http.rb
272
+ - lib/polipus/page.rb
273
+ - lib/polipus/plugin.rb
274
+ - lib/polipus/plugins/cleaner.rb
275
+ - lib/polipus/plugins/sample.rb
276
+ - lib/polipus/plugins/sleeper.rb
277
+ - lib/polipus/queue_overflow.rb
278
+ - lib/polipus/queue_overflow/base.rb
279
+ - lib/polipus/queue_overflow/dev_null_queue.rb
280
+ - lib/polipus/queue_overflow/manager.rb
281
+ - lib/polipus/queue_overflow/mongo_queue.rb
282
+ - lib/polipus/queue_overflow/mongo_queue_capped.rb
283
+ - lib/polipus/queue_overflow/worker.rb
284
+ - lib/polipus/robotex.rb
285
+ - lib/polipus/signal_handler.rb
286
+ - lib/polipus/storage.rb
287
+ - lib/polipus/storage/base.rb
288
+ - lib/polipus/storage/dev_null.rb
289
+ - lib/polipus/storage/memory_store.rb
290
+ - lib/polipus/storage/mongo_store.rb
291
+ - lib/polipus/storage/rethink_store.rb
292
+ - lib/polipus/url_tracker.rb
293
+ - lib/polipus/url_tracker/bloomfilter.rb
294
+ - lib/polipus/url_tracker/redis_set.rb
295
+ - lib/polipus/version.rb
296
+ - polipus.gemspec
297
+ - spec/cassettes/11c3eb8bf35dfc179dc5ce44f6f5f458.yml
298
+ - spec/cassettes/1f6e1d7743ecaa86594b4e68a6462689.yml
299
+ - spec/cassettes/6adfecdb274dd26ffd3713169583ca91.yml
300
+ - spec/cassettes/978ac0eeb5df63a019b754cc8a965b06.yml
301
+ - spec/cassettes/b389efd1dcb8f09393b5aae1627c2a83.yml
302
+ - spec/cassettes/bc6fb220895689be7eeb05b09969a18d.yml
303
+ - spec/cassettes/c5ce68499027d490adfbb6e5541881e4.yml
304
+ - spec/cassettes/ce16b11a7df0b70fe90c7f90063fdb8c.yml
305
+ - spec/cassettes/gzipped_on.yml
306
+ - spec/cassettes/http_cookies.yml
307
+ - spec/cassettes/http_tconnection_max_hits.yml
308
+ - spec/cassettes/http_test.yml
309
+ - spec/cassettes/http_test_redirect.yml
310
+ - spec/clear.rb
311
+ - spec/polipus/http_spec.rb
312
+ - spec/polipus/page_spec.rb
313
+ - spec/polipus/queue_overflow/manager_spec.rb
314
+ - spec/polipus/queue_overflow_spec.rb
315
+ - spec/polipus/robotex_spec.rb
316
+ - spec/polipus/signal_handler_spec.rb
317
+ - spec/polipus/storage/memory_store_spec.rb
318
+ - spec/polipus/storage/mongo_store_spec.rb
319
+ - spec/polipus/storage/rethink_store_spec.rb
320
+ - spec/polipus/url_tracker_spec.rb
321
+ - spec/polipus_spec.rb
322
+ - spec/spec_helper.rb
323
+ homepage: https://github.com/taganaka/polipus
324
+ licenses:
325
+ - MIT
326
+ metadata: {}
327
+ post_install_message:
328
+ rdoc_options: []
329
+ require_paths:
330
+ - lib
331
+ required_ruby_version: !ruby/object:Gem::Requirement
332
+ requirements:
333
+ - - ">="
334
+ - !ruby/object:Gem::Version
335
+ version: '0'
336
+ required_rubygems_version: !ruby/object:Gem::Requirement
337
+ requirements:
338
+ - - ">="
339
+ - !ruby/object:Gem::Version
340
+ version: '0'
341
+ requirements: []
342
+ rubyforge_project: polipus
343
+ rubygems_version: 2.4.5
344
+ signing_key:
345
+ specification_version: 4
346
+ summary: Polipus distributed web-crawler framework
347
+ test_files: []
348
+ has_rdoc: