polipus 0.1.1 → 0.2.0

Sign up to get free protection for your applications and to get access to all the features.
data/spec/http_spec.rb CHANGED
@@ -12,6 +12,7 @@ describe Polipus::HTTP do
12
12
  page = http.fetch_page("http://sfbay.craigslist.org/apa/")
13
13
  page.should be_an_instance_of(Polipus::Page)
14
14
  page.doc.search("title").text.strip.should eq "SF bay area apts/housing for rent classifieds - craigslist"
15
+ page.fetched_at.should_not be_nil
15
16
  end
16
17
  end
17
18
 
data/spec/page_spec.rb CHANGED
@@ -28,4 +28,25 @@ EOF
28
28
  it 'should honor domain_aliases attribute' do
29
29
  page.links.count.should be 4
30
30
  end
31
+
32
+ context 'page expiring' do
33
+ let(:page) do
34
+ Polipus::Page.new 'http://www.google.com/',
35
+ code: 200,
36
+ body: '',
37
+ headers: {'content-type' => ['text/html']},
38
+ domain_aliases: %w(www.google.com google.com),
39
+ fetched_at: (Time.now.to_i - 30)
40
+ end
41
+
42
+ it 'should be marked at expired' do
43
+ page.expired?(20).should be_true
44
+ end
45
+
46
+ it 'should NOT be marked at expired' do
47
+ page.expired?(60).should be_false
48
+ end
49
+ end
50
+
51
+
31
52
  end
data/spec/polipus_spec.rb ADDED
@@ -0,0 +1,77 @@
1
+ require "spec_helper"
2
+
3
+ describe Polipus::PolipusCrawler do
4
+ after(:each) {Redis.new(db:10).flushdb}
5
+ let(:p_options) {
6
+ {
7
+ workers: 1,
8
+ redis_options: {host: 'localhost', db:10},
9
+ depth_limit: 1,
10
+ queue_timeout: 1,
11
+ user_agent: 'polipus-rspec',
12
+ logger: logger,
13
+ logger_level: Logger::DEBUG,
14
+ storage: Polipus::Storage.memory_store
15
+ }
16
+ }
17
+ let(:polipus) {
18
+ Polipus::PolipusCrawler.new("polipus-rspec", ["http://rubygems.org/gems"], p_options)
19
+ }
20
+
21
+ let(:init_page){
22
+ init_page = Polipus::Page.new "http://rubygems.org/gems"
23
+ }
24
+
25
+ let(:logger){Logger.new(nil)}
26
+
27
+ context "polipus" do
28
+
29
+ it "should create a polipus instance" do
30
+ polipus.should be_an_instance_of Polipus::PolipusCrawler
31
+ end
32
+
33
+ it "should execute a crawling session" do
34
+ polipus.takeover
35
+ polipus.storage.exists?(init_page).should be_true
36
+ polipus.storage.get(init_page).links.count.should be polipus.storage.count
37
+ end
38
+
39
+ it "should filter unwanted urls" do
40
+ polipus.skip_links_like(/\/pages\//)
41
+ polipus.takeover
42
+ polipus.storage.get(init_page).links
43
+ .reject { |e| e.path.to_s =~ /\/pages\// }.count.should be polipus.storage.count
44
+ end
45
+
46
+ it "should follow only wanted urls" do
47
+ polipus.follow_links_like(/\/pages\//)
48
+ polipus.follow_links_like(/\/gems$/)
49
+ polipus.takeover
50
+ polipus.storage.get(init_page).links
51
+ .reject { |e| ![/\/pages\//, /\/gems$/].any?{|p| e.path =~ p} }
52
+ .count.should be polipus.storage.count
53
+ end
54
+
55
+ it "should refresh expired pages" do
56
+ polipus.ttl_page = 3600
57
+ polipus.takeover
58
+ polipus.storage.each {|id, page| page.fetched_at = page.fetched_at - 3600; polipus.storage.add(page)}
59
+ polipus.storage.each {|id, page| page.expired?(3600).should be_true}
60
+ polipus.takeover
61
+ polipus.storage.each {|id, page| page.expired?(3600).should be_false}
62
+ end
63
+
64
+ it "should re-download seeder urls no matter what" do
65
+ cache_hit = {}
66
+ polipus.follow_links_like(/\/gems$/)
67
+ polipus.on_page_downloaded do |page|
68
+ cache_hit[page.url.to_s] ||= 0
69
+ cache_hit[page.url.to_s] += 1
70
+ end
71
+ polipus.takeover
72
+ polipus.takeover
73
+ cache_hit["http://rubygems.org/gems"].should be 2
74
+ end
75
+
76
+ end
77
+ end
data/spec/storage_memory_spec.rb ADDED
@@ -0,0 +1,89 @@
1
+ require "spec_helper"
2
+ require "mongo"
3
+ require "polipus/storage/memory_store"
4
+ describe Polipus::Storage::MemoryStore do
5
+
6
+ let(:storage){Polipus::Storage.memory_store}
7
+
8
+ it 'should store a page' do
9
+ p = page_factory 'http://www.google.com', :code => 200, :body => '<html></html>'
10
+ uuid = storage.add p
11
+ uuid.should be == 'ed646a3334ca891fd3467db131372140'
12
+ storage.count.should be 1
13
+ p = storage.get p
14
+ p.url.to_s.should be == 'http://www.google.com'
15
+ p.body.should be == '<html></html>'
16
+ end
17
+
18
+ it 'should update a page' do
19
+ p = page_factory 'http://www.google.com', :code => 301, :body => '<html></html>'
20
+ storage.add p
21
+ p = storage.get p
22
+ p.code.should be == 301
23
+ end
24
+
25
+ it 'should iterate over stored pages' do
26
+ storage.each do |k, page|
27
+ k.should be == "ed646a3334ca891fd3467db131372140"
28
+ page.url.to_s.should be == 'http://www.google.com'
29
+ end
30
+ end
31
+
32
+ it 'should delete a page' do
33
+ p = page_factory 'http://www.google.com', :code => 301, :body => '<html></html>'
34
+ storage.remove p
35
+ storage.get(p).should be_nil
36
+ storage.count.should be 0
37
+ end
38
+
39
+ it 'should store a page removing a query string from the uuid generation' do
40
+ p = page_factory 'http://www.asd.com/?asd=lol', :code => 200, :body => '<html></html>'
41
+ p_no_query = page_factory 'http://www.asd.com/?asdas=dasda&adsda=1', :code => 200, :body => '<html></html>'
42
+ storage.include_query_string_in_uuid = false
43
+ storage.add p
44
+ storage.exists?(p_no_query).should be_true
45
+ storage.remove p
46
+ end
47
+
48
+ it 'should store a page removing a query string from the uuid generation no ending slash' do
49
+ p = page_factory 'http://www.asd.com?asd=lol', :code => 200, :body => '<html></html>'
50
+ p_no_query = page_factory 'http://www.asd.com', :code => 200, :body => '<html></html>'
51
+ storage.include_query_string_in_uuid = false
52
+ storage.add p
53
+ storage.exists?(p_no_query).should be_true
54
+ storage.remove p
55
+ end
56
+
57
+ it 'should store a page with user data associated' do
58
+ p = page_factory 'http://www.user.com', :code => 200, :body => '<html></html>'
59
+ p.user_data.name = 'Test User Data'
60
+ storage.add p
61
+ storage.exists?(p).should be_true
62
+ p = storage.get(p)
63
+ p.user_data.name.should be == 'Test User Data'
64
+ storage.remove p
65
+ end
66
+
67
+ it 'should honor the except parameters' do
68
+ storage = Polipus::Storage.mongo_store(@mongo, '_test_pages', ['body'])
69
+ p = page_factory 'http://www.user-doo.com', :code => 200, :body => '<html></html>'
70
+ storage.add p
71
+ p = storage.get p
72
+ p.body.should be_empty
73
+ storage.clear
74
+ end
75
+
76
+ it 'should return false if a doc not exists' do
77
+ storage.include_query_string_in_uuid = false
78
+ p_other = page_factory 'http://www.asdrrrr.com', :code => 200, :body => '<html></html>'
79
+ storage.exists?(p_other).should be_false
80
+ storage.add p_other
81
+ storage.exists?(p_other).should be_true
82
+ p_other = page_factory 'http://www.asdrrrr.com?trk=asd-lol', :code => 200, :body => '<html></html>'
83
+ storage.exists?(p_other).should be_true
84
+ storage.include_query_string_in_uuid = true
85
+ storage.exists?(p_other).should be_false
86
+
87
+ end
88
+
89
+ end
data/spec/storage_mongo_spec.rb CHANGED
@@ -99,4 +99,22 @@ describe Polipus::Storage::MongoStore do
99
99
 
100
100
  end
101
101
 
102
+ it 'should set page.fetched_at based on the id creation' do
103
+ storage = Polipus::Storage.mongo_store(@mongo, '_test_pages')
104
+ p = page_factory 'http://www.user-doojo.com', :code => 200, :body => '<html></html>'
105
+ storage.add p
106
+ p.fetched_at.should be_nil
107
+ p = storage.get p
108
+ p.fetched_at.should_not be_nil
109
+ end
110
+
111
+ it 'should NOT set page.fetched_at if already present' do
112
+ storage = Polipus::Storage.mongo_store(@mongo, '_test_pages')
113
+ p = page_factory 'http://www.user-doojooo.com', :code => 200, :body => '<html></html>'
114
+ p.fetched_at = 10
115
+ storage.add p
116
+ p = storage.get p
117
+ p.fetched_at.should be 10
118
+ end
119
+
102
120
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: polipus
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.1
4
+ version: 0.2.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Francesco Laurita
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2014-05-02 00:00:00.000000000 Z
11
+ date: 2014-05-08 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: redis-bloomfilter
@@ -287,6 +287,7 @@ files:
287
287
  - README.rdoc
288
288
  - Rakefile
289
289
  - examples/basic.rb
290
+ - examples/incremental.rb
290
291
  - examples/survival.rb
291
292
  - lib/polipus.rb
292
293
  - lib/polipus/http.rb
@@ -304,6 +305,7 @@ files:
304
305
  - lib/polipus/storage.rb
305
306
  - lib/polipus/storage/base.rb
306
307
  - lib/polipus/storage/dev_null.rb
308
+ - lib/polipus/storage/memory_store.rb
307
309
  - lib/polipus/storage/mongo_store.rb
308
310
  - lib/polipus/storage/s3_store.rb
309
311
  - lib/polipus/url_tracker.rb
@@ -312,13 +314,19 @@ files:
312
314
  - lib/polipus/version.rb
313
315
  - polipus.gemspec
314
316
  - spec/cassettes/08b228db424a926e1ed6ab63b38d847e.yml
317
+ - spec/cassettes/1f6e1d7743ecaa86594b4e68a6462689.yml
315
318
  - spec/cassettes/20aa41f181b49f00078c3ca30bad5afe.yml
316
319
  - spec/cassettes/4640919145753505af2d0f8423de37f3.yml
317
320
  - spec/cassettes/66aae15a03f4aab8efd15e40d2d7882a.yml
321
+ - spec/cassettes/6adfecdb274dd26ffd3713169583ca91.yml
318
322
  - spec/cassettes/76b7c197c95a5bf9b1e882c567192d72.yml
323
+ - spec/cassettes/978ac0eeb5df63a019b754cc8a965b06.yml
319
324
  - spec/cassettes/9b1d523b7f5db7214f8a8bd9272cccba.yml
320
325
  - spec/cassettes/ab333f89535a2efb284913fede6aa7c7.yml
321
326
  - spec/cassettes/ae5d7cffde3f53122cdf79f3d1367e8e.yml
327
+ - spec/cassettes/b389efd1dcb8f09393b5aae1627c2a83.yml
328
+ - spec/cassettes/c5ce68499027d490adfbb6e5541881e4.yml
329
+ - spec/cassettes/ce16b11a7df0b70fe90c7f90063fdb8c.yml
322
330
  - spec/cassettes/ffe3d588b6df4b9de35e5a7ccaf5a81b.yml
323
331
  - spec/cassettes/http_tconnection_max_hits.yml
324
332
  - spec/cassettes/http_test.yml
@@ -326,9 +334,11 @@ files:
326
334
  - spec/clear.rb
327
335
  - spec/http_spec.rb
328
336
  - spec/page_spec.rb
337
+ - spec/polipus_spec.rb
329
338
  - spec/queue_overflow_manager_spec.rb
330
339
  - spec/queue_overflow_spec.rb
331
340
  - spec/spec_helper.rb
341
+ - spec/storage_memory_spec.rb
332
342
  - spec/storage_mongo_spec.rb
333
343
  - spec/storage_s3_spec.rb
334
344
  - spec/url_tracker_spec.rb
@@ -358,13 +368,19 @@ specification_version: 4
358
368
  summary: Polipus distributed web-crawler framework
359
369
  test_files:
360
370
  - spec/cassettes/08b228db424a926e1ed6ab63b38d847e.yml
371
+ - spec/cassettes/1f6e1d7743ecaa86594b4e68a6462689.yml
361
372
  - spec/cassettes/20aa41f181b49f00078c3ca30bad5afe.yml
362
373
  - spec/cassettes/4640919145753505af2d0f8423de37f3.yml
363
374
  - spec/cassettes/66aae15a03f4aab8efd15e40d2d7882a.yml
375
+ - spec/cassettes/6adfecdb274dd26ffd3713169583ca91.yml
364
376
  - spec/cassettes/76b7c197c95a5bf9b1e882c567192d72.yml
377
+ - spec/cassettes/978ac0eeb5df63a019b754cc8a965b06.yml
365
378
  - spec/cassettes/9b1d523b7f5db7214f8a8bd9272cccba.yml
366
379
  - spec/cassettes/ab333f89535a2efb284913fede6aa7c7.yml
367
380
  - spec/cassettes/ae5d7cffde3f53122cdf79f3d1367e8e.yml
381
+ - spec/cassettes/b389efd1dcb8f09393b5aae1627c2a83.yml
382
+ - spec/cassettes/c5ce68499027d490adfbb6e5541881e4.yml
383
+ - spec/cassettes/ce16b11a7df0b70fe90c7f90063fdb8c.yml
368
384
  - spec/cassettes/ffe3d588b6df4b9de35e5a7ccaf5a81b.yml
369
385
  - spec/cassettes/http_tconnection_max_hits.yml
370
386
  - spec/cassettes/http_test.yml
@@ -372,9 +388,11 @@ test_files:
372
388
  - spec/clear.rb
373
389
  - spec/http_spec.rb
374
390
  - spec/page_spec.rb
391
+ - spec/polipus_spec.rb
375
392
  - spec/queue_overflow_manager_spec.rb
376
393
  - spec/queue_overflow_spec.rb
377
394
  - spec/spec_helper.rb
395
+ - spec/storage_memory_spec.rb
378
396
  - spec/storage_mongo_spec.rb
379
397
  - spec/storage_s3_spec.rb
380
398
  - spec/url_tracker_spec.rb