polipus 0.1.1 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/spec/http_spec.rb CHANGED
@@ -12,6 +12,7 @@ describe Polipus::HTTP do
12
12
  page = http.fetch_page("http://sfbay.craigslist.org/apa/")
13
13
  page.should be_an_instance_of(Polipus::Page)
14
14
  page.doc.search("title").text.strip.should eq "SF bay area apts/housing for rent classifieds - craigslist"
15
+ page.fetched_at.should_not be_nil
15
16
  end
16
17
  end
17
18
 
data/spec/page_spec.rb CHANGED
@@ -28,4 +28,25 @@ EOF
28
28
  it 'should honor domain_aliases attribute' do
29
29
  page.links.count.should be 4
30
30
  end
31
+
32
+ context 'page expiring' do
33
+ let(:page) do
34
+ Polipus::Page.new 'http://www.google.com/',
35
+ code: 200,
36
+ body: '',
37
+ headers: {'content-type' => ['text/html']},
38
+ domain_aliases: %w(www.google.com google.com),
39
+ fetched_at: (Time.now.to_i - 30)
40
+ end
41
+
42
+ it 'should be marked at expired' do
43
+ page.expired?(20).should be_true
44
+ end
45
+
46
+ it 'should NOT be marked at expired' do
47
+ page.expired?(60).should be_false
48
+ end
49
+ end
50
+
51
+
31
52
  end
data/spec/polipus_spec.rb ADDED
@@ -0,0 +1,77 @@
1
+ require "spec_helper"
2
+
3
+ describe Polipus::PolipusCrawler do
4
+ after(:each) {Redis.new(db:10).flushdb}
5
+ let(:p_options) {
6
+ {
7
+ workers: 1,
8
+ redis_options: {host: 'localhost', db:10},
9
+ depth_limit: 1,
10
+ queue_timeout: 1,
11
+ user_agent: 'polipus-rspec',
12
+ logger: logger,
13
+ logger_level: Logger::DEBUG,
14
+ storage: Polipus::Storage.memory_store
15
+ }
16
+ }
17
+ let(:polipus) {
18
+ Polipus::PolipusCrawler.new("polipus-rspec", ["http://rubygems.org/gems"], p_options)
19
+ }
20
+
21
+ let(:init_page){
22
+ init_page = Polipus::Page.new "http://rubygems.org/gems"
23
+ }
24
+
25
+ let(:logger){Logger.new(nil)}
26
+
27
+ context "polipus" do
28
+
29
+ it "should create a polipus instance" do
30
+ polipus.should be_an_instance_of Polipus::PolipusCrawler
31
+ end
32
+
33
+ it "should execute a crawling session" do
34
+ polipus.takeover
35
+ polipus.storage.exists?(init_page).should be_true
36
+ polipus.storage.get(init_page).links.count.should be polipus.storage.count
37
+ end
38
+
39
+ it "should filter unwanted urls" do
40
+ polipus.skip_links_like(/\/pages\//)
41
+ polipus.takeover
42
+ polipus.storage.get(init_page).links
43
+ .reject { |e| e.path.to_s =~ /\/pages\// }.count.should be polipus.storage.count
44
+ end
45
+
46
+ it "should follow only wanted urls" do
47
+ polipus.follow_links_like(/\/pages\//)
48
+ polipus.follow_links_like(/\/gems$/)
49
+ polipus.takeover
50
+ polipus.storage.get(init_page).links
51
+ .reject { |e| ![/\/pages\//, /\/gems$/].any?{|p| e.path =~ p} }
52
+ .count.should be polipus.storage.count
53
+ end
54
+
55
+ it "should refresh expired pages" do
56
+ polipus.ttl_page = 3600
57
+ polipus.takeover
58
+ polipus.storage.each {|id, page| page.fetched_at = page.fetched_at - 3600; polipus.storage.add(page)}
59
+ polipus.storage.each {|id, page| page.expired?(3600).should be_true}
60
+ polipus.takeover
61
+ polipus.storage.each {|id, page| page.expired?(3600).should be_false}
62
+ end
63
+
64
+ it "should re-download seeder urls no matter what" do
65
+ cache_hit = {}
66
+ polipus.follow_links_like(/\/gems$/)
67
+ polipus.on_page_downloaded do |page|
68
+ cache_hit[page.url.to_s] ||= 0
69
+ cache_hit[page.url.to_s] += 1
70
+ end
71
+ polipus.takeover
72
+ polipus.takeover
73
+ cache_hit["http://rubygems.org/gems"].should be 2
74
+ end
75
+
76
+ end
77
+ end
data/spec/storage_memory_spec.rb ADDED
@@ -0,0 +1,89 @@
1
+ require "spec_helper"
2
+ require "mongo"
3
+ require "polipus/storage/memory_store"
4
+ describe Polipus::Storage::MemoryStore do
5
+
6
+ let(:storage){Polipus::Storage.memory_store}
7
+
8
+ it 'should store a page' do
9
+ p = page_factory 'http://www.google.com', :code => 200, :body => '<html></html>'
10
+ uuid = storage.add p
11
+ uuid.should be == 'ed646a3334ca891fd3467db131372140'
12
+ storage.count.should be 1
13
+ p = storage.get p
14
+ p.url.to_s.should be == 'http://www.google.com'
15
+ p.body.should be == '<html></html>'
16
+ end
17
+
18
+ it 'should update a page' do
19
+ p = page_factory 'http://www.google.com', :code => 301, :body => '<html></html>'
20
+ storage.add p
21
+ p = storage.get p
22
+ p.code.should be == 301
23
+ end
24
+
25
+ it 'should iterate over stored pages' do
26
+ storage.each do |k, page|
27
+ k.should be == "ed646a3334ca891fd3467db131372140"
28
+ page.url.to_s.should be == 'http://www.google.com'
29
+ end
30
+ end
31
+
32
+ it 'should delete a page' do
33
+ p = page_factory 'http://www.google.com', :code => 301, :body => '<html></html>'
34
+ storage.remove p
35
+ storage.get(p).should be_nil
36
+ storage.count.should be 0
37
+ end
38
+
39
+ it 'should store a page removing a query string from the uuid generation' do
40
+ p = page_factory 'http://www.asd.com/?asd=lol', :code => 200, :body => '<html></html>'
41
+ p_no_query = page_factory 'http://www.asd.com/?asdas=dasda&adsda=1', :code => 200, :body => '<html></html>'
42
+ storage.include_query_string_in_uuid = false
43
+ storage.add p
44
+ storage.exists?(p_no_query).should be_true
45
+ storage.remove p
46
+ end
47
+
48
+ it 'should store a page removing a query string from the uuid generation no ending slash' do
49
+ p = page_factory 'http://www.asd.com?asd=lol', :code => 200, :body => '<html></html>'
50
+ p_no_query = page_factory 'http://www.asd.com', :code => 200, :body => '<html></html>'
51
+ storage.include_query_string_in_uuid = false
52
+ storage.add p
53
+ storage.exists?(p_no_query).should be_true
54
+ storage.remove p
55
+ end
56
+
57
+ it 'should store a page with user data associated' do
58
+ p = page_factory 'http://www.user.com', :code => 200, :body => '<html></html>'
59
+ p.user_data.name = 'Test User Data'
60
+ storage.add p
61
+ storage.exists?(p).should be_true
62
+ p = storage.get(p)
63
+ p.user_data.name.should be == 'Test User Data'
64
+ storage.remove p
65
+ end
66
+
67
+ it 'should honor the except parameters' do
68
+ storage = Polipus::Storage.mongo_store(@mongo, '_test_pages', ['body'])
69
+ p = page_factory 'http://www.user-doo.com', :code => 200, :body => '<html></html>'
70
+ storage.add p
71
+ p = storage.get p
72
+ p.body.should be_empty
73
+ storage.clear
74
+ end
75
+
76
+ it 'should return false if a doc not exists' do
77
+ storage.include_query_string_in_uuid = false
78
+ p_other = page_factory 'http://www.asdrrrr.com', :code => 200, :body => '<html></html>'
79
+ storage.exists?(p_other).should be_false
80
+ storage.add p_other
81
+ storage.exists?(p_other).should be_true
82
+ p_other = page_factory 'http://www.asdrrrr.com?trk=asd-lol', :code => 200, :body => '<html></html>'
83
+ storage.exists?(p_other).should be_true
84
+ storage.include_query_string_in_uuid = true
85
+ storage.exists?(p_other).should be_false
86
+
87
+ end
88
+
89
+ end
data/spec/storage_mongo_spec.rb CHANGED
@@ -99,4 +99,22 @@ describe Polipus::Storage::MongoStore do
99
99
 
100
100
  end
101
101
 
102
+ it 'should set page.fetched_at based on the id creation' do
103
+ storage = Polipus::Storage.mongo_store(@mongo, '_test_pages')
104
+ p = page_factory 'http://www.user-doojo.com', :code => 200, :body => '<html></html>'
105
+ storage.add p
106
+ p.fetched_at.should be_nil
107
+ p = storage.get p
108
+ p.fetched_at.should_not be_nil
109
+ end
110
+
111
+ it 'should NOT set page.fetched_at if already present' do
112
+ storage = Polipus::Storage.mongo_store(@mongo, '_test_pages')
113
+ p = page_factory 'http://www.user-doojooo.com', :code => 200, :body => '<html></html>'
114
+ p.fetched_at = 10
115
+ storage.add p
116
+ p = storage.get p
117
+ p.fetched_at.should be 10
118
+ end
119
+
102
120
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: polipus
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.1
4
+ version: 0.2.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Francesco Laurita
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2014-05-02 00:00:00.000000000 Z
11
+ date: 2014-05-08 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: redis-bloomfilter
@@ -287,6 +287,7 @@ files:
287
287
  - README.rdoc
288
288
  - Rakefile
289
289
  - examples/basic.rb
290
+ - examples/incremental.rb
290
291
  - examples/survival.rb
291
292
  - lib/polipus.rb
292
293
  - lib/polipus/http.rb
@@ -304,6 +305,7 @@ files:
304
305
  - lib/polipus/storage.rb
305
306
  - lib/polipus/storage/base.rb
306
307
  - lib/polipus/storage/dev_null.rb
308
+ - lib/polipus/storage/memory_store.rb
307
309
  - lib/polipus/storage/mongo_store.rb
308
310
  - lib/polipus/storage/s3_store.rb
309
311
  - lib/polipus/url_tracker.rb
@@ -312,13 +314,19 @@ files:
312
314
  - lib/polipus/version.rb
313
315
  - polipus.gemspec
314
316
  - spec/cassettes/08b228db424a926e1ed6ab63b38d847e.yml
317
+ - spec/cassettes/1f6e1d7743ecaa86594b4e68a6462689.yml
315
318
  - spec/cassettes/20aa41f181b49f00078c3ca30bad5afe.yml
316
319
  - spec/cassettes/4640919145753505af2d0f8423de37f3.yml
317
320
  - spec/cassettes/66aae15a03f4aab8efd15e40d2d7882a.yml
321
+ - spec/cassettes/6adfecdb274dd26ffd3713169583ca91.yml
318
322
  - spec/cassettes/76b7c197c95a5bf9b1e882c567192d72.yml
323
+ - spec/cassettes/978ac0eeb5df63a019b754cc8a965b06.yml
319
324
  - spec/cassettes/9b1d523b7f5db7214f8a8bd9272cccba.yml
320
325
  - spec/cassettes/ab333f89535a2efb284913fede6aa7c7.yml
321
326
  - spec/cassettes/ae5d7cffde3f53122cdf79f3d1367e8e.yml
327
+ - spec/cassettes/b389efd1dcb8f09393b5aae1627c2a83.yml
328
+ - spec/cassettes/c5ce68499027d490adfbb6e5541881e4.yml
329
+ - spec/cassettes/ce16b11a7df0b70fe90c7f90063fdb8c.yml
322
330
  - spec/cassettes/ffe3d588b6df4b9de35e5a7ccaf5a81b.yml
323
331
  - spec/cassettes/http_tconnection_max_hits.yml
324
332
  - spec/cassettes/http_test.yml
@@ -326,9 +334,11 @@ files:
326
334
  - spec/clear.rb
327
335
  - spec/http_spec.rb
328
336
  - spec/page_spec.rb
337
+ - spec/polipus_spec.rb
329
338
  - spec/queue_overflow_manager_spec.rb
330
339
  - spec/queue_overflow_spec.rb
331
340
  - spec/spec_helper.rb
341
+ - spec/storage_memory_spec.rb
332
342
  - spec/storage_mongo_spec.rb
333
343
  - spec/storage_s3_spec.rb
334
344
  - spec/url_tracker_spec.rb
@@ -358,13 +368,19 @@ specification_version: 4
358
368
  summary: Polipus distributed web-crawler framework
359
369
  test_files:
360
370
  - spec/cassettes/08b228db424a926e1ed6ab63b38d847e.yml
371
+ - spec/cassettes/1f6e1d7743ecaa86594b4e68a6462689.yml
361
372
  - spec/cassettes/20aa41f181b49f00078c3ca30bad5afe.yml
362
373
  - spec/cassettes/4640919145753505af2d0f8423de37f3.yml
363
374
  - spec/cassettes/66aae15a03f4aab8efd15e40d2d7882a.yml
375
+ - spec/cassettes/6adfecdb274dd26ffd3713169583ca91.yml
364
376
  - spec/cassettes/76b7c197c95a5bf9b1e882c567192d72.yml
377
+ - spec/cassettes/978ac0eeb5df63a019b754cc8a965b06.yml
365
378
  - spec/cassettes/9b1d523b7f5db7214f8a8bd9272cccba.yml
366
379
  - spec/cassettes/ab333f89535a2efb284913fede6aa7c7.yml
367
380
  - spec/cassettes/ae5d7cffde3f53122cdf79f3d1367e8e.yml
381
+ - spec/cassettes/b389efd1dcb8f09393b5aae1627c2a83.yml
382
+ - spec/cassettes/c5ce68499027d490adfbb6e5541881e4.yml
383
+ - spec/cassettes/ce16b11a7df0b70fe90c7f90063fdb8c.yml
368
384
  - spec/cassettes/ffe3d588b6df4b9de35e5a7ccaf5a81b.yml
369
385
  - spec/cassettes/http_tconnection_max_hits.yml
370
386
  - spec/cassettes/http_test.yml
@@ -372,9 +388,11 @@ test_files:
372
388
  - spec/clear.rb
373
389
  - spec/http_spec.rb
374
390
  - spec/page_spec.rb
391
+ - spec/polipus_spec.rb
375
392
  - spec/queue_overflow_manager_spec.rb
376
393
  - spec/queue_overflow_spec.rb
377
394
  - spec/spec_helper.rb
395
+ - spec/storage_memory_spec.rb
378
396
  - spec/storage_mongo_spec.rb
379
397
  - spec/storage_s3_spec.rb
380
398
  - spec/url_tracker_spec.rb