hash_spidey 0.0.1 → 0.0.2
@@ -1,112 +1,108 @@
 module HashSpidey
   module Strategies
-
     module HashStore

-
-
-
-
-        super(attrs)
-      end
+      def initialize(attrs = {})
+        @url_collection = {}
+        @error_collection = []

-
+        super(attrs)
+      end

+      #### process strategies

-      ## conveinence methods
-      def crawls
-        @url_collection.select{|k,v| v.crawled?}
-      end

+      ## conveinence methods
+      def crawls
+        @url_collection.select{|k,v| v.crawled?}
+      end

-      def uncrawled
-        @url_collection.reject{|k,v| v.crawled?}
-      end

-
-
-
+      def uncrawled
+        @url_collection.reject{|k,v| v.crawled?}
+      end

-
-
-
-      end
+      def records
+        @url_collection.select{|k,v| v.recorded?}
+      end

+      def process_crawl(url, page)
+        h_url = @url_collection[url]
+        h_url.mark_as_crawled(page)
+      end

-      def crawl(options = {})
-        @crawl_started_at = Time.now
-        @until = Time.now + options[:crawl_for] if options[:crawl_for]

-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+      def crawl(options = {})
+        @crawl_started_at = Time.now
+        @until = Time.now + options[:crawl_for] if options[:crawl_for]
+
+        i = 0
+        each_url do |url, handler, default_data|
+          break if options[:max_urls] && i >= options[:max_urls]
+          begin
+            page = agent.get(url)
+            Spidey.logger.info "Handling #{url.inspect}"
+            process_crawl(url, page)
+            send handler, page, default_data
+          rescue => ex
+            add_error url: url, handler: handler, error: ex
+          end
+          sleep request_interval if request_interval > 0
+          i += 1
+        end
+      end


-
-
+      def handle(url, handler, handle_data = {})
+        Spidey.logger.info "Queueing #{url.inspect[0..200]}..."

-
-
-
+        spider_name = self.class.name
+        @url_collection[url] ||= HashUrlRecord.spidey_handle( url, handler, spider_name, handle_data )
+      end

-
-
-
-
+      # expects @url_collection to have :url, but if not, creates new HashUrlRecord
+      def record(data_hashie)
+        url = data_hashie.url
+        h_url = @url_collection[url] || HashUrlRecord.new(url)

-
-
+        # set the content and record_timestamp of the HashUrlRecord
+        h_url.record_content(data_hashie.content)

-
-
-
+        # reassign, update collection
+        @url_collection[url] = h_url
+      end


-
-
-
-
-
+      # wrapper around #record
+      def record_page(page, default_data={})
+        msh = Hashie::Mash.new(default_data)
+        msh.url = page.uri.to_s
+        msh.content = page.content

-
-
+        record(msh)
+      end

-
-
-
-
-
+      def each_url(&block)
+        while h_url = get_next_url_hash
+          yield h_url.url, h_url.handler, h_url.handle_data
+        end
+      end

-
+      protected

-
-
-
-
+      def add_error(attrs)
+        @error_collection << attrs
+        Spidey.logger.error "Error on #{attrs[:url]}. #{attrs[:error].class}: #{attrs[:error].message}"
+      end


       private

       def get_next_url_hash
         return nil if (@until && Time.now >= @until) # exceeded time bound
-
         # uncrawled is a filtered collection
         uncrawled.values.first
       end
-
-
     end
   end
 end
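Taken together, the rewritten strategy is a small in-memory crawl loop: #handle queues a HashUrlRecord per URL, #crawl drains the queue through #each_url (bounded by the :max_urls and :crawl_for options), and #record / #record_page write fetched content back into @url_collection. A minimal usage sketch, assuming a Spidey-style spider class that mixes in this strategy (ExampleSpider, its seed URL, and its :process_page handler are illustrative, not part of the gem):

    require 'spidey'
    require 'hash_spidey'

    # Hypothetical spider for illustration; only the strategy mix-in
    # and the crawl options come from this diff.
    class ExampleSpider < Spidey::AbstractSpider
      include HashSpidey::Strategies::HashStore

      handle "http://www.example.com/", :process_page   # seed URL

      def process_page(page, default_data = {})
        record_page(page)   # wraps page.uri/page.content in a Hashie::Mash and calls #record
      end
    end

    spider = ExampleSpider.new(request_interval: 0)
    spider.crawl(max_urls: 10, crawl_for: 60)   # stop after 10 URLs or 60 seconds, whichever comes first

    spider.crawls      # crawled records, keyed by URL
    spider.uncrawled   # queued but not yet crawled
    spider.records     # records whose content was stored via #record

The time bound works because #get_next_url_hash returns nil once Time.now passes @until, which ends the while loop in #each_url.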
data/lib/hash_spidey/version.rb CHANGED
@@ -16,13 +16,13 @@ describe HashSpidey::Strategies::HashStore do
     end

   end
-
+
   context 'generic #handle' do

     before(:each) do
       FakeWeb.register_uri(:get, "http://www.example.com/", :body => "Hello World", code: 200,
         "content-type"=>"text/html; charset=UTF-8"
-
+      )
       @spider = TestSpider.new request_interval: 0
       @spider.handle "http://www.example.com/", :process_size
       @spider.crawl
@@ -36,21 +36,20 @@ describe HashSpidey::Strategies::HashStore do

     it 'should update #crawled_timestamp' do
       @crawled_url = @spider.crawls.values.first
-
-
+      expect( @crawled_url.url ).to eq 'http://www.example.com/'
+      expect( @crawled_url.crawled_timestamp > @crawled_url.initialized_timestamp).to be_true
     end

     it 'should have #crawls act as a Hash' do
       expect( @spider.crawls['http://www.example.com/'].url).to eq 'http://www.example.com/'
     end
-
+
     it "should not add duplicate URLs" do
-
-
+      @spider.handle "http://www.example.com/", :process_something_else # second time
+      expect( @spider.crawls.count ).to eq 1
     end
-
-    context '@crawl_record' do

+    context '@crawl_record' do
       before(:each) do
         @crawled_url = @spider.crawls["http://www.example.com/"]
       end
@@ -60,13 +59,10 @@ describe HashSpidey::Strategies::HashStore do
         end

         it 'should respond to header#content-type' do
-          expect(@crawled_url.
+          expect(@crawled_url.crawleheader['content-type']).to eq "text/html; charset=UTF-8"
         end
       end
     end
-
-
-
   end


@@ -91,8 +87,4 @@ describe HashSpidey::Strategies::HashStore do
       end
     end
   end
-
-
-
-
 end
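The specs reference a TestSpider and the handlers :process_size and :process_something_else, none of which appear in this diff. A plausible minimal helper, assuming the usual Spidey test pattern (this class is a reconstruction, not extracted from the gem's spec suite):

    # Hypothetical spec helper; the real TestSpider may differ.
    class TestSpider < Spidey::AbstractSpider
      include HashSpidey::Strategies::HashStore

      def process_size(page, default_data = {})
        record_page(page)   # stores content, so the record_timestamp specs can pass
      end

      def process_something_else(page, default_data = {})
        # no-op; the duplicate-URL spec only checks that a second #handle call
        # does not replace the first registration (@url_collection[url] ||= ...)
      end
    end

With FakeWeb stubbing http://www.example.com/, @spider.crawl drives agent.get against the fake response, marks the record as crawled, and dispatches to :process_size.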