hash_spidey 0.0.1 → 0.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,112 +1,108 @@
 module HashSpidey
   module Strategies
-
     module HashStore

-
-
-
-
-        super(attrs)
-      end
+      def initialize(attrs = {})
+        @url_collection = {}
+        @error_collection = []

-
+        super(attrs)
+      end

+      #### process strategies

-      ## conveinence methods
-      def crawls
-        @url_collection.select{|k,v| v.crawled?}
-      end

+      ## conveinence methods
+      def crawls
+        @url_collection.select{|k,v| v.crawled?}
+      end

-      def uncrawled
-        @url_collection.reject{|k,v| v.crawled?}
-      end

-
-
-
+      def uncrawled
+        @url_collection.reject{|k,v| v.crawled?}
+      end

-
-
-
-      end
+      def records
+        @url_collection.select{|k,v| v.recorded?}
+      end

+      def process_crawl(url, page)
+        h_url = @url_collection[url]
+        h_url.mark_as_crawled(page)
+      end

-      def crawl(options = {})
-        @crawl_started_at = Time.now
-        @until = Time.now + options[:crawl_for] if options[:crawl_for]

-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+      def crawl(options = {})
+        @crawl_started_at = Time.now
+        @until = Time.now + options[:crawl_for] if options[:crawl_for]
+
+        i = 0
+        each_url do |url, handler, default_data|
+          break if options[:max_urls] && i >= options[:max_urls]
+          begin
+            page = agent.get(url)
+            Spidey.logger.info "Handling #{url.inspect}"
+            process_crawl(url, page)
+            send handler, page, default_data
+          rescue => ex
+            add_error url: url, handler: handler, error: ex
+          end
+          sleep request_interval if request_interval > 0
+          i += 1
+        end
+      end


-
-
+      def handle(url, handler, handle_data = {})
+        Spidey.logger.info "Queueing #{url.inspect[0..200]}..."

-
-
-
+        spider_name = self.class.name
+        @url_collection[url] ||= HashUrlRecord.spidey_handle( url, handler, spider_name, handle_data )
+      end

-
-
-
-
+      # expects @url_collection to have :url, but if not, creates new HashUrlRecord
+      def record(data_hashie)
+        url = data_hashie.url
+        h_url = @url_collection[url] || HashUrlRecord.new(url)

-
-
+        # set the content and record_timestamp of the HashUrlRecord
+        h_url.record_content(data_hashie.content)

-
-
-
+        # reassign, update collection
+        @url_collection[url] = h_url
+      end


-
-
-
-
-
+      # wrapper around #record
+      def record_page(page, default_data={})
+        msh = Hashie::Mash.new(default_data)
+        msh.url = page.uri.to_s
+        msh.content = page.content

-
-
+        record(msh)
+      end

-
-
-
-
-
+      def each_url(&block)
+        while h_url = get_next_url_hash
+          yield h_url.url, h_url.handler, h_url.handle_data
+        end
+      end

-
+      protected

-
-
-
-
+      def add_error(attrs)
+        @error_collection << attrs
+        Spidey.logger.error "Error on #{attrs[:url]}. #{attrs[:error].class}: #{attrs[:error].message}"
+      end


       private

       def get_next_url_hash
         return nil if (@until && Time.now >= @until) # exceeded time bound
-
         # uncrawled is a filtered collection
         uncrawled.values.first
       end
-
-
     end
   end
 end
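For orientation, the rewritten HashStore strategy above is meant to be mixed into a Spidey spider, much as the package's specs do with a TestSpider. Below is a minimal, illustrative sketch (not part of the package) of driving the new instance-level #handle / #crawl API; the ExampleSpider class and :process_page handler are hypothetical names, and the sketch assumes the spidey and hash_spidey gems are installed.

require 'spidey'        # provides Spidey::AbstractSpider and Spidey.logger
require 'hash_spidey'   # provides HashSpidey::Strategies::HashStore

# Hypothetical spider for illustration; the package's specs use a similar TestSpider.
class ExampleSpider < Spidey::AbstractSpider
  include HashSpidey::Strategies::HashStore

  # Handlers are invoked by #crawl as `send handler, page, default_data`.
  def process_page(page, default_data = {})
    record_page(page, default_data)   # stores the page body via the new #record_page
  end
end

spider = ExampleSpider.new request_interval: 0
spider.handle "http://www.example.com/", :process_page
spider.crawl max_urls: 10

spider.crawls     # URLs whose HashUrlRecord has been marked as crawled
spider.uncrawled  # URLs still queued
spider.records    # URLs with recorded content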
data/lib/hash_spidey/version.rb CHANGED

@@ -16,13 +16,13 @@ describe HashSpidey::Strategies::HashStore do
     end

   end
-
+
   context 'generic #handle' do

     before(:each) do
       FakeWeb.register_uri(:get, "http://www.example.com/", :body => "Hello World", code: 200,
         "content-type"=>"text/html; charset=UTF-8"
-
+      )
       @spider = TestSpider.new request_interval: 0
       @spider.handle "http://www.example.com/", :process_size
       @spider.crawl
@@ -36,21 +36,20 @@ describe HashSpidey::Strategies::HashStore do

     it 'should update #crawled_timestamp' do
       @crawled_url = @spider.crawls.values.first
-
-
+      expect( @crawled_url.url ).to eq 'http://www.example.com/'
+      expect( @crawled_url.crawled_timestamp > @crawled_url.initialized_timestamp).to be_true
     end

     it 'should have #crawls act as a Hash' do
       expect( @spider.crawls['http://www.example.com/'].url).to eq 'http://www.example.com/'
     end
-
+
     it "should not add duplicate URLs" do
-
-
+      @spider.handle "http://www.example.com/", :process_something_else # second time
+      expect( @spider.crawls.count ).to eq 1
     end
-
-    context '@crawl_record' do

+    context '@crawl_record' do
       before(:each) do
         @crawled_url = @spider.crawls["http://www.example.com/"]
       end
@@ -60,13 +59,10 @@ describe HashSpidey::Strategies::HashStore do
       end

       it 'should respond to header#content-type' do
-        expect(@crawled_url.
+        expect(@crawled_url.crawleheader['content-type']).to eq "text/html; charset=UTF-8"
       end
     end
   end
-
-
-
 end


@@ -91,8 +87,4 @@ describe HashSpidey::Strategies::HashStore do
     end
   end
 end
-
-
-
-
 end
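Beyond what the specs above exercise, 0.0.2 also adds #record and #record_page for storing content keyed by URL. The sketch below is again illustrative, reusing the hypothetical ExampleSpider from the earlier sketch: it feeds #record a Hashie::Mash directly, since the payload only needs to respond to #url and #content; whether the URL then appears under #records depends on HashUrlRecord#recorded? returning true once content has been set.

require 'hashie'

spider = ExampleSpider.new request_interval: 0   # hypothetical spider from the sketch above

msh = Hashie::Mash.new
msh.url     = "http://www.example.com/some-page"          # illustrative URL
msh.content = "<html><body>Hello World</body></html>"     # illustrative content

spider.record(msh)   # creates or updates the HashUrlRecord stored under msh.url
spider.records       # expected to include the URL once its record has content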