apollo-crawler 0.1.28 → 0.1.30
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/apollo_crawler/agent/fetcher_agent.rb +4 -4
- data/lib/apollo_crawler/crawler/base_crawler.rb +4 -2
- data/lib/apollo_crawler/fetcher/base_fetcher.rb +32 -3
- data/lib/apollo_crawler/planner/smart_planner.rb +1 -1
- data/lib/apollo_crawler/scheduler/base_scheduler.rb +10 -0
- data/lib/apollo_crawler/version.rb +1 -1
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 4f7597400d1ab12d2ab2d1d52330ca29e4f2192f
|
4
|
+
data.tar.gz: 297216744ee4ea8a9af6595b1361dd3bfcf8d0b1
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 71bf482ad465efa67976547c5e0a99d8a728478c5c346f987e675da8574b45e4dba44375fd501e8699c083e42eaf7e19ac1db6d2d9962f9736d408c52f896844
|
7
|
+
data.tar.gz: 01651a1aa4d3408ce63142a46c6e3727ceee35995520cfcc2df4b6451495b09cf5cebe1cc1244dca6ebef2138bb06b6ede97ac1720b75b8de866b3f381c2e25f
|
@@ -96,10 +96,10 @@ module Apollo
|
|
96
96
|
url = queued_url["url"]
|
97
97
|
|
98
98
|
res = Apollo::Model::RawDocument.new
|
99
|
-
res.headers = doc
|
100
|
-
res.body = doc
|
101
|
-
res.sha_hash = Digest::SHA1.hexdigest(doc
|
102
|
-
res.status = doc
|
99
|
+
res.headers = doc[:headers]
|
100
|
+
res.body = doc[:body]
|
101
|
+
res.sha_hash = Digest::SHA1.hexdigest(doc[:body])
|
102
|
+
res.status = doc[:status]
|
103
103
|
res.url = url
|
104
104
|
|
105
105
|
return res
|
@@ -170,10 +170,12 @@ module Apollo
|
|
170
170
|
end
|
171
171
|
|
172
172
|
def self.create_metadoc(url, doc)
|
173
|
+
body = doc[:body].encode('UTF-8', {:invalid => :replace, :undef => :replace, :replace => '?'})
|
174
|
+
|
173
175
|
return {
|
174
176
|
'url' => url,
|
175
|
-
'doc' =>
|
176
|
-
'hash' => Digest::SHA256.new.update(
|
177
|
+
'doc' => body,
|
178
|
+
'hash' => Digest::SHA256.new.update(body).hexdigest,
|
177
179
|
'created_at' => Time.now.utc,
|
178
180
|
'expires_at' => nil,
|
179
181
|
'version' => 0
|
@@ -19,9 +19,15 @@
|
|
19
19
|
# THE SOFTWARE.
|
20
20
|
|
21
21
|
require 'cgi'
|
22
|
-
|
23
|
-
require
|
22
|
+
|
23
|
+
require 'open-uri'
|
24
|
+
require 'net/http'
|
25
|
+
|
24
26
|
require 'faraday'
|
27
|
+
require 'faraday_middleware'
|
28
|
+
|
29
|
+
require 'mechanize'
|
30
|
+
|
25
31
|
require 'ipaddr'
|
26
32
|
|
27
33
|
# require 'resolv'
|
@@ -38,7 +44,7 @@ module Apollo
|
|
38
44
|
}
|
39
45
|
end
|
40
46
|
|
41
|
-
def self.
|
47
|
+
def self.fetch_old(url, options = {})
|
42
48
|
begin
|
43
49
|
uri = URI.parse(url.to_s)
|
44
50
|
rescue Exception => e
|
@@ -64,6 +70,29 @@ module Apollo
|
|
64
70
|
end
|
65
71
|
|
66
72
|
# Return result
|
73
|
+
return res
|
74
|
+
end
|
75
|
+
|
76
|
+
def self.fetch(url, options = {})
|
77
|
+
begin
|
78
|
+
uri = URI.parse(url.to_s)
|
79
|
+
rescue Exception => e
|
80
|
+
puts "EXCEPTION: BaseFetcher::fetch() - Unable to fetch: '#{e.to_s}'"
|
81
|
+
return nil
|
82
|
+
end
|
83
|
+
|
84
|
+
agent = Mechanize.new do |agent|
|
85
|
+
agent.user_agent = 'Apollo Crawler'
|
86
|
+
end
|
87
|
+
|
88
|
+
page = agent.get(uri)
|
89
|
+
|
90
|
+
res = {
|
91
|
+
:status => page.code,
|
92
|
+
:headers => page.header.to_hash,
|
93
|
+
:body => page.content
|
94
|
+
}
|
95
|
+
|
67
96
|
return res
|
68
97
|
end
|
69
98
|
end # class BaseFetcher
|
@@ -90,7 +90,7 @@ module Apollo
|
|
90
90
|
declarations[:queues]["planner.crawled.queue"].bind(declarations[:exchanges]["planner.crawled"]).subscribe do |delivery_info, metadata, payload|
|
91
91
|
msg = JSON.parse(payload)
|
92
92
|
|
93
|
-
puts "Crawled - msg.inspect"
|
93
|
+
# puts "Crawled - #{msg.inspect}"
|
94
94
|
|
95
95
|
request = msg['request']
|
96
96
|
response = msg['response']
|
@@ -18,6 +18,9 @@
|
|
18
18
|
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
19
19
|
# THE SOFTWARE.
|
20
20
|
|
21
|
+
require "net/http"
|
22
|
+
require "uri"
|
23
|
+
|
21
24
|
require File.join(File.dirname(__FILE__), "../model/models.rb")
|
22
25
|
|
23
26
|
module Apollo
|
@@ -29,6 +32,13 @@ module Apollo
|
|
29
32
|
if queued_url.nil? == false
|
30
33
|
return queued_url
|
31
34
|
end
|
35
|
+
|
36
|
+
uri = URI.parse(url)
|
37
|
+
domain = Apollo::Model::Domain.where(:name => uri.hostname).first
|
38
|
+
if(domain.nil?)
|
39
|
+
domain = Apollo::Model::Domain.new(:name => uri.hostname)
|
40
|
+
domain.save
|
41
|
+
end
|
32
42
|
|
33
43
|
res = Apollo::Model::QueuedUrl.new(:url => url, :state => :queued, :crawler_name => crawler.to_s)
|
34
44
|
res.save
|