apollo-crawler 0.1.28 → 0.1.30
This diff shows the changes between publicly released versions of the package as they appear in the public registry, and is provided for informational purposes only.
- checksums.yaml +4 -4
- data/lib/apollo_crawler/agent/fetcher_agent.rb +4 -4
- data/lib/apollo_crawler/crawler/base_crawler.rb +4 -2
- data/lib/apollo_crawler/fetcher/base_fetcher.rb +32 -3
- data/lib/apollo_crawler/planner/smart_planner.rb +1 -1
- data/lib/apollo_crawler/scheduler/base_scheduler.rb +10 -0
- data/lib/apollo_crawler/version.rb +1 -1
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 4f7597400d1ab12d2ab2d1d52330ca29e4f2192f
+  data.tar.gz: 297216744ee4ea8a9af6595b1361dd3bfcf8d0b1
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 71bf482ad465efa67976547c5e0a99d8a728478c5c346f987e675da8574b45e4dba44375fd501e8699c083e42eaf7e19ac1db6d2d9962f9736d408c52f896844
+  data.tar.gz: 01651a1aa4d3408ce63142a46c6e3727ceee35995520cfcc2df4b6451495b09cf5cebe1cc1244dca6ebef2138bb06b6ede97ac1720b75b8de866b3f381c2e25f
data/lib/apollo_crawler/agent/fetcher_agent.rb
CHANGED
@@ -96,10 +96,10 @@ module Apollo
 			url = queued_url["url"]
 
 			res = Apollo::Model::RawDocument.new
-			res.headers = doc
-			res.body = doc
-			res.sha_hash = Digest::SHA1.hexdigest(doc
-			res.status = doc
+			res.headers = doc[:headers]
+			res.body = doc[:body]
+			res.sha_hash = Digest::SHA1.hexdigest(doc[:body])
+			res.status = doc[:status]
 			res.url = url
 
 			return res
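The agent now expects the fetch result to be a plain Hash keyed by :status, :headers and :body (the shape returned by the reworked BaseFetcher.fetch further down in this diff). A minimal sketch of that mapping, with an illustrative document in place of a real fetch:

    require 'digest'

    # Illustrative fetch result; the real hash comes from BaseFetcher.fetch.
    doc = {
      :status  => "200",
      :headers => { "content-type" => ["text/html"] },
      :body    => "<html><body>hello</body></html>"
    }

    headers = doc[:headers]
    sha     = Digest::SHA1.hexdigest(doc[:body])   # value stored in res.sha_hash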
data/lib/apollo_crawler/crawler/base_crawler.rb
CHANGED
@@ -170,10 +170,12 @@ module Apollo
 		end
 
 		def self.create_metadoc(url, doc)
+			body = doc[:body].encode('UTF-8', {:invalid => :replace, :undef => :replace, :replace => '?'})
+
 			return {
 				'url' => url,
-				'doc' =>
-				'hash' => Digest::SHA256.new.update(
+				'doc' => body,
+				'hash' => Digest::SHA256.new.update(body).hexdigest,
 				'created_at' => Time.now.utc,
 				'expires_at' => nil,
 				'version' => 0
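create_metadoc now normalises the body to UTF-8, replacing invalid or unmappable byte sequences with '?', so both the stored document and its SHA-256 hash are computed from clean UTF-8. A small sketch of what that encode call does (the example bytes are illustrative):

    require 'digest'

    raw  = "caf\xE9".force_encoding('ASCII-8BIT')      # bytes that are not valid UTF-8
    body = raw.encode('UTF-8', :invalid => :replace, :undef => :replace, :replace => '?')
    body                                               # => "caf?"
    Digest::SHA256.new.update(body).hexdigest          # digest of the cleaned body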
data/lib/apollo_crawler/fetcher/base_fetcher.rb
CHANGED
@@ -19,9 +19,15 @@
 # THE SOFTWARE.
 
 require 'cgi'
-
-require
+
+require 'open-uri'
+require 'net/http'
+
 require 'faraday'
+require 'faraday_middleware'
+
+require 'mechanize'
+
 require 'ipaddr'
 
 # require 'resolv'
@@ -38,7 +44,7 @@ module Apollo
 			}
 		end
 
-		def self.
+		def self.fetch_old(url, options = {})
 			begin
 				uri = URI.parse(url.to_s)
 			rescue Exception => e
@@ -64,6 +70,29 @@ module Apollo
 			end
 
 			# Return result
+			return res
+		end
+
+		def self.fetch(url, options = {})
+			begin
+				uri = URI.parse(url.to_s)
+			rescue Exception => e
+				puts "EXCEPTION: BaseFetcher::fetch() - Unable to fetch: '#{e.to_s}'"
+				return nil
+			end
+
+			agent = Mechanize.new do |agent|
+				agent.user_agent = 'Apollo Crawler'
+			end
+
+			page = agent.get(uri)
+
+			res = {
+				:status => page.code,
+				:headers => page.header.to_hash,
+				:body => page.content
+			}
+
 			return res
 		end
 	end # class BaseFetcher
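The reworked fetch goes through Mechanize and returns a plain Hash rather than a response object. A usage sketch, assuming the class is reachable as Apollo::Fetcher::BaseFetcher (the nesting suggested by the file path) and that the mechanize gem is installed:

    require 'mechanize'

    # Hypothetical call site; the URL is illustrative.
    res = Apollo::Fetcher::BaseFetcher.fetch('http://example.com/')
    unless res.nil?
      res[:status]    # Mechanize reports the HTTP code as a String, e.g. "200"
      res[:headers]   # response headers as a Hash
      res[:body]      # page content, used for hashing and storage
    end

Note that fetch rescues only the URI.parse failure; Mechanize itself raises (for example Mechanize::ResponseCodeError on non-success status codes), so callers still need their own error handling around network failures.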
data/lib/apollo_crawler/planner/smart_planner.rb
CHANGED
@@ -90,7 +90,7 @@ module Apollo
 			declarations[:queues]["planner.crawled.queue"].bind(declarations[:exchanges]["planner.crawled"]).subscribe do |delivery_info, metadata, payload|
 				msg = JSON.parse(payload)
 
-				puts "Crawled - msg.inspect"
+				# puts "Crawled - #{msg.inspect}"
 
 				request = msg['request']
 				response = msg['response']
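Worth noting: the old debug line printed the literal text msg.inspect because the string had no interpolation; the replacement adds #{...} and comments the line out. A two-line illustration with a made-up message:

    msg = { 'request' => 'http://example.com' }
    puts "Crawled - msg.inspect"      # prints: Crawled - msg.inspect
    puts "Crawled - #{msg.inspect}"   # prints: Crawled - {"request"=>"http://example.com"}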
data/lib/apollo_crawler/scheduler/base_scheduler.rb
CHANGED
@@ -18,6 +18,9 @@
 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 # THE SOFTWARE.
 
+require "net/http"
+require "uri"
+
 require File.join(File.dirname(__FILE__), "../model/models.rb")
 
 module Apollo
@@ -29,6 +32,13 @@ module Apollo
 			if queued_url.nil? == false
 				return queued_url
 			end
+
+			uri = URI.parse(url)
+			domain = Apollo::Model::Domain.where(:name => uri.hostname).first
+			if(domain.nil?)
+				domain = Apollo::Model::Domain.new(:name => uri.hostname)
+				domain.save
+			end
 
 			res = Apollo::Model::QueuedUrl.new(:url => url, :state => :queued, :crawler_name => crawler.to_s)
 			res.save
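The scheduler now records the URL's host as a Domain before enqueuing it. A sketch of the same find-or-create step, under the assumption that Apollo::Model::Domain follows an ActiveRecord/Mongoid-style API (the model definitions live in model/models.rb and are not part of this diff):

    require 'uri'

    uri = URI.parse('http://example.com/some/page')   # illustrative URL

    # Equivalent one-liner if the model layer exposes find_or_create_by:
    domain = Apollo::Model::Domain.find_or_create_by(:name => uri.hostname)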