apollo-crawler 0.1.24 → 0.1.25
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/apollo_crawler.rb +3 -0
- data/lib/apollo_crawler/agent/agents.rb +3 -0
- data/lib/apollo_crawler/agent/base_agent.rb +2 -0
- data/lib/apollo_crawler/agent/crawler_agent.rb +77 -0
- data/lib/apollo_crawler/agent/domainer_agent.rb +51 -0
- data/lib/apollo_crawler/agent/exchanges.rb +55 -0
- data/lib/apollo_crawler/agent/fetcher_agent.rb +62 -12
- data/lib/apollo_crawler/crawler/base_crawler.rb +2 -2
- data/lib/apollo_crawler/crawler/slashdot_crawler.rb +1 -8
- data/lib/apollo_crawler/fetcher/base_fetcher.rb +17 -4
- data/lib/apollo_crawler/lib.rb +3 -0
- data/lib/apollo_crawler/model/data_source.rb +37 -0
- data/lib/apollo_crawler/model/domain.rb +37 -0
- data/lib/apollo_crawler/model/models.rb +3 -1
- data/lib/apollo_crawler/model/queued_url.rb +2 -1
- data/lib/apollo_crawler/model/raw_document.rb +5 -1
- data/lib/apollo_crawler/planner/smart_planner.rb +76 -18
- data/lib/apollo_crawler/program/platform_program.rb +40 -8
- data/lib/apollo_crawler/scheduler/base_scheduler.rb +39 -0
- data/lib/apollo_crawler/scheduler/schedulers.rb +21 -0
- data/lib/apollo_crawler/version.rb +1 -1
- metadata +9 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 79e9ecdfed577a1ce13b74b24d6d5bc26bf75843
|
4
|
+
data.tar.gz: 6d93c6da6316d4666ddc5e434bab1caadc213ba3
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 863d10a255722bd53c9ee998e2886fd86d04cf7808284323d33f7da1fe77fc99ac0874f1e2a61f20c8656e039bc3bac2d2a9cef911e9b6e6d12266e91636b3bc
|
7
|
+
data.tar.gz: db77a2d4606dcecbec1ae2e0d872cc41f6fea64012280e179b1147d8bbabfe7eef5446bf364978ee930f338cb367624e30a86e11fe0613e4499e03dcbc670e4b
|
data/lib/apollo_crawler.rb
CHANGED
@@ -57,5 +57,8 @@ require File.join(File.dirname(__FILE__), 'apollo_crawler/planner/planners')
|
|
57
57
|
# Program
|
58
58
|
require File.join(File.dirname(__FILE__), 'apollo_crawler/program/programs')
|
59
59
|
|
60
|
+
# Scheduler
|
61
|
+
require File.join(File.dirname(__FILE__), 'apollo_crawler/scheduler/schedulers')
|
62
|
+
|
60
63
|
# Stores
|
61
64
|
require File.join(File.dirname(__FILE__), 'apollo_crawler/store/stores')
|
@@ -18,5 +18,8 @@
|
|
18
18
|
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
19
19
|
# THE SOFTWARE.
|
20
20
|
|
21
|
+
require File.join(File.dirname(__FILE__), 'exchanges')
|
21
22
|
require File.join(File.dirname(__FILE__), 'base_agent')
|
23
|
+
require File.join(File.dirname(__FILE__), 'domainer_agent')
|
24
|
+
require File.join(File.dirname(__FILE__), 'crawler_agent')
|
22
25
|
require File.join(File.dirname(__FILE__), 'fetcher_agent')
|
@@ -0,0 +1,77 @@
|
|
1
|
+
# Copyright, 2013, by Tomas Korcak. <korczis@gmail.com>
|
2
|
+
#
|
3
|
+
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
4
|
+
# of this software and associated documentation files (the "Software"), to deal
|
5
|
+
# in the Software without restriction, including without limitation the rights
|
6
|
+
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
7
|
+
# copies of the Software, and to permit persons to whom the Software is
|
8
|
+
# furnished to do so, subject to the following conditions:
|
9
|
+
#
|
10
|
+
# The above copyright notice and this permission notice shall be included in
|
11
|
+
# all copies or substantial portions of the Software.
|
12
|
+
#
|
13
|
+
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
14
|
+
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
15
|
+
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
16
|
+
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
17
|
+
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
18
|
+
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
19
|
+
# THE SOFTWARE.
|
20
|
+
|
21
|
+
require File.join(File.dirname(__FILE__), 'exchanges')
|
22
|
+
require File.join(File.dirname(__FILE__), 'base_agent')
|
23
|
+
|
24
|
+
require File.join(File.dirname(__FILE__), '../crawler/crawlers')
|
25
|
+
|
26
|
+
require 'nokogiri'
|
27
|
+
|
28
|
+
module Apollo
|
29
|
+
module Agent
|
30
|
+
class CrawlerAgent < BaseAgent
|
31
|
+
attr_accessor :declarations
|
32
|
+
|
33
|
+
def initialize(amqp, opts={})
|
34
|
+
if(opts[:verbose])
|
35
|
+
puts "Initializing crawler agent..."
|
36
|
+
end
|
37
|
+
|
38
|
+
# Declarations
|
39
|
+
channel = amqp.create_channel
|
40
|
+
declarations = Apollo::Agent.declare_exchanges(channel, opts)
|
41
|
+
|
42
|
+
# Binding
|
43
|
+
declarations[:queues]["crawler.queue"].bind(declarations[:exchanges]["crawler"]).subscribe do |delivery_info, metadata, payload|
|
44
|
+
puts "CrawlerAgent: Received, metadata #{metadata.inspect}" if opts[:verbose]
|
45
|
+
|
46
|
+
msg = JSON.parse(payload)
|
47
|
+
|
48
|
+
request = msg['request']
|
49
|
+
response = msg['response']
|
50
|
+
|
51
|
+
# puts "PLANEEEER: #{msg.inspect}"
|
52
|
+
|
53
|
+
doc = Nokogiri::HTML(response['body'])
|
54
|
+
crawler = request['crawler_name'].constantize.new
|
55
|
+
data = crawler.extract_data(doc)
|
56
|
+
links = crawler.extract_links(doc)
|
57
|
+
|
58
|
+
# puts crawler.to_s
|
59
|
+
# puts res.inspect
|
60
|
+
|
61
|
+
if(metadata[:reply_to] != nil)
|
62
|
+
x = declarations[:exchanges][metadata[:reply_to]]
|
63
|
+
|
64
|
+
msg = {
|
65
|
+
:request => request,
|
66
|
+
:response => response,
|
67
|
+
:data => data,
|
68
|
+
:links => links
|
69
|
+
}
|
70
|
+
|
71
|
+
x.publish(msg.to_json)
|
72
|
+
end
|
73
|
+
end
|
74
|
+
end
|
75
|
+
end # class CrawlerAgent
|
76
|
+
end # module Agent
|
77
|
+
end # module Apollo
|
@@ -0,0 +1,51 @@
|
|
1
|
+
# Copyright, 2013, by Tomas Korcak. <korczis@gmail.com>
|
2
|
+
#
|
3
|
+
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
4
|
+
# of this software and associated documentation files (the "Software"), to deal
|
5
|
+
# in the Software without restriction, including without limitation the rights
|
6
|
+
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
7
|
+
# copies of the Software, and to permit persons to whom the Software is
|
8
|
+
# furnished to do so, subject to the following conditions:
|
9
|
+
#
|
10
|
+
# The above copyright notice and this permission notice shall be included in
|
11
|
+
# all copies or substantial portions of the Software.
|
12
|
+
#
|
13
|
+
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
14
|
+
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
15
|
+
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
16
|
+
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
17
|
+
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
18
|
+
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
19
|
+
# THE SOFTWARE.
|
20
|
+
|
21
|
+
require File.join(File.dirname(__FILE__), 'exchanges')
|
22
|
+
require File.join(File.dirname(__FILE__), 'base_agent')
|
23
|
+
|
24
|
+
require File.join(File.dirname(__FILE__), '../crawler/crawlers')
|
25
|
+
|
26
|
+
require 'nokogiri'
|
27
|
+
|
28
|
+
module Apollo
|
29
|
+
module Agent
|
30
|
+
class DomainerAgent < BaseAgent
|
31
|
+
attr_accessor :declarations
|
32
|
+
|
33
|
+
def initialize(amqp, opts={})
|
34
|
+
if(opts[:verbose])
|
35
|
+
puts "Initializing crawler agent..."
|
36
|
+
end
|
37
|
+
|
38
|
+
# Declarations
|
39
|
+
channel = amqp.create_channel
|
40
|
+
declarations = Apollo::Agent.declare_exchanges(channel, opts)
|
41
|
+
|
42
|
+
# Binding
|
43
|
+
declarations[:queues]["domainer.queue"].bind(declarations[:exchanges]["domainer"]).subscribe do |delivery_info, metadata, payload|
|
44
|
+
puts "DomainerAgent: Received, metadata #{metadata.inspect}" if opts[:verbose]
|
45
|
+
|
46
|
+
msg = JSON.parse(payload)
|
47
|
+
end
|
48
|
+
end
|
49
|
+
end # class DomainerAgent
|
50
|
+
end # module Agent
|
51
|
+
end # module Apollo
|
@@ -0,0 +1,55 @@
|
|
1
|
+
# Copyright, 2013, by Tomas Korcak. <korczis@gmail.com>
|
2
|
+
#
|
3
|
+
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
4
|
+
# of this software and associated documentation files (the "Software"), to deal
|
5
|
+
# in the Software without restriction, including without limitation the rights
|
6
|
+
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
7
|
+
# copies of the Software, and to permit persons to whom the Software is
|
8
|
+
# furnished to do so, subject to the following conditions:
|
9
|
+
#
|
10
|
+
# The above copyright notice and this permission notice shall be included in
|
11
|
+
# all copies or substantial portions of the Software.
|
12
|
+
#
|
13
|
+
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
14
|
+
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
15
|
+
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
16
|
+
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
17
|
+
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
18
|
+
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
19
|
+
# THE SOFTWARE.
|
20
|
+
|
21
|
+
module Apollo
|
22
|
+
module Agent
|
23
|
+
def self.declare_exchanges(channel, opts={})
|
24
|
+
if(opts[:verbose])
|
25
|
+
puts "Declaring AMQP Exchanges"
|
26
|
+
end
|
27
|
+
|
28
|
+
# Exchanges
|
29
|
+
exchanges = {}
|
30
|
+
exchanges["crawler"] = channel.direct("crawler", :auto_delete => false, :durable => true)
|
31
|
+
exchanges["domainer"] = channel.direct("domainer", :auto_delete => false, :durable => true)
|
32
|
+
exchanges["fetcher"] = channel.direct("fetcher", :auto_delete => false, :durable => true)
|
33
|
+
exchanges["planner.crawled"] = channel.direct("planner.crawled", :auto_delete => false, :durable => true)
|
34
|
+
exchanges["planner.domained"] = channel.direct("planner.domained", :auto_delete => false, :durable => true)
|
35
|
+
exchanges["planner.fetched"] = channel.direct("planner.fetched", :auto_delete => false, :durable => true)
|
36
|
+
|
37
|
+
# Queues
|
38
|
+
queues = {}
|
39
|
+
queues["crawler.queue"] = channel.queue("crawler.queue", :auto_delete => false, :durable => true)
|
40
|
+
queues["domainer.queue"] = channel.queue("domainer.queue", :auto_delete => false, :durable => true)
|
41
|
+
queues["fetcher.queue"] = channel.queue("fetcher.queue", :auto_delete => false, :durable => true)
|
42
|
+
queues["planner.crawled.queue"] = channel.queue("planner.crawled.queue", :auto_delete => false, :durable => true)
|
43
|
+
queues["planner.domained.queue"] = channel.queue("planner.domained.queue", :auto_delete => false, :durable => true)
|
44
|
+
queues["planner.fetched.queue"] = channel.queue("planner.fetched.queue", :auto_delete => false, :durable => true)
|
45
|
+
|
46
|
+
# Compose res
|
47
|
+
res = {
|
48
|
+
:exchanges => exchanges,
|
49
|
+
:queues => queues
|
50
|
+
}
|
51
|
+
|
52
|
+
return res
|
53
|
+
end
|
54
|
+
end
|
55
|
+
end
|
@@ -18,13 +18,22 @@
|
|
18
18
|
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
19
19
|
# THE SOFTWARE.
|
20
20
|
|
21
|
+
require File.join(File.dirname(__FILE__), 'exchanges')
|
21
22
|
require File.join(File.dirname(__FILE__), 'base_agent')
|
23
|
+
|
22
24
|
require File.join(File.dirname(__FILE__), '../fetcher/fetchers')
|
23
25
|
|
26
|
+
require 'digest/sha1'
|
27
|
+
require 'thread/pool'
|
28
|
+
|
24
29
|
module Apollo
|
25
30
|
module Agent
|
26
31
|
class FetcherAgent < BaseAgent
|
32
|
+
THREAD_POOL_SIZE = 10
|
33
|
+
|
27
34
|
attr_accessor :fetcher
|
35
|
+
attr_accessor :declarations
|
36
|
+
attr_accessor :thread_pool
|
28
37
|
|
29
38
|
def initialize(amqp, opts={})
|
30
39
|
self.fetcher = Apollo::Fetcher::SmartFetcher.new
|
@@ -33,23 +42,64 @@ module Apollo
|
|
33
42
|
puts "Initializing fetcher agent..."
|
34
43
|
end
|
35
44
|
|
36
|
-
|
37
|
-
q = ch.queue("fetcher", :auto_delete => false, :durable => true)
|
38
|
-
x = ch.default_exchange
|
45
|
+
thread_pool = Thread::Pool.new(THREAD_POOL_SIZE)
|
39
46
|
|
40
|
-
|
41
|
-
|
47
|
+
# Declarations
|
48
|
+
channel = amqp.create_channel
|
49
|
+
declarations = Apollo::Agent.declare_exchanges(channel, opts)
|
42
50
|
|
43
|
-
|
44
|
-
|
45
|
-
|
51
|
+
# Binding
|
52
|
+
declarations[:queues]["fetcher.queue"].bind(declarations[:exchanges]["fetcher"]).subscribe do |delivery_info, metadata, payload|
|
53
|
+
thread_pool.process {
|
46
54
|
queued_url = JSON.parse(payload)
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
55
|
+
url = queued_url["url"]
|
56
|
+
|
57
|
+
puts "FetcherAgent: Received - '#{url}', metadata #{metadata.inspect}" if opts[:verbose]
|
58
|
+
|
59
|
+
doc = nil
|
60
|
+
begin
|
61
|
+
doc = Apollo::Fetcher::SmartFetcher::fetch(url)
|
62
|
+
rescue Exception => e
|
63
|
+
puts "EXCEPTION: FetcherAgent::initialize() - Unable to fetch '#{url}', reason: '#{e.to_s}'"
|
64
|
+
end
|
65
|
+
|
66
|
+
doc = get_fetched_doc(queued_url, doc, metadata, opts)
|
67
|
+
|
68
|
+
# send_response_msg(metadata[:reply_to], get_response_msg(queued_url, doc))
|
69
|
+
|
70
|
+
if(metadata && metadata[:reply_to])
|
71
|
+
exchange_name = metadata[:reply_to]
|
72
|
+
|
73
|
+
if(exchange_name != nil)
|
74
|
+
msg = get_response_msg(queued_url, doc)
|
75
|
+
|
76
|
+
x = declarations[:exchanges][exchange_name]
|
77
|
+
x.publish(msg.to_json)
|
78
|
+
end
|
79
|
+
end
|
80
|
+
}
|
51
81
|
end
|
52
82
|
end
|
83
|
+
|
84
|
+
def get_fetched_doc(queued_url, doc, metadata, opts={})
|
85
|
+
url = queued_url["url"]
|
86
|
+
|
87
|
+
res = Apollo::Model::RawDocument.new
|
88
|
+
res.headers = doc.headers
|
89
|
+
res.body = doc.body
|
90
|
+
res.sha_hash = Digest::SHA1.hexdigest(doc.body)
|
91
|
+
res.status = doc.status
|
92
|
+
res.url = url
|
93
|
+
|
94
|
+
return res
|
95
|
+
end
|
96
|
+
|
97
|
+
def get_response_msg(queued_url, doc)
|
98
|
+
return {
|
99
|
+
:request => queued_url,
|
100
|
+
:response => doc
|
101
|
+
}
|
102
|
+
end
|
53
103
|
end # class FetcherAgent
|
54
104
|
end # module Agent
|
55
105
|
end # module Apollo
|
@@ -172,8 +172,8 @@ module Apollo
|
|
172
172
|
def self.create_metadoc(url, doc)
|
173
173
|
return {
|
174
174
|
'url' => url,
|
175
|
-
'doc' => doc.encode('UTF-8', {:invalid => :replace, :undef => :replace, :replace => '?'}),
|
176
|
-
'hash' => Digest::SHA256.new.update(doc).hexdigest,
|
175
|
+
'doc' => doc.body.encode('UTF-8', {:invalid => :replace, :undef => :replace, :replace => '?'}),
|
176
|
+
'hash' => Digest::SHA256.new.update(doc.body).hexdigest,
|
177
177
|
'created_at' => Time.now.utc,
|
178
178
|
'expires_at' => nil,
|
179
179
|
'version' => 0
|
@@ -46,14 +46,7 @@ module Apollo
|
|
46
46
|
end
|
47
47
|
|
48
48
|
def extract_links(doc)
|
49
|
-
|
50
|
-
url = BaseCrawler.try_get_url(self.url, node['href']).to_s
|
51
|
-
next if url.nil?
|
52
|
-
|
53
|
-
{
|
54
|
-
:link => url
|
55
|
-
}
|
56
|
-
}
|
49
|
+
return []
|
57
50
|
end
|
58
51
|
end # class SlashdotCrawler
|
59
52
|
end # module Crawler
|
@@ -24,6 +24,9 @@ require "open-uri"
|
|
24
24
|
require 'faraday'
|
25
25
|
require 'ipaddr'
|
26
26
|
|
27
|
+
# require 'resolv'
|
28
|
+
# require 'resolv-replace'
|
29
|
+
|
27
30
|
module Apollo
|
28
31
|
module Fetcher
|
29
32
|
class BaseFetcher
|
@@ -36,7 +39,12 @@ module Apollo
|
|
36
39
|
end
|
37
40
|
|
38
41
|
def self.fetch(url, options = {})
|
39
|
-
|
42
|
+
begin
|
43
|
+
uri = URI.parse(url.to_s)
|
44
|
+
rescue Exception => e
|
45
|
+
puts "EXCEPTION: BaseFetcher::fetch() - Unable to fetch: '#{e.to_s}'"
|
46
|
+
return nil
|
47
|
+
end
|
40
48
|
|
41
49
|
# See https://github.com/lostisland/faraday
|
42
50
|
conn = Faraday.new(:url => url) do |faraday|
|
@@ -46,12 +54,17 @@ module Apollo
|
|
46
54
|
end
|
47
55
|
|
48
56
|
# Make request
|
49
|
-
|
50
|
-
|
57
|
+
begin
|
58
|
+
res = conn.get(uri) do |request|
|
59
|
+
request.headers = BaseFetcher.get_fake_headers(uri)
|
60
|
+
end
|
61
|
+
rescue Exception => e
|
62
|
+
puts "EXCEPTION: BaseFetcher::fetch() - Unable to fetch: '#{e.to_s}'"
|
63
|
+
return nil
|
51
64
|
end
|
52
65
|
|
53
66
|
# Return result
|
54
|
-
return res
|
67
|
+
return res
|
55
68
|
end
|
56
69
|
end # class BaseFetcher
|
57
70
|
end # module Fetcher
|
data/lib/apollo_crawler/lib.rb
CHANGED
@@ -54,5 +54,8 @@ require File.join(File.dirname(__FILE__), 'planner/planners')
|
|
54
54
|
# Programs
|
55
55
|
require File.join(File.dirname(__FILE__), 'program/programs')
|
56
56
|
|
57
|
+
# Scheduler
|
58
|
+
require File.join(File.dirname(__FILE__), 'scheduler/schedulers')
|
59
|
+
|
57
60
|
# Stores
|
58
61
|
require File.join(File.dirname(__FILE__), 'store/stores')
|
@@ -0,0 +1,37 @@
|
|
1
|
+
# Copyright, 2013, by Tomas Korcak. <korczis@gmail.com>
|
2
|
+
#
|
3
|
+
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
4
|
+
# of this software and associated documentation files (the "Software"), to deal
|
5
|
+
# in the Software without restriction, including without limitation the rights
|
6
|
+
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
7
|
+
# copies of the Software, and to permit persons to whom the Software is
|
8
|
+
# furnished to do so, subject to the following conditions:
|
9
|
+
#
|
10
|
+
# The above copyright notice and this permission notice shall be included in
|
11
|
+
# all copies or substantial portions of the Software.
|
12
|
+
#
|
13
|
+
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
14
|
+
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
15
|
+
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
16
|
+
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
17
|
+
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
18
|
+
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
19
|
+
# THE SOFTWARE.
|
20
|
+
|
21
|
+
require File.join(File.dirname(__FILE__), 'base_model')
|
22
|
+
|
23
|
+
module Apollo
|
24
|
+
module Model
|
25
|
+
class DataSource < BaseModel
|
26
|
+
include Mongoid::Document
|
27
|
+
include Mongoid::Timestamps
|
28
|
+
|
29
|
+
store_in collection: "data_sources"
|
30
|
+
|
31
|
+
field :url
|
32
|
+
|
33
|
+
# Indexes
|
34
|
+
index({ created_at: 1, updated_at: 1, url: 1 })
|
35
|
+
end # class DataSource
|
36
|
+
end # module Model
|
37
|
+
end # module Apollo
|
@@ -0,0 +1,37 @@
|
|
1
|
+
# Copyright, 2013, by Tomas Korcak. <korczis@gmail.com>
|
2
|
+
#
|
3
|
+
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
4
|
+
# of this software and associated documentation files (the "Software"), to deal
|
5
|
+
# in the Software without restriction, including without limitation the rights
|
6
|
+
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
7
|
+
# copies of the Software, and to permit persons to whom the Software is
|
8
|
+
# furnished to do so, subject to the following conditions:
|
9
|
+
#
|
10
|
+
# The above copyright notice and this permission notice shall be included in
|
11
|
+
# all copies or substantial portions of the Software.
|
12
|
+
#
|
13
|
+
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
14
|
+
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
15
|
+
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
16
|
+
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
17
|
+
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
18
|
+
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
19
|
+
# THE SOFTWARE.
|
20
|
+
|
21
|
+
require File.join(File.dirname(__FILE__), 'base_model')
|
22
|
+
|
23
|
+
module Apollo
|
24
|
+
module Model
|
25
|
+
class Domain < BaseModel
|
26
|
+
include Mongoid::Document
|
27
|
+
include Mongoid::Timestamps
|
28
|
+
|
29
|
+
store_in collection: "domains"
|
30
|
+
|
31
|
+
field :name
|
32
|
+
|
33
|
+
# Indexes
|
34
|
+
index({ created_at: 1, updated_at: 1, name: 1 })
|
35
|
+
end # class Domain
|
36
|
+
end # module Model
|
37
|
+
end # module Apollo
|
@@ -20,5 +20,7 @@
|
|
20
20
|
|
21
21
|
require File.join(File.dirname(__FILE__), 'base_model')
|
22
22
|
require File.join(File.dirname(__FILE__), 'crawler')
|
23
|
+
require File.join(File.dirname(__FILE__), 'data_source')
|
24
|
+
require File.join(File.dirname(__FILE__), 'domain')
|
23
25
|
require File.join(File.dirname(__FILE__), 'queued_url')
|
24
|
-
require File.join(File.dirname(__FILE__), 'raw_document')
|
26
|
+
require File.join(File.dirname(__FILE__), 'raw_document')
|
@@ -30,9 +30,10 @@ module Apollo
|
|
30
30
|
|
31
31
|
field :url
|
32
32
|
field :state
|
33
|
+
field :crawler_name
|
33
34
|
|
34
35
|
# Indexes
|
35
|
-
index({ created_at: 1, updated_at: 1 })
|
36
|
+
index({ created_at: 1, updated_at: 1, crawler_name: 1 })
|
36
37
|
end # class QueuedUrl
|
37
38
|
end # module Model
|
38
39
|
end # module Apollo
|
@@ -28,10 +28,14 @@ module Apollo
|
|
28
28
|
|
29
29
|
store_in collection: "raw_docs"
|
30
30
|
|
31
|
+
field :url
|
32
|
+
field :headers
|
31
33
|
field :body
|
34
|
+
field :status
|
35
|
+
field :sha_hash
|
32
36
|
|
33
37
|
# Indexes
|
34
|
-
index({ created_at: 1, updated_at: 1 })
|
38
|
+
index({ created_at: 1, updated_at: 1, sha_hash: 1 })
|
35
39
|
end # class RawDocument
|
36
40
|
end # module Model
|
37
41
|
end # module Apollo
|
@@ -22,43 +22,101 @@ require File.join(File.dirname(__FILE__),'base_planner')
|
|
22
22
|
|
23
23
|
require File.join(File.dirname(__FILE__),'../model/models.rb')
|
24
24
|
|
25
|
+
require File.join(File.dirname(__FILE__),'../agent/exchanges.rb')
|
26
|
+
|
27
|
+
require File.join(File.dirname(__FILE__),'../scheduler/schedulers')
|
28
|
+
|
29
|
+
require 'nokogiri'
|
30
|
+
|
25
31
|
module Apollo
|
26
32
|
module Planner
|
27
33
|
class SmartPlanner < BasePlanner
|
28
34
|
attr_accessor :amqp
|
29
35
|
attr_accessor :mongo
|
36
|
+
attr_accessor :declarations
|
30
37
|
|
31
|
-
def initialize(amqp=nil, mongo=nil)
|
38
|
+
def initialize(amqp=nil, mongo=nil, opts={})
|
32
39
|
self.amqp = amqp
|
33
40
|
self.mongo = mongo
|
34
|
-
end
|
35
41
|
|
36
|
-
|
37
|
-
|
42
|
+
# Declarations
|
43
|
+
channel = amqp.create_channel
|
44
|
+
self.declarations = Apollo::Agent.declare_exchanges(channel, opts)
|
38
45
|
|
39
|
-
|
40
|
-
|
41
|
-
|
46
|
+
# Bindings
|
47
|
+
declarations[:queues]["planner.fetched.queue"].bind(declarations[:exchanges]["planner.fetched"]).subscribe do |delivery_info, metadata, payload|
|
48
|
+
msg = JSON.parse(payload)
|
42
49
|
|
43
|
-
|
50
|
+
request = msg['request']
|
51
|
+
response = msg['response']
|
44
52
|
|
45
|
-
|
46
|
-
|
47
|
-
|
53
|
+
doc = Apollo::Model::QueuedUrl.find(request["_id"])
|
54
|
+
doc.update_attributes(msg['request'])
|
55
|
+
doc.state = "fetched"
|
56
|
+
doc.save
|
57
|
+
|
58
|
+
doc = Apollo::Model::RawDocument.where(:url => request['url']).first
|
59
|
+
if doc
|
60
|
+
if doc.sha_hash != response['sha_hash']
|
61
|
+
puts "Removing old cached version of '#{request['url']}'" if opts[:verbose]
|
62
|
+
|
63
|
+
doc.destroy
|
64
|
+
doc = nil
|
65
|
+
else
|
66
|
+
puts "Using cached version of '#{request['url']}'" if opts[:verbose]
|
67
|
+
end
|
68
|
+
else
|
69
|
+
doc = Apollo::Model::RawDocument.where(:sha_hash => response['sha_hash']).first
|
70
|
+
end
|
71
|
+
|
72
|
+
if(doc.nil?)
|
73
|
+
doc = Apollo::Model::RawDocument.new(response).save
|
74
|
+
end
|
75
|
+
|
76
|
+
# Publish
|
77
|
+
declarations[:exchanges]["crawler"].publish(msg.to_json, :reply_to => "planner.crawled")
|
78
|
+
end
|
79
|
+
|
80
|
+
declarations[:queues]["planner.domained.queue"].bind(declarations[:exchanges]["planner.domained"]).subscribe do |delivery_info, metadata, payload|
|
81
|
+
msg = JSON.parse(payload)
|
48
82
|
|
83
|
+
puts "DOMAINED !!!"
|
84
|
+
end
|
85
|
+
|
86
|
+
declarations[:queues]["planner.crawled.queue"].bind(declarations[:exchanges]["planner.crawled"]).subscribe do |delivery_info, metadata, payload|
|
87
|
+
msg = JSON.parse(payload)
|
88
|
+
|
89
|
+
request = msg['request']
|
90
|
+
response = msg['response']
|
91
|
+
data = msg['data']
|
92
|
+
links = msg['links']
|
93
|
+
links = [] if links.nil?
|
94
|
+
|
95
|
+
links.each do |url|
|
96
|
+
link = url['link']
|
97
|
+
|
98
|
+
Apollo::Scheduler::BaseScheduler::schedule(link, request['crawler_name'])
|
99
|
+
end
|
100
|
+
|
101
|
+
# puts JSON.pretty_generate(data)
|
102
|
+
# puts JSON.pretty_generate(links)
|
103
|
+
end
|
104
|
+
end
|
105
|
+
|
106
|
+
def fetch_url(url, opts={})
|
49
107
|
if(opts[:verbose])
|
50
|
-
puts "
|
108
|
+
puts "AMQP fetching '#{url.inspect}'"
|
51
109
|
end
|
52
110
|
|
53
|
-
|
111
|
+
# Publish
|
112
|
+
declarations[:exchanges]["fetcher"].publish(url.to_json, :reply_to => "planner.fetched")
|
113
|
+
end
|
54
114
|
|
55
|
-
|
56
|
-
|
57
|
-
url.
|
115
|
+
def fetch_queued_urls(opts={})
|
116
|
+
while url = Apollo::Model::QueuedUrl.where({:state => :queued}).find_and_modify({ "$set" => { state: :fetching }}, new: true)
|
117
|
+
# puts "Count of URLs in Queue: #{url.count}" if opts[:verbose]
|
58
118
|
|
59
119
|
fetch_url(url, opts)
|
60
|
-
|
61
|
-
# puts "Removing URL from Queue '#{url.inspect}'" if opts[:verbose]
|
62
120
|
end
|
63
121
|
end
|
64
122
|
|
@@ -37,6 +37,7 @@ require 'eventmachine'
|
|
37
37
|
require 'em-http'
|
38
38
|
|
39
39
|
require 'fileutils'
|
40
|
+
require 'csv'
|
40
41
|
|
41
42
|
require 'mongoid'
|
42
43
|
|
@@ -44,6 +45,8 @@ require File.join(File.dirname(__FILE__), '..', 'version')
|
|
44
45
|
|
45
46
|
require File.join(File.dirname(__FILE__),'base_program')
|
46
47
|
|
48
|
+
require File.join(File.dirname(__FILE__),'../scheduler/schedulers')
|
49
|
+
|
47
50
|
module Apollo
|
48
51
|
# Apollo Crawler Base Directory
|
49
52
|
APOLLO_PLATFORM_BASE_DIR = File.join(File.dirname(__FILE__), "..")
|
@@ -97,29 +100,56 @@ module Apollo
|
|
97
100
|
def enqueue_crawlers_urls(amqp, crawlers=Apollo::Crawler::BaseCrawler.subclasses, opts={})
|
98
101
|
crawlers.each do |crawler|
|
99
102
|
i = crawler.new
|
100
|
-
|
101
|
-
qu = Apollo::Model::QueuedUrl.new(:url => i.url, :state => :queued)
|
102
|
-
qu.save
|
103
|
+
Apollo::Scheduler::BaseScheduler::schedule(i.url, crawler)
|
103
104
|
end
|
104
105
|
end
|
105
106
|
|
107
|
+
def init_crawlers(amqp, opts={})
|
108
|
+
crawlers = []
|
109
|
+
crawlers << Apollo::Agent::CrawlerAgent.new(amqp, self.options)
|
110
|
+
end
|
111
|
+
|
112
|
+
def init_domainers(amqp, opts={})
|
113
|
+
domainers = []
|
114
|
+
domainers << Apollo::Agent::DomainerAgent.new(amqp, self.options)
|
115
|
+
end
|
116
|
+
|
106
117
|
def init_fetchers(amqp, opts={})
|
107
118
|
fetchers = []
|
108
119
|
fetchers << Apollo::Agent::FetcherAgent.new(amqp, self.options)
|
109
120
|
|
121
|
+
# TODO: This should not be here!
|
110
122
|
enqueue_crawlers_urls(amqp, Apollo::Crawler::BaseCrawler.subclasses, opts)
|
111
|
-
|
112
|
-
# ch = self.amqp.create_channel
|
113
|
-
# x = ch.default_exchange
|
114
|
-
# x.publish("Hello!", :routing_key => "fetcher")
|
115
123
|
end
|
116
124
|
|
117
125
|
def init_agents(amqp, opts={})
|
118
126
|
puts "Initializing agents"
|
119
127
|
|
128
|
+
init_crawlers(amqp, opts)
|
129
|
+
init_domainers(amqp, opts)
|
120
130
|
init_fetchers(amqp, opts)
|
121
131
|
end
|
122
132
|
|
133
|
+
def init_domains(opts={})
|
134
|
+
path = File.join(File.dirname(__FILE__), "../../../tmp/top-1m.csv")
|
135
|
+
puts "#{path}"
|
136
|
+
if(File.exists?(path) == false)
|
137
|
+
return 0
|
138
|
+
end
|
139
|
+
|
140
|
+
Thread::new {
|
141
|
+
CSV.foreach(path) do |row|
|
142
|
+
name = row[1]
|
143
|
+
domain = Apollo::Model::Domain.where({:name => name}).first()
|
144
|
+
if(domain.nil?)
|
145
|
+
domain = Apollo::Model::Domain.new({:name => name})
|
146
|
+
domain.save
|
147
|
+
print "."
|
148
|
+
end
|
149
|
+
end
|
150
|
+
}
|
151
|
+
end
|
152
|
+
|
123
153
|
def init_program(args)
|
124
154
|
res = super(args)
|
125
155
|
return res unless res.nil?
|
@@ -148,6 +178,8 @@ module Apollo
|
|
148
178
|
res = super(args)
|
149
179
|
return res unless res.nil?
|
150
180
|
|
181
|
+
init_domains()
|
182
|
+
|
151
183
|
# Here we start
|
152
184
|
# if(ARGV.length < 1)
|
153
185
|
# puts optparser
|
@@ -156,7 +188,7 @@ module Apollo
|
|
156
188
|
|
157
189
|
res_code = 0
|
158
190
|
if(self.options[:daemon])
|
159
|
-
planner = Apollo::Planner::SmartPlanner.new(self.amqp, self.mongo)
|
191
|
+
planner = Apollo::Planner::SmartPlanner.new(self.amqp, self.mongo, self.options)
|
160
192
|
res_code = planner.run(self.options)
|
161
193
|
end
|
162
194
|
|
@@ -0,0 +1,39 @@
|
|
1
|
+
# Copyright, 2013, by Tomas Korcak. <korczis@gmail.com>
|
2
|
+
#
|
3
|
+
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
4
|
+
# of this software and associated documentation files (the "Software"), to deal
|
5
|
+
# in the Software without restriction, including without limitation the rights
|
6
|
+
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
7
|
+
# copies of the Software, and to permit persons to whom the Software is
|
8
|
+
# furnished to do so, subject to the following conditions:
|
9
|
+
#
|
10
|
+
# The above copyright notice and this permission notice shall be included in
|
11
|
+
# all copies or substantial portions of the Software.
|
12
|
+
#
|
13
|
+
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
14
|
+
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
15
|
+
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
16
|
+
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
17
|
+
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
18
|
+
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
19
|
+
# THE SOFTWARE.
|
20
|
+
|
21
|
+
require File.join(File.dirname(__FILE__), "../model/models")
|
22
|
+
|
23
|
+
module Apollo
|
24
|
+
module Scheduler
|
25
|
+
class BaseScheduler
|
26
|
+
def self.schedule(url, crawler=nil, opts={})
|
27
|
+
queued_url = Apollo::Model::QueuedUrl.where(:url => url).first
|
28
|
+
|
29
|
+
if(queued_url.nil?)
|
30
|
+
qu = Apollo::Model::QueuedUrl.new(:url => url, :state => :queued, :crawler_name => crawler.to_s)
|
31
|
+
qu.save
|
32
|
+
return qu
|
33
|
+
end
|
34
|
+
|
35
|
+
return nil
|
36
|
+
end
|
37
|
+
end # class BaseScheduler
|
38
|
+
end # module Scheduler
|
39
|
+
end # module Apollo
|
@@ -0,0 +1,21 @@
|
|
1
|
+
# Copyright, 2013, by Tomas Korcak. <korczis@gmail.com>
|
2
|
+
#
|
3
|
+
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
4
|
+
# of this software and associated documentation files (the "Software"), to deal
|
5
|
+
# in the Software without restriction, including without limitation the rights
|
6
|
+
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
7
|
+
# copies of the Software, and to permit persons to whom the Software is
|
8
|
+
# furnished to do so, subject to the following conditions:
|
9
|
+
#
|
10
|
+
# The above copyright notice and this permission notice shall be included in
|
11
|
+
# all copies or substantial portions of the Software.
|
12
|
+
#
|
13
|
+
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
14
|
+
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
15
|
+
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
16
|
+
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
17
|
+
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
18
|
+
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
19
|
+
# THE SOFTWARE.
|
20
|
+
|
21
|
+
require File.join(File.dirname(__FILE__), 'base_scheduler')
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: apollo-crawler
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.24
|
4
|
+
version: 0.1.25
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Tomas Korcak
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2013-03-
|
11
|
+
date: 2013-03-27 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: awesome_print
|
@@ -414,6 +414,9 @@ files:
|
|
414
414
|
- ./lib/apollo_crawler/adapter/mongo_adapter.rb
|
415
415
|
- ./lib/apollo_crawler/agent/agents.rb
|
416
416
|
- ./lib/apollo_crawler/agent/base_agent.rb
|
417
|
+
- ./lib/apollo_crawler/agent/crawler_agent.rb
|
418
|
+
- ./lib/apollo_crawler/agent/domainer_agent.rb
|
419
|
+
- ./lib/apollo_crawler/agent/exchanges.rb
|
417
420
|
- ./lib/apollo_crawler/agent/fetcher_agent.rb
|
418
421
|
- ./lib/apollo_crawler/cache/base_cache.rb
|
419
422
|
- ./lib/apollo_crawler/cache/caches.rb
|
@@ -452,6 +455,8 @@ files:
|
|
452
455
|
- ./lib/apollo_crawler/logger/loggers.rb
|
453
456
|
- ./lib/apollo_crawler/model/base_model.rb
|
454
457
|
- ./lib/apollo_crawler/model/crawler.rb
|
458
|
+
- ./lib/apollo_crawler/model/data_source.rb
|
459
|
+
- ./lib/apollo_crawler/model/domain.rb
|
455
460
|
- ./lib/apollo_crawler/model/models.rb
|
456
461
|
- ./lib/apollo_crawler/model/queued_url.rb
|
457
462
|
- ./lib/apollo_crawler/model/raw_document.rb
|
@@ -463,6 +468,8 @@ files:
|
|
463
468
|
- ./lib/apollo_crawler/program/crawler_program.rb
|
464
469
|
- ./lib/apollo_crawler/program/platform_program.rb
|
465
470
|
- ./lib/apollo_crawler/program/programs.rb
|
471
|
+
- ./lib/apollo_crawler/scheduler/base_scheduler.rb
|
472
|
+
- ./lib/apollo_crawler/scheduler/schedulers.rb
|
466
473
|
- ./lib/apollo_crawler/store/base_store.rb
|
467
474
|
- ./lib/apollo_crawler/store/stores.rb
|
468
475
|
- ./lib/apollo_crawler/version.rb
|